In [1]:
from pyspark.sql import SparkSession

In [16]:
#FixMe: some imports are doubled, we have to check and fix them
import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql.functions import *
import pyspark.sql.types as T
from pyspark.sql.types import IntegerType
from pyspark.sql.window import Window

from helpers import createDFFromFileAndSchema


In [3]:
# Create the Spark session shared by every cell below (or reuse one that is already running).
spark = (
    SparkSession.builder
    .appName('read data through spark')
    .getOrCreate()
)

In [4]:
# Display the session object as the cell output (sanity check that the session exists).
spark

In [6]:
# Folder with one schema CSV per dataset; passed to createDFFromFileAndSchema as the schema path.
SCHEMAS_FOLDER = './schemas/'
# Folder with the parsed input CSV files that are loaded into DataFrames below.
FILES_FOLDER = './assets/parsedData/'

# Load and clean Paper DF

In [17]:
### Load papers.csv into a DataFrame using the column definitions from the paper schema file.
### The helper prints the file/schema paths and the parsed (column, type) pairs (see cell output).
paper_df = createDFFromFileAndSchema(spark, f'{FILES_FOLDER}papers.csv', f'{SCHEMAS_FOLDER}paper.csv')
paper_df.show()

# Earlier inline version, kept for reference.
# NOTE(review): presumably what createDFFromFileAndSchema does internally - confirm in helpers.py.
# dtypes = pd.read_csv('./schemas/paper.csv').to_records(index=False).tolist()
# print(dtypes)
# fields = [T.StructField(dtype[0], globals()[f'{dtype[1]}Type']()) for dtype in dtypes]
# schema = StructType(fields)
# paper_df = spark.read.option('header', 'true').csv('./assets/parsedData/papers.csv', header=True, schema=schema)

File path: ./assets/parsedData/papers.csv, schema path: ./schemas/paper.csv
[('paper_id', 'Integer'), ('title', 'String'), ('year', 'Integer')]
+--------+--------------------+----+
|paper_id|               title|year|
+--------+--------------------+----+
|      65|Direct file organ...|1984|
|     130|An introduction t...|1983|
|     195|On solving almost...|1984|
|     260|Connections betwe...|1984|
|     325|Computers and pen...|1984|
|     390|Relativizations c...|1984|
|     455|On the optimum ch...|1984|
|     520|All points addres...|1984|
|     585|Optimum Head Sepa...|1984|
|     650|A parallel-design...|1984|
|     715|Computer - IEEE C...|1984|
|     780|Experience with G...|1984|
|     845|Code generation a...|1984|
|     910|On estimating acc...|1984|
|     975|A distributed alt...|1985|
|    1040|A comparison of t...|1984|
|    1105|Generalizing spec...|1985|
|    1170|Real time graphic...|1984|
|    1235|Common and uncomm...|1984|
|    1300|Foundations of co...|1985|
+----

In [None]:
# Preview the loaded papers before cleaning.
paper_df.show()

In [None]:
### data cleaning for paper schema

### strip leading/trailing whitespace from every column value
for column_name in ("paper_id", "title", "year"):
    paper_df = paper_df.withColumn(column_name, F.trim(F.col(column_name)))

In [None]:
### check for the data types
paper_df.printSchema()
### trim() returns string columns, so both numeric columns must be cast back to Integer.
### paper_id is cast here as well so that it has the same type as in the other dataframes.
paper_df = (
    paper_df
    .withColumn("paper_id", paper_df["paper_id"].cast(IntegerType()))
    .withColumn("year", paper_df["year"].cast(IntegerType()))
)

In [None]:
### check for nonsense null data: count the NaN / null values of every column
null_values_paper_df = paper_df.select(
    [F.count(F.when(F.isnan(c) | F.col(c).isNull(), c)).alias(c) for c in paper_df.columns]
)
### the frame is lazy; without show() the check is never executed or displayed
null_values_paper_df.show()

### save the ids of papers whose title is missing to clean up the other dataframes
null_paper_ids = paper_df.filter(paper_df['title'].isNull())
null_paper_ids_list = [int(row['paper_id']) for row in null_paper_ids.select('paper_id').collect()]

In [None]:
### Checked against the dataframes below: every paper with a missing title still has authors,
### except paper_id = 748056.
### Decision: keep those papers and replace the missing title with the placeholder 'Missing Title'.

paper_df = paper_df.fillna('Missing Title', subset=['title'])

In [None]:
### remove special characters from the title: " ; : { } ~ /
### One character class replaces the nine separate passes (three of which repeated '{').
### The raw string avoids the invalid escape sequences ('\}', '\~', ...) of the old patterns.
paper_df = paper_df.withColumn('title', F.regexp_replace('title', r'[";:{}~/]', ''))

In [None]:
# List the column names of the cleaned paper dataframe.
paper_df.columns

In [None]:
### check if there are duplicate rows
### flag every distinct row with 1 when it occurs more than once, then attach the flag to each paper
duplicate_flags_df = paper_df.groupBy(paper_df.columns).agg(
    (F.count("*") > 1).cast("int").alias("Duplicate_indicator")
)
paper_df.join(duplicate_flags_df, on=paper_df.columns, how="inner").show()
###there are no duplicates
### second check: paper_ids that occur on more than one row
(
    paper_df
    .groupBy('paper_id')
    .count()
    .filter('count > 1')
    .orderBy(F.col('count').desc())
    .show()
)

In [None]:
# paper_df.filter(unique_paper_author_df['title'].like("%%")).show(20,False)

# Load and clean Affiliations df

In [13]:
### Load affiliations.csv into a DataFrame using the column definitions from the affiliation schema file.
### Each row holds one paper_id and a ';'-separated list of affiliations (see cell output).

affiliation_df = createDFFromFileAndSchema(spark, f'{FILES_FOLDER}affiliations.csv', f'{SCHEMAS_FOLDER}affiliation.csv')
affiliation_df.show()

# Earlier inline version, kept for reference.
# NOTE(review): presumably what createDFFromFileAndSchema does internally - confirm in helpers.py.
# dtypes = pd.read_csv('./schemas/affiliation.csv').to_records(index=False).tolist()
# print(dtypes)
# fields = [StructField(dtype[0], globals()[f'{dtype[1]}Type']()) for dtype in dtypes]
# schema = StructType(fields)
# affiliation_df = spark.read.option('header', 'true').csv('./assets/parsedData/affiliations.csv', header=True, schema=schema)
# affiliation_df.show()

File path: ./assets/parsedData/affiliations.csv, schema path: ./schemas/affiliation.csv
[('affiliations', 'String'), ('paper_id', 'Integer')]
+--------------------+--------+
|        affiliations|paper_id|
+--------------------+--------+
|The Queen's Unive...|      65|
|Univ. of Karlsruh...|     130|
|AERE Harwell Labo...|     195|
|University of Mic...|     260|
|Oslo politikammer...|     325|
|Harvard Univ., Ca...|     390|
|Cornell Univ., It...|     455|
|IBM General Techn...|     520|
|               -;-;-|     585|
|New York Univ., N...|     650|
|                   -|     715|
|Xerox Palo Alto R...|     780|
|Univ. of Californ...|     845|
|University of Bol...|     910|
|AT & T Bell Labor...|     975|
|Cornell Univ., It...|    1040|
|University of Mar...|    1105|
|Laboratoire de Ps...|    1170|
|Yale Univ., New H...|    1235|
|                 -;-|    1300|
+--------------------+--------+
only showing top 20 rows



In [None]:
### strip leading/trailing whitespace; paper_id is cast back to Integer because trim() yields a string
affiliation_df = (
    affiliation_df
    .withColumn("affiliations", F.trim(F.col("affiliations")))
    .withColumn("paper_id", F.trim(F.col("paper_id")).cast(IntegerType()))
)

affiliation_df.printSchema()
affiliation_df.show()

In [None]:
### check for null values in the affiliation dataframe: count NaN / null values per column
null_values_affiliations = affiliation_df.select(
    [F.count(F.when(F.isnan(c) | F.col(c).isNull(), c)).alias(c) for c in affiliation_df.columns]
)
### the frame is lazy; without show() the check is never executed or displayed
null_values_affiliations.show()

In [None]:
### This dataframe is only used to count papers per unique affiliation,
### so rows without an affiliation carry no information and are dropped.

affiliation_df = affiliation_df.dropna(how="any", subset=['affiliations'])

In [None]:
# Inspect affiliations containing '-' (the placeholder used for a missing affiliation).
# show() prints a bounded sample; collect() would pull every matching row onto the driver.
affiliation_df.filter(F.col('affiliations').contains('-')).show(20, False)

In [None]:
# ### check if affiliations are missing as well for the ids whose title was missing in paper_df
# for rows in affiliation_df.select("affiliations","paper_id").collect():
#     if rows[1] in null_paper_ids_list:
#         print(rows[0], rows[1])

In [None]:
### split the ';'-separated affiliations into one record per {paper_id; affiliation}
unique_affiliations_df = affiliation_df.select(
    "paper_id",
    F.explode(F.split("affiliations", ";")).alias("affiliation"),
)
unique_affiliations_df.show(20, False)
affiliation_df.show(20, False)

In [None]:
### "-" is the placeholder for a missing affiliation; such rows are useless for counting, so drop them.
### The pieces are trimmed first: splitting on ';' can leave surrounding spaces, which would let
### " -" slip through the filter and make " Univ X" count separately from "Univ X".
unique_affiliations_df = unique_affiliations_df.withColumn('affiliation', F.trim(F.col('affiliation')))
unique_affiliations_df = unique_affiliations_df.where(F.col('affiliation') != '-')

In [None]:
# Preview the {paper_id; affiliation} records after removing the '-' placeholders.
unique_affiliations_df.show()

In [None]:
### check for duplicate rows: {paper_id; affiliation} pairs that occur more than once
(
    unique_affiliations_df
    .groupBy('paper_id', 'affiliation')
    .count()
    .filter('count > 1')
    .orderBy(F.col('count').desc())
    .show()
)

In [None]:
### keep each {paper_id; affiliation} pair only once, since we need unique affiliations here
unique_affiliations_df = unique_affiliations_df.distinct()


In [None]:
# Number of unique {paper_id; affiliation} records left after cleaning.
unique_affiliations_df.count()

# Load and clean paper_authors

In [14]:
### Load paper_authors.csv into a DataFrame using the column definitions from the paper_authors schema file.
### Each row holds one paper_id and a ';'-separated list of author names (see cell output).
paper_author_df = createDFFromFileAndSchema(spark, f'{FILES_FOLDER}paper_authors.csv', f'{SCHEMAS_FOLDER}paper_authors.csv')
paper_author_df.show()

# Earlier inline version, kept for reference.
# NOTE(review): presumably what createDFFromFileAndSchema does internally - confirm in helpers.py.
# dtypes = pd.read_csv('./schemas/paper_authors.csv').to_records(index=False).tolist()
# print(dtypes)
# fields = [StructField(dtype[0], globals()[f'{dtype[1]}Type']()) for dtype in dtypes]
# schema = StructType(fields)
# paper_author_df = spark.read.option('header', 'true').csv('./assets/parsedData/paper_authors.csv', header=True, schema=schema)

File path: ./assets/parsedData/paper_authors.csv, schema path: ./schemas/paper_authors.csv
[('authors', 'String'), ('paper_id', 'Integer')]
+--------------------+--------+
|             authors|paper_id|
+--------------------+--------+
| K Devine;F J. Smith|      65|
|J Wolff von Guden...|     130|
|J. K. Reid;A. Jen...|     195|
|William G. Golson...|     260|
|    Stein Schjolberg|     325|
|W Ian Gasarch;Ste...|     390|
|Sam Toueg;Özalp B...|     455|
|Frederick H. Dill...|     520|
|A. R. Calderbank;...|     585|
|         Uzi Vishkin|     650|
|      Stephen S. Yau|     715|
|Michael D. Schroe...|     780|
|         S L. Graham|     845|
|D Maio;M R. Scala...|     910|
|         Pamela Zave|     975|
|G. Salton;E. Voor...|    1040|
|Douglas D. Dunlop...|    1105|
|Patrick Peruch;Vi...|    1170|
| Robert J. Sternberg|    1235|
|Curtis Roads;John...|    1300|
+--------------------+--------+
only showing top 20 rows



In [None]:
### strip leading/trailing whitespace; paper_id is cast back to Integer because trim() yields a string
paper_author_df = (
    paper_author_df
    .withColumn("authors", F.trim(F.col("authors")))
    .withColumn("paper_id", F.trim(F.col("paper_id")).cast(IntegerType()))
)

paper_author_df.show()
paper_author_df.printSchema()

In [None]:
### Replace accented letters in the author list with their plain base letter.
### Spark's translate() maps matching[i] -> replace[i] and DELETES every character of `matching`
### that has no counterpart in `replace`. The replacement is therefore repeated once per accented
### character; with a single-letter replacement only the first accent of each group was mapped
### and all the others were removed from the names.
### NOTE: author_df.name must be normalised with the same mapping, because the two
### dataframes are later joined on the author name.
### NOTE(review): lowercase o variants (e.g. ö, ó) and ÿ are not covered by the groups below.
accent_groups = [
    ('íîìïīį', 'i'),
    ('ÎÏÍĪĮÌ', 'I'),
    ('àáâäæãåā', 'a'),
    ('ÀÁÂÄÆÃÅĀ', 'A'),
    ('èéêëēėę', 'e'),
    ('ÈÉÊËĒĖĘ', 'E'),
    ('ûüùúū', 'u'),
    ('ÛÜÙÚŪ', 'U'),
    ('ÔÖÒÓŒØŌÕ', 'O'),
    ('Ÿ', 'Y'),
]
accented_chars = ''.join(accented for accented, _ in accent_groups)
plain_chars = ''.join(plain * len(accented) for accented, plain in accent_groups)
paper_author_df = paper_author_df.withColumn('authors', F.translate('authors', accented_chars, plain_chars))

In [None]:
### count the NaN / null values of every column in the paper-author dataframe
null_values_paper_authors = paper_author_df.select(
    [F.count(F.when(F.isnan(c) | F.col(c).isNull(), c)).alias(c) for c in paper_author_df.columns]
)
### the frame is lazy; without show() the check is never executed or displayed
null_values_paper_authors.show()

In [None]:
### check if authors are missing as well for the ids whose title was missing in paper_df
###for rows in paper_author_df.select("authors","paper_id").collect():
###    if rows[1] in null_paper_ids_list:
###        print(rows[0], rows[1])

In [None]:
### split the ';'-separated authors into one record per {paper_id; author}
unique_paper_author_df = paper_author_df.select(
    "paper_id",
    F.explode(F.split("authors", ";")).alias("author"),
)
unique_paper_author_df.show(20, False)
paper_author_df.show(20, False)

In [None]:
### strip leading/trailing whitespace from the split author names;
### paper_id is cast back to Integer because trim() yields a string
unique_paper_author_df = (
    unique_paper_author_df
    .withColumn("author", F.trim(F.col("author")))
    .withColumn("paper_id", F.trim(F.col("paper_id")).cast(IntegerType()))
)


In [None]:
### remove special characters from the author name: " ; : { } ~ /
### One character class replaces the nine separate passes (three of which repeated '{').
### The raw string avoids the invalid escape sequences ('\}', '\~', ...) of the old patterns.
unique_paper_author_df = unique_paper_author_df.withColumn(
    'author', F.regexp_replace('author', r'[";:{}~/]', '')
)

In [None]:
# Verify the column types after trimming and casting.
unique_paper_author_df.printSchema()

In [None]:
### check for duplicate rows: {paper_id; author} pairs that occur more than once
(
    unique_paper_author_df
    .groupBy('paper_id', 'author')
    .count()
    .filter('count > 1')
    .orderBy(F.col('count').desc())
    .show()
)

In [None]:
### keep each {paper_id; author} pair only once, since we need a unique paper-author relation
unique_paper_author_df = unique_paper_author_df.distinct()

In [None]:
# Preview the cleaned paper-author relation without truncating long names.
unique_paper_author_df.show(truncate=False)

# Load and clean Publication_venues df

In [9]:
### Load publication_venues.csv into a DataFrame using the column definitions from the publication_venues schema file.

publication_venue_df = createDFFromFileAndSchema(spark, f'{FILES_FOLDER}publication_venues.csv', f'{SCHEMAS_FOLDER}publication_venues.csv')
publication_venue_df.show()

# Earlier inline version, kept for reference.
# NOTE(review): presumably what createDFFromFileAndSchema does internally - confirm in helpers.py.
# dtypes = pd.read_csv('./schemas/publication_venues.csv').to_records(index=False).tolist()
# print(dtypes)
# fields = [StructField(dtype[0], globals()[f'{dtype[1]}Type']()) for dtype in dtypes]
# schema = StructType(fields)
# publication_venue_df = spark.read.option('header', 'true').csv('./assets/parsedData/publication_venues.csv', header=True, schema=schema)

File path: ./assets/parsedData/publication_venues.csv, schema path: ./schemas/publication_venues.csv
[('paper_id', 'Integer'), ('publication_venue', 'String')]
+--------+--------------------+
|paper_id|   publication_venue|
+--------+--------------------+
|      65|Information Techn...|
|     130|Proc. of the symp...|
|     195|ACM Transactions ...|
|     260|Information and C...|
|     325|Computers and pen...|
|     390|Information and C...|
|     455|SIAM Journal on C...|
|     520|IBM Journal of Re...|
|     585|Journal of the AC...|
|     650|Theoretical Compu...|
|     715|            Computer|
|     780|ACM Transactions ...|
|     845|Methods and tools...|
|     910|Information Proce...|
|     975|ACM Transactions ...|
|    1040|Information Proce...|
|    1105|ACM Transactions ...|
|    1170|Proc. of the 2nd ...|
|    1235|Proc. of the inte...|
|    1300|Foundations of co...|
+--------+--------------------+
only showing top 20 rows



In [None]:
### strip leading/trailing whitespace; paper_id is cast back to Integer because trim() yields a string
publication_venue_df = (
    publication_venue_df
    .withColumn("publication_venue", F.trim(F.col("publication_venue")))
    .withColumn("paper_id", F.trim(F.col("paper_id")).cast(IntegerType()))
)
publication_venue_df.show()

In [None]:
### count the NaN / null values of every column in the publication venue dataframe
null_values_publication_venue = publication_venue_df.select(
    [F.count(F.when(F.isnan(c) | F.col(c).isNull(), c)).alias(c) for c in publication_venue_df.columns]
)
### the frame is lazy; without show() the check is never executed or displayed
null_values_publication_venue.show()

In [None]:
# Show the papers that have no publication venue.
publication_venue_df.where(F.col('publication_venue').isNull()).show()

# Load and clean Citations df

In [10]:
### Load citations.csv into a DataFrame using the column definitions from the citations schema file.
### Each row holds one paper_id and a ';'-separated list of referenced paper ids, or null (see cell output).

citation_df = createDFFromFileAndSchema(spark, f'{FILES_FOLDER}citations.csv', f'{SCHEMAS_FOLDER}citations.csv')
citation_df.show()

# Earlier inline version, kept for reference.
# NOTE(review): presumably what createDFFromFileAndSchema does internally - confirm in helpers.py.
# dtypes = pd.read_csv('./schemas/citations.csv').to_records(index=False).tolist()
# print(dtypes)
# fields = [StructField(dtype[0], globals()[f'{dtype[1]}Type']()) for dtype in dtypes]
# schema = StructType(fields)
# citation_df = spark.read.option('header', 'true').csv('./assets/parsedData/citations.csv', header=True, schema=schema)

File path: ./assets/parsedData/citations.csv, schema path: ./schemas/citations.csv
[('paper_id', 'Integer'), ('ref_ids', 'String')]
+--------+--------------------+
|paper_id|             ref_ids|
+--------+--------------------+
|      65|                null|
|     130|                null|
|     195|317424;317425;317573|
|     260|                null|
|     325|                null|
|     390|                null|
|     455|                null|
|     520|       318368;323493|
|     585|                null|
|     650|                null|
|     715|                null|
|     780|318420;319233;319...|
|     845|                null|
|     910|                null|
|     975|67604;318882;3718...|
|    1040|                null|
|    1105|289087;318014;318...|
|    1170|                null|
|    1235|                null|
|    1300|                null|
+--------+--------------------+
only showing top 20 rows



In [None]:
# Preview the loaded citations before cleaning.
citation_df.show()

In [None]:
### strip leading/trailing whitespace; paper_id is cast back to Integer because trim() yields a string
citation_df = (
    citation_df
    .withColumn("ref_ids", F.trim(F.col("ref_ids")))
    .withColumn("paper_id", F.trim(F.col("paper_id")).cast(IntegerType()))
)
citation_df.show()

In [None]:
### check for duplicate rows: {paper_id; ref_ids} pairs that occur more than once
(
    citation_df
    .groupBy('paper_id', 'ref_ids')
    .count()
    .filter('count > 1')
    .orderBy(F.col('count').desc())
    .show()
)

In [None]:
### split the ';'-separated references into one record per {paper_id; ref_id}
unique_citation_df = citation_df.select(
    "paper_id",
    F.explode(F.split("ref_ids", ";")).alias("ref_id"),
)
unique_citation_df.show(20, False)
citation_df.show(20, False)

In [None]:
### strip leading/trailing whitespace and restore the Integer type of both id columns.
### trim() yields strings; previously only ref_id was cast back, which left paper_id as a string
### and inconsistent with the Integer paper_id of the other dataframes it is joined with.
unique_citation_df = (
    unique_citation_df
    .withColumn("ref_id", F.trim(F.col("ref_id")).cast(IntegerType()))
    .withColumn("paper_id", F.trim(F.col("paper_id")).cast(IntegerType()))
)

In [None]:
# Verify the column types after trimming and casting.
unique_citation_df.printSchema()

In [None]:
### check for duplicate rows: {paper_id; ref_id} pairs that occur more than once
(
    unique_citation_df
    .groupBy('paper_id', 'ref_id')
    .count()
    .filter('count > 1')
    .orderBy(F.col('count').desc())
    .show()
)

# Load and clean Author df

In [11]:
### Load authors.csv into a DataFrame using the column definitions from the author schema file.
### Columns: author_id, citation_count, h_index, name, paper_count (see cell output).

author_df = createDFFromFileAndSchema(spark, f'{FILES_FOLDER}authors.csv', f'{SCHEMAS_FOLDER}author.csv')
author_df.show()

# Earlier inline version, kept for reference.
# NOTE(review): presumably what createDFFromFileAndSchema does internally - confirm in helpers.py.
# dtypes = pd.read_csv('./schemas/author.csv').to_records(index=False).tolist()
# print(dtypes)
# fields = [T.StructField(dtype[0], globals()[f'{dtype[1]}Type']()) for dtype in dtypes]
# schema = StructType(fields)
# author_df = spark.read.option('header', 'true').csv('./assets/parsedData/authors.csv', header=True, schema=schema)

File path: ./assets/parsedData/authors.csv, schema path: ./schemas/author.csv
[('author_id', 'Integer'), ('citation_count', 'Integer'), ('h_index', 'Integer'), ('name', 'String'), ('paper_count', 'Integer')]
+---------+--------------+-------+--------------------+-----------+
|author_id|citation_count|h_index|                name|paper_count|
+---------+--------------+-------+--------------------+-----------+
|       17|             0|      0|     J. Michael Howe|          1|
|       34|             0|      0|        Haitham Gabr|          2|
|       51|             4|      1|         Emma Tonkin|          8|
|       68|             1|      1|        Woochul Shin|          4|
|       85|             0|      0|           S Improta|          1|
|      102|             8|      2|       Richard Ferri|          5|
|      119|             0|      0|            Qing Liu|          1|
|      136|             0|      0|      Artur Gramacki|          2|
|      153|             0|      0|Olumuyiwa 

In [None]:
# Inspect the column types of the loaded author dataframe.
author_df.printSchema()

In [None]:
### strip leading/trailing whitespace from every column value
for column_name in ("author_id", "citation_count", "h_index", "name", "paper_count"):
    author_df = author_df.withColumn(column_name, F.trim(F.col(column_name)))

In [None]:
### trim() yields strings, so cast the numeric columns back to Integer
for numeric_column in ("author_id", "citation_count", "h_index", "paper_count"):
    author_df = author_df.withColumn(numeric_column, F.col(numeric_column).cast(IntegerType()))

In [None]:
# Preview 20 authors without truncating long values.
author_df.show(20,False)

In [None]:
### check for nonsense null data: count the NaN / null values of every column
missing_value_counts = [
    F.count(F.when(F.isnan(c) | F.col(c).isNull(), c)).alias(c)
    for c in author_df.columns
]
null_values_author_df = author_df.select(missing_value_counts)
null_values_author_df.show()

In [None]:
### Decision: drop the rows whose author name is missing (2 authors).
### The precomputed paper_count and citation_count can only be validated when the author name is known.

author_df = author_df.dropna(how="any", subset=['name'])

In [None]:
# Re-check the column types after dropping the rows without a name.
author_df.printSchema()

In [None]:
### replace missing paper_count, citation_count and h_index with 0 (affects just one author)
author_df = author_df.na.fill(value=0, subset=['paper_count', 'citation_count', 'h_index'])

In [None]:
### re-run the NaN / null count per column to confirm the clean-up above
missing_value_counts = [
    F.count(F.when(F.isnan(c) | F.col(c).isNull(), c)).alias(c)
    for c in author_df.columns
]
null_values_author_df = author_df.select(missing_value_counts)
null_values_author_df.show()

In [None]:
### We noticed that there are many similar names using different symbols/characters like the example below {Antonio García, Antonio Garcia}
author_df.filter(author_df['name'].like("%Ö%")).show(20,False)

### Replace accented letters like í, â, é with their plain base letter.
### Spark's translate() maps matching[i] -> replace[i] and DELETES every character of `matching`
### that has no counterpart in `replace`. The replacement is therefore repeated once per accented
### character; with a single-letter replacement only the first accent of each group was mapped
### and all the others were removed from the names.
### NOTE: the author names in paper_author_df must be normalised with the same mapping, because
### the two dataframes are later joined on the author name.
### NOTE(review): lowercase o variants (e.g. ö, ó) and ÿ are not covered by the groups below.
from pyspark.sql.functions import regexp_replace
accent_groups = [
    ('íîìïīį', 'i'),
    ('ÎÏÍĪĮÌ', 'I'),
    ('àáâäæãåā', 'a'),
    ('ÀÁÂÄÆÃÅĀ', 'A'),
    ('èéêëēėę', 'e'),
    ('ÈÉÊËĒĖĘ', 'E'),
    ('ûüùúū', 'u'),
    ('ÛÜÙÚŪ', 'U'),
    ('ÔÖÒÓŒØŌÕ', 'O'),
    ('Ÿ', 'Y'),
]
accented_chars = ''.join(accented for accented, _ in accent_groups)
plain_chars = ''.join(plain * len(accented) for accented, plain in accent_groups)
author_df = author_df.withColumn('name', F.translate('name', accented_chars, plain_chars))


In [None]:
### remove special characters from the author name: " ; : { } ~ /
### One character class replaces the nine separate passes (three of which repeated '{').
### The raw string avoids the invalid escape sequences ('\}', '\~', ...) of the old patterns.
author_df = author_df.withColumn('name', F.regexp_replace('name', r'[";:{}~/]', ''))

In [None]:
### check for duplicate author names: names that appear on more than one author row
### show() is required; a bare DataFrame expression only prints its schema, not the rows
author_df.groupby(['name']).count().where('count > 1').sort('count', ascending=False).show()

In [None]:
### Collapse rows that share the same author name into one row per name.
### The precomputed counts of such rows are averaged and rounded.
grouped_author_duplicates_df = author_df.groupBy('name')
averaged_author_metrics = [
    F.round(F.avg(metric)).alias(metric)
    for metric in ("paper_count", "citation_count", "h_index")
]
unique_authors_df = grouped_author_duplicates_df.agg(*averaged_author_metrics)

In [None]:
# Number of distinct author names after merging rows with the same name.
unique_authors_df.count()

# Load and clean Research_interests df

In [12]:
### Load research_interests.csv into a DataFrame using the column definitions from the research_interests schema file.
### Each row holds one author_id and a ';'-separated list of research interests (see cell output).

research_interests_df = createDFFromFileAndSchema(spark, f'{FILES_FOLDER}research_interests.csv', f'{SCHEMAS_FOLDER}research_interests.csv')
research_interests_df.show()

# Earlier inline version, kept for reference.
# NOTE(review): presumably what createDFFromFileAndSchema does internally - confirm in helpers.py.
# dtypes = pd.read_csv('./schemas/research_interests.csv').to_records(index=False).tolist()
# print(dtypes)
# fields = [T.StructField(dtype[0], globals()[f'{dtype[1]}Type']()) for dtype in dtypes]
# schema = StructType(fields)
# research_interests_df = spark.read.option('header', 'true').csv('./assets/parsedData/research_interests.csv', header=True, schema=schema)

File path: ./assets/parsedData/research_interests.csv, schema path: ./schemas/research_interests.csv
[('author_id', 'Integer'), ('research_interests', 'String')]
+---------+--------------------+
|author_id|  research_interests|
+---------+--------------------+
|       17|HIV disease;Inter...|
|       34|associate polynom...|
|       51|metadata element;...|
|       68|Web Service;conte...|
|       85|intermediate key;...|
|      102|feedback loop;dif...|
|      119|Rough Set;nomal C...|
|      136|MATLAB toolbox;li...|
|      153|Byzantine agreeme...|
|      170|Ein objektorienti...|
|      187|portable device;A...|
|      204|Integer-valued pr...|
|      221|stock price;stock...|
|      238|Hypermedia Synchr...|
|      255|computer-mediated...|
|      272|Dijkstra method;o...|
|      289|low-frequency act...|
|      306|copyright process...|
|      323|uncertain informa...|
|      340|histology image;s...|
+---------+--------------------+
only showing top 20 rows



In [None]:
### strip leading/trailing whitespace; author_id is cast back to Integer because trim() yields a string
research_interests_df = (
    research_interests_df
    .withColumn("author_id", F.trim(F.col("author_id")).cast(IntegerType()))
    .withColumn("research_interests", F.trim(F.col("research_interests")))
)

research_interests_df.printSchema()
research_interests_df.show()

In [None]:
### count the NaN / null values of every column in the research interests dataframe
missing_value_counts = [
    F.count(F.when(F.isnan(c) | F.col(c).isNull(), c)).alias(c)
    for c in research_interests_df.columns
]
research_interests_df.select(missing_value_counts).show()
### drop rows without research interests since we dont need them for any computation
research_interests_df = research_interests_df.dropna(how="any", subset=['research_interests'])


In [None]:
### split the ';'-separated research interests into one record per {author_id; research_interest}
unique_research_interests_df = research_interests_df.select(
    "author_id",
    F.explode(F.split("research_interests", ";")).alias("research_interest"),
)
unique_research_interests_df.show(20, False)
research_interests_df.show(20, False)


In [None]:
### strip leading/trailing whitespace from the split values.
### The exploded column is named `research_interest` (singular); the previous reference to
### `research_interests` does not exist on this dataframe and made the cell fail.
### author_id is cast back to Integer because trim() yields a string.
unique_research_interests_df = (
    unique_research_interests_df
    .withColumn("author_id", F.trim(F.col("author_id")).cast(IntegerType()))
    .withColumn("research_interest", F.trim(F.col("research_interest")))
)

In [None]:
# unique_research_interests_df.show()

In [None]:
# ### check for duplicate rows:
# unique_research_interests_df.groupby(['author_id', 'research_interest']).count().where('count > 1').sort('count', ascending=False).show()
# ### drop duplicates
# unique_research_interests_df=unique_research_interests_df.dropDuplicates()


In [None]:
# unique_research_interests_df.show()

# Run Queries

### Q1.2 Compute paper count per unique affiliation

In [None]:
## Fact table: number of {paper_id; affiliation} records per affiliation
unique_affiliations_with_paper_count_df = (
    unique_affiliations_df
    .groupBy('affiliation')
    .count()
    .withColumnRenamed("count", "papers_count")
)
# show() prints the table itself and returns None; wrapping it in print() added a stray "None" line
unique_affiliations_with_paper_count_df.show()

### Q1.1 Validate precomputed paper counts, citation (ref) counts and h-indexes (per author)

#### How to compute h-index for a specific author
1. Retrieve all publications of the author (in unique_paper_author_df)
2. Calculate the number of references per publication
3. Sort the results in descending order
4. Find the largest N such that the author's top N publications each have at least N references. This N is the h-index of the author.


In [None]:
# Calculate the number of references per publication (rows of unique_citation_df per paper_id)
refs_per_paper_count_df = (
    unique_citation_df
    .groupBy("paper_id")
    .count()
    .withColumnRenamed("count", "paper_references")
)
# show() prints the table itself and returns None; wrapping it in print() added a stray "None" line
refs_per_paper_count_df.show()

In [None]:
# Join [papers per author] with [references per paper] and sort the results in descending order.
# A LEFT join keeps papers that have no reference rows (their ref_ids were null, so explode()
# produced no record for them); they get paper_references = 0. With the previous inner join
# those papers were dropped, so the per-author paper count computed later was too low.
author_papers_with_ref_count = (
    unique_paper_author_df
    .join(refs_per_paper_count_df, 'paper_id', 'left')
    .na.fill(0, subset=['paper_references'])
    .sort(F.col("paper_references").desc())
)
# show() prints the table itself and returns None; wrapping it in print() added a stray "None" line
author_papers_with_ref_count.show()

In [None]:
# Rank each author's papers by reference count, highest first (paper_id breaks ties),
# and store the 1-based position in the `index` column.
window = (
    Window
    .partitionBy(F.col('author'))
    .orderBy(F.col("paper_references").desc(), F.col("paper_id").desc())
)
indexed_grouped_papers_df = author_papers_with_ref_count.withColumn('index', F.rank().over(window))

In [None]:
# A paper at position `index` is an h-index candidate when it has at least `index` references;
# papers that do not qualify get 0, so the per-author maximum is the h-index.
h_index_candidate = F.when(F.col("index") <= F.col("paper_references"), F.col("index")).otherwise(0)
h_indexed_papers = indexed_grouped_papers_df.withColumn("possible_h_index", h_index_candidate)
# show() prints the table itself and returns None; wrapping it in print() added a stray "None" line
h_indexed_papers.show(100, False)

In [None]:
# Group the ranked papers by author; the aggregation is done in the next cell.
h_indexed_grouped_by_author_papers_df = h_indexed_papers.groupBy('author')

In [None]:
# Per author: number of papers, total number of references and the h-index
# (the largest h-index candidate among the author's papers).
author_aggregations = [
    F.count("paper_id").alias("computed_paper_count"),
    F.sum("paper_references").alias("computed_citation_count"),
    F.max("possible_h_index").alias("computed_h_index"),
]
h_indexed_aggregated_papers_df = h_indexed_grouped_by_author_papers_df.agg(*author_aggregations)

In [None]:
# Rename `author` to `name` so the computed values can be joined with unique_authors_df on `name`.
h_indexed_aggregated_papers_df = h_indexed_aggregated_papers_df.withColumnRenamed("author", "name")

In [None]:
## Put the precomputed author metrics next to the values computed from the paper dataset.
## Inner join on the author name: only authors present in both dataframes are kept.
unique_authors_with_validated_cols_df = unique_authors_df.join(
    h_indexed_aggregated_papers_df,
    on='name',
    how='inner',
)

In [None]:
# Compare precomputed and computed values side by side.
# show() prints the table itself and returns None; wrapping it in print() added a stray "None" line
unique_authors_with_validated_cols_df.show(truncate=False)

# Save cleaned & computed data into the csv files

In [None]:
# Output folder for the cleaned / computed dataframes.
CLEAN_DATA_FOLDER = './assets/cleanedDFsData/'


def saveDFIntoCSVFile(df, fileName, mode="overwrite"):
    """Preview a DataFrame and write it as a single-part CSV with a header row.

    Args:
        df: Spark DataFrame to persist.
        fileName: Output path. Spark creates a directory at this path containing one part file.
        mode: Spark save mode. Defaults to "overwrite" so that re-running the notebook
            replaces earlier output instead of failing because the path already exists.
            Pass "errorifexists" to get the previous fail-if-present behaviour.
    """
    # show() prints the table itself and returns None, so it must not be wrapped in print()
    df.show()
    # coalesce(1) collapses the data into one partition so that a single CSV part file is written.
    # The built-in csv writer replaces the legacy "com.databricks.spark.csv" format name.
    df.coalesce(1).write.mode(mode).option("header", "true").csv(fileName)

In [None]:
# Save the cleaned papers (paper_df) to papers.csv under CLEAN_DATA_FOLDER
saveDFIntoCSVFile(paper_df, f'{CLEAN_DATA_FOLDER}papers.csv')

In [None]:
# Save the cleaned, de-duplicated {paper_id; author} relation to paper_author.csv
saveDFIntoCSVFile(unique_paper_author_df, f'{CLEAN_DATA_FOLDER}paper_author.csv')

In [None]:
# Save the unique affiliations together with their computed papers_count to affiliations.csv
saveDFIntoCSVFile(unique_affiliations_with_paper_count_df, f'{CLEAN_DATA_FOLDER}affiliations.csv')

In [None]:
# Save the cleaned publication venues to publication_venues.csv
saveDFIntoCSVFile(publication_venue_df, f'{CLEAN_DATA_FOLDER}publication_venues.csv')

In [None]:
# Save the cleaned {paper_id; ref_id} citation records to citations.csv
saveDFIntoCSVFile(unique_citation_df, f'{CLEAN_DATA_FOLDER}citations.csv')

In [None]:
# Save the unique authors with both precomputed and computed metrics to authors.csv
saveDFIntoCSVFile(unique_authors_with_validated_cols_df, f'{CLEAN_DATA_FOLDER}authors.csv')

In [None]:
# Save the {author_id; research_interest} records to research_interests.csv
saveDFIntoCSVFile(unique_research_interests_df, f'{CLEAN_DATA_FOLDER}research_interests.csv')