In [1]:
from pyspark.sql import SparkSession

In [2]:
import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql.functions import *
import pyspark.sql.types as T
from pyspark.sql.types import IntegerType
from pyspark.sql.window import Window
from helpers import createDFFromFileAndSchema, clean_special_letters, clean_special_character

In [3]:
spark=SparkSession.builder.appName('read data through spark').getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/01/20 19:18:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
spark

In [5]:
SCHEMAS_FOLDER = './schemas/'
FILES_FOLDER = './assets/parsedData/'

# Load and clean Paper DF

In [6]:
### load paper into schema
paper_df = createDFFromFileAndSchema(spark, f'{FILES_FOLDER}papers.csv', f'{SCHEMAS_FOLDER}paper.csv')
paper_df.show()

File path: ./assets/parsedData/papers.csv, schema path: ./schemas/paper.csv
Types from schema: [('paper_id', 'Integer'), ('title', 'String'), ('year', 'Integer')]


[Stage 0:>                                                          (0 + 1) / 1]

+--------+--------------------+----+
|paper_id|               title|year|
+--------+--------------------+----+
|      65|Direct file organ...|1984|
|     130|An introduction t...|1983|
|     195|On solving almost...|1984|
|     260|Connections betwe...|1984|
|     325|Computers and pen...|1984|
|     390|Relativizations c...|1984|
|     455|On the optimum ch...|1984|
|     520|All points addres...|1984|
|     585|Optimum Head Sepa...|1984|
|     650|A parallel-design...|1984|
|     715|Computer - IEEE C...|1984|
|     780|Experience with G...|1984|
|     845|Code generation a...|1984|
|     910|On estimating acc...|1984|
|     975|A distributed alt...|1985|
|    1040|A comparison of t...|1984|
|    1105|Generalizing spec...|1985|
|    1170|Real time graphic...|1984|
|    1235|Common and uncomm...|1984|
|    1300|Foundations of co...|1985|
+--------+--------------------+----+
only showing top 20 rows



                                                                                

In [7]:
### data cleaning for paper schema

### remove spaces from values of the columns
paper_df = paper_df.withColumn("paper_id", trim(paper_df.paper_id))
paper_df = paper_df.withColumn("title", trim(paper_df.title))
paper_df = paper_df.withColumn("year", trim(paper_df.year))

In [8]:
### check for the data types
paper_df.printSchema()
### change the data type of year to Integer
paper_df = paper_df.withColumn("year",paper_df["year"].cast(IntegerType()))

root
 |-- paper_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: string (nullable = true)



In [9]:
### check for nonsense null data
null_values_paper_df = paper_df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in paper_df.columns]
   )
### save the ids of papers whose title is missing to clean up the other dataframes
null_paper_ids = paper_df.filter(paper_df['title'].isNull())
null_paper_ids_list=null_paper_ids.select('paper_id').rdd.flatMap(lambda x: x).collect()
null_paper_ids_list = [int(item) for item in null_paper_ids_list]

                                                                                

In [10]:
### after checking the below dataframes, all papers whose title is missing have the authors besides paper_id = 748056
### decision: fill missing titles with : Missing Title

paper_df=paper_df.na.fill('Missing Title', ['title'])

In [11]:
### remove special characters
paper_df=clean_special_character(paper_df,'title')

In [12]:
paper_df.columns

['paper_id', 'title', 'year']

In [13]:
### check if there are duplicate rows
paper_df.join(paper_df.groupBy(paper_df.columns).agg((F.count("*")>1).cast("int").alias("Duplicate_indicator")),
on=paper_df.columns,how="inner").show()
###there are no duplicates
paper_df.groupby(['paper_id']).count().where('count > 1').sort('count', ascending=False).show()

22/01/20 19:19:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:19:26 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:19:26 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
                                                                                

+--------+--------------------+----+-------------------+
|paper_id|               title|year|Duplicate_indicator|
+--------+--------------------+----+-------------------+
|      10|The Three-Machine...|1984|                  0|
|     100|A structured appr...|1983|                  0|
| 1000003|Allocation tolera...|2007|                  0|
| 1000006|Numerical and exp...|2007|                  0|
|  100001|The operation and...|1991|                  0|
| 1000012|An accurate and f...|2007|                  0|
| 1000015|Increasing the ef...|2007|                  0|
| 1000023|Modeling and line...|2007|                  0|
| 1000035|Asymmetric pricin...|2007|                  0|
| 1000041|Practical conside...|2007|                  0|
| 1000049|A data simulation...|2007|                  0|
| 1000052|An integrated met...|2007|                  0|
| 1000054|Towards a grid si...|2007|                  0|
| 1000056|Shape efficiency ...|2007|                  0|
|  100006|Implications of n...|

[Stage 9:>                                                          (0 + 4) / 4]

+--------+-----+
|paper_id|count|
+--------+-----+
+--------+-----+



                                                                                

In [14]:
paper_df.count()

                                                                                

2092356

In [15]:
# paper_df.filter(unique_paper_author_df['title'].like("%%")).show(20,False)

# Load and clean paper_authors

In [16]:
### load paper_authors into schema
paper_author_df = createDFFromFileAndSchema(spark, f'{FILES_FOLDER}paper_authors.csv', f'{SCHEMAS_FOLDER}paper_authors.csv')
paper_author_df.show()

File path: ./assets/parsedData/paper_authors.csv, schema path: ./schemas/paper_authors.csv
Types from schema: [('authors', 'String'), ('paper_id', 'Integer')]
+--------------------+--------+
|             authors|paper_id|
+--------------------+--------+
| K Devine;F J. Smith|      65|
|J Wolff von Guden...|     130|
|J. K. Reid;A. Jen...|     195|
|William G. Golson...|     260|
|    Stein Schjolberg|     325|
|W Ian Gasarch;Ste...|     390|
|Sam Toueg;Özalp B...|     455|
|Frederick H. Dill...|     520|
|A. R. Calderbank;...|     585|
|         Uzi Vishkin|     650|
|      Stephen S. Yau|     715|
|Michael D. Schroe...|     780|
|         S L. Graham|     845|
|D Maio;M R. Scala...|     910|
|         Pamela Zave|     975|
|G. Salton;E. Voor...|    1040|
|Douglas D. Dunlop...|    1105|
|Patrick Peruch;Vi...|    1170|
| Robert J. Sternberg|    1235|
|Curtis Roads;John...|    1300|
+--------------------+--------+
only showing top 20 rows



In [17]:
### remove leadind and trailing spaces
paper_author_df = paper_author_df.withColumn("authors", trim(paper_author_df.authors))
paper_author_df = paper_author_df.withColumn("paper_id", trim(paper_author_df.paper_id))

In [18]:
### change data type for paper_id to Integer
paper_author_df = paper_author_df.withColumn("paper_id",paper_author_df["paper_id"].cast(IntegerType()))


In [19]:
### remove special letters
paper_author_df=clean_special_letters(paper_author_df, 'authors')

In [20]:
###null_values_paper_authors=paper_author_df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in paper_author_df.columns])

### check if authors are missing as well for the ids whose title was missing in paper_df
###for rows in paper_author_df.select("authors","paper_id").collect():
###    if rows[1] in null_paper_ids_list:
###        print(rows[0], rows[1])

In [21]:
### split authors so we can have clean data and seperate records {paper_id; author}
unique_paper_author_df = paper_author_df.select(F.col("paper_id"), F.explode(F.split(F.col("authors"),";")).alias("author"))
unique_paper_author_df.show(20, False)
paper_author_df.show(20, False)

+--------+---------------------+
|paper_id|author               |
+--------+---------------------+
|65      |K Devine             |
|65      |F J. Smith           |
|130     |J Wolff von Gudenberg|
|195     |J. K. Reid           |
|195     |A. Jennings          |
|260     |William G. Golson    |
|260     |William C. Rounds    |
|325     |Stein Schjolberg     |
|390     |W Ian Gasarch        |
|390     |Steven Homer         |
|455     |Sam Toueg            |
|455     |zalp Babaoğlu        |
|520     |Frederick H. Dill    |
|520     |Satish Gupta         |
|520     |Daniel T. Ling       |
|520     |Richard E. Matick    |
|585     |A. R. Calderbank     |
|585     |E. G. Coffman, Jr.   |
|585     |L. Flatto            |
|650     |Uzi Vishkin          |
+--------+---------------------+
only showing top 20 rows

+---------------------------------------------------------------+--------+
|authors                                                        |paper_id|
+-------------------------------

In [22]:
### remove leadind and trailing spaces
unique_paper_author_df = unique_paper_author_df.withColumn("author", trim(unique_paper_author_df.author))
unique_paper_author_df = unique_paper_author_df.withColumn("paper_id", trim(unique_paper_author_df.paper_id))
### change data type for paper_id to Integer
unique_paper_author_df = unique_paper_author_df.withColumn("paper_id",unique_paper_author_df["paper_id"].cast(IntegerType()))


In [23]:
### remove special characters
unique_paper_author_df=clean_special_character(unique_paper_author_df, 'author')

In [24]:
### lowercase author-name
unique_paper_author_df=unique_paper_author_df.withColumn('author', lower(col('author')))

In [25]:
### check for duplicate rows:
unique_paper_author_df.groupby(['paper_id', 'author']).count().where('count > 1').sort('count', ascending=False).show()

### drop duplicate rows since here we need unique paper-author relation
unique_paper_author_df=unique_paper_author_df.dropDuplicates()

22/01/20 19:20:17 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:20:17 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:20:18 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:20:18 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:20:45 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:20:45 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:20:45 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:20:45 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:21:00 WARN RowBasedKeyValueBatch: Calling spill() on

+--------+--------------------+-----+
|paper_id|              author|count|
+--------+--------------------+-----+
| 1523221|        dongkun shin|    4|
| 2040206|        mingshu wang|    3|
| 2059316|       han chuanfeng|    3|
| 2040206|        anchun cheng|    3|
| 1202294|           n. sharma|    3|
| 2042230|             lu leng|    3|
| 1198797|    alladi venkatesh|    2|
| 1503841|   a. a. khoroshilov|    2|
| 1939475|        nicola ferro|    2|
|  128862|         yang w. lee|    2|
| 1423835|deogratias harori...|    2|
| 1440869|  mehdi khosrow-pour|    2|
| 1018879|       bruce spencer|    2|
|  390835|       jeff van west|    2|
| 1668760|       pradeep kumar|    2|
| 1900087|              cao yi|    2|
| 1900218|        dongqiang xu|    2|
| 2083391|      m. chidambaram|    2|
| 1167734| theodore a. bakalar|    2|
| 1423827|            xin geng|    2|
+--------+--------------------+-----+
only showing top 20 rows





In [26]:
### split author-name so we keep only the surname
unique_paper_author_cleaned_df = unique_paper_author_df.select(F.col("paper_id"), F.trim(F.element_at \
                    (F.split(F.col("author")," "),-1)).alias('name'))
unique_paper_author_cleaned_df.show()



+--------+---------+
|paper_id|     name|
+--------+---------+
|     470|soderlund|
|    1700|     ceri|
|    1821|     suri|
|    1895|     plum|
|    3259|      iii|
|    3523| cardelli|
|    3582| williams|
|    3717|silvester|
|    5083|prodinger|
|    5151|   verity|
|    7290|  swieten|
|    8388|     huet|
|    8581| stoddart|
|    9505| korfhage|
|   10149| phillips|
|   10610|     saad|
|   11188|     maio|
|   11391|    tracz|
|   11703|   landau|
|   11831|   reeves|
+--------+---------+
only showing top 20 rows



[Stage 21:>                                                         (0 + 1) / 1]                                                                                

In [27]:
unique_paper_author_df.show(truncate=False)



+--------+----------------------+
|paper_id|author                |
+--------+----------------------+
|470     |lars soderlund        |
|1700    |s. ceri               |
|1821    |rajan suri            |
|1895    |thomas plum           |
|3259    |george m. whitson, iii|
|3523    |luca cardelli         |
|3582    |martha e. williams    |
|3717    |p. p. silvester       |
|5083    |h prodinger           |
|5151    |john w verity         |
|7290    |a c m van swieten     |
|8388    |g huet                |
|8581    |bill stoddart         |
|9505    |r r korfhage          |
|10149   |t n phillips          |
|10610   |youcef saad           |
|11188   |d maio                |
|11391   |w tracz               |
|11703   |marie-claude landau   |
|11831   |thomas c reeves       |
+--------+----------------------+
only showing top 20 rows



[Stage 24:>                                                         (0 + 1) / 1]                                                                                

In [28]:
unique_paper_author_cleaned_df.count()

                                                                                

5237613

# Clean and Load Authors df

In [29]:
### load author into schema
author_df = createDFFromFileAndSchema(spark, f'{FILES_FOLDER}authors.csv', f'{SCHEMAS_FOLDER}author.csv')
author_df.show()

author_df.printSchema()

File path: ./assets/parsedData/authors.csv, schema path: ./schemas/author.csv
Types from schema: [('author_id', 'Integer'), ('citation_count', 'Integer'), ('h_index', 'Integer'), ('name', 'String'), ('paper_count', 'Integer')]
+---------+--------------+-------+--------------------+-----------+
|author_id|citation_count|h_index|                name|paper_count|
+---------+--------------+-------+--------------------+-----------+
|       17|             0|      0|     J. Michael Howe|          1|
|       34|             0|      0|        Haitham Gabr|          2|
|       51|             4|      1|         Emma Tonkin|          8|
|       68|             1|      1|        Woochul Shin|          4|
|       85|             0|      0|           S Improta|          1|
|      102|             8|      2|       Richard Ferri|          5|
|      119|             0|      0|            Qing Liu|          1|
|      136|             0|      0|      Artur Gramacki|          2|
|      153|             0

In [30]:
### remove spaces from values of the columns
author_df = author_df.withColumn("author_id", trim(author_df.author_id))
author_df = author_df.withColumn("citation_count", trim(author_df.citation_count))
author_df = author_df.withColumn("h_index", trim(author_df.h_index))
author_df = author_df.withColumn("name", trim(author_df.name))
author_df = author_df.withColumn("paper_count", trim(author_df.paper_count))

In [31]:

### change data type of author_id, paper_count, citation_count, h_index to Integer
author_df = author_df.withColumn("author_id",author_df["author_id"].cast(IntegerType()))
author_df = author_df.withColumn("citation_count",author_df["citation_count"].cast(IntegerType()))
author_df = author_df.withColumn("h_index",author_df["h_index"].cast(IntegerType()))
author_df = author_df.withColumn("paper_count",author_df["paper_count"].cast(IntegerType()))

In [32]:
### check for nonsense null data
null_values_author_df = author_df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in author_df.columns]
   )
null_values_author_df.show()

### Decided to drop rows whose author--name is missing (2 authors)
### At the moment we can evaluate precomputed paper_count and citation_count only if we have the author_names

### author_df=author_df.na.drop(how="any", subset=['name'])



+---------+--------------+-------+----+-----------+
|author_id|citation_count|h_index|name|paper_count|
+---------+--------------+-------+----+-----------+
|        0|             2|      2|   3|          2|
+---------+--------------+-------+----+-----------+



                                                                                

In [33]:
### fill empty paper_count, citation_count, h_index to 0   (just one author)
author_df=author_df.na.fill(value=0, subset='paper_count')
author_df=author_df.na.fill(value=0, subset='citation_count')
author_df=author_df.na.fill(value=0, subset='h_index')

In [34]:
### remove special characters like í, â, é
author_df=clean_special_letters(author_df, 'name')

In [35]:
### remove special characters
author_df=clean_special_character(author_df,'name')

In [36]:
### lowercase author-name
author_df=author_df.withColumn('name', lower(col('name')))

In [37]:
author_df.dropDuplicates().show()



+---------+--------------+-------+----------------+-----------+
|author_id|citation_count|h_index|            name|paper_count|
+---------+--------------+-------+----------------+-----------+
|     2465|             0|      0|  kamel lecheheb|          1|
|     4199|             0|      0|  nathan burrows|          1|
|     6018|             0|      0|leila jalalzadeh|          1|
|     6341|             0|      0|      v.b. singh|          3|
|     8942|             0|      0|       siqi song|          1|
|    17000|             0|      0|   xianzhong cui|          1|
|    19397|             0|      0|       m. pabrai|          1|
|    19958|             0|      0|    julien badie|          2|
|    21607|             1|      1| evgenij dashkov|          1|
|    24803|           480|     10| dinesh c. verma|         60|
|    25466|            34|      1|   r. verstappen|          7|
|    27251|             1|      1|      r. mannell|          1|
|    28526|             0|      0|  a. m

                                                                                

In [38]:
### check if there are duplicate author_ids
### author_df.groupby(['author_id']).count().where('count > 1').sort('count', ascending=False).show()

# grouped_author_duplicates_df=author_df.groupby(['name'])
# unique_authors_df=grouped_author_duplicates_df.agg(
#     round(F.avg("paper_count")).alias("paper_count"),
#     round(F.avg("citation_count")).alias("citation_count"),
#     round(F.avg("h_index")).alias("h_index"))

### unique_authors_df.count()

# Join author and paper dataframes to ensure consistency

Load and clean Author2Paper (from the supplement txt file)

In [39]:
### load file into schema
dtypes = pd.read_csv('./schemas/paper_author_id.csv').to_records(index=False).tolist()
print(dtypes)
fields = [T.StructField(dtype[0], globals()[f'{dtype[1]}Type']()) for dtype in dtypes]
schema = StructType(fields)
author_id_2_paper_id_df = spark.read.options(delimiter='\t').csv('./assets/AMiner-Author2Paper.txt', header=False,schema=schema)

[('index', 'Integer'), ('author_id', 'Integer'), ('paper_id', 'Integer'), ('author_position', 'Integer')]


In [40]:
author_id_2_paper_id_df.count()

                                                                                

5192998

In [41]:
### check for duplicates between paper_id and author_id
author_id_2_paper_id_df.groupby(['author_id', 'paper_id']).count().where('count > 1').sort('count', ascending=False).show(10,False)


22/01/20 19:24:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:24:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:24:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:24:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:24:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:24:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:24:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:24:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:25:03 WARN RowBasedKeyValueBatch: Calling spill() on

+---------+--------+-----+
|author_id|paper_id|count|
+---------+--------+-----+
+---------+--------+-----+



                                                                                

Join the suplementary df with author_df

In [42]:
author_id_2_paper_id_extended_df = author_id_2_paper_id_df.join(author_df, 'author_id', 'left').drop('index')

In [43]:
### split the author-name to keep only the surname ---> cleaning data
author_id_2_paper_id_cleaned_df = author_id_2_paper_id_extended_df.select(F.col("paper_id"), F.col("author_id"), \
                                F.col("citation_count"), F.col("h_index"), F.col("paper_count"),   \
                                F.trim(F.element_at(F.split(F.col("name")," "),-1)).alias('name'))

Join the cleaned paper_2_author_cleaned_df to unique_paper_author_cleaned_df

In [44]:
final_paper_author_id_df = unique_paper_author_cleaned_df.join(author_id_2_paper_id_cleaned_df, \
                                                               ['name', 'paper_id'], 'inner')

In [45]:
final_paper_author_id_df.dropDuplicates().count()

                                                                                

5114137

In [46]:
final_paper_author_id_df.show()

[Stage 78:>                                                         (0 + 1) / 1]

+-----------------+--------+---------+--------------+-------+-----------+
|             name|paper_id|author_id|citation_count|h_index|paper_count|
+-----------------+--------+---------+--------------+-------+-----------+
|            $#199| 1922579|   301421|             0|      0|          1|
|               %a|  678557|  1572031|             1|      1|          1|
|               %a|  678743|  1572147|             0|      0|          1|
|               %a|  678759|  1571593|             4|      1|          1|
|               %a|  678918|  1572148|             0|      0|          1|
|               %a|  678925|  1571591|             1|      1|          1|
|  &#193brah$#225m| 1745458|   525597|             0|      0|          1|
|       &#199aglar|  954357|   408677|             0|      0|          1|
|   &#199etintemel|  278973|  1085120|             0|      0|          1|
|&#214stergaringrd|  316468|   509919|             1|      1|          1|
|    &#214zen&#231| 1471484|  1282995|

                                                                                

# Load and clean Affiliations df

In [None]:
### load affiliation into schema
affiliation_df = createDFFromFileAndSchema(spark, f'{FILES_FOLDER}affiliations.csv', f'{SCHEMAS_FOLDER}affiliation.csv')
affiliation_df.show()

In [None]:
### remove leading and trailing spaces
affiliation_df = affiliation_df.withColumn("affiliations", trim(affiliation_df.affiliations))
affiliation_df = affiliation_df.withColumn("paper_id", trim(affiliation_df.paper_id))
affiliation_df = affiliation_df.withColumn("paper_id",affiliation_df["paper_id"].cast(IntegerType()))

affiliation_df.printSchema()
affiliation_df.show()

In [None]:
### check for null values in the affiliations column
###null_values_affiliations=affiliation_df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in affiliation_df.columns])

In [None]:
### This df is used to count papers per unique affiliation, so if the affiliation is missing, it doesnt make sense
### drop all rows where affiliation is null

affiliation_df=affiliation_df.na.drop(how="any", subset=['affiliations'])

In [None]:
### affiliation_df.filter(affiliation_df.affiliations.contains('-')).collect()

In [None]:
# ### check if affiliations are missing as well for the ids whose title was missing in paper_df
# for rows in affiliation_df.select("affiliations","paper_id").collect():
#     if rows[1] in null_paper_ids_list:
#         print(rows[0], rows[1])

In [None]:
### split affiliations so we can have clean data and seperate records {paper_id; affiliations}
unique_affiliations_df = affiliation_df.select(F.col("paper_id"), F.explode(F.split(F.col("affiliations"),";")).alias("affiliation"))
unique_affiliations_df.show(20, False)
affiliation_df.show(20, False)

In [None]:
### check for special nonsense characters "-", If the affiliation is missing, there is no point of keeping the rows
###unique_affiliations_df.filter(unique_affiliations_df.affiliations=='-').collect()
unique_affiliations_df=unique_affiliations_df.where(unique_affiliations_df.affiliation!='-')

In [None]:
unique_affiliations_df.show()

In [None]:
### check for duplicate rows:
unique_affiliations_df.groupby(['paper_id', 'affiliation']).count().where('count > 1').sort('count', ascending=False).show()

In [None]:
### drop duplicate rows since here we need unique affiliations
unique_affiliations_df=unique_affiliations_df.dropDuplicates()


In [None]:
### remove special characters
unique_affiliations_df=clean_special_character(unique_affiliations_df,'affiliation')
### remove special letters
unique_affiliations_df=clean_special_letters(unique_affiliations_df, 'affiliation')

In [None]:
unique_affiliations_df=unique_affiliations_df.withColumn('affiliation', lower(col('affiliation')))


In [None]:
### check for duplicate rows:
unique_affiliations_df.groupby(['paper_id', 'affiliation']).count().where('count > 1').sort('count', ascending=False).show(truncate=False)

In [None]:
unique_affiliations_df.count()

In [None]:
### drop duplicates
unique_affiliations_df=unique_affiliations_df.dropDuplicates()

In [None]:
### Filter out paper_ids that are not part of the final_paper_author_id_df

final_affiliations_df = unique_affiliations_df.join(final_paper_author_id_df, \
                            ['paper_id'], 'inner').select(F.col('paper_id'), F.col('affiliation')).dropDuplicates()

# Load and clean Publication_venues df

In [None]:
### load publication_venues into schema
publication_venue_df = createDFFromFileAndSchema(spark, f'{FILES_FOLDER}publication_venues.csv', f'{SCHEMAS_FOLDER}publication_venues.csv')
publication_venue_df.show()

In [None]:
publication_venue_df = publication_venue_df.withColumn("publication_venue", trim(publication_venue_df.publication_venue))
publication_venue_df = publication_venue_df.withColumn("paper_id", trim(publication_venue_df.paper_id))
publication_venue_df = publication_venue_df.withColumn("paper_id",publication_venue_df["paper_id"].cast(IntegerType()))
publication_venue_df.show()

In [None]:
null_values_publication_venue=publication_venue_df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in publication_venue_df.columns]
   )

In [None]:
publication_venue_df.filter(publication_venue_df['publication_venue'].isNull()).show()
### drop null values 
publication_venue_df=publication_venue_df.na.drop(how="any", subset=['publication_venue'])

In [None]:
## check for duplicate rows
## publication_venue_df.groupby(['paper_id', 'publication_venue']).count().where('count > 1').sort('count', ascending=False).show()

In [None]:
final_publication_venues_df = publication_venue_df.join(final_paper_author_id_df, \
                            ['paper_id'], 'inner').select(F.col('paper_id'), F.col('publication_venue')).dropDuplicates()

# Load and clean Citations df

In [47]:
### load affiliation into schema
citation_df = createDFFromFileAndSchema(spark, f'{FILES_FOLDER}citations.csv', f'{SCHEMAS_FOLDER}citations.csv')
citation_df.show()

File path: ./assets/parsedData/citations.csv, schema path: ./schemas/citations.csv
Types from schema: [('paper_id', 'Integer'), ('ref_ids', 'String')]
+--------+--------------------+
|paper_id|             ref_ids|
+--------+--------------------+
|      65|                null|
|     130|                null|
|     195|317424;317425;317573|
|     260|                null|
|     325|                null|
|     390|                null|
|     455|                null|
|     520|       318368;323493|
|     585|                null|
|     650|                null|
|     715|                null|
|     780|318420;319233;319...|
|     845|                null|
|     910|                null|
|     975|67604;318882;3718...|
|    1040|                null|
|    1105|289087;318014;318...|
|    1170|                null|
|    1235|                null|
|    1300|                null|
+--------+--------------------+
only showing top 20 rows



In [None]:
citation_df.show()

In [48]:
### remove leading and trailing spaces
citation_df = citation_df.withColumn("ref_ids", trim(citation_df.ref_ids))
citation_df = citation_df.withColumn("paper_id", trim(citation_df.paper_id))
### change data type of paper_id to Integer
citation_df = citation_df.withColumn("paper_id",citation_df["paper_id"].cast(IntegerType()))
citation_df.show()

+--------+--------------------+
|paper_id|             ref_ids|
+--------+--------------------+
|      65|                null|
|     130|                null|
|     195|317424;317425;317573|
|     260|                null|
|     325|                null|
|     390|                null|
|     455|                null|
|     520|       318368;323493|
|     585|                null|
|     650|                null|
|     715|                null|
|     780|318420;319233;319...|
|     845|                null|
|     910|                null|
|     975|67604;318882;3718...|
|    1040|                null|
|    1105|289087;318014;318...|
|    1170|                null|
|    1235|                null|
|    1300|                null|
+--------+--------------------+
only showing top 20 rows



In [None]:
citation_df.select(countDistinct('paper_id')).show()

In [None]:
### check for duplicate rows
citation_df.groupby(['paper_id', 'ref_ids']).count().where('count > 1').sort('count', ascending=False).show()

In [49]:
### split citations so we can have clean data and seperate records {paper_id; ref_id}
unique_citation_df = citation_df.select(F.col("paper_id"), F.explode_outer(F.split(F.col("ref_ids"),";")).alias("ref_id"))
unique_citation_df.show(20, False)
citation_df.show(20, False)

+--------+------+
|paper_id|ref_id|
+--------+------+
|65      |null  |
|130     |null  |
|195     |317424|
|195     |317425|
|195     |317573|
|260     |null  |
|325     |null  |
|390     |null  |
|455     |null  |
|520     |318368|
|520     |323493|
|585     |null  |
|650     |null  |
|715     |null  |
|780     |318420|
|780     |319233|
|780     |319290|
|780     |319579|
|780     |320813|
|845     |null  |
+--------+------+
only showing top 20 rows

+--------+------------------------------------------------+
|paper_id|ref_ids                                         |
+--------+------------------------------------------------+
|65      |null                                            |
|130     |null                                            |
|195     |317424;317425;317573                            |
|260     |null                                            |
|325     |null                                            |
|390     |null                                            |
|4

In [None]:
unique_citation_df.select(countDistinct('paper_id')).show()

In [50]:
### remove leading and trailing spaces
unique_citation_df = unique_citation_df.withColumn("ref_id", trim(unique_citation_df.ref_id))
unique_citation_df = unique_citation_df.withColumn("paper_id", trim(unique_citation_df.paper_id))
### change datat type of ref_id to Integer
unique_citation_df = unique_citation_df.withColumn("ref_id",unique_citation_df["ref_id"].cast(IntegerType()))

In [None]:
unique_citation_df.printSchema()

In [None]:
### check for duplicate rows
unique_citation_df.groupby(['paper_id', 'ref_id']).count().where('count > 1').sort('count', ascending=False).show()

In [None]:
### drop null values if any
unique_citation_df.filter(unique_citation_df['ref_id'].isNull()).show()

In [51]:
# clean up ref ids
final_citation_df = unique_citation_df.join(final_paper_author_id_df, \
                            ['paper_id'], 'inner').select(F.col('paper_id'), F.col('ref_id')).dropDuplicates()

# Load and Research_interests in df

In [None]:
### load research_interests into schema
research_interests_df = createDFFromFileAndSchema(spark, f'{FILES_FOLDER}research_interests.csv', f'{SCHEMAS_FOLDER}research_interests.csv')
research_interests_df.show()

In [None]:
### remove leading and trailing spaces
research_interests_df = research_interests_df.withColumn("author_id", trim(research_interests_df.author_id))
research_interests_df = research_interests_df.withColumn("research_interests", trim(research_interests_df.research_interests))

### change data type to Integer for author_id
research_interests_df = research_interests_df.withColumn("author_id",research_interests_df["author_id"].cast(IntegerType()))

research_interests_df.printSchema()
research_interests_df.show()

In [None]:
### check for null values in the affiliations column
research_interests_df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in research_interests_df.columns]).show()
### drop null values since we dont need research_interests for any computation
research_interests_df=research_interests_df.na.drop(how="any", subset=['research_interests'])


In [None]:
### split affiliations so we can have clean data and seperate records {paper_id; affiliations}
unique_research_interests_df = research_interests_df.select(F.col("author_id"), F.explode(F.split(F.col("research_interests"),";")).alias("research_interest"))
unique_research_interests_df.show(20, False)
research_interests_df.show(20, False)


In [None]:
### remove leading and trailing spaces
unique_research_interests_df = unique_research_interests_df.withColumn("author_id", trim(unique_research_interests_df.author_id))
unique_research_interests_df = unique_research_interests_df.withColumn("research_interest", trim(unique_research_interests_df.research_interest))

### change data type to Integer for author_id
unique_research_interests_df = unique_research_interests_df.withColumn("author_id",unique_research_interests_df["author_id"].cast(IntegerType()))

In [None]:
#### clean special characters and special letters
unique_research_interests_df = clean_special_character(unique_research_interests_df, 'research_interest')
unique_research_interests_df = clean_special_letters(unique_research_interests_df, 'research_interest')
### lowercase research_interests
unique_research_interests_df=unique_research_interests_df.withColumn('research_interest', lower(col('research_interest')))

In [None]:
### check for duplicate rows:
### unique_research_interests_df.groupby(['author_id', 'research_interest']).count().where('count > 1').sort('count', ascending=False).show()
### drop duplicates
unique_research_interests_df=unique_research_interests_df.dropDuplicates()


In [None]:
unique_research_interests_df.show()

In [None]:
# clean up research interests
final_research_interests_df = unique_research_interests_df.join(final_paper_author_id_df, \
                            ['author_id'], 'inner').select(F.col('author_id'),\
                                                           F.col('research_interest')).dropDuplicates()

# Run Queries

### Q1.2 Compute paper count per unique affiliation

In [None]:
## Fact table
paper_count_per_affiliation_df = final_affiliations_df.groupBy('affiliation').count().withColumnRenamed("count", "papers_count")
print(paper_count_per_affiliation_df.show())

In [None]:
paper_count_per_affiliation_df.count()

In [None]:
paper_count_per_affiliation_df.show(truncate=False)

### Q1.1 Validate precomputed paper counts, citation (ref) counts and h-indexes (per author)

#### How to compute h-index for a specific author
1. Retrieve all publications of the author (in unique_paper_author_df)
2. Calculate the number of references per publication
3. Sort the results in descending order
4. Find a threshold N, where N top publications have at least N references each. N is the h-index of the author.


In [52]:
# Calculate the number of references per publication
refs_per_paper_count_df = final_citation_df.groupBy("paper_id").count().withColumnRenamed("count","paper_references")
print(refs_per_paper_count_df.show())

22/01/20 19:31:05 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:31:05 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:31:05 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:31:05 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:31:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:31:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:31:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:31:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.

+--------+----------------+
|paper_id|paper_references|
+--------+----------------+
|  100010|               1|
| 1000240|              15|
| 1000280|               6|
| 1000665|              12|
| 1000795|               1|
| 1000839|              22|
| 1000888|               1|
|  100140|               1|
| 1002011|               7|
| 1002185|               1|
|  100227|               1|
|  100263|               1|
| 1002783|               1|
| 1002883|               1|
| 1002887|               1|
|  100320|               1|
| 1003202|               1|
| 1003366|               1|
| 1003397|               1|
| 1003663|               1|
+--------+----------------+
only showing top 20 rows

None


                                                                                

In [None]:
refs_per_paper_count_df.count()

In [53]:
# Join [papers per author] with [references per paper] and sort the results in descending order
author_papers_with_ref_count = final_paper_author_id_df.join(refs_per_paper_count_df, 'paper_id')\
    .sort(col("paper_references").desc())
print(author_papers_with_ref_count.show())


22/01/20 19:33:53 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:33:53 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:33:53 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:33:53 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:33:54 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:33:54 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:33:54 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:33:54 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.

+--------+------------+---------+--------------+-------+-----------+----------------+
|paper_id|        name|author_id|citation_count|h_index|paper_count|paper_references|
+--------+------------+---------+--------------+-------+-----------+----------------+
| 2015219|      dahlin|   541094|             0|      0|          1|             806|
| 2015219|    kaminsky|   794803|           868|     14|         48|             806|
|  719353|   grabmeier|   565318|           109|      4|         13|             772|
|  719353|    kaltofen|    84799|          1252|     21|        119|             772|
|  719353|weispfenning|   631885|           559|     12|         46|             772|
| 1221204|        wang|  1051702|            16|      1|          1|             555|
| 1583653|       meyer|   545269|            26|      3|         16|             527|
| 1583653|      kerren|   244574|           149|      7|         39|             527|
| 1583653|       ebert|   521293|            61|      



In [54]:
author_papers_with_ref_count.select(countDistinct('author_id')).show()



+-------------------------+
|count(DISTINCT author_id)|
+-------------------------+
|                  1712364|
+-------------------------+



                                                                                

In [55]:
window = Window.partitionBy(author_papers_with_ref_count['author_id']).orderBy(desc("paper_references"), desc("paper_id"))
indexed_grouped_papers_df = author_papers_with_ref_count.select('*', rank().over(window).alias('index'))


In [56]:
h_indexed_papers = indexed_grouped_papers_df.withColumn("possible_h_index", when(indexed_grouped_papers_df.index <= indexed_grouped_papers_df.paper_references, indexed_grouped_papers_df.index).otherwise(0))
print(h_indexed_papers.show(100, False))

22/01/20 19:41:31 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:41:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:41:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:41:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:41:33 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:41:33 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:41:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:41:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
[Stage 376:>                                                    

+--------+----------+---------+--------------+-------+-----------+----------------+-----+----------------+
|paper_id|name      |author_id|citation_count|h_index|paper_count|paper_references|index|possible_h_index|
+--------+----------+---------+--------------+-------+-----------+----------------+-----+----------------+
|973466  |willum    |1        |0             |0      |1          |1               |1    |1               |
|370032  |bloom     |12       |0             |0      |1          |1               |1    |1               |
|1620013 |weaver    |13       |14            |2      |9          |16              |1    |1               |
|1694598 |weaver    |13       |14            |2      |9          |13              |2    |2               |
|1487562 |weaver    |13       |14            |2      |9          |8               |3    |3               |
|1384282 |weaver    |13       |14            |2      |9          |8               |4    |4               |
|1098969 |weaver    |13       |14    

                                                                                

In [57]:
h_indexed_grouped_by_author_papers_df = h_indexed_papers.groupBy('author_id')

In [58]:
h_indexed_aggregated_papers_df = h_indexed_grouped_by_author_papers_df.agg(\
        F.count("paper_id").alias("validated_paper_count"),
        F.sum("paper_references").alias("validated_citation_count"),
        F.max("possible_h_index").alias("validated_h_index")
    )

In [None]:
h_indexed_aggregated_papers_df.count()

In [59]:
unique_authors_with_validated_cols_df = h_indexed_aggregated_papers_df.join(author_df, 'author_id', 'inner' )

In [None]:
unique_authors_with_validated_cols_df.count()

[Stage 693:>  (0 + 4) / 4][Stage 694:>  (0 + 0) / 4][Stage 695:>  (0 + 0) / 4]4]

In [60]:
unique_authors_with_validated_cols_df.filter(\
    unique_authors_with_validated_cols_df["validated_h_index"] != unique_authors_with_validated_cols_df["h_index"]).count()

22/01/20 19:44:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:44:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:44:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:44:23 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:44:24 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:44:24 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:44:24 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:44:24 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:44:42 WARN RowBasedKeyValueBatch: Calling spill() on

1230014

In [61]:
unique_authors_with_validated_cols_df.filter(\
    unique_authors_with_validated_cols_df["validated_citation_count"] != unique_authors_with_validated_cols_df["citation_count"]).count()

22/01/20 19:47:13 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:47:13 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:47:13 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:47:13 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:47:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:47:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:47:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:47:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:47:21 WARN RowBasedKeyValueBatch: Calling spill() on

1603734

In [62]:
unique_authors_with_validated_cols_df.filter(\
    unique_authors_with_validated_cols_df["validated_paper_count"] != unique_authors_with_validated_cols_df["paper_count"]).count()

22/01/20 19:49:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:49:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:49:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:49:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/20 19:50:00 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
                                                                                

89755

In [None]:
unique_authors_with_validated_cols_df.show()

In [None]:
print(unique_authors_with_validated_cols_df.show(truncate=False))