In [1]:
# /**
# *                   _oo0oo_
# *                  o8888888o
# *                  88" . "88
# *                  (| -_- |)
# *                  0\  =  /0
# *                ___/`---'\___
# *              .' \\|     |// '.
# *             / \\|||  :  |||// \
# *            / _||||| -:- |||||- \
# *           |   | \\\  -  /// |   |
# *           | \_|  ''\---/''  |_/ |
# *           \  .-\__  '-'  ___/-. /
# *         ___'. .'  /--.--\  `. .'___
# *      ."" '<  `.___\_<|>_/___.' >' "".
# *     | | :  `- \`.;`\ _ /`;.`/ - ` : | |
# *     \  \ `_.   \_ __\ /__ _/   .-` /  /
# * =====`-.____`.___ \_____/___.-`___.-'=====
# *                   `=---='
# *
# *
# * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# *
# *   Buddha blesses your code to be bug free
# */

In [2]:
from pyspark.sql import SparkSession

In [3]:
import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql.functions import *
import pyspark.sql.types as T
from pyspark.sql.types import IntegerType
from pyspark.sql.window import Window
from helpers import createDFFromFileAndSchema, clean_special_letters, clean_special_character

In [4]:
import glob
import shutil

In [5]:
spark=SparkSession.builder.appName('Clean up the data and perform the queries').getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/01/21 17:44:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
spark

In [7]:
SCHEMAS_FOLDER = './schemas/'
FILES_FOLDER = './assets/parsedData/'

# Load and clean Paper DF

In [8]:
### Load paper csv into schema
paper_df = createDFFromFileAndSchema(spark, f'{FILES_FOLDER}papers.csv', f'{SCHEMAS_FOLDER}paper.csv')
paper_df.show()

File path: ./assets/parsedData/papers.csv, schema path: ./schemas/paper.csv
Types from schema: [('paper_id', 'Integer'), ('title', 'String'), ('year', 'Integer')]


                                                                                

+--------+--------------------+----+
|paper_id|               title|year|
+--------+--------------------+----+
|      65|Direct file organ...|1984|
|     130|An introduction t...|1983|
|     195|On solving almost...|1984|
|     260|Connections betwe...|1984|
|     325|Computers and pen...|1984|
|     390|Relativizations c...|1984|
|     455|On the optimum ch...|1984|
|     520|All points addres...|1984|
|     585|Optimum Head Sepa...|1984|
|     650|A parallel-design...|1984|
|     715|Computer - IEEE C...|1984|
|     780|Experience with G...|1984|
|     845|Code generation a...|1984|
|     910|On estimating acc...|1984|
|     975|A distributed alt...|1985|
|    1040|A comparison of t...|1984|
|    1105|Generalizing spec...|1985|
|    1170|Real time graphic...|1984|
|    1235|Common and uncomm...|1984|
|    1300|Foundations of co...|1985|
+--------+--------------------+----+
only showing top 20 rows



### Data cleaning for paper schema

In [9]:
### remove spaces from values of the columns
paper_df = paper_df.withColumn("title", trim(paper_df.title))

In [10]:
### check for the correct data types
paper_df.printSchema()

root
 |-- paper_id: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- year: integer (nullable = true)



In [11]:
### check for nonsense null data
null_values_paper_df = paper_df.select(
    [count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in paper_df.columns]
)

In [12]:
null_values_paper_df.show()



+--------+-----+----+
|paper_id|title|year|
+--------+-----+----+
|       0|   24| 157|
+--------+-----+----+



                                                                                

In [13]:
### after checking the below dataframes,
### we have seen that all papers, whose title is missing, have the authors (besides paper_id = 748056)
### Decision: fill missing titles with: "Missing Title"

paper_df=paper_df.na.fill('Missing Title', ['title'])

In [14]:
### remove special characters
paper_df=clean_special_character(paper_df,'title')

In [15]:
### check if there are duplicate rows
paper_df.join(
    paper_df
        .groupBy(paper_df.columns) \
        .agg((F.count("*")>1) \
        .cast("int") \
        .alias("Duplicate_indicator")), \
        on=paper_df.columns,how="inner") \
    .show()
### from the dataframe view, we can see that there are no duplicates
paper_df.groupby(['paper_id']).count().where('count > 1').sort('count', ascending=False).show()

                                                                                

+--------+--------------------+----+-------------------+
|paper_id|               title|year|Duplicate_indicator|
+--------+--------------------+----+-------------------+
|       3|The verification ...|1984|                  0|
|       4|Another view of f...|1984|                  0|
|       5|Entityrelationshi...|1984|                  0|
|      10|The ThreeMachine ...|1984|                  0|
|      13|The VLSI Complexi...|1984|                  0|
|      14|Computability wit...|1984|                  0|
|      16|The implication p...|1984|                  0|
|      22|On two more Eigen...|1984|                  0|
|      27|Frame theory and ...|1984|                  0|
|      30|Stationary wave s...|1984|                  0|
|      36|On the design of ...|1983|                  0|
|      41|ELSA  an extensib...|1983|                  0|
|      44|ADA Concurrent Pr...|1984|                  0|
|      52|Automated microco...|1984|                  0|
|      58|The application o...|

[Stage 11:>                                                         (0 + 4) / 4]

+--------+-----+
|paper_id|count|
+--------+-----+
+--------+-----+



                                                                                

In [16]:
### check for the number of paper entries
paper_df.count()

                                                                                

2092356

# Load and clean paper_authors

In [17]:
### load paper_authors csv into schema
paper_author_df = createDFFromFileAndSchema(spark, f'{FILES_FOLDER}paper_authors.csv', f'{SCHEMAS_FOLDER}paper_authors.csv')
paper_author_df.show()

File path: ./assets/parsedData/paper_authors.csv, schema path: ./schemas/paper_authors.csv
Types from schema: [('authors', 'String'), ('paper_id', 'Integer')]
+--------------------+--------+
|             authors|paper_id|
+--------------------+--------+
| K Devine;F J. Smith|      65|
|J Wolff von Guden...|     130|
|J. K. Reid;A. Jen...|     195|
|William G. Golson...|     260|
|    Stein Schjolberg|     325|
|W Ian Gasarch;Ste...|     390|
|Sam Toueg;Özalp B...|     455|
|Frederick H. Dill...|     520|
|A. R. Calderbank;...|     585|
|         Uzi Vishkin|     650|
|      Stephen S. Yau|     715|
|Michael D. Schroe...|     780|
|         S L. Graham|     845|
|D Maio;M R. Scala...|     910|
|         Pamela Zave|     975|
|G. Salton;E. Voor...|    1040|
|Douglas D. Dunlop...|    1105|
|Patrick Peruch;Vi...|    1170|
| Robert J. Sternberg|    1235|
|Curtis Roads;John...|    1300|
+--------------------+--------+
only showing top 20 rows



In [18]:
### remove leadind and trailing spaces
paper_author_df = paper_author_df.withColumn("authors", trim(paper_author_df.authors))

In [19]:
### verify schema
paper_author_df.printSchema()

root
 |-- authors: string (nullable = true)
 |-- paper_id: integer (nullable = true)



In [20]:
### remove special letters
paper_author_df=clean_special_letters(paper_author_df, 'authors')

In [21]:
### split authors so we can have clean data and separate records { paper_id; author }
unique_paper_author_df = paper_author_df \
    .select(F.col("paper_id"), F.explode(F.split(F.col("authors"),";")).alias("author"))

unique_paper_author_df.show(20, False)

+--------+---------------------+
|paper_id|author               |
+--------+---------------------+
|65      |K Devine             |
|65      |F J. Smith           |
|130     |J Wolff von Gudenberg|
|195     |J. K. Reid           |
|195     |A. Jennings          |
|260     |William G. Golson    |
|260     |William C. Rounds    |
|325     |Stein Schjolberg     |
|390     |W Ian Gasarch        |
|390     |Steven Homer         |
|455     |Sam Toueg            |
|455     |zalp Babaoğlu        |
|520     |Frederick H. Dill    |
|520     |Satish Gupta         |
|520     |Daniel T. Ling       |
|520     |Richard E. Matick    |
|585     |A. R. Calderbank     |
|585     |E. G. Coffman, Jr.   |
|585     |L. Flatto            |
|650     |Uzi Vishkin          |
+--------+---------------------+
only showing top 20 rows



In [22]:
### remove leadind and trailing spaces
unique_paper_author_df = unique_paper_author_df.withColumn("author", trim(unique_paper_author_df.author))

In [23]:
unique_paper_author_df.printSchema()

root
 |-- paper_id: integer (nullable = true)
 |-- author: string (nullable = true)



In [24]:
### remove special characters
unique_paper_author_df=clean_special_character(unique_paper_author_df, 'author')

In [25]:
### lowercase author-name
unique_paper_author_df=unique_paper_author_df.withColumn('author', lower(col('author')))

In [26]:
### check for duplicate rows:
unique_paper_author_df \
    .groupby(['paper_id', 'author']) \
    .count() \
    .where('count > 1') \
    .sort('count', ascending=False) \
    .show()

### drop duplicate rows since here we need unique paper-author relation
unique_paper_author_df = unique_paper_author_df.dropDuplicates()

22/01/21 17:46:13 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 17:46:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 17:46:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 17:46:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 17:46:56 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 17:46:56 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 17:46:56 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 17:46:56 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 17:47:25 WARN RowBasedKeyValueBatch: Calling spill() on

+--------+--------------------+-----+
|paper_id|              author|count|
+--------+--------------------+-----+
| 1523221|        dongkun shin|    4|
| 2040206|        anchun cheng|    3|
| 2059316|       han chuanfeng|    3|
| 1202294|            n sharma|    3|
| 2042230|             lu leng|    3|
| 2040206|        mingshu wang|    3|
|  390835|       jeff van west|    2|
| 1900218|        dongqiang xu|    2|
| 1423835|deogratias harori...|    2|
|  412060|microsoft corpora...|    2|
| 1221094|           w penczek|    2|
| 1986411| vishanth weerakkody|    2|
| 1668760|       pradeep kumar|    2|
|  817145|     michael j quinn|    2|
| 1498645|   emily s patterson|    2|
| 1503841|     a a khoroshilov|    2|
| 1423827|            xin geng|    2|
| 1198797|    alladi venkatesh|    2|
| 1221362|        steve clarke|    2|
| 1899891|        wang rongxia|    2|
+--------+--------------------+-----+
only showing top 20 rows





In [27]:
### split author-name so we keep only the surname
### This decision was made because there are a lot of disrepancies in full names,
### and later on we need the clean and nice surnames for correct joining of the tables by paper_id + surname.
unique_paper_author_cleaned_df = unique_paper_author_df \
    .select(F.col("paper_id"), F.trim(F.element_at (F.split(F.col("author")," "),-1)).alias('name'))

unique_paper_author_cleaned_df.show()

[Stage 22:>                                                         (0 + 1) / 1]

+--------+---------+
|paper_id|     name|
+--------+---------+
|     470|soderlund|
|    1821|     suri|
|    1895|     plum|
|    2145|  chamoux|
|    3523| cardelli|
|    3584|    egghe|
|    5083|prodinger|
|    5151|   verity|
|    7290|  swieten|
|    8388|     huet|
|    8581| stoddart|
|    9505| korfhage|
|   10149| phillips|
|   10610|     saad|
|   11188|     maio|
|   11391|    tracz|
|   11831|   reeves|
|   12039|       li|
|   12427|     kitt|
|   12561|overgaard|
+--------+---------+
only showing top 20 rows



                                                                                

In [28]:
### check how many entries there are in unique_paper_author_cleaned_df
unique_paper_author_cleaned_df.count()

                                                                                

5237608

# Clean and Load Authors df

In [29]:
### load author csv into schema
author_df = createDFFromFileAndSchema(spark, f'{FILES_FOLDER}authors.csv', f'{SCHEMAS_FOLDER}author.csv')
author_df.show()

File path: ./assets/parsedData/authors.csv, schema path: ./schemas/author.csv
Types from schema: [('author_id', 'Integer'), ('citation_count', 'Integer'), ('h_index', 'Integer'), ('name', 'String'), ('paper_count', 'Integer')]
+---------+--------------+-------+--------------------+-----------+
|author_id|citation_count|h_index|                name|paper_count|
+---------+--------------+-------+--------------------+-----------+
|       17|             0|      0|     J. Michael Howe|          1|
|       34|             0|      0|        Haitham Gabr|          2|
|       51|             4|      1|         Emma Tonkin|          8|
|       68|             1|      1|        Woochul Shin|          4|
|       85|             0|      0|           S Improta|          1|
|      102|             8|      2|       Richard Ferri|          5|
|      119|             0|      0|            Qing Liu|          1|
|      136|             0|      0|      Artur Gramacki|          2|
|      153|             0

In [30]:
### remove spaces from values of the column
author_df = author_df.withColumn("name", trim(author_df.name))

In [31]:
### check for nonsense null data (filtering will be done later)
null_values_author_df = author_df \
    .select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in author_df.columns])
null_values_author_df.show()



+---------+--------------+-------+----+-----------+
|author_id|citation_count|h_index|name|paper_count|
+---------+--------------+-------+----+-----------+
|        0|             2|      2|   3|          2|
+---------+--------------+-------+----+-----------+





In [32]:
### fill empty paper_count, citation_count, h_index to 0 (just one author)
author_df=author_df.na.fill(value=0, subset='paper_count')
author_df=author_df.na.fill(value=0, subset='citation_count')
author_df=author_df.na.fill(value=0, subset='h_index')

In [33]:
### remove special characters like í, â, é
author_df=clean_special_letters(author_df, 'name')

In [34]:
### remove special characters
author_df=clean_special_character(author_df,'name')

In [35]:
### lowercase author-name
author_df=author_df.withColumn('name', lower(col('name')))

In [36]:
### check if there are duplicate author_ids
author_df.groupby(['author_id']).count().where('count > 1').sort('count', ascending=False).show()



+---------+-----+
|author_id|count|
+---------+-----+
+---------+-----+





# Join author and paper dataframes to ensure consistency

Load and clean Author2Paper (from the supplement txt file)

In [37]:
### load paper_author_id csv into schema
dtypes = pd.read_csv('./schemas/paper_author_id.csv').to_records(index=False).tolist()
print(dtypes)
fields = [T.StructField(dtype[0], globals()[f'{dtype[1]}Type']()) for dtype in dtypes]
schema = StructType(fields)
author_id_2_paper_id_df = spark.read.options(delimiter='\t').csv('./assets/AMiner-Author2Paper.txt', header=False,schema=schema)

[('index', 'Integer'), ('author_id', 'Integer'), ('paper_id', 'Integer'), ('author_position', 'Integer')]


In [38]:
### check how many entries there are
author_id_2_paper_id_df.count()

                                                                                

5192998

In [39]:
### check for duplicates between paper_id and author_id
author_id_2_paper_id_df \
    .groupby(['author_id', 'paper_id']) \
    .count().where('count > 1') \
    .sort('count', ascending=False).show(10,False)


22/01/21 17:50:59 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 17:51:00 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 17:51:00 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 17:51:00 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 17:51:00 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 17:51:00 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 17:51:00 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 17:51:00 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 17:51:09 WARN RowBasedKeyValueBatch: Calling spill() on

+---------+--------+-----+
|author_id|paper_id|count|
+---------+--------+-----+
+---------+--------+-----+





In [40]:
### join the suplementary author_id_2_paper_id_df df with author_df
author_id_2_paper_id_extended_df = author_id_2_paper_id_df.join(author_df, 'author_id', 'left').drop('index')

In [41]:
### split the author-name to keep only the surname ---> cleaning data
author_id_2_paper_id_cleaned_df = author_id_2_paper_id_extended_df \
    .select( \
        F.col("paper_id"), \
        F.col("author_id"), \
        F.col("citation_count"), \
        F.col("h_index"), \
        F.col("paper_count"), \
        F.trim(F.element_at(F.split(F.col("name")," "),-1)).alias('name') \
    )

In [42]:
### join the cleaned paper_2_author_cleaned_df to unique_paper_author_cleaned_df
final_paper_author_id_df = unique_paper_author_cleaned_df \
    .join(author_id_2_paper_id_cleaned_df, ['name', 'paper_id'], 'inner') \
    .dropDuplicates()

# Load and clean Affiliations df

In [45]:
### load affiliation csv into schema
affiliation_df = createDFFromFileAndSchema(spark, f'{FILES_FOLDER}affiliations.csv', f'{SCHEMAS_FOLDER}affiliation.csv')
affiliation_df.show()

File path: ./assets/parsedData/affiliations.csv, schema path: ./schemas/affiliation.csv
Types from schema: [('affiliations', 'String'), ('paper_id', 'Integer')]
+--------------------+--------+
|        affiliations|paper_id|
+--------------------+--------+
|The Queen's Unive...|      65|
|Univ. of Karlsruh...|     130|
|AERE Harwell Labo...|     195|
|University of Mic...|     260|
|Oslo politikammer...|     325|
|Harvard Univ., Ca...|     390|
|Cornell Univ., It...|     455|
|IBM General Techn...|     520|
|               -;-;-|     585|
|New York Univ., N...|     650|
|                   -|     715|
|Xerox Palo Alto R...|     780|
|Univ. of Californ...|     845|
|University of Bol...|     910|
|AT & T Bell Labor...|     975|
|Cornell Univ., It...|    1040|
|University of Mar...|    1105|
|Laboratoire de Ps...|    1170|
|Yale Univ., New H...|    1235|
|                 -;-|    1300|
+--------------------+--------+
only showing top 20 rows



In [46]:
### remove leading and trailing spaces
affiliation_df = affiliation_df.withColumn("affiliations", trim(affiliation_df.affiliations))

root
 |-- affiliations: string (nullable = true)
 |-- paper_id: integer (nullable = true)

+--------------------+--------+
|        affiliations|paper_id|
+--------------------+--------+
|The Queen's Unive...|      65|
|Univ. of Karlsruh...|     130|
|AERE Harwell Labo...|     195|
|University of Mic...|     260|
|Oslo politikammer...|     325|
|Harvard Univ., Ca...|     390|
|Cornell Univ., It...|     455|
|IBM General Techn...|     520|
|               -;-;-|     585|
|New York Univ., N...|     650|
|                   -|     715|
|Xerox Palo Alto R...|     780|
|Univ. of Californ...|     845|
|University of Bol...|     910|
|AT & T Bell Labor...|     975|
|Cornell Univ., It...|    1040|
|University of Mar...|    1105|
|Laboratoire de Ps...|    1170|
|Yale Univ., New H...|    1235|
|                 -;-|    1300|
+--------------------+--------+
only showing top 20 rows



In [47]:
### check for null values in the affiliations column
### we can see, that there are many rows with null affiliations
### these rows will be cleaned up further
null_values_affiliations=affiliation_df \
    .select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in affiliation_df.columns])
print(null_values_affiliations.show())



+------------+--------+
|affiliations|paper_id|
+------------+--------+
|       37499|       0|
+------------+--------+

None




In [48]:
### This df will in the end be used to count papers per unique affiliation,
### so if the affiliation is missing, it doesnt make sense to keep the row
### Decision: drop all rows where affiliation is null
affiliation_df=affiliation_df.na.drop(how="any", subset=['affiliations'])

In [49]:
### split affiliations so we can have clean data and separate records { paper_id; affiliation }
unique_affiliations_df = affiliation_df \
    .select(F.col("paper_id"), F.explode(F.split(F.col("affiliations"),";")) \
    .alias("affiliation"))
unique_affiliations_df.show(20, False)

+--------+------------------------------------------------------+
|paper_id|affiliation                                           |
+--------+------------------------------------------------------+
|65      |The Queen's University of Belfast, Belfast, UK        |
|65      |The Queen's University of Belfast, Belfast, UK        |
|130     |Univ. of Karlsruhe, Karlsruhe, West Germany           |
|195     |AERE Harwell Laboratory, Oxon, UK                     |
|195     |Queen's Univ., Belfast, Northern Ireland              |
|260     |University of Michigan, Ann Arbor, MI                 |
|260     |University of Michigan, Ann Arbor, MI                 |
|325     |Oslo politikammer, Oslo, Norway                       |
|390     |Harvard Univ., Cambridge, MA                          |
|390     |Boston Univ., Boston, MA                              |
|455     |Cornell Univ., Ithaca, NY                             |
|455     |Cornell Univ., Ithaca, NY                             |
|520     |

In [50]:
### check for special nonsense characters "-" and filter them out
### If the affiliation is missing, there is no point of keeping the rows
unique_affiliations_df = unique_affiliations_df.where(unique_affiliations_df.affiliation != '-')

In [52]:
### check for duplicate rows
unique_affiliations_df \
    .groupby(['paper_id', 'affiliation']) \
    .count() \
    .where('count > 1') \
    .sort('count', ascending=False) \
    .show()

[Stage 86:>                                                         (0 + 4) / 4]

+--------+--------------------+-----+
|paper_id|         affiliation|count|
+--------+--------------------+-----+
|  569905|IBM and Universit...|   91|
| 1202294|Open Grid Forum—G...|   88|
| 1542970|University of Ten...|   65|
|  418817|Humanoid Robotics...|   62|
| 1731577|IBM Semiconductor...|   59|
|  772121|IBM Research Divi...|   52|
| 1038111|IBM Thomas J. Wat...|   46|
| 1241693|INFN-CNAF V.le Be...|   44|
| 1633898|NASA Goddard Spac...|   31|
| 1077644|Carnegie Mellon U...|   31|
|  864278|Dept. of Electr. ...|   29|
| 1229219|    No Affiliations,|   29|
| 1210078|IMEC, Kapeldreef ...|   29|
|  994444|Lehrstuhl fur Ope...|   29|
| 1423217|Shanghai Astronom...|   28|
|  827034|The Artist Educat...|   28|
|  771289|IBM Research Divi...|   28|
| 1312394|Atheros Communica...|   27|
| 1972771|LinkedIn, Inc, Mo...|   27|
| 1625041|           Microsoft|   27|
+--------+--------------------+-----+
only showing top 20 rows



                                                                                

In [53]:
### drop duplicate rows since here we need unique affiliations
unique_affiliations_df = unique_affiliations_df.dropDuplicates()

In [54]:
### remove special characters
unique_affiliations_df = clean_special_character(unique_affiliations_df, 'affiliation')
### remove special letters
unique_affiliations_df = clean_special_letters(unique_affiliations_df, 'affiliation')

In [55]:
### lowercase the affiliation col values
unique_affiliations_df = unique_affiliations_df.withColumn('affiliation', lower(col('affiliation')))

In [57]:
### drop duplicates
unique_affiliations_df = unique_affiliations_df.dropDuplicates()

In [58]:
### filter out affiliations that are not part of the final_paper_author_id_df
final_affiliations_df = unique_affiliations_df \
    .join(final_paper_author_id_df, ['paper_id'], 'inner') \
    .select(F.col('paper_id'), F.col('affiliation')) \
    .dropDuplicates()

# Load and clean Publication_venues df

In [59]:
### load publication_venues into schema
publication_venue_df = createDFFromFileAndSchema( \
    spark, f'{FILES_FOLDER}publication_venues.csv', f'{SCHEMAS_FOLDER}publication_venues.csv' \
)

File path: ./assets/parsedData/publication_venues.csv, schema path: ./schemas/publication_venues.csv
Types from schema: [('paper_id', 'Integer'), ('publication_venue', 'String')]


In [60]:
### trim publication_venue
publication_venue_df = publication_venue_df.withColumn( \
    "publication_venue", trim(publication_venue_df.publication_venue) \
)
publication_venue_df.show()

+--------+--------------------+
|paper_id|   publication_venue|
+--------+--------------------+
|      65|Information Techn...|
|     130|Proc. of the symp...|
|     195|ACM Transactions ...|
|     260|Information and C...|
|     325|Computers and pen...|
|     390|Information and C...|
|     455|SIAM Journal on C...|
|     520|IBM Journal of Re...|
|     585|Journal of the AC...|
|     650|Theoretical Compu...|
|     715|            Computer|
|     780|ACM Transactions ...|
|     845|Methods and tools...|
|     910|Information Proce...|
|     975|ACM Transactions ...|
|    1040|Information Proce...|
|    1105|ACM Transactions ...|
|    1170|Proc. of the 2nd ...|
|    1235|Proc. of the inte...|
|    1300|Foundations of co...|
+--------+--------------------+
only showing top 20 rows



In [61]:
### check for null values inside publication venues dataframe
null_values_publication_venue = publication_venue_df \
    .select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in publication_venue_df.columns])

In [62]:
null_values_publication_venue.show()



+--------+-----------------+
|paper_id|publication_venue|
+--------+-----------------+
|       0|              148|
+--------+-----------------+





In [63]:
### drop null values 
publication_venue_df=publication_venue_df.na.drop(how="any", subset=['publication_venue'])

In [64]:
### check for duplicate rows (no duplicates found)
publication_venue_df \
    .groupby(['paper_id', 'publication_venue']) \
    .count() \
    .where('count > 1').sort('count', ascending=False).show()

[Stage 99:>                                                         (0 + 4) / 5]

+--------+-----------------+-----+
|paper_id|publication_venue|count|
+--------+-----------------+-----+
+--------+-----------------+-----+



                                                                                

In [65]:
### join publication_venue_df with final_paper_author_id_df
### to remove publication venues of paper ids which are not part of final_paper_author_id_df
final_publication_venues_df = publication_venue_df \
    .join(final_paper_author_id_df, ['paper_id'], 'inner') \
    .select(F.col('paper_id'), F.col('publication_venue')).dropDuplicates()

# Load and clean Citations df

In [66]:
### load affiliation csv into schema
citation_df = createDFFromFileAndSchema(spark, f'{FILES_FOLDER}citations.csv', f'{SCHEMAS_FOLDER}citations.csv')
citation_df.show()

File path: ./assets/parsedData/citations.csv, schema path: ./schemas/citations.csv
Types from schema: [('paper_id', 'Integer'), ('ref_ids', 'String')]
+--------+--------------------+
|paper_id|             ref_ids|
+--------+--------------------+
|      65|                null|
|     130|                null|
|     195|317424;317425;317573|
|     260|                null|
|     325|                null|
|     390|                null|
|     455|                null|
|     520|       318368;323493|
|     585|                null|
|     650|                null|
|     715|                null|
|     780|318420;319233;319...|
|     845|                null|
|     910|                null|
|     975|67604;318882;3718...|
|    1040|                null|
|    1105|289087;318014;318...|
|    1170|                null|
|    1235|                null|
|    1300|                null|
+--------+--------------------+
only showing top 20 rows



In [67]:
### remove leading and trailing spaces
citation_df = citation_df.withColumn("ref_ids", trim(citation_df.ref_ids))

In [68]:
### show how many paper ids are present in citation_df
citation_df.select(countDistinct('paper_id')).show()



+------------------------+
|count(DISTINCT paper_id)|
+------------------------+
|                 2092356|
+------------------------+



                                                                                

In [69]:
### check for duplicate rows (no duplicates found)
citation_df.groupby(['paper_id', 'ref_ids']).count().where('count > 1').sort('count', ascending=False).show()



+--------+-------+-----+
|paper_id|ref_ids|count|
+--------+-------+-----+
+--------+-------+-----+



                                                                                

In [70]:
### split citations so we can have clean data and seperate records {paper_id; ref_id}
unique_citation_df = citation_df \
    .select(F.col("paper_id"), F.explode_outer(F.split(F.col("ref_ids"),";")).alias("ref_id"))
unique_citation_df.show(20, False)

+--------+------+
|paper_id|ref_id|
+--------+------+
|65      |null  |
|130     |null  |
|195     |317424|
|195     |317425|
|195     |317573|
|260     |null  |
|325     |null  |
|390     |null  |
|455     |null  |
|520     |318368|
|520     |323493|
|585     |null  |
|650     |null  |
|715     |null  |
|780     |318420|
|780     |319233|
|780     |319290|
|780     |319579|
|780     |320813|
|845     |null  |
+--------+------+
only showing top 20 rows



In [72]:
### remove leading and trailing spaces
unique_citation_df = unique_citation_df.withColumn("ref_id", trim(unique_citation_df.ref_id))
### change data type of ref_id to Integer
unique_citation_df = unique_citation_df.withColumn("ref_id",unique_citation_df["ref_id"].cast(IntegerType()))

In [73]:
unique_citation_df.printSchema()

root
 |-- paper_id: integer (nullable = true)
 |-- ref_id: integer (nullable = true)



In [74]:
### check for duplicate rows
unique_citation_df.groupby(['paper_id', 'ref_id']).count().where('count > 1').sort('count', ascending=False).show()

22/01/21 17:57:05 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 17:57:05 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 17:57:05 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 17:57:05 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 17:57:05 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 17:57:05 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 17:57:05 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 17:57:05 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 17:57:07 WARN RowBasedKeyValueBatch: Calling spill() on

+--------+------+-----+
|paper_id|ref_id|count|
+--------+------+-----+
+--------+------+-----+



                                                                                

In [75]:
### check whether some papers have null ref_ids
### (those papers will still be kept in the dataframe for consistency)
unique_citation_df.filter(unique_citation_df['ref_id'].isNull()).show()

+--------+------+
|paper_id|ref_id|
+--------+------+
|      65|  null|
|     130|  null|
|     260|  null|
|     325|  null|
|     390|  null|
|     455|  null|
|     585|  null|
|     650|  null|
|     715|  null|
|     845|  null|
|     910|  null|
|    1040|  null|
|    1170|  null|
|    1235|  null|
|    1300|  null|
|    1365|  null|
|    1430|  null|
|    1495|  null|
|    1560|  null|
|    1625|  null|
+--------+------+
only showing top 20 rows



In [76]:
### clean up ref ids if their paper_id is not a part of final_paper_author_id_df
final_citation_df = unique_citation_df \
    .join(final_paper_author_id_df, ['paper_id'], 'inner') \
    .select(F.col('paper_id'), F.col('ref_id')) \
    .dropDuplicates()

# Load and Research_interests in df

In [77]:
### load research_interests csv into schema
research_interests_df = createDFFromFileAndSchema( \
    spark, f'{FILES_FOLDER}research_interests.csv', f'{SCHEMAS_FOLDER}research_interests.csv' \
)

File path: ./assets/parsedData/research_interests.csv, schema path: ./schemas/research_interests.csv
Types from schema: [('author_id', 'Integer'), ('research_interests', 'String')]


In [78]:
### remove leading and trailing spaces
research_interests_df = research_interests_df \
    .withColumn("research_interests", trim(research_interests_df.research_interests))

In [79]:
### check for null values in the affiliations column
research_interests_df \
    .select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in research_interests_df.columns]).show()
### drop null values (it is safe since we dont need research_interests for any computation)
research_interests_df = research_interests_df.na.drop(how="any", subset=['research_interests'])



+---------+------------------+
|author_id|research_interests|
+---------+------------------+
|        0|             15399|
+---------+------------------+



                                                                                

In [80]:
research_interests_df.printSchema()
research_interests_df.show()

root
 |-- author_id: integer (nullable = true)
 |-- research_interests: string (nullable = true)

+---------+--------------------+
|author_id|  research_interests|
+---------+--------------------+
|       17|HIV disease;Inter...|
|       34|associate polynom...|
|       51|metadata element;...|
|       68|Web Service;conte...|
|       85|intermediate key;...|
|      102|feedback loop;dif...|
|      119|Rough Set;nomal C...|
|      136|MATLAB toolbox;li...|
|      153|Byzantine agreeme...|
|      170|Ein objektorienti...|
|      187|portable device;A...|
|      204|Integer-valued pr...|
|      221|stock price;stock...|
|      238|Hypermedia Synchr...|
|      255|computer-mediated...|
|      272|Dijkstra method;o...|
|      289|low-frequency act...|
|      306|copyright process...|
|      323|uncertain informa...|
|      340|histology image;s...|
+---------+--------------------+
only showing top 20 rows



In [81]:
### split research interests
### so we can have clean data and separate records { paper_id; research_interest }
unique_research_interests_df = research_interests_df \
    .select(F.col("author_id"), F.explode(F.split(F.col("research_interests"),";")) \
    .alias("research_interest"))

unique_research_interests_df.show(20, False)

+---------+-----------------------------------------+
|author_id|research_interest                        |
+---------+-----------------------------------------+
|17       |HIV disease                              |
|17       |Internet resource                        |
|17       |World-Wide Web                           |
|17       |clinical management                      |
|34       |associate polynomial term                |
|34       |bivariate polynomial                     |
|34       |difficult computational problem          |
|34       |novel polynomial                         |
|34       |polynomial multiplication problem        |
|34       |polynomial term                          |
|34       |reachability problem                     |
|34       |Probabilistic Reachability               |
|34       |Reachability analysis                    |
|34       |better time complexity                   |
|51       |metadata element                         |
|51       |metadata record  

In [82]:
### remove leading and trailing spaces
unique_research_interests_df = unique_research_interests_df \
    .withColumn("research_interest", trim(unique_research_interests_df.research_interest))

In [83]:
#### clean special characters and special letters
unique_research_interests_df = clean_special_character(unique_research_interests_df, 'research_interest')
unique_research_interests_df = clean_special_letters(unique_research_interests_df, 'research_interest')
### lowercase research_interests
unique_research_interests_df = unique_research_interests_df \
    .withColumn('research_interest', lower(col('research_interest')))

In [85]:
### drop duplicates
unique_research_interests_df=unique_research_interests_df.dropDuplicates()

In [86]:
# clean up research interests if their author id is not a part of final_paper_author_id_df
final_research_interests_df = unique_research_interests_df \
    .join(final_paper_author_id_df, ['author_id'], 'inner') \
    .select(F.col('author_id'), F.col('research_interest')) \
    .dropDuplicates()

# Run Queries

### Q1.2 Compute paper count per unique affiliation

In [87]:
paper_count_per_affiliation_df = final_affiliations_df \
    .groupBy('affiliation') \
    .count() \
    .withColumnRenamed("count", "papers_count")

In [89]:
paper_count_per_affiliation_df.show()

22/01/21 18:08:15 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:08:15 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:08:15 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:08:15 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


+--------------------+------------+
|         affiliation|papers_count|
+--------------------+------------+
|nagoya univ, nago...|          30|
|nam kwang enginee...|           1|
|univ of technolog...|           6|
|univ of texas at ...|          41|
|purdue univ, west...|         389|
|laboratoires de m...|           3|
|padova  univ, pad...|           1|
|royal swedish aca...|           1|
|dept of ee  cs, u...|           1|
|loughborough univ...|           1|
|univ of quebec at...|           1|
|shanghai jiaotong...|           4|
|univ of aberdeen,...|           5|
|department of ele...|          20|
|beyond words publ...|           1|
|western illinois ...|           5|
|the bdm corporati...|           1|
|the university of...|           2|
|university of mia...|          45|
|baylor college of...|          11|
+--------------------+------------+
only showing top 20 rows



                                                                                

### Q1.1 Validate precomputed paper counts, citation (ref) counts and h-indexes (per author)

#### How to compute h-index for a specific author
1. Retrieve all publications of the author
2. Calculate the number of references per publication
3. Sort the results in descending order
4. Find a threshold N, where N top publications have at least N references each. N is the h-index of the author.


In [90]:
# 1-2 Calculate the number of references per publication
refs_per_paper_count_df = final_citation_df \
    .groupBy("paper_id") \
    .count() \
    .withColumnRenamed("count","paper_references")

In [92]:
### 3 Join [papers per author] with [references per paper] and sort the results in descending order
author_papers_with_ref_count = final_paper_author_id_df.join(refs_per_paper_count_df, 'paper_id') \
    .sort(col("paper_references").desc())

In [94]:
### add index column to table to ease h-index calculation
window = Window.partitionBy(author_papers_with_ref_count['author_id']) \
    .orderBy(desc("paper_references"), desc("paper_id"))

indexed_grouped_papers_df = author_papers_with_ref_count.select('*', rank().over(window).alias('index'))


In [95]:
h_indexed_papers = indexed_grouped_papers_df \
    .withColumn("possible_h_index", \
                when( \
                    indexed_grouped_papers_df.index <= indexed_grouped_papers_df.paper_references, \
                    indexed_grouped_papers_df.index \
                    ).otherwise(0) \
                )

In [97]:
h_indexed_grouped_by_author_papers_df = h_indexed_papers.groupBy('author_id')

In [98]:
h_indexed_aggregated_papers_df = h_indexed_grouped_by_author_papers_df.agg( \
        F.count("paper_id").alias("validated_paper_count"),
        F.sum("paper_references").alias("validated_citation_count"),
        F.max("possible_h_index").alias("validated_h_index")
    )

#### Computed results for `validated_paper_count`, `validated_paper_count` and `validated_h_index`

__Final resulting dataframe is unique_authors_with_validated_cols_df__

In [99]:
### Join the computation results with author dataframe
### to be able to compare received values with the precomputed values
unique_authors_with_validated_cols_df = h_indexed_aggregated_papers_df.join(author_df, 'author_id', 'inner' )

In [100]:
unique_authors_with_validated_cols_df.count()

                                                                                

1712364

In [None]:
unique_authors_with_validated_cols_df.show()

In [101]:
### For the info: check how many precomputed h-indexes differ from the validated h-indexes
filter_condition = unique_authors_with_validated_cols_df["validated_h_index"] != unique_authors_with_validated_cols_df["h_index"]
unique_authors_with_validated_cols_df \
    .filter(filter_condition) \
    .count()

22/01/21 18:22:20 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:22:20 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:22:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:22:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:22:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:22:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:22:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:22:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:22:21 WARN RowBasedKeyValueBatch: Calling spill() on

22/01/21 18:22:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:22:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:22:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:22:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:22:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:22:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:22:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:22:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:22:40 WARN RowBasedKeyValueBatch: Calling spill() on

1230051

In [102]:
### For the info: check how many precomputed author citation counts differ from the validated citation counts
filter_condition = unique_authors_with_validated_cols_df["validated_citation_count"] != unique_authors_with_validated_cols_df["citation_count"]
unique_authors_with_validated_cols_df \
    .filter(filter_condition) \
    .count()

22/01/21 18:25:26 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:25:26 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:25:26 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:25:26 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:25:27 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:25:27 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:25:27 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:25:27 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:25:27 WARN RowBasedKeyValueBatch: Calling spill() on

22/01/21 18:25:31 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:25:31 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:25:31 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:25:31 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:25:31 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:25:31 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:25:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:25:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:25:32 WARN RowBasedKeyValueBatch: Calling spill() on

                                                                                

1601563

In [103]:
### For the info: check how many precomputed author paper counts differ from the validated paper counts
filter_condition = unique_authors_with_validated_cols_df["validated_paper_count"] != unique_authors_with_validated_cols_df["paper_count"]
unique_authors_with_validated_cols_df.filter(filter_condition).count()

22/01/21 18:28:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:28:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:28:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:28:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:28:29 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:28:29 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:28:29 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:28:29 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/21 18:28:29 WARN RowBasedKeyValueBatch: Calling spill() on

19934

# Save cleaned & computed data into the csv files (all data needed for T2)

In [104]:
CLEAN_DATA_FOLDER = './assets/cleanedDFsData/'
def saveDFIntoCSVFolder(df, folderName, pathToFolder):
    # Save data to csv file
    df.coalesce(1).write.format("com.databricks.spark.csv").option("header", "true").save(f'{CLEAN_DATA_FOLDER}{folderName}')
def moveFileToCorrectFolder(folderName, pathToFolder):
    filename = glob.glob(f'{CLEAN_DATA_FOLDER}{folderName}/*.csv')[0]
    shutil.move(filename, f'{CLEAN_DATA_FOLDER}{folderName}_ds.csv')
    
# After saving of the data to csv we have to call moveFileToCorrectFolder,
# because saveDFIntoCSVFolder actually saves the data into the folder with a csv file inside.
# moveFileToCorrectFolder moves the file to the correct location.

In [105]:
## Saving cleaned & unique paper author data
saveDFIntoCSVFolder(final_paper_author_id_df, 'paper_author')

                                                                                

In [106]:
moveFileToCorrectFolder('paper_author')

In [None]:
## Saving cleaned & unique affiliations data with computed paper_count
saveDFIntoCSVFolder(final_affiliations_df, 'affiliations')

[Stage 624:>                                                        (0 + 1) / 1]

In [None]:
moveFileToCorrectFolder('affiliations')

In [None]:
## Saving cleaned & unique publication venues data 
saveDFIntoCSVFolder(final_publication_venues_df, 'publication_venues')

In [None]:
moveFileToCorrectFolder('publication_venues')

In [None]:
## Saving cleaned & computed & research interests data
saveDFIntoCSVFolder(final_research_interests_df, 'research_interests')

In [None]:
moveFileToCorrectFolder('research_interests')

In [None]:
saveDFIntoCSVFolder(paper_df, 'papers')

In [None]:
moveFileToCorrectFolder('papers')