In [8]:
from pyspark.sql import SparkSession

In [9]:
spark=SparkSession.builder.appName('read data through spark').getOrCreate()

In [10]:
spark

In [11]:
import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql.functions import *
import pyspark.sql.types as T
from pyspark.sql.types import IntegerType

# Load and clean Paper DF

In [134]:
### load paper into schema
dtypes = pd.read_csv('./schemas/paper.csv').to_records(index=False).tolist()
print(dtypes)
fields = [T.StructField(dtype[0], globals()[f'{dtype[1]}Type']()) for dtype in dtypes]
schema = StructType(fields)
paper_df = spark.read.option('header', 'true').csv('./assets/parsedData/papers.csv', header=True, schema=schema)

[('paper_id', 'Integer'), ('title', 'String'), ('year', 'Integer')]


In [135]:
paper_df.show()

+--------+--------------------+----+
|paper_id|               title|year|
+--------+--------------------+----+
|      65|                null|null|
|     130|                null|null|
|     195|317424;317425;317573|null|
|     260|                null|null|
|     325|                null|null|
|     390|                null|null|
|     455|                null|null|
|     520|       318368;323493|null|
|     585|                null|null|
|     650|                null|null|
|     715|                null|null|
|     780|318420;319233;319...|null|
|     845|                null|null|
|     910|                null|null|
|     975|67604;318882;3718...|null|
|    1040|                null|null|
|    1105|289087;318014;318...|null|
|    1170|                null|null|
|    1235|                null|null|
|    1300|                null|null|
+--------+--------------------+----+
only showing top 20 rows



22/01/17 19:10:46 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 4, schema size: 3
CSV file: file:///Users/user/PycharmProjects/aminer-publications-dia/assets/parsedData/papers.csv


In [137]:
### data cleaning for paper schema

### remove spaces from values of the columns
paper_df = paper_df.withColumn("paper_id", trim(paper_df.paper_id))
paper_df = paper_df.withColumn("title", trim(paper_df.title))
paper_df = paper_df.withColumn("year", trim(paper_df.year))

In [138]:
### check for the data types
paper_df.printSchema()
### change the data type of year to Integer
paper_df = paper_df.withColumn("year",paper_df["year"].cast(IntegerType()))

root
 |-- paper_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: string (nullable = true)



In [139]:
### check for nonsense null data
null_values_paper_df = paper_df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in paper_df.columns]
   )
### save the ids of papers whose title is missing to clean up the other dataframes
null_paper_ids = paper_df.filter(paper_df['title'].isNull())
null_paper_ids_list=null_paper_ids.select('paper_id').rdd.flatMap(lambda x: x).collect()
null_paper_ids_list = [int(item) for item in null_paper_ids_list]

22/01/17 19:10:57 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: paper_id, ref_ids
 Schema: paper_id, title
Expected: title but found: ref_ids
CSV file: file:///Users/user/PycharmProjects/aminer-publications-dia/assets/parsedData/papers.csv
                                                                                

In [140]:
### after checking the below dataframes, all papers whose title is missing have the authors besides paper_id = 748056
### decision: fill missing titles with : Missing Title

paper_df=paper_df.na.fill('Missing Title', ['title'])

In [None]:
### remove special characters
paper_df=paper_df.withColumn('title', regexp_replace('title', '"', ''))
paper_df=paper_df.withColumn('title', regexp_replace('title', ';', ''))
paper_df=paper_df.withColumn('title', regexp_replace('title', ':', ''))
paper_df=paper_df.withColumn('title', regexp_replace('title', '\}', ''))
paper_df=paper_df.withColumn('title', regexp_replace('title', '\{', ''))
paper_df=paper_df.withColumn('title', regexp_replace('title', '\~', ''))
paper_df=paper_df.withColumn('title', regexp_replace('title', '\{', ''))
paper_df=paper_df.withColumn('title', regexp_replace('title', '\{', ''))
paper_df=paper_df.withColumn('title', regexp_replace('title', '\/', ''))

In [141]:
paper_df.columns

['paper_id', 'title', 'year']

In [142]:
### check if there are duplicate rows
paper_df.join(paper_df.groupBy(paper_df.columns).agg((F.count("*")>1).cast("int").alias("Duplicate_indicator")),
on=paper_df.columns,how="inner").show()
###there are no duplicates
paper_df.groupby(['paper_id']).count().where('count > 1').sort('count', ascending=False).show()

22/01/17 19:11:04 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 4, schema size: 3
CSV file: file:///Users/user/PycharmProjects/aminer-publications-dia/assets/parsedData/papers.csv
22/01/17 19:11:19 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 4, schema size: 3
CSV file: file:///Users/user/PycharmProjects/aminer-publications-dia/assets/parsedData/papers.csv
                                                                                

+--------+-------------+----+-------------------+
|paper_id|        title|year|Duplicate_indicator|
+--------+-------------+----+-------------------+
|  372592|Missing Title|  38|                  0|
|  989909|Missing Title| 300|                  0|
|  989911|Missing Title|  49|                  0|
|  989912|Missing Title|8848|                  0|
| 1802227|Missing Title|1976|                  0|
|  904713|Missing Title|1630|                  0|
| 1999888|Missing Title|   1|                  0|
| 1082418|Missing Title| 893|                  0|
| 1053960|Missing Title|2008|                  0|
|  153257|Missing Title|1993|                  0|
+--------+-------------+----+-------------------+



[Stage 160:>                                                        (0 + 4) / 4]

+--------+-----+
|paper_id|count|
+--------+-----+
+--------+-----+



                                                                                

In [143]:
paper_df.filter(unique_paper_author_df['title'].like("%%")).show(20,False)

AnalysisException: Cannot resolve column name "title" among (paper_id, author)

# Load and clean Affiliations df

In [None]:
### load affiliation into schema
dtypes = pd.read_csv('./schemas/affiliation.csv').to_records(index=False).tolist()
print(dtypes)
fields = [StructField(dtype[0], globals()[f'{dtype[1]}Type']()) for dtype in dtypes]
schema = StructType(fields)
affiliation_df = spark.read.option('header', 'true').csv('./assets/parsedData/affiliations.csv', header=True, schema=schema)

In [None]:
### remove leading and trailing spaces
affiliation_df = affiliation_df.withColumn("affiliations", trim(affiliation_df.affiliations))
affiliation_df = affiliation_df.withColumn("paper_id", trim(affiliation_df.paper_id))
affiliation_df = affiliation_df.withColumn("paper_id",affiliation_df["paper_id"].cast(IntegerType()))

affiliation_df.printSchema()
affiliation_df.show()

In [None]:
### check for null values in the affiliations column
null_values_affiliations=affiliation_df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in affiliation_df.columns]
   )

In [None]:
### This df is used to count papers per unique affiliation, so if the affiliation is missing, it doesnt make sense
### drop all rows where affiliation is null

affiliation_df=affiliation_df.na.drop(how="any", subset=['affiliations'])

In [None]:
affiliation_df.filter(affiliation_df.affiliations.contains('-')).collect()

In [None]:
### check if affiliations are missing as well for the ids whose title was missing in paper_df
for rows in affiliation_df.select("affiliations","paper_id").collect():
    if rows[1] in null_paper_ids_list:
        print(rows[0], rows[1])

In [None]:
### split affiliations so we can have clean data and seperate records {paper_id; affiliations}
unique_affiliations_df = affiliation_df.select(F.col("paper_id"), F.explode(F.split(F.col("affiliations"),";")).alias("affiliation"))
unique_affiliations_df.show(20, False)
affiliation_df.show(20, False)

In [None]:
### check for special nonsense characters "-", If the affiliation is missing, there is no point of keeping the rows
###unique_affiliations_df.filter(unique_affiliations_df.affiliations=='-').collect()
unique_affiliations_df=unique_affiliations_df.where(unique_affiliations_df.affiliation!='-')

In [None]:
unique_affiliations_df.show()

In [None]:
### check for duplicate rows:
unique_affiliations_df.groupby(['paper_id', 'affiliation']).count().where('count > 1').sort('count', ascending=False).show()

In [None]:
### drop duplicate rows since here we need unique affiliations
unique_affiliations_df=unique_affiliations_df.dropDuplicates()


In [None]:
unique_affiliations_df.count()

# Load and clean paper_authors

In [107]:
### load paper_authors into schema
dtypes = pd.read_csv('./schemas/paper_authors.csv').to_records(index=False).tolist()
print(dtypes)
fields = [StructField(dtype[0], globals()[f'{dtype[1]}Type']()) for dtype in dtypes]
schema = StructType(fields)
paper_author_df = spark.read.option('header', 'true').csv('./assets/parsedData/paper_authors.csv', header=True, schema=schema)

[('authors', 'String'), ('paper_id', 'Integer')]


In [108]:
### remove leadind and trailing spaces
paper_author_df = paper_author_df.withColumn("authors", trim(paper_author_df.authors))
paper_author_df = paper_author_df.withColumn("paper_id", trim(paper_author_df.paper_id))

### change data type for paper_id to Integer
paper_author_df = paper_author_df.withColumn("paper_id",paper_author_df["paper_id"].cast(IntegerType()))

paper_author_df.show()
paper_author_df.printSchema()

+--------------------+--------+
|             authors|paper_id|
+--------------------+--------+
| K Devine;F J. Smith|      65|
|J Wolff von Guden...|     130|
|J. K. Reid;A. Jen...|     195|
|William G. Golson...|     260|
|    Stein Schjolberg|     325|
|W Ian Gasarch;Ste...|     390|
|Sam Toueg;Özalp B...|     455|
|Frederick H. Dill...|     520|
|A. R. Calderbank;...|     585|
|         Uzi Vishkin|     650|
|      Stephen S. Yau|     715|
|Michael D. Schroe...|     780|
|         S L. Graham|     845|
|D Maio;M R. Scala...|     910|
|         Pamela Zave|     975|
|G. Salton;E. Voor...|    1040|
|Douglas D. Dunlop...|    1105|
|Patrick Peruch;Vi...|    1170|
| Robert J. Sternberg|    1235|
|Curtis Roads;John...|    1300|
+--------------------+--------+
only showing top 20 rows

root
 |-- authors: string (nullable = true)
 |-- paper_id: integer (nullable = true)



In [109]:
### remove special letters
paper_author_df=paper_author_df.withColumn('authors', translate('authors', 'íîìïīį', 'i'))
paper_author_df=paper_author_df.withColumn('authors', translate('authors', 'ÎÏÍĪĮÌ', 'I'))
paper_author_df=paper_author_df.withColumn('authors', translate('authors', 'àáâäæãåā', 'a'))
paper_author_df=paper_author_df.withColumn('authors', translate('authors', 'ÀÁÂÄÆÃÅĀ', 'A'))
paper_author_df=paper_author_df.withColumn('authors', translate('authors', 'èéêëēėę', 'e'))
paper_author_df=paper_author_df.withColumn('authors', translate('authors', 'ÈÉÊËĒĖĘ', 'E'))
paper_author_df=paper_author_df.withColumn('authors', translate('authors', 'ûüùúū', 'u'))
paper_author_df=paper_author_df.withColumn('authors', translate('authors', 'ÛÜÙÚŪ', 'U'))
paper_author_df=paper_author_df.withColumn('authors', translate('authors', 'ÔÖÒÓŒØŌÕ', 'O'))
paper_author_df=paper_author_df.withColumn('authors', translate('authors', 'Ÿ', 'Y'))

In [110]:
null_values_paper_authors=paper_author_df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in paper_author_df.columns]
   )

In [111]:
### check if authors are missing as well for the ids whose title was missing in paper_df
###for rows in paper_author_df.select("authors","paper_id").collect():
###    if rows[1] in null_paper_ids_list:
###        print(rows[0], rows[1])

                                                                                

NameError: name 'null_paper_ids_list' is not defined

In [121]:
### split authors so we can have clean data and seperate records {paper_id; author}
unique_paper_author_df = paper_author_df.select(F.col("paper_id"), F.explode(F.split(F.col("authors"),";")).alias("author"))
unique_paper_author_df.show(20, False)
paper_author_df.show(20, False)

+--------+---------------------+
|paper_id|author               |
+--------+---------------------+
|65      |K Devine             |
|65      |F J. Smith           |
|130     |J Wolff von Gudenberg|
|195     |J. K. Reid           |
|195     |A. Jennings          |
|260     |William G. Golson    |
|260     |William C. Rounds    |
|325     |Stein Schjolberg     |
|390     |W Ian Gasarch        |
|390     |Steven Homer         |
|455     |Sam Toueg            |
|455     |zalp Babaoğlu        |
|520     |Frederick H. Dill    |
|520     |Satish Gupta         |
|520     |Daniel T. Ling       |
|520     |Richard E. Matick    |
|585     |A. R. Calderbank     |
|585     |E. G. Coffman, Jr.   |
|585     |L. Flatto            |
|650     |Uzi Vishkin          |
+--------+---------------------+
only showing top 20 rows

+---------------------------------------------------------------+--------+
|authors                                                        |paper_id|
+-------------------------------

In [122]:
### remove leadind and trailing spaces
unique_paper_author_df = unique_paper_author_df.withColumn("author", trim(unique_paper_author_df.author))
unique_paper_author_df = unique_paper_author_df.withColumn("paper_id", trim(unique_paper_author_df.paper_id))
### change data type for paper_id to Integer
unique_paper_author_df = unique_paper_author_df.withColumn("paper_id",unique_paper_author_df["paper_id"].cast(IntegerType()))


In [128]:
### remove special characters
unique_paper_author_df=unique_paper_author_df.withColumn('author', regexp_replace('author', '"', ''))
unique_paper_author_df=unique_paper_author_df.withColumn('author', regexp_replace('author', ';', ''))
unique_paper_author_df=unique_paper_author_df.withColumn('author', regexp_replace('author', ':', ''))
unique_paper_author_df=unique_paper_author_df.withColumn('author', regexp_replace('author', '\}', ''))
unique_paper_author_df=unique_paper_author_df.withColumn('author', regexp_replace('author', '\{', ''))
unique_paper_author_df=unique_paper_author_df.withColumn('author', regexp_replace('author', '\~', ''))
unique_paper_author_df=unique_paper_author_df.withColumn('author', regexp_replace('author', '\{', ''))
unique_paper_author_df=unique_paper_author_df.withColumn('author', regexp_replace('author', '\{', ''))
unique_paper_author_df=unique_paper_author_df.withColumn('author', regexp_replace('author', '\/', ''))

In [19]:
unique_paper_author_df.printSchema()

root
 |-- paper_id: string (nullable = true)
 |-- author: string (nullable = true)



In [60]:
### check for duplicate rows:
unique_paper_author_df.groupby(['paper_id', 'author']).count().where('count > 1').sort('count', ascending=False).show()

22/01/17 18:28:55 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/17 18:28:55 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/17 18:28:55 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/17 18:28:56 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/17 18:29:20 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/17 18:29:20 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/17 18:29:20 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/17 18:29:20 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/17 18:29:35 WARN RowBasedKeyValueBatch: Calling spill() on

+--------+-----------------+-----+
|paper_id|           author|count|
+--------+-----------------+-----+
| 1523221|     Dongkun Shin|    4|
| 2059316|    Han Chuanfeng|    3|
| 1202294|        N. Sharma|    3|
| 2040206|     Anchun Cheng|    3|
| 2040206|     Mingshu Wang|    3|
| 2042230|          Lu Leng|    3|
| 1864850| Pauline C. Reich|    2|
| 1198485|  Nedeljko Cvejic|    2|
| 1989466|    Morris Riedel|    2|
| 1586139|     Hector Zenil|    2|
| 1198797| Alladi Venkatesh|    2|
| 1434302|        Meir Russ|    2|
| 1167730|  Max A. Woodbury|    2|
| 1071070| Thorbjrn Knudsen|    2|
| 1966201|           Bo Liu|    2|
| 1612982|         A. Klemm|    2|
| 1297077|           Bei Yu|    2|
| 1443917|      Lingli Zhao|    2|
| 1947014| Steven Warburton|    2|
|  581650|J. Howard Johnson|    2|
+--------+-----------------+-----+
only showing top 20 rows



                                                                                

In [61]:
### drop duplicate rows since here we need unique paper-author relation
unique_paper_author_df=unique_paper_author_df.dropDuplicates()

In [62]:
unique_paper_author_df.show(truncate=False)

[Stage 81:>                                                         (0 + 1) / 1]

+--------+-------------------+
|paper_id|author             |
+--------+-------------------+
|1117    |Benjamin Kuipers   |
|1574    |J. G. Brookshear   |
|1707    |C. Ghezzi          |
|1829    |Peter M. Stephan   |
|2080    |Matthew L. Ginsberg|
|2222    |Nissim Francez     |
|2615    |Dan Benanav        |
|2872    |Trevor J. Bentley  |
|2996    |Martin T. Sullivan |
|3185    |William B. Robinson|
|3261    |Guy Lapalme        |
|3584    |L. Egghe           |
|4362    |C-T Liou           |
|4369    |Y-C Chen           |
|4424    |Ron M Roth         |
|5735    |D Eyre             |
|5860    |S Makridakis       |
|6181    |W J Baggaley       |
|6247    |Ravi B Boppana     |
|6700    |Tomas Hirschfeld   |
+--------+-------------------+
only showing top 20 rows



                                                                                

# Load and clean Publication_venues df

In [None]:
### load publication_venues into schema
dtypes = pd.read_csv('./schemas/publication_venues.csv').to_records(index=False).tolist()
print(dtypes)
fields = [StructField(dtype[0], globals()[f'{dtype[1]}Type']()) for dtype in dtypes]
schema = StructType(fields)
publication_venue_df = spark.read.option('header', 'true').csv('./assets/parsedData/publication_venues.csv', header=True, schema=schema)

In [None]:
publication_venue_df = publication_venue_df.withColumn("publication_venue", trim(publication_venue_df.publication_venue))
publication_venue_df = publication_venue_df.withColumn("paper_id", trim(publication_venue_df.paper_id))
publication_venue_df = publication_venue_df.withColumn("paper_id",publication_venue_df["paper_id"].cast(IntegerType()))
publication_venue_df.show()

In [None]:
null_values_publication_venue=publication_venue_df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in publication_venue_df.columns]
   )

In [None]:
publication_venue_df.filter(publication_venue_df['publication_venue'].isNull()).show()

# Load and clean Citations df

In [None]:
### load affiliation into schema
dtypes = pd.read_csv('./schemas/citations.csv').to_records(index=False).tolist()
print(dtypes)
fields = [StructField(dtype[0], globals()[f'{dtype[1]}Type']()) for dtype in dtypes]
schema = StructType(fields)
citation_df = spark.read.option('header', 'true').csv('./assets/parsedData/citations.csv', header=True, schema=schema)

In [None]:
citation_df.show()

In [None]:
### remove leading and trailing spaces
citation_df = citation_df.withColumn("ref_ids", trim(citation_df.ref_ids))
citation_df = citation_df.withColumn("paper_id", trim(citation_df.paper_id))
### change data type of paper_id to Integer
citation_df = citation_df.withColumn("paper_id",citation_df["paper_id"].cast(IntegerType()))
citation_df.show()

In [None]:
### check for duplicate rows
citation_df.groupby(['paper_id', 'ref_ids']).count().where('count > 1').sort('count', ascending=False).show()

In [None]:
### split citations so we can have clean data and seperate records {paper_id; ref_id}
unique_citation_df = citation_df.select(F.col("paper_id"), F.explode(F.split(F.col("ref_ids"),";")).alias("ref_id"))
unique_citation_df.show(20, False)
citation_df.show(20, False)

In [None]:
### remove leading and trailing spaces
unique_citation_df = unique_citation_df.withColumn("ref_id", trim(unique_citation_df.ref_id))
unique_citation_df = unique_citation_df.withColumn("paper_id", trim(unique_citation_df.paper_id))
### change datat type of ref_id to Integer
unique_citation_df = unique_citation_df.withColumn("ref_id",unique_citation_df["ref_id"].cast(IntegerType()))

In [None]:
unique_citation_df.printSchema()

In [None]:
### check for duplicate rows
unique_citation_df.groupby(['paper_id', 'ref_id']).count().where('count > 1').sort('count', ascending=False).show()

# Load and clean Author df

In [21]:
### load author into schema
dtypes = pd.read_csv('./schemas/author.csv').to_records(index=False).tolist()
print(dtypes)
fields = [T.StructField(dtype[0], globals()[f'{dtype[1]}Type']()) for dtype in dtypes]
schema = StructType(fields)
author_df = spark.read.option('header', 'true').csv('./assets/parsedData/authors.csv', header=True, schema=schema)

[('author_id', 'Integer'), ('citation_count', 'Integer'), ('h_index', 'Integer'), ('name', 'String'), ('paper_count', 'Integer')]


In [22]:
author_df.printSchema()

root
 |-- author_id: integer (nullable = true)
 |-- citation_count: integer (nullable = true)
 |-- h_index: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- paper_count: integer (nullable = true)



In [23]:
### remove spaces from values of the columns
author_df = author_df.withColumn("author_id", trim(author_df.author_id))
author_df = author_df.withColumn("citation_count", trim(author_df.citation_count))
author_df = author_df.withColumn("h_index", trim(author_df.h_index))
author_df = author_df.withColumn("name", trim(author_df.name))
author_df = author_df.withColumn("paper_count", trim(author_df.paper_count))

In [24]:
### change data type of author_id, paper_count, citation_count, h_index to Integer
author_df = author_df.withColumn("author_id",author_df["author_id"].cast(IntegerType()))
author_df = author_df.withColumn("citation_count",author_df["citation_count"].cast(IntegerType()))
author_df = author_df.withColumn("h_index",author_df["h_index"].cast(IntegerType()))
author_df = author_df.withColumn("paper_count",author_df["paper_count"].cast(IntegerType()))

In [25]:
author_df.show(20,False)

+---------+--------------+-------+-------------------------+-----------+
|author_id|citation_count|h_index|name                     |paper_count|
+---------+--------------+-------+-------------------------+-----------+
|17       |0             |0      |J. Michael Howe          |1          |
|34       |0             |0      |Haitham Gabr             |2          |
|51       |4             |1      |Emma Tonkin              |8          |
|68       |1             |1      |Woochul Shin             |4          |
|85       |0             |0      |S Improta                |1          |
|102      |8             |2      |Richard Ferri            |5          |
|119      |0             |0      |Qing Liu                 |1          |
|136      |0             |0      |Artur Gramacki           |2          |
|153      |0             |0      |Olumuyiwa Oluwasanmi     |2          |
|170      |0             |0      |Josef Willenborg         |1          |
|187      |0             |0      |Qing Wei         

In [None]:
### check for nonsense null data
null_values_author_df = author_df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in author_df.columns]
   )
null_values_author_df.show()

In [26]:
### Decided to drop rows whose author--name is missing (2 authors)
### At the moment we can evaluate precomputed paper_count and citation_count only if we have the author_names

author_df=author_df.na.drop(how="any", subset=['name'])

In [27]:
author_df.printSchema()

root
 |-- author_id: integer (nullable = true)
 |-- citation_count: integer (nullable = true)
 |-- h_index: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- paper_count: integer (nullable = true)



In [28]:
### fill empty paper_count, citation_count, h_index to 0   (just one author)
author_df=author_df.na.fill(value=0, subset='paper_count')
author_df=author_df.na.fill(value=0, subset='citation_count')
author_df=author_df.na.fill(value=0, subset='h_index')

In [None]:
### check for nonsense null data
null_values_author_df = author_df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in author_df.columns]
   )
null_values_author_df.show()

In [53]:
### We noticed that there are many similar names using different symbols/characters like the example below {Antonio García, Antonio Garcia}
author_df.filter(author_df['name'].like("%Ö%")).show(20,False)

### remove special characters like í, â, é
from pyspark.sql.functions import regexp_replace
author_df=author_df.withColumn('name', translate('name', 'íîìïīį', 'i'))
author_df=author_df.withColumn('name', translate('name', 'ÎÏÍĪĮÌ', 'I'))
author_df=author_df.withColumn('name', translate('name', 'àáâäæãåā', 'a'))
author_df=author_df.withColumn('name', translate('name', 'ÀÁÂÄÆÃÅĀ', 'A'))
author_df=author_df.withColumn('name', translate('name', 'èéêëēėę', 'e'))
author_df=author_df.withColumn('name', translate('name', 'ÈÉÊËĒĖĘ', 'E'))
author_df=author_df.withColumn('name', translate('name', 'ûüùúū', 'u'))
author_df=author_df.withColumn('name', translate('name', 'ÛÜÙÚŪ', 'U'))
author_df=author_df.withColumn('name', translate('name', 'ÔÖÒÓŒØŌÕ', 'O'))
author_df=author_df.withColumn('name', translate('name', 'Ÿ', 'Y')) 




+---------+--------------+-------+----+-----------+
|author_id|citation_count|h_index|name|paper_count|
+---------+--------------+-------+----+-----------+
+---------+--------------+-------+----+-----------+





In [145]:
### remove special characters
author_df=author_df.withColumn('name', regexp_replace('name', '"', ''))
author_df=author_df.withColumn('name', regexp_replace('name', ';', ''))
author_df=author_df.withColumn('name', regexp_replace('name', ':', ''))
author_df=author_df.withColumn('name', regexp_replace('name', '\}', ''))
author_df=author_df.withColumn('name', regexp_replace('name', '\{', ''))
author_df=author_df.withColumn('name', regexp_replace('name', '\~', ''))
author_df=author_df.withColumn('name', regexp_replace('name', '\{', ''))
author_df=author_df.withColumn('name', regexp_replace('name', '\{', ''))
author_df=author_df.withColumn('name', regexp_replace('name', '\/', ''))



+---------+--------------+-------+------------------+-----------+
|author_id|citation_count|h_index|name              |paper_count|
+---------+--------------+-------+------------------+-----------+
|1021498  |0             |0      |Osman Og~uz       |1          |
|898419   |0             |0      |David Ria~{n}o    |1          |
|1248891  |0             |0      |A. Del~Bimbo      |1          |
|898713   |0             |0      |David Ria~{n}o    |1          |
|1315421  |3             |1      |Katia S. Guimar~es|1          |
|49348    |1             |1      |S. H. ~Son        |1          |
|1430854  |0             |0      |Jo~ao Sequeira    |1          |
|1519119  |8             |1      |M. ~Fujita        |1          |
+---------+--------------+-------+------------------+-----------+





In [None]:
### check if there are duplicate author_ids
author_df.groupby(['name']).count().where('count > 1').sort('count', ascending=False)

In [151]:
duplicated_authors=author_df.groupby(['name'])
unique_duplicated_authors=duplicated_authors.agg(
    round(F.avg("paper_count")).alias("paper_count"),
    round(F.avg("citation_count")).alias("citation_count"),
    round(F.avg("h_index")).alias("h_index"))

In [153]:
unique_duplicated_authors.count()

                                                                                

1287001

# Load and Research_interests in df

In [None]:
### load research_interests into schema
dtypes = pd.read_csv('./schemas/research_interests.csv').to_records(index=False).tolist()
print(dtypes)
fields = [T.StructField(dtype[0], globals()[f'{dtype[1]}Type']()) for dtype in dtypes]
schema = StructType(fields)
research_interests_df = spark.read.option('header', 'true').csv('./assets/parsedData/research_interests.csv', header=True, schema=schema)

In [None]:
### remove leading and trailing spaces
research_interests_df = research_interests_df.withColumn("author_id", trim(research_interests_df.author_id))
research_interests_df = research_interests_df.withColumn("research_interests", trim(research_interests_df.research_interests))

### change data type to Integer for author_id
research_interests_df = research_interests_df.withColumn("author_id",research_interests_df["author_id"].cast(IntegerType()))

research_interests_df.printSchema()
research_interests_df.show()

In [None]:
### check for null values in the affiliations column
research_interests_df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in research_interests_df.columns]).show()
### drop null values since we dont need research_interests for any computation
research_interests_df=research_interests_df.na.drop(how="any", subset=['research_interests'])


In [None]:
### split affiliations so we can have clean data and seperate records {paper_id; affiliations}
unique_research_interests_df = research_interests_df.select(F.col("author_id"), F.explode(F.split(F.col("research_interests"),";")).alias("research_interest"))
unique_research_interests_df.show(20, False)
research_interests_df.show(20, False)


In [None]:
### remove leading and trailing spaces
unique_research_interests_df = unique_research_interests_df.withColumn("author_id", trim(unique_research_interests_df.author_id))
unique_research_interests_df = unique_research_interests_df.withColumn("research_interest", trim(unique_research_interests_df.research_interests))

### change data type to Integer for author_id
unique_research_interests_df = unique_research_interests_df.withColumn("author_id",unique_research_interests_df["author_id"].cast(IntegerType()))

In [None]:
unique_research_interests_df.show()

In [None]:
### check for duplicate rows:
unique_research_interests_df.groupby(['author_id', 'research_interest']).count().where('count > 1').sort('count', ascending=False).show()
### drop duplicates
unique_research_interests_df=unique_research_interests_df.dropDuplicates()


In [None]:
unique_research_interests_df.show()