In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create (or reuse) the SparkSession used by every cell below.
spark = (
    SparkSession.builder
    .appName('read data through spark')
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/01/15 19:53:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/01/15 19:53:36 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/01/15 19:53:36 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [3]:
# Evaluate the session object as the last expression so the notebook displays it.
spark

In [4]:
import pandas as pd
from pyspark.sql.types import *
from pyspark.sql.functions import *
import pyspark.sql.types as T
from pyspark.sql.types import IntegerType

In [5]:
### load paper into schema
# Each row of the schema CSV is a (column_name, type_prefix) pair, e.g. ('year', 'Integer').
dtypes = pd.read_csv('./schemas/paper.csv').to_records(index=False).tolist()
print(dtypes)
# Resolve '<prefix>Type' on the pyspark.sql.types module (imported as T) rather than
# through globals(): the lookup then no longer depends on the wildcard imports or on
# whatever other names happen to exist in the notebook namespace.
fields = [T.StructField(spec[0], getattr(T, f'{spec[1]}Type')()) for spec in dtypes]
schema = T.StructType(fields)
# header=True already marks the first line as the header, so the duplicate
# .option('header', 'true') is dropped.
paper_df = spark.read.csv('./assets/parsedData/papers.csv', header=True, schema=schema)

[('paper_id', 'Integer'), ('ref_ids', 'String'), ('title', 'String'), ('year', 'Integer')]


In [6]:
# Preview the first 20 rows, with long cell values truncated (the defaults of show()).
paper_df.show(n=20, truncate=True)

[Stage 0:>                                                          (0 + 1) / 1]                                                                                

+--------+--------------------+--------------------+----+
|paper_id|             ref_ids|               title|year|
+--------+--------------------+--------------------+----+
|      65|                null|Direct file organ...|1984|
|     130|                null|An introduction t...|1983|
|     195|317424;317425;317573|On solving almost...|1984|
|     260|                null|Connections betwe...|1984|
|     325|                null|Computers and pen...|1984|
|     390|                null|Relativizations c...|1984|
|     455|                null|On the optimum ch...|1984|
|     520|       318368;323493|All points addres...|1984|
|     585|                null|Optimum Head Sepa...|1984|
|     650|                null|A parallel-design...|1984|
|     715|                null|Computer - IEEE C...|1984|
|     780|318420;319233;319...|Experience with G...|1984|
|     845|                null|Code generation a...|1984|
|     910|                null|On estimating acc...|1984|
|     975|6760

In [7]:
### data cleaning for paper schema

### strip leading/trailing spaces from the values of every column
### (note: trim() yields string columns, so paper_id and year become strings here)
for column_name in ("paper_id", "ref_ids", "title", "year"):
    paper_df = paper_df.withColumn(column_name, trim(col(column_name)))

In [8]:
### check for the data types
paper_df.printSchema()
### trimming left every column as a string, so restore the numeric columns:
### change the data type of year AND paper_id back to Integer
### (paper_id is cast too so it matches the integer paper_id of the other dataframes)
paper_df = (
    paper_df
    .withColumn("paper_id", paper_df["paper_id"].cast(IntegerType()))
    .withColumn("year", paper_df["year"].cast(IntegerType()))
)

root
 |-- paper_id: string (nullable = true)
 |-- ref_ids: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: string (nullable = true)



In [9]:
### check for nonsense null data
# Count, per column, the values that are NaN or null.
null_values_paper_df = paper_df.select(
    [count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in paper_df.columns]
)
# Spark is lazy: without an action the counts are never computed or displayed.
null_values_paper_df.show()
### save the ids of papers whose title is missing to clean up the other dataframes
null_paper_ids = paper_df.filter(paper_df['title'].isNull())
# paper_id may still be a string at this point, so normalise every id to a Python int.
null_paper_ids_list = [int(row['paper_id']) for row in null_paper_ids.select('paper_id').collect()]



+--------+-------+-----+----+
|paper_id|ref_ids|title|year|
+--------+-------+-----+----+
|       0|1233568|   24| 157|
+--------+-------+-----+----+



                                                                                

In [20]:
### after checking the dataframes below: every paper whose title is missing still has authors, except paper_id = 748056
### decision: fill missing titles with 'Missing Title'

# show() returns None, so it must not be chained onto the assignment;
# otherwise paper_df would be overwritten with None.
paper_df = paper_df.na.fill('Missing Title', ['title'])
paper_df.show()

+--------+--------------------+--------------------+----+
|paper_id|             ref_ids|               title|year|
+--------+--------------------+--------------------+----+
|      65|                null|Direct file organ...|1984|
|     130|                null|An introduction t...|1983|
|     195|317424;317425;317573|On solving almost...|1984|
|     260|                null|Connections betwe...|1984|
|     325|                null|Computers and pen...|1984|
|     390|                null|Relativizations c...|1984|
|     455|                null|On the optimum ch...|1984|
|     520|       318368;323493|All points addres...|1984|
|     585|                null|Optimum Head Sepa...|1984|
|     650|                null|A parallel-design...|1984|
|     715|                null|Computer - IEEE C...|1984|
|     780|318420;319233;319...|Experience with G...|1984|
|     845|                null|Code generation a...|1984|
|     910|                null|On estimating acc...|1984|
|     975|6760

In [11]:
# Print the ids of the papers whose title is missing (collected in an earlier cell).
print(null_paper_ids_list)

[1198146, 1473881, 1446907, 1556693, 1231174, 1447827, 1640034, 1041645, 1374774, 1214485, 1503936, 1491263, 1639073, 748056, 594463, 1556793, 1567064, 1639803, 1739775, 1309608, 1878685, 1446891, 1600165, 1755580]


In [12]:
### load affiliation into schema
# Each row of the schema CSV is a (column_name, type_prefix) pair, e.g. ('paper_id', 'Integer').
dtypes = pd.read_csv('./schemas/affiliation.csv').to_records(index=False).tolist()
print(dtypes)
# Resolve '<prefix>Type' on the pyspark.sql.types module (imported as T) rather than
# through globals(), which only works because of the wildcard imports.
fields = [T.StructField(spec[0], getattr(T, f'{spec[1]}Type')()) for spec in dtypes]
schema = T.StructType(fields)
# header=True is sufficient; the duplicate .option('header', 'true') is dropped.
affiliation_df = spark.read.csv('./assets/parsedData/affiliations.csv', header=True, schema=schema)

[('affiliations', 'String'), ('paper_id', 'Integer')]


In [17]:
# Strip surrounding whitespace from both columns; trimming turns paper_id into a
# string, so it is cast back to an integer in the same step.
affiliation_df = (
    affiliation_df
    .withColumn("affiliations", trim(col("affiliations")))
    .withColumn("paper_id", trim(col("paper_id")).cast(IntegerType()))
)

# Confirm the resulting column types, then preview the rows.
affiliation_df.printSchema()
affiliation_df.show()

root
 |-- affiliations: string (nullable = true)
 |-- paper_id: integer (nullable = true)

+--------------------+--------+
|        affiliations|paper_id|
+--------------------+--------+
|The Queen's Unive...|      65|
|Univ. of Karlsruh...|     130|
|AERE Harwell Labo...|     195|
|University of Mic...|     260|
|Oslo politikammer...|     325|
|Harvard Univ., Ca...|     390|
|Cornell Univ., It...|     455|
|IBM General Techn...|     520|
|               -;-;-|     585|
|New York Univ., N...|     650|
|                   -|     715|
|Xerox Palo Alto R...|     780|
|Univ. of Californ...|     845|
|University of Bol...|     910|
|AT & T Bell Labor...|     975|
|Cornell Univ., It...|    1040|
|University of Mar...|    1105|
|Laboratoire de Ps...|    1170|
|Yale Univ., New H...|    1235|
|                 -;-|    1300|
+--------------------+--------+
only showing top 20 rows



In [26]:
### check for null values in the affiliations column
# Count, per column, the values that are NaN or null.
null_values_affiliations = affiliation_df.select(
    [count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in affiliation_df.columns]
)
# Spark is lazy: without an action the counts are never computed or displayed.
null_values_affiliations.show()



+------------+--------+
|affiliations|paper_id|
+------------+--------+
|       37499|       0|
+------------+--------+



                                                                                

In [33]:
### This df is used to count papers per unique affiliation, so a row without an
### affiliation carries no information for that purpose.
### drop all rows where affiliation is null

affiliation_df = affiliation_df.na.drop(
    how="any",
    subset=['affiliations'],
)

In [37]:
# Inspect rows whose affiliations contain a '-' character (this matches the bare '-'
# entries as well as hyphenated names). take(20) brings only a sample to the driver;
# collect() would pull every matching row and flood the cell output.
affiliation_df.filter(affiliation_df.affiliations.contains('-')).take(20)

                                                                                

[Row(affiliations='-;-;-', paper_id=585),
 Row(affiliations='-', paper_id=715),
 Row(affiliations='-;-', paper_id=1300),
 Row(affiliations='-', paper_id=1430),
 Row(affiliations='-', paper_id=1495),
 Row(affiliations='-', paper_id=1625),
 Row(affiliations='-;-;-', paper_id=1690),
 Row(affiliations='Univ. of Illinois at Urbana-Champaign, Urbana;Massachusetts Institute of Technology, Cambridge', paper_id=2795),
 Row(affiliations='-', paper_id=2860),
 Row(affiliations='Mihailo Pupin Institute, Belgrade, Yugoslvia;Carnegie-Mellon Univ., Pittsburgh, PA;Intel Corp., Aloha, OR', paper_id=2990),
 Row(affiliations='Boston Univ., Boston, MA;-', paper_id=3185),
 Row(affiliations='NCR Corporation;-', paper_id=3315),
 Row(affiliations='-', paper_id=3575),
 Row(affiliations='-', paper_id=3705),
 Row(affiliations='-;-', paper_id=4095),
 Row(affiliations='-', paper_id=4160),
 Row(affiliations='Univ. of Alabama, Birmingham;-;-', paper_id=4290),
 Row(affiliations='-', paper_id=4485),
 Row(affiliations='

In [34]:
### check if affiliations are missing as well for the ids whose title was missing in paper_df
# Filter inside Spark so only the matching rows reach the driver, instead of collecting
# the whole dataframe and testing list membership row by row in Python.
matching_rows = (
    affiliation_df
    .select("affiliations", "paper_id")
    .filter(col("paper_id").isin(null_paper_ids_list))
    .collect()
)
for row in matching_rows:
    print(row[0], row[1])

                                                                                

-;-;-;-;- 1198146
-;-;-;-;- 1473881
-;-;- 1446907
-;-;-;- 1556693
-;-;-;- 1231174
-;-;-;-;-;-;- 1447827
-;- 1640034
-;-;-;-;-;-;-;-;-;- 1041645
-;-;- 1374774
-;-;-;-;-;- 1214485
-;-;-;-;-;- 1503936
-;-;-;-;-;-;-;-;-;-;-;- 1491263
-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;- 1639073
-;- 594463
-;-;-;- 1556793
-;-;-;-;-;-;-;-;-;- 1567064
-;-;- 1639803
-;-;- 1739775
-;-;-;- 1309608
-;-;- 1878685
-;-;- 1446891
-;-;- 1600165
-;-;-;-;- 1755580


In [14]:
### load paper_authors into schema
# Each row of the schema CSV is a (column_name, type_prefix) pair, e.g. ('paper_id', 'Integer').
dtypes = pd.read_csv('./schemas/paper_authors.csv').to_records(index=False).tolist()
print(dtypes)
# Resolve '<prefix>Type' on the pyspark.sql.types module (imported as T) rather than
# through globals(), which only works because of the wildcard imports.
fields = [T.StructField(spec[0], getattr(T, f'{spec[1]}Type')()) for spec in dtypes]
schema = T.StructType(fields)
# header=True is sufficient; the duplicate .option('header', 'true') is dropped.
paper_author_df = spark.read.csv('./assets/parsedData/paper_authors.csv', header=True, schema=schema)

[('authors', 'String'), ('paper_id', 'Integer')]


In [15]:
# Strip surrounding whitespace from both columns; trimming turns paper_id into a
# string, so it is cast back to an integer in the same step.
paper_author_df = (
    paper_author_df
    .withColumn("authors", trim(col("authors")))
    .withColumn("paper_id", trim(col("paper_id")).cast(IntegerType()))
)

# Preview the rows, then confirm the resulting column types.
paper_author_df.show()
paper_author_df.printSchema()

+--------------------+--------+
|             authors|paper_id|
+--------------------+--------+
| K Devine;F J. Smith|      65|
|J Wolff von Guden...|     130|
|J. K. Reid;A. Jen...|     195|
|William G. Golson...|     260|
|    Stein Schjolberg|     325|
|W Ian Gasarch;Ste...|     390|
|Sam Toueg;Özalp B...|     455|
|Frederick H. Dill...|     520|
|A. R. Calderbank;...|     585|
|         Uzi Vishkin|     650|
|      Stephen S. Yau|     715|
|Michael D. Schroe...|     780|
|         S L. Graham|     845|
|D Maio;M R. Scala...|     910|
|         Pamela Zave|     975|
|G. Salton;E. Voor...|    1040|
|Douglas D. Dunlop...|    1105|
|Patrick Peruch;Vi...|    1170|
| Robert J. Sternberg|    1235|
|Curtis Roads;John...|    1300|
+--------------------+--------+
only showing top 20 rows

root
 |-- authors: string (nullable = true)
 |-- paper_id: integer (nullable = true)



In [38]:
# Count, per column, the values that are NaN or null.
null_values_paper_authors = paper_author_df.select(
    [count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in paper_author_df.columns]
)
# Spark is lazy: without an action the counts are never computed or displayed.
null_values_paper_authors.show()



+-------+--------+
|authors|paper_id|
+-------+--------+
|  37506|       0|
+-------+--------+





In [16]:
### check if authors are missing as well for the ids whose title was missing in paper_df
# Filter inside Spark so only the matching rows reach the driver, instead of collecting
# the whole dataframe and testing list membership row by row in Python.
matching_rows = (
    paper_author_df
    .select("authors", "paper_id")
    .filter(col("paper_id").isin(null_paper_ids_list))
    .collect()
)
for row in matching_rows:
    print(row[0], row[1])

                                                                                

Andrew M. Waterhouse;James B. Procter;David M. A. Martin;Michèle Clamp;Geoffrey J. Barton 1198146
Susanne Balzer;Ketil Malde;Anders Lanzén;Animesh Sharma;Inge Jonassen 1473881
Christoph Müssel;Martin Hopfensitz;Hans A. Kestler 1446907
Vijay Garla;Yong Kong;Sebastian Szpakowski;Michael Krauthammer 1556693
Tianwei Yu;Youngja Park;Jennifer M. Johnson;Dean P. Jones 1231174
Juliane Liepe;Chris Barnes;Erika Cule;Kamil Erguler;Paul Kirk;Tina Toni;Michael P.H. Stumpf 1447827
Wei-Po Lee;Yu-Ting Hsiao 1640034
Robert Ghanea-Hercock;E. Gelenbe;Nicholas R. Jennings;Oliver Smith;David N. Allsopp;Alex Healing;Hakan Duman;Simon Sparks;Nishan C. Karunatillake;Perukrishnen Vytelingum 1041645
Kosaku Shinoda;Masaru Tomita;Yasushi Ishihama 1374774
Tal Vider-Shalit;Ronit Sarid;Kobi Maman;Lea Tsaban;Ran Levi;Yoram Louzoun 1214485
Sergii Ivakhno;Tom Royce;Anthony J. Cox;Dirk J. Evers;R. Keira Cheetham;Simon Tavaré 1503936
Ravi Shankar;Helen Parkinson;Tony Burdett;Emma Hastings;Junmin Liu;Michael Miller;Rashmi



+-------+--------+
|authors|paper_id|
+-------+--------+
|  37506|       0|
+-------+--------+



                                                                                

In [22]:
### load publication_venues into schema
# Each row of the schema CSV is a (column_name, type_prefix) pair, e.g. ('paper_id', 'Integer').
dtypes = pd.read_csv('./schemas/publication_venues.csv').to_records(index=False).tolist()
print(dtypes)
# Resolve '<prefix>Type' on the pyspark.sql.types module (imported as T) rather than
# through globals(), which only works because of the wildcard imports.
fields = [T.StructField(spec[0], getattr(T, f'{spec[1]}Type')()) for spec in dtypes]
schema = T.StructType(fields)
# header=True is sufficient; the duplicate .option('header', 'true') is dropped.
publication_venue_df = spark.read.csv('./assets/parsedData/publication_venues.csv', header=True, schema=schema)

[('paper_id', 'Integer'), ('publication_venue', 'String')]


In [23]:
# Strip surrounding whitespace from both columns; trimming turns paper_id into a
# string, so it is cast back to an integer in the same step.
publication_venue_df = (
    publication_venue_df
    .withColumn("publication_venue", trim(col("publication_venue")))
    .withColumn("paper_id", trim(col("paper_id")).cast(IntegerType()))
)
publication_venue_df.show()

+--------+--------------------+
|paper_id|   publication_venue|
+--------+--------------------+
|      65|Information Techn...|
|     130|Proc. of the symp...|
|     195|ACM Transactions ...|
|     260|Information and C...|
|     325|Computers and pen...|
|     390|Information and C...|
|     455|SIAM Journal on C...|
|     520|IBM Journal of Re...|
|     585|Journal of the AC...|
|     650|Theoretical Compu...|
|     715|            Computer|
|     780|ACM Transactions ...|
|     845|Methods and tools...|
|     910|Information Proce...|
|     975|ACM Transactions ...|
|    1040|Information Proce...|
|    1105|ACM Transactions ...|
|    1170|Proc. of the 2nd ...|
|    1235|Proc. of the inte...|
|    1300|Foundations of co...|
+--------+--------------------+
only showing top 20 rows



In [39]:
# Count, per column, the values that are NaN or null.
null_values_publication_venue = publication_venue_df.select(
    [count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in publication_venue_df.columns]
)
# Spark is lazy: without an action the counts are never computed or displayed.
null_values_publication_venue.show()



+--------+-----------------+
|paper_id|publication_venue|
+--------+-----------------+
|       0|              148|
+--------+-----------------+





In [40]:
# Preview the rows that have no publication venue.
venue_is_missing = col("publication_venue").isNull()
publication_venue_df.filter(venue_is_missing).show()

[Stage 50:>                                                         (0 + 1) / 1]

+--------+-----------------+
|paper_id|publication_venue|
+--------+-----------------+
|  109525|             null|
|  987870|             null|
| 1065415|             null|
| 1555515|             null|
|  144171|             null|
|  804766|             null|
| 1032591|             null|
| 1986272|             null|
| 1013548|             null|
| 1054433|             null|
| 1785618|             null|
|  993855|             null|
| 1444435|             null|
|  893041|             null|
|  947251|             null|
|  638112|             null|
| 1074067|             null|
| 1375797|             null|
| 1376577|             null|
|  861258|             null|
+--------+-----------------+
only showing top 20 rows



                                                                                