In [8]:
from pyspark.sql import SparkSession

In [10]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('read data through spark')
    .getOrCreate()
)

In [12]:
# Echo the session object so the notebook displays it.
spark

In [37]:
# Load the authors CSV: the first line is the header, column types are inferred.
authors_pyspark = spark.read.csv(
    './assets/parsedData/authors.csv',
    header=True,
    inferSchema=True,
)

                                                                                

In [38]:
# Preview the authors table (long cell values are truncated in the printout).
authors_pyspark.show()

+-----------------+--------------------+----------+---------+--------------------+
|             name|        affiliations|pub_papers|citations|  research_interests|
+-----------------+--------------------+----------+---------+--------------------+
|        O. Willum|Res. Center for M...|         1|        0|new product;produ...|
|           D. Wei|Dept. of Electr. ...|         1|        0|lowpass filter;mu...|
|         Wenhu Wu|                null|         1|        0|MAP adaptation;ad...|
|     Zhiyuan Zeng|College of Geogra...|         1|        0|normalized differ...|
|       Erzen Hyko|Department of Com...|         1|        0|information conte...|
|      S. Wanstedt|                null|         1|        0|average quality;s...|
|       Lis Weimar|AmsterdamAmstella...|         1|        0|GSS transition;ac...|
|    Chih-Yung Wen|National ChengKun...|         3|        1|diaphragm deflect...|
|        An-Yeu Wu|                null|         1|        0|direct CSD approa...|
|   

In [None]:
# Print the column names and the types inferred while reading the CSV.
authors_pyspark.printSchema()

In [None]:
# Show only the 'name' column.
authors_pyspark.select('name').show()

In [None]:
# List each column together with its data type.
authors_pyspark.dtypes

In [None]:
# Display the output of describe() (summary statistics per column).
authors_pyspark.describe().show()

In [None]:
### rename a column (the result is only displayed; authors_pyspark is not reassigned)
(
    authors_pyspark
    .withColumnRenamed('name', 'Name')
    .show()
)

In [None]:
### add a derived column: pub_papers_2 = pub_papers + 2
(
    authors_pyspark
    .withColumn('pub_papers_2', authors_pyspark.pub_papers + 2)
    .show()
)

In [None]:
### drop a column (assign the result back to a variable to keep the change)
authors_pyspark.drop('pub_papers').show()

In [None]:
### count the rows left after dropping rows in which every column is null
authors_pyspark.na.drop(how="all").count()

In [None]:
### threshold: keep only rows that have at least 4 non-null values, then count them
### (when `thresh` is given it overrides `how`, so the former how="any" had no effect)
authors_pyspark.na.drop(thresh=4).count()

In [None]:
### subset: only a null in 'affiliations' causes a row to be dropped
(
    authors_pyspark.na
    .drop(how="any", subset=['affiliations'])
    .count()
)

In [None]:
### fill missing values with the placeholder text 'Missing Values'
(
    authors_pyspark.na
    .fill(value='Missing Values')
    .show()
)

In [None]:
# Fill missing values in the 'affiliations' column only
(
    authors_pyspark.na
    .fill(value='Missing Values', subset=['affiliations'])
    .show()
)

In [None]:
# Show authors_pyspark again; the earlier cells displayed their results without assigning them back.
authors_pyspark.show()

In [None]:
### filter with a SQL expression string: rows where citations < 1
(
    authors_pyspark
    .filter('citations<1')
    .show()
)

In [None]:
### filter with a SQL expression, then keep only the name and pub_papers columns
(
    authors_pyspark
    .filter('citations<1')
    .select('name', 'pub_papers')
    .show()
)

In [None]:
### filter on two column conditions combined with AND (&)
(
    authors_pyspark
    .filter(
        (authors_pyspark.citations < 1)
        & (authors_pyspark.pub_papers < 1)
    )
    .show()
)

In [None]:
### filter on two column conditions combined with OR (|)
(
    authors_pyspark
    .filter(
        (authors_pyspark.citations < 1)
        | (authors_pyspark.pub_papers < 1)
    )
    .show()
)

In [None]:
### filter with a negated condition (~): rows where NOT (citations < 1)
(
    authors_pyspark
    .filter(~(authors_pyspark.citations < 1))
    .show()
)

In [None]:
### group by author name and sum pub_papers and citations within each group
(
    authors_pyspark
    .groupBy('name')
    .sum('pub_papers', 'citations')
    .show()
)

In [39]:
# Load the papers CSV: the first line is the header, column types are inferred.
papers_pyspark = spark.read.csv(
    './assets/parsedData/papers.csv',
    header=True,
    inferSchema=True,
)

                                                                                

In [40]:
# Preview the papers table (long cell values are truncated in the printout).
papers_pyspark.show()

+--------------------+--------------------+--------------------+-----+--------------------+---------+--------+
|         paper_title|             authors|        affiliations| year|   publication_venue|citations|abstract|
+--------------------+--------------------+--------------------+-----+--------------------+---------+--------+
|Book Review: Disc...| Marjorie Richardson|                   -| 1998|       Linux Journal|        0|    null|
|MOSFET table look...|                    |                    | 1984| Integration, the...|        0|    null|
|The verification ...|    Virgil D. Gligor| Univ. of Marylan...| 1984| International Jo...|        0|    null|
|Another view of f...| M. Gyssens;J. Pa...| Univ. of Antwerp...| 1984| International Jo...|        0|    null|
|Entity-relationsh...| Sushil Jajodia;P...| University of Mi...| 1984| International Jo...|        0|    null|
|The computer come...|         Rene Moreau|                   -| 1984| The computer com...|        0|    null|
|

In [None]:
# List each column of the papers table together with its data type.
papers_pyspark.dtypes

In [41]:
# Check the data type of the 'year' column on its own.
papers_pyspark.select('year').dtypes

[('year', 'string')]

In [42]:
### convert the year and citations columns from string to integer
from pyspark.sql.types import IntegerType
papers_pyspark = (
    papers_pyspark
    .withColumn("year", papers_pyspark["year"].cast(IntegerType()))
    .withColumn("citations", papers_pyspark["citations"].cast(IntegerType()))
)

In [43]:
### remove leading and trailing spaces
# NOTE(review): the wildcard import is kept because other cells may rely on the
# names it provides; be aware that it also shadows Python builtins such as
# sum/min/max. Only trim() is used in this cell.
from pyspark.sql.functions import *

# Text columns: strip surrounding whitespace in place.
for text_column in ("paper_title", "authors", "affiliations", "publication_venue", "abstract"):
    papers_pyspark = papers_pyspark.withColumn(text_column, trim(papers_pyspark[text_column]))

# Numeric columns: trim() always yields a string column, so trimming alone would
# turn year/citations back into strings and undo the integer cast done earlier.
# Casting the trimmed value to int keeps them as integers.
for int_column in ("year", "citations"):
    papers_pyspark = papers_pyspark.withColumn(int_column, trim(papers_pyspark[int_column]).cast("int"))

In [None]:
# Inspect the 'year' values with truncation of the printed cells disabled.
papers_pyspark.select('year').show(truncate=False)

In [None]:
# Preview the papers table again after the cast and trim cells.
papers_pyspark.show()

In [None]:
# Same preview with truncation disabled, so cell values are printed in full.
papers_pyspark.show(truncate=False)

In [44]:
# List the papers whose paper_title is null.
(
    papers_pyspark
    .filter(papers_pyspark.paper_title.isNull())
    .show()
)



+-----------+--------------------+--------------------+----+--------------------+---------+--------------------+
|paper_title|             authors|        affiliations|year|   publication_venue|citations|            abstract|
+-----------+--------------------+--------------------+----+--------------------+---------+--------------------+
|       null|Jorge J. Gómez-Sa...|                 -;-|2003|FTDCS '03 Proceed...|        0|                null|
|       null|Robert Ghanea-Her...| -;-;-;-;-;-;-;-;-;-|2007|The Computer Journal|        0|The future digita...|
|       null|Andrew M. Waterho...|           -;-;-;-;-|2009|      Bioinformatics|        0|Summary: Jalview ...|
|       null|Tal Vider-Shalit;...|         -;-;-;-;-;-|2009|      Bioinformatics|        0|Motivation: Virus...|
|       null|Tianwei Yu;Youngj...|             -;-;-;-|2009|      Bioinformatics|        0|Motivation: Liqui...|
|       null|Jill L. Wegrzyn;J...|             -;-;-;-|2009|      Bioinformatics|        0|Summa

                                                                                