Dataset is available in https://drive.google.com/drive/folders/1yRsR1BsXJCcmmC5IOfexp2STFDL33gYL

In [1]:
from pyspark import SparkContext, SparkConf

In [2]:
conf = SparkConf().setAppName("DF_demo").setMaster("local")
sc = SparkContext(conf=conf)

In [3]:
data = sc.textFile("books.csv")

# Issues in RDD processing

**Formatting & schema won't be maintained in RDD**

In [4]:
type(data)

pyspark.rdd.RDD

In [5]:
data.top(1)

['id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,title,language_code,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url']

In [7]:
data.top(2)

['id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,title,language_code,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url',
 '9999,8565083,8565083,13433613,7,61711527,9.78006171153e+12,Peggy Orenstein,2011.0,Cinderella Ate My Daughter: Dispatches from the Frontlines of the New Girlie-Girl Culture,Cinderella Ate My Daughter: Dispatches from the Frontlines of the New Girlie-Girl Culture,eng,3.65,11279,11994,1988,275,1002,3765,4577,2375,https://images.gr-assets.com/books/1279214118m/8565083.jpg,https://images.gr-assets.com/books/1279214118s/8565083.jpg']

In [10]:
for line in data.top(5):
    print(line)

id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,title,language_code,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
9999,8565083,8565083,13433613,7,61711527,9.78006171153e+12,Peggy Orenstein,2011.0,Cinderella Ate My Daughter: Dispatches from the Frontlines of the New Girlie-Girl Culture,Cinderella Ate My Daughter: Dispatches from the Frontlines of the New Girlie-Girl Culture,eng,3.65,11279,11994,1988,275,1002,3765,4577,2375,https://images.gr-assets.com/books/1279214118m/8565083.jpg,https://images.gr-assets.com/books/1279214118s/8565083.jpg
9998,77431,77431,2393986,60,039330762X,9.78039330763e+12,Patrick O'Brian,1977.0,The Mauritius Command,The Mauritius Command,eng,4.35,9421,10733,374,11,111,1191,4240,5180,https://images.gr-assets.com/books/1455373531m/77431.jpg,https://images.gr-assets.com/books/1455373531s/77431.jpg
9997,208324,2

# DataFrame

-  Spark RDD APIs – An RDD stands for Resilient Distributed Datasets. It is Read-only partition collection of records. RDD is the fundamental data structure of Spark.

- Spark Dataframe APIs (Spark SQL) – Unlike an RDD, data organized into named columns. For example a table in a relational database. It is an immutable distributed collection of data. 

- DataFrame in Spark allows developers to impose a structure onto a distributed collection of data. 

- It is conceptually equal to a table in a relational database.

Prior Spark 2.0, Spark Context was the entry point of any spark application and used to access all spark features and needed a sparkConf which had all the cluster configs and parameters to create a Spark Context object. 

Spark session is a unified entry point of a spark application from Spark 2.0. unifies all the different contexts in spark and avoids the developer to worry about creating difference contexts.

![image.png](attachment:3d6e0630-ff86-4187-944b-08f0f32bf5ab.png)

## load data

In [11]:
import pyspark as ps 
spark = ps.sql.SparkSession.builder.master("local").appName("FirstApp").getOrCreate()

In [12]:
spark = ps.sql.SparkSession.builder.getOrCreate()

In [13]:
type(spark)

pyspark.sql.session.SparkSession

## print schema

In [15]:
# Importing books dataset as Spark dataframe
# header=True, Wanted to keep heard with dataframe as most of logic are based on DataFrame Header
# inferSchema=True To auto infer dataset as per the data given as ALS needs only number

books_df = spark.read.csv('books.csv', header=True)
books_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- book_id: string (nullable = true)
 |-- best_book_id: string (nullable = true)
 |-- work_id: string (nullable = true)
 |-- books_count: string (nullable = true)
 |-- isbn: string (nullable = true)
 |-- isbn13: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- original_publication_year: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- title: string (nullable = true)
 |-- language_code: string (nullable = true)
 |-- average_rating: string (nullable = true)
 |-- ratings_count: string (nullable = true)
 |-- work_ratings_count: string (nullable = true)
 |-- work_text_reviews_count: string (nullable = true)
 |-- ratings_1: string (nullable = true)
 |-- ratings_2: string (nullable = true)
 |-- ratings_3: string (nullable = true)
 |-- ratings_4: string (nullable = true)
 |-- ratings_5: string (nullable = true)
 |-- image_url: string (nullable = true)
 |-- small_image_url: string (nullable = true)



In [22]:
type(books_df.toPandas())

pandas.core.frame.DataFrame

In [16]:
type(books_df)

pyspark.sql.dataframe.DataFrame

In [17]:
len(books_df.columns)

23

In [18]:
rating_df = spark.read.csv("ratings.csv", header=True)
type(rating_df)

pyspark.sql.dataframe.DataFrame

## count

In [19]:
rating_df.count()

981756

In [24]:
rating_df.printSchema()

root
 |-- book_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- rating: string (nullable = true)



## head

In [23]:
rating_df.head(3)

[Row(book_id='1', user_id='314', rating='5'),
 Row(book_id='1', user_id='439', rating='3'),
 Row(book_id='1', user_id='588', rating='5')]

## show

In [25]:
rating_df.show(5)

+-------+-------+------+
|book_id|user_id|rating|
+-------+-------+------+
|      1|    314|     5|
|      1|    439|     3|
|      1|    588|     5|
|      1|   1169|     4|
|      1|   1185|     4|
+-------+-------+------+
only showing top 5 rows



## Select columns

In [27]:
rating_df.select("book_id","rating").show(5)

+-------+------+
|book_id|rating|
+-------+------+
|      1|     5|
|      1|     3|
|      1|     5|
|      1|     4|
|      1|     4|
+-------+------+
only showing top 5 rows



### select * from loop

In [116]:
rating_df.columns

['book_id', 'user_id', 'rating']

In [118]:
rating_df.select(*(col(c) for c in rating_df.columns)).show()

+-------+-------+------+
|book_id|user_id|rating|
+-------+-------+------+
|      1|    314|     5|
|      1|    439|     3|
|      1|    588|     5|
|      1|   1169|     4|
|      1|   1185|     4|
|      1|   2077|     4|
|      1|   2487|     4|
|      1|   2900|     5|
|      1|   3662|     4|
|      1|   3922|     5|
|      1|   5379|     5|
|      1|   5461|     3|
|      1|   5885|     5|
|      1|   6630|     5|
|      1|   7563|     3|
|      1|   9246|     1|
|      1|  10140|     4|
|      1|  10146|     5|
|      1|  10246|     4|
|      1|  10335|     4|
+-------+-------+------+
only showing top 20 rows



## Filter

In [28]:
rating_df.filter('rating <= 3').show(5)

+-------+-------+------+
|book_id|user_id|rating|
+-------+-------+------+
|      1|    439|     3|
|      1|   5461|     3|
|      1|   7563|     3|
|      1|   9246|     1|
|      1|  20076|     3|
+-------+-------+------+
only showing top 5 rows



## Column Operations

### Change Column type

When we load data from csv, it will be in "String" format

Different way of setting column type - https://sparkbyexamples.com/pyspark/pyspark-cast-column-type/

In [37]:
rating_df.printSchema()

root
 |-- book_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- rating: string (nullable = true)



In [42]:
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType

In [43]:
rating_df = rating_df.withColumn('book_id', col('book_id').cast(IntegerType())).withColumn("rating", col("rating").cast(IntegerType()))

In [44]:
rating_df.printSchema()

root
 |-- book_id: integer (nullable = true)
 |-- user_id: string (nullable = true)
 |-- rating: integer (nullable = true)



In [120]:
from pyspark.sql.functions import col

for column in rating_df.columns:
    print(col(column))

Column<'book_id'>
Column<'user_id'>
Column<'rating'>


In [170]:
rating_df_infer = spark.read.option("inferSchema", "true").csv("ratings.csv", header=True)
rating_df_infer.printSchema()

root
 |-- book_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- rating: integer (nullable = true)



### Change column value

Column is immutable. It won't override original df unless we assign it back to df

In [108]:
rating_df.withColumn("rating", rating_df.rating*10).show(5)

+-------+-------+------+
|book_id|user_id|rating|
+-------+-------+------+
|      1|    314|    50|
|      1|    439|    30|
|      1|    588|    50|
|      1|   1169|    40|
|      1|   1185|    40|
+-------+-------+------+
only showing top 5 rows



In [109]:
rating_df.show(5)

+-------+-------+------+
|book_id|user_id|rating|
+-------+-------+------+
|      1|    314|     5|
|      1|    439|     3|
|      1|    588|     5|
|      1|   1169|     4|
|      1|   1185|     4|
+-------+-------+------+
only showing top 5 rows



### Add Column

In [110]:
new_dataset = rating_df.withColumn("rating_ten", rating_df.rating*10)
new_dataset.show(5)

+-------+-------+------+----------+
|book_id|user_id|rating|rating_ten|
+-------+-------+------+----------+
|      1|    314|     5|        50|
|      1|    439|     3|        30|
|      1|    588|     5|        50|
|      1|   1169|     4|        40|
|      1|   1185|     4|        40|
+-------+-------+------+----------+
only showing top 5 rows



### Drop column

Column is immutable. It won't override original df unless we assign it back to df

In [112]:
rating_df.drop('rating').show(5)

+-------+-------+
|book_id|user_id|
+-------+-------+
|      1|    314|
|      1|    439|
|      1|    588|
|      1|   1169|
|      1|   1185|
+-------+-------+
only showing top 5 rows



In [113]:
rating_df.show(5)

+-------+-------+------+
|book_id|user_id|rating|
+-------+-------+------+
|      1|    314|     5|
|      1|    439|     3|
|      1|    588|     5|
|      1|   1169|     4|
|      1|   1185|     4|
+-------+-------+------+
only showing top 5 rows



## Distinct Value

In [47]:
rating_df.select('rating').drop_duplicates().show()

+------+
|rating|
+------+
|     1|
|     3|
|     5|
|     4|
|     2|
+------+



In [48]:
rating_df.select('book_id').drop_duplicates().show()

+-------+
|book_id|
+-------+
|    148|
|    463|
|    471|
|    496|
|    833|
|   1088|
|   1238|
|   1342|
|   1580|
|   1591|
|   1645|
|   1829|
|   1959|
|   2122|
|   2142|
|   2366|
|   2659|
|   2866|
|   3175|
|   3749|
+-------+
only showing top 20 rows



## GroupBy

In [46]:
rating_df.groupBy("book_id").mean("rating").show()

+-------+-----------+
|book_id|avg(rating)|
+-------+-----------+
|    148|       3.57|
|    463|       3.99|
|    471|       3.84|
|    496|       3.79|
|    833|       3.44|
|   1088|       3.82|
|   1238|       3.81|
|   1342|       4.43|
|   1580|       3.64|
|   1591|       4.01|
|   1645|       3.68|
|   1829|        3.7|
|   1959|       3.87|
|   2122|       3.46|
|   2142|       3.74|
|   2366|       3.92|
|   2659|        3.6|
|   2866|       3.49|
|   3175|       3.78|
|   3749|       4.25|
+-------+-----------+
only showing top 20 rows



In [55]:
top_ratings_by_book = rating_df.groupBy("book_id", "rating").count().sort("count",ascending=False).show()

+-------+------+-----+
|book_id|rating|count|
+-------+------+-----+
|   5207|     5|   84|
|   3275|     5|   83|
|   6361|     5|   82|
|   6920|     5|   82|
|   5580|     5|   81|
|   6590|     5|   80|
|   7947|     5|   80|
|   9566|     5|   79|
|   3628|     5|   79|
|   1308|     5|   79|
|   8946|     5|   79|
|   3753|     5|   79|
|   8978|     5|   79|
|   1788|     5|   79|
|   4483|     5|   78|
|   4868|     5|   78|
|   8109|     5|   77|
|   7254|     5|   76|
|   4778|     5|   75|
|   1355|     5|   75|
+-------+------+-----+
only showing top 20 rows



In [73]:
print(type(rating_df.groupby("rating").count()))
rating_df.groupby("rating").count().show()

<class 'pyspark.sql.dataframe.DataFrame'>
+------+------+
|rating| count|
+------+------+
|     1| 19575|
|     3|248623|
|     5|292961|
|     4|357366|
|     2| 63231|
+------+------+



### toPandas

In [70]:
print(type(rating_df.groupby("rating").count().toPandas()))
rating_df.groupby("rating").count().toPandas()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,rating,count
0,1,19575
1,3,248623
2,5,292961
3,4,357366
4,2,63231


## Unique count

In [49]:
unique_user_count = rating_df.select("user_id").distinct().count()
unique_user_count

53424

## describe df

In [57]:
rating_df.describe().show()

+-------+-----------------+------------------+------------------+
|summary|          book_id|           user_id|            rating|
+-------+-----------------+------------------+------------------+
|  count|           981756|            981756|            981756|
|   mean|4943.275635697668|25616.759933221696|3.8565335989797873|
| stddev|2873.207414896197| 15228.33882588251|0.9839408559620116|
|    min|                1|                 1|                 1|
|    max|            10000|              9999|                 5|
+-------+-----------------+------------------+------------------+



In [58]:
rating_df.describe("rating").show()

+-------+------------------+
|summary|            rating|
+-------+------------------+
|  count|            981756|
|   mean|3.8565335989797873|
| stddev|0.9839408559620116|
|    min|                 1|
|    max|                 5|
+-------+------------------+



## Drop Duplicates

In [59]:
rating_df.count()

981756

In [60]:
aaa = rating_df.dropDuplicates()
aaa.count()

980112

## Dropna

In [62]:
rating_without_null = rating_df.dropna().count()
rating_without_null

981756

In [63]:
rating_df.dropna('any').count()

981756

In [64]:
rating_df.dropna('all').count()

981756

## Aggregation

In [65]:
rating_df.agg({'rating': 'max'}).show()

+-----------+
|max(rating)|
+-----------+
|          5|
+-----------+



## Join

In [74]:
rating_df.join(books_df, books_df.book_id == rating_df.book_id).select("user_id", "title").show(5)

+-------+--------------------+
|user_id|               title|
+-------+--------------------+
|    314|Harry Potter and ...|
|    439|Harry Potter and ...|
|    588|Harry Potter and ...|
|   1169|Harry Potter and ...|
|   1185|Harry Potter and ...|
+-------+--------------------+
only showing top 5 rows



In [78]:
books_df = books_df.withColumn("book_id", col('book_id').cast(IntegerType()))

In [84]:
rating_df.join(books_df, books_df.book_id == rating_df.book_id).select("user_id", "title").show(5)

+-------+--------------------+
|user_id|               title|
+-------+--------------------+
|    314|Harry Potter and ...|
|    439|Harry Potter and ...|
|    588|Harry Potter and ...|
|   1169|Harry Potter and ...|
|   1185|Harry Potter and ...|
+-------+--------------------+
only showing top 5 rows



In [87]:
rating_df.join(books_df, books_df.book_id == rating_df.book_id).printSchema()

root
 |-- book_id: integer (nullable = true)
 |-- user_id: string (nullable = true)
 |-- rating: integer (nullable = true)
 |-- id: string (nullable = true)
 |-- book_id: integer (nullable = true)
 |-- best_book_id: string (nullable = true)
 |-- work_id: string (nullable = true)
 |-- books_count: string (nullable = true)
 |-- isbn: string (nullable = true)
 |-- isbn13: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- original_publication_year: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- title: string (nullable = true)
 |-- language_code: string (nullable = true)
 |-- average_rating: string (nullable = true)
 |-- ratings_count: string (nullable = true)
 |-- work_ratings_count: string (nullable = true)
 |-- work_text_reviews_count: string (nullable = true)
 |-- ratings_1: string (nullable = true)
 |-- ratings_2: string (nullable = true)
 |-- ratings_3: string (nullable = true)
 |-- ratings_4: string (nullable = true)
 |-- ratings_5: str

In [102]:
rating_df.join(books_df, books_df.book_id == rating_df.book_id).groupBy('rating').mean("rating").sort('rating', ascending=False).show(5)

+------+-----------+
|rating|avg(rating)|
+------+-----------+
|     5|        5.0|
|     4|        4.0|
|     3|        3.0|
|     2|        2.0|
|     1|        1.0|
+------+-----------+



## OrderBy

In [103]:
rating_df.orderBy("rating").show(5)

+-------+-------+------+
|book_id|user_id|rating|
+-------+-------+------+
|      2|  13794|     1|
|      3|   9246|     1|
|      2|  17643|     1|
|      1|  51480|     1|
|      2|  48687|     1|
+-------+-------+------+
only showing top 5 rows



In [104]:
rating_df.orderBy(rating_df.rating.desc()).show(5)

+-------+-------+------+
|book_id|user_id|rating|
+-------+-------+------+
|      1|    314|     5|
|      1|    588|     5|
|      1|   5885|     5|
|      1|   2900|     5|
|      1|   3922|     5|
+-------+-------+------+
only showing top 5 rows



In [107]:
rating_df.orderBy('rating','book_id').show(50)

+-------+-------+------+
|book_id|user_id|rating|
+-------+-------+------+
|      1|  51480|     1|
|      1|   9246|     1|
|      2|  17643|     1|
|      2|  48687|     1|
|      2|   6063|     1|
|      2|  13794|     1|
|      3|  48687|     1|
|      3|  33065|     1|
|      3|  49298|     1|
|      3|  10509|     1|
|      3|   4536|     1|
|      3|   9246|     1|
|      3|  10610|     1|
|      3|  37284|     1|
|      3|  52036|     1|
|      3|  10944|     1|
|      3|  10751|     1|
|      3|  11854|     1|
|      3|  13794|     1|
|      3|    588|     1|
|      3|  15604|     1|
|      3|  16377|     1|
|      3|  16569|     1|
|      3|  21733|     1|
|      3|  23576|     1|
|      3|  10246|     1|
|      3|  25214|     1|
|      3|  29703|     1|
|      3|  32305|     1|
|      4|   4606|     1|
|      5|   3022|     1|
|      6|  18179|     1|
|      6|  18031|     1|
|      7|  12455|     1|
|      7|  13282|     1|
|      7|  23576|     1|
|      7|  51480|     1|


## When, Count, alias

In [162]:
from pyspark.sql.functions import when, col, count, isnan

In [124]:
books_df.select([count(when(col(c).isNull(), c)).alias(c) for c in books_df.columns]).show()

+---+-------+------------+-------+-----------+----+------+-------+-------------------------+--------------+-----+-------------+--------------+-------------+------------------+-----------------------+---------+---------+---------+---------+---------+---------+---------------+
| id|book_id|best_book_id|work_id|books_count|isbn|isbn13|authors|original_publication_year|original_title|title|language_code|average_rating|ratings_count|work_ratings_count|work_text_reviews_count|ratings_1|ratings_2|ratings_3|ratings_4|ratings_5|image_url|small_image_url|
+---+-------+------------+-------+-----------+----+------+-------+-------------------------+--------------+-----+-------------+--------------+-------------+------------------+-----------------------+---------+---------+---------+---------+---------+---------+---------------+
|  0|      0|           0|      0|          0| 700|   585|      0|                       21|           585|    0|         1084|             0|            0|                

## Filter

In [127]:
rating_df.filter(rating_df.rating > 2).show()

+-------+-------+------+
|book_id|user_id|rating|
+-------+-------+------+
|      1|    314|     5|
|      1|    439|     3|
|      1|    588|     5|
|      1|   1169|     4|
|      1|   1185|     4|
|      1|   2077|     4|
|      1|   2487|     4|
|      1|   2900|     5|
|      1|   3662|     4|
|      1|   3922|     5|
|      1|   5379|     5|
|      1|   5461|     3|
|      1|   5885|     5|
|      1|   6630|     5|
|      1|   7563|     3|
|      1|  10140|     4|
|      1|  10146|     5|
|      1|  10246|     4|
|      1|  10335|     4|
|      1|  10610|     5|
+-------+-------+------+
only showing top 20 rows



## isnull

In [144]:
books_df.select([count(when(col(c).isNull(), c)).alias(c) for c in books_df.columns]).show()

TypeError: 'str' object is not callable

## infer schema

In [146]:
rating_df = spark.read.csv("ratings.csv", header=True)
rating_df.printSchema()

root
 |-- book_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- rating: string (nullable = true)



In [147]:
rating_df_infer = spark.read.option("inferSchema", "true").csv("ratings.csv", header=True)
rating_df_infer.printSchema()

root
 |-- book_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- rating: integer (nullable = true)



## Select columns by column type

In [164]:
rating_new = rating_df.withColumn("rating",col("rating").cast(IntegerType()))
rating_new.printSchema()

root
 |-- book_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- rating: integer (nullable = true)



In [166]:
[c for c in rating_new.dtypes if c[1] == 'string']

[('book_id', 'string'), ('user_id', 'string')]

In [168]:
[c for c in rating_new.dtypes if c[1] != 'string']

[('rating', 'int')]