# Pyspark introduction

In [1]:
import plotly.express as px
import plotly.graph_objects as go

from pyspark.sql import SparkSession, types
from pyspark.sql.functions import lit, when

from v_time import timeit

In [2]:
spark = SparkSession.builder.appName("pyspark_intro").getOrCreate()
sc = spark.sparkContext
print(spark.version)

2.4.4


## 1. Create dataframe

### 1.1. Dataframes
It can read all common tabular data file sources like csv or parquet.

In [3]:
sdf_train = spark.read.format("csv").load("datasets/titanic_train.csv")
sdf_test = spark.read.format("csv").load("datasets/titanic_test.csv")
sdf_train.count(), sdf_test.count()

(892, 419)

And it can use wildcards for reading. For example let's read both `titanic_train.csv` and `titanic_test.csv` as one dataframe

In [4]:
sdf = spark.read.format("csv").load("datasets/titanic_*.csv")
sdf.count()

1311

### 1.2. RDD
It is possible to read raw files and process them.
As an example we are reading a csv and transform it to a datafram.

<div class="alert alert-warning" role="alert">
    It is only an example is better to read it directly as a dataframe
</div>

In [5]:
data = sc.textFile("datasets/iris.csv")
parts = data.map(lambda x: x.split(";"))

iris_data = parts.map(lambda x: types.Row(SL=x[0], SW=x[1], PL=x[2], classification=x[3]))
sdf = spark.createDataFrame(iris_data)

sdf.show(3)

+---+---+---+--------------+
| PL| SL| SW|classification|
+---+---+---+--------------+
| PL| SL| SW|            PW|
|1,4|5,1|3,5|           0,2|
|1,4|4,9|  3|           0,2|
+---+---+---+--------------+
only showing top 3 rows



## 2. Inspect
By default outputting the dataframe only shows the columns and types

In [6]:
sdf = spark.read.option("header", "true").format("csv").load("datasets/titanic_train.csv")
sdf

DataFrame[PassengerId: string, Survived: string, Pclass: string, Name: string, Sex: string, Age: string, SibSp: string, Parch: string, Ticket: string, Fare: string, Cabin: string, Embarked: string]

### 2.1. Show data

In [7]:
# Display first N rows
sdf.show(5)

+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex|Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male| 22|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female| 38|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female| 26|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female| 35|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male| 35|    0|    0|          373450|   8.05| null|       S|
+-----------+--------+------+--------------------+------+---+-----+-----+---------------

### 2.2. General info 

In [10]:
sdf.count()

891

In [11]:
sdf.schema

StructType(List(StructField(PassengerId,StringType,true),StructField(Survived,StringType,true),StructField(Pclass,StringType,true),StructField(Name,StringType,true),StructField(Sex,StringType,true),StructField(Age,StringType,true),StructField(SibSp,StringType,true),StructField(Parch,StringType,true),StructField(Ticket,StringType,true),StructField(Fare,StringType,true),StructField(Cabin,StringType,true),StructField(Embarked,StringType,true)))

In [12]:
sdf.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [13]:
sdf.dtypes

[('PassengerId', 'string'),
 ('Survived', 'string'),
 ('Pclass', 'string'),
 ('Name', 'string'),
 ('Sex', 'string'),
 ('Age', 'string'),
 ('SibSp', 'string'),
 ('Parch', 'string'),
 ('Ticket', 'string'),
 ('Fare', 'string'),
 ('Cabin', 'string'),
 ('Embarked', 'string')]

## 3. Slicing

In [8]:
# Retrive first N rows
sdf.head(2) # or sdf.take(N)

[Row(PassengerId='1', Survived='0', Pclass='3', Name='Braund, Mr. Owen Harris', Sex='male', Age='22', SibSp='1', Parch='0', Ticket='A/5 21171', Fare='7.25', Cabin=None, Embarked='S'),
 Row(PassengerId='2', Survived='1', Pclass='1', Name='Cumings, Mrs. John Bradley (Florence Briggs Thayer)', Sex='female', Age='38', SibSp='1', Parch='0', Ticket='PC 17599', Fare='71.2833', Cabin='C85', Embarked='C')]

<div class="alert alert-info" role="alert">
    <b>sdf.head()</b> and/or <b>sdf.take()</b> both retrives a list of rows, it is not a dataframe
</div>

In [9]:
# Slice the dataframe to the N first rows
sdf.limit(5).show()

+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex|Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male| 22|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female| 38|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female| 26|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female| 35|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male| 35|    0|    0|          373450|   8.05| null|       S|
+-----------+--------+------+--------------------+------+---+-----+-----+---------------

### 3.1. Filter columns

In [14]:
sdf.select("Sex").show(3)

+------+
|   Sex|
+------+
|  male|
|female|
|female|
+------+
only showing top 3 rows



In [15]:
sdf.select("Sex", "Age").show(3)

+------+---+
|   Sex|Age|
+------+---+
|  male| 22|
|female| 38|
|female| 26|
+------+---+
only showing top 3 rows



### 3.2. Filter rows

In [16]:
sdf[sdf["Age"] > 24].count() # or sdf.filter(sdf["Age"] > 24).count(), sdf.where(sdf["Age"] > 24).count()

436

In [18]:
sdf[sdf["Age"].between(20, 30)].count()

247

In [19]:
sdf[sdf["Pclass"].isin([1, 2])].show(5)

+-----------+--------+------+--------------------+------+---+-----+-----+--------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex|Age|SibSp|Parch|  Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+---+-----+-----+--------+-------+-----+--------+
|          2|       1|     1|Cumings, Mrs. Joh...|female| 38|    1|    0|PC 17599|71.2833|  C85|       C|
|          4|       1|     1|Futrelle, Mrs. Ja...|female| 35|    1|    0|  113803|   53.1| C123|       S|
|          7|       0|     1|McCarthy, Mr. Tim...|  male| 54|    0|    0|   17463|51.8625|  E46|       S|
|         10|       1|     2|Nasser, Mrs. Nich...|female| 14|    1|    0|  237736|30.0708| null|       C|
|         12|       1|     1|Bonnell, Miss. El...|female| 58|    0|    0|  113783|  26.55| C103|       S|
+-----------+--------+------+--------------------+------+---+-----+-----+--------+-------+-----+--------+
only showing top 5 rows



In [20]:
sdf[sdf["Name"].like("%Miss.%")].show(5)

+-----------+--------+------+--------------------+------+---+-----+-----+----------------+------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex|Age|SibSp|Parch|          Ticket|  Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+---+-----+-----+----------------+------+-----+--------+
|          3|       1|     3|Heikkinen, Miss. ...|female| 26|    0|    0|STON/O2. 3101282| 7.925| null|       S|
|         11|       1|     3|Sandstrom, Miss. ...|female|  4|    1|    1|         PP 9549|  16.7|   G6|       S|
|         12|       1|     1|Bonnell, Miss. El...|female| 58|    0|    0|          113783| 26.55| C103|       S|
|         15|       0|     3|Vestrom, Miss. Hu...|female| 14|    0|    0|          350406|7.8542| null|       S|
|         23|       1|     3|"McGowan, Miss. A...|female| 15|    0|    0|          330923|8.0292| null|       Q|
+-----------+--------+------+--------------------+------+---+-----+-----+----------------+------

In [21]:
sdf[sdf["Name"].startswith("Hei")].show(5)

+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-----+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex|Age|SibSp|Parch|          Ticket| Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-----+-----+--------+
|          3|       1|     3|Heikkinen, Miss. ...|female| 26|    0|    0|STON/O2. 3101282|7.925| null|       S|
|        817|       0|     3|Heininen, Miss. W...|female| 23|    0|    0|STON/O2. 3101290|7.925| null|       S|
+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-----+-----+--------+



### 3.3. Unique values

In [22]:
sdf.select("Pclass").distinct().show()

+------+
|Pclass|
+------+
|     3|
|     1|
|     2|
+------+



In [25]:
sdf.select("Pclass").exceptAll(sdf.select("Survived")).distinct().show()

+------+
|Pclass|
+------+
|     3|
|     2|
+------+



## 4. Modify data

### 4.1. Add columns

In [27]:
sdf = sdf.withColumn("new col", sdf["Age"])
sdf.show(5)

+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+-------+
|PassengerId|Survived|Pclass|                Name|   Sex|Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|new col|
+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+-------+
|          1|       0|     3|Braund, Mr. Owen ...|  male| 22|    1|    0|       A/5 21171|   7.25| null|       S|     22|
|          2|       1|     1|Cumings, Mrs. Joh...|female| 38|    1|    0|        PC 17599|71.2833|  C85|       C|     38|
|          3|       1|     3|Heikkinen, Miss. ...|female| 26|    0|    0|STON/O2. 3101282|  7.925| null|       S|     26|
|          4|       1|     1|Futrelle, Mrs. Ja...|female| 35|    1|    0|          113803|   53.1| C123|       S|     35|
|          5|       0|     3|Allen, Mr. Willia...|  male| 35|    0|    0|          373450|   8.05| null|       S|     35|
+-----------+--------+--

### Modify certain values

In [39]:
sdf.select(sdf["Sex"].substr(1, 1).alias("Sex code")).show(3)

+--------+
|Sex code|
+--------+
|       m|
|       f|
|       f|
+--------+
only showing top 3 rows



In [30]:
sdf.select("Age", when(sdf["Age"] > 18, 1).otherwise(0).alias("Adult")).show(8)

+----+-----+
| Age|Adult|
+----+-----+
|  22|    1|
|  38|    1|
|  26|    1|
|  35|    1|
|  35|    1|
|null|    0|
|  54|    1|
|   2|    0|
+----+-----+
only showing top 8 rows



In [49]:
# Update null age to 0
sdf.withColumn("Age", when(sdf["Age"] > 0, sdf["Age"]).otherwise(-1)).show(8)

+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+-------+
|PassengerId|Survived|Pclass|                Name|   Sex|Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|new col|
+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+-------+
|          1|       0|     3|Braund, Mr. Owen ...|  male| 22|    1|    0|       A/5 21171|   7.25| null|       S|     22|
|          2|       1|     1|Cumings, Mrs. Joh...|female| 38|    1|    0|        PC 17599|71.2833|  C85|       C|     38|
|          3|       1|     3|Heikkinen, Miss. ...|female| 26|    0|    0|STON/O2. 3101282|  7.925| null|       S|     26|
|          4|       1|     1|Futrelle, Mrs. Ja...|female| 35|    1|    0|          113803|   53.1| C123|       S|     35|
|          5|       0|     3|Allen, Mr. Willia...|  male| 35|    0|    0|          373450|   8.05| null|       S|     35|
|          6|       0|  

In [55]:
sdf.fillna({"Age": 0, "Cabin": "no cabin"}).show(8)

+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+--------+--------+-------+
|PassengerId|Survived|Pclass|                Name|   Sex|Age|SibSp|Parch|          Ticket|   Fare|   Cabin|Embarked|new col|
+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+--------+--------+-------+
|          1|       0|     3|Braund, Mr. Owen ...|  male| 22|    1|    0|       A/5 21171|   7.25|no cabin|       S|     22|
|          2|       1|     1|Cumings, Mrs. Joh...|female| 38|    1|    0|        PC 17599|71.2833|     C85|       C|     38|
|          3|       1|     3|Heikkinen, Miss. ...|female| 26|    0|    0|STON/O2. 3101282|  7.925|no cabin|       S|     26|
|          4|       1|     1|Futrelle, Mrs. Ja...|female| 35|    1|    0|          113803|   53.1|    C123|       S|     35|
|          5|       0|     3|Allen, Mr. Willia...|  male| 35|    0|    0|          373450|   8.05|no cabin|       S|     35|


In [61]:
sdf.replace("male", "m", "Sex").show(5)

+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+-------+
|PassengerId|Survived|Pclass|                Name|   Sex|Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|new col|
+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+-------+
|          1|       0|     3|Braund, Mr. Owen ...|     m| 22|    1|    0|       A/5 21171|   7.25| null|       S|     22|
|          2|       1|     1|Cumings, Mrs. Joh...|female| 38|    1|    0|        PC 17599|71.2833|  C85|       C|     38|
|          3|       1|     3|Heikkinen, Miss. ...|female| 26|    0|    0|STON/O2. 3101282|  7.925| null|       S|     26|
|          4|       1|     1|Futrelle, Mrs. Ja...|female| 35|    1|    0|          113803|   53.1| C123|       S|     35|
|          5|       0|     3|Allen, Mr. Willia...|     m| 35|    0|    0|          373450|   8.05| null|       S|     35|
+-----------+--------+--

In [63]:
sdf.sort("Age").show()

+-----------+--------+------+--------------------+------+----+-----+-----+---------------+--------+-----+--------+-------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|         Ticket|    Fare|Cabin|Embarked|new col|
+-----------+--------+------+--------------------+------+----+-----+-----+---------------+--------+-----+--------+-------+
|         88|       0|     3|Slocovski, Mr. Se...|  male|null|    0|    0|SOTON/OQ 392086|    8.05| null|       S|   null|
|        199|       1|     3|"Madigan, Miss. M...|female|null|    0|    0|         370370|    7.75| null|       Q|   null|
|         96|       0|     3|Shorney, Mr. Char...|  male|null|    0|    0|         374910|    8.05| null|       S|   null|
|         83|       1|     3|McDermott, Miss. ...|female|null|    0|    0|         330932|  7.7875| null|       Q|   null|
|        102|       0|     3|"Petroff, Mr. Pas...|  male|null|    0|    0|         349215|  7.8958| null|       S|   null|
|         46|   