# Pyspark introduction

In [1]:
from pyspark.sql import SparkSession, types
from pyspark.sql.functions import udf, when
from pyspark.sql.types import IntegerType

In [2]:
# Build (or reuse) the Spark session for this notebook and keep a handle to its context.
spark = (
    SparkSession.builder
    .appName("pyspark_intro")
    .getOrCreate()
)
sc = spark.sparkContext
print(spark.version)

2.4.4


## 1. Create dataframe

### 1.1. Read file
It can read all common tabular data file sources like csv or parquet.

In [3]:
# Load the train and test CSV files as two separate dataframes and count their rows.
# NOTE(review): no "header" option is set, so the header line of each file is read as a data
# row. This is why the train count shown here is 892, while the same file read with
# header=true later in this notebook has 891 rows.
sdf_train = spark.read.format("csv").load("datasets/titanic_train.csv")
sdf_test = spark.read.format("csv").load("datasets/titanic_test.csv")
sdf_train.count(), sdf_test.count()

(892, 419)

Wildcards can also be used when reading. For example, let's read both `titanic_train.csv` and `titanic_test.csv` as one dataframe.

In [4]:
# The glob pattern matches both Titanic CSV files; they are loaded into a single dataframe.
# NOTE(review): no "header" option is set, so each file's header line is presumably counted
# as a row (1311 = 892 + 419).
sdf = spark.read.format("csv").load("datasets/titanic_*.csv")
sdf.count()

1311

### 1.2. RDD
It is possible to read raw files and process them.
As an example, we read a CSV file as raw text and transform it into a dataframe.

<div class="alert alert-warning" role="alert">
    This is only an example; it is better to read the file directly as a dataframe.
</div>

In [5]:
# Read the file as raw text lines, then split every line on the ";" delimiter.
data = sc.textFile("datasets/iris.csv")
parts = data.map(lambda x: x.split(";"))

# Turn each list of fields into a Row so Spark can build a dataframe from the RDD.
# NOTE(review): the header line is not skipped, so it shows up as the first data row.
# NOTE(review): the file appears to have five fields (SL, SW, PL, PW, Classification), so x[3]
# is PW rather than the class label, which would be x[4] — confirm and fix the mapping.
# NOTE(review): the resulting columns come back in alphabetical order (PL, SL, SW, ...),
# not in the keyword order used here.
iris_data = parts.map(lambda x: types.Row(SL=x[0], SW=x[1], PL=x[2], classification=x[3]))
sdf = spark.createDataFrame(iris_data)

sdf.show(3)

+---+---+---+--------------+
| PL| SL| SW|classification|
+---+---+---+--------------+
| PL| SL| SW|            PW|
|1,4|5,1|3,5|           0,2|
|1,4|4,9|  3|           0,2|
+---+---+---+--------------+
only showing top 3 rows



## 2. Inspect
By default outputting the dataframe only shows the columns and types

In [6]:
# Load the training set, treating the first line of the file as the column names.
sdf = spark.read.csv("datasets/titanic_train.csv", header=True)
sdf

DataFrame[PassengerId: string, Survived: string, Pclass: string, Name: string, Sex: string, Age: string, SibSp: string, Parch: string, Ticket: string, Fare: string, Cabin: string, Embarked: string]

### 2.1. Show data

In [7]:
# Display first N rows
sdf.show(5)

+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex|Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male| 22|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female| 38|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female| 26|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female| 35|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male| 35|    0|    0|          373450|   8.05| null|       S|
+-----------+--------+------+--------------------+------+---+-----+-----+---------------

### 2.2. General info 

In [8]:
sdf.count()

891

In [9]:
sdf.schema

StructType(List(StructField(PassengerId,StringType,true),StructField(Survived,StringType,true),StructField(Pclass,StringType,true),StructField(Name,StringType,true),StructField(Sex,StringType,true),StructField(Age,StringType,true),StructField(SibSp,StringType,true),StructField(Parch,StringType,true),StructField(Ticket,StringType,true),StructField(Fare,StringType,true),StructField(Cabin,StringType,true),StructField(Embarked,StringType,true)))

In [10]:
sdf.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [11]:
sdf.dtypes

[('PassengerId', 'string'),
 ('Survived', 'string'),
 ('Pclass', 'string'),
 ('Name', 'string'),
 ('Sex', 'string'),
 ('Age', 'string'),
 ('SibSp', 'string'),
 ('Parch', 'string'),
 ('Ticket', 'string'),
 ('Fare', 'string'),
 ('Cabin', 'string'),
 ('Embarked', 'string')]

In [12]:
# Read iris.csv (";"-delimited, first line used as header) and print summary statistics.
# NOTE(review): the values use a decimal comma (e.g. "4,3") and no schema is given, so the
# columns are presumably strings: mean/stddev would then only cover the values that parse as
# plain numbers, and min/max would be string comparisons. Treat the figures with care.
(
    spark
    .read
    .option("header", "true")
    .option("delimiter", ";")
    .format("csv")
    .load("datasets/iris.csv")
).describe().show()

+-------+------------------+-------------------+------------------+------------------+--------------+
|summary|                SL|                 SW|                PL|                PW|Classification|
+-------+------------------+-------------------+------------------+------------------+--------------+
|  count|               150|                150|               150|               150|           150|
|   mean| 5.470588235294118|                3.0|4.3076923076923075|1.4615384615384615|          null|
| stddev|0.6242642728467979|0.27216552697590873|1.3155870289605438|0.5188745216627707|          null|
|    min|               4,3|                  2|                 1|               0,1|   Iris-setosa|
|    max|               7,9|                4,4|               6,9|               2,5|Iris-virginica|
+-------+------------------+-------------------+------------------+------------------+--------------+



## 3. Slicing

### 3.1. First rows

In [13]:
# Retrieve the first N rows (here N = 2)
sdf.head(2) # or sdf.take(N)

[Row(PassengerId='1', Survived='0', Pclass='3', Name='Braund, Mr. Owen Harris', Sex='male', Age='22', SibSp='1', Parch='0', Ticket='A/5 21171', Fare='7.25', Cabin=None, Embarked='S'),
 Row(PassengerId='2', Survived='1', Pclass='1', Name='Cumings, Mrs. John Bradley (Florence Briggs Thayer)', Sex='female', Age='38', SibSp='1', Parch='0', Ticket='PC 17599', Fare='71.2833', Cabin='C85', Embarked='C')]

<div class="alert alert-info" role="alert">
    <b>sdf.head()</b> and <b>sdf.take()</b> both retrieve a list of rows, not a dataframe.
</div>

In [14]:
# Slice the dataframe to the N first rows
sdf.limit(5).show()

+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex|Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male| 22|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female| 38|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female| 26|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female| 35|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male| 35|    0|    0|          373450|   8.05| null|       S|
+-----------+--------+------+--------------------+------+---+-----+-----+---------------

### 3.2. Filter columns

In [15]:
sdf.select("Sex").show(3)

+------+
|   Sex|
+------+
|  male|
|female|
|female|
+------+
only showing top 3 rows



In [16]:
sdf.select("Sex", "Age").show(3)

+------+---+
|   Sex|Age|
+------+---+
|  male| 22|
|female| 38|
|female| 26|
+------+---+
only showing top 3 rows



### 3.3. Filter rows

In [17]:
# Count the passengers whose age is above 24.
# NOTE(review): "Age" is still a string column at this point (see the dtypes above), so this
# numeric comparison presumably relies on Spark's implicit cast — confirm, or cast first.
sdf[sdf["Age"] > 24].count() # or sdf.filter(sdf["Age"] > 24).count(), sdf.where(sdf["Age"] > 24).count()

436

In [18]:
sdf[sdf["Age"].between(20, 30)].count()

247

In [19]:
sdf[sdf["Pclass"].isin([1, 2])].show(5)

+-----------+--------+------+--------------------+------+---+-----+-----+--------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex|Age|SibSp|Parch|  Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+---+-----+-----+--------+-------+-----+--------+
|          2|       1|     1|Cumings, Mrs. Joh...|female| 38|    1|    0|PC 17599|71.2833|  C85|       C|
|          4|       1|     1|Futrelle, Mrs. Ja...|female| 35|    1|    0|  113803|   53.1| C123|       S|
|          7|       0|     1|McCarthy, Mr. Tim...|  male| 54|    0|    0|   17463|51.8625|  E46|       S|
|         10|       1|     2|Nasser, Mrs. Nich...|female| 14|    1|    0|  237736|30.0708| null|       C|
|         12|       1|     1|Bonnell, Miss. El...|female| 58|    0|    0|  113783|  26.55| C103|       S|
+-----------+--------+------+--------------------+------+---+-----+-----+--------+-------+-----+--------+
only showing top 5 rows



In [20]:
sdf[sdf["Name"].like("%Miss.%")].show(5)

+-----------+--------+------+--------------------+------+---+-----+-----+----------------+------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex|Age|SibSp|Parch|          Ticket|  Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+---+-----+-----+----------------+------+-----+--------+
|          3|       1|     3|Heikkinen, Miss. ...|female| 26|    0|    0|STON/O2. 3101282| 7.925| null|       S|
|         11|       1|     3|Sandstrom, Miss. ...|female|  4|    1|    1|         PP 9549|  16.7|   G6|       S|
|         12|       1|     1|Bonnell, Miss. El...|female| 58|    0|    0|          113783| 26.55| C103|       S|
|         15|       0|     3|Vestrom, Miss. Hu...|female| 14|    0|    0|          350406|7.8542| null|       S|
|         23|       1|     3|"McGowan, Miss. A...|female| 15|    0|    0|          330923|8.0292| null|       Q|
+-----------+--------+------+--------------------+------+---+-----+-----+----------------+------

In [21]:
sdf[sdf["Name"].startswith("Hei")].show(5)

+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-----+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex|Age|SibSp|Parch|          Ticket| Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-----+-----+--------+
|          3|       1|     3|Heikkinen, Miss. ...|female| 26|    0|    0|STON/O2. 3101282|7.925| null|       S|
|        817|       0|     3|Heininen, Miss. W...|female| 23|    0|    0|STON/O2. 3101290|7.925| null|       S|
+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-----+-----+--------+



### 3.4. Unique values

In [22]:
sdf.select("Pclass").distinct().show()

+------+
|Pclass|
+------+
|     3|
|     1|
|     2|
+------+



It is also possible to subtract values based on another list.

In [23]:
# Remove the "Survived" values from the "Pclass" values, then keep each remaining value once.
# NOTE(review): exceptAll preserves duplicates, so whether a value such as 1 disappears
# presumably depends on how often it occurs on each side. If a plain set difference is
# intended, subtract() may be the better fit — confirm.
sdf.select("Pclass").exceptAll(sdf.select("Survived")).distinct().show()

+------+
|Pclass|
+------+
|     3|
|     2|
+------+



## 4. Modify the dataframe
To modify a dataframe you need to update the original dataframe.
For example you would do `sdf = do_something(sdf)`.

<div class="alert alert-info" role="alert">
    It is possible to only call <b>do_something(sdf)</b>, but it won't update the dataframe.
</div>

### 4.1. Add columns

In [24]:
sdf = sdf.withColumn("new_col", sdf["Age"])
sdf.show(5)

+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+-------+
|PassengerId|Survived|Pclass|                Name|   Sex|Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|new_col|
+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+-------+
|          1|       0|     3|Braund, Mr. Owen ...|  male| 22|    1|    0|       A/5 21171|   7.25| null|       S|     22|
|          2|       1|     1|Cumings, Mrs. Joh...|female| 38|    1|    0|        PC 17599|71.2833|  C85|       C|     38|
|          3|       1|     3|Heikkinen, Miss. ...|female| 26|    0|    0|STON/O2. 3101282|  7.925| null|       S|     26|
|          4|       1|     1|Futrelle, Mrs. Ja...|female| 35|    1|    0|          113803|   53.1| C123|       S|     35|
|          5|       0|     3|Allen, Mr. Willia...|  male| 35|    0|    0|          373450|   8.05| null|       S|     35|
+-----------+--------+--

### 4.2. Change dtypes

In [25]:
sdf = sdf.withColumn("Age", sdf["Age"].cast(IntegerType()))
sdf.dtypes

[('PassengerId', 'string'),
 ('Survived', 'string'),
 ('Pclass', 'string'),
 ('Name', 'string'),
 ('Sex', 'string'),
 ('Age', 'int'),
 ('SibSp', 'string'),
 ('Parch', 'string'),
 ('Ticket', 'string'),
 ('Fare', 'string'),
 ('Cabin', 'string'),
 ('Embarked', 'string'),
 ('new_col', 'string')]

### 4.3. Modify certain values

In [26]:
# Derive a one-letter code from the first character of "Sex" (substr positions are 1-based).
# withColumn already names the new column "sex_code", so no extra alias is needed.
sdf.withColumn("sex_code", sdf["Sex"].substr(1, 1)).show(3)

+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+-------+--------+
|PassengerId|Survived|Pclass|                Name|   Sex|Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|new_col|sex_code|
+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+-------+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male| 22|    1|    0|       A/5 21171|   7.25| null|       S|     22|       m|
|          2|       1|     1|Cumings, Mrs. Joh...|female| 38|    1|    0|        PC 17599|71.2833|  C85|       C|     38|       f|
|          3|       1|     3|Heikkinen, Miss. ...|female| 26|    0|    0|STON/O2. 3101282|  7.925| null|       S|     26|       f|
+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+-------+--------+
only showing top 3 rows



In [27]:
# Replace invalid ages with -1 (an age is kept only when it is >= 0).
# NOTE(review): a null age does not satisfy the condition either, so it presumably falls
# through to -1 as well — confirm this is intended.
sdf.withColumn("Age", when(sdf["Age"] >= 0, sdf["Age"]).otherwise(-1)).show(8)

+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+-------+
|PassengerId|Survived|Pclass|                Name|   Sex|Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|new_col|
+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+-------+
|          1|       0|     3|Braund, Mr. Owen ...|  male| 22|    1|    0|       A/5 21171|   7.25| null|       S|     22|
|          2|       1|     1|Cumings, Mrs. Joh...|female| 38|    1|    0|        PC 17599|71.2833|  C85|       C|     38|
|          3|       1|     3|Heikkinen, Miss. ...|female| 26|    0|    0|STON/O2. 3101282|  7.925| null|       S|     26|
|          4|       1|     1|Futrelle, Mrs. Ja...|female| 35|    1|    0|          113803|   53.1| C123|       S|     35|
|          5|       0|     3|Allen, Mr. Willia...|  male| 35|    0|    0|          373450|   8.05| null|       S|     35|
|          6|       0|  

In [28]:
sdf.fillna({"Age": 0, "Cabin": "no cabin"}).show(8)

+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+--------+--------+-------+
|PassengerId|Survived|Pclass|                Name|   Sex|Age|SibSp|Parch|          Ticket|   Fare|   Cabin|Embarked|new_col|
+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+--------+--------+-------+
|          1|       0|     3|Braund, Mr. Owen ...|  male| 22|    1|    0|       A/5 21171|   7.25|no cabin|       S|     22|
|          2|       1|     1|Cumings, Mrs. Joh...|female| 38|    1|    0|        PC 17599|71.2833|     C85|       C|     38|
|          3|       1|     3|Heikkinen, Miss. ...|female| 26|    0|    0|STON/O2. 3101282|  7.925|no cabin|       S|     26|
|          4|       1|     1|Futrelle, Mrs. Ja...|female| 35|    1|    0|          113803|   53.1|    C123|       S|     35|
|          5|       0|     3|Allen, Mr. Willia...|  male| 35|    0|    0|          373450|   8.05|no cabin|       S|     35|


In [29]:
sdf.replace("male", "m", "Sex").show(5)

+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+-------+
|PassengerId|Survived|Pclass|                Name|   Sex|Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|new_col|
+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+-------+
|          1|       0|     3|Braund, Mr. Owen ...|     m| 22|    1|    0|       A/5 21171|   7.25| null|       S|     22|
|          2|       1|     1|Cumings, Mrs. Joh...|female| 38|    1|    0|        PC 17599|71.2833|  C85|       C|     38|
|          3|       1|     3|Heikkinen, Miss. ...|female| 26|    0|    0|STON/O2. 3101282|  7.925| null|       S|     26|
|          4|       1|     1|Futrelle, Mrs. Ja...|female| 35|    1|    0|          113803|   53.1| C123|       S|     35|
|          5|       0|     3|Allen, Mr. Willia...|     m| 35|    0|    0|          373450|   8.05| null|       S|     35|
+-----------+--------+--

### 4.4. Sort

In [30]:
sdf.sort("Age", ascending=False).show()

+-----------+--------+------+--------------------+------+---+-----+-----+-----------+-------+-----------+--------+-------+
|PassengerId|Survived|Pclass|                Name|   Sex|Age|SibSp|Parch|     Ticket|   Fare|      Cabin|Embarked|new_col|
+-----------+--------+------+--------------------+------+---+-----+-----+-----------+-------+-----------+--------+-------+
|        631|       1|     1|Barkworth, Mr. Al...|  male| 80|    0|    0|      27042|     30|        A23|       S|     80|
|        852|       0|     3| Svensson, Mr. Johan|  male| 74|    0|    0|     347060|  7.775|       null|       S|     74|
|         97|       0|     1|Goldschmidt, Mr. ...|  male| 71|    0|    0|   PC 17754|34.6542|         A5|       C|     71|
|        494|       0|     1|Artagaveytia, Mr....|  male| 71|    0|    0|   PC 17609|49.5042|       null|       C|     71|
|        673|       0|     2|Mitchell, Mr. Hen...|  male| 70|    0|    0| C.A. 24580|   10.5|       null|       S|     70|
|        746|   

### 4.5. Delete columns

In [31]:
sdf = sdf.drop("new_col")
sdf.show(2)

+-----------+--------+------+--------------------+------+---+-----+-----+---------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex|Age|SibSp|Parch|   Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+---+-----+-----+---------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male| 22|    1|    0|A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female| 38|    1|    0| PC 17599|71.2833|  C85|       C|
+-----------+--------+------+--------------------+------+---+-----+-----+---------+-------+-----+--------+
only showing top 2 rows



### 4.6. Drop duplicates

In [32]:
sdf.drop_duplicates(["Pclass"]).show(5)

+-----------+--------+------+--------------------+------+---+-----+-----+---------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex|Age|SibSp|Parch|   Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+---+-----+-----+---------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male| 22|    1|    0|A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female| 38|    1|    0| PC 17599|71.2833|  C85|       C|
|         10|       1|     2|Nasser, Mrs. Nich...|female| 14|    1|    0|   237736|30.0708| null|       C|
+-----------+--------+------+--------------------+------+---+-----+-----+---------+-------+-----+--------+



### 4.7. User Defined Functions (UDF)
It is possible to define new functions and apply them to a column.
To define one you need to specify the output type (int, str...).
The transformation is applied element-wise.

In [33]:
@udf(IntegerType())
def sum_1(x):
    """Return ``x + 1``, passing ``None`` (a null cell) through unchanged."""
    return None if x is None else x + 1

sdf.withColumn("Next_Age", sum_1(sdf["Age"])).show(3)

+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+--------+
|PassengerId|Survived|Pclass|                Name|   Sex|Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|Next_Age|
+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male| 22|    1|    0|       A/5 21171|   7.25| null|       S|      23|
|          2|       1|     1|Cumings, Mrs. Joh...|female| 38|    1|    0|        PC 17599|71.2833|  C85|       C|      39|
|          3|       1|     3|Heikkinen, Miss. ...|female| 26|    0|    0|STON/O2. 3101282|  7.925| null|       S|      27|
+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+--------+
only showing top 3 rows



## 5. Group data

In [34]:
sdf.groupby("Sex").count().show()

+------+-----+
|   Sex|count|
+------+-----+
|female|  314|
|  male|  577|
+------+-----+



In [35]:
# Per-sex aggregation: number of rows and the maximum age in each group.
# NOTE(review): the dict key is lowercase "sex" while the column is "Sex". It resolves here
# (the output column is "count(sex)"), presumably because column lookup is case-insensitive
# by default — using "Sex" would be more consistent.
sdf.groupby("Sex").agg({"sex": "count", "Age": "max"}).show()

+------+----------+--------+
|   Sex|count(sex)|max(Age)|
+------+----------+--------+
|female|       314|      63|
|  male|       577|      80|
+------+----------+--------+



## 6. SQL

One really useful feature of spark is that you can use **SQL** syntax and **spark** will translate and apply it.

Before using **SQL** you need to register the table:

In [36]:
sdf.createOrReplaceTempView("titanic")

Then you can perform queries using `spark` object:

In [37]:
spark.sql("SELECT * FROM titanic WHERE sex = 'female' LIMIT 5").show()

+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex|Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+
|          2|       1|     1|Cumings, Mrs. Joh...|female| 38|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female| 26|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female| 35|    1|    0|          113803|   53.1| C123|       S|
|          9|       1|     3|Johnson, Mrs. Osc...|female| 27|    0|    2|          347742|11.1333| null|       S|
|         10|       1|     2|Nasser, Mrs. Nich...|female| 14|    1|    0|          237736|30.0708| null|       C|
+-----------+--------+------+--------------------+------+---+-----+-----+---------------

## 7. Write

In [38]:
# Save the dataframe in Parquet format, overwriting any previous output at this path.
# NOTE(review): the inputs are read from "datasets/" but this writes under "data/" — confirm
# that the target directory is intentional.
sdf.write.format("parquet").mode("overwrite").save("data/titanic_train.parquet")

## 8. Interacting with python

In [39]:
sdf.toPandas()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30,B42,S
888,889,0,3,"""Johnston, Miss. Catherine Helen """"Carrie""""""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30,C148,C


<div class="alert alert-warning" role="alert">
    Converting to Pandas loads all the data into memory. Be careful not to convert a dataframe that is too big.
</div>

In [40]:
# Bring the "Name" column back as a plain Python list of values.
[row["Name"] for row in sdf.select("Name").collect()]

['Braund, Mr. Owen Harris',
 'Cumings, Mrs. John Bradley (Florence Briggs Thayer)',
 'Heikkinen, Miss. Laina',
 'Futrelle, Mrs. Jacques Heath (Lily May Peel)',
 'Allen, Mr. William Henry',
 'Moran, Mr. James',
 'McCarthy, Mr. Timothy J',
 'Palsson, Master. Gosta Leonard',
 'Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)',
 'Nasser, Mrs. Nicholas (Adele Achem)',
 'Sandstrom, Miss. Marguerite Rut',
 'Bonnell, Miss. Elizabeth',
 'Saundercock, Mr. William Henry',
 'Andersson, Mr. Anders Johan',
 'Vestrom, Miss. Hulda Amanda Adolfina',
 'Hewlett, Mrs. (Mary D Kingcome) ',
 'Rice, Master. Eugene',
 'Williams, Mr. Charles Eugene',
 'Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)',
 'Masselmani, Mrs. Fatima',
 'Fynney, Mr. Joseph J',
 'Beesley, Mr. Lawrence',
 '"McGowan, Miss. Anna ""Annie"""',
 'Sloper, Mr. William Thompson',
 'Palsson, Miss. Torborg Danira',
 'Asplund, Mrs. Carl Oscar (Selma Augusta Emilia Johansson)',
 'Emir, Mr. Farred Chehab',
 'Fortune, Mr. Charles Alexander'

In [41]:
# The aggregation yields a single row; take its first (only) column as a Python value.
sdf.agg({"Age": "max"}).first()[0]

80