Adapted from [Databricks' tutorial](https://docs.databricks.com/spark/latest/dataframes-datasets/introduction-to-dataframes-python.html)

In [35]:
# Import only the names this notebook uses from pyspark.sql:
# Row (builds individual rows) and SparkSession (entry point to spark).
# An explicit list keeps the namespace clean and shows where each name comes from.
from pyspark.sql import Row, SparkSession

## Step 0 - Create a spark session

* `pyspark` communicates with `spark` through a session
* Similar to `sqlalchemy` session.

In [46]:
# Build the session used by every later cell; getOrCreate() reuses an
# already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName('Ops')
    .getOrCreate()
)

## Overview -  `pyspark.DataFrame`

* A `DataFrame` is a collection of `Row`s
* `Row`s can be distributed over many machines
* `spark`
    * Hides the messy details
    * Optimizes operations

## Creating a `Row` of data

* Use the `Row` class
* Pass data using keywords
    * key == column name
    * value == cell value

In [32]:
# Each keyword becomes a column name and its value the cell value.
department1 = Row(
    id='123456',
    name='Computer Science',
)
department1

Row(id='123456', name='Computer Science')

## Unpacking a `Row` dictionary

* Data is in a row dictionary
* Unpack keywords using `**`

In [26]:
# The row's data starts out as a plain dictionary (column name -> value).
dept2_info = {
    'id': '789012',
    'name': 'Mechanical Engineering',
}
# ** expands the dictionary into keyword arguments for Row.
department2 = Row(**dept2_info)
department2

Row(id='789012', name='Mechanical Engineering')

## Unpacking a list of row dictionaries

In [42]:
# One dictionary per department; every dictionary has the same keys.
dept_info = [
    {'id': 123456, 'name': 'Computer Science'},
    {'id': 789012, 'name': 'Mechanical Engineering'},
    {'id': 345678, 'name': 'Theater and Drama'},
    {'id': 901234, 'name': 'Indoor Recreation'},
]

# Turn each dictionary into a Row by unpacking its keys as keywords.
dept_rows = [Row(**record) for record in dept_info]
dept_rows

[Row(id=123456, name='Computer Science'),
 Row(id=789012, name='Mechanical Engineering'),
 Row(id=345678, name='Theater and Drama'),
 Row(id=901234, name='Indoor Recreation')]

## Access `Row` content with column attributes

In [69]:
# Column values are available as attributes on each Row.
[row.id for row in dept_rows]

[123456, 789012, 345678, 901234]

In [70]:
# Same attribute access, this time for the `name` column.
[row.name for row in dept_rows]

['Computer Science',
 'Mechanical Engineering',
 'Theater and Drama',
 'Indoor Recreation']

## Creating a `pyspark.DataFrame`

* A `DataFrame` is a collection of `Row`s
* Create with `spark.createDataFrame`
* Need to have a `SparkSession` (the `spark` object from Step 0)

In [43]:
# Build a DataFrame from the list of Rows; column names and types
# are taken from the Rows themselves.
df = spark.createDataFrame(data=dept_rows)
df

DataFrame[id: bigint, name: string]

## How to think about a `pyspark.DataFrame`

<img src="./img/pyspark_df.png" width=600>

## Example - `filter` and `collect`

In [45]:
# Keep only the departments whose name starts with 'C', then pull the
# matching rows back into Python as a list of Row objects.
output = (
    df
    .filter(df.name.startswith('C'))
    .collect()
)
output

[Row(id=123456, name='Computer Science')]

## Why is `pyspark` so slow?

* Optimized for 
    * Distributed computation
    * Big data 
* Not great for
    * Local work
    * Small data

## `filter` and `collect` illustrated

<img src="./img/pyspark_filter_collect.gif" width=600>

## Reading a `csv` file with `spark.read.csv`

In [88]:
# Read the CSV, using its first line as the column names.
heros = spark.read.csv(
    './data/heroes_information.csv',
    header=True,
)
heros

DataFrame[Id: string, name: string, Gender: string, Eye color: string, Race: string, Hair color: string, Height: string, Publisher: string, Skin color: string, Alignment: string, Weight: string]

## Inspecting the column types

In [89]:
# Print the schema as a tree: one line per column with its type and nullability.
heros.printSchema()

root
 |-- Id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Eye color: string (nullable = true)
 |-- Race: string (nullable = true)
 |-- Hair color: string (nullable = true)
 |-- Height: string (nullable = true)
 |-- Publisher: string (nullable = true)
 |-- Skin color: string (nullable = true)
 |-- Alignment: string (nullable = true)
 |-- Weight: string (nullable = true)



## Inspecting the content - `take`

In [90]:
# Fetch the first 5 rows as a list of Row objects.
heros.take(5)

[Row(Id='0', name='A-Bomb', Gender='Male', Eye color='yellow', Race='Human', Hair color='No Hair', Height='203.0', Publisher='Marvel Comics', Skin color='-', Alignment='good', Weight='441.0'),
 Row(Id='1', name='Abe Sapien', Gender='Male', Eye color='blue', Race='Icthyo Sapien', Hair color='No Hair', Height='191.0', Publisher='Dark Horse Comics', Skin color='blue', Alignment='good', Weight='65.0'),
 Row(Id='2', name='Abin Sur', Gender='Male', Eye color='blue', Race='Ungaran', Hair color='No Hair', Height='185.0', Publisher='DC Comics', Skin color='red', Alignment='good', Weight='90.0'),
 Row(Id='3', name='Abomination', Gender='Male', Eye color='green', Race='Human / Radiation', Hair color='No Hair', Height='203.0', Publisher='Marvel Comics', Skin color='-', Alignment='bad', Weight='441.0'),
 Row(Id='4', name='Abraxas', Gender='Male', Eye color='blue', Race='Cosmic Entity', Hair color='Black', Height='-99.0', Publisher='Marvel Comics', Skin color='-', Alignment='bad', Weight='-99.0')]

## Inspecting the content - `sample`

In [85]:
# Draw a random ~1% of the rows (the fraction is approximate, not an exact
# count). The fixed seed makes the same sample come back on every run, so
# the notebook is reproducible after Restart & Run All.
heros.sample(fraction=0.01, seed=42).collect()

[Row(Id='198', name='Danny Cooper', Gender='Male', Eye color='brown', Race=None, Hair color='Blond', Height='-99.0', Publisher='HarperCollins', Skin color=None, Alignment='good', Weight='-99.0'),
 Row(Id='353', name='Jar Jar Binks', Gender='Male', Eye color='yellow', Race='Gungan', Hair color=None, Height='193.0', Publisher='George Lucas', Skin color='orange / white', Alignment='good', Weight='-99.0'),
 Row(Id='358', name='Jesse Quick', Gender='Female', Eye color=None, Race='Human', Hair color=None, Height='-99.0', Publisher='DC Comics', Skin color=None, Alignment='good', Weight='-99.0'),
 Row(Id='556', name='Renata Soliz', Gender='Female', Eye color=None, Race=None, Hair color=None, Height='-99.0', Publisher='HarperCollins', Skin color=None, Alignment='good', Weight='-99.0'),
 Row(Id='561', name='Rip Hunter', Gender='Male', Eye color='blue', Race='Human', Hair color='Blond', Height='-99.0', Publisher='DC Comics', Skin color=None, Alignment='good', Weight='-99.0'),
 Row(Id='660', name=

## Did you notice?

<img src="./img/pyspark_missing_values.png" width=400>

## Specifying a `nullValue`

In [92]:
# Re-read the file, this time loading every '-' cell as a null.
heros = spark.read.csv(
    './data/heroes_information.csv',
    header=True,
    nullValue='-',
)
heros.take(5)

[Row(Id='0', name='A-Bomb', Gender='Male', Eye color='yellow', Race='Human', Hair color='No Hair', Height='203.0', Publisher='Marvel Comics', Skin color=None, Alignment='good', Weight='441.0'),
 Row(Id='1', name='Abe Sapien', Gender='Male', Eye color='blue', Race='Icthyo Sapien', Hair color='No Hair', Height='191.0', Publisher='Dark Horse Comics', Skin color='blue', Alignment='good', Weight='65.0'),
 Row(Id='2', name='Abin Sur', Gender='Male', Eye color='blue', Race='Ungaran', Hair color='No Hair', Height='185.0', Publisher='DC Comics', Skin color='red', Alignment='good', Weight='90.0'),
 Row(Id='3', name='Abomination', Gender='Male', Eye color='green', Race='Human / Radiation', Hair color='No Hair', Height='203.0', Publisher='Marvel Comics', Skin color=None, Alignment='bad', Weight='441.0'),
 Row(Id='4', name='Abraxas', Gender='Male', Eye color='blue', Race='Cosmic Entity', Hair color='Black', Height='-99.0', Publisher='Marvel Comics', Skin color=None, Alignment='bad', Weight='-99.0')

## Did you notice?

<img src="./img/pyspark_default_types.png" width=400>

Default type is a string

## Letting `spark` guess the types

Set `inferSchema=True`

In [95]:
# Re-read the file and let spark infer each column's type from the data
# instead of defaulting every column to string.
heros = spark.read.csv(
    './data/heroes_information.csv',
    header=True,
    inferSchema=True,
    nullValue='-',
)
heros

DataFrame[Id: int, name: string, Gender: string, Eye color: string, Race: string, Hair color: string, Height: double, Publisher: string, Skin color: string, Alignment: string, Weight: double]

## Checking the column types after `inferSchema`

In this case, `spark` guessed correctly

In [96]:
# Print the schema again to confirm the inferred column types.
heros.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Eye color: string (nullable = true)
 |-- Race: string (nullable = true)
 |-- Hair color: string (nullable = true)
 |-- Height: double (nullable = true)
 |-- Publisher: string (nullable = true)
 |-- Skin color: string (nullable = true)
 |-- Alignment: string (nullable = true)
 |-- Weight: double (nullable = true)



## Inspecting the content - `take`

In [97]:
# Fetch the first 5 rows; numeric columns now hold numbers rather than strings.
heros.take(5)

[Row(Id=0, name='A-Bomb', Gender='Male', Eye color='yellow', Race='Human', Hair color='No Hair', Height=203.0, Publisher='Marvel Comics', Skin color=None, Alignment='good', Weight=441.0),
 Row(Id=1, name='Abe Sapien', Gender='Male', Eye color='blue', Race='Icthyo Sapien', Hair color='No Hair', Height=191.0, Publisher='Dark Horse Comics', Skin color='blue', Alignment='good', Weight=65.0),
 Row(Id=2, name='Abin Sur', Gender='Male', Eye color='blue', Race='Ungaran', Hair color='No Hair', Height=185.0, Publisher='DC Comics', Skin color='red', Alignment='good', Weight=90.0),
 Row(Id=3, name='Abomination', Gender='Male', Eye color='green', Race='Human / Radiation', Hair color='No Hair', Height=203.0, Publisher='Marvel Comics', Skin color=None, Alignment='bad', Weight=441.0),
 Row(Id=4, name='Abraxas', Gender='Male', Eye color='blue', Race='Cosmic Entity', Hair color='Black', Height=-99.0, Publisher='Marvel Comics', Skin color=None, Alignment='bad', Weight=-99.0)]

## Explicit `schema` specification

Format is `add(name, type, nullable?)`

In [98]:
from pyspark.sql.types import DoubleType, IntegerType, StringType, StructType

# Declare every column explicitly instead of relying on inference.
# Each add(name, type, nullable) appends one column and returns the
# schema, which is what allows the calls to be chained.
hero_schema = (
    StructType()
    .add('Id', IntegerType(), False)
    .add('name', StringType(), True)
    .add('Gender', StringType(), True)
    .add('Eye color', StringType(), True)
    .add('Race', StringType(), True)
    .add('Hair color', StringType(), True)
    .add('Height', DoubleType(), True)
    .add('Publisher', StringType(), True)
    .add('Skin color', StringType(), True)
    .add('Alignment', StringType(), True)
    .add('Weight', DoubleType(), True)
)

# Read the file with the declared schema; '-' cells are still loaded as nulls.
heros = spark.read.csv(
    './data/heroes_information.csv',
    header=True,
    schema=hero_schema,
    nullValue='-',
)
heros

DataFrame[Id: int, name: string, Gender: string, Eye color: string, Race: string, Hair color: string, Height: double, Publisher: string, Skin color: string, Alignment: string, Weight: double]

In [99]:
# Fetch the first 5 rows read with the explicit schema.
heros.take(5)

[Row(Id=0, name='A-Bomb', Gender='Male', Eye color='yellow', Race='Human', Hair color='No Hair', Height=203.0, Publisher='Marvel Comics', Skin color=None, Alignment='good', Weight=441.0),
 Row(Id=1, name='Abe Sapien', Gender='Male', Eye color='blue', Race='Icthyo Sapien', Hair color='No Hair', Height=191.0, Publisher='Dark Horse Comics', Skin color='blue', Alignment='good', Weight=65.0),
 Row(Id=2, name='Abin Sur', Gender='Male', Eye color='blue', Race='Ungaran', Hair color='No Hair', Height=185.0, Publisher='DC Comics', Skin color='red', Alignment='good', Weight=90.0),
 Row(Id=3, name='Abomination', Gender='Male', Eye color='green', Race='Human / Radiation', Hair color='No Hair', Height=203.0, Publisher='Marvel Comics', Skin color=None, Alignment='bad', Weight=441.0),
 Row(Id=4, name='Abraxas', Gender='Male', Eye color='blue', Race='Cosmic Entity', Hair color='Black', Height=-99.0, Publisher='Marvel Comics', Skin color=None, Alignment='bad', Weight=-99.0)]

## <font color="red"> Exercise 1 </font>

Define a `schema` and read in `./data/super_hero_powers.csv`

In [64]:
from pyspark.sql.types import StructType
from pyspark.sql.types import BooleanType, StringType 

# Your code here

# Appendix

## Creating rows from list of data

## Creating a Row class

* Pass `Row` the column names
* Creates a specialized `Row` class

In [65]:
# Passing only column names (no values) yields a reusable row template;
# calling the template later with values fills the columns in this order.
Employee = Row(
    "firstName",
    "lastName",
    "email",
    "salary",
)
Employee

<Row(firstName, lastName, email, salary)>

## Creating an `Employee` instance

* Pass the data to `Employee` to make a row
* Order matters ... use the same order as names

In [66]:
# Values are positional: they must follow the column order given to Employee.
employee1 = Employee(
    'michael',
    'armbrust',
    'no-reply@berkeley.edu',
    100000,
)
employee1

Row(firstName='michael', lastName='armbrust', email='no-reply@berkeley.edu', salary=100000)

## Unpacking a data list

* Suppose the data is in a list/tuple.
* Use sequence unpacking with `*`

In [67]:
# One employee's data as a tuple, in column order:
# firstName, lastName, email, salary.
empl2_info = (
    'xiangrui',
    'meng',
    'no-reply@stanford.edu',
    120000,
)
empl2_info

('xiangrui', 'meng', 'no-reply@stanford.edu', 120000)

In [68]:
# * expands the tuple into positional arguments, one per column.
employee2 = Employee(*empl2_info)
employee2

Row(firstName='xiangrui', lastName='meng', email='no-reply@stanford.edu', salary=120000)

## Unpacking a list of data tuples

In [4]:
# Row template naming the four employee columns.
Employee = Row(
    "firstName",
    "lastName",
    "email",
    "salary",
)

# Raw employee data, one tuple per employee in column order.
# None marks a missing value.
employees = [
    ('michael', 'armbrust', 'no-reply@berkeley.edu', 100000),
    ('xiangrui', 'meng', 'no-reply@stanford.edu', 120000),
    ('matei', None, 'no-reply@waterloo.edu', 140000),
    (None, 'wendell', 'no-reply@berkeley.edu', 160000),
]

# Unpack each tuple into the template to get one Row per employee.
emp_rows = [Employee(*record) for record in employees]
emp_rows

[Row(firstName='michael', lastName='armbrust', email='no-reply@berkeley.edu', salary=100000),
 Row(firstName='xiangrui', lastName='meng', email='no-reply@stanford.edu', salary=120000),
 Row(firstName='matei', lastName=None, email='no-reply@waterloo.edu', salary=140000),
 Row(firstName=None, lastName='wendell', email='no-reply@berkeley.edu', salary=160000)]