### Getting Started with PySpark

In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

import numpy
import pandas as pd
from pandas import DataFrame as df
import difflib
import datetime

# `sc` is the SparkContext that gets created when pyspark is started.
# getOrCreate() returns that already-active context when there is one and builds a
# new one otherwise, so this cell also runs on a kernel where `sc` was not pre-defined.
sc = SparkContext.getOrCreate()
hc = HiveContext(sc)

#### If the SparkContext does not get initialized by itself, the following set of commands may be used:

import findspark <br>
findspark.init() <br>
import pyspark # only run after findspark.init() <br>
from pyspark.sql import SparkSession <br>
spark = SparkSession.builder.getOrCreate() <br>
import pandas as pd <br>
sc = spark.sparkContext <br>

###  Make a sample dataframe

#### 1. Using pandas Dataframe:

Read Data into Pandas from csv, txt, excel files, etc. <br> 
Define a Schema: columns and their data types <br>
Use `createDataFrame()` to convert it to a Spark DataFrame

In [12]:
# Two small Titanic-style tables that share the PassengerId key.
# dict(enumerate(values)) builds the {row_index: value} mapping for each column.
data1 = {
    'PassengerId': dict(enumerate([1, 2, 3, 4, 5])),
    'Name': dict(enumerate(['Owen', 'Florence', 'Laina', 'Lily', 'William'])),
    'Gender': dict(enumerate(['M', 'F', 'F', 'F', 'M'])),
    'Survived': dict(enumerate([0, 1, 1, 1, 0])),
}

data2 = {
    'PassengerId': dict(enumerate([1, 2, 3, 4, 5])),
    'Age': dict(enumerate([22, 38, 26, 35, 35])),
    'Fare': dict(enumerate([7.3, 71.3, 7.9, 53.1, 8.0])),
    'Pclass': dict(enumerate([3, 1, 3, 1, 3])),
}

# One pandas frame per table, with columns in dictionary-key order.
df1_pd, df2_pd = (
    pd.DataFrame(table, columns=list(table)) for table in (data1, data2)
)

#### Without defining Data Schema

In [13]:
# No schema is passed, so Spark infers column names and types from each pandas frame.
df1, df2 = (hc.createDataFrame(frame) for frame in (df1_pd, df2_pd))
# df1.show()  # uncomment to preview the rows

In [14]:
# Print the inferred schema as a tree: column name, Spark type and nullability.
df1.printSchema()

root
 |-- Gender: string (nullable = true)
 |-- PassengerId: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- Survived: long (nullable = true)



#### With Defined Data Schema 

In [17]:
# Explicit schema: StructField(name, dataType, nullable).
# The trailing True marks the field as nullable.
# For a date column use e.g. StructField('MeetingDate', DateType(), True).
df_schema_1 = StructType([
    StructField('Gender', StringType(), True),
    StructField('PassengerId', IntegerType(), True),
    StructField('Name', StringType(), True),
    StructField('Survived', IntegerType(), True),
])

# The schema is applied to the pandas columns by position, not by name. Select the
# pandas columns in the schema's field order first, so the result no longer depends
# on the column order df1_pd happens to have (which follows dict key order).
schema_column_order = [field.name for field in df_schema_1.fields]
df_spark_1 = hc.createDataFrame(df1_pd[schema_column_order], df_schema_1)

#### Repeat the same for df2_pd

#### 2.Using Hadoop Tables:


In [None]:
# Option A - import the whole table with hc.table('<database>.<table>').
# cust_1 is the database name in Hadoop and abc_xxx is the table name.
data = hc.table('cust_1.abc_xxx')
# `data` is now a PySpark DataFrame.

# Option B - run a SQL query; the result is again a PySpark DataFrame.
# Note: trim() is applied to string columns to remove surrounding white space,
# since data in Hive can be unstructured.
age_30_query = ''' select name, r_no, age from cust_1.abc_xxx where trim(age) = '30' '''
data = hc.sql(age_30_query)

### 2. Change DataFrames Properties
#### 2.1 Change Column Type
 Available types include <br>

BinaryType <br>
BooleanType <br>
ByteType <br>
DoubleType <br>
DateType <br>
FloatType <br>
IntegerType <br>
etc.

#### Q. Change the data type of PassengerId to String

In [19]:
from pyspark.sql.types import IntegerType, DateType

In [21]:
# withColumn returns a new DataFrame. Because a column named "PassengerId" already
# exists, it is replaced by the string-cast version rather than added as a new column.
passenger_id_as_string = F.col("PassengerId").cast("string")
df_spark_2 = df_spark_1.withColumn("PassengerId", passenger_id_as_string)


In [22]:
# Confirm the cast: PassengerId should now be reported as string.
df_spark_2.printSchema()

root
 |-- Gender: string (nullable = true)
 |-- PassengerId: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Survived: integer (nullable = true)



In [23]:
# dtypes gives the same information as a list of (column name, type name) pairs.
df_spark_2.dtypes

[('Gender', 'string'),
 ('PassengerId', 'string'),
 ('Name', 'string'),
 ('Survived', 'int')]

#### 2.2 Change Column Name

#### Q. Rename Gender Column to Sex

In [24]:
# Rename a single column; all other columns are carried over unchanged.
df_spark_3 = df_spark_2.withColumnRenamed(existing="Gender", new="Sex")

In [25]:
# Confirm the rename: the first column should now be called Sex.
df_spark_3.printSchema()

root
 |-- Sex: string (nullable = true)
 |-- PassengerId: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Survived: integer (nullable = true)



#### 2.3 Modify Column Values and create new columns

#### Q1. Modify values in Sex Column: M to Male and F to Female

In [29]:
# Map the single-letter codes to full labels; any other value falls through to 'NA'.
# trim() removes stray white space around the code before comparing.
sex_code = F.trim(F.col('Sex'))
sex_label = (F.when(sex_code == 'M', 'Male')
              .when(sex_code == 'F', 'Female')
              .otherwise('NA'))
df_spark_4 = df_spark_3.withColumn("Sex", sex_label)

In [39]:
# Evaluating a DataFrame on its own only displays its column names and types;
# call .show() to see the rows.
df_spark_4

DataFrame[Sex: string, PassengerId: string, Name: string, Survived: int]



#### Q2. Create New column 'type_of_data' with constant value: 'train'

In [37]:
# lit() wraps a Python constant as a column, so every row gets the value 'train'.
constant_label = F.lit('train')
df_spark_5 = df_spark_4.withColumn('type_of_data', constant_label)

#### 2.4 Order DataFrame by Specified Column(s)

#### Q. Order by PassengerId in descending order

In [31]:
# Variant 1: order by a Column expression marked as descending.
# NOTE: PassengerId was cast to string earlier, so values are compared as strings.
passenger_id_desc = F.col('PassengerId').desc()
df_spark_4.orderBy(passenger_id_desc).show()

+------+-----------+--------+--------+
|   Sex|PassengerId|    Name|Survived|
+------+-----------+--------+--------+
|  Male|          5| William|       0|
|Female|          4|    Lily|       1|
|Female|          3|   Laina|       1|
|Female|          2|Florence|       1|
|  Male|          1|    Owen|       0|
+------+-----------+--------+--------+



In [40]:
# Variant 2: give the column by name and request descending order via the flag.
df_spark_4.orderBy('PassengerId', ascending=False).show()

+------+-----------+--------+--------+
|   Sex|PassengerId|    Name|Survived|
+------+-----------+--------+--------+
|  Male|          5| William|       0|
|Female|          4|    Lily|       1|
|Female|          3|   Laina|       1|
|Female|          2|Florence|       1|
|  Male|          1|    Owen|       0|
+------+-----------+--------+--------+



#### 2.4 Filter by Specified Column(s)

#### Q. Filter the data to keep only the female population

Filter takes column expression or SQL expression ---


1.Using Column expression

In [36]:
# Column-expression form: keep the rows whose trimmed Sex value equals 'Female'.
is_female = F.trim(F.col('Sex')) == 'Female'
female_data = df_spark_4.where(is_female)

#### col('columnname') is equivalent to dataframe['columnname'], which is equivalent to dataframe.columnname

2. Using SQL expression. Note the double and single quotes

In [34]:
# SQL-expression form: the condition is a string, with the literal in single quotes
# nested inside the double-quoted Python string.
female_condition = "Sex='Female'"
female_data_2 = df_spark_4.where(female_condition)

#### 2.5 Summarize/Aggregate and group by

#### Q. Group By Passenger Class and find average Fare and Age

In [42]:
# Columns to average within each passenger class.
avg_cols = ['Age', 'Fare']

# Grouped view of df2 keyed by Pclass; reused by the aggregation cells that follow.
gdf2 = df2.groupBy('Pclass')
gdf2.avg(*avg_cols).show()

+------+------------------+-----------------+
|Pclass|          avg(Age)|        avg(Fare)|
+------+------------------+-----------------+
|     1|              36.5|             62.2|
|     3|27.666666666666668|7.733333333333333|
+------+------------------+-----------------+



#### Q. Group By Passenger Class and find Total Fare and Average Age and count of passengers

In [43]:
# Dict form of agg: {column name: aggregate function name}.
# The '*' key paired with 'count' counts the rows in each group.
pclass_summary_spec = {'*': 'count', 'Age': 'avg', 'Fare': 'sum'}
gdf2.agg(pclass_summary_spec).show()

+------+--------+------------------+---------+
|Pclass|count(1)|          avg(Age)|sum(Fare)|
+------+--------+------------------+---------+
|     1|       2|              36.5|    124.4|
|     3|       3|27.666666666666668|     23.2|
+------+--------+------------------+---------+



#### Q. Rename the columns count(1), avg(Age), etc.
using toDF().

In [44]:
# toDF renames columns purely by position, so the aggregate columns must come out in a
# known order. The dict form of agg does not promise any particular column order;
# explicit Column expressions are emitted in the order written:
# Pclass, count, avg(Age), sum(Fare).
gdf2.agg(F.count('*'), F.avg('Age'), F.sum('Fare'))\
    .toDF('Pclass', 'counts', 'average_age', 'total_fare')\
    .show()

+------+------+------------------+----------+
|Pclass|counts|       average_age|total_fare|
+------+------+------------------+----------+
|     1|     2|              36.5|     124.4|
|     3|     3|27.666666666666668|      23.2|
+------+------+------------------+----------+



#### Q. Arrange passenger classes by total fare in ascending order

In [46]:
# Total fare per class, smallest total first (orderBy sorts ascending by default).
# The aggregate column is addressed by its generated name 'sum(Fare)'.
fare_by_class = df2.groupBy('Pclass').agg({'Fare': 'sum'})
fare_by_class.orderBy('sum(Fare)').show()

+------+---------+
|Pclass|sum(Fare)|
+------+---------+
|     3|     23.2|
|     1|    124.4|
+------+---------+



#### Q. Get Count of distinct Passenger classes

In [47]:
# De-duplicate the Pclass column, then count the remaining rows.
distinct_classes = df2.select('Pclass').distinct()
distinct_classes.count()

2

In [48]:
# Same de-duplicated selection, but display the distinct class values instead of counting them.
df2.select('Pclass').distinct().show()

+------+
|Pclass|
+------+
|     1|
|     3|
+------+



#### Q. Get Count of 'Distinct' Passenger Ids by Sex

##### Note 1: countDistinct here is a SQL function, hence 'F.countDistinct'. <br>
##### Note 2: Like F.countDistinct, other SQL functions such as F.sum, F.avg, F.max, etc. can be used. <br>
##### Note 3: Here there is no repetition of passenger ids, hence count gives the same result as the count distinct function.

In [49]:
# Count the unique PassengerId values within each Sex group.
unique_passengers = F.countDistinct('PassengerId')
df_spark_4.groupBy('Sex').agg(unique_passengers).show()

+------+------------------+
|   Sex|count(PassengerId)|
+------+------------------+
|Female|                 3|
|  Male|                 2|
+------+------------------+



In [50]:
# Same pattern with another SQL function: total fare per passenger class.
total_fare = F.sum('Fare')
df2.groupBy('Pclass').agg(total_fare).show()

+------+---------+
|Pclass|sum(Fare)|
+------+---------+
|     1|    124.4|
|     3|     23.2|
+------+---------+



### Joins and unions
#### There are two ways to combine dataframes --- joins and unions. The idea here is the same as joining and unioning tables in SQL.

#### Joins: Q. Join the two titanic dataframes by the column PassengerId

In [51]:
# Inner equi-join on the shared key; joining by column name keeps a single
# PassengerId column in the result.
df1.join(df2, on='PassengerId', how='inner').show()

+-----------+------+--------+--------+----+---+------+
|PassengerId|Gender|    Name|Survived|Fare|Age|Pclass|
+-----------+------+--------+--------+----+---+------+
|          1|     M|    Owen|       0| 7.3| 22|     3|
|          2|     F|Florence|       1|71.3| 38|     1|
|          3|     F|   Laina|       1| 7.9| 26|     3|
|          4|     F|    Lily|       1|53.1| 35|     1|
|          5|     M| William|       0| 8.0| 35|     3|
+-----------+------+--------+--------+----+---+------+

