# Data Processing

## Import modules

In [None]:
# import SparkSession
from pyspark.sql import SparkSession

In [None]:
import pyspark.sql.functions as fn
from pyspark.sql.types import StringType,DoubleType,IntegerType

## Set spark session

In [None]:
# Bind `spark` explicitly so the notebook also runs outside a pyspark shell:
# getOrCreate() returns the already-active session when one exists and only
# builds a new one otherwise.
spark = SparkSession.builder.getOrCreate()
# application name of the active session
spark.sparkContext.appName

## Load data

In [None]:
# Read the sample CSV: the first line supplies the column names and
# Spark infers each column's type from the data.
df = spark.read.csv(
    'data/sample_data.csv',
    header=True,
    inferSchema=True,
)
# Register the DataFrame as a temp view so the spark.sql cells can query it.
df.createOrReplaceTempView("dfTable")

## Inspect data

In [None]:
# columns of dataframe
df.columns

In [None]:
# check number of columns
len(df.columns)

In [None]:
# number of records in dataframe
df.count()

In [None]:
# shape of dataset
df.count(),len(df.columns)

In [None]:
# print dataframe schema
df.printSchema()

In [None]:
# display fisrt few rows of dataframe
df.show()
#df.show(10)

In [None]:
# display fisrt 5 rows of dataframe
df.head(5)

In [None]:
# display last 5 rows of dataframe
df.tail(5)

In [None]:
# display first row of dataframe
df.first()

In [None]:
spark.sql('''select * from dfTable limit 3''').show()

## Descriptive statistics

In [None]:
# basic statistics per column: count, mean, stddev, min and max
df.describe().show()

In [None]:
# summary() reports the same statistics plus the 25%, 50% and 75% percentiles
df.summary().show()

## Select columns

In [None]:
# select only 2 columns
df.select('age','mobile').show(5)

In [None]:
# use spark sql
spark.sql('select age, mobile from dfTable limit 5').show()

In [None]:
# use different pyspark sql functions
df.select(
    fn.expr('ratings'), 
    fn.col('family'), 
    fn.column('mobile'))\
.show(3)

In [None]:
df.select(fn.expr('age+1 AS age1')).show(5)

In [None]:
df.selectExpr('age+1 AS age1').show(5)

In [None]:
df.selectExpr(
'*',  # all original columns
'(age>=30) as over30')\
.show(5)

In [None]:
spark.sql('SELECT *, (age>=30) as over30 FROM dfTable LIMIT 5').show()

## Add a column

In [None]:
# with column
df.withColumn('age_after_10_yrs',(df['age']+10)).show(10,False)

In [None]:
# convert data type
df.withColumn('age_double',df['age'].cast(DoubleType())).show(10,False)

In [None]:
# use selectExpr method
df.selectExpr(
'*',  # all original columns
'cast(age as double) as age_double')\
.show(5)

## Delete column

In [None]:
# delete a column
df_new=df.drop('mobile')
df_new.show(5)

In [None]:
df_new=df.drop('age', 'mobile')
df_new.show(5)

## Filter data

In [None]:
# filter the records 
df.filter(df['mobile']=='Vivo').show()

In [None]:
# filter the records 
df.filter(df['mobile']=='Vivo').select('age','ratings','mobile').show()

In [None]:
# filter the records 
df.filter("mobile=='Vivo'").select('age','ratings','mobile').show()

In [None]:
# filter the records with spark sql 
spark.sql("""select age, ratings, mobile from dfTable where mobile=='Vivo'""").show()

In [None]:
# filter the multiple conditions
df.filter(df['mobile']=='Vivo').filter(df['experience'] >10).show()

In [None]:
# filter the multiple conditions
df.filter((df['mobile']=='Vivo')&(df['experience'] >10)).show()

In [None]:
# filter the multiple conditions
df.filter("mobile=='Vivo' and experience>10").show()

In [None]:
# filter the records with spark sql 
spark.sql('''select * from dfTable where mobile=='Vivo' and experience>10''').show()

## Distinct Values

In [None]:
# Distinct Values in a column
df.select('mobile').distinct().show()

In [None]:
# Distinct Values in columns
df.select('ratings', 'mobile').distinct().show()

In [None]:
# distinct value count
df.select('mobile').distinct().count()

In [None]:
# use spark sql
spark.sql('''select distinct(mobile) from dfTable''').show()

In [None]:
# use spark sql
spark.sql('''select count(distinct(mobile)) as counts from dfTable''').show()

## Sort rows

In [None]:
# sort age
df.sort('age').show(10)

In [None]:
# order by ratings and family
df.orderBy('ratings', 'family').show(10)

In [None]:
# descending order
df.orderBy('experience', ascending=False).show(10)

In [None]:
# use spark sql
spark.sql("""select ratings, age, mobile from dfTable 
        order by age desc""").show(10)

## Null values

In [None]:
# create dataframe
data = [('Alice', 22, 52),
        ('John', None , 68),
        ('Mary', 24, 55),
        ('Alan', None, None),
        ('Jane', 32, 48)]

emp=spark.createDataFrame(data, ['name', 'age', 'weight'])
emp.show()

In [None]:
# fill null value
emp.fillna(1).show()

In [None]:
# fill null value
val={'age': 30, 'weight': 50}
emp.fillna(val).show()

In [None]:
# drop null data
emp.dropna().show()

In [None]:
# drop null data
emp.dropna(thresh=2).show()

## Duplicate data

In [None]:
# create dataframe
data = [('Alice', 22, 'USA'),
        ('John', 18 , 'Japan'),
        ('Mary', 24, 'Germany'),
        ('Alice', 22, 'USA'),
        ('Jane', 24, 'Germany')]

cust=spark.createDataFrame(data, ['name', 'age', 'country'])
cust.show()

In [None]:
# drop duplicate rows
cust.dropDuplicates().show()

In [None]:
# drop duplicate rows base on some columns
cust.dropDuplicates(subset=['age', 'country']).show()

In [None]:
# use spark sql
cust.createOrReplaceTempView("custTable")
spark.sql('''select distinct * from custTable''').show()