# Data Processing

## Import modules

In [1]:
# import SparkSession
from pyspark.sql import SparkSession

In [2]:
import pyspark.sql.functions as fn
from pyspark.sql.types import StringType,DoubleType,IntegerType

## Set Spark session

In [3]:
# Reuse the active SparkSession (the pyspark shell pre-creates one) or start a
# new one, so the notebook also runs on a fresh kernel where no `spark`
# variable has been injected.
spark = SparkSession.builder.getOrCreate()

# Name of the application backing this session.
spark.sparkContext.appName

'PySparkShell'

## Load data

In [4]:
# Read the sample CSV: the first line is the header and column types are
# inferred from the data. Register the result as the "dfTable" view so the
# Spark SQL cells can query it.
df = spark.read.csv(
    'data/sample_data.csv',
    header=True,
    inferSchema=True,
)
df.createOrReplaceTempView("dfTable")

                                                                                

## Inspect data

In [5]:
# List the column names of the DataFrame.
df.columns

['ratings', 'age', 'experience', 'family', 'mobile']

In [6]:
# Number of columns = length of the column-name list.
len(df.columns)

5

In [7]:
# Count the records in the DataFrame.
df.count()

                                                                                

33

In [8]:
# Shape of the dataset as a (rows, columns) tuple.
n_rows = df.count()
n_cols = len(df.columns)
(n_rows, n_cols)

(33, 5)

In [9]:
# Print the schema: each column's name, inferred type and nullability.
df.printSchema()

root
 |-- ratings: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: double (nullable = true)
 |-- family: integer (nullable = true)
 |-- mobile: string (nullable = true)



In [10]:
# Display the first rows of the DataFrame. With no argument show() prints the
# top 20 rows; pass a number, e.g. show(10), to change that.
df.show()

+-------+---+----------+------+-------+
|ratings|age|experience|family| mobile|
+-------+---+----------+------+-------+
|      3| 32|       9.0|     3|   Vivo|
|      3| 27|      13.0|     3|  Apple|
|      4| 22|       2.5|     0|Samsung|
|      4| 37|      16.5|     4|  Apple|
|      5| 27|       9.0|     1|     MI|
|      4| 27|       9.0|     0|   Oppo|
|      5| 37|      23.0|     5|   Vivo|
|      5| 37|      23.0|     5|Samsung|
|      3| 22|       2.5|     0|  Apple|
|      3| 27|       6.0|     0|     MI|
|      2| 27|       6.0|     2|   Oppo|
|      5| 27|       6.0|     2|Samsung|
|      3| 37|      16.5|     5|  Apple|
|      5| 27|       6.0|     0|     MI|
|      4| 22|       6.0|     1|   Oppo|
|      4| 37|       9.0|     2|Samsung|
|      4| 27|       6.0|     1|  Apple|
|      1| 37|      23.0|     5|     MI|
|      2| 42|      23.0|     2|   Oppo|
|      4| 37|       6.0|     0|   Vivo|
+-------+---+----------+------+-------+
only showing top 20 rows



                                                                                

In [11]:
# Return the first 5 rows as a list of Row objects.
df.head(5)

                                                                                

[Row(ratings=3, age=32, experience=9.0, family=3, mobile='Vivo'),
 Row(ratings=3, age=27, experience=13.0, family=3, mobile='Apple'),
 Row(ratings=4, age=22, experience=2.5, family=0, mobile='Samsung'),
 Row(ratings=4, age=37, experience=16.5, family=4, mobile='Apple'),
 Row(ratings=5, age=27, experience=9.0, family=1, mobile='MI')]

In [12]:
# Return the last 5 rows as a list of Row objects.
df.tail(5)

[Row(ratings=2, age=32, experience=16.5, family=2, mobile='Oppo'),
 Row(ratings=3, age=27, experience=6.0, family=0, mobile='MI'),
 Row(ratings=3, age=27, experience=6.0, family=0, mobile='MI'),
 Row(ratings=4, age=22, experience=6.0, family=1, mobile='Oppo'),
 Row(ratings=4, age=37, experience=6.0, family=0, mobile='Vivo')]

In [13]:
# Return the first row as a single Row object.
df.first()

Row(ratings=3, age=32, experience=9.0, family=3, mobile='Vivo')

In [14]:
# Same preview through Spark SQL: first 3 rows of the registered view.
preview_query = '''select * from dfTable limit 3'''
spark.sql(preview_query).show()

+-------+---+----------+------+-------+
|ratings|age|experience|family| mobile|
+-------+---+----------+------+-------+
|      3| 32|       9.0|     3|   Vivo|
|      3| 27|      13.0|     3|  Apple|
|      4| 22|       2.5|     0|Samsung|
+-------+---+----------+------+-------+



## Descriptive statistics

In [15]:
# Basic descriptive statistics (count, mean, stddev, min, max) per column.
df.describe().show()

+-------+------------------+------------------+------------------+------------------+------+
|summary|           ratings|               age|        experience|            family|mobile|
+-------+------------------+------------------+------------------+------------------+------+
|  count|                33|                33|                33|                33|    33|
|   mean|3.5757575757575757|30.484848484848484|10.303030303030303|1.8181818181818181|  null|
| stddev|1.1188806636071336|  6.18527087180309| 6.770731351213326|1.8448330794164254|  null|
|    min|                 1|                22|               2.5|                 0| Apple|
|    max|                 5|                42|              23.0|                 5|  Vivo|
+-------+------------------+------------------+------------------+------------------+------+



In [16]:
# Extended summary: the describe() statistics plus the 25%, 50% and 75%
# percentiles.
df.summary().show()

+-------+------------------+------------------+------------------+------------------+------+
|summary|           ratings|               age|        experience|            family|mobile|
+-------+------------------+------------------+------------------+------------------+------+
|  count|                33|                33|                33|                33|    33|
|   mean|3.5757575757575757|30.484848484848484|10.303030303030303|1.8181818181818181|  null|
| stddev|1.1188806636071336|  6.18527087180309| 6.770731351213326|1.8448330794164254|  null|
|    min|                 1|                22|               2.5|                 0| Apple|
|    25%|                 3|                27|               6.0|                 0|  null|
|    50%|                 4|                27|               6.0|                 1|  null|
|    75%|                 4|                37|              16.5|                 3|  null|
|    max|                 5|                42|              23.0|    

## Select columns

In [17]:
# Project just the age and mobile columns and preview five rows.
age_mobile = df.select('age', 'mobile')
age_mobile.show(5)

+---+-------+
|age| mobile|
+---+-------+
| 32|   Vivo|
| 27|  Apple|
| 22|Samsung|
| 37|  Apple|
| 27|     MI|
+---+-------+
only showing top 5 rows



In [18]:
# Same projection expressed as a Spark SQL query against the dfTable view.
spark.sql('select age, mobile from dfTable limit 5').show()

+---+-------+
|age| mobile|
+---+-------+
| 32|   Vivo|
| 27|  Apple|
| 22|Samsung|
| 37|  Apple|
| 27|     MI|
+---+-------+



In [19]:
# Three ways to reference a column from pyspark.sql.functions:
# expr(), col() and column().
(
    df.select(
        fn.expr('ratings'),
        fn.col('family'),
        fn.column('mobile'),
    )
    .show(3)
)

+-------+------+-------+
|ratings|family| mobile|
+-------+------+-------+
|      3|     3|   Vivo|
|      3|     3|  Apple|
|      4|     0|Samsung|
+-------+------+-------+
only showing top 3 rows



                                                                                

In [20]:
# Compute a derived column from a SQL expression string and alias it as age1.
df.select(fn.expr('age+1 AS age1')).show(5)

+----+
|age1|
+----+
|  33|
|  28|
|  23|
|  38|
|  28|
+----+
only showing top 5 rows



In [21]:
# selectExpr() takes the SQL expression string directly, producing the same
# result as select(fn.expr(...)).
df.selectExpr('age+1 AS age1').show(5)

+----+
|age1|
+----+
|  33|
|  28|
|  23|
|  38|
|  28|
+----+
only showing top 5 rows



In [22]:
# Keep every original column and append a boolean flag for age >= 30.
(
    df.selectExpr(
        '*',  # all original columns
        '(age>=30) as over30',
    )
    .show(5)
)

+-------+---+----------+------+-------+------+
|ratings|age|experience|family| mobile|over30|
+-------+---+----------+------+-------+------+
|      3| 32|       9.0|     3|   Vivo|  true|
|      3| 27|      13.0|     3|  Apple| false|
|      4| 22|       2.5|     0|Samsung| false|
|      4| 37|      16.5|     4|  Apple|  true|
|      5| 27|       9.0|     1|     MI| false|
+-------+---+----------+------+-------+------+
only showing top 5 rows



In [23]:
# Spark SQL version: all columns plus the boolean over30 flag, first 5 rows.
spark.sql('SELECT *, (age>=30) as over30 FROM dfTable LIMIT 5').show()

+-------+---+----------+------+-------+------+
|ratings|age|experience|family| mobile|over30|
+-------+---+----------+------+-------+------+
|      3| 32|       9.0|     3|   Vivo|  true|
|      3| 27|      13.0|     3|  Apple| false|
|      4| 22|       2.5|     0|Samsung| false|
|      4| 37|      16.5|     4|  Apple|  true|
|      5| 27|       9.0|     1|     MI| false|
+-------+---+----------+------+-------+------+



## Add a column

In [24]:
# Add a derived column (age + 10) with withColumn(); df itself is unchanged.
# truncate=False prints full cell contents instead of shortening long values.
age_in_10_years = df['age'] + 10
df.withColumn('age_after_10_yrs', age_in_10_years).show(10, truncate=False)

+-------+---+----------+------+-------+----------------+
|ratings|age|experience|family|mobile |age_after_10_yrs|
+-------+---+----------+------+-------+----------------+
|3      |32 |9.0       |3     |Vivo   |42              |
|3      |27 |13.0      |3     |Apple  |37              |
|4      |22 |2.5       |0     |Samsung|32              |
|4      |37 |16.5      |4     |Apple  |47              |
|5      |27 |9.0       |1     |MI     |37              |
|4      |27 |9.0       |0     |Oppo   |37              |
|5      |37 |23.0      |5     |Vivo   |47              |
|5      |37 |23.0      |5     |Samsung|47              |
|3      |22 |2.5       |0     |Apple  |32              |
|3      |27 |6.0       |0     |MI     |37              |
+-------+---+----------+------+-------+----------------+
only showing top 10 rows



In [25]:
# Add a copy of age cast from integer to double.
age_as_double = df['age'].cast(DoubleType())
df.withColumn('age_double', age_as_double).show(10, truncate=False)

+-------+---+----------+------+-------+----------+
|ratings|age|experience|family|mobile |age_double|
+-------+---+----------+------+-------+----------+
|3      |32 |9.0       |3     |Vivo   |32.0      |
|3      |27 |13.0      |3     |Apple  |27.0      |
|4      |22 |2.5       |0     |Samsung|22.0      |
|4      |37 |16.5      |4     |Apple  |37.0      |
|5      |27 |9.0       |1     |MI     |27.0      |
|4      |27 |9.0       |0     |Oppo   |27.0      |
|5      |37 |23.0      |5     |Vivo   |37.0      |
|5      |37 |23.0      |5     |Samsung|37.0      |
|3      |22 |2.5       |0     |Apple  |22.0      |
|3      |27 |6.0       |0     |MI     |27.0      |
+-------+---+----------+------+-------+----------+
only showing top 10 rows



In [26]:
# Same cast written as a SQL expression via selectExpr().
(
    df.selectExpr(
        '*',  # all original columns
        'cast(age as double) as age_double',
    )
    .show(5)
)

+-------+---+----------+------+-------+----------+
|ratings|age|experience|family| mobile|age_double|
+-------+---+----------+------+-------+----------+
|      3| 32|       9.0|     3|   Vivo|      32.0|
|      3| 27|      13.0|     3|  Apple|      27.0|
|      4| 22|       2.5|     0|Samsung|      22.0|
|      4| 37|      16.5|     4|  Apple|      37.0|
|      5| 27|       9.0|     1|     MI|      27.0|
+-------+---+----------+------+-------+----------+
only showing top 5 rows



## Delete column

In [27]:
# drop() returns a new DataFrame without the given column; df keeps all five.
df_new = df.drop('mobile')
df_new.show(n=5)

+-------+---+----------+------+
|ratings|age|experience|family|
+-------+---+----------+------+
|      3| 32|       9.0|     3|
|      3| 27|      13.0|     3|
|      4| 22|       2.5|     0|
|      4| 37|      16.5|     4|
|      5| 27|       9.0|     1|
+-------+---+----------+------+
only showing top 5 rows



In [28]:
# drop() accepts several column names at once. This rebinds df_new to a new
# DataFrame built from df without age and mobile.
df_new = df.drop('age', 'mobile')
df_new.show(n=5)

+-------+----------+------+
|ratings|experience|family|
+-------+----------+------+
|      3|       9.0|     3|
|      3|      13.0|     3|
|      4|       2.5|     0|
|      4|      16.5|     4|
|      5|       9.0|     1|
+-------+----------+------+
only showing top 5 rows



## Filter data

In [29]:
# Keep only the rows whose mobile brand is Vivo (Column-expression form).
is_vivo = df['mobile'] == 'Vivo'
df.filter(is_vivo).show()

+-------+---+----------+------+------+
|ratings|age|experience|family|mobile|
+-------+---+----------+------+------+
|      3| 32|       9.0|     3|  Vivo|
|      5| 37|      23.0|     5|  Vivo|
|      4| 37|       6.0|     0|  Vivo|
|      5| 37|      13.0|     1|  Vivo|
|      4| 37|       6.0|     0|  Vivo|
+-------+---+----------+------+------+



In [30]:
# Filter to the Vivo rows first, then project three columns.
vivo_rows = df.filter(df['mobile'] == 'Vivo')
vivo_rows.select('age', 'ratings', 'mobile').show()

+---+-------+------+
|age|ratings|mobile|
+---+-------+------+
| 32|      3|  Vivo|
| 37|      5|  Vivo|
| 37|      4|  Vivo|
| 37|      5|  Vivo|
| 37|      4|  Vivo|
+---+-------+------+



In [31]:
# Same filter with the condition given as a SQL expression string.
df.filter("mobile=='Vivo'").select('age','ratings','mobile').show()

+---+-------+------+
|age|ratings|mobile|
+---+-------+------+
| 32|      3|  Vivo|
| 37|      5|  Vivo|
| 37|      4|  Vivo|
| 37|      5|  Vivo|
| 37|      4|  Vivo|
+---+-------+------+



In [32]:
# Same filter and projection as a Spark SQL query against the dfTable view.
spark.sql("""select age, ratings, mobile from dfTable where mobile=='Vivo'""").show()

+---+-------+------+
|age|ratings|mobile|
+---+-------+------+
| 32|      3|  Vivo|
| 37|      5|  Vivo|
| 37|      4|  Vivo|
| 37|      5|  Vivo|
| 37|      4|  Vivo|
+---+-------+------+



In [33]:
# Multiple conditions by chaining filter() calls: each call narrows the
# previous result further.
vivo_rows = df.filter(df['mobile'] == 'Vivo')
vivo_rows.filter(df['experience'] > 10).show()

+-------+---+----------+------+------+
|ratings|age|experience|family|mobile|
+-------+---+----------+------+------+
|      5| 37|      23.0|     5|  Vivo|
|      5| 37|      13.0|     1|  Vivo|
+-------+---+----------+------+------+



In [34]:
# Multiple conditions in a single filter(): combine Column predicates with &.
is_vivo = df['mobile'] == 'Vivo'
is_experienced = df['experience'] > 10
df.filter(is_vivo & is_experienced).show()

+-------+---+----------+------+------+
|ratings|age|experience|family|mobile|
+-------+---+----------+------+------+
|      5| 37|      23.0|     5|  Vivo|
|      5| 37|      13.0|     1|  Vivo|
+-------+---+----------+------+------+



In [35]:
# Multiple conditions written as one SQL expression string joined with `and`.
df.filter("mobile=='Vivo' and experience>10").show()

+-------+---+----------+------+------+
|ratings|age|experience|family|mobile|
+-------+---+----------+------+------+
|      5| 37|      23.0|     5|  Vivo|
|      5| 37|      13.0|     1|  Vivo|
+-------+---+----------+------+------+



In [36]:
# Same two-condition filter as a Spark SQL query against the dfTable view.
spark.sql('''select * from dfTable where mobile=='Vivo' and experience>10''').show()

+-------+---+----------+------+------+
|ratings|age|experience|family|mobile|
+-------+---+----------+------+------+
|      5| 37|      23.0|     5|  Vivo|
|      5| 37|      13.0|     1|  Vivo|
+-------+---+----------+------+------+



## Distinct values

In [37]:
# Distinct values of a single column.
df.select('mobile').distinct().show()

+-------+
| mobile|
+-------+
|     MI|
|   Oppo|
|Samsung|
|   Vivo|
|  Apple|
+-------+



In [38]:
# Distinct (ratings, mobile) combinations across two columns.
df.select('ratings', 'mobile').distinct().show()

+-------+-------+
|ratings| mobile|
+-------+-------+
|      5|Samsung|
|      5|     MI|
|      4|  Apple|
|      3|  Apple|
|      2|   Oppo|
|      4|   Oppo|
|      1|     MI|
|      3|     MI|
|      4|Samsung|
|      5|   Vivo|
|      4|   Vivo|
|      3|   Vivo|
|      2|Samsung|
+-------+-------+



In [39]:
# Number of distinct mobile brands.
df.select('mobile').distinct().count()

5

In [40]:
# Spark SQL version: distinct mobile brands from the dfTable view.
spark.sql('''select distinct(mobile) from dfTable''').show()

+-------+
| mobile|
+-------+
|     MI|
|   Oppo|
|Samsung|
|   Vivo|
|  Apple|
+-------+



In [41]:
# Spark SQL version: count of distinct mobile brands, returned as `counts`.
spark.sql('''select count(distinct(mobile)) as counts from dfTable''').show()

+------+
|counts|
+------+
|     5|
+------+



## Sort rows

In [42]:
# Sort by age (ascending) and show the first 10 rows.
df.sort('age').show(10)

+-------+---+----------+------+-------+
|ratings|age|experience|family| mobile|
+-------+---+----------+------+-------+
|      4| 22|       6.0|     1|   Oppo|
|      4| 22|       2.5|     0|Samsung|
|      5| 22|       2.5|     0|Samsung|
|      3| 22|       2.5|     0|  Apple|
|      4| 22|       6.0|     1|   Oppo|
|      5| 27|       6.0|     0|     MI|
|      5| 27|       6.0|     2|Samsung|
|      5| 27|       9.0|     1|     MI|
|      4| 27|       6.0|     1|  Apple|
|      3| 27|      13.0|     3|  Apple|
+-------+---+----------+------+-------+
only showing top 10 rows



In [43]:
# Order by ratings first, then by family within equal ratings (both ascending).
df.orderBy('ratings', 'family').show(10)

+-------+---+----------+------+-------+
|ratings|age|experience|family| mobile|
+-------+---+----------+------+-------+
|      1| 37|      23.0|     5|     MI|
|      2| 27|       6.0|     2|   Oppo|
|      2| 32|      16.5|     2|   Oppo|
|      2| 42|      23.0|     2|   Oppo|
|      2| 27|       9.0|     2|Samsung|
|      2| 27|       6.0|     2|   Oppo|
|      3| 22|       2.5|     0|  Apple|
|      3| 27|       6.0|     0|     MI|
|      3| 27|       6.0|     0|     MI|
|      3| 27|       6.0|     0|     MI|
+-------+---+----------+------+-------+
only showing top 10 rows



In [44]:
# Descending order: the largest experience values come first.
df.orderBy('experience', ascending=False).show(10)

+-------+---+----------+------+-------+
|ratings|age|experience|family| mobile|
+-------+---+----------+------+-------+
|      5| 37|      23.0|     5|   Vivo|
|      5| 37|      23.0|     5|Samsung|
|      1| 37|      23.0|     5|     MI|
|      3| 42|      23.0|     5|     MI|
|      2| 42|      23.0|     2|   Oppo|
|      3| 37|      16.5|     5|  Apple|
|      2| 32|      16.5|     2|   Oppo|
|      4| 37|      16.5|     4|  Apple|
|      3| 37|      16.5|     5|  Apple|
|      3| 27|      13.0|     3|  Apple|
+-------+---+----------+------+-------+
only showing top 10 rows



In [45]:
# Spark SQL version: sort by age, oldest first, and show the first 10 rows.
spark.sql("""select ratings, age, mobile from dfTable 
        order by age desc""").show(10)

+-------+---+-------+
|ratings|age| mobile|
+-------+---+-------+
|      2| 42|   Oppo|
|      3| 42|     MI|
|      4| 37|  Apple|
|      5| 37|   Vivo|
|      3| 37|  Apple|
|      1| 37|     MI|
|      5| 37|Samsung|
|      4| 37|Samsung|
|      4| 37|   Vivo|
|      3| 37|  Apple|
+-------+---+-------+
only showing top 10 rows



## Null values

In [46]:
# Build a small employee DataFrame that contains missing values (None) in the
# age and weight columns, for the null-handling examples.
emp_columns = ['name', 'age', 'weight']
data = [
    ('Alice', 22, 52),
    ('John', None, 68),
    ('Mary', 24, 55),
    ('Alan', None, None),
    ('Jane', 32, 48),
]

emp = spark.createDataFrame(data, emp_columns)
emp.show()

+-----+----+------+
| name| age|weight|
+-----+----+------+
|Alice|  22|    52|
| John|null|    68|
| Mary|  24|    55|
| Alan|null|  null|
| Jane|  32|    48|
+-----+----+------+



In [47]:
# Replace the nulls in age and weight with the same constant, 1.
emp.fillna(1).show()

+-----+---+------+
| name|age|weight|
+-----+---+------+
|Alice| 22|    52|
| John|  1|    68|
| Mary| 24|    55|
| Alan|  1|     1|
| Jane| 32|    48|
+-----+---+------+



In [48]:
# Fill nulls with a different replacement per column (column name -> value).
val = {
    'age': 30,
    'weight': 50,
}
emp.fillna(value=val).show()

+-----+---+------+
| name|age|weight|
+-----+---+------+
|Alice| 22|    52|
| John| 30|    68|
| Mary| 24|    55|
| Alan| 30|    50|
| Jane| 32|    48|
+-----+---+------+



In [49]:
# Drop every row that contains at least one null value.
emp.dropna().show()

+-----+---+------+
| name|age|weight|
+-----+---+------+
|Alice| 22|    52|
| Mary| 24|    55|
| Jane| 32|    48|
+-----+---+------+



In [50]:
# Keep only rows with at least 2 non-null values; Alan has just one (name),
# so his row is dropped while John's (one null) is kept.
emp.dropna(thresh=2).show()

+-----+----+------+
| name| age|weight|
+-----+----+------+
|Alice|  22|    52|
| John|null|    68|
| Mary|  24|    55|
| Jane|  32|    48|
+-----+----+------+



## Duplicate data

In [51]:
# Build a small customer DataFrame that contains one fully duplicated row
# (Alice) and two rows sharing age and country (Mary, Jane).
cust_columns = ['name', 'age', 'country']
data = [
    ('Alice', 22, 'USA'),
    ('John', 18, 'Japan'),
    ('Mary', 24, 'Germany'),
    ('Alice', 22, 'USA'),
    ('Jane', 24, 'Germany'),
]

cust = spark.createDataFrame(data, cust_columns)
cust.show()

+-----+---+-------+
| name|age|country|
+-----+---+-------+
|Alice| 22|    USA|
| John| 18|  Japan|
| Mary| 24|Germany|
|Alice| 22|    USA|
| Jane| 24|Germany|
+-----+---+-------+



In [52]:
# Drop rows that are exact duplicates across all columns.
cust.dropDuplicates().show()

+-----+---+-------+
| name|age|country|
+-----+---+-------+
| Jane| 24|Germany|
|Alice| 22|    USA|
| Mary| 24|Germany|
| John| 18|  Japan|
+-----+---+-------+



In [53]:
# Drop duplicate rows based on a subset of columns: one row is kept per
# (age, country) pair. NOTE(review): which of the matching rows survives is
# not controlled here - confirm before relying on it.
cust.dropDuplicates(subset=['age', 'country']).show()

+-----+---+-------+
| name|age|country|
+-----+---+-------+
| John| 18|  Japan|
|Alice| 22|    USA|
| Mary| 24|Germany|
+-----+---+-------+



In [54]:
# Spark SQL version: register cust as the custTable view, then select the
# distinct rows.
cust.createOrReplaceTempView("custTable")
spark.sql('''select distinct * from custTable''').show()

+-----+---+-------+
| name|age|country|
+-----+---+-------+
| John| 18|  Japan|
|Alice| 22|    USA|
| Mary| 24|Germany|
| Jane| 24|Germany|
+-----+---+-------+



22/05/25 15:48:50 ERROR StandaloneSchedulerBackend: Application has been killed. Reason: Master removed our application: KILLED
22/05/25 15:48:50 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exiting due to error from cluster scheduler: Master removed our application: KILLED
	at org.apache.spark.scheduler.TaskSchedulerImpl.error(TaskSchedulerImpl.scala:919)
	at org.apache.spark.scheduler.cluster.StandaloneSchedulerBackend.dead(StandaloneSchedulerBackend.scala:154)
	at org.apache.spark.deploy.client.StandaloneAppClient$ClientEndpoint.markDead(StandaloneAppClient.scala:262)
	at org.apache.spark.deploy.client.StandaloneAppClient$ClientEndpoint$$anonfun$receive$1.applyOrElse(StandaloneAppClient.scala:169)
	at org.apache.spark.rpc.netty.Inbox.$anonfun$process$1(Inbox.scala:115)
	at org.apache.spark.rpc.netty.Inbox.safelyCall(Inbox.scala:213)
	at org.apache.spark.rpc.netty.Inbox.process(Inbox.scala:100)
	at org.apache.spark.rpc.netty.MessageLoop.org$apache$spark$rpc$netty$Mess