In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) a Spark session named "MySparkApp" with the local[*] master.
spark = (
    SparkSession.builder
    .appName("MySparkApp")
    .master("local[*]")
    .getOrCreate()
)

# Keep a handle on the SparkContext that backs the session.
sc = spark.sparkContext

In [2]:
# Load the BigMart sales CSV: the first row supplies the column names and
# the column types are inferred from the data.
df = (
    spark.read.format('csv')
    .option('inferSchema', True)
    .option('header', True)
    .load('/users/shubh/Data Engineering/BigMart Sales.csv')
)

In [3]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [None]:
# Load the drivers JSON file with multiLine disabled (one JSON record per line).
# The original cell also passed 'inferedSchema' (a misspelling) and 'header';
# neither is an option of the JSON reader, so Spark silently ignored both.
# They are removed here: the JSON source infers its schema without being asked
# and has no header row.
df_json = (
    spark.read.format('json')
    .option('multiLine', False)
    .load('/users/shubh/Data Engineering/drivers.json')
)

In [None]:
# First 20 rows located in Tier 1 or Tier 2 whose Outlet_Size is null.
(
    df
    .filter(
        col('Outlet_Location_Type').isin('Tier 1', 'Tier 2')
        & col('Outlet_Size').isNull()
    )
    .limit(20)
    .toPandas()
)

In [None]:
# Preview the first 20 rows as a pandas DataFrame.
(
    df
    .limit(20)
    .toPandas()
)

In [19]:
# Print the leading rows of df as a text table.
df.show()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         3735.138|
|          DRC01|       5.92|         Regular|    0.019278216|         Soft Drinks| 48.2692|           OUT018|                     2009|     Medium|              Tier 3|Superma

### Column Rename: withColumnRenamed

In [None]:
# Rename Item_Weight to Item_Wt and preview 10 rows; df itself is not rebound.
(
    df
    .withColumnRenamed('Item_Weight', 'Item_Wt')
    .limit(10)
    .toPandas()
)

### withColumn

### Scenario 1

In [None]:
# Add a constant column 'Flag' holding the literal 'New' on every row.
df_new = df.withColumn(
    'Flag',
    lit('New'),
)

In [None]:
# Preview the first 10 rows of df_new as a pandas DataFrame.
(
    df_new
    .limit(10)
    .toPandas()
)

In [None]:
# Derive 'Multiple' as Item_Weight * Item_MRP and collect the result to pandas.
(
    df_new
    .withColumn(
        'Multiple',
        col('Item_Weight') * col('Item_MRP'),
    )
    .toPandas()
)

### Scenario 2

In [None]:
# Abbreviate Item_Fat_Content in two passes: 'Regular' -> 'Reg', then
# 'Low Fat' -> 'LF'.
(
    df_new
    .withColumn(
        'Item_Fat_Content',
        regexp_replace('Item_Fat_Content', 'Regular', 'Reg'),
    )
    .withColumn(
        'Item_Fat_Content',
        regexp_replace('Item_Fat_Content', 'Low Fat', 'LF'),
    )
    .toPandas()
)

### Type Casting

In [None]:
# Replace Item_Weight with the same values cast to a string column.
df_new1 = df.withColumn(
    'Item_Weight',
    col('Item_Weight').cast(StringType()),
)

In [None]:
# Print df_new1's schema to check the type of Item_Weight after the cast.
df_new1.printSchema()

# Sort

## Scenario 1

In [None]:
# Top 50 rows ordered by Item_Weight, largest first.
(
    df
    .orderBy(col('Item_Weight').desc())
    .limit(50)
    .toPandas()
)

In [None]:
# First 10 rows ordered by Item_Visibility, smallest first.
(
    df
    .orderBy(col('Item_Visibility').asc())
    .limit(10)
    .toPandas()
)

## Scenario 2

In [None]:
# Order by Item_Weight descending, then Item_MRP ascending; keep 100 rows.
(
    df
    .orderBy(
        ['Item_Weight', 'Item_MRP'],
        ascending=[False, True],
    )
    .limit(100)
    .toPandas()
)

# limit

In [None]:
# Print only the first 10 rows.
(
    df
    .limit(10)
    .show()
)

# DROP

In [None]:
# Print 10 rows without the Item_Visibility column.
(
    df
    .drop('Item_Visibility')
    .limit(10)
    .show()
)

In [None]:
# Preview the first 10 rows as a pandas DataFrame.
(
    df
    .limit(10)
    .toPandas()
)

In [None]:
# Preview 10 rows with both Item_Visibility and Outlet_Type removed.
(
    df
    .drop('Item_Visibility', 'Outlet_Type')
    .limit(10)
    .toPandas()
)

# Drop Duplicates

### Scenario 1 (table level)

In [None]:
# Collect the whole frame to pandas as a baseline before de-duplicating.
df.toPandas()

In [None]:
# Remove rows that are exact duplicates across all columns.
(
    df
    .dropDuplicates()
    .toPandas()
)

In [None]:
# Keep only the distinct rows of df.
(
    df
    .distinct()
    .toPandas()
)

### Scenario 2 (column level)

In [None]:
# De-duplicate on Item_Type only, leaving one row per distinct Item_Type.
(
    df
    .dropDuplicates(['Item_Type'])
    .toPandas()
)

# Union

### creating Dataframes

In [2]:
from pyspark.sql import SparkSession

# Reuse the active Spark session, or create one named "example".
spark = (
    SparkSession.builder
    .appName("example")
    .getOrCreate()
)

# First frame: two (id, name) rows, both columns typed as strings.
schema1 = 'id STRING, name STRING'
data1 = [('1', 'kad'), ('2', 'sid')]
df1 = spark.createDataFrame(data1, schema1)

# Second frame: same column layout, different rows.
schema2 = 'id STRING, name STRING'
data2 = [('3', 'rahul'), ('4', 'jas')]
df2 = spark.createDataFrame(data2, schema2)

# Print both frames.
df1.show()
df2.show()

+---+----+
| id|name|
+---+----+
|  1| kad|
|  2| sid|
+---+----+

+---+-----+
| id| name|
+---+-----+
|  3|rahul|
|  4|  jas|
+---+-----+



In [6]:
# Append df2's rows to df1's rows and print the combined frame.
df1.union(df2).show()

+---+-----+
| id| name|
+---+-----+
|  1|  kad|
|  2|  sid|
|  3|rahul|
|  4|  jas|
+---+-----+



In [12]:
# df1 now lists its columns as (name, id), the reverse of df2's (id, name).
schema1 = 'name STRING,id STRING'
data1 = [('kad', '1'), ('sid', '2')]
df1 = spark.createDataFrame(data1, schema1)

schema2 = 'id STRING, name STRING'
data2 = [('3', 'rahul'), ('4', 'jas')]
df2 = spark.createDataFrame(data2, schema2)

# Print each frame on its own.
df1.show()
df2.show()

# union() lines columns up by position, so df2's ids land under "name" and
# its names under "id" in the combined output.
df1.union(df2).show()

+----+---+
|name| id|
+----+---+
| kad|  1|
| sid|  2|
+----+---+

+---+-----+
| id| name|
+---+-----+
|  3|rahul|
|  4|  jas|
+---+-----+

+----+-----+
|name|   id|
+----+-----+
| kad|    1|
| sid|    2|
|   3|rahul|
|   4|  jas|
+----+-----+



### unionByName

In [13]:
# Combine the frames by matching column names, so differing column order
# between df1 and df2 no longer misplaces values.
df1.unionByName(df2).show()

+-----+---+
| name| id|
+-----+---+
|  kad|  1|
|  sid|  2|
|rahul|  3|
|  jas|  4|
+-----+---+



In [26]:
import pyspark
from pyspark.sql import SparkSession
# Fetch the active Spark session, or create one with default settings.
spark = SparkSession.builder.getOrCreate()
# Print a single-column frame whose id runs from 0 through 4.
spark.range(5).show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



# String Functions

In [19]:
# Collect the full frame to pandas for reference before the string functions.
df.toPandas()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700
3,FDX07,19.200,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,,Tier 3,Grocery Store,732.3800
4,NCD19,8.930,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052
...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834
8519,FDS36,8.380,Regular,0.046982,Baking Goods,108.1570,OUT045,2002,,Tier 2,Supermarket Type1,549.2850
8520,NCJ29,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136
8521,FDN46,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976


In [21]:
# Item_Type with the first letter of every word capitalised.
(
    df
    .select(
        initcap('Item_Type').alias('Item_Type')
    )
    .toPandas()
)

Unnamed: 0,Item_Type
0,Dairy
1,Soft Drinks
2,Meat
3,Fruits And Vegetables
4,Household
...,...
8518,Snack Foods
8519,Baking Goods
8520,Health And Hygiene
8521,Snack Foods


In [22]:
# Item_Type converted to upper case.
(
    df
    .select(
        upper('Item_Type').alias('Item_Type')
    )
    .toPandas()
)

Unnamed: 0,Item_Type
0,DAIRY
1,SOFT DRINKS
2,MEAT
3,FRUITS AND VEGETABLES
4,HOUSEHOLD
...,...
8518,SNACK FOODS
8519,BAKING GOODS
8520,HEALTH AND HYGIENE
8521,SNACK FOODS


In [23]:
# Item_Type converted to lower case.
(
    df
    .select(
        lower('Item_Type').alias('Item_Type')
    )
    .toPandas()
)

Unnamed: 0,Item_Type
0,dairy
1,soft drinks
2,meat
3,fruits and vegetables
4,household
...,...
8518,snack foods
8519,baking goods
8520,health and hygiene
8521,snack foods


# Date functions

In [25]:
# Rebind df with an extra 'curr_date' column holding the current date.
df = df.withColumn(
    'curr_date',
    current_date(),
)
df.toPandas()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,curr_date
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380,2025-06-08
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,2025-06-08
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700,2025-06-08
3,FDX07,19.200,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,,Tier 3,Grocery Store,732.3800,2025-06-08
4,NCD19,8.930,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,2025-06-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834,2025-06-08
8519,FDS36,8.380,Regular,0.046982,Baking Goods,108.1570,OUT045,2002,,Tier 2,Supermarket Type1,549.2850,2025-06-08
8520,NCJ29,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136,2025-06-08
8521,FDN46,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976,2025-06-08


In [30]:
# Rebind df with 'week_after' = curr_date plus 7 days.
df = df.withColumn(
    'week_after',
    date_add('curr_date', 7),
)
df.toPandas()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,curr_date,week_after,week_before
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380,2025-06-15,2025-06-22,2025-06-08
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,2025-06-15,2025-06-22,2025-06-08
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700,2025-06-15,2025-06-22,2025-06-08
3,FDX07,19.200,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,,Tier 3,Grocery Store,732.3800,2025-06-15,2025-06-22,2025-06-08
4,NCD19,8.930,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,2025-06-15,2025-06-22,2025-06-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834,2025-06-15,2025-06-22,2025-06-08
8519,FDS36,8.380,Regular,0.046982,Baking Goods,108.1570,OUT045,2002,,Tier 2,Supermarket Type1,549.2850,2025-06-15,2025-06-22,2025-06-08
8520,NCJ29,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136,2025-06-15,2025-06-22,2025-06-08
8521,FDN46,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976,2025-06-15,2025-06-22,2025-06-08


### Date subtract

In [31]:
# Rebind df with 'week_before' = curr_date minus 7 days, using date_sub.
df = df.withColumn(
    'week_before',
    date_sub('curr_date', 7),
)
df.toPandas()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,curr_date,week_after,week_before
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380,2025-06-15,2025-06-22,2025-06-08
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,2025-06-15,2025-06-22,2025-06-08
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700,2025-06-15,2025-06-22,2025-06-08
3,FDX07,19.200,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,,Tier 3,Grocery Store,732.3800,2025-06-15,2025-06-22,2025-06-08
4,NCD19,8.930,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,2025-06-15,2025-06-22,2025-06-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834,2025-06-15,2025-06-22,2025-06-08
8519,FDS36,8.380,Regular,0.046982,Baking Goods,108.1570,OUT045,2002,,Tier 2,Supermarket Type1,549.2850,2025-06-15,2025-06-22,2025-06-08
8520,NCJ29,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136,2025-06-15,2025-06-22,2025-06-08
8521,FDN46,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976,2025-06-15,2025-06-22,2025-06-08


In [43]:
# Alternative to date_sub: adding -7 days with date_add yields the same
# 'week_before' values.
df = df.withColumn(
    'week_before',
    date_add('curr_date', -7),
)
df.toPandas()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,curr_date,week_after,week_before,date_diff
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380,2025-06-15,2025-06-22,2025-06-08,14
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,2025-06-15,2025-06-22,2025-06-08,14
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700,2025-06-15,2025-06-22,2025-06-08,14
3,FDX07,19.200,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,,Tier 3,Grocery Store,732.3800,2025-06-15,2025-06-22,2025-06-08,14
4,NCD19,8.930,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,2025-06-15,2025-06-22,2025-06-08,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834,2025-06-15,2025-06-22,2025-06-08,14
8519,FDS36,8.380,Regular,0.046982,Baking Goods,108.1570,OUT045,2002,,Tier 2,Supermarket Type1,549.2850,2025-06-15,2025-06-22,2025-06-08,14
8520,NCJ29,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136,2025-06-15,2025-06-22,2025-06-08,14
8521,FDN46,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976,2025-06-15,2025-06-22,2025-06-08,14


In [34]:
# Rebind df with 'date_diff' = number of days from week_before to week_after.
df = df.withColumn(
    'date_diff',
    date_diff('week_after', 'week_before'),
)
df.toPandas()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,curr_date,week_after,week_before,date_diff
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380,2025-06-15,2025-06-22,2025-06-08,14
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,2025-06-15,2025-06-22,2025-06-08,14
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700,2025-06-15,2025-06-22,2025-06-08,14
3,FDX07,19.200,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,,Tier 3,Grocery Store,732.3800,2025-06-15,2025-06-22,2025-06-08,14
4,NCD19,8.930,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,2025-06-15,2025-06-22,2025-06-08,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834,2025-06-15,2025-06-22,2025-06-08,14
8519,FDS36,8.380,Regular,0.046982,Baking Goods,108.1570,OUT045,2002,,Tier 2,Supermarket Type1,549.2850,2025-06-15,2025-06-22,2025-06-08,14
8520,NCJ29,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136,2025-06-15,2025-06-22,2025-06-08,14
8521,FDN46,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976,2025-06-15,2025-06-22,2025-06-08,14


## Date format

In [45]:
# Render week_before in 'dd-MM-yyyy' form (e.g. 08-06-2025).
# NOTE(review): this overwrites the date column with its formatted text, so
# re-running the cell would format an already formatted value -- confirm that
# is acceptable, or write to a separate column.
df = df.withColumn(
    'week_before',
    date_format(col('week_before'), 'dd-MM-yyyy'),
)
df.toPandas()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,curr_date,week_after,week_before,date_diff
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380,2025-06-15,2025-06-22,08-06-2025,14
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,2025-06-15,2025-06-22,08-06-2025,14
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700,2025-06-15,2025-06-22,08-06-2025,14
3,FDX07,19.200,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,,Tier 3,Grocery Store,732.3800,2025-06-15,2025-06-22,08-06-2025,14
4,NCD19,8.930,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,2025-06-15,2025-06-22,08-06-2025,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834,2025-06-15,2025-06-22,08-06-2025,14
8519,FDS36,8.380,Regular,0.046982,Baking Goods,108.1570,OUT045,2002,,Tier 2,Supermarket Type1,549.2850,2025-06-15,2025-06-22,08-06-2025,14
8520,NCJ29,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136,2025-06-15,2025-06-22,08-06-2025,14
8521,FDN46,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976,2025-06-15,2025-06-22,08-06-2025,14


In [44]:
# Print df's schema, including the types of the derived date columns.
df.printSchema()

root
 |-- Item_Identifier: string (nullable = true)
 |-- Item_Weight: double (nullable = true)
 |-- Item_Fat_Content: string (nullable = true)
 |-- Item_Visibility: double (nullable = true)
 |-- Item_Type: string (nullable = true)
 |-- Item_MRP: double (nullable = true)
 |-- Outlet_Identifier: string (nullable = true)
 |-- Outlet_Establishment_Year: integer (nullable = true)
 |-- Outlet_Size: string (nullable = true)
 |-- Outlet_Location_Type: string (nullable = true)
 |-- Outlet_Type: string (nullable = true)
 |-- Item_Outlet_Sales: double (nullable = true)
 |-- curr_date: date (nullable = false)
 |-- week_after: date (nullable = false)
 |-- week_before: date (nullable = false)
 |-- date_diff: integer (nullable = false)



In [4]:
# Collect the full frame to pandas for reference before the null-handling steps.
df.toPandas()


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700
3,FDX07,19.200,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,,Tier 3,Grocery Store,732.3800
4,NCD19,8.930,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052
...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834
8519,FDS36,8.380,Regular,0.046982,Baking Goods,108.1570,OUT045,2002,,Tier 2,Supermarket Type1,549.2850
8520,NCJ29,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136
8521,FDN46,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976


# Handling nulls
### Dropping nulls

In [6]:
# Drop a row only when all of its columns are null.
(
    df
    .dropna(how='all')
    .toPandas()
)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700
3,FDX07,19.200,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,,Tier 3,Grocery Store,732.3800
4,NCD19,8.930,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052
...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834
8519,FDS36,8.380,Regular,0.046982,Baking Goods,108.1570,OUT045,2002,,Tier 2,Supermarket Type1,549.2850
8520,NCJ29,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136
8521,FDN46,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976


In [8]:
# Drop a row when any of its columns is null.
(
    df
    .dropna(how='any')
    .toPandas()
)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700
3,NCD19,8.930,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052
4,FDP36,10.395,Regular,0.000000,Baking Goods,51.4008,OUT018,2009,Medium,Tier 3,Supermarket Type2,556.6088
...,...,...,...,...,...,...,...,...,...,...,...,...
4645,FDF53,20.750,reg,0.083607,Frozen Foods,178.8318,OUT046,1997,Small,Tier 1,Supermarket Type1,3608.6360
4646,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834
4647,NCJ29,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136
4648,FDN46,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976


In [11]:
# Drop rows whose Outlet_Size is null; other columns are not checked.
(
    df
    .na.drop(subset=['Outlet_Size'])
    .toPandas()
)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700
3,NCD19,8.930,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052
4,FDP36,10.395,Regular,0.000000,Baking Goods,51.4008,OUT018,2009,Medium,Tier 3,Supermarket Type2,556.6088
...,...,...,...,...,...,...,...,...,...,...,...,...
6108,FDF53,20.750,reg,0.083607,Frozen Foods,178.8318,OUT046,1997,Small,Tier 1,Supermarket Type1,3608.6360
6109,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834
6110,NCJ29,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136
6111,FDN46,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976


### Filling Nulls

In [12]:
# Replace nulls with the text 'Not available'; in the output this shows up
# in the Outlet_Size column.
(
    df
    .na.fill('Not available')
    .toPandas()
)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700
3,FDX07,19.200,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,Not available,Tier 3,Grocery Store,732.3800
4,NCD19,8.930,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052
...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834
8519,FDS36,8.380,Regular,0.046982,Baking Goods,108.1570,OUT045,2002,Not available,Tier 2,Supermarket Type1,549.2850
8520,NCJ29,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136
8521,FDN46,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976


In [13]:
# Replace nulls with 'Not Available' in the Outlet_Size column only.
(
    df
    .na.fill('Not Available', subset=['Outlet_Size'])
    .show()
)

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-------------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|  Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-------------+--------------------+-----------------+-----------------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|       Medium|              Tier 1|Supermarket Type1|         3735.138|
|          DRC01|       5.92|         Regular|    0.019278216|         Soft Drinks| 48.2692|           OUT018|                     2009|       Medium|              Tier

# Split and indexing
### Split

In [14]:
# Split Outlet_Type on spaces, turning it into an array such as
# [Supermarket, Type1].
(
    df
    .withColumn(
        'Outlet_Type',
        split('Outlet_Type', ' '),
    )
    .show()
)

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+--------------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|         Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+--------------------+-----------------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|[Supermarket, Type1]|         3735.138|
|          DRC01|       5.92|         Regular|    0.019278216|         Soft Drinks| 48.2692|           OUT018|                     2009|     Medium|              Ti

### indexing

In [18]:
# Split Outlet_Type on spaces and keep only the element at index 1
# (e.g. 'Type1' from 'Supermarket Type1').
(
    df
    .withColumn(
        'Outlet_Type',
        split('Outlet_Type', ' ').getItem(1),
    )
    .show()
)

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------+-----------------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|      Type1|         3735.138|
|          DRC01|       5.92|         Regular|    0.019278216|         Soft Drinks| 48.2692|           OUT018|                     2009|     Medium|              Tier 3|      Type2|         443.4228|


# Explode

In [19]:
# df_exp: same as df, but Outlet_Type holds the array of its space-separated words.
df_exp = df.withColumn(
    'Outlet_Type',
    split('Outlet_Type', ' '),
)

df_exp.toPandas()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,"[Supermarket, Type1]",3735.1380
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,"[Supermarket, Type2]",443.4228
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,"[Supermarket, Type1]",2097.2700
3,FDX07,19.200,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,,Tier 3,"[Grocery, Store]",732.3800
4,NCD19,8.930,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,"[Supermarket, Type1]",994.7052
...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,"[Supermarket, Type1]",2778.3834
8519,FDS36,8.380,Regular,0.046982,Baking Goods,108.1570,OUT045,2002,,Tier 2,"[Supermarket, Type1]",549.2850
8520,NCJ29,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,"[Supermarket, Type1]",1193.1136
8521,FDN46,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,"[Supermarket, Type2]",1845.5976


In [21]:
# Emit one output row per element of the Outlet_Type array; the other columns
# are repeated on each of those rows.
(
    df_exp
    .withColumn(
        'Outlet_Type',
        explode('Outlet_Type'),
    )
    .toPandas()
)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.30,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket,3735.1380
1,FDA15,9.30,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Type1,3735.1380
2,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket,443.4228
3,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Type2,443.4228
4,FDN15,17.50,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket,2097.2700
...,...,...,...,...,...,...,...,...,...,...,...,...
17041,NCJ29,10.60,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Type1,1193.1136
17042,FDN46,7.21,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket,1845.5976
17043,FDN46,7.21,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Type2,1845.5976
17044,DRG01,14.80,Low Fat,0.044878,Soft Drinks,75.4670,OUT046,1997,Small,Tier 1,Supermarket,765.6700


### array contains

In [22]:
# Boolean flag: does the Outlet_Type array contain the element 'Type1'?
(
    df_exp
    .withColumn(
        'Outlet_Type_flag',
        array_contains('Outlet_Type', 'Type1'),
    )
    .toPandas()
)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Outlet_Type_flag
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,"[Supermarket, Type1]",3735.1380,True
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,"[Supermarket, Type2]",443.4228,False
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,"[Supermarket, Type1]",2097.2700,True
3,FDX07,19.200,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,,Tier 3,"[Grocery, Store]",732.3800,False
4,NCD19,8.930,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,"[Supermarket, Type1]",994.7052,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,"[Supermarket, Type1]",2778.3834,True
8519,FDS36,8.380,Regular,0.046982,Baking Goods,108.1570,OUT045,2002,,Tier 2,"[Supermarket, Type1]",549.2850,True
8520,NCJ29,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,"[Supermarket, Type1]",1193.1136,True
8521,FDN46,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,"[Supermarket, Type2]",1845.5976,False


# Group by

### Scenario 1

In [24]:
# Total Item_MRP per Item_Type.
(
    df
    .groupBy('Item_Type')
    .agg(
        sum('Item_MRP')
    )
    .toPandas()
)

Unnamed: 0,Item_Type,sum(Item_MRP)
0,Starchy Foods,21880.0274
1,Baking Goods,81894.7364
2,Breads,35379.1198
3,Fruits and Vegetables,178124.081
4,Meat,59449.8638
5,Hard Drinks,29334.6766
6,Soft Drinks,58514.165
7,Household,135976.5254
8,Breakfast,15596.6966
9,Dairy,101276.4596


In [25]:
# Average Item_MRP per Item_Type.
(
    df
    .groupBy('Item_Type')
    .agg(
        avg('Item_MRP')
    )
    .toPandas()
)

Unnamed: 0,Item_Type,avg(Item_MRP)
0,Starchy Foods,147.838023
1,Baking Goods,126.380766
2,Breads,140.952669
3,Fruits and Vegetables,144.581235
4,Meat,139.882032
5,Hard Drinks,137.077928
6,Soft Drinks,131.492506
7,Household,149.424753
8,Breakfast,141.788151
9,Dairy,148.499208


### Scenario 2

In [26]:
# Total Item_MRP for every (Item_Type, Outlet_Size) pair, named Item_MRP_sum.
(
    df
    .groupBy('Item_Type', 'Outlet_Size')
    .agg(
        sum('Item_MRP').alias('Item_MRP_sum')
    )
    .toPandas()
)

Unnamed: 0,Item_Type,Outlet_Size,Item_MRP_sum
0,Starchy Foods,Medium,7124.1362
1,Fruits and Vegetables,Medium,59047.2172
2,Starchy Foods,,6040.6402
3,Breads,,10011.5004
4,Baking Goods,,23433.8388
...,...,...,...
59,Dairy,Small,28896.5360
60,Health and Hygiene,,19975.6632
61,Health and Hygiene,Small,17929.0888
62,Others,High,2121.2258


### Scenario 3

In [29]:
# Total and average Item_MRP for every (Item_Type, Outlet_Size) pair.
(
    df
    .groupBy('Item_Type', 'Outlet_Size')
    .agg(
        sum('Item_MRP').alias('Item_MRP_sum'),
        avg('Item_MRP').alias('Item_MRP_Average'),
    )
    .toPandas()
)

Unnamed: 0,Item_Type,Outlet_Size,Item_MRP_sum,Item_MRP_Average
0,Starchy Foods,Medium,7124.1362,148.419504
1,Fruits and Vegetables,Medium,59047.2172,142.971470
2,Starchy Foods,,6040.6402,140.480005
3,Breads,,10011.5004,139.048617
4,Baking Goods,,23433.8388,126.669399
...,...,...,...,...
59,Dairy,Small,28896.5360,145.942101
60,Health and Hygiene,,19975.6632,130.559890
61,Health and Hygiene,Small,17929.0888,131.831535
62,Others,High,2121.2258,132.576613


### collect list

In [31]:
# Small (user, book) sample frame; both columns are strings.
schema = 'user string, book string'

data = [
    ('user1', 'book1'),
    ('user1', 'book2'),
    ('user2', 'book2'),
    ('user2', 'book4'),
    ('user3', 'book1'),
]

df_book = spark.createDataFrame(data, schema)

df_book.show()

+-----+-----+
| user| book|
+-----+-----+
|user1|book1|
|user1|book2|
|user2|book2|
|user2|book4|
|user3|book1|
+-----+-----+



In [32]:
# Gather each user's books into a single list per user.
(
    df_book
    .groupBy('user')
    .agg(
        collect_list('book')
    )
    .show()
)

+-----+------------------+
| user|collect_list(book)|
+-----+------------------+
|user1|    [book1, book2]|
|user2|    [book2, book4]|
|user3|           [book1]|
+-----+------------------+



In [33]:
# Print just the Item_Type, Outlet_Size and Item_MRP columns.
(
    df
    .select('Item_Type', 'Outlet_Size', 'Item_MRP')
    .show()
)

+--------------------+-----------+--------+
|           Item_Type|Outlet_Size|Item_MRP|
+--------------------+-----------+--------+
|               Dairy|     Medium|249.8092|
|         Soft Drinks|     Medium| 48.2692|
|                Meat|     Medium| 141.618|
|Fruits and Vegeta...|       NULL| 182.095|
|           Household|       High| 53.8614|
|        Baking Goods|     Medium| 51.4008|
|         Snack Foods|       High| 57.6588|
|         Snack Foods|     Medium|107.7622|
|        Frozen Foods|       NULL| 96.9726|
|        Frozen Foods|       NULL|187.8214|
|Fruits and Vegeta...|     Medium| 45.5402|
|               Dairy|      Small|144.1102|
|Fruits and Vegeta...|     Medium|145.4786|
|         Snack Foods|      Small|119.6782|
|Fruits and Vegeta...|       High|196.4426|
|           Breakfast|      Small| 56.3614|
|  Health and Hygiene|     Medium|115.3492|
|           Breakfast|     Medium| 54.3614|
|         Hard Drinks|     Medium|113.2834|
|               Dairy|      Smal

# Pivot

In [35]:
# Average Item_MRP per Item_Type, with one output column per Outlet_Size value
# (null sizes get their own column).
(
    df
    .groupBy('Item_Type')
    .pivot('Outlet_Size')
    .agg(
        avg('Item_MRP')
    )
    .show()
)

+--------------------+------------------+------------------+------------------+------------------+
|           Item_Type|              null|              High|            Medium|             Small|
+--------------------+------------------+------------------+------------------+------------------+
|       Starchy Foods|140.48000465116277|158.15707368421053| 148.4195041666666| 150.2701736842105|
|              Breads|139.04861666666667|         133.75896| 140.8610385542169| 145.5236507042254|
|        Baking Goods|126.66939891891889|129.20204383561642|126.17856847290639|125.21336363636368|
|Fruits and Vegeta...|142.57516045845267|145.57287042253515| 142.9714702179177|148.31336951219507|
|                Meat|139.29453448275865| 137.2447902439025|136.41913154362408|145.69925042016808|
|         Hard Drinks| 134.3875333333333| 141.9275217391304|142.83769599999994|        129.758784|
|         Soft Drinks|133.42344360902257|131.75847346938772| 128.2696817518248| 132.8550428571429|
|         