In [0]:
from pyspark.sql.functions import *

In [0]:
# DDL-style schema string: one "<column name> <type>" pair per column.
# It is handed to spark.read.schema(...) when the BigMart CSV is loaded.
my_ddl_schema= '''
                    Item_Identifier String,
                    Item_Weight Double,
                    Item_Fat_Content String,
                    Item_Visibility Double,
                    Item_Type String,
                    Item_MRP Double,
                    Outlet_Identifier String,
                    Outlet_Establishment_Year Integer,
                    Outlet_Size String,
                    Outlet_Location_Type String,
                    Outlet_Type String,
                    Item_Outlet_Sales Double
'''

In [0]:
# Load the BigMart sales CSV with the explicit DDL schema; the first line of
# the file is treated as the header row.
df = (
    spark.read
    .schema(my_ddl_schema)
    .csv('/Volumes/spakr_data/default/datastorage/BigMart Sales.csv', header=True)
)
display(df)

## Drop

In [0]:
# Drop one column and preview the first 10 rows of the result.
(
    df
    .drop('Item_visibility')
    .limit(10)
    .display()
)

In [0]:
# Drop several columns in one call and preview the first 10 rows.
(
    df
    .drop('Item_visibility', 'Item_Fat_Content')
    .limit(10)
    .display()
)

## drop duplicates table **level**

In [0]:
# With no arguments, rows are de-duplicated across all columns.
(
    df
    .dropDuplicates()
    .display()
)


In [0]:
# Distinct rows, collected to the driver as a pandas DataFrame
# (the cell's last expression, so the notebook renders it).
(
    df
    .distinct()
    .toPandas()
)

In [0]:
# Column-level de-duplication: only the listed columns are compared when
# deciding whether two rows are duplicates.
(
    df
    .dropDuplicates(['Item_Identifier', 'Outlet_Identifier'])
    .display()
)

In [0]:
# Same column-level de-duplication, passing the column list via the
# `subset` keyword.
(
    df
    .dropDuplicates(subset=['Item_Identifier', 'Outlet_Identifier'])
    .display()
)

In [0]:
# Small in-memory DataFrame of (StudentId, Name) rows.
df_schema = 'StudentId INT,Name STRING'
df1 = spark.createDataFrame(
    [(1, 'Kid'), (2, 'Teen'), (3, 'Adult')],
    schema=df_schema,
)
display(df1)

## Union

In [0]:
# Second (StudentId, Name) DataFrame with the same schema as df1.
df_schema = 'StudentId INT,Name STRING'
df2 = spark.createDataFrame(
    [(11, 'John'), (12, 'Mary'), (13, 'Jane')],
    schema=df_schema,
)
display(df2)

In [0]:
# union() stacks the rows of two DataFrames and matches columns by POSITION,
# not by name: both sides need the same number of columns, and the result
# keeps the column names of the left-hand DataFrame. Columns that line up
# positionally but mean different things are combined silently, so make sure
# the column order agrees (or use unionByName).
(
    df1
    .union(df2)
    .display()
)

In [0]:
# Two DataFrames holding the same columns in a DIFFERENT order, so a
# positional union() would misalign them.
df3 = spark.createDataFrame(
    [(1, 'Kid'), (2, 'Teen'), (3, 'Adult'), (4, 'Senior')],
    schema='StudentId INT,Name STRING',
)
df4 = spark.createDataFrame(
    [('John', 12), ('Mary', 13), ('Jane', 14), ('peter', 15)],
    schema='Name STRING,StudentId INT',
)

# unionByName() matches columns by their names, not positions, so df3 and df4
# combine correctly despite the swapped column order. (The original cell
# unioned df1/df2 here, which never exercised the name matching.)
# Columns missing from one side are only filled with null when
# allowMissingColumns=True is passed; otherwise a missing column is an error.
df3.unionByName(df4).display()

In [0]:
# Same name-based union, printed as plain text with show() rather than
# rendered with display().
(
    df1
    .unionByName(df2)
    .show()
)

## String functions

In [0]:
# Show the full sales DataFrame again as the starting point for the
# string-function examples that follow.
df.display()

In [0]:
# Capitalise the first letter of each word in Item_Type; the alias keeps the
# original column name.
(
    df
    .select(initcap('Item_Type').alias('Item_Type'))
    .display()
)

In [0]:
# Upper-case view of Item_Type.
(
    df
    .select(upper('Item_Type').alias('Item_Type'))
    .display()
)

# Lower-case view of Item_Type.
(
    df
    .select(lower('Item_Type').alias('Item_Type'))
    .display()
)

## Date functions

In [0]:
# Start the date-function working frame: df plus a Current_Date column.
df_currDate = df.withColumn(
    'Current_Date',
    current_date(),
)
df_currDate.display()


In [0]:
# Add a Current_Timestamp column alongside Current_Date.
df_currDate = df_currDate.withColumn(
    'Current_Timestamp',
    current_timestamp(),
)
df_currDate.display()

In [0]:
# Derive two offset dates from today: 7 days ahead (date_add) and
# 7 days back (date_sub).
df_currDate = (
    df_currDate
    .withColumn('week_after', date_add(current_date(), 7))
    .withColumn('week_before', date_sub(current_date(), 7))
)
df_currDate.display()

In [0]:
# date_sub() with a NEGATIVE day count moves forward in time, so this value is
# a week AFTER Current_Date. It is therefore stored in 'week_after' (the
# original wrote it to 'week_before', mislabelling the date and overwriting
# the real week-before value).
df_currDate = df_currDate.withColumn(
    'week_after',
    date_sub(col('Current_Date'), -7),
)
df_currDate.display()

In [0]:
# Recompute week_before as 7 days before Current_Date, then store the number
# of days between week_after and week_before in date_diff.
df_currDate = (
    df_currDate
    .withColumn('week_before', date_sub(col('Current_Date'), 7))
    .withColumn('date_diff', date_diff(col('week_after'), col('week_before')))
)
df_currDate.display()

In [0]:
# Render Current_Date as a 'dd-MM-yyyy' string. The formatted value goes into
# its own column: the original wrote it into 'week_before', which replaced the
# real week-before date with a string of today's date.
df_currDate = df_currDate.withColumn(
    'Current_Date_formatted',
    date_format(col('Current_Date'), 'dd-MM-yyyy'),
)
df_currDate.display()


## Handling Nulls

In [0]:
# Drop a row only when every one of its columns is null.
(
    df
    .dropna(how='all')
    .display()
)

In [0]:
# Same "all columns null" rule, with the mode passed positionally as the
# first argument of dropna().
(
    df
    .dropna('all')
    .display()
)

In [0]:
# Drop a row as soon as any single column in it is null.
(
    df
    .dropna(how='any')
    .display()
)

In [0]:
# Only Item_Weight is inspected: rows with a null weight are removed.
(
    df
    .dropna(subset=['Item_Weight'])
    .display()
)

In [0]:
# Only Outlet_Size is inspected: rows with a null outlet size are removed.
(
    df
    .dropna(subset=['Outlet_Size'])
    .display()
)

In [0]:
# Per-column fill: the dict maps a column name to its replacement value, so
# only null Item_Weight entries become 0.
(
    df
    .fillna({'Item_Weight': 0})
    .display()
)

In [0]:
# Fill nulls with a string placeholder and no column subset.
# NOTE(review): a string fill value is expected to affect string-typed
# columns only -- confirm against the rendered output.
(
    df
    .fillna("not available")
    .display()
)

In [0]:
# Restrict the placeholder fill to the Outlet_Size column.
(
    df
    .fillna("not available", subset=['Outlet_Size'])
    .display()
)


## Split and Indexing

In [0]:
# Split Outlet_Type on single spaces, producing an array of its words.
(
    df
    .withColumn('Outlet_type', split(col('Outlet_Type'), ' '))
    .display()
)

In [0]:
# Split on spaces and keep only the element at index 1 (the second word).
(
    df
    .withColumn('Outlet_type', split(col('Outlet_type'), ' ')[1])
    .display()
)

In [0]:
# df_exp keeps Outlet_type as an array of words (split on spaces).
df_exp = df.withColumn(
    'Outlet_type',
    split(col('Outlet_Type'), ' '),
)

# explode() emits one output row per array element.
(
    df_exp
    .withColumn('Outlet_type', explode(col('Outlet_Type')))
    .display()
)

In [0]:
# Flag whether the split Outlet_type array contains the element 'Type1'.
# The output column is named 'Outlet_type_flag' (the original spelled it
# 'Outlet_type_flad').
(
    df_exp
    .withColumn('Outlet_type_flag', array_contains('Outlet_type', 'Type1'))
    .display()
)

## Group By

In [0]:
# Total Item_MRP per Item_Type. `sum` here is the Spark column function
# brought in by the wildcard pyspark.sql.functions import, not Python's
# built-in sum.
(
    df
    .groupBy('Item_Type')
    .agg(sum('Item_MRP'))
    .display()
)

In [0]:
# Average Item_MRP per Item_Type.
(
    df
    .groupBy('Item_Type')
    .agg(avg('Item_MRP'))
    .display()
)

In [0]:
# Total Item_MRP per (Item_Type, Outlet_Size) pair, collected to the driver
# as a pandas DataFrame and rendered as the cell's output.
(
    df
    .groupBy('Item_Type', 'Outlet_Size')
    .agg(sum('Item_MRP').alias('Item_MRP_sum'))
    .toPandas()
)

In [0]:
# Several aggregates in a single agg() call: sum and average of Item_MRP for
# each (Item_Type, Outlet_Size) pair.
(
    df
    .groupBy('Item_Type', 'Outlet_Size')
    .agg(
        sum('Item_MRP').alias('Item_MRP_sum'),
        avg('Item_MRP').alias('Item_MRP_avg'),
    )
    .display()
)

In [0]:
# Toy (user, book) pairs used for the collect_list example.
schema = 'user string, book string'

data = [
    ('user1', 'book1'),
    ('user1', 'book2'),
    ('user2', 'book2'),
    ('user2', 'book4'),
    ('user3', 'book1'),
]

df_book = spark.createDataFrame(data, schema)

df_book.show()

In [0]:
# Gather each user's books into one array column; rendered first with
# display() ...
(
    df_book
    .groupBy('user')
    .agg(collect_list('book'))
    .display()
)

# ... and then printed as plain text with show().
(
    df_book
    .groupBy('user')
    .agg(collect_list('book'))
    .show()
)

In [0]:
# Look at the three columns the pivot cells below operate on.
(
    df
    .select('Item_Type', 'Outlet_Size', 'Item_MRP')
    .display()
)

In [0]:
# Pivot: one row per Item_Type, one column per Outlet_Size value, each cell
# holding the average Item_MRP.
(
    df
    .groupBy('Item_Type')
    .pivot('Outlet_Size')
    .agg(avg('Item_MRP'))
    .display()
)

In [0]:
# Same pivot, with each average rounded to 2 decimal places. `round` is the
# Spark column function from the wildcard import, not Python's built-in.
(
    df
    .groupBy('Item_Type')
    .pivot('Outlet_Size')
    .agg(round(avg('Item_MRP'), 2))
    .display()
)