# In [0]:
# Step 1: Read the 2019 orders CSV with the 'header' option switched on
df = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .load('Files/orders/2019.csv')
)
display(df)
# Step 2: Read the same CSV again with the 'header' option switched off
# (this rebinds `df`, replacing the DataFrame from Step 1)
df = (
    spark.read
    .format('csv')
    .option('header', 'false')
    .load('Files/orders/2019.csv')
)
display(df)
# Step 3: Declare an explicit schema for the orders data and read the 2019
# CSV with it instead of relying on a header row
from pyspark.sql.types import *
orderSchema = StructType([
    StructField(column_name, column_type)
    for column_name, column_type in (
        ('SalesOrderNumber', StringType()),
        ('SalesOrderLineNumber', IntegerType()),
        ('OrderDate', DateType()),
        ('CustomerName', StringType()),
        ('Email', StringType()),
        ('Item', StringType()),
        ('Quantity', IntegerType()),
        ('UnitPrice', FloatType()),
        ('Tax', FloatType()),
    )
])
df = (
    spark.read
    .format('csv')
    .schema(orderSchema)
    .load('Files/orders/2019.csv')
)
display(df)
# Step 4: Read every CSV file in the orders folder with the shared schema
df = (
    spark.read
    .format('csv')
    .schema(orderSchema)
    .load('Files/orders/*.csv')
)
display(df)
# Step 5: Project the customer columns, then compare the raw row count with
# the number of distinct (CustomerName, Email) pairs
customers = df.select(
    'CustomerName',
    'Email',
)
print(customers.count())              # every order line
print(customers.distinct().count())   # unique customers only
display(customers.distinct())
# Step 6: Same customer projection, restricted to order lines for one item
customers = (
    df
    .select('CustomerName', 'Email')
    .filter(df['Item'] == 'Road-250 Red, 52')
)
print(customers.count())              # matching order lines
print(customers.distinct().count())   # unique customers among them
display(customers.distinct())
# Step 7: Sum the ordered quantity for each item
productSales = (
    df
    .select('Item', 'Quantity')
    .groupBy('Item')
    .sum()
)
display(productSales)
# Step 8: Count the order lines in each calendar year, oldest year first
from pyspark.sql.functions import *
yearlySales = (
    df
    .select(year(col('OrderDate')).alias('Year'))
    .groupBy('Year')
    .count()
    .orderBy('Year')
)
display(yearlySales)
# Step 9: Transform data - derive Year and Month from OrderDate, split
# CustomerName on spaces into FirstName (token 0) and LastName (token 1),
# then keep the output columns in a fixed order (CustomerName is dropped)
from pyspark.sql.functions import *
transformed_df = (
    df
    .withColumn('Year', year(col('OrderDate')))
    .withColumn('Month', month(col('OrderDate')))
    .withColumn('FirstName', split(col('CustomerName'), ' ').getItem(0))
    .withColumn('LastName', split(col('CustomerName'), ' ').getItem(1))
    .select(
        'SalesOrderNumber',
        'SalesOrderLineNumber',
        'OrderDate',
        'Year',
        'Month',
        'FirstName',
        'LastName',
        'Email',
        'Item',
        'Quantity',
        'UnitPrice',
        'Tax',
    )
)
display(transformed_df.limit(5))
# Step 10: Persist the transformed orders as Parquet, replacing any
# previous output at the same location
(
    transformed_df.write
    .mode('overwrite')
    .parquet('Files/transformed_data/orders')
)
print('Transformed data saved!')
# Step 11: Load the Parquet output written in Step 10 back into a DataFrame
orders_df = (
    spark.read
    .format('parquet')
    .load('Files/transformed_data/orders')
)
display(orders_df)
# Step 12: Rewrite the orders as Parquet partitioned by Year and Month,
# replacing any previous output at the same location
(
    orders_df.write
    .partitionBy('Year', 'Month')
    .mode('overwrite')
    .parquet('Files/partitioned_data')
)
print('Transformed data saved!')
# Step 13: Read only the Year=2021 partition directories (all months)
# NOTE(review): because the load path points inside the partition folders,
# the Year and Month columns are presumably absent from the result unless a
# 'basePath' option is supplied - confirm against the displayed schema.
orders_2021_df = (
    spark.read
    .format('parquet')
    .load('Files/partitioned_data/Year=2021/Month=*')
)
display(orders_2021_df)
# Step 14: Save the orders DataFrame as a managed Delta table named
# 'salesorders' and print its catalog metadata.
# mode('overwrite') matches the Parquet writes in Steps 10 and 12 and keeps
# this step re-runnable: without an explicit mode, saveAsTable raises an
# error when the table already exists.
(
    df.write
    .format('delta')
    .mode('overwrite')
    .saveAsTable('salesorders')
)
spark.sql('DESCRIBE EXTENDED salesorders').show(truncate=False)
# Step 15: Query the first 1000 rows of the salesorders table from the catalog.
# The table name is left unqualified, exactly as it is written in Step 14 and
# queried in Steps 16 and 17, so the step does not depend on a lakehouse
# being named 'practicels'.
# Note: this rebinds `df` to the (at most) 1000-row query result.
df = spark.sql('SELECT * FROM salesorders LIMIT 1000')
display(df)
# Step 16: SQL query for yearly gross revenue
# Run through spark.sql() instead of a %%sql cell magic: a cell magic is only
# recognised on the first line of a notebook cell and is not valid Python, so
# it breaks when this code runs as a script or after a leading comment line.
display(spark.sql('''
SELECT YEAR(OrderDate) AS OrderYear, SUM((UnitPrice * Quantity) + Tax) AS GrossRevenue
FROM salesorders
GROUP BY YEAR(OrderDate)
ORDER BY OrderYear
'''))
# Step 17: Per-year gross revenue and distinct order count via Spark SQL.
# OrderYear is cast to a 4-character string in the query itself.
sqlQuery = '''
SELECT CAST(YEAR(OrderDate) AS CHAR(4)) AS OrderYear, SUM((UnitPrice * Quantity) + Tax) AS GrossRevenue, COUNT(DISTINCT SalesOrderNumber) AS YearlyCounts FROM salesorders GROUP BY CAST(YEAR(OrderDate) AS CHAR(4)) ORDER BY OrderYear'''
df_spark = spark.sql(
    sqlQuery,
)
df_spark.show()
# Step 18: Pull the yearly metrics into pandas and draw a bar chart of
# gross revenue per order year with matplotlib
from matplotlib import pyplot as plt
df_sales = df_spark.toPandas()
plt.bar(
    df_sales['OrderYear'],      # bar positions
    df_sales['GrossRevenue'],   # bar heights
)
plt.show()
# Step 19: Redraw the yearly revenue chart with seaborn
import seaborn as sns
plt.clf()  # clear the current figure before drawing the new chart
sns.set_theme(style='whitegrid')
sns.barplot(
    data=df_sales,
    x='OrderYear',
    y='GrossRevenue',
)
plt.show()