In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

spark = SparkSession.builder\
        .appName("Data Ingestion")\
        .getOrCreate()

schema = StructType([
    StructField("InvoiceNo", IntegerType(), nullable=False),
    StructField("StockCode", IntegerType(), nullable=False),
    StructField("Description", StringType(), nullable=False),
    StructField("Quantity", DoubleType(), nullable=False),
    StructField("InvoiceDate", StringType(), nullable=False),
    StructField("UnitPrice", DoubleType(), nullable=False),
    StructField("CustomerID", IntegerType(), nullable=False),
    StructField("Country", StringType(), nullable=False)
])

df = spark.read.format("csv")\
    .option("header", "true") \
    .option("inferSchema", "true") \
    .schema(schema)\
    .load("/workspaces/DataEngineering/Csvs/online_retail.csv")

df.show(10)

+---------+---------+--------------------+--------+------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity| InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+------------+---------+----------+--------------+
|   536365|     NULL|WHITE HANGING HEA...|     6.0|12/1/10 8:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|     6.0|12/1/10 8:26|     3.39|     17850|United Kingdom|
|   536365|     NULL|CREAM CUPID HEART...|     8.0|12/1/10 8:26|     2.75|     17850|United Kingdom|
|   536365|     NULL|KNITTED UNION FLA...|     6.0|12/1/10 8:26|     3.39|     17850|United Kingdom|
|   536365|     NULL|RED WOOLLY HOTTIE...|     6.0|12/1/10 8:26|     3.39|     17850|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|     2.0|12/1/10 8:26|     7.65|     17850|United Kingdom|
|   536365|    21730|GLASS STAR FROSTE...|     6.0|12/1/10 8:26|     4.25|     17850|United

In [10]:
import shutil
#Delete if OutputCsv found already
file_path = "/workspaces/DataEngineering/OutputCsv/Country.csv"
shutil.rmtree(file_path)
#Create Partition Csv for each country
df.write.partitionBy("Country").csv(f"{file_path}")

                                                                                

In [11]:
#Define Schema
schema1 = StructType([
    StructField("InvoiceNo", IntegerType(), nullable=False),
    StructField("StockCode", IntegerType(), nullable=False),
    StructField("Description", StringType(), nullable=False),
    StructField("Quantity", DoubleType(), nullable=False),
    StructField("InvoiceDate", StringType(), nullable=False),
    StructField("UnitPrice", DoubleType(), nullable=False),
    StructField("CustomerID", IntegerType(), nullable=False),
])

In [12]:
#Get All the data according to country name

import os

country = "United Kingdom"

# Directory containing the files
directory_path = f"{file_path}/Country={country}"

# List all files in the directory that end with .csv
csv_files = [f for f in os.listdir(directory_path) if f.endswith('.csv')]

#Create single dataframe for mentioned country
for file in csv_files:
    UkDf = spark.read.schema(schema1).csv(f"{directory_path}/{file}", inferSchema=True)

UkDf.show()


+---------+---------+--------------------+--------+-------------+---------+----------+
|InvoiceNo|StockCode|         Description|Quantity|  InvoiceDate|UnitPrice|CustomerID|
+---------+---------+--------------------+--------+-------------+---------+----------+
|   563031|    21932|SCANDINAVIAN PAIS...|     2.0|8/11/11 14:38|     1.65|     13263|
|   563031|    23209|LUNCH BAG VINTAGE...|     1.0|8/11/11 14:38|     1.65|     13263|
|   563031|    22384|LUNCH BAG PINK PO...|     2.0|8/11/11 14:38|     1.65|     13263|
|   563031|    20727|LUNCH BAG  BLACK ...|     1.0|8/11/11 14:38|     1.65|     13263|
|   563031|    22383|LUNCH BAG SUKI DE...|     2.0|8/11/11 14:38|     1.65|     13263|
|   563031|    23208|LUNCH BAG VINTAGE...|     1.0|8/11/11 14:38|     1.65|     13263|
|   563031|    20725|LUNCH BAG RED RET...|     2.0|8/11/11 14:38|     1.65|     13263|
|   563031|    20728| LUNCH BAG CARS BLUE|     2.0|8/11/11 14:38|     1.65|     13263|
|   563031|    23207|LUNCH BAG ALPHABE...| 

In [None]:
#Get top 5 buyer in UK
#Most expensive item in UK
#Least expensive item in UK
#Average money spent by each buyer(remove null & negative outliers)
#Date on which customers bought most

Get top 5 buyer in UK

In [23]:
#Drop null values in the dataset
UkDf = UkDf.dropna(subset=["Quantity", "UnitPrice", "CustomerID"])

In [24]:
#Get temp table
tableName = "UnitedKingdom"
UkDf.createOrReplaceTempView(tableName)

In [25]:
UkSql = spark.sql(f"SELECT CustomerID, SUM(Quantity*UnitPrice) AS TotalPrice FROM {tableName} GROUP BY CustomerID ORDER BY TotalPrice DESC LIMIT 5")
UkSql.show()

+----------+------------------+
|CustomerID|        TotalPrice|
+----------+------------------+
|     18102|129120.06999999996|
|     17450|119534.68999999994|
|     14096| 57120.91000000003|
|     17511| 35551.64999999998|
|     16684| 32770.05999999999|
+----------+------------------+



Most expensive item in UK

In [26]:
UkSql = spark.sql(f"SELECT Description, UnitPrice FROM {tableName} ORDER BY UnitPrice DESC LIMIT 1")
UkSql.show()

+-----------+---------+
|Description|UnitPrice|
+-----------+---------+
|     Manual|  3155.95|
+-----------+---------+



                                                                                

Least expensive item in UK

In [27]:
UkSql = spark.sql(f"SELECT Description, UnitPrice FROM {tableName} ORDER BY UnitPrice ASC LIMIT 1")
UkSql.show()

+--------------------+---------+
|         Description|UnitPrice|
+--------------------+---------+
|HANGING METAL HEA...|      0.0|
+--------------------+---------+



Average money spent by each buyer(remove null & negative outliers).