# Data Cleaning

#### Removing the null values

In [31]:
import pyspark
from pyspark.sql import SparkSession
import seaborn as sns
import pandas as pd


# Initialize Spark session
spark = SparkSession.builder \
    .appName("MyApp") \
    .getOrCreate()

# Read CSV file into DataFrame
df = spark.read.csv("./originalDataSets/ecommerce_customer_data_large.csv", header=True, inferSchema=True)

# Show the DataFrame
df.show()

# Check for null values in the DataFrame
null_counts = df.select([pyspark.sql.functions.count(pyspark.sql.functions.when(pyspark.sql.functions.col(c).isNull(), c)).alias(c) for c in df.columns])
null_counts.show()

# Replace null values in the 'Returns' column with 0
df = df.fillna({'Returns': 0})

# Show the updated DataFrame
df.show()

# Check for null values in the DataFrame
null_counts = df.select([pyspark.sql.functions.count(pyspark.sql.functions.when(pyspark.sql.functions.col(c).isNull(), c)).alias(c) for c in df.columns])
null_counts.show()

+-----------+-------------------+----------------+-------------+--------+---------------------+--------------+------------+-------+--------------+---+------+-----+
|Customer ID|      Purchase Date|Product Category|Product Price|Quantity|Total Purchase Amount|Payment Method|Customer Age|Returns| Customer Name|Age|Gender|Churn|
+-----------+-------------------+----------------+-------------+--------+---------------------+--------------+------------+-------+--------------+---+------+-----+
|      44605|2023-05-03 21:30:02|            Home|          177|       1|                 2427|        PayPal|          31|    1.0|   John Rivera| 31|Female|    0|
|      44605|2021-05-16 13:57:44|     Electronics|          174|       3|                 2448|        PayPal|          31|    1.0|   John Rivera| 31|Female|    0|
|      44605|2020-07-13 06:16:57|           Books|          413|       1|                 2345|   Credit Card|          31|    1.0|   John Rivera| 31|Female|    0|
|      44605|202

In [32]:
from pyspark.sql.functions import row_number
from pyspark.sql.window import Window

# Define window specification
window_spec = Window.orderBy("Purchase Date")

df_with_order_id = df.select("*").withColumn("order_id", row_number().over(window_spec))

# Show the updated DataFrame
df_with_order_id.show()

+-----------+-------------------+----------------+-------------+--------+---------------------+--------------+------------+-------+--------------------+---+------+-----+--------+
|Customer ID|      Purchase Date|Product Category|Product Price|Quantity|Total Purchase Amount|Payment Method|Customer Age|Returns|       Customer Name|Age|Gender|Churn|order_id|
+-----------+-------------------+----------------+-------------+--------+---------------------+--------------+------------+-------+--------------------+---+------+-----+--------+
|      11789|2020-01-01 00:07:26|        Clothing|          426|       4|                 2046|          Cash|          45|    1.0|       Matthew Davis| 45|  Male|    0|       1|
|      48592|2020-01-01 00:11:40|        Clothing|          160|       4|                 2514|          Cash|          49|    0.0|       Tina Phillips| 49|  Male|    0|       2|
|      30486|2020-01-01 00:15:47|        Clothing|          230|       4|                  713|   Credit 

In [33]:
from pyspark.ml.fpm import FPGrowth
from pyspark.sql.functions import collect_list, array_distinct

# Assuming 'CustomerID' is the column to group by
df_grouped = df_with_order_id.groupBy("Customer ID").agg(array_distinct(collect_list("Product Category")).alias("Product Category List"))
df_grouped.show()
fp_growth = FPGrowth(itemsCol="Product Category List", minSupport=0.2, minConfidence=0.5)
model = fp_growth.fit(df_grouped)
model.setPredictionCol("newPrediction")

+-----------+---------------------+
|Customer ID|Product Category List|
+-----------+---------------------+
|          1| [Books, Electroni...|
|          3| [Electronics, Hom...|
|          6| [Clothing, Books,...|
|         12| [Home, Clothing, ...|
|         13| [Home, Books, Clo...|
|         16|        [Books, Home]|
|         20| [Books, Clothing,...|
|         22| [Electronics, Clo...|
|         26| [Clothing, Home, ...|
|         27| [Electronics, Boo...|
|         28|    [Clothing, Books]|
|         31| [Clothing, Books,...|
|         34|        [Electronics]|
|         40| [Electronics, Clo...|
|         44| [Home, Electronic...|
|         47| [Home, Clothing, ...|
|         48|     [Home, Clothing]|
|         52| [Electronics, Boo...|
|         53| [Electronics, Clo...|
|         54| [Home, Clothing, ...|
+-----------+---------------------+
only showing top 20 rows



FPGrowthModel: uid=FPGrowth_0904e5a1e041, numTrainingRecords=49661

In [34]:
# Frequent Itemsets
frequent_itemsets = model.freqItemsets
frequent_itemsets.show()

# Association Rules
association_rules = model.associationRules
association_rules.show()


+--------------------+-----+
|               items| freq|
+--------------------+-----+
|             [Books]|35504|
|   [Books, Clothing]|25338|
|[Books, Clothing,...|18008|
|[Books, Clothing,...|12816|
|[Books, Clothing,...|18081|
|[Books, Electronics]|25275|
|[Books, Electroni...|18025|
|       [Books, Home]|25389|
|              [Home]|35752|
|       [Electronics]|35713|
| [Electronics, Home]|25508|
|          [Clothing]|35654|
|[Clothing, Electr...|25436|
|[Clothing, Electr...|18142|
|    [Clothing, Home]|25456|
+--------------------+-----+

+--------------------+-------------+------------------+------------------+-------------------+
|          antecedent|   consequent|        confidence|              lift|            support|
+--------------------+-------------+------------------+------------------+-------------------+
|[Books, Clothing,...|       [Home]|0.7116836961350511|0.9885579557440919| 0.2580697126517791|
|[Books, Clothing,...|[Electronics]|0.7088103534096566| 0.9856419500

In [35]:
predictions = model.transform(df_grouped)
predictions.show()

+-----------+---------------------+--------------------+
|Customer ID|Product Category List|       newPrediction|
+-----------+---------------------+--------------------+
|          1| [Books, Electroni...|              [Home]|
|          3| [Electronics, Hom...|             [Books]|
|          6| [Clothing, Books,...|                  []|
|         12| [Home, Clothing, ...|                  []|
|         13| [Home, Books, Clo...|       [Electronics]|
|         16|        [Books, Home]|[Electronics, Clo...|
|         20| [Books, Clothing,...|       [Electronics]|
|         22| [Electronics, Clo...|              [Home]|
|         26| [Clothing, Home, ...|       [Electronics]|
|         27| [Electronics, Boo...|          [Clothing]|
|         28|    [Clothing, Books]| [Electronics, Home]|
|         31| [Clothing, Books,...|              [Home]|
|         34|        [Electronics]|[Books, Home, Clo...|
|         40| [Electronics, Clo...|             [Books]|
|         44| [Home, Electronic