In [14]:
import pyspark
from pyspark.sql import SparkSession
import seaborn as sns
import pandas as pd


def count_nulls(frame):
    """Return a one-row DataFrame with the number of null values per column.

    Args:
        frame: Spark DataFrame to inspect.

    Returns:
        A Spark DataFrame with a single row; each column has the same name as
        the source column it summarizes and holds that column's null count.
    """
    F = pyspark.sql.functions
    # when() without otherwise() is non-null only where the cell is null,
    # and count() skips nulls, so the result is the number of null cells.
    return frame.select(
        [F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in frame.columns]
    )


# Initialize Spark session
spark = SparkSession.builder \
    .appName("MyApp") \
    .getOrCreate()

# Read CSV file into DataFrame. The untouched frame keeps its own name so the
# cleaning step below can be re-run without re-reading the file.
raw_df = spark.read.csv("./ecommerce_customer_data_custom_ratios/ecommerce_customer_data_custom_ratios.csv", header=True, inferSchema=True)

# Show the raw DataFrame
raw_df.show()

# Check for null values before cleaning
null_counts = count_nulls(raw_df)
null_counts.show()

# Replace null values in the 'Returns' column with 0
df = raw_df.fillna({'Returns': 0})

# Show the updated DataFrame
df.show()

# Check for null values again after the fill
null_counts = count_nulls(df)
null_counts.show()

+-----------+-------------------+----------------+-------------+--------+---------------------+--------------+------------+-------+-------------------+---+------+-----+
|Customer ID|      Purchase Date|Product Category|Product Price|Quantity|Total Purchase Amount|Payment Method|Customer Age|Returns|      Customer Name|Age|Gender|Churn|
+-----------+-------------------+----------------+-------------+--------+---------------------+--------------+------------+-------+-------------------+---+------+-----+
|      46251|2020-09-08 09:38:32|     Electronics|           12|       3|                  740|   Credit Card|          37|    0.0|Christine Hernandez| 37|  Male|    0|
|      46251|2022-03-05 12:56:35|            Home|          468|       4|                 2739|        PayPal|          37|    0.0|Christine Hernandez| 37|  Male|    0|
|      46251|2022-05-23 18:18:01|            Home|          288|       2|                 3196|        PayPal|          37|    0.0|Christine Hernandez| 37|

In [15]:
from pyspark.sql.functions import year, month, dayofmonth, dayofweek

# Map each new column name to the function that derives it from 'Purchase Date'
date_parts = {
    'Year': year,
    'Month': month,
    'Day': dayofmonth,
    'DayOfWeek': dayofweek,
}

# Add the date-part columns one at a time, in the order listed above
for part_name, extract_part in date_parts.items():
    df = df.withColumn(part_name, extract_part(df['Purchase Date']))

# Correlation of 'Churn' with every derived date part, keyed by column name
correlations = {
    part_name: df.stat.corr('Churn', part_name)
    for part_name in date_parts
}

# Report one line per date part
for part_name in correlations:
    print(f"Correlation between Churn and {part_name}: {correlations[part_name]}")

Correlation between Churn and Year: 0.003465300430301534
Correlation between Churn and Month: -0.0005727856019225997
Correlation between Churn and Day: 0.003399072189347331
Correlation between Churn and DayOfWeek: 0.0017228742396081667


In [None]:
from pyspark.sql.functions import corr

# Column names grouped by how they are to be treated in the analysis.

# Columns treated as categorical
# NOTE(review): 'Returns' is shown as 0.0 in the loaded data; presumably a
# binary flag, confirm before encoding it as a category.
categorical_columns = [
    "Product Category",
    "Payment Method",
    "Gender",
    "Returns",
]

# Columns treated as continuous numeric values
continuous_columns = [
    "Product Price",
    "Quantity",
    "Total Purchase Amount",
    "Customer Age",
    "Age",
]

# Date-part columns derived from 'Purchase Date'
date_columns = [
    "Year",
    "Month",
    "Day",
    "DayOfWeek",
]

