In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, year, month, dayofweek, lit
from pyspark.sql.types import IntegerType, DoubleType
import pandas as pd

# Initialize Spark Session
spark = SparkSession.builder \
    .appName("PySpark Example") \
    .getOrCreate()

# Load the dataset
df = spark.read.csv("ecommerce_customer_data_large.csv", header=True, inferSchema=True)

# Fill null values with 0
df = df.fillna(0)

# Convert 'Purchase Date' to timestamp
df = df.withColumn("Purchase Date", col("Purchase Date").cast("timestamp"))

# Extract date components
df = df.withColumn("year", year(col("Purchase Date"))) \
       .withColumn("month", month(col("Purchase Date"))) \
       .withColumn("dayOfweek", dayofweek(col("Purchase Date")))\
       .withColumn("quarter", (month(col("Purchase Date")) - 1) // 3 + 1)

# Create age ranges
df = df.withColumn("Age Range", when(col("Age") <= 18, "0-18")
                   .when((col("Age") > 18) & (col("Age") <= 30), "19-30")
                   .when((col("Age") > 30) & (col("Age") <= 40), "31-40")
                   .when((col("Age") > 40) & (col("Age") <= 50), "41-50")
                   .when((col("Age") > 50) & (col("Age") <= 60), "51-60")
                   .otherwise("60+"))

# Calculate revenue
df = df.withColumn("revenue", col("Product Price") * col("Quantity"))


In [2]:
df.show()

+-----------+-------------------+----------------+-------------+--------+---------------------+--------------+------------+-------+--------------+---+------+-----+----+-----+---------+---------+-------+
|Customer ID|      Purchase Date|Product Category|Product Price|Quantity|Total Purchase Amount|Payment Method|Customer Age|Returns| Customer Name|Age|Gender|Churn|year|month|dayOfweek|Age Range|revenue|
+-----------+-------------------+----------------+-------------+--------+---------------------+--------------+------------+-------+--------------+---+------+-----+----+-----+---------+---------+-------+
|      44605|2023-05-03 21:30:02|            Home|          177|       1|                 2427|        PayPal|          31|    1.0|   John Rivera| 31|Female|    0|2023|    5|        4|    31-40|    177|
|      44605|2021-05-16 13:57:44|     Electronics|          174|       3|                 2448|        PayPal|          31|    1.0|   John Rivera| 31|Female|    0|2021|    5|        1|    