# Exploratory Data Analysis (EDA)

## This notebook contains the full pipeline code together with the EDA. Run it only if you need to see the EDA results.

## Clean and Transform Data

In [0]:
import importlib
from pyspark.sql import SparkSession
from scripts.data_cleaning import clean_cosmetic_df, clean_mapping_df, clean_reviews_df
!pip install spacy
!python -m spacy download en_core_web_sm
from scripts.feature_engineering import process_reviews_df, add_customer_engagement, add_predictor_features
from scripts.data_transformation import transform_cosmetic_data, transform_reviews_data, transform_mapping_data

In [0]:
# S3 locations of the three raw CSV inputs used by the pipeline
cosmetic_store_data_path = "s3://e-commerce-pipeline-dataset/Cosmetic Store Website Data.csv"
reviews_data_path = "s3://e-commerce-pipeline-dataset/nyka_top_brands_cosmetics_product_reviews.csv"
product_mapping_path = "s3://e-commerce-pipeline-dataset/unique_product_id_pairings.csv"

# Obtain the Spark session (getOrCreate reuses an active session when one exists)
spark = (
    SparkSession.builder
    .appName("E-Commerce Pipeline")
    .getOrCreate()
)

In [0]:
# All three inputs are read the same way: first row is the header and
# column types are inferred from the data.
csv_read_options = {"header": True, "inferSchema": True}

cosmetic_df = spark.read.csv(cosmetic_store_data_path, **csv_read_options)
reviews_df = spark.read.csv(reviews_data_path, **csv_read_options)
mapping_df = spark.read.csv(product_mapping_path, **csv_read_options)

In [0]:
# Apply the dataset-specific cleaning helpers imported from scripts.data_cleaning.
# Each call rebinds the same variable name, so the raw frames read from S3 are
# no longer reachable afterwards; re-run the load cell to get them back.
cosmetic_df = clean_cosmetic_df(cosmetic_df)
mapping_df = clean_mapping_df(mapping_df)
reviews_df = clean_reviews_df(reviews_df)

In [0]:
# Feature-engineer the reviews frame with process_reviews_df (scripts.feature_engineering).
# NOTE(review): presumably this is where the text-derived columns used later
# (e.g. lemmatized_text) are added — confirm in the helper.
reviews_df = process_reviews_df(reviews_df)

In [0]:
# Run add_customer_engagement (scripts.feature_engineering) on the cosmetic events frame.
# The result rebinds cosmetic_df, so this cell's input is the previous cell's output.
cosmetic_df = add_customer_engagement(cosmetic_df)

In [0]:
# Run add_predictor_features (scripts.feature_engineering) on the cosmetic events frame.
# NOTE(review): whether this must run after add_customer_engagement is not visible here — confirm.
cosmetic_df = add_predictor_features(cosmetic_df)

In [0]:
# Apply the per-dataset transformation helpers from scripts.data_transformation.
# NOTE(review): columns referenced later in this notebook (e.g. event_type_index,
# cosmeticProductId / reviewProductId) are presumably produced by these steps
# or the earlier ones — confirm.
cosmetic_df = transform_cosmetic_data(cosmetic_df)
reviews_df = transform_reviews_data(reviews_df)
mapping_df = transform_mapping_data(mapping_df)

In [0]:
# Sometimes edits to the scripts package are not picked up by the running kernel,
# so force a reload of scripts.EDA before importing from it.
import importlib
from scripts import EDA

# Reload the module so the latest on-disk version is used
importlib.reload(EDA)
# Import after the reload so perform_eda is bound to the reloaded definition
from scripts.EDA import perform_eda

In [0]:
# Combine datasets for EDA.
# Joining on the key *name* (instead of an equality expression) keeps a single
# copy of each key column in the result, so the joined frame never carries
# ambiguous duplicate "cosmeticProductId" / "reviewProductId" columns.
cosmetic_mapped_df = cosmetic_df.join(mapping_df, on="cosmeticProductId", how="inner")
combined_df = cosmetic_mapped_df.join(reviews_df, on="reviewProductId", how="inner")

# Drop the join keys and the intermediate text/feature columns that the
# combined-frame EDA cells below do not read
combined_df = combined_df.drop(
    "cosmeticProductId", "reviewProductId", "event_type", "brand_name", "stemmed_title", "lemmatized_title",
    "stemmed_text", "lemmatized_text", "review_text_clean", "review_title_clean"
)

# Save combined data to Delta table; "overwrite" + overwriteSchema replaces
# both the data and the stored schema on every run
combined_data_path = "/mnt/delta/combined_cleaned_data"
(
    combined_df.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(combined_data_path)
)

# Reload combined Delta table for EDA so later cells read the persisted data
combined_df = spark.read.format("delta").load(combined_data_path)

## EDA

In [0]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from collections import Counter


In [0]:
# Summary statistics (count, mean, stddev, min, max); with no arguments
# describe() covers every numeric and string column of the frame
summary_stats = combined_df.describe()
summary_stats.show()

In [0]:
# Summary statistics restricted to the price and rating columns
key_metric_columns = ["cosmetic_price", "review_price", "product_rating", "product_rating_count"]
combined_df.select(*key_metric_columns).describe().show()


In [0]:
# Mean product rating for each brand, with the aggregate column given a readable name
avg_rating_by_brand = (
    combined_df
    .groupBy("brand")
    .agg({"product_rating": "avg"})
    .withColumnRenamed("avg(product_rating)", "average_rating")
)
avg_rating_by_brand.show()

### Distribution of Price

In [0]:
# Distribution of cosmetic_price and review_price to understand pricing across products


def _collect_non_null(df, column):
    """Return the non-null values of ``column`` from ``df`` as a Python list.

    Nulls are removed before collecting because plt.hist cannot bin None values.
    """
    return (
        df.select(column)
        .rdd.flatMap(lambda row: row)
        .filter(lambda value: value is not None)
        .collect()
    )


# Both price columns get the same null handling (previously only review_price was filtered)
cosmetic_price_data = _collect_non_null(combined_df, "cosmetic_price")
review_price_data = _collect_non_null(combined_df, "review_price")

fig, (ax_cosmetic, ax_review) = plt.subplots(1, 2, figsize=(12, 6))

# Histogram for cosmetic_price
ax_cosmetic.hist(cosmetic_price_data, bins=50, color="blue", alpha=0.7)
ax_cosmetic.set_title("Distribution of Cosmetic Prices")
ax_cosmetic.set_xlabel("Price")
ax_cosmetic.set_ylabel("Frequency")

# Histogram for review_price
ax_review.hist(review_price_data, bins=50, color="purple", alpha=0.7)
ax_review.set_title("Distribution of Review Prices")
ax_review.set_xlabel("Price")
ax_review.set_ylabel("Frequency")

fig.tight_layout()
plt.show()


### Product Rating Distribution

In [0]:
# Bring the non-null product ratings to the driver for plotting
rating_data = [
    rating
    for (rating,) in combined_df.select("product_rating").collect()
    if rating is not None
]

# Mean rating, drawn as a dashed reference line on the histogram
mean_rating = np.mean(rating_data)

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(rating_data, bins=10, kde=True, color="green", alpha=0.6, ax=ax)
ax.axvline(mean_rating, color="red", linestyle="--", label=f"Mean Rating = {mean_rating:.2f}")

ax.set_title("Detailed Distribution of Product Ratings", fontsize=16)
ax.set_xlabel("Product Rating", fontsize=14)
ax.set_ylabel("Frequency", fontsize=14)
ax.legend(fontsize=12)
ax.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

### Customer Engagement Patterns Over Time

In [0]:
# Derive the calendar date of each event from its timestamp
combined_df = combined_df.withColumn("event_date", col("event_time").cast("date"))

# Count engagement events per day
daily_engagement = combined_df.groupBy("event_date").count().orderBy("event_date")
daily_engagement.show()

# Collect the aggregate once: each date stays paired with its own count and
# the groupBy/orderBy job is not re-executed for every column
daily_rows = daily_engagement.collect()
date_data = [row["event_date"] for row in daily_rows]
engagement_data = [row["count"] for row in daily_rows]

plt.figure(figsize=(12, 6))
plt.plot(date_data, engagement_data, color="blue", marker="o")
plt.title("Daily Customer Engagement Events")
plt.xlabel("Date")
plt.ylabel("Event Count")
plt.show()


### Distribution of Customer Engagement Levels (Event Types)

In [0]:
# Tally events per encoded engagement class; the grouped result is small enough for pandas
event_counts = (
    cosmetic_df
    .groupBy("event_type_index")
    .count()
    .toPandas()
)

# Bar chart of class frequencies.
# NOTE(review): the 0-3 legend in the x-label assumes a particular index-to-event mapping — confirm against the encoder.
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(event_counts["event_type_index"], event_counts["count"], color="skyblue")
ax.set_title("Distribution of Event Types (Customer Engagement Levels)", fontsize=14)
ax.set_xlabel("Event Type (0=View, 1=Add-to-Cart, 2=Remove-from-Cart, 3=Purchase)", fontsize=12)
ax.set_ylabel("Count", fontsize=12)
plt.show()

### Sentiment Analysis

In [0]:
# Number of reviews carrying each sentiment label
sentiment_counts = (
    reviews_df
    .groupBy("review_label")
    .count()
    .toPandas()
)

fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(sentiment_counts["review_label"], sentiment_counts["count"], color="skyblue")
ax.set_title("Sentiment Distribution", fontsize=16)
ax.set_xlabel("Sentiment")
ax.set_ylabel("Count")
plt.show()


In [0]:
# Sentiment by Product Rating: count reviews per (label, rating) pair, then
# reshape to a label x rating grid for the heatmap
sentiment_rating = (
    reviews_df
    .groupBy("review_label", "product_rating")
    .count()
    .toPandas()
)
pivot_data = sentiment_rating.pivot(
    index="review_label",
    columns="product_rating",
    values="count",
)

fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(pivot_data, annot=True, fmt=".0f", cmap="Blues", cbar=True, ax=ax)
ax.set_title("Sentiment vs. Product Rating", fontsize=16)
ax.set_xlabel("Product Rating")
ax.set_ylabel("Sentiment")
plt.show()

In [0]:
# Top-20 most frequent tokens in the lemmatized review text.
# Counter, plt and sns are already provided by the imports cell at the start of the EDA section.

# Collect the lemmatized texts to the driver
lemmatized_texts = reviews_df.select("lemmatized_text").rdd.flatMap(lambda x: x).collect()

# Count tokens one review at a time instead of first joining every review into
# a single giant string; null/empty texts are skipped
word_counter = Counter()
for text in lemmatized_texts:
    if text:
        word_counter.update(text.split())

# Calculate word frequencies
word_freq = word_counter.most_common(20)

if not word_freq:
    # zip(*[]) cannot be unpacked into two names, so bail out explicitly when there is no text
    print("No lemmatized review text available; skipping the word-frequency plot.")
else:
    # Separate words and counts for plotting
    words, counts = zip(*word_freq)

    # Plot most common words
    plt.figure(figsize=(12, 6))
    sns.barplot(x=list(counts), y=list(words), palette="magma")
    plt.title("Top 20 Most Frequent Words in Lemmatized Review Text", fontsize=16)
    plt.xlabel("Frequency")
    plt.ylabel("Words")
    plt.show()



In [0]:
# Ten products with the most reviews, brought to pandas for plotting
top_products = (
    reviews_df
    .groupBy("product_title")
    .count()
    .orderBy("count", ascending=False)
    .limit(10)
    .toPandas()
)

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x="count", y="product_title", data=top_products, palette="cool", ax=ax)
ax.set_title("Top 10 Most Reviewed Products", fontsize=16)
ax.set_xlabel("Review Count")
ax.set_ylabel("Product Title")
plt.show()

#### Time Series Analysis of Ratings

In [0]:
from pyspark.sql import functions as F

# Extract date from review_date
combined_df = combined_df.withColumn(
    "review_date_parsed",
    F.to_timestamp("review_date", "yyyy-MM-dd HH:mm:ss").cast("date")
)

# Average product rating per review date, in chronological order
daily_ratings = combined_df.groupBy("review_date_parsed").agg(
    F.avg("product_rating").alias("avg_rating")
).orderBy("review_date_parsed")

daily_ratings.show()

# Collect the aggregate once (instead of once per column) and leave out rows
# that cannot be placed on the chart: a null date group (e.g. review_date
# values that did not parse) or a date with no non-null rating
daily_rating_rows = [
    row
    for row in daily_ratings.collect()
    if row["review_date_parsed"] is not None and row["avg_rating"] is not None
]
date_data = [row["review_date_parsed"] for row in daily_rating_rows]
rating_data = [row["avg_rating"] for row in daily_rating_rows]

# Plot time series
plt.figure(figsize=(14, 7))
plt.plot(date_data, rating_data, marker='o', linestyle='-')
plt.title('Average Product Rating Over Time')
plt.xlabel('Date')
plt.ylabel('Average Rating')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

#### User Behavior Analysis

In [0]:
# Event count for every user, busiest users first
user_events = (
    combined_df
    .groupBy("user_id")
    .count()
    .orderBy(F.desc("count"))
)
user_events.show(10)

# Per-user counts as a plain list for the histogram
event_counts = [n_events for (n_events,) in user_events.select("count").collect()]

fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(event_counts, bins=50, color="coral", alpha=0.7)
ax.set_title('Distribution of Events per User')
ax.set_xlabel('Number of Events')
ax.set_ylabel('Number of Users')
plt.show()

#### Distribution of Product Ratings by Brand

In [0]:
# Per-brand mean rating together with the number of non-null ratings behind it
brand_ratings = combined_df.groupBy("brand").agg(
    F.avg("product_rating").alias("avg_rating"),
    F.count("product_rating").alias("rating_count"),
)

# Keep the ten brands with the most ratings and move them to pandas
top_brands_ratings = brand_ratings.orderBy(F.desc("rating_count")).limit(10)
top_brands_pd = top_brands_ratings.toPandas()

# Bar plot of the average rating for those brands
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x='brand', y='avg_rating', data=top_brands_pd, ax=ax)
plt.xticks(rotation=45)
ax.set_title('Average Product Rating by Top 10 Brands')
ax.set_xlabel('Brand')
ax.set_ylabel('Average Rating')
plt.show()

#### Convert Review Ratings to Numeric and Analyze

In [0]:
# Convert review_rating to numeric
combined_df = combined_df.withColumn("review_rating_num", F.col("review_rating").cast("double"))

# Drop rows where conversion failed — but into a separate frame, so combined_df
# keeps every row and later cells that read combined_df are not silently
# restricted to events that have a numeric review rating
rated_reviews_df = combined_df.na.drop(subset=["review_rating_num"])

# Histogram of review ratings
review_rating_data = rated_reviews_df.select("review_rating_num").rdd.flatMap(lambda x: x).collect()

plt.figure(figsize=(12, 6))
sns.histplot(review_rating_data, bins=5, kde=False, color="skyblue")
plt.title('Distribution of Review Ratings')
plt.xlabel('Review Rating')
plt.ylabel('Frequency')
plt.show()

#### Session-Level Analysis

In [0]:
# Event count for every session, longest sessions first
session_events = (
    combined_df
    .groupBy("user_session")
    .count()
    .orderBy(F.desc("count"))
)
session_events.show(10)

# Per-session counts as a plain list for the histogram
session_counts = [n_events for (n_events,) in session_events.select("count").collect()]

fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(session_counts, bins=50, color="limegreen", alpha=0.7)
ax.set_title('Distribution of Events per Session')
ax.set_xlabel('Number of Events')
ax.set_ylabel('Number of Sessions')
plt.show()