# Royal Cybers: End-to-End Machine Learning Pipeline for Personalized Recommendations in Databricks

## THIS IS THE MAIN NOTEBOOK (not including EDA)

In [0]:
# Imports and package installs for the whole notebook. The order of the
# install lines relative to the `scripts.*` imports is kept as-is.
import importlib
from pyspark.sql import SparkSession
from scripts.data_cleaning import clean_cosmetic_df, clean_mapping_df, clean_reviews_df
# Install spaCy and its small English model before importing the
# feature-engineering helpers (presumably that module depends on spaCy -- verify).
# NOTE(review): `!pip` is a shell escape; Databricks documents `%pip` for
# notebook-scoped library installs -- confirm which behaviour is wanted here.
!pip install spacy
!python -m spacy download en_core_web_sm
from scripts.feature_engineering import process_reviews_df, add_customer_engagement, add_predictor_features
from scripts.data_transformation import transform_cosmetic_data, transform_reviews_data, transform_mapping_data
# NOTE(review): `col` and `perform_eda` are not used by any cell visible in this notebook.
from pyspark.sql.functions import col
from scripts.EDA import perform_eda
# Install mlxtend before importing the baseline model helpers
# (presumably used by scripts.baseline_model -- verify).
!pip install mlxtend
from scripts.baseline_model import recommendations, run_fp_growth
from pyspark.sql.functions import lit


## Load Dataset

In [0]:
# Obtain the Spark session shared by every cell below (reuses an existing
# session if one is already running).
spark = (
    SparkSession.builder
    .appName("E-Commerce Pipeline")
    .getOrCreate()
)

# S3 locations of the three raw CSV inputs; the files themselves are read in
# the next cell.
cosmetic_store_data_path = "s3://e-commerce-pipeline-dataset/Cosmetic Store Website Data.csv"
reviews_data_path = "s3://e-commerce-pipeline-dataset/nyka_top_brands_cosmetics_product_reviews.csv"
product_mapping_path = "s3://e-commerce-pipeline-dataset/unique_product_id_pairings.csv"

In [0]:
# Read the three raw CSVs with a header row and inferred column types.
# The generator yields the reads in the order cosmetic -> reviews -> mapping,
# and the tuple unpacking binds them to the matching names.
cosmetic_df, reviews_df, mapping_df = (
    spark.read.csv(csv_path, header=True, inferSchema=True)
    for csv_path in (
        cosmetic_store_data_path,
        reviews_data_path,
        product_mapping_path,
    )
)

## Clean Data

In [0]:
# Run each raw DataFrame through its dedicated cleaning helper from
# scripts.data_cleaning; every name is rebound to the cleaned result, so the
# raw versions are no longer reachable after this cell.
cosmetic_df = clean_cosmetic_df(cosmetic_df)
mapping_df = clean_mapping_df(mapping_df)
reviews_df = clean_reviews_df(reviews_df)

In [0]:
# Sanity check: print the first 5 rows of each cleaned DataFrame.
cosmetic_df.show(5)
mapping_df.show(5)
reviews_df.show(5)

## Delta Table Paths

In [0]:
# Storage locations for the Delta copies of the cleaned tables; the following
# cells write to and then re-read from these paths.
cosmetic_delta_path = "/mnt/delta/cosmetic_store_data"
reviews_delta_path = "/mnt/delta/product_reviews"
mapping_delta_path = "/mnt/delta/product_mapping"

In [0]:
# Persist the cleaned DataFrames in Delta format, replacing the output of any
# earlier run.
#
# `overwriteSchema` is enabled on all three writes (it was previously set only
# for the reviews table) so that a change in the columns produced by the
# cleaning step replaces the stored schema instead of failing the overwrite
# with a schema-mismatch error. When the schema is unchanged the option has
# no effect.
(
    cosmetic_df.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(cosmetic_delta_path)
)
(
    reviews_df.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(reviews_delta_path)
)
(
    mapping_df.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(mapping_delta_path)
)

In [0]:
# Re-read the three tables from their Delta locations so the rest of the
# notebook works from the persisted copies. Reads happen in the order
# cosmetic -> reviews -> mapping and are unpacked onto the matching names.
cosmetic_df, reviews_df, mapping_df = (
    spark.read.format("delta").load(delta_path)
    for delta_path in (
        cosmetic_delta_path,
        reviews_delta_path,
        mapping_delta_path,
    )
)

## Feature Engineering

### Process unstructured data (reviews)

In [0]:
# Derive features from the reviews table via the project helper in
# scripts.feature_engineering; reviews_df is rebound to the helper's result.
reviews_df = process_reviews_df(reviews_df)

### Add outcome variable (Y)

In [0]:
# Edits to the files under scripts/ are not always picked up by an
# already-running notebook session, so force a fresh load of the module here.
import importlib
from scripts import feature_engineering

# Re-execute the module, then re-import the helper names so the notebook's
# bindings point at the reloaded definitions rather than the stale ones.
importlib.reload(feature_engineering)
from scripts.feature_engineering import process_reviews_df, add_customer_engagement, add_predictor_features

In [0]:
# Add the outcome variable (Y) to the cosmetic-store data via the project
# helper; cosmetic_df is rebound to the helper's result.
cosmetic_df = add_customer_engagement(cosmetic_df)

### Add predictor variables (X)

In [0]:
# Add the predictor variables (X) to the cosmetic-store data via the project
# helper; cosmetic_df is rebound to the helper's result.
cosmetic_df = add_predictor_features(cosmetic_df)

In [0]:
# Sanity check: print the first 5 rows of each DataFrame after feature engineering.
reviews_df.show(5)
cosmetic_df.show(5)

## Data Transformation

In [0]:
# Edits to the files under scripts/ are not always picked up by an
# already-running notebook session, so force a fresh load of the module here.
import importlib
from scripts import data_transformation

# Re-execute the module, then re-import the helper names so the notebook's
# bindings point at the reloaded definitions rather than the stale ones.
importlib.reload(data_transformation)
from scripts.data_transformation import transform_cosmetic_data, transform_reviews_data, transform_mapping_data

In [0]:
# Apply the per-table transformation helpers from scripts.data_transformation;
# each DataFrame name is rebound to the value its helper returns.
cosmetic_df = transform_cosmetic_data(cosmetic_df)
reviews_df = transform_reviews_data(reviews_df)
mapping_df = transform_mapping_data(mapping_df)

In [0]:
# Sanity check: print the first 5 rows of each transformed DataFrame.
cosmetic_df.show(5)
reviews_df.show(5)
mapping_df.show(5)

## Baseline Model for Product Recommendation

In [0]:
# Edits to the files under scripts/ are not always picked up by an
# already-running notebook session, so force a fresh load of the module here.
import importlib
from scripts import baseline_model

# Re-execute the module, then re-import the helper names so the notebook's
# bindings point at the reloaded definitions rather than the stale ones.
importlib.reload(baseline_model)
from scripts.baseline_model import recommendations, run_fp_growth

In [0]:
# Restrict the event log to purchase events and attach a constant quantity
# of 1 to every row, giving the baseline model a quantity column to read.
purchase_df = (
    cosmetic_df
    .filter(cosmetic_df["event_type"] == "purchase")
    .withColumn("product_quantity", lit(1))
)

In [0]:
# Convert the PySpark purchase DataFrame to a pandas DataFrame for the
# recommendations helper in the next cell.
# NOTE(review): toPandas() loads every row into the driver's memory; confirm
# the purchase subset is small enough for that.
purchase_df_pandas = purchase_df.toPandas()

In [0]:
# Generate product recommendations from the pandas purchase data using the
# project helper in scripts.baseline_model.
#   rows / cols / quantity name the columns of purchase_df_pandas to use:
#   sessions, product ids and the constant quantity added earlier.
#   filename is presumably where the helper writes its output CSV -- verify.
#   NOTE(review): the exact meaning of top=11 is defined by the helper;
#   confirm there before changing it.
prod_recs = recommendations(
    df=purchase_df_pandas,
    filename="product_recs_cosine_similarity.csv",
    rows="user_session",
    cols="cosmeticProductId",
    quantity="product_quantity",
    top=11
)


In [0]:
# Run the project's FP-Growth helper on the purchase events. Note that it is
# given the Spark DataFrame (purchase_df), not the pandas copy. The helper
# returns two objects, unpacked here as the frequent itemsets and the
# association rules; the support / confidence thresholds are passed through
# as-is.
frequent_itemsets, association_rules = run_fp_growth(
    purchase_df,
    min_support=0.001,
    min_confidence=0.1
)

In [0]:
# Display results: print the head of each output from the two baseline steps.
# NOTE(review): the return types of the helpers are not visible here, so what
# .head() yields (e.g. the first rows of a pandas DataFrame) depends on
# them -- confirm against scripts.baseline_model.
print(prod_recs.head())
print(frequent_itemsets.head())
print(association_rules.head())
