# Royal Cybers: End-to-End Machine Learning Pipeline for Personalized Recommendations in Databricks

## THIS IS THE MAIN NOTEBOOK (not including EDA)

In [0]:

import importlib
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
from scripts.data_cleaning import clean_cosmetic_df, clean_mapping_df, clean_reviews_df
!pip install spacy
!python -m spacy download en_core_web_sm
from scripts.feature_engineering import process_reviews_df, add_customer_engagement, add_predictor_features
from scripts.data_transformation import transform_cosmetic_data, transform_reviews_data, transform_mapping_data
from pyspark.sql.functions import col
from scripts.EDA import perform_eda
!pip install mlxtend
from scripts.baseline_model import generate_cosine_sim_recs, run_fp_growth, run_als_recommender
from delta.tables import DeltaTable
from pyspark.sql.functions import lit


## Load Dataset

In [0]:
# Start (or attach to) the Spark session used throughout this notebook.
spark = (
    SparkSession.builder
    .appName("E-Commerce Pipeline")
    .getOrCreate()
)

# S3 locations of the three raw CSV inputs.
cosmetic_store_data_path = "s3://e-commerce-pipeline-dataset/Cosmetic Store Website Data.csv"
reviews_data_path = "s3://e-commerce-pipeline-dataset/nyka_top_brands_cosmetics_product_reviews.csv"
product_mapping_path = "s3://e-commerce-pipeline-dataset/unique_product_id_pairings.csv"

In [0]:
# Read each CSV with its first line as the header and let Spark infer the
# column types from the data.
cosmetic_df = spark.read.options(header=True, inferSchema=True).csv(cosmetic_store_data_path)
reviews_df = spark.read.options(header=True, inferSchema=True).csv(reviews_data_path)
mapping_df = spark.read.options(header=True, inferSchema=True).csv(product_mapping_path)

## Clean Data

In [0]:
# The notebook session can keep a stale copy of scripts.data_cleaning after the
# file is edited; reloading the module makes the latest version visible.
import importlib
import scripts.data_cleaning as data_cleaning

importlib.reload(data_cleaning)

# Re-bind the helper names so they point at the reloaded definitions.
from scripts.data_cleaning import (
    clean_cosmetic_df,
    clean_mapping_df,
    clean_reviews_df,
)

In [0]:
# Pass each raw DataFrame through its cleaning helper from
# scripts.data_cleaning and rebind the variable to the returned result.
cosmetic_df = clean_cosmetic_df(cosmetic_df)
mapping_df = clean_mapping_df(mapping_df)
reviews_df = clean_reviews_df(reviews_df)

In [0]:
# Preview the first five rows of each cleaned DataFrame.
for _frame in (cosmetic_df, mapping_df, reviews_df):
    _frame.show(5)

## Unity Catalog Paths

In [0]:
%sql
-- Setup statements for the catalog and schema that the table names in this
-- notebook refer to. Both are commented out, so running this cell does nothing;
-- presumably they were run once already. Uncomment if they do not exist yet.
-- CREATE CATALOG ecommerces_catalog;
-- CREATE SCHEMA ecommerces_catalog.recommendation_schema;

In [0]:
# Unity Catalog table names, written as three-part identifiers:
# <catalog>.<schema>.<table>. These are used for every write and read of the
# three datasets in the cells that follow.
cosmetic_store_table = "ecommerces_catalog.recommendation_schema.cosmetic_store_data"
reviews_table = "ecommerces_catalog.recommendation_schema.product_reviews"
mapping_table = "ecommerces_catalog.recommendation_schema.product_mapping"

In [0]:
# Persist each cleaned DataFrame as a Delta table in Unity Catalog,
# replacing whatever the table held before.
for _frame, _table_name in (
    (cosmetic_df, cosmetic_store_table),
    (reviews_df, reviews_table),
    (mapping_df, mapping_table),
):
    _frame.write.format("delta").mode("overwrite").saveAsTable(_table_name)

In [0]:
# Re-read the three datasets from their Unity Catalog tables so the rest of the
# notebook works from the stored copies.
cosmetic_df = spark.table(cosmetic_store_table)
reviews_df = spark.table(reviews_table)
mapping_df = spark.table(mapping_table)

## Feature Engineering

In [0]:
# The notebook session can keep a stale copy of scripts.feature_engineering
# after the file is edited; reloading the module makes the latest version visible.
import importlib
import scripts.feature_engineering as feature_engineering

importlib.reload(feature_engineering)

# Re-bind the helper names so they point at the reloaded definitions.
from scripts.feature_engineering import (
    process_reviews_df,
    add_customer_engagement,
    add_predictor_features,
)

### Process unstructured data (reviews)

In [0]:
# Rebind reviews_df to the output of process_reviews_df (scripts.feature_engineering).
reviews_df = process_reviews_df(reviews_df)

### Add outcome variable (Y)

In [0]:
# Rebind cosmetic_df to the output of add_customer_engagement, which this
# notebook uses as the outcome (Y) step.
cosmetic_df = add_customer_engagement(cosmetic_df)

### Add predictor variables (X)

In [0]:
# Rebind cosmetic_df to the output of add_predictor_features, which this
# notebook uses as the predictor (X) step. It runs on the DataFrame already
# returned by add_customer_engagement.
cosmetic_df = add_predictor_features(cosmetic_df)

In [0]:
# Preview the first five rows of both feature-engineered DataFrames.
for _frame in (reviews_df, cosmetic_df):
    _frame.show(5)

## Data Transformation

In [0]:
# The notebook session can keep a stale copy of scripts.data_transformation
# after the file is edited; reloading the module makes the latest version visible.
import importlib
import scripts.data_transformation as data_transformation

importlib.reload(data_transformation)

# Re-bind the helper names so they point at the reloaded definitions.
from scripts.data_transformation import (
    transform_cosmetic_data,
    transform_reviews_data,
    transform_mapping_data,
)

In [0]:
# Apply the per-dataset transformation helpers from scripts.data_transformation.
# The results get new names, so the untransformed DataFrames remain available.
transformed_cosmetic_df = transform_cosmetic_data(cosmetic_df)
transformed_reviews_df = transform_reviews_data(reviews_df)
transformed_mapping_df = transform_mapping_data(mapping_df)

In [0]:
# Overwrite the cosmetic-store table (first written with the cleaned data
# earlier in this notebook) with the transformed DataFrame. overwriteSchema lets
# the write replace the table's existing schema instead of failing on a mismatch.
transformed_cosmetic_df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable(cosmetic_store_table)

In [0]:
# Overwrite the reviews table with the transformed DataFrame, replacing its
# schema as well. This write takes a while because the reviews data is large.
transformed_reviews_df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable(reviews_table)

In [0]:
# Overwrite the product-mapping table with the transformed DataFrame, replacing
# its schema as well.
transformed_mapping_df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable(mapping_table)

In [0]:
# Preview the first five rows of the transformed cosmetic and mapping data.
for _frame in (transformed_cosmetic_df, transformed_mapping_df):
    _frame.show(5)

## Baseline Model for Product Recommendation

In [0]:
# The notebook session can keep a stale copy of scripts.baseline_model after the
# file is edited; reloading the module makes the latest version visible.
import importlib
import scripts.baseline_model as baseline_model

importlib.reload(baseline_model)

# Re-bind the helper names so they point at the reloaded definitions.
from scripts.baseline_model import (
    generate_cosine_sim_recs,
    run_fp_growth,
    run_als_recommender,
)

In [0]:
# Keep only purchase events and give every purchase row a constant quantity of
# 1, which the recommenders below use as the interaction value.
purchase_df = (
    transformed_cosmetic_df
    .where(transformed_cosmetic_df["event_type"] == "purchase")
    .withColumn("product_quantity", lit(1))
)

In [0]:
# Convert PySpark DataFrame to pandas DataFrame.
# NOTE: toPandas() collects every purchase row onto the driver, so this step
# only works while the filtered data fits in driver memory.
purchase_df_pandas = purchase_df.toPandas()

In [0]:
# Generate Cosine Similarity Recommendations from the pandas copy of the
# purchase data.
# NOTE(review): the argument names suggest a session-by-product matrix
# (rows=user_session, cols=cosmetic_product_id) filled from product_quantity,
# with `filename` as an output CSV and top=11 meaning ten recommendations plus
# the product itself — confirm against scripts.baseline_model.
prod_recs = generate_cosine_sim_recs(
    df=purchase_df_pandas,
    filename="product_recs_cosine_similarity.csv",
    rows="user_session",
    cols="cosmetic_product_id",
    quantity="product_quantity",
    top=11
)


In [0]:
# Save Recommendations to Unity Catalog
prod_recs_spark = spark.createDataFrame(prod_recs)

# Replace spaces and dots in the column names with underscores before saving.
# The rename is positional (toDF) rather than via col(name): col("a.b") is
# parsed as "field b of column a", so a column whose name contains a dot would
# fail to resolve — exactly the columns this step is meant to clean up.
sanitized_columns = [
    name.replace(" ", "_").replace(".", "_") for name in prod_recs_spark.columns
]
prod_recs_spark = prod_recs_spark.toDF(*sanitized_columns)

prod_recs_spark.write.format("delta").mode("overwrite").saveAsTable("ecommerces_catalog.recommendation_schema.product_recommendations")

In [0]:
# Generate FP-Growth frequent itemsets and association rules from the Spark
# purchase DataFrame (not the pandas copy).
# NOTE(review): min_support / min_confidence are presumably the usual FP-Growth
# thresholds (0.1% of baskets, 10% rule confidence) — confirm how
# scripts.baseline_model interprets them.
frequent_itemsets, association_rules = run_fp_growth(
    purchase_df,
    min_support=0.001,
    min_confidence=0.1
)

In [0]:
# Convert both FP-Growth outputs to Spark DataFrames and persist each one as a
# Delta table in Unity Catalog, replacing any previous contents.

# Frequent itemsets
frequent_itemsets_spark = spark.createDataFrame(frequent_itemsets)
(
    frequent_itemsets_spark.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("ecommerces_catalog.recommendation_schema.frequent_itemsets")
)

# Association rules
association_rules_spark = spark.createDataFrame(association_rules)
(
    association_rules_spark.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("ecommerces_catalog.recommendation_schema.association_rules")
)

In [0]:
# Generate the ALS recommender from the Spark purchase DataFrame. The call
# returns the fitted model plus per-user and per-item recommendation outputs.
model, user_recs, item_recs = run_als_recommender(purchase_df)

In [0]:
# Save User Recommendations to Unity Catalog
# NOTE(review): assumes user_recs / item_recs are pandas DataFrames (they are
# passed to spark.createDataFrame here) — confirm what run_als_recommender returns.
user_rec_spark = spark.createDataFrame(user_recs)
user_rec_spark.write.format("delta").mode("overwrite").saveAsTable("ecommerces_catalog.recommendation_schema.als_user_recs")

# Save Item Recommendations to Unity Catalog
item_recs_spark = spark.createDataFrame(item_recs)
item_recs_spark.write.format("delta").mode("overwrite").saveAsTable("ecommerces_catalog.recommendation_schema.als_item_recs")

In [0]:
# Display results: print the head of every recommendation output, in the order
# the models were run above.
for _result in (prod_recs, frequent_itemsets, association_rules, user_recs, item_recs):
    print(_result.head())