# DO NOT RUN THIS NOTEBOOK; IT CONTAINS THE FULL CODE FOR REFERENCE ONLY

## Load Dataset

In [0]:
# Locations of the three raw CSV files in the S3 bucket
cosmetic_store_data_path = "s3://e-commerce-pipeline-dataset/Cosmetic Store Website Data.csv"
reviews_data_path = "s3://e-commerce-pipeline-dataset/nyka_top_brands_cosmetics_product_reviews.csv"
product_mapping_path = "s3://e-commerce-pipeline-dataset/unique_product_id_pairings.csv"

# Every file is read with its first row as the header and with Spark-inferred column types
csv_options = dict(header=True, inferSchema=True)
cosmetic_df, reviews_df, mapping_df = (
    spark.read.csv(csv_path, **csv_options)
    for csv_path in (cosmetic_store_data_path, reviews_data_path, product_mapping_path)
)

In [0]:
# Print the first five rows of cosmetic_df as loaded from CSV
cosmetic_df.show(5)

In [0]:
# Print the first five rows of reviews_df as loaded from CSV
reviews_df.show(5)

In [0]:
# Print the first five rows of mapping_df as loaded from CSV
mapping_df.show(5)

%md
## Initial Data Cleaning

In [0]:
# Remove fully identical rows from each of the three DataFrames
cosmetic_df, reviews_df, mapping_df = (
    frame.dropDuplicates() for frame in (cosmetic_df, reviews_df, mapping_df)
)

In [0]:
# Standardise column names: give the product-id columns of the three tables
# matching names and keep the two price columns distinguishable.
def _rename_columns(df, renames):
    """Apply each old -> new rename in `renames` to `df` and return the result."""
    for old_name, new_name in renames.items():
        df = df.withColumnRenamed(old_name, new_name)
    return df


cosmetic_df = _rename_columns(
    cosmetic_df, {"product_id": "cosmeticProductId", "price": "cosmetic_price"}
)
reviews_df = _rename_columns(
    reviews_df, {"product_id": "reviewProductId", "price": "review_price"}
)
mapping_df = _rename_columns(
    mapping_df,
    {"product_id_events": "cosmeticProductId", "product_id_reviews": "reviewProductId"},
)

In [0]:
# Keep only rows with a strictly positive cosmetic_price
cosmetic_df = cosmetic_df.where("cosmetic_price > 0")

# Discard mapping rows that contain a null in any column
mapping_df = mapping_df.dropna()

In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

In [0]:
# Download the NLTK stopword corpus and build the English stopword set
# that clean_text() checks tokens against.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalise a piece of review text.

    The text is lower-cased; newlines, tabs and carriage returns become spaces;
    every character that is not a lowercase ASCII letter or whitespace is
    deleted (digits and punctuation are removed as well); tokens present in the
    module-level ``stop_words`` set are dropped; the remaining tokens are
    joined with single spaces.

    Args:
        text: Value to clean. Anything that is not a ``str`` (e.g. ``None``)
            is handed back unchanged.

    Returns:
        The cleaned string, or the original value when it is not a string.
    """
    if not isinstance(text, str):
        return text

    normalised = text.lower()
    for control_char in ('\n', '\t', '\r'):
        normalised = normalised.replace(control_char, ' ')

    letters_only = re.sub(r'[^a-z\s]', '', normalised)
    kept_tokens = (token for token in letters_only.split() if token not in stop_words)
    return ' '.join(kept_tokens).strip()

# Expose clean_text to Spark as a UDF that returns a string column
clean_text_udf = udf(clean_text, StringType())

# Remove the product_tags column, then replace nulls in the listed columns
# with placeholder values
placeholder_values = {
    'review_text': 'No review',
    'brand_name': 'Unknown',
    'review_label': 'No Label',
    'product_title': 'Unknown Title'
}
reviews_df = reviews_df.drop("product_tags").fillna(placeholder_values)

# Add a cleaned companion column for the review title and the review text
for raw_column, clean_column in (
    ("review_title", "review_title_clean"),
    ("review_text", "review_text_clean"),
):
    reviews_df = reviews_df.withColumn(clean_column, clean_text_udf(reviews_df[raw_column]))

# Show raw and cleaned values side by side
reviews_df.select("review_title", "review_title_clean", "review_text", "review_text_clean").show(5)

In [0]:
# Keep only the four event types used for the multi-class classification
valid_events = ["view", "cart", "remove_from_cart", "purchase"]
is_valid_event = cosmetic_df["event_type"].isin(valid_events)
cosmetic_df = cosmetic_df.filter(is_valid_event)

# Row count per remaining event type
cosmetic_df.groupBy("event_type").count().show()

In [0]:
# Print the first five rows of cosmetic_df after the cleaning steps above
cosmetic_df.show(5)

In [0]:
# Print the first five rows of reviews_df after the cleaning steps above
reviews_df.show(5)

In [0]:
# Print the first five rows of mapping_df after the cleaning steps above
mapping_df.show(5)

## Unity Catalog Paths

In [0]:
# # Example: Deleting the directory in DBFS
# dbutils.fs.rm("dbfs:/mnt/delta/combined_cleaned_data", True)  # True for recursive delete

# dbutils.fs.rm("dbfs:/mnt/delta/cosmetic_store_data", True)  # True for recursive delete
# dbutils.fs.rm("dbfs:/mnt/delta/product_reviews", True)  # True for recursive delete
# dbutils.fs.rm("dbfs:/mnt/delta/product_mapping", True)  # True for recursive delete

In [0]:
%sql
-- CREATE CATALOG ecommerces_catalog;
-- CREATE SCHEMA ecommerces_catalog.recommendation_schema;

In [0]:
# Unity Catalog table names, all under the same <catalog>.<schema> prefix
uc_schema = "ecommerces_catalog.recommendation_schema"
cosmetic_store_table = f"{uc_schema}.cosmetic_store_data"
reviews_table = f"{uc_schema}.product_reviews"
mapping_table = f"{uc_schema}.product_mapping"


In [0]:
# Write each DataFrame to its Unity Catalog table in Delta format,
# overwriting whatever the table currently holds
for frame, table_name in (
    (cosmetic_df, cosmetic_store_table),
    (reviews_df, reviews_table),
    (mapping_df, mapping_table),
):
    frame.write.format("delta").mode("overwrite").saveAsTable(table_name)


In [0]:
# Re-load the three DataFrames from the Unity Catalog tables written above
cosmetic_df, reviews_df, mapping_df = (
    spark.read.table(table_name)
    for table_name in (cosmetic_store_table, reviews_table, mapping_table)
)


## Feature Engineering

### Unstructured Data

In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag, word_tokenize
import nltk
from pyspark.sql.functions import col

In [0]:

# Cast both cleaned text columns to string, then replace nulls with placeholders
for text_column in ("review_title_clean", "review_text_clean"):
    reviews_df = reviews_df.withColumn(text_column, col(text_column).cast(StringType()))

reviews_df = reviews_df.na.fill({'review_title_clean': 'No title', 'review_text_clean': 'No review'})

In [0]:
!pip install spacy
!python -m spacy download en_core_web_sm
import spacy
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

In [0]:
# Load the spaCy "en_core_web_sm" pipeline; process_text_spacy() calls it as nlp(text)
nlp = spacy.load("en_core_web_sm")

# Define the stemming and lemmatization functions
def process_text_spacy(text):
    """Return a ``(stemmed, lemmatized)`` pair of strings for ``text``.

    The text is run through the module-level ``nlp`` pipeline. The lemmatized
    string joins each token's ``lemma_`` with spaces. The "stemmed" string is
    only a stand-in for real stemming: it joins the first three characters of
    each token's text.

    Args:
        text: Text to process.

    Returns:
        ``(stemmed, lemmatized)``. Both are empty strings when ``text`` is not
        a non-empty string, or when processing raises (the error is printed).
    """
    if not (isinstance(text, str) and text):
        return "", ""  # Nothing usable to process

    try:
        stems, lemmas = [], []
        for token in nlp(text):
            lemmas.append(token.lemma_)
            stems.append(token.text[:3])  # crude truncation in place of a stemmer
        return " ".join(stems), " ".join(lemmas)
    except Exception as e:
        print(f"Error processing text: {text} | Error: {e}")
        return "", ""

# Two UDFs over the same helper: element 0 of its result is the truncated
# ("stemmed") text, element 1 is the lemmatized text
stem_udf = udf(lambda text: process_text_spacy(text)[0], StringType())
lemmatize_udf = udf(lambda text: process_text_spacy(text)[1], StringType())

# Replace nulls in the cleaned columns before they are passed to the UDFs
reviews_df = reviews_df.na.fill({
    "review_title_clean": "No title",
    "review_text_clean": "No review"
})

# (new column, UDF, source column) — applied in this order
derived_columns = (
    ("stemmed_title", stem_udf, "review_title_clean"),
    ("lemmatized_title", lemmatize_udf, "review_title_clean"),
    ("stemmed_text", stem_udf, "review_text_clean"),
    ("lemmatized_text", lemmatize_udf, "review_text_clean"),
)
for new_column, text_udf, source_column in derived_columns:
    reviews_df = reviews_df.withColumn(new_column, text_udf(reviews_df[source_column]))

# Show results for titles
reviews_df.select("review_title_clean", "stemmed_title", "lemmatized_title").show(5)

# Show results for text
reviews_df.select("review_text_clean", "stemmed_text", "lemmatized_text").show(5)

### Structured Data

#### Outcome Variable (Y): Customer Engagement Level

In [0]:
from pyspark.sql.functions import datediff, current_date
from pyspark.sql.functions import min, max
from pyspark.sql.functions import countDistinct
from pyspark.ml.feature import StringIndexer

In [0]:
# Encode event_type as a numeric label column named event_type_index.
# Any existing event_type_index column is dropped first (presumably so the
# cell can be re-run without a column-name clash).
cosmetic_df = cosmetic_df.drop("event_type_index")

indexer = StringIndexer(inputCol="event_type", outputCol="event_type_index")
indexer_model = indexer.fit(cosmetic_df)
cosmetic_df = indexer_model.transform(cosmetic_df)

# Show which index was assigned to which event type
cosmetic_df.select("event_type", "event_type_index").distinct().show()

In [0]:
# Distribution of classes: row count for each event_type_index value
cosmetic_df.groupBy("event_type_index").count().show()

#### Predictor Variables (X): Structured Features

In [0]:
from pyspark.sql.functions import datediff, current_date
from pyspark.sql.functions import min, max
from pyspark.sql.functions import countDistinct

In [0]:
# Recency: whole days between each row's event_time and today's date
days_since_event = datediff(current_date(), col("event_time"))
cosmetic_df = cosmetic_df.withColumn("recency", days_since_event)

In [0]:
# Frequency: number of event rows recorded for each user_session
session_sizes = cosmetic_df.groupBy("user_session").count()
frequency_df = session_sizes.withColumnRenamed("count", "frequency")

cosmetic_df = cosmetic_df.join(frequency_df, how="left", on="user_session")

In [0]:
# Product Popularity: number of purchase events per product
purchase_events = cosmetic_df.filter(col("event_type") == "purchase")
product_popularity = (
    purchase_events.groupBy("cosmeticProductId")
    .count()
    .withColumnRenamed("count", "popularity")
)

# The left join keeps every event row. Products that were never purchased have
# no row in product_popularity, so their popularity would come back as null;
# it is set to 0 so the column always holds a purchase count.
cosmetic_df = (
    cosmetic_df.join(product_popularity, on="cosmeticProductId", how="left")
    .fillna(0, subset=["popularity"])
)

In [0]:
# Session diversity: number of distinct category_code values seen in each user_session
distinct_categories = countDistinct("category_code").alias("session_diversity")
session_diversity = cosmetic_df.groupBy("user_session").agg(distinct_categories)

cosmetic_df = cosmetic_df.join(session_diversity, how="left", on="user_session")

## Data Transformation

In [0]:
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.sql.functions import mean, stddev, col
from pyspark.sql.types import DoubleType, IntegerType

### Transform Cosmetic Data (Structured Interaction Data)

In [0]:
# Scale `cosmetic_price`: wrap it in a one-element vector column (price_vec),
# then fit and apply a StandardScaler that writes price_scaled
assembler = VectorAssembler(inputCols=["cosmetic_price"], outputCol="price_vec")
scaler = StandardScaler(inputCol="price_vec", outputCol="price_scaled")

cosmetic_df = assembler.transform(cosmetic_df)
price_scaler_model = scaler.fit(cosmetic_df)
cosmetic_df = price_scaler_model.transform(cosmetic_df)

### Transform Reviews Data with GPT Tokenizer (Unstructured Sentiment Data)

In [0]:
from transformers import AutoTokenizer, AutoModel

# Pre-trained GPT-like checkpoint used for both the tokenizer and the model
model_name = "EleutherAI/gpt-neo-1.3B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Demo on one hard-coded sentence (this cell does not touch reviews_df):
# tokenize, run the model, and average last_hidden_state over dim 1
# (presumably the token axis) to get one embedding for the sentence.
text = "This moisturizer is amazing for dry skin!"
inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)
hidden_states = outputs.last_hidden_state
embeddings = hidden_states.mean(dim=1).detach().numpy()
print(embeddings)

In [0]:
# Cast numerical columns to appropriate types
numeric_casts = {
    "mrp": DoubleType(),
    "review_price": DoubleType(),
    "product_rating": DoubleType(),
    "product_rating_count": IntegerType(),
}
for column_name, target_type in numeric_casts.items():
    reviews_df = reviews_df.withColumn(column_name, reviews_df[column_name].cast(target_type))

### Transform Mapping Data

In [0]:
import math

# Cast both product-id columns to IntegerType in `mapping_df`
# NOTE(review): ids that do not fit a 32-bit integer would presumably become
# null under this cast — confirm the id range.
mapping_df = mapping_df.withColumn("cosmeticProductId", mapping_df["cosmeticProductId"].cast(IntegerType()))
mapping_df = mapping_df.withColumn("reviewProductId", mapping_df["reviewProductId"].cast(IntegerType()))

# Mean and standard deviation of reviewProductId, used for the outlier filter
# NOTE(review): this treats product ids as numeric measurements; rows are
# removed (and ids scaled below) purely on the numeric value of the id —
# confirm that is intended.
stats = mapping_df.select(
    mean("reviewProductId").alias("mean_reviews"),
    stddev("reviewProductId").alias("stddev_reviews")
).first()

mean_reviews = stats["mean_reviews"]
stddev_reviews = stats["stddev_reviews"]

# Filter out rows whose reviewProductId lies 3 or more standard deviations
# from the mean. The filter is skipped when the standard deviation is missing
# (null/NaN, e.g. fewer than two ids) or 0: a missing value would break the
# arithmetic below, and a 0 spread would make the strict bounds reject every row.
if stddev_reviews and not math.isnan(stddev_reviews):
    lower_bound = mean_reviews - 3 * stddev_reviews
    upper_bound = mean_reviews + 3 * stddev_reviews
    mapping_df = mapping_df.filter(
        (col("reviewProductId") > lower_bound) &
        (col("reviewProductId") < upper_bound)
    )

# Scale `cosmeticProductId` and `reviewProductId` in `mapping_df`
assembler = VectorAssembler(inputCols=["cosmeticProductId", "reviewProductId"], outputCol="mapping_features_vec")
mapping_df = assembler.transform(mapping_df)

scaler = StandardScaler(inputCol="mapping_features_vec", outputCol="mapping_scaled_features")
mapping_df = scaler.fit(mapping_df).transform(mapping_df)

## Combine the Transformed Data

In [0]:
# Attach the review-side product id to every event row via the mapping table
same_cosmetic_id = cosmetic_df["cosmeticProductId"] == mapping_df["cosmeticProductId"]
cosmetic_mapped_df = cosmetic_df.join(mapping_df, same_cosmetic_id, "inner")

# Bring in the review rows that share that review-side product id
same_review_id = cosmetic_mapped_df["reviewProductId"] == reviews_df["reviewProductId"]
combined_df = cosmetic_mapped_df.join(reviews_df, same_review_id, "inner")

# Remove the join keys and the columns not carried into the combined dataset.
# NOTE(review): each join key exists twice after the expression joins above,
# and drop() by name appears to remove both copies, leaving no product id in
# combined_df — confirm this is intended.
columns_to_drop = [
    "cosmeticProductId", "reviewProductId", "event_type", "brand_name",
    "stemmed_title", "lemmatized_title", "stemmed_text", "lemmatized_text",
    "review_text_clean", "review_title_clean",
]
combined_df = combined_df.drop(*columns_to_drop)

In [0]:
# Save combined_df as a Delta table in Unity Catalog.
# The target is a three-part <catalog>.<schema>.<table> name rather than a
# storage location, so it is written with saveAsTable(); save() would treat
# the string as a file-system path.
combined_data_path = "ecommerces_catalog.recommendation_schema.combined_data_path"
(
    combined_df.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")  # allow the table schema to change between runs
    .saveAsTable(combined_data_path)
)

In [0]:
# Print the column names and types of the combined DataFrame
combined_df.printSchema()

In [0]:
# Print the first five rows of the combined DataFrame
combined_df.show(5)

## Exploratory Data Analysis

In [0]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation


In [0]:
# Summary statistics for combined_df (describe() called with no column list)
combined_df.describe().show()

In [0]:
# Summary statistics restricted to the price and rating columns
stat_columns = ["cosmetic_price", "review_price", "product_rating", "product_rating_count"]
combined_df.select(*stat_columns).describe().show()


In [0]:
# Average product rating by brand
avg_rating_by_brand = combined_df.groupBy("brand").agg({"product_rating": "avg"})
avg_rating_by_brand.withColumnRenamed("avg(product_rating)", "average_rating").show()

### Distribution of Price

In [0]:
# Distribution of cosmetic_price and review_price across the combined rows.
# Both columns are collected to the driver as plain Python lists; null review
# prices are skipped, cosmetic prices are collected as-is.
cosmetic_price_data = combined_df.select("cosmetic_price").rdd.flatMap(lambda x: x).collect()
review_price_data = (
    combined_df.select("review_price")
    .rdd.flatMap(lambda x: x)
    .filter(lambda x: x is not None)
    .collect()
)

fig, (ax_cosmetic, ax_review) = plt.subplots(1, 2, figsize=(12, 6))

# One histogram per price column, side by side
panels = (
    (ax_cosmetic, cosmetic_price_data, "blue", "Distribution of Cosmetic Prices"),
    (ax_review, review_price_data, "purple", "Distribution of Review Prices"),
)
for ax, prices, colour, title in panels:
    ax.hist(prices, bins=50, color=colour, alpha=0.7)
    ax.set_title(title)
    ax.set_xlabel("Price")
    ax.set_ylabel("Frequency")

fig.tight_layout()
plt.show()


### Product Rating Distribution

In [0]:
# Collect the non-null product ratings to the driver
rating_data = (
    combined_df.select("product_rating")
    .rdd.flatMap(lambda x: x)
    .filter(lambda x: x is not None)
    .collect()
)

# Mean rating, drawn as a dashed reference line on the histogram
mean_rating = np.mean(rating_data)

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(rating_data, bins=10, kde=True, color="green", alpha=0.6, ax=ax)
ax.axvline(mean_rating, color="red", linestyle="--", label=f"Mean Rating = {mean_rating:.2f}")

ax.set_title("Detailed Distribution of Product Ratings", fontsize=16)
ax.set_xlabel("Product Rating", fontsize=14)
ax.set_ylabel("Frequency", fontsize=14)
ax.legend(fontsize=12)
ax.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

### Customer Engagement Patterns Over Time

In [0]:
# Derive a calendar date from event_time
event_date = col("event_time").cast("date")
combined_df = combined_df.withColumn("event_date", event_date)

# Count engagement events per day, ordered by date
daily_engagement = combined_df.groupBy("event_date").count().orderBy("event_date")
daily_engagement.show()

# Pull dates and counts to the driver for plotting
date_data = daily_engagement.select("event_date").rdd.flatMap(lambda x: x).collect()
engagement_data = daily_engagement.select("count").rdd.flatMap(lambda x: x).collect()

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(date_data, engagement_data, color="blue", marker="o")
ax.set_title("Daily Customer Engagement Events")
ax.set_xlabel("Date")
ax.set_ylabel("Event Count")
plt.show()


### Distribution of Customer Engagement Levels (Event Types)

In [0]:
# Count occurrences of each class, keeping the event_type name next to its index.
# StringIndexer assigns indices by descending label frequency by default, so the
# index -> event mapping is read from the data instead of being hard-coded.
event_counts = (
    cosmetic_df.groupBy("event_type_index", "event_type")
    .count()
    .orderBy("event_type_index")
    .toPandas()
)
tick_labels = [
    f"{int(index)} = {name}"
    for index, name in zip(event_counts["event_type_index"], event_counts["event_type"])
]

# Plot
plt.figure(figsize=(8, 5))
plt.bar(event_counts["event_type_index"], event_counts["count"], color="skyblue")
plt.xticks(event_counts["event_type_index"], tick_labels)
plt.title("Distribution of Event Types (Customer Engagement Levels)", fontsize=14)
plt.xlabel("Event Type (index = event)", fontsize=12)
plt.ylabel("Count", fontsize=12)
plt.show()

### Sentiment Analysis

In [0]:
# Number of reviews per review_label value
sentiment_counts = reviews_df.groupBy("review_label").count().toPandas()

fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(sentiment_counts["review_label"], sentiment_counts["count"], color="skyblue")
ax.set_title("Sentiment Distribution", fontsize=16)
ax.set_xlabel("Sentiment")
ax.set_ylabel("Count")
plt.show()


In [0]:
# Sentiment by Product Rating: review counts per (review_label, product_rating),
# reshaped so labels are rows and ratings are columns
sentiment_rating = reviews_df.groupBy("review_label", "product_rating").count().toPandas()
pivot_data = sentiment_rating.pivot(index="review_label", columns="product_rating", values="count")

fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(pivot_data, annot=True, fmt=".0f", cmap="Blues", cbar=True, ax=ax)
ax.set_title("Sentiment vs. Product Rating", fontsize=16)
ax.set_xlabel("Product Rating")
ax.set_ylabel("Sentiment")
plt.show()

In [0]:
from collections import Counter

# Collect the lemmatized review texts and flatten them into one list of words
# (null / empty texts are skipped)
lemmatized_texts = reviews_df.select("lemmatized_text").rdd.flatMap(lambda x: x).collect()
all_words = [word for text in lemmatized_texts if text for word in text.split()]

# The 20 most frequent words with their counts
word_freq = Counter(all_words).most_common(20)

# Separate words and counts for plotting
words, counts = zip(*word_freq)

# Plot most common words
import matplotlib.pyplot as plt
import seaborn as sns

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=list(counts), y=list(words), palette="magma", ax=ax)
ax.set_title("Top 20 Most Frequent Words in Lemmatized Review Text", fontsize=16)
ax.set_xlabel("Frequency")
ax.set_ylabel("Words")
plt.show()



In [0]:
# The ten product titles with the most review rows
reviews_per_product = reviews_df.groupBy("product_title").count()
top_products = reviews_per_product.orderBy("count", ascending=False).limit(10).toPandas()

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x="count", y="product_title", data=top_products, palette="cool", ax=ax)
ax.set_title("Top 10 Most Reviewed Products", fontsize=16)
ax.set_xlabel("Review Count")
ax.set_ylabel("Product Title")
plt.show()

#### Time Series Analysis of Ratings

In [0]:
from pyspark.sql import functions as F

# Extract date from review_date (values that do not match the pattern become null)
combined_df = combined_df.withColumn(
    "review_date_parsed",
    F.to_timestamp("review_date", "yyyy-MM-dd HH:mm:ss").cast("date")
)

# Average product rating per review date, ordered by date
daily_ratings = combined_df.groupBy("review_date_parsed").agg(
    F.avg("product_rating").alias("avg_rating")
).orderBy("review_date_parsed")

daily_ratings.show()

# Collect once (each collect() re-runs the aggregation) and leave out the group
# of unparseable dates, which has no position on a date axis
daily_rating_rows = [
    row for row in daily_ratings.collect() if row['review_date_parsed'] is not None
]
date_data = [row['review_date_parsed'] for row in daily_rating_rows]
rating_data = [row['avg_rating'] for row in daily_rating_rows]

# Plot time series
plt.figure(figsize=(14, 7))
plt.plot(date_data, rating_data, marker='o', linestyle='-')
plt.title('Average Product Rating Over Time')
plt.xlabel('Date')
plt.ylabel('Average Rating')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

#### User Behavior Analysis

In [0]:
# Number of combined_df rows per user, largest first
user_events = combined_df.groupBy("user_id").count().orderBy(F.desc("count"))
user_events.show(10)

# Per-user counts collected to the driver for the histogram
# (this rebinds event_counts, which an earlier cell used for a pandas DataFrame)
event_counts = user_events.select("count").rdd.flatMap(lambda x: x).collect()

fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(event_counts, bins=50, color="coral", alpha=0.7)
ax.set_title('Distribution of Events per User')
ax.set_xlabel('Number of Events')
ax.set_ylabel('Number of Users')
plt.show()

#### Distribution of Product Ratings by Brand

In [0]:
# Average rating and number of non-null ratings per brand
avg_rating = F.avg("product_rating").alias("avg_rating")
rating_count = F.count("product_rating").alias("rating_count")
brand_ratings = combined_df.groupBy("brand").agg(avg_rating, rating_count)

# The ten brands with the most ratings
top_brands_ratings = brand_ratings.orderBy(F.desc("rating_count")).limit(10)
top_brands_pd = top_brands_ratings.toPandas()

# Bar plot
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x='brand', y='avg_rating', data=top_brands_pd, ax=ax)
plt.xticks(rotation=45)
ax.set_title('Average Product Rating by Top 10 Brands')
ax.set_xlabel('Brand')
ax.set_ylabel('Average Rating')
plt.show()

#### Convert Review Ratings to Numeric and Analyze

In [0]:
# Convert review_rating to a double column; values that cannot be cast become null
numeric_rating = F.col("review_rating").cast("double")
combined_df = combined_df.withColumn("review_rating_num", numeric_rating)

# Drop rows where the conversion produced null
combined_df = combined_df.dropna(subset=["review_rating_num"])

# Histogram of review ratings
review_rating_data = combined_df.select("review_rating_num").rdd.flatMap(lambda x: x).collect()

fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(review_rating_data, bins=5, kde=False, color="skyblue", ax=ax)
ax.set_title('Distribution of Review Ratings')
ax.set_xlabel('Review Rating')
ax.set_ylabel('Frequency')
plt.show()

#### Session-Level Analysis

In [0]:
# Number of combined_df rows per session, largest first
session_events = combined_df.groupBy("user_session").count().orderBy(F.desc("count"))
session_events.show(10)

# Per-session counts collected to the driver for the histogram
session_counts = session_events.select("count").rdd.flatMap(lambda x: x).collect()

fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(session_counts, bins=50, color="limegreen", alpha=0.7)
ax.set_title('Distribution of Events per Session')
ax.set_xlabel('Number of Events')
ax.set_ylabel('Number of Sessions')
plt.show()

## Baseline Model

In [0]:
import pandas as pd
import numpy as np
import time
import scipy.sparse as sparse
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from mlxtend.frequent_patterns import apriori, association_rules
import mlflow
from pyspark.ml.fpm import FPGrowth
from pyspark.sql.functions import collect_set, lit
import datetime
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

In [0]:
# Start MLflow tracking
# The run opened here stays active across cells; the next cell logs to it and
# closes it with mlflow.end_run() in its finally block.
mlflow.start_run(run_name="Recommendations and FP-Growth")

In [0]:
try:
    # ---- Purchase data ------------------------------------------------------
    # Only purchase events are used; every purchase row counts as quantity 1.
    purchase_df = cosmetic_df.filter(cosmetic_df['event_type'] == 'purchase')
    purchase_df = purchase_df.withColumn("product_quantity", lit(1))

    # Convert PySpark DataFrame to pandas DataFrame (brings all purchase rows to the driver)
    purchase_df_pandas = purchase_df.toPandas()

    # ---- Cosine similarity recommendations ----------------------------------
    start_time = time.time()

    # Row/column labels of the session x product matrix
    orders = sorted(set(purchase_df_pandas['user_session']))
    products = sorted(set(purchase_df_pandas['cosmeticProductId']))
    quantities = list(purchase_df_pandas['product_quantity'])

    # Integer positions of each purchase row within those label lists
    rs = pd.Categorical(purchase_df_pandas['user_session'], categories=orders).codes
    cs = pd.Categorical(purchase_df_pandas['cosmeticProductId'], categories=products).codes

    # Create sparse matrix (sessions as rows, products as columns)
    sparse_matrix = sparse.csr_matrix((quantities, (rs, cs)), shape=(len(orders), len(products)))

    # Log sparsity: percentage of session/product cells that hold no purchase
    matrix_size = sparse_matrix.shape[0] * sparse_matrix.shape[1]
    num_purchases = len(sparse_matrix.nonzero()[0])
    sparsity = round(100 * (1 - (float(num_purchases) / matrix_size)), 2)
    mlflow.log_metric("sparsity", sparsity)

    # Compute product-to-product cosine similarity (transpose puts products in rows)
    similarities = cosine_similarity(sparse_matrix.T)
    df_sim = pd.DataFrame(similarities, index=products, columns=products)
    mlflow.log_metric("cosine_similarity_calculation_time", round(time.time() - start_time, 2))

    # Generate recommendations: up to `top - 1` most similar other products each
    start_time = time.time()
    top = 11
    df_match = pd.DataFrame(index=products, columns=[f'Rec {i}' for i in range(1, top)])
    df_score = pd.DataFrame(index=products, columns=[f'Score {i}' for i in range(1, top)])

    for i in range(len(products)):
        ranked = df_sim.iloc[:, i].sort_values(ascending=False)
        ranked = ranked[ranked.index != df_sim.index[i]]  # never recommend the product itself
        # Slicing caps the list at the number of recommendation columns and copes
        # with fewer candidates. The builtin min() is deliberately not used here:
        # an earlier cell imports min/max from pyspark.sql.functions, which
        # shadows the builtins in this notebook.
        best = ranked.iloc[:top - 1]
        num_recs = len(best)

        df_match.iloc[i, :num_recs] = best.index
        df_score.iloc[i, :num_recs] = best.values

    # Combine recommendations and scores
    df_new = df_match.merge(df_score, how="inner", left_index=True, right_index=True)
    df_new.index.names = ['product_id']

    # Save recommendations to file
    filename = "product_recs_cosine_similarity.csv"
    df_new.to_csv(filename)
    mlflow.log_artifact(filename)
    mlflow.log_metric("recommendation_generation_time", round(time.time() - start_time, 2))

    print("Cosine similarity recommendations generated.")
    print(df_new.head())

    # ---- FP-Growth ------------------------------------------------------------
    # One basket per session: the set of distinct products purchased in it
    fp_data = purchase_df.groupBy("user_session").agg(collect_set("cosmeticProductId").alias("items"))

    # Train FP-Growth model
    min_support = 0.001
    min_confidence = 0.1
    fp_growth = FPGrowth(itemsCol="items", minSupport=min_support, minConfidence=min_confidence)
    model = fp_growth.fit(fp_data)

    # Get frequent itemsets
    frequent_itemsets = model.freqItemsets.toPandas()
    frequent_itemsets_filename = "frequent_itemsets.csv"
    frequent_itemsets.to_csv(frequent_itemsets_filename, index=False)
    mlflow.log_artifact(frequent_itemsets_filename)
    mlflow.log_metric("frequent_itemsets_count", len(frequent_itemsets))

    # Get association rules
    # (this rebinds `association_rules`, which the import cell took from mlxtend;
    # the name is kept because a later cell reads this DataFrame)
    association_rules = model.associationRules.toPandas()
    association_rules_filename = "association_rules.csv"
    association_rules.to_csv(association_rules_filename, index=False)
    mlflow.log_artifact(association_rules_filename)
    mlflow.log_metric("association_rules_count", len(association_rules))

    print("FP-Growth completed successfully.")
    print(frequent_itemsets.head())
    print(association_rules.head())

except Exception as e:
    # Record a shortened error message and close the run as FAILED;
    # end_run() would otherwise mark it FINISHED.
    mlflow.log_param("error", str(e)[:250])
    mlflow.end_run(status="FAILED")
    raise
finally:
    # Closes the run on success; does nothing if it was already ended above
    mlflow.end_run()

In [0]:

# Save Recommendations to Unity Catalog
# df_new keeps the product id in its index (named 'product_id').
# spark.createDataFrame() does not carry the pandas index over, so the index is
# turned into a regular column first; otherwise the saved rows would not say
# which product they belong to.
prod_recs_spark = spark.createDataFrame(df_new.reset_index())

# Replace spaces and dots in column names (e.g. 'Rec 1' -> 'Rec_1')
prod_recs_spark = prod_recs_spark.toDF(
    *[c.replace(" ", "_").replace(".", "_") for c in prod_recs_spark.columns]
)

(
    prod_recs_spark.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")  # the table gains a product_id column
    .saveAsTable("ecommerces_catalog.recommendation_schema.product_recommendations")
)

In [0]:
	

# Convert the FP-Growth results (pandas DataFrames) back to Spark DataFrames
frequent_itemsets_spark = spark.createDataFrame(frequent_itemsets)
association_rules_spark = spark.createDataFrame(association_rules)

# Save Frequent Itemsets and Association Rules to Unity Catalog as Delta tables
for spark_frame, table_name in (
    (frequent_itemsets_spark, "ecommerces_catalog.recommendation_schema.frequent_itemsets"),
    (association_rules_spark, "ecommerces_catalog.recommendation_schema.association_rules"),
):
    spark_frame.write.format("delta").mode("overwrite").saveAsTable(table_name)