# H&M Product Recommendation Modelling

This notebook implements multiple recommendation system approaches for predicting customer product preferences and purchase behaviour.

## Modelling Approaches

1. **Collaborative Filtering** - Matrix factorization and neighborhood-based methods
2. **Content-Based Filtering** - Product and customer feature-based recommendations
3. **Hybrid Models** - Combining multiple approaches
4. **Purchase Prediction** - Binary classification for purchase likelihood

## Business Objectives

- Predict which products customers are likely to purchase
- Recommend relevant products to increase engagement
- Identify customer preferences and shopping patterns


In [None]:
import sys
import os
sys.path.append('../')

import polars as pl
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Import H&M data modelling modules
from hnm_data_analysis.data_modelling import (
    CollaborativeFilteringModel,
    ContentBasedFilteringModel,
    PurchasePredictionModel,
    HybridRecommenderModel
)

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Keep rendered tables compact in both polars and pandas output
for polars_setter, polars_limit in (
    (pl.Config.set_tbl_rows, 10),
    (pl.Config.set_tbl_cols, 15),
):
    polars_setter(polars_limit)

for pandas_option, pandas_limit in (
    ('display.max_columns', 20),
    ('display.max_rows', 10),
):
    pd.set_option(pandas_option, pandas_limit)

# Seed NumPy's global random generator so repeated runs behave the same
np.random.seed(42)

print("Libraries and H&M modelling modules imported successfully")
working_dir = os.getcwd()
print(f"Current working directory: {working_dir}")

## Load Training and Test Data

Load the preprocessed training and test datasets created in the data preparation phase.


In [None]:
# Read the train/test splits from the modelling data directory
print("Loading training and test datasets...")

train_df = pl.read_parquet("../data/modelling_data/train_data.parquet")
test_df = pl.read_parquet("../data/modelling_data/test_data.parquet")

labelled_splits = (("Training", train_df), ("Test", test_df))

for split_label, split_frame in labelled_splits:
    print(f"{split_label} data shape: {split_frame.shape}")
print(f"Total features: {train_df.shape[1]}")

# Report how many distinct customers and articles each split contains
print("\nData consistency checks:")
for id_column, entity_noun in (("customer_id", "customers"), ("article_id", "articles")):
    for split_label, split_frame in labelled_splits:
        distinct_count = split_frame[id_column].n_unique()
        print(f"{split_label} {entity_noun}: {distinct_count:,}")

# Customers appearing in both splits; the expectation stated here is none
train_customers = set(train_df['customer_id'].unique())
test_customers = set(test_df['customer_id'].unique())
overlap = train_customers & test_customers
print(f"Customer overlap: {len(overlap)} (should be 0)")

# Preview the first few training rows
print("\nSample training data:")
display(train_df.head(3))

# List every column alongside its dtype
print("\nColumn data types:")
for column_name, column_dtype in zip(train_df.columns, train_df.dtypes):
    print(f"{column_name}: {column_dtype}")

## Data Preparation for Recommendation Systems

Prepare the data for different recommendation approaches by creating interaction matrices and feature sets.


In [None]:
# The modelling steps below consume pandas frames, so convert both splits
print("Converting to pandas...")
train_pd, test_pd = (split.to_pandas() for split in (train_df, test_df))
print("Data converted to pandas for modeling")

# OPTIONAL: train on a random subset of transactions for quicker iteration
USE_SAMPLING = False  # True = use a sample; False = use the full dataset
SAMPLE_SIZE = 100000  # Number of transactions kept when sampling is enabled

# Sampling only applies when enabled AND the data is larger than the sample
sampling_applies = USE_SAMPLING and len(train_pd) > SAMPLE_SIZE
if sampling_applies:
    print(f"\nUsing sample of {SAMPLE_SIZE:,} transactions for faster processing...")
    sampled_rows = train_pd.sample(n=SAMPLE_SIZE, random_state=42)
    train_pd = sampled_rows.reset_index(drop=True)
    print(f"Sampled training data shape: {train_pd.shape}")

print("Data preparation complete - ready for modelling with H&M modules")

## Model 1: Collaborative Filtering with Matrix Factorization

Implement collaborative filtering using SVD (Singular Value Decomposition) for matrix factorization.


In [None]:
print("=== Model 1: Collaborative Filtering ===\n")

# Fit the collaborative filtering model on the training transactions
cf_model = CollaborativeFilteringModel(n_components=50, random_state=42)
cf_model.fit(train_pd)

# Print the model's self-reported metadata, formatted by value type
model_info = cf_model.get_model_info()
print("\nCollaborative Filtering Model Info:")
for info_key, info_value in model_info.items():
    if info_key == 'matrix_density':
        # Density is shown with a trailing percent sign
        rendered = f"{info_value:.4f}%"
    elif isinstance(info_value, float):
        rendered = f"{info_value:.4f}"
    elif isinstance(info_value, int):
        # Thousands separators for integer counts
        rendered = f"{info_value:,}"
    else:
        rendered = f"{info_value}"
    print(f"  {info_key}: {rendered}")

# Use the customer on the first training row as a smoke-test subject
sample_customer = train_pd['customer_id'].iloc[0]
sample_recommendations = cf_model.get_recommendations(sample_customer, 5)

print(f"\nSample collaborative filtering recommendations for customer {sample_customer}:")
for rank, (rec_article, rec_score) in enumerate(sample_recommendations, start=1):
    print(f"{rank}. Article {rec_article}: Score {rec_score:.4f}")

print("\nCollaborative filtering model trained successfully")

## Model 2: Content-Based Filtering

Implement content-based filtering using product features and customer preferences.


In [None]:
print("=== Model 2: Content-Based Filtering ===\n")

# Configure and fit the content-based filtering model
cb_settings = {'max_features': 1000, 'ngram_range': (1, 2), 'min_df': 2}
cb_model = ContentBasedFilteringModel(**cb_settings)
cb_model.fit(train_pd)

# Print the model's self-reported metadata
model_info = cb_model.get_model_info()
print("\nContent-Based Filtering Model Info:")
for info_key, info_value in model_info.items():
    # Integers get thousands separators; every other value prints unchanged
    if isinstance(info_value, int):
        rendered = f"{info_value:,}"
    else:
        rendered = f"{info_value}"
    print(f"  {info_key}: {rendered}")

# Reuse the sample customer chosen in the collaborative filtering cell
sample_content_recommendations = cb_model.get_recommendations(sample_customer, 5)

print(f"\nSample content-based recommendations for customer {sample_customer}:")
for rank, (rec_article, rec_score) in enumerate(sample_content_recommendations, start=1):
    print(f"{rank}. Article {rec_article}: Score {rec_score:.4f}")

print("\nContent-based filtering model trained successfully")

## Model 3: Purchase Prediction Classification

Build classification models to predict whether a customer will purchase a specific product.


In [None]:
print("=== Model 3: Purchase Prediction ===\n")

# Fit the purchase prediction model on the training transactions
pp_model = PurchasePredictionModel(test_size=0.2, random_state=42)
pp_model.fit(train_pd)

# Report every metric for each model returned by get_model_scores()
model_scores = pp_model.get_model_scores()
print("\nPurchase Prediction Model Performance:")

# (printed label, key in the scores dict); labels are padded to align values
metric_rows = (
    ("Accuracy:", 'accuracy'),
    ("Precision:", 'precision'),
    ("Recall:", 'recall'),
    ("F1-Score:", 'f1_score'),
    ("AUC-ROC:", 'auc_roc'),
)
for classifier_name, classifier_scores in model_scores.items():
    print(f"\n{classifier_name}:")
    for metric_label, metric_key in metric_rows:
        print(f"  {metric_label:<11}{classifier_scores[metric_key]:.4f}")

# The best model is the one with the highest F1 score
best_model_name = max(model_scores, key=lambda name: model_scores[name]['f1_score'])
best_f1 = model_scores[best_model_name]['f1_score']
print(f"\nBest performing model: {best_model_name} (F1-Score: {best_f1:.4f})")

# Score the sample customer against the first five distinct training articles
print("\nSample purchase predictions:")
sample_articles = train_pd['article_id'].unique()[:5]
for candidate_article in sample_articles:
    purchase_prob = pp_model.predict_purchase_probability(sample_customer, candidate_article)
    print(f"Customer {sample_customer} - Article {candidate_article}: {purchase_prob:.4f}")

print("\nPurchase prediction model trained successfully")

## Model 4: Hybrid Recommendation System

Combine the collaborative filtering, content-based filtering, and purchase prediction models, each with its own weight, for improved recommendations.


In [None]:
print("=== Model 4: Hybrid Recommendation System ===\n")

# Weights given to the collaborative (cf), content-based (cb) and
# purchase prediction (pp) components
component_weights = {'cf_weight': 0.4, 'cb_weight': 0.4, 'pp_weight': 0.2}
hybrid_model = HybridRecommenderModel(**component_weights)

# Fit the hybrid model, passing a parameter dict for each component
component_params = {
    'cf_params': {'n_components': 50},
    'cb_params': {'max_features': 1000},
    'pp_params': {'test_size': 0.2},
}
hybrid_model.fit(train_pd, **component_params)

# Summarise what the fitted hybrid model reports about itself
model_info = hybrid_model.get_model_info()
print("\nHybrid Recommender Model Info:")
print(f"  Component Models: {model_info['component_models']}")
weight_summary = ", ".join(
    f"{tag}={model_info[weight_key]:.1f}"
    for tag, weight_key in (("CF", 'cf_weight'), ("CB", 'cb_weight'), ("PP", 'pp_weight'))
)
print(f"  Model Weights: {weight_summary}")
print(f"  Fitted: {model_info['fitted']}")

# Reuse the sample customer chosen in the collaborative filtering cell
sample_hybrid_recommendations = hybrid_model.get_recommendations(sample_customer, 5)

print(f"\nSample hybrid recommendations for customer {sample_customer}:")
for rank, (rec_article, rec_score) in enumerate(sample_hybrid_recommendations, start=1):
    print(f"{rank}. Article {rec_article}: Score {rec_score:.4f}")

print("\nHybrid recommendation system trained successfully")

## Save Models and Preprocessing Objects

Save all trained models and preprocessing objects for evaluation and future use, along with a JSON summary of each model's information.


In [None]:
import json


def _to_jsonable(value):
    """Convert NumPy objects that ``json`` cannot encode into native types.

    Intended as the ``default`` hook of ``json.dumps``: the encoder calls it
    for any object it cannot serialise, at any nesting depth, so NumPy values
    buried inside nested dicts or lists are handled too.

    Raises:
        TypeError: if ``value`` is not a supported NumPy type (this mirrors
            the error ``json`` itself raises for unsupported objects).
    """
    if isinstance(value, np.bool_):
        return bool(value)
    if isinstance(value, np.integer):
        return int(value)
    if isinstance(value, np.floating):
        return float(value)
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


# Create models directory
models_dir = Path("../models")
models_dir.mkdir(exist_ok=True)

print("Saving trained models...")

# Save each trained model under its own file name
models_to_save = (
    ("collaborative filtering", cf_model, 'collaborative_filtering_model.pkl'),
    ("content-based filtering", cb_model, 'content_based_filtering_model.pkl'),
    ("purchase prediction", pp_model, 'purchase_prediction_model.pkl'),
    ("hybrid recommender", hybrid_model, 'hybrid_recommender_model.pkl'),
)
for model_label, trained_model, file_name in models_to_save:
    print(f"Saving {model_label} model...")
    trained_model.save_model(models_dir / file_name)

# Create and save performance summary
print("\nCreating performance summary...")

# Fetch the purchase prediction scores once instead of re-querying the model
# for every candidate inside the max() key function
pp_scores = pp_model.get_model_scores()
best_pp_model = max(pp_scores, key=lambda name: pp_scores[name]['f1_score'])

# Combine all model information
performance_data = {
    'Collaborative Filtering': cf_model.get_model_info(),
    'Content-Based Filtering': cb_model.get_model_info(),
    'Purchase Prediction': {
        'best_model': best_pp_model,
        'best_f1_score': pp_scores[best_pp_model]['f1_score'],
    },
    'Hybrid Recommender': hybrid_model.get_model_info(),
}

# Serialise fully BEFORE touching the file: if a value cannot be encoded the
# error surfaces here and no truncated/partial summary file is left on disk
summary_path = models_dir / 'model_summary.json'
summary_json = json.dumps(performance_data, indent=2, default=_to_jsonable)
summary_path.write_text(summary_json, encoding='utf-8')

print(f"Model summary saved to: {summary_path}")

print("\n=== Modelling Complete ===\n")
print("Trained Models:")
print("  1. Collaborative Filtering (SVD-based matrix factorization)")
print("  2. Content-Based Filtering (TF-IDF product similarity)")
print("  3. Purchase Prediction (Multiple classification algorithms)")
print("  4. Hybrid Recommender (Combines all approaches)")

print(f"\nAll models saved to: {models_dir}/")
print("Ready for evaluation and deployment!")