# H&M Product Recommendation Modelling

This notebook implements multiple recommendation system approaches for predicting customer product preferences and purchase behaviour.

## Modelling Approaches
1. **Collaborative Filtering** - Matrix factorization (truncated SVD) of the customer-article interaction matrix (Model 1)
2. **Content-Based Filtering** - Recommendations from product attribute similarity and each customer's purchase history (Model 2)
3. **Hybrid Models** - Weighted combination of the collaborative and content-based scores (Model 4)
4. **Purchase Prediction** - Binary classification for purchase likelihood (Model 3)

## Business Objectives
- Predict which products customers are likely to purchase
- Recommend relevant products to increase engagement
- Identify customer preferences and shopping patterns

In [None]:
import sys
import os
sys.path.append('../')

import polars as pl
import pandas as pd
import numpy as np
from pathlib import Path
import pickle
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    roc_auc_score, classification_report, confusion_matrix
)

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Display configuration: cap how many rows/columns polars and pandas render.
pl.Config.set_tbl_rows(10)
pl.Config.set_tbl_cols(15)
for option_name, option_value in (('display.max_columns', 20), ('display.max_rows', 10)):
    pd.set_option(option_name, option_value)

# Seed NumPy's global RNG so np.random-based sampling is repeatable across runs.
np.random.seed(42)

print("Libraries imported successfully")
print(f"Current working directory: {os.getcwd()}")

## Load Training and Test Data

Load the preprocessed training and test datasets created in the data preparation phase.

In [None]:
# Read the train/test parquet files written by the data-preparation phase.
print("Loading training and test datasets...")

train_df = pl.read_parquet("../data/modelling_data/train_data.parquet")
test_df = pl.read_parquet("../data/modelling_data/test_data.parquet")

_, n_train_columns = train_df.shape
print(f"Training data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")
print(f"Total features: {n_train_columns}")

# Report distinct customers and articles per split.
print(f"\nData consistency checks:")
for entity_label, id_column in (("customers", "customer_id"), ("articles", "article_id")):
    for split_label, split_df in (("Training", train_df), ("Test", test_df)):
        print(f"{split_label} {entity_label}: {split_df[id_column].n_unique():,}")

# The two splits are expected to contain disjoint customers (overlap should be 0).
train_customers = set(train_df['customer_id'].unique())
test_customers = set(test_df['customer_id'].unique())
overlap = train_customers & test_customers
print(f"Customer overlap: {len(overlap)} (should be 0)")

# Preview a few rows, then list every column with its dtype.
print(f"\nSample training data:")
display(train_df.head(3))

print(f"\nColumn data types:")
for column_name, column_dtype in zip(train_df.columns, train_df.dtypes):
    print(f"{column_name}: {column_dtype}")

## Data Preparation for Recommendation Systems

Prepare the data for different recommendation approaches by creating interaction matrices and feature sets.

In [None]:
# Materialise both splits as pandas frames; the sklearn-based cells below work on pandas.
train_pd = train_df.to_pandas()
test_pd = test_df.to_pandas()

print("Data converted to pandas for modeling")

print("\nCreating user-item interaction matrices...")

# Interactions are binary: any number of purchases of an article by a customer counts as 1.
# The raw count is kept in `interaction_count` alongside the binary flag.
pair_keys = ['customer_id', 'article_id']

train_interactions = (
    train_pd.groupby(pair_keys)
    .size()
    .reset_index(name='interaction_count')
    .assign(interaction=1)
)

n_train_pairs = train_interactions.shape[0]
print(f"Training interactions: {n_train_pairs:,}")
print(f"Unique customer-article pairs in training: {n_train_pairs:,}")

test_interactions = (
    test_pd.groupby(pair_keys)
    .size()
    .reset_index(name='interaction_count')
    .assign(interaction=1)
)

print(f"Test interactions: {test_interactions.shape[0]:,}")

print("\nCreating interaction matrices...")

# Sorted id lists from the training split define the row/column universe of the matrix.
all_customers = sorted(train_pd['customer_id'].unique())
all_articles = sorted(train_pd['article_id'].unique())

print(f"Total customers in training: {len(all_customers):,}")
print(f"Total articles in training: {len(all_articles):,}")

# Encoders map raw ids to contiguous integer positions.
customer_encoder = LabelEncoder().fit(all_customers)
article_encoder = LabelEncoder().fit(all_articles)

# Attach the integer positions to every training interaction.
train_interactions['customer_idx'] = customer_encoder.transform(train_interactions['customer_id'])
train_interactions['article_idx'] = article_encoder.transform(train_interactions['article_id'])

print("Customer and article encoding completed")

## Model 1: Collaborative Filtering with Matrix Factorization

Implement collaborative filtering using SVD (Singular Value Decomposition) for matrix factorization.

In [None]:
from scipy.sparse import csr_matrix

print("=== Collaborative Filtering with SVD ===")

# Matrix dimensions: one row per training customer, one column per training article.
n_customers, n_articles = len(all_customers), len(all_articles)

print(f"Creating {n_customers:,} x {n_articles:,} interaction matrix...")

# Build the sparse customer x article matrix from (value, (row, col)) triplets.
row_positions = train_interactions['customer_idx']
col_positions = train_interactions['article_idx']
interaction_values = train_interactions['interaction']
interaction_matrix = csr_matrix(
    (interaction_values, (row_positions, col_positions)),
    shape=(n_customers, n_articles),
)

n_nonzero = interaction_matrix.nnz
density_pct = n_nonzero / (n_customers * n_articles) * 100
print(f"Interaction matrix density: {density_pct:.4f}%")
print(f"Non-zero elements: {n_nonzero:,}")

print("\nApplying SVD matrix factorization...")

# At most 50 latent components, and always fewer than the smaller matrix dimension.
n_components = min(50, n_customers - 1, n_articles - 1)
svd_model = TruncatedSVD(n_components=n_components, random_state=42)

# fit_transform yields the per-customer factors; components_ holds the per-article factors.
customer_factors = svd_model.fit_transform(interaction_matrix)
article_factors = svd_model.components_

explained_total = svd_model.explained_variance_ratio_.sum()
print(f"SVD completed with {n_components} components")
print(f"Explained variance ratio: {explained_total:.4f}")
print(f"Customer factors shape: {customer_factors.shape}")
print(f"Article factors shape: {article_factors.shape}")

# Function to get recommendations for a customer
def get_svd_recommendations(customer_id, n_recommendations=10):
    """
    Rank articles for one customer using the SVD latent factors.

    The score of an article is the dot product between the customer's row in
    `customer_factors` and the article's column in `article_factors`. Articles
    the customer already has in `train_interactions` are excluded.

    Args:
        customer_id: Raw customer id, as seen by `customer_encoder`.
        n_recommendations: Maximum number of articles to return.

    Returns:
        List of `(article_id, score)` tuples, highest score first. Articles with
        equal scores keep their order in `all_articles`. Returns an empty list
        when `n_recommendations` is not positive or when the customer was not
        part of the data the encoder was fitted on (cold start).

    Notes:
        Only the "unknown customer" case is handled here. Any other exception
        (for example mismatched factor shapes) propagates, so real bugs are not
        silently turned into empty recommendation lists.
    """
    if n_recommendations <= 0:
        return []

    try:
        customer_idx = customer_encoder.transform([customer_id])[0]
    except ValueError as e:
        # The encoder rejects labels it was not fitted on: no factors exist for this customer.
        print(f"Error getting recommendations for customer {customer_id}: {e}")
        return []

    # One score per article, in the same order as `all_articles`.
    scores = np.dot(customer_factors[customer_idx], article_factors)

    # Articles this customer already interacted with in training.
    purchased = set(
        train_interactions.loc[train_interactions['customer_id'] == customer_id, 'article_id']
    )

    # Stable descending order: ties are resolved by position in `all_articles`.
    ranked_positions = np.argsort(-scores, kind='stable')

    recommendations = []
    for position in ranked_positions:
        article_id = all_articles[position]
        if article_id in purchased:
            continue
        recommendations.append((article_id, scores[position]))
        if len(recommendations) >= n_recommendations:
            break  # enough results; no need to walk the remaining catalogue

    return recommendations

# Smoke test: recommend five articles to the first customer of the sorted customer list.
sample_customer = all_customers[0]
sample_recommendations = get_svd_recommendations(sample_customer, 5)

print(f"\nSample recommendations for customer {sample_customer}:")
for rank, recommendation in enumerate(sample_recommendations, start=1):
    rec_article, rec_score = recommendation
    print(f"{rank}. Article {rec_article}: Score {rec_score:.4f}")

print("\nSVD Collaborative Filtering model ready")

## Model 2: Content-Based Filtering

Implement content-based filtering using product features and customer preferences.

In [None]:
print("=== Content-Based Filtering ===")

# Extract product features for content-based filtering
print("Preparing product features...")

# One row per article: keep the first occurrence of each article_id in the training frame.
article_features = train_pd[[
    'article_id', 'product_type_name', 'product_group_name',
    'graphical_appearance_name', 'colour_group_name',
    'perceived_colour_value_name', 'perceived_colour_master_name',
    'department_name', 'index_name', 'index_group_name',
    'section_name', 'garment_group_name'
]].drop_duplicates(subset=['article_id'])

print(f"Articles with features: {article_features.shape[0]:,}")

# Columns whose text is concatenated into the per-article description (order matters).
text_columns = ['product_type_name', 'product_group_name', 'colour_group_name',
                'department_name', 'section_name', 'garment_group_name']

for col in text_columns:
    # Fill missing values BEFORE converting to str. Converting first turns None / pd.NA
    # into the literal strings 'None' / '<NA>', which a later replace('nan', ...) cannot
    # catch and which would end up as spurious tokens in the TF-IDF vocabulary.
    # Going through object dtype also lets categorical columns accept the filler value.
    column_values = article_features[col].astype(object)
    article_features[col] = (
        column_values.where(column_values.notna(), 'unknown')
        .astype(str)
        .replace('nan', 'unknown')  # also normalise values that are literally the string 'nan'
    )

# Build one space-separated description per article from the text columns.
content_text = article_features[text_columns[0]]
for col in text_columns[1:]:
    content_text = content_text + ' ' + article_features[col]
article_features['content_features'] = content_text

# Create TF-IDF vectors for content features
print("Creating TF-IDF vectors for product content...")

tfidf_vectorizer = TfidfVectorizer(
    max_features=1000,  # Limit features for memory efficiency
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2
)

content_matrix = tfidf_vectorizer.fit_transform(article_features['content_features'])
print(f"Content matrix shape: {content_matrix.shape}")

# Calculate content similarity matrix
# NOTE(review): this materialises a dense (n_articles x n_articles) array, so memory grows
# quadratically with the catalogue size - confirm it fits before running on the full data.
print("Computing content similarity matrix...")
content_similarity = cosine_similarity(content_matrix)
print(f"Content similarity matrix shape: {content_similarity.shape}")

# Row position in `content_matrix` / `content_similarity` <-> article id, in both directions.
article_to_idx = {article_id: idx for idx, article_id in enumerate(article_features['article_id'])}
idx_to_article = {idx: article_id for article_id, idx in article_to_idx.items()}

def get_content_based_recommendations(customer_id, n_recommendations=10):
    """
    Recommend articles whose content is similar to what the customer already bought.

    Each candidate's score is the mean of its `content_similarity` values against
    the customer's purchased articles that have a row in the similarity matrix.

    Args:
        customer_id: Raw customer id as stored in `train_interactions`.
        n_recommendations: Maximum number of articles to return.

    Returns:
        List of `(article_id, score)` tuples, highest score first, excluding
        articles the customer already purchased. Returns an empty list when
        `n_recommendations` is not positive, the customer has no training
        interactions, or none of their purchases has content features.
    """
    if n_recommendations <= 0:
        return []

    purchased = train_interactions.loc[
        train_interactions['customer_id'] == customer_id, 'article_id'
    ].tolist()
    if not purchased:
        return []

    # Only purchases with a similarity row can contribute to the profile.
    matched_rows = [article_to_idx[article_id] for article_id in purchased
                    if article_id in article_to_idx]
    if not matched_rows:
        # No content signal at all: every score would be zero and the ranking arbitrary.
        return []

    similarity_scores = np.zeros(len(article_features))
    for row in matched_rows:
        similarity_scores += content_similarity[row]
    # Average over the rows that were actually summed, so purchases without
    # content features do not dilute the score.
    similarity_scores /= len(matched_rows)

    purchased_set = set(purchased)
    recommendations = []

    # Walk candidates from most to least similar and stop once enough were collected.
    for idx in np.argsort(similarity_scores)[::-1]:
        article_id = idx_to_article[idx]
        if article_id in purchased_set:
            continue
        recommendations.append((article_id, similarity_scores[idx]))
        if len(recommendations) >= n_recommendations:
            break

    return recommendations

# Smoke test: five content-based recommendations for the same sample customer.
sample_content_recommendations = get_content_based_recommendations(sample_customer, 5)

print(f"\nSample content-based recommendations for customer {sample_customer}:")
for rank, recommendation in enumerate(sample_content_recommendations, start=1):
    rec_article, rec_score = recommendation
    print(f"{rank}. Article {rec_article}: Score {rec_score:.4f}")

print("\nContent-Based Filtering model ready")

## Model 3: Purchase Prediction Classification

Build classification models to predict whether a customer will purchase a specific product.

In [None]:
print("=== Purchase Prediction Classification ===")

# Prepare features for classification
print("Preparing features for purchase prediction...")

# Select numerical and categorical features
numerical_features = [
    'price', 'age', 'recency', 'frequency', 'monetary',
    'purchase_diversity_score', 'price_sensitivity_index',
    'colour_preference_entropy', 'style_consistency_score'
]

categorical_features = [
    'club_member_status', 'fashion_news_frequency', 'sales_channel_id',
    'product_type_name', 'colour_group_name', 'department_name'
]

# Prepare training data
print("Preparing training features...")

# Positive examples: every row of the training frame is an actual purchase.
positive_samples = train_pd.copy()
positive_samples['purchased'] = 1

print(f"Positive samples (actual purchases): {len(positive_samples):,}")

# Negative examples: random (customer, article) pairs that never occur in training.
print("Creating negative samples...")

# Draw at most 100,000 candidate pairs to bound memory use.
n_negative_samples = min(len(positive_samples), 100000)

positive_pairs = set(zip(positive_samples['customer_id'], positive_samples['article_id']))

sample_customers = np.random.choice(all_customers, size=n_negative_samples, replace=True)
sample_articles = np.random.choice(all_articles, size=n_negative_samples, replace=True)

# Keep only the drawn pairs that are not real purchases.
negative_samples = [
    (customer_id, article_id)
    for customer_id, article_id in zip(sample_customers, sample_articles)
    if (customer_id, article_id) not in positive_pairs
]

print(f"Created {len(negative_samples):,} negative samples")

# Use at most half of the drawn budget to keep the frame small.
negative_pairs = pd.DataFrame(
    negative_samples[:n_negative_samples // 2],
    columns=['customer_id', 'article_id'],
)

# First training row per customer / per article, indexed by id, so every negative pair is
# resolved with an index lookup instead of a boolean scan of the whole training frame.
customer_rows = train_pd.drop_duplicates(subset='customer_id').set_index('customer_id', drop=False)
article_rows = train_pd.drop_duplicates(subset='article_id').set_index('article_id')

negative_customer_ids = negative_pairs['customer_id'].to_numpy()
negative_article_ids = negative_pairs['article_id'].to_numpy()

# Start from the customer's first row (customer-level features), then overwrite every
# article-level column with the values of the sampled article.
negative_df = customer_rows.loc[negative_customer_ids].reset_index(drop=True)
negative_df['article_id'] = negative_article_ids

# Columns that describe the article rather than the customer. `price` is included so a
# negative row does not keep the price of the unrelated article on the customer's first row.
# NOTE(review): assumes `price` is meaningful per article; it is taken from the article's
# first training row - confirm this matches how price should be defined for non-purchases.
article_level_columns = [
    'price', 'product_type_name', 'product_group_name',
    'graphical_appearance_name', 'colour_group_name',
    'perceived_colour_value_name', 'perceived_colour_master_name',
    'department_name', 'index_name', 'index_group_name',
    'section_name', 'garment_group_name'
]
for col in article_level_columns:
    if col in article_rows.columns:
        negative_df[col] = article_rows[col].reindex(negative_article_ids).to_numpy()

negative_df['purchased'] = 0

print(f"Negative samples dataframe: {negative_df.shape}")

# Balance the classes: sample as many positives as there are negatives.
classification_data = pd.concat([
    positive_samples.sample(n=len(negative_df), random_state=42),
    negative_df
], ignore_index=True)

print(f"Combined classification dataset: {classification_data.shape}")
print(f"Class distribution:")
print(classification_data['purchased'].value_counts())

In [None]:
# Prepare features for modeling
print("Preparing features for classification models...")

# Selecting by label raises KeyError if any listed feature is missing, so every feature
# column is guaranteed to exist from here on.
classification_features = classification_data[numerical_features + categorical_features].copy()

# Categorical gaps get an explicit 'Unknown' level.
for col in categorical_features:
    classification_features[col] = classification_features[col].fillna('Unknown')

# Encode categorical variables
print("Encoding categorical variables...")
encoded_features = classification_features.copy()

# One LabelEncoder per categorical column, kept so the same mapping can be reused later.
# The encoders see all rows: they only build an id <-> integer vocabulary.
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    encoded_features[col] = le.fit_transform(encoded_features[col].astype(str))
    label_encoders[col] = le

# Target and feature matrix. `X` holds encoded categoricals and the raw (unimputed,
# unscaled) numerical columns; imputation and scaling are applied per split below.
y = classification_data['purchased']
X = encoded_features

print(f"Final feature matrix shape: {X.shape}")
print(f"Features: {list(X.columns)}")

# Split BEFORE fitting any statistic-based preprocessing, so the validation rows do not
# influence the medians or the scaler (data leakage).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train = X_train.copy()
X_val = X_val.copy()

numerical_cols = [col for col in numerical_features if col in encoded_features.columns]

# Impute numerical gaps with medians computed on the training split only.
train_medians = X_train[numerical_cols].median()
X_train[numerical_cols] = X_train[numerical_cols].fillna(train_medians)
X_val[numerical_cols] = X_val[numerical_cols].fillna(train_medians)

# Fit the scaler on the training split and apply the same transform to validation.
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_val[numerical_cols] = scaler.transform(X_val[numerical_cols])

print(f"Training set: {X_train.shape}")
print(f"Validation set: {X_val.shape}")
print(f"Training class distribution: {y_train.value_counts().to_dict()}")
print(f"Validation class distribution: {y_val.value_counts().to_dict()}")

In [None]:
# Fit each candidate classifier on the training split and score it on the validation split.
print("Training classification models...")

models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42)
}

trained_models = {}
model_scores = {}

# (printed label, key in the scores dict), in reporting order.
metric_labels = (
    ("Accuracy", 'accuracy'),
    ("Precision", 'precision'),
    ("Recall", 'recall'),
    ("F1-score", 'f1_score'),
    ("AUC-ROC", 'auc_roc'),
)

for name, model in models.items():
    print(f"\nTraining {name}...")

    model.fit(X_train, y_train)

    # Hard labels for the threshold metrics, positive-class probability for AUC.
    y_pred = model.predict(X_val)
    y_pred_proba = model.predict_proba(X_val)[:, 1]

    metrics = {
        'accuracy': accuracy_score(y_val, y_pred),
        'precision': precision_score(y_val, y_pred),
        'recall': recall_score(y_val, y_pred),
        'f1_score': f1_score(y_val, y_pred),
        'auc_roc': roc_auc_score(y_val, y_pred_proba),
    }

    # Keep the fitted estimator plus its metrics and raw validation outputs.
    trained_models[name] = model
    model_scores[name] = {
        **metrics,
        'predictions': y_pred,
        'probabilities': y_pred_proba,
    }

    print(f"{name} Results:")
    for metric_label, metric_key in metric_labels:
        print(f"  {metric_label}: {metrics[metric_key]:.4f}")

print("\n=== Model Training Complete ===")

## Model 4: Hybrid Recommendation System

Combine collaborative filtering and content-based approaches for improved recommendations.

In [None]:
print("=== Hybrid Recommendation System ===")

def get_hybrid_recommendations(customer_id, n_recommendations=10, cf_weight=0.6, cb_weight=0.4):
    """
    Get hybrid recommendations combining collaborative filtering and content-based approaches

    Each source contributes its top `2 * n_recommendations` candidates. Scores are
    min-max normalised per source to the 0-1 range and blended as
    `cf_weight * cf_score + cb_weight * cb_score`; a candidate missing from one
    source gets 0 for that source.

    Args:
        customer_id: Customer ID
        n_recommendations: Number of recommendations to return
        cf_weight: Weight for collaborative filtering scores
        cb_weight: Weight for content-based scores

    Returns:
        List of `(article_id, hybrid_score)` tuples, highest score first, at most
        `n_recommendations` long. Candidates with equal hybrid scores keep a
        deterministic order: collaborative-filtering candidates first, then
        content-based ones, each in their source ranking.
    """
    if n_recommendations <= 0:
        return []

    def _min_max(raw_scores):
        # Rescale one source's scores to 0-1. When the source has a single candidate or
        # all scores are equal there is no range to scale by; map them to 1.0 so raw,
        # differently-scaled values are never blended with normalised ones.
        if not raw_scores:
            return {}
        low, high = min(raw_scores.values()), max(raw_scores.values())
        if high > low:
            return {key: (value - low) / (high - low) for key, value in raw_scores.items()}
        return {key: 1.0 for key in raw_scores}

    # Over-fetch from both recommenders so the blend has enough candidates to choose from.
    candidate_pool = n_recommendations * 2
    cf_scores = _min_max(dict(get_svd_recommendations(customer_id, candidate_pool)))
    cb_scores = _min_max(dict(get_content_based_recommendations(customer_id, candidate_pool)))

    # Ordered union of candidates (dict keys preserve insertion order, sets do not).
    candidates = list(dict.fromkeys([*cf_scores, *cb_scores]))

    hybrid_scores = [
        (article_id, cf_weight * cf_scores.get(article_id, 0) + cb_weight * cb_scores.get(article_id, 0))
        for article_id in candidates
    ]

    # Stable sort: ties keep the candidate order built above.
    hybrid_scores.sort(key=lambda pair: pair[1], reverse=True)
    return hybrid_scores[:n_recommendations]

# Smoke test: five hybrid recommendations for the same sample customer.
sample_hybrid_recommendations = get_hybrid_recommendations(sample_customer, 5)

print(f"Sample hybrid recommendations for customer {sample_customer}:")
for rank, recommendation in enumerate(sample_hybrid_recommendations, start=1):
    rec_article, rec_score = recommendation
    print(f"{rank}. Article {rec_article}: Score {rec_score:.4f}")

print("\nHybrid Recommendation System ready")

## Save Models and Preprocessing Objects

Save all trained models and preprocessing objects for evaluation and future use.

In [None]:
# Create models directory (including missing parents) if it does not exist yet
models_dir = Path("../models")
models_dir.mkdir(parents=True, exist_ok=True)

print("Saving models and preprocessing objects...")

# Everything needed to reproduce the recommenders and classifiers in a later notebook.
# NOTE(review): `content_similarity` is a dense article x article matrix, so the pickle
# can become very large - confirm the size is acceptable.
model_artifacts = {
    'svd_model': svd_model,
    'customer_encoder': customer_encoder,
    'article_encoder': article_encoder,
    'customer_factors': customer_factors,
    'article_factors': article_factors,
    'tfidf_vectorizer': tfidf_vectorizer,
    'content_similarity': content_similarity,
    'article_to_idx': article_to_idx,
    'idx_to_article': idx_to_article,
    'article_features': article_features,
    'trained_models': trained_models,
    'label_encoders': label_encoders,
    'scaler': scaler,
    'model_scores': model_scores,
    'train_interactions': train_interactions,
    'all_customers': all_customers,
    'all_articles': all_articles
}

# Save to pickle file. Only load this file from a trusted location: unpickling executes
# arbitrary code embedded in the file.
with open(models_dir / 'recommendation_models.pkl', 'wb') as f:
    pickle.dump(model_artifacts, f)

print(f"Models saved to: {models_dir / 'recommendation_models.pkl'}")

# Save model performance summary.
# Only scalar metrics go into the table: `model_scores` also stores per-row prediction and
# probability arrays, which would be written to the CSV as truncated array reprs.
metric_columns = ['accuracy', 'precision', 'recall', 'f1_score', 'auc_roc']
performance_summary = pd.DataFrame({
    model_name: {metric: float(scores[metric]) for metric in metric_columns}
    for model_name, scores in model_scores.items()
}).T
performance_summary.to_csv(models_dir / 'model_performance_summary.csv')

print(f"Performance summary saved to: {models_dir / 'model_performance_summary.csv'}")
print("\nModel Performance Summary:")
display(performance_summary[['accuracy', 'precision', 'recall', 'f1_score', 'auc_roc']])

print("\n=== Modelling Complete ===")
print(f"Models trained: {list(trained_models.keys())}")
print(f"Recommendation approaches: SVD Collaborative Filtering, Content-Based, Hybrid")
print(f"Ready for evaluation in model_evaluation.ipynb")