# H&M Data Modelling

This notebook combines the final feature datasets for modelling purposes, removing data quality tracking flags and preparing a clean combined dataset.


In [1]:
import sys
import os
sys.path.append('../')

import polars as pl
import pandas as pd
import numpy as np
from pathlib import Path

# Set display options
pl.Config.set_tbl_rows(20)
pl.Config.set_tbl_cols(15)

polars.config.Config

## Load and Examine Datasets

Loading the three final datasets:

- Articles features (final)
- Customers features with clusters
- Cleaned transactions


In [2]:
# Load datasets
print("Loading articles features...")
articles_df = pl.read_parquet("../data/features/final/articles_features_final.parquet")
print(f"Articles features shape: {articles_df.shape}")
print(f"Articles columns: {articles_df.columns[:10]}...")
print()

print("Loading customers features...")
customers_df = pl.read_parquet("../data/features/final/customers_features_with_clusters.parquet")
print(f"Customers features shape: {customers_df.shape}")
print(f"Customers columns: {customers_df.columns[:10]}...")
print()

print("Loading transactions data...")
transactions_df = pl.read_parquet("../data/cleaned/transactions_cleaned.parquet")
print(f"Transactions shape: {transactions_df.shape}")
print(f"Transactions columns: {transactions_df.columns}")

Loading articles features...
Articles features shape: (42298, 14)
Articles columns: ['article_id', 'product_type_name', 'product_group_name', 'graphical_appearance_name', 'colour_group_name', 'perceived_colour_value_name', 'perceived_colour_master_name', 'department_name', 'index_name', 'index_group_name']...

Loading customers features...
Customers features shape: (525075, 26)
Customers columns: ['customer_id', 'FN', 'Active', 'club_member_status', 'fashion_news_frequency', 'age', 'postal_code', 'recency', 'frequency', 'monetary']...

Loading transactions data...
Transactions shape: (3904391, 8)
Transactions columns: ['t_dat', 'customer_id', 'article_id', 'price', 'sales_channel_id', 'price_outlier_capped', 'sales_channel_corrected', 'price_percentile_calibrated']


## Examine Data Quality Flags

Identify columns that contain data quality tracking information to remove them.


In [3]:
# Examine column names for data quality flags
print("Articles columns:")
for col in articles_df.columns:
    if any(keyword in col.lower() for keyword in ['flag', 'quality', 'missing', 'outlier', 'imputed', 'cleaned']):
        print(f"  - {col} (potential quality flag)")
    else:
        print(f"  - {col}")
        
print("\nCustomers columns:")
for col in customers_df.columns:
    if any(keyword in col.lower() for keyword in ['flag', 'quality', 'missing', 'outlier', 'imputed', 'cleaned']):
        print(f"  - {col} (potential quality flag)")
    else:
        print(f"  - {col}")
        
print("\nTransactions columns:")
for col in transactions_df.columns:
    if any(keyword in col.lower() for keyword in ['flag', 'quality', 'missing', 'outlier', 'imputed', 'cleaned']):
        print(f"  - {col} (potential quality flag)")
    else:
        print(f"  - {col}")

Articles columns:
  - article_id
  - product_type_name
  - product_group_name
  - graphical_appearance_name
  - colour_group_name
  - perceived_colour_value_name
  - perceived_colour_master_name
  - department_name
  - index_name
  - index_group_name
  - section_name
  - garment_group_name
  - detail_desc
  - bert_cluster

Customers columns:
  - customer_id
  - FN
  - Active
  - club_member_status
  - fashion_news_frequency
  - age
  - postal_code
  - recency
  - frequency
  - monetary
  - purchase_diversity_score
  - price_sensitivity_index
  - colour_preference_entropy
  - style_consistency_score
  - dataset_created_at
  - created_by
  - rfm_cluster
  - preference_cluster
  - hybrid_cluster
  - behavioural_cluster
  - rfm_cluster_label
  - preference_cluster_label
  - hybrid_cluster_label
  - behavioural_cluster_label
  - clustering_analysis_date
  - clustering_version

Transactions columns:
  - t_dat
  - customer_id
  - article_id
  - price
  - sales_channel_id
  - price_outlier_cap

## Remove Data Quality Flags

Remove any columns that are related to data quality tracking.


In [4]:
# Function to filter out data quality columns
def remove_quality_columns(df, dataset_name):
    original_cols = df.columns
    quality_keywords = ['flag', 'quality', 'missing', 'outlier', 'imputed', 'cleaned', '_qc', '_dq']
    
    # Additional columns to remove for modelling (metadata and processing indicators)
    modelling_exclude_cols = [
        'clustering_analysis_date', 
        'clustering_version',
        'dataset_created_at',
        'sales_channel_corrected',
        'price_percentile_calibrated'
    ]
    
    # Keep columns that don't contain quality tracking keywords or modelling exclusions
    cols_to_keep = [
        col for col in original_cols 
        if not any(keyword in col.lower() for keyword in quality_keywords)
        and col not in modelling_exclude_cols
    ]
    
    removed_cols = [col for col in original_cols if col not in cols_to_keep]
    
    if removed_cols:
        print(f"Removed {len(removed_cols)} quality/metadata columns from {dataset_name}:")
        for col in removed_cols:
            print(f"  - {col}")
    else:
        print(f"No quality/metadata columns found in {dataset_name}")
    
    return df.select(cols_to_keep)

# Remove quality columns from each dataset
articles_clean = remove_quality_columns(articles_df, "articles")
print()
customers_clean = remove_quality_columns(customers_df, "customers")
print()
transactions_clean = remove_quality_columns(transactions_df, "transactions")

print(f"\nCleaned shapes:")
print(f"Articles: {articles_clean.shape}")
print(f"Customers: {customers_clean.shape}")
print(f"Transactions: {transactions_clean.shape}")

No quality/metadata columns found in articles

Removed 3 quality/metadata columns from customers:
  - dataset_created_at
  - clustering_analysis_date
  - clustering_version

Removed 3 quality/metadata columns from transactions:
  - price_outlier_capped
  - sales_channel_corrected
  - price_percentile_calibrated

Cleaned shapes:
Articles: (42298, 14)
Customers: (525075, 23)
Transactions: (3904391, 5)


## Combine Datasets

Join transactions with customer and article features to create the final modelling dataset.


In [5]:
# Check join keys
print("Join key analysis:")
print(f"Transactions customer_id unique count: {transactions_clean['customer_id'].n_unique()}")
print(f"Customers customer_id unique count: {customers_clean['customer_id'].n_unique()}")
print(f"Transactions article_id unique count: {transactions_clean['article_id'].n_unique()}")
print(f"Articles article_id unique count: {articles_clean['article_id'].n_unique()}")

print("\nStarting joins...")
# Join transactions with customers
combined_df = transactions_clean.join(
    customers_clean,
    on="customer_id",
    how="left"
)
print(f"After customer join: {combined_df.shape}")

# Join with articles
combined_df = combined_df.join(
    articles_clean,
    on="article_id",
    how="left"
)
print(f"Final combined shape: {combined_df.shape}")
print(f"Total columns: {len(combined_df.columns)}")

Join key analysis:
Transactions customer_id unique count: 525075
Customers customer_id unique count: 525075
Transactions article_id unique count: 42298
Articles article_id unique count: 42298

Starting joins...
After customer join: (3904391, 27)
Final combined shape: (3904391, 40)
Total columns: 40


## Data Quality Check

Quick check of the combined dataset for any issues.


In [6]:
# Basic quality check
print("Combined dataset summary:")
print(f"Shape: {combined_df.shape}")
print(f"Memory usage: {combined_df.estimated_size('mb'):.1f} MB")

# Check for null values in key columns
null_counts = combined_df.null_count()
high_null_cols = [
    (col, count) for col, count in zip(null_counts.columns, null_counts.row(0))
    if count > 0
]

if high_null_cols:
    print(f"\nColumns with null values (showing first 10):")
    for col, count in sorted(high_null_cols, key=lambda x: x[1], reverse=True)[:10]:
        percentage = (count / combined_df.shape[0]) * 100
        print(f"  {col}: {count:,} ({percentage:.1f}%)")
else:
    print("\nNo null values found")

# Show first few rows
print(f"\nFirst 3 rows (first 10 columns):")
print(combined_df.select(combined_df.columns[:10]).head(3))

Combined dataset summary:
Shape: (3904391, 40)
Memory usage: 1445.0 MB

Columns with null values (showing first 10):
  bert_cluster: 2,460 (0.1%)

First 3 rows (first 10 columns):
shape: (3, 10)
┌───────────┬───────────┬───────────┬───────┬──────────┬─────┬────────┬──────────┬──────────┬──────┐
│ t_dat     ┆ customer_ ┆ article_i ┆ price ┆ sales_ch ┆ FN  ┆ Active ┆ club_mem ┆ fashion_ ┆ age  │
│ ---       ┆ id        ┆ d         ┆ ---   ┆ annel_id ┆ --- ┆ ---    ┆ ber_stat ┆ news_fre ┆ ---  │
│ date      ┆ ---       ┆ ---       ┆ f64   ┆ ---      ┆ f64 ┆ f64    ┆ us       ┆ quency   ┆ f64  │
│           ┆ str       ┆ i64       ┆       ┆ i64      ┆     ┆        ┆ ---      ┆ ---      ┆      │
│           ┆           ┆           ┆       ┆          ┆     ┆        ┆ cat      ┆ cat      ┆      │
╞═══════════╪═══════════╪═══════════╪═══════╪══════════╪═════╪════════╪══════════╪══════════╪══════╡
│ 2020-06-2 ┆ 0001b0127 ┆ 844409002 ┆ 14.33 ┆ 1        ┆ 1.0 ┆ 1.0    ┆ ACTIVE   ┆ Regularl ┆ 20.0

## Save Combined Dataset

Create the modelling_data directory and save the combined dataset.


In [7]:
# Create modelling_data directory
modelling_dir = Path("../data/modelling_data")
modelling_dir.mkdir(exist_ok=True)
print(f"Created directory: {modelling_dir.absolute()}")

# Save combined dataset
output_path = modelling_dir / "combined_modelling_data.parquet"
print(f"Saving combined dataset to: {output_path}")

combined_df.write_parquet(output_path)
print(f"✓ Saved combined dataset: {combined_df.shape}")

# Verify file size
file_size_mb = output_path.stat().st_size / (1024 * 1024)
print(f"File size: {file_size_mb:.1f} MB")

Created directory: c:\Users\tom\coding_projects\data_analytics_projects\h_and_m_data_analysis\notebooks\..\data\modelling_data
Saving combined dataset to: ..\data\modelling_data\combined_modelling_data.parquet
✓ Saved combined dataset: (3904391, 40)
File size: 2014.4 MB


## Generate Data Report

Create a data report for the combined modelling dataset.


In [8]:
# Import data report generator
from hnm_data_analysis.data_understanding.data_report_generator import generate_data_report

# Create modelling documentation directory
modelling_doc_dir = Path("../results/data_documentation/modelling")
modelling_doc_dir.mkdir(parents=True, exist_ok=True)
print(f"Created documentation directory: {modelling_doc_dir.absolute()}")

# Generate report for the combined modelling dataset
report_output_path = modelling_doc_dir / "combined_modelling_data_report.md"
report_path = generate_data_report(
    str(output_path.absolute()),
    output_path=str(report_output_path.absolute())
)

print(f"✓ Data report generated: {report_path}")

Created documentation directory: c:\Users\tom\coding_projects\data_analytics_projects\h_and_m_data_analysis\notebooks\..\results\data_documentation\modelling
Data report generated successfully!
Analysed file: c:\Users\tom\coding_projects\data_analytics_projects\h_and_m_data_analysis\notebooks\..\data\modelling_data\combined_modelling_data.parquet
Report saved to: c:\Users\tom\coding_projects\data_analytics_projects\h_and_m_data_analysis\notebooks\..\results\data_documentation\modelling\combined_modelling_data_report.md
✓ Data report generated: c:\Users\tom\coding_projects\data_analytics_projects\h_and_m_data_analysis\notebooks\..\results\data_documentation\modelling\combined_modelling_data_report.md


## Data Combination Summary

The combined modelling dataset has been successfully created with:

- Transactions data as the base
- Customer features and clusters joined
- Article features joined
- Data quality tracking columns removed
- Data report generated

The dataset is ready for modelling tasks.


## Train/Test Split

Split the combined modelling dataset into training and test sets for machine learning model development.


In [9]:
# Load the combined dataset
print("Loading combined modelling dataset...")
combined_df = pl.read_parquet("../data/modelling_data/combined_modelling_data.parquet")
print(f"Dataset shape: {combined_df.shape}")

# Create a stratified split based on customer_id to ensure customers are not split across train/test
# This prevents data leakage where the same customer appears in both training and test sets
print("\nCreating customer-based train/test split...")

# Get unique customers
unique_customers = combined_df.select("customer_id").unique()
print(f"Unique customers: {unique_customers.shape[0]:,}")

# Create random split of customers (80/20 split)
np.random.seed(42)  # For reproducibility
customer_indices = np.arange(unique_customers.shape[0])
np.random.shuffle(customer_indices)

# Split customers 80/20
split_idx = int(0.8 * len(customer_indices))
train_customer_indices = customer_indices[:split_idx]
test_customer_indices = customer_indices[split_idx:]

print(f"Training customers: {len(train_customer_indices):,}")
print(f"Test customers: {len(test_customer_indices):,}")

# Get customer IDs for each split
train_customers = unique_customers[train_customer_indices]["customer_id"]
test_customers = unique_customers[test_customer_indices]["customer_id"]

# Create training dataset
train_df = combined_df.filter(pl.col("customer_id").is_in(train_customers))
print(f"Training dataset shape: {train_df.shape}")

# Create test dataset  
test_df = combined_df.filter(pl.col("customer_id").is_in(test_customers))
print(f"Test dataset shape: {test_df.shape}")

# Verify no customer overlap
train_customer_set = set(train_df["customer_id"].unique())
test_customer_set = set(test_df["customer_id"].unique())
overlap = train_customer_set.intersection(test_customer_set)
print(f"Customer overlap between train/test: {len(overlap)} (should be 0)")

# Show split statistics
total_transactions = combined_df.shape[0]
train_pct = (train_df.shape[0] / total_transactions) * 100
test_pct = (test_df.shape[0] / total_transactions) * 100
print(f"\nSplit percentages:")
print(f"Training: {train_pct:.1f}% ({train_df.shape[0]:,} transactions)")
print(f"Test: {test_pct:.1f}% ({test_df.shape[0]:,} transactions)")

Loading combined modelling dataset...
Dataset shape: (3904391, 40)

Creating customer-based train/test split...
Unique customers: 525,075
Training customers: 420,060
Test customers: 105,015


Please use `implode` to return to previous behavior.

See https://github.com/pola-rs/polars/issues/22149 for more information.
  train_df = combined_df.filter(pl.col("customer_id").is_in(train_customers))


Training dataset shape: (3122370, 40)


Please use `implode` to return to previous behavior.

See https://github.com/pola-rs/polars/issues/22149 for more information.
  test_df = combined_df.filter(pl.col("customer_id").is_in(test_customers))


Test dataset shape: (782021, 40)
Customer overlap between train/test: 0 (should be 0)

Split percentages:
Training: 80.0% (3,122,370 transactions)
Test: 20.0% (782,021 transactions)


## Save Train/Test Datasets

Save the training and test datasets to separate files for model development.


In [10]:
# Save training dataset
train_path = modelling_dir / "train_data.parquet"
print(f"Saving training dataset to: {train_path}")
train_df.write_parquet(train_path)
train_size_mb = train_path.stat().st_size / (1024 * 1024)
print(f"✓ Training dataset saved: {train_df.shape}, {train_size_mb:.1f} MB")

# Save test dataset
test_path = modelling_dir / "test_data.parquet"
print(f"Saving test dataset to: {test_path}")
test_df.write_parquet(test_path)
test_size_mb = test_path.stat().st_size / (1024 * 1024)
print(f"✓ Test dataset saved: {test_df.shape}, {test_size_mb:.1f} MB")

# Save customer splits for reference
train_customers_path = modelling_dir / "train_customers.parquet"
test_customers_path = modelling_dir / "test_customers.parquet"

train_customers.to_frame().write_parquet(train_customers_path)
test_customers.to_frame().write_parquet(test_customers_path)

print(f"✓ Customer ID splits saved")
print(f"  - Training customers: {train_customers_path}")
print(f"  - Test customers: {test_customers_path}")

# Summary
print(f"\n=== Train/Test Split Summary ===")
print(f"Original dataset: {combined_df.shape[0]:,} transactions, {combined_df.shape[1]} features")
print(f"Training set: {train_df.shape[0]:,} transactions ({train_pct:.1f}%)")
print(f"Test set: {test_df.shape[0]:,} transactions ({test_pct:.1f}%)")
print(f"Customer split: {len(train_customers):,} / {len(test_customers):,}")
print(f"Files saved to: {modelling_dir.absolute()}")

Saving training dataset to: ..\data\modelling_data\train_data.parquet
✓ Training dataset saved: (3122370, 40), 1451.1 MB
Saving test dataset to: ..\data\modelling_data\test_data.parquet
✓ Test dataset saved: (782021, 40), 428.1 MB
✓ Customer ID splits saved
  - Training customers: ..\data\modelling_data\train_customers.parquet
  - Test customers: ..\data\modelling_data\test_customers.parquet

=== Train/Test Split Summary ===
Original dataset: 3,904,391 transactions, 40 features
Training set: 3,122,370 transactions (80.0%)
Test set: 782,021 transactions (20.0%)
Customer split: 420,060 / 105,015
Files saved to: c:\Users\tom\coding_projects\data_analytics_projects\h_and_m_data_analysis\notebooks\..\data\modelling_data
