# Clustering Experiments with Categorical Features

This notebook clusters articles on the categorical features of the final dataset. Every column except `article_id` and `bert_cluster` is one-hot encoded, reduced with PCA, and then clustered with `ArticleClusterer`.


In [1]:
# Relative paths in this notebook are written from the project root, so move
# up one level when the kernel was started inside the notebooks/ folder.
import os

launch_dir = os.getcwd()
if os.path.basename(launch_dir) == 'notebooks':
    os.chdir('..')
print('CWD:', os.getcwd())

CWD: /Users/tom/Data Analysis Projects/h_and_m_data_analysis


In [2]:
import polars as pl
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from pathlib import Path

# Import the ArticleClusterer module
from hnm_data_analysis.clustering.article_clustering import ArticleClusterer, ClusteringConfig

In [3]:
# Read the final per-article feature table from parquet into a polars frame
data_path = 'data/features/final/articles_features_final.parquet'
df = pl.read_parquet(data_path)

# Report the size and the column names of what was loaded
n_rows, n_cols = df.shape
print(f'Loaded data: {n_rows:,} rows x {n_cols} columns')
print(f'Columns: {df.columns}')

Loaded data: 42,298 rows x 14 columns
Columns: ['article_id', 'product_type_name', 'product_group_name', 'graphical_appearance_name', 'colour_group_name', 'perceived_colour_value_name', 'perceived_colour_master_name', 'department_name', 'index_name', 'index_group_name', 'section_name', 'garment_group_name', 'detail_desc', 'bert_cluster']


In [4]:
# Cluster on every column except the identifier and the existing BERT cluster label
exclude_cols = ['article_id', 'bert_cluster']
feature_cols = []
for column_name in df.columns:
    if column_name in exclude_cols:
        continue
    feature_cols.append(column_name)

print(f'Feature columns ({len(feature_cols)}): {feature_cols}')
print(f'Total articles for clustering: {df.height:,}')

Feature columns (12): ['product_type_name', 'product_group_name', 'graphical_appearance_name', 'colour_group_name', 'perceived_colour_value_name', 'perceived_colour_master_name', 'department_name', 'index_name', 'index_group_name', 'section_name', 'garment_group_name', 'detail_desc']
Total articles for clustering: 42,298


In [5]:
# Hand the data to sklearn as pandas/numpy objects (all articles are kept)
df_features = df.select(feature_cols).to_pandas()

# Article ids as a numpy array, taken from the same frame so the row order matches
article_id_frame = df.select('article_id').to_pandas()
article_ids = article_id_frame['article_id'].to_numpy()

print(f'Feature matrix shape: {df_features.shape}')
print(f'Article IDs: {len(article_ids)}')
print('\nFeature data types:')
print(df_features.dtypes)

Feature matrix shape: (42298, 12)
Article IDs: 42298

Feature data types:
product_type_name               category
product_group_name              category
graphical_appearance_name       category
colour_group_name               category
perceived_colour_value_name     category
perceived_colour_master_name    category
department_name                 category
index_name                      category
index_group_name                category
section_name                    category
garment_group_name              category
detail_desc                     category
dtype: object


In [6]:
# One-hot encode the categorical features.
# OneHotEncoder is already imported in the notebook's import cell, so it is
# not imported a second time here.

# Every selected feature column is treated as categorical
categorical_cols = feature_cols
print(f'Categorical columns ({len(categorical_cols)}): {categorical_cols}')

# drop='first' removes one indicator column per feature.
# NOTE(review): sparse_output=False materialises the full dense matrix; with a
#   high-cardinality column such as detail_desc this is tens of thousands of
#   columns wide - confirm the memory cost is acceptable.
# NOTE(review): combining drop='first' with handle_unknown='ignore' means a
#   category unseen at transform time is encoded as all zeros, i.e. the same
#   as the dropped first category - confirm this is acceptable, since the
#   fitted encoder is persisted later in the notebook.
encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')

# Learn the categories and encode all articles in one step
X_processed = encoder.fit_transform(df_features)
print(f'\nProcessed feature matrix shape: {X_processed.shape}')

Categorical columns (12): ['product_type_name', 'product_group_name', 'graphical_appearance_name', 'colour_group_name', 'perceived_colour_value_name', 'perceived_colour_master_name', 'department_name', 'index_name', 'index_group_name', 'section_name', 'garment_group_name', 'detail_desc']

Processed feature matrix shape: (42298, 20307)


In [7]:
# Names of the one-hot columns produced by the fitted encoder
feature_names = encoder.get_feature_names_out(categorical_cols)

n_encoded_features = len(feature_names)
name_preview = feature_names[:10]
print(f'Total features after encoding: {n_encoded_features}')
print(f'First 10 feature names: {name_preview}')

Total features after encoding: 20307
First 10 feature names: ['product_type_name_Alice band' 'product_type_name_Baby Bib'
 'product_type_name_Backpack' 'product_type_name_Bag'
 'product_type_name_Ballerinas' 'product_type_name_Beanie'
 'product_type_name_Belt' 'product_type_name_Bikini top'
 'product_type_name_Blazer' 'product_type_name_Blouse']


In [8]:
# Reduce the one-hot matrix with PCA (similar to the BERT notebook) and save
# the result as the feature file that ArticleClusterer is pointed at.

# PCA accepts at most min(n_samples, n_features) components, so the target of
# 50 is capped by both dimensions of the matrix, not by the feature count alone.
n_components = min(50, *X_processed.shape)
pca = PCA(n_components=n_components, random_state=42)
X_pca = pca.fit_transform(X_processed)

print(f'PCA reduced features to {X_pca.shape[1]} components')
print(f'Explained variance ratio (first 10): {pca.explained_variance_ratio_[:10]}')
print(f'Total explained variance: {pca.explained_variance_ratio_.sum():.3f}')

# Directory for the categorical feature file; created if it does not exist
categorical_features_dir = Path('data/features/categorical')
categorical_features_dir.mkdir(parents=True, exist_ok=True)

# One row per article: article_id followed by zero-padded pca_000, pca_001, ... columns
feature_column_names = [f'pca_{i:03d}' for i in range(X_pca.shape[1])]
categorical_features_df = pl.DataFrame({
    'article_id': article_ids,
    **{name: X_pca[:, i] for i, name in enumerate(feature_column_names)}
})

categorical_features_path = categorical_features_dir / 'pca_categorical_features.parquet'
categorical_features_df.write_parquet(categorical_features_path)
print(f'Saved categorical features to: {categorical_features_path}')

PCA reduced features to 50 components
Explained variance ratio (first 10): [0.06405741 0.06039663 0.04492148 0.03922934 0.0390047  0.03025237
 0.02824735 0.02521067 0.02293512 0.02048241]
Total explained variance: 0.677
Saved categorical features to: data/features/categorical/pca_categorical_features.parquet


In [9]:
# Build an ArticleClusterer that reads the PCA feature file saved earlier and
# the final article table as its metadata source
articles_metadata_file = 'data/features/final/articles_features_final.parquet'
clusterer = ArticleClusterer(
    features_path=str(categorical_features_path),
    articles_metadata_path=articles_metadata_file,
)

# Load the feature matrix with its article ids, then the metadata table
features, article_ids_loaded = clusterer.load_features()
clusterer.load_articles_metadata()

Loaded features: 42,298 articles x 50 features
Loaded articles metadata: 42,298 articles


article_id,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,index_group_name,section_name,garment_group_name,detail_desc,bert_cluster
i64,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,i64
652356082,"""Underwear bottom""","""Underwear""","""All over pattern""","""Orange""","""Dusty Light""","""Orange""","""Underwear Woven""","""Menswear""","""Menswear""","""Men Underwear""","""Under-, Nightwear""","""Boxer shorts in a cotton weave…",10
212629004,"""Dress""","""Garment Full body""","""Solid""","""Black""","""Dark""","""Black""","""Basic 1""","""Divided""","""Divided""","""Divided Basics""","""Jersey Basic""","""Long, sleeveless dress in jers…",28
619601011,"""Robe""","""Underwear""","""Mixed solid/pattern""","""Light Yellow""","""Light""","""Yellow""","""Kids Girl UW/NW""","""Children Sizes 92-140""","""Baby/Children""","""Girls Underwear & Basics""","""Under-, Nightwear""","""Dressing gown in terry with a …",29
680846003,"""Blouse""","""Garment Upper body""","""Solid""","""Light Pink""","""Dusty Light""","""Pink""","""Campaigns""","""Ladieswear""","""Ladieswear""","""Womens Everyday Collection""","""Special Offers""","""Blouse in airy crêpe with a sm…",14
777020002,"""Shorts""","""Garment Lower body""","""Solid""","""Light Turquoise""","""Light""","""Turquoise""","""Trousers & Skirt""","""Ladieswear""","""Ladieswear""","""Womens Trend""","""Trousers""","""Shorts in an airy, crinkled ny…",18
…,…,…,…,…,…,…,…,…,…,…,…,…,…
428291007,"""Hat/beanie""","""Accessories""","""Solid""","""Grey""","""Dusty Light""","""Grey""","""Knit & Woven""","""Divided""","""Divided""","""Divided Accessories""","""Accessories""","""Knitted headband containing so…",29
796671002,"""Trousers""","""Garment Lower body""","""All over pattern""","""Dark Blue""","""Dark""","""Blue""","""Kids Girl Trouser""","""Children Sizes 92-140""","""Baby/Children""","""Kids Girl""","""Trousers""","""Treggings in patterned stretch…",24
814594005,"""T-shirt""","""Garment Upper body""","""Front print""","""White""","""Light""","""White""","""Young Boy Jersey Fancy""","""Children Sizes 134-170""","""Baby/Children""","""Young Boy""","""Jersey Fancy""","""Classic T-shirt in soft cotton…",23
624444001,"""Trousers""","""Garment Lower body""","""All over pattern""","""Light Pink""","""Dusty Light""","""Pink""","""Newborn""","""Baby Sizes 50-98""","""Baby/Children""","""Baby Essentials & Complements""","""Jersey Fancy""","""Trousers in soft organic cotto…",2


In [None]:
# Directory that receives the artefacts of this experiment
out_dir = Path('results/categorical_clustering')
out_dir.mkdir(parents=True, exist_ok=True)

# Scan k over [k_min, k_max] once per selection method and save one plot each
k_min, k_max = 2, 50
k_selection = {}
for position, method in enumerate(('elbow', 'silhouette')):
    # Only the second announcement is preceded by a blank line
    lead = '\n' if position else ''
    print(f'{lead}Finding optimal k using {method} method...')
    best_k, scores = clusterer.find_optimal_k(k_range=(k_min, k_max), method=method)
    clusterer.plot_k_selection(
        scores,
        method=method,
        optimal_k=best_k,
        save_path=str(out_dir / f'optimal_k_{method}_{k_min}_{k_max}.png'),
    )
    k_selection[method] = (best_k, scores)

opt_k_elbow, scores_elbow = k_selection['elbow']
opt_k_sil, scores_sil = k_selection['silhouette']

print('\nOptimal k recommendations:')
print(f'Elbow: {opt_k_elbow}')
print(f'Silhouette: {opt_k_sil}')

# The silhouette recommendation is the one carried forward
recommended_k = opt_k_sil
print(f'\nUsing k={recommended_k} (silhouette method)')

Finding optimal k using elbow method...
Finding optimal k using elbow method with kmeans...
k=2: elbow score = 254145.6875
k=3: elbow score = 237213.7188
k=4: elbow score = 226344.6562
k=5: elbow score = 216021.7031
k=6: elbow score = 208342.3750
k=7: elbow score = 203176.0781
k=8: elbow score = 196725.6875
k=9: elbow score = 191618.9688
k=10: elbow score = 185596.8281
k=11: elbow score = 180482.2188
k=12: elbow score = 176534.6406
k=13: elbow score = 172202.8906
k=14: elbow score = 168202.8438
k=15: elbow score = 165975.5625
k=16: elbow score = 162828.1250
k=17: elbow score = 159881.0000
k=18: elbow score = 156579.0938
k=19: elbow score = 154867.2188
k=20: elbow score = 153830.9375
k=21: elbow score = 151611.7188
k=22: elbow score = 149411.5000
k=23: elbow score = 148357.8750
k=24: elbow score = 146099.5781
k=25: elbow score = 143494.5156
k=26: elbow score = 142293.4375
k=27: elbow score = 142342.1094
k=28: elbow score = 139780.6875
k=29: elbow score = 139347.4062
k=30: elbow score = 

In [None]:
# Run the final K-means clustering at the recommended k; the metrics printed
# below are read from the result object instead of being computed by hand
print(f'Performing final clustering with k={recommended_k}')
config = ClusteringConfig(algorithm='kmeans', n_clusters=recommended_k)
results = clusterer.cluster(config)

print('Final clustering results:')
print(f'Number of clusters: {results.n_clusters}')
for metric_label, metric_value in (
    ('Silhouette Score', results.silhouette),
    ('Calinski-Harabasz Index', results.calinski_harabasz),
    ('Davies-Bouldin Index', results.davies_bouldin),
):
    print(f'{metric_label}: {metric_value:.4f}')

# Size of each cluster, absolute and as a share of all labelled articles
unique, counts = np.unique(results.labels, return_counts=True)
n_labelled = len(results.labels)
print('\nCluster distribution:')
for cluster_id, count in zip(unique, counts):
    share = count / n_labelled * 100
    print(f'Cluster {cluster_id}: {count} items ({share:.1f}%)')

In [None]:
# No second clustering run here: these are convenience aliases for the
# results of the final clustering performed in the previous cell
cluster_labels = results.labels
recommended_k = results.n_clusters
silhouette = results.silhouette
calinski_harabasz = results.calinski_harabasz
davies_bouldin = results.davies_bouldin

# The header used to be printed with nothing after it; follow it with the
# values that the aliases above now hold
print('Using clustering results from ArticleClusterer:')
print(f'  k={recommended_k}, silhouette={silhouette:.4f}, '
      f'calinski_harabasz={calinski_harabasz:.4f}, davies_bouldin={davies_bouldin:.4f}')

In [None]:
# Plot the clusters with ArticleClusterer's own visualisation method, once per
# projection, saving each figure into the results directory
pca_plot_path = out_dir / 'clusters_pca.png'
tsne_plot_path = out_dir / 'clusters_tsne.png'
clusterer.visualise_clusters(method='pca', save_path=str(pca_plot_path))
clusterer.visualise_clusters(method='tsne', save_path=str(tsne_plot_path))

In [None]:
# Summarise each cluster with ArticleClusterer's interpretation helper
cluster_interpretations = clusterer.interpret_clusters()

# Print the size and the leading category values of the first 3 clusters
print('Sample cluster interpretations:')
top_prefix = 'top_'
for cluster_id in list(cluster_interpretations.keys())[:3]:
    interpretation = cluster_interpretations[cluster_id]
    print(f'\nCluster {cluster_id}:')
    print(f'  Size: {interpretation["size"]} articles ({interpretation["percentage"]:.1f}%)')

    # Entries whose key starts with 'top_' and whose value is a dict are
    # shown as category summaries
    for key, values in interpretation.items():
        if key.startswith(top_prefix) and isinstance(values, dict):
            # Strip only the leading prefix: str.replace would also delete any
            # later 'top_' inside the column name (e.g. 'desktop_...')
            category_name = key[len(top_prefix):].replace('_', ' ').title()
            print(f'  {category_name}: {dict(list(values.items())[:3])}')  # first 3 entries

# Print a few randomly chosen articles from a few randomly chosen clusters
def display_cluster_samples(clusterer_obj, labels, n_clusters_sample=3, n_articles_per_cluster=3, seed=None):
    """Print randomly sampled articles for a random subset of clusters.

    Args:
        clusterer_obj: Object exposing ``article_ids`` (indexable by row
            position) and ``articles_metadata`` (a polars DataFrame with an
            ``article_id`` column, or None to skip the per-article table).
        labels: Cluster label per row, aligned with ``clusterer_obj.article_ids``.
        n_clusters_sample: Maximum number of clusters to show.
        n_articles_per_cluster: Maximum number of articles to show per cluster.
        seed: Optional seed for reproducible sampling. When None (the default)
            the global ``np.random`` state is used, as before.

    Returns:
        None. Everything is printed.
    """
    # A dedicated generator makes the samples repeatable when a seed is given;
    # without one, fall back to the module-level numpy random state
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Element-wise comparison below needs an ndarray (a plain list would not work)
    labels = np.asarray(labels)

    unique_clusters = np.unique(labels)
    sampled_clusters = rng.choice(unique_clusters,
                                  size=min(n_clusters_sample, len(unique_clusters)),
                                  replace=False)

    print(f'\nDetailed samples from clusters: {sampled_clusters}')

    for cluster_id in sampled_clusters:
        cluster_indices = np.where(labels == cluster_id)[0]
        if len(cluster_indices) == 0:
            continue

        # Sample without replacement, capped by the cluster size
        sample_indices = rng.choice(cluster_indices,
                                    size=min(n_articles_per_cluster, len(cluster_indices)),
                                    replace=False)
        sampled_article_ids = [clusterer_obj.article_ids[i] for i in sample_indices]

        print(f'\nCluster {cluster_id} — {len(cluster_indices)} items; showing {len(sampled_article_ids)} samples')

        # Show the metadata rows of the sampled articles, when metadata is loaded
        if clusterer_obj.articles_metadata is not None:
            sample_df = clusterer_obj.articles_metadata.filter(pl.col('article_id').is_in(sampled_article_ids))
            display_cols = ['article_id', 'product_group_name', 'product_type_name',
                            'department_name', 'garment_group_name', 'colour_group_name',
                            'graphical_appearance_name', 'detail_desc', 'bert_cluster']

            # Only select columns that exist in the dataframe
            existing_cols = [col for col in display_cols if col in sample_df.columns]
            print(sample_df.select(existing_cols).to_pandas())

# Display sample clusters, using the function's defaults of up to 3 clusters
# and up to 3 articles per cluster
display_cluster_samples(clusterer, cluster_labels)

In [None]:
# Let ArticleClusterer write its own results into the output directory
clusterer.save_results(str(out_dir))

# Store the fitted preprocessing objects next to them for reproducibility
encoder_path = out_dir / 'encoder.joblib'
joblib.dump(encoder, encoder_path)
print(f'Saved encoder to: {encoder_path}')

pca_model_path = out_dir / 'pca_model.joblib'
joblib.dump(pca, pca_model_path)
print(f'Saved PCA model to: {pca_model_path}')

# Describe how the categorical features were prepared, as a JSON side file
import json
preprocessing_metadata = dict(
    preprocessing_method='one_hot_encoding_plus_pca',
    n_features_original=int(X_processed.shape[1]),
    n_features_pca=int(X_pca.shape[1]),
    n_articles=int(len(article_ids)),
    explained_variance_ratio=float(pca.explained_variance_ratio_.sum()),
    feature_columns_used=feature_cols,
    categorical_columns=categorical_cols,
    categorical_features_file=str(categorical_features_path),
)

preprocessing_metadata_path = out_dir / 'preprocessing_metadata.json'
preprocessing_metadata_path.write_text(json.dumps(preprocessing_metadata, indent=2))
print(f'Saved preprocessing metadata to: {preprocessing_metadata_path}')

In [None]:
# Attach the categorical cluster labels to the article table, save the result,
# and generate a data report for it
print('Creating final dataset with categorical cluster labels...')

# One row per clustered article: its id and its categorical cluster label
cat_labels_pl = pl.DataFrame({
    'article_id': clusterer.article_ids,
    'categorical_cluster': results.labels,
})

# Left join keeps every row of the original dataset
final_dataset = df.join(cat_labels_pl, on='article_id', how='left')

# Row counts per kind of cluster label, reused by the summary further down
has_categorical = pl.col('categorical_cluster').is_not_null()
has_bert = pl.col('bert_cluster').is_not_null()
n_categorical = final_dataset.filter(has_categorical).height
n_bert = final_dataset.filter(has_bert).height

print(f'Final dataset shape: {final_dataset.shape}')
print(f'Articles with categorical clusters: {n_categorical:,}')
print(f'Articles with BERT clusters: {n_bert:,}')

# Rows that carry both kinds of label
both_clusters = final_dataset.filter(has_categorical & has_bert).height
print(f'Articles with both cluster types: {both_clusters:,}')

# Write the enhanced dataset
output_path = Path('data/features/final/articles_features_with_clusters.parquet')
final_dataset.write_parquet(output_path)
print(f'Saved enhanced dataset to: {output_path}')

print('\nDataset summary:')
print('BERT clusters:', n_bert)
print('Categorical clusters:', n_categorical)
print('\nCluster value ranges:')
for range_title, cluster_column in (
    ('BERT cluster range:', 'bert_cluster'),
    ('Categorical cluster range:', 'categorical_cluster'),
):
    value_range = final_dataset.select(
        pl.col(cluster_column).min().alias('min'),
        pl.col(cluster_column).max().alias('max'),
    )
    print(range_title, value_range)

# Generate data report for the final enhanced dataset
from hnm_data_analysis.data_understanding.data_report_generator import generate_data_report
print('\nGenerating data report for enhanced dataset...')
report_path = generate_data_report(str(output_path))
print(f'Data report saved to: {report_path}')

In [None]:
from hnm_data_analysis.data_understanding.data_report_generator import generate_data_report

# Build the data report for the dataset that carries both cluster columns and
# print whatever generate_data_report returns
enhanced_dataset_file = "data/features/final/articles_features_with_clusters.parquet"
print(generate_data_report(enhanced_dataset_file))