# SVD-only Article Clustering

This notebook clusters H&M articles using only the SVD (LSA) embeddings derived from the TF‑IDF representation of the product descriptions.

- Loads dense SVD vectors (no categorical features)
- Selects k via the elbow method
- Performs clustering and interpretation using product metadata
- Visualises clusters with PCA and t‑SNE

All identifiers, comments, and Markdown use UK spellings.


In [None]:
# Imports and configuration
import sys
sys.path.append('../') 
import os
import polars as pl
import numpy as np

from hnm_data_analysis.clustering.article_clustering import ArticleClusterer, ClusteringConfig

# Absolute project root; every other location below is derived from it
BASE_DIR = "/Users/tom/Data Analysis Projects/h_and_m_data_analysis"

# Inputs: pre-computed feature files and the cleaned articles table
FEATURES_DIR = f"{BASE_DIR}/data/processed/features"
SVD_PATH = f"{FEATURES_DIR}/svd_embeddings.parquet"
ARTICLE_ID_INDEX = f"{FEATURES_DIR}/article_id_index.csv"
ARTICLES_PATH = f"{BASE_DIR}/data/cleaned/articles_last_3_months_cleaned.parquet"

# Outputs: created up front so later cells can write into it without checks
RESULTS_DIR = f"{BASE_DIR}/results/svd_clustering"
os.makedirs(RESULTS_DIR, exist_ok=True)

print("Base directory:", BASE_DIR)


In [None]:
# 1) Load SVD embeddings and align article IDs
svd_df = pl.read_parquet(SVD_PATH)
print("SVD shape:", svd_df.shape)

# Validate the schema with explicit exceptions rather than `assert`:
# assertions are removed when Python runs with -O, which would let a
# malformed file through unnoticed.
if "article_id" not in svd_df.columns:
    raise ValueError("svd_embeddings.parquet must include 'article_id'")

# Feature columns are identified purely by their 'svd_' name prefix
svd_cols = [c for c in svd_df.columns if c.startswith("svd_")]
if not svd_cols:
    # Without this guard the feature matrix would have no columns and the
    # failure would only surface later, inside the clusterer.
    raise ValueError(
        "svd_embeddings.parquet must include at least one column starting with 'svd_'"
    )

# Sort by article_id so that the feature rows and the ID list extracted
# below share one deterministic order
svd_df = svd_df.sort("article_id")

# Dense feature matrix and the matching article IDs, taken from the same
# sorted frame so that row i of X corresponds to article_ids[i]
X = svd_df.select(svd_cols).to_numpy()
article_ids = svd_df["article_id"].to_list()

print("Feature matrix:", X.shape)
print("Example columns:", svd_cols[:5])


In [None]:
# 2) Persist features in compatible format for clusterer
# Two files are written side by side in FEATURES_DIR: the feature matrix as
# .npy and the article IDs as a single-column CSV.
features_npy = f"{FEATURES_DIR}/svd_only_features.npy"
ids_csv = f"{FEATURES_DIR}/svd_only_article_id_index.csv"

np.save(features_npy, X)

id_index = pl.DataFrame({"article_id": article_ids})
id_index.write_csv(ids_csv)

# Report both output locations
for saved_path in (features_npy, ids_csv):
    print("Saved:", saved_path)


In [None]:
# 3) Cluster using ArticleClusterer on SVD-only features
# Point the clusterer at the SVD-only feature/ID files and the article metadata
clusterer_paths = {
    "features_path": features_npy,
    "article_ids_path": ids_csv,
    "articles_metadata_path": ARTICLES_PATH,
}
clusterer = ArticleClusterer(**clusterer_paths)

# Load both inputs before searching for k
clusterer.load_features()
clusterer.load_articles_metadata()

# Find optimal k via elbow, searching with k_range=(3, 20) for K-means
k_search = {"k_range": (3, 20), "algorithm": "kmeans"}
optimal_k, scores = clusterer.find_optimal_k(**k_search)

print("Optimal k:", optimal_k)
print("Elbow scores (inertia):", scores)


In [None]:
# 4) Run K-means clustering and interpret
# Use the k chosen in the previous step; the seed is fixed at 42
config = ClusteringConfig(algorithm="kmeans", n_clusters=optimal_k, random_state=42)

# Clustering must run before interpretation
results = clusterer.cluster(config)
summaries = clusterer.interpret_clusters()

# Show only the first five summaries to keep the output short
print("\nCluster summaries (truncated):")
for position, (cid, s) in enumerate(summaries.items()):
    if position >= 5:
        break
    print(f"Cluster {cid}: size={s['size']} ({s['percentage']:.1f}%)")


In [None]:
# 5) Visualisations and save results
# One plot per projection method, each written to its own PNG in RESULTS_DIR
plot_targets = {
    "pca": f"{RESULTS_DIR}/clusters_pca.png",
    "tsne": f"{RESULTS_DIR}/clusters_tsne.png",
}
for projection, target_path in plot_targets.items():
    clusterer.visualise_clusters(method=projection, save_path=target_path)

# Write the clustering results into the same results directory
clusterer.save_results(RESULTS_DIR)
print("Saved results to:", RESULTS_DIR)
