# Combined Article Clustering

This notebook clusters H&M articles using combined features:
- SVD (LSA) embeddings derived from TF‑IDF of `detail_desc` (200 dimensions)
- Categorical features from cleaned articles metadata (24 dimensions)

The combined approach provides both semantic similarity from text descriptions and categorical similarity from product attributes.

- Loads combined features (SVD + categorical) from the feature engineering pipeline, falling back to SVD-only embeddings or the TF-IDF matrix if the combined file is missing
- Finds an optimal number of clusters via the elbow method
- Performs clustering and interprets clusters using product metadata
- Visualises clusters with PCA and t‑SNE

All identifiers, comments, and Markdown use UK spellings.

In [None]:
# Imports and configuration
import os
import sys
import polars as pl

# Add project root to path
sys.path.append('../') 

from hnm_data_analysis.clustering.article_clustering import ArticleClusterer, ClusteringConfig


In [None]:
# Paths
# Every location used by the notebook is derived from the project root.
#
# PROJECT_ROOT is not defined or imported anywhere else in this notebook, so it
# is resolved here: an already-defined PROJECT_ROOT (e.g. set by an earlier
# kernel session or injected as a parameter) is honoured, otherwise the parent
# of the current working directory is used, matching sys.path.append('../').
# NOTE(review): this assumes the notebook runs from a direct sub-directory of
# the project root - confirm against the repository layout.
PROJECT_ROOT = globals().get("PROJECT_ROOT") or os.path.abspath(os.pardir)

BASE_DIR = PROJECT_ROOT
DATA_DIR = os.path.join(BASE_DIR, "data")
PROCESSED_DIR = os.path.join(DATA_DIR, "processed")
FEATURES_DIR = os.path.join(DATA_DIR, "processed", "features")
RESULTS_DIR = os.path.join(BASE_DIR, "results", "combined_clustering")

# Use combined features (SVD embeddings + categorical features)
FEATURES_PATH = os.path.join(FEATURES_DIR, "combined_features.parquet")
ARTICLES_PATH = os.path.join(PROCESSED_DIR, "articles_last_3_months.parquet")

# Create the output directory up front so later cells can write into it.
os.makedirs(RESULTS_DIR, exist_ok=True)
print("FEATURES_PATH:", FEATURES_PATH)
print("ARTICLES_PATH:", ARTICLES_PATH)
print("RESULTS_DIR:", RESULTS_DIR)

In [None]:
# Choose the feature file to cluster on, in order of preference:
#   1. combined features (SVD + categorical)
#   2. SVD embeddings only
#   3. sparse TF-IDF matrix, which also needs the article-id index file
# Both selections stay None until a usable source is found; ARTICLE_IDS_PATH is
# only ever set for the TF-IDF fallback.
FEATURES_PATH_TO_USE = None
ARTICLE_IDS_PATH = None

combined_path = FEATURES_PATH
svd_path = os.path.join(FEATURES_DIR, "svd_embeddings.parquet")
tfidf_path = os.path.join(FEATURES_DIR, "tfidf_features.npz")
index_path = os.path.join(FEATURES_DIR, "article_id_index.csv")

# Sources that need no companion file, most preferred first.
_standalone_sources = (
    (combined_path, "Using combined features (SVD + categorical):"),
    (svd_path, "Using SVD embeddings only:"),
)

for _candidate, _message in _standalone_sources:
    if os.path.exists(_candidate):
        FEATURES_PATH_TO_USE = _candidate
        print(_message, FEATURES_PATH_TO_USE)
        break
else:
    # Neither standalone file exists: the TF-IDF matrix is usable only when
    # its article-id index is present as well.
    if not (os.path.exists(tfidf_path) and os.path.exists(index_path)):
        raise FileNotFoundError(
            "No feature files found. Generate combined, SVD or TF-IDF features first."
        )
    FEATURES_PATH_TO_USE = tfidf_path
    ARTICLE_IDS_PATH = index_path
    print("Using TF-IDF sparse matrix (will densify):", FEATURES_PATH_TO_USE)
    print("Article IDs index:", ARTICLE_IDS_PATH)

In [None]:
# 1) Load features and prepare clusterer
# The clusterer receives the feature file selected earlier (FEATURES_PATH_TO_USE),
# the article-id index (ARTICLE_IDS_PATH, which stays None unless the TF-IDF
# fallback was selected) and the path to the articles metadata file.
clusterer = ArticleClusterer(
    features_path=FEATURES_PATH_TO_USE,
    article_ids_path=ARTICLE_IDS_PATH,
    articles_metadata_path=ARTICLES_PATH
)

# Load the features and the article metadata from the paths given above.
# NOTE(review): presumably both loads must happen before find_optimal_k(),
# cluster() and interpret_clusters() are called - confirm in ArticleClusterer.
clusterer.load_features()
clusterer.load_articles_metadata()


In [None]:
# 2) Pick the number of clusters with the elbow method (K-means inertia),
#    searching k_range=(3, 20) on the loaded features
optimal_k, scores = clusterer.find_optimal_k(algorithm="kmeans", k_range=(3, 20))
print("Optimal k:", optimal_k)

# The full score mapping can be long, so only the first five entries are shown;
# plain numbers are rounded to two decimal places, anything else is left as is.
preview = {
    key: value if not isinstance(value, (int, float)) else round(value, 2)
    for key, value in list(scores.items())[:5]
}
print("Scores preview:", preview)

In [None]:
# 4) Run K-means with the k chosen by the elbow search
#    (random_state=42 is presumably there for reproducible runs)
config = ClusteringConfig(algorithm="kmeans", n_clusters=optimal_k, random_state=42)
results = clusterer.cluster(config)

# 5) Interpret clusters and print the size and share of the first five
summaries = clusterer.interpret_clusters()
print("\nCluster summaries (truncated):")
for cluster_id, summary in list(summaries.items())[:5]:
    print(f"Cluster {cluster_id}: size={summary['size']} ({summary['percentage']:.1f}%)")


In [None]:
# 6) Visualisations
# One plot per method (PCA first, then t-SNE), each saved under RESULTS_DIR
# as clusters_<method>.png.
for _method in ("pca", "tsne"):
    clusterer.visualise_clusters(
        method=_method,
        save_path=f"{RESULTS_DIR}/clusters_{_method}.png",
    )


In [None]:
# 7) Save results
# Hand the output directory to the clusterer so it can persist its results there,
# then echo the location for the reader.
# NOTE(review): which files save_results() writes is not visible here - see ArticleClusterer.
clusterer.save_results(RESULTS_DIR)
print("Saved results to:", RESULTS_DIR)
