In [28]:
# Load train and test data
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Both files are semicolon-separated with the column names on the first row.
df, df_test = (
    pd.read_csv(csv_path, sep=";", header=0)
    for csv_path in ("data/train.csv", "data/test.csv")
)

In [None]:
# Names of the columns to remove (the removal itself does not happen in this cell).
# NOTE(review): "heal_shape_type" may be a typo for "heel_shape_type" — verify
# against the CSV header before relying on it.
cols_to_remove = [
    "heal_shape_type",
    "toe_cap_type",
]

array([86], dtype=int64)

In [None]:
# Values of the `family` column for which `waist_applicable` is set to 1.
WAIST_APPLIES_TO = {
    'Jeans',
    'Trousers',
    'Skirts',
    'Shorts',
    'Leggings and joggers',
    'Intimate',
}

# Integer flag: 1 when the row's family is in WAIST_APPLIES_TO, otherwise 0.
df["waist_applicable"] = (
    df["family"]
    .isin(WAIST_APPLIES_TO)
    .astype(int)
)

def clean_waist_type(row):
    """Return the waist type for one row, labelling missing values explicitly.

    Args:
        row: Mapping-like row providing ``"waist_type"`` and
            ``"waist_applicable"`` entries.

    Returns:
        The existing waist type when it is not null. Otherwise
        ``"MISSING_VALUE"`` if ``waist_applicable`` equals 1 (the attribute
        should exist but is absent), else ``"NOT_APPLICABLE"`` (the attribute
        is irrelevant for this category).
    """
    waist = row["waist_type"]
    is_applicable = row["waist_applicable"] == 1

    # Guard clause: a real value from the metadata is passed through untouched.
    if pd.notnull(waist):
        return waist
    return "MISSING_VALUE" if is_applicable else "NOT_APPLICABLE"

# Fill missing waist types column-wise. This applies the same rule as
# clean_waist_type (keep non-null values; otherwise "MISSING_VALUE" when
# waist_applicable == 1, else "NOT_APPLICABLE") without one Python call per
# row, and it also works on an empty frame.
waist_fallback = np.where(
    df["waist_applicable"] == 1,
    "MISSING_VALUE",    # should exist but missing
    "NOT_APPLICABLE",   # attribute irrelevant for this category
)
df["waist_type"] = df["waist_type"].where(df["waist_type"].notna(), waist_fallback)

In [29]:
# Inspect the raw colour column; the values displayed below are
# comma-separated "R,G,B" strings such as "255,215,0".
df["color_rgb"]

0        255,215,0
1        255,215,0
2        255,215,0
3        255,215,0
4        255,215,0
           ...    
95334        0,0,0
95335        0,0,0
95336        0,0,0
95337        0,0,0
95338        0,0,0
Name: color_rgb, Length: 95339, dtype: object

In [None]:
import os
import warnings

# Fix for Windows threadpoolctl/OpenBLAS issue.
# NOTE(review): numpy and sklearn are already imported in the first cell, so
# this variable may be set too late to influence runtimes that are already
# loaded — confirm, or move it above those imports.
os.environ['OMP_NUM_THREADS'] = '1'


class NoOpContext:
    """Context manager that does nothing on entry or exit."""

    def __enter__(self):
        return self

    def __exit__(self, *args):
        # Returning False lets an exception raised inside the `with` body propagate.
        return False


def patched_threadpool_limits(*args, **kwargs):
    """Stand-in for ``threadpool_limits``: accepts any arguments, limits nothing."""
    return NoOpContext()


# Best effort: replace sklearn.utils.fixes.threadpool_limits with the no-op
# above to avoid the OpenBLAS inspection error on Windows.
# NOTE(review): modules that imported `threadpool_limits` by name before this
# cell ran keep their own reference and are presumably unaffected — verify that
# the patch actually reaches KMeans in the installed sklearn version.
try:
    from sklearn import config_context  # imported but not used by this cell
    import sklearn.utils.fixes

    # Keep a handle on the real implementation so the patch can be undone.
    original_threadpool_limits = sklearn.utils.fixes.threadpool_limits
    sklearn.utils.fixes.threadpool_limits = patched_threadpool_limits
except Exception as exc:
    # Still non-fatal, but no longer silent: the reader should know that the
    # workaround is not active.
    warnings.warn(
        f"Could not patch sklearn.utils.fixes.threadpool_limits ({exc!r}); "
        "continuing without the workaround."
    )

def parse_rgb(x):
    """Parse an ``"R,G,B"`` string into a list of three floats.

    Parentheses are stripped before splitting, so ``"(255, 215, 0)"`` is
    accepted as well as ``"255,215,0"``.

    Args:
        x: Raw colour value. Anything that is not a missing value is converted
            with ``str`` before parsing.

    Returns:
        ``[r, g, b]`` as floats, or ``[nan, nan, nan]`` when ``x`` is missing,
        does not split into exactly three comma-separated parts, or contains a
        part that cannot be converted to ``float``.
    """
    missing = [np.nan, np.nan, np.nan]
    if pd.isna(x):
        return missing

    # remove parentheses and split
    parts = str(x).replace("(", "").replace(")", "").split(",")
    if len(parts) != 3:
        return missing

    try:
        return [float(part) for part in parts]
    except ValueError:
        # Non-numeric component (e.g. "255,,0"): treat it like any other
        # malformed value instead of aborting the whole column conversion.
        return missing

# Expand the colour strings into three float columns. The frame is built once
# from the list of parsed rows rather than creating a pd.Series for every row.
df[["R", "G", "B"]] = pd.DataFrame(
    df["color_rgb"].map(parse_rgb).tolist(),
    index=df.index,
    columns=["R", "G", "B"],
)

# Rows whose colour could not be parsed cannot be clustered.
df = df.dropna(subset=["R", "G", "B"])

# Standardise the channels before clustering.
scaler = StandardScaler()
rgb_scaled = scaler.fit_transform(df[["R", "G", "B"]])

N_CLUSTERS = 12  # tweak if desired

# Use the constant so that changing N_CLUSTERS actually changes the model.
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42, n_init=10)
df["color_cluster"] = kmeans.fit_predict(rgb_scaled)

# -------------------------
# 4. (Optional) Name the clusters
# -------------------------
# Compute the centroid color for each cluster, mapped back from the scaled
# space to the original R/G/B units.
cluster_centers = scaler.inverse_transform(kmeans.cluster_centers_)

def name_color(center):
    """Map an RGB centroid to a coarse colour name.

    Args:
        center: Iterable of three numbers ``(r, g, b)``.

    Returns:
        The label of the first rule that matches, or ``"other"`` when none do.
    """
    red, green, blue = center

    # Simple heuristic naming — refine as needed. The rules overlap (anything
    # "white" also satisfies "beige"), so order matters: the first hit wins.
    rules = (
        ("red", red > 180 and green < 80 and blue < 80),
        ("green", red < 80 and green > 180 and blue < 80),
        ("blue", red < 80 and green < 80 and blue > 180),
        ("yellow", red > 200 and green > 200 and blue < 80),
        ("white", red > 200 and green > 200 and blue > 200),
        ("black", red < 60 and green < 60 and blue < 60),
        ("purple", red > 150 and green < 100 and blue > 150),
        ("beige", red > 150 and green > 100 and blue > 100),
    )
    for label, matched in rules:
        if matched:
            return label
    return "other"

# Cluster index -> heuristic colour name of that cluster's centroid.
cluster_color_names = dict(enumerate(map(name_color, cluster_centers)))

# Replace the integer cluster ids with those names.
# NOTE(review): clusters whose centroids receive the same name (e.g. "other")
# can no longer be told apart after this step — confirm that is intended.
df["color_cluster"] = df["color_cluster"].map(cluster_color_names)




In [None]:
# Distinct colour names present after the mapping. The column is selected by
# its string label; no variable named `color_cluster` is defined in the
# visible cells, so the unquoted form raises NameError.
df["color_cluster"].unique()