In [1]:
# Load train data
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Both files are semicolon-delimited and carry their column names on the first row.
df = pd.read_csv('data/train.csv', header=0, sep=";")
df_test = pd.read_csv('data/test.csv', header=0, sep=";")

In [2]:
# Columns removed from the training frame before any cleaning.
# NOTE: the drop is in place, so re-running this cell on the same frame raises
# KeyError because the columns are already gone.
cols_to_remove = [
    "heel_shape_type",
    "toecap_type",
    "archetype",
]
df.drop(labels=cols_to_remove, axis="columns", inplace=True)

In [3]:
# Families for which waist_type is treated as applicable. Unlike the other
# attributes, this list is fixed by hand rather than derived from the data.
WAIST_APPLIES_TO = {
    'Intimate',
    'Jeans',
    'Leggings and joggers',
    'Shorts',
    'Skirts',
    'Trousers',
}

# 1 when the row's family is in WAIST_APPLIES_TO, 0 otherwise.
df["waist_applicable"] = (
    df["family"]
    .isin(WAIST_APPLIES_TO)
    .astype(int)
)

def clean_waist_type(row):
    """Return the waist_type value to store for a single row.

    Parameters
    ----------
    row : mapping-like
        Must provide "waist_type" and "waist_applicable".

    Returns
    -------
    The original waist_type when it is not null; otherwise "MISSING_VALUE"
    if waist_applicable == 1, or "NOT_APPLICABLE" for any other flag value.
    """
    current = row["waist_type"]
    flag = row["waist_applicable"]

    if pd.isnull(current):
        # Null value: distinguish "should exist but is missing" from
        # "attribute is irrelevant for this family".
        return "MISSING_VALUE" if flag == 1 else "NOT_APPLICABLE"
    return current

# Row-wise pass: each row goes through clean_waist_type and the returned value
# replaces waist_type, so the column holds no nulls afterwards.
df["waist_type"] = df.apply(clean_waist_type, axis="columns")

In [4]:
# length_type counts as applicable to a family when at least one row of that
# family has a non-null length_type.
# NOTE: this must run while length_type still contains real nulls, i.e. before
# the column is cleaned; afterwards every family would look applicable.
family_length_applicable = (
    df["length_type"].notna().groupby(df["family"]).any().to_dict()
)

# Per-row 0/1 flag looked up from the row's family.
df["length_applicable"] = (
    df["family"].map(family_length_applicable).astype(int)
)

def clean_length_type(row):
    """Return the length_type value to store for a single row.

    Parameters
    ----------
    row : mapping-like
        Must provide "length_type" and "length_applicable".

    Returns
    -------
    The original length_type when it is not null; otherwise "MISSING_VALUE"
    if length_applicable == 1, or "NOT_APPLICABLE" for any other flag value.
    """
    current = row["length_type"]
    flag = row["length_applicable"]

    if pd.isnull(current):
        # Null value: missing-but-expected versus irrelevant for this family.
        return "MISSING_VALUE" if flag == 1 else "NOT_APPLICABLE"
    return current

# Row-wise pass: each row goes through clean_length_type and the returned value
# replaces length_type, so the column holds no nulls afterwards.
df["length_type"] = df.apply(clean_length_type, axis="columns")


In [5]:
# silhouette_type counts as applicable to a family when at least one row of
# that family has a non-null silhouette_type.
# NOTE: this must run while silhouette_type still contains real nulls, i.e.
# before the column is cleaned; afterwards every family would look applicable.
family_silhouette_applicable = (
    df["silhouette_type"].notna().groupby(df["family"]).any().to_dict()
)

# Per-row 0/1 flag looked up from the row's family.
df["silhouette_applicable"] = (
    df["family"].map(family_silhouette_applicable).astype(int)
)

def clean_silhouette_type(row):
    """Return the silhouette_type value to store for a single row.

    Parameters
    ----------
    row : mapping-like
        Must provide "silhouette_type" and "silhouette_applicable".

    Returns
    -------
    The original silhouette_type when it is not null; otherwise
    "MISSING_VALUE" if silhouette_applicable == 1, or "NOT_APPLICABLE" for any
    other flag value.
    """
    current = row["silhouette_type"]
    flag = row["silhouette_applicable"]

    if pd.isnull(current):
        # Null value: missing-but-expected versus irrelevant for this family.
        return "MISSING_VALUE" if flag == 1 else "NOT_APPLICABLE"
    return current

# Row-wise pass: each row goes through clean_silhouette_type and the returned
# value replaces silhouette_type, so the column holds no nulls afterwards.
df["silhouette_type"] = df.apply(clean_silhouette_type, axis="columns")


In [6]:
# neck_lapel_type counts as applicable to a family when at least one row of
# that family has a non-null neck_lapel_type.
# NOTE: this must run while neck_lapel_type still contains real nulls, i.e.
# before the column is cleaned; afterwards every family would look applicable.
family_neck_lapel_applicable = (
    df["neck_lapel_type"].notna().groupby(df["family"]).any().to_dict()
)

# Per-row 0/1 flag looked up from the row's family.
df["neck_lapel_applicable"] = (
    df["family"].map(family_neck_lapel_applicable).astype(int)
)

def clean_neck_lapel_type(row):
    """Return the neck_lapel_type value to store for a single row.

    Parameters
    ----------
    row : mapping-like
        Must provide "neck_lapel_type" and "neck_lapel_applicable".

    Returns
    -------
    The original neck_lapel_type when it is not null; otherwise
    "MISSING_VALUE" if neck_lapel_applicable == 1, or "NOT_APPLICABLE" for any
    other flag value.
    """
    current = row["neck_lapel_type"]
    flag = row["neck_lapel_applicable"]

    if pd.isnull(current):
        # Null value: missing-but-expected versus irrelevant for this family.
        return "MISSING_VALUE" if flag == 1 else "NOT_APPLICABLE"
    return current

# Row-wise pass: each row goes through clean_neck_lapel_type and the returned
# value replaces neck_lapel_type, so the column holds no nulls afterwards.
df["neck_lapel_type"] = df.apply(clean_neck_lapel_type, axis="columns")


In [7]:
# sleeve_length_type counts as applicable to a family when at least one row of
# that family has a non-null sleeve_length_type.
# NOTE: this must run while sleeve_length_type still contains real nulls, i.e.
# before the column is cleaned; afterwards every family would look applicable.
family_sleeve_length_applicable = (
    df["sleeve_length_type"].notna().groupby(df["family"]).any().to_dict()
)

# Per-row 0/1 flag looked up from the row's family.
df["sleeve_length_applicable"] = (
    df["family"].map(family_sleeve_length_applicable).astype(int)
)

def clean_sleeve_length_type(row):
    """Return the sleeve_length_type value to store for a single row.

    Parameters
    ----------
    row : mapping-like
        Must provide "sleeve_length_type" and "sleeve_length_applicable".

    Returns
    -------
    The original sleeve_length_type when it is not null; otherwise
    "MISSING_VALUE" if sleeve_length_applicable == 1, or "NOT_APPLICABLE" for
    any other flag value.
    """
    current = row["sleeve_length_type"]
    flag = row["sleeve_length_applicable"]

    if pd.isnull(current):
        # Null value: missing-but-expected versus irrelevant for this family.
        return "MISSING_VALUE" if flag == 1 else "NOT_APPLICABLE"
    return current

# Row-wise pass: each row goes through clean_sleeve_length_type and the returned
# value replaces sleeve_length_type, so the column holds no nulls afterwards.
df["sleeve_length_type"] = df.apply(clean_sleeve_length_type, axis="columns")


In [8]:
# woven_structure counts as applicable to a family when at least one row of
# that family has a non-null woven_structure.
# NOTE: this must run while woven_structure still contains real nulls, i.e.
# before the column is cleaned; afterwards every family would look applicable.
family_woven_structure_applicable = (
    df["woven_structure"].notna().groupby(df["family"]).any().to_dict()
)

# Per-row 0/1 flag looked up from the row's family.
df["woven_structure_applicable"] = (
    df["family"].map(family_woven_structure_applicable).astype(int)
)

def clean_woven_structure(row):
    """Return the woven_structure value to store for a single row.

    Parameters
    ----------
    row : mapping-like
        Must provide "woven_structure" and "woven_structure_applicable".

    Returns
    -------
    The original woven_structure when it is not null; otherwise
    "MISSING_VALUE" if woven_structure_applicable == 1, or "NOT_APPLICABLE"
    for any other flag value.
    """
    current = row["woven_structure"]
    flag = row["woven_structure_applicable"]

    if pd.isnull(current):
        # Null value: missing-but-expected versus irrelevant for this family.
        return "MISSING_VALUE" if flag == 1 else "NOT_APPLICABLE"
    return current

# Row-wise pass: each row goes through clean_woven_structure and the returned
# value replaces woven_structure, so the column holds no nulls afterwards.
df["woven_structure"] = df.apply(clean_woven_structure, axis="columns")


In [9]:
# knit_structure counts as applicable to a family when at least one row of
# that family has a non-null knit_structure.
# NOTE: this must run while knit_structure still contains real nulls, i.e.
# before the column is cleaned; afterwards every family would look applicable.
family_knit_structure_applicable = (
    df["knit_structure"].notna().groupby(df["family"]).any().to_dict()
)

# Per-row 0/1 flag looked up from the row's family.
df["knit_structure_applicable"] = (
    df["family"].map(family_knit_structure_applicable).astype(int)
)

def clean_knit_structure(row):
    """Return the knit_structure value to store for a single row.

    Parameters
    ----------
    row : mapping-like
        Must provide "knit_structure" and "knit_structure_applicable".

    Returns
    -------
    The original knit_structure when it is not null; otherwise
    "MISSING_VALUE" if knit_structure_applicable == 1, or "NOT_APPLICABLE" for
    any other flag value.
    """
    current = row["knit_structure"]
    flag = row["knit_structure_applicable"]

    if pd.isnull(current):
        # Null value: missing-but-expected versus irrelevant for this family.
        return "MISSING_VALUE" if flag == 1 else "NOT_APPLICABLE"
    return current

# Row-wise pass: each row goes through clean_knit_structure and the returned
# value replaces knit_structure, so the column holds no nulls afterwards.
df["knit_structure"] = df.apply(clean_knit_structure, axis="columns")


In [10]:
# is_fall is derived purely from the parity of id_season:
# even id_season -> 1 (fall), odd id_season -> 0 (not fall).
# Examples: 89->0, 88->1, 87->0, 86->1
df['is_fall'] = df['id_season'].mod(2).eq(0).astype(int)


In [11]:
# weeks_since_launch, one-based: within each (ID, id_season) group the smallest
# num_week_iso gets 1, the next distinct week gets 2, and so on.
# NOTE(review): cell In [13] further down recomputes this column zero-based and
# overwrites the values produced here.
df = df.sort_values(by=['ID', 'id_season', 'num_week_iso'])

# Dense ranking: distinct weeks get consecutive ranks, so gaps between observed
# week numbers are not counted.
df['weeks_since_launch'] = (
    df.groupby(['ID', 'id_season'])['num_week_iso']
    .rank(method='dense')
    .astype(int)
)

In [12]:
# Null print_type entries are replaced by the literal "Sin Estampado"
# (Spanish for "no print"); existing values are left untouched.
df["print_type"] = df["print_type"].mask(df["print_type"].isna(), "Sin Estampado")

In [13]:
# weeks_since_launch, zero-based: within each (ID, id_season) group the smallest
# num_week_iso gets 0, the next distinct week gets 1, and so on. This replaces
# any weeks_since_launch column already present on df.
df = df.sort_values(by=['ID', 'id_season', 'num_week_iso'])

# Dense ranking: distinct weeks get consecutive ranks, so gaps between observed
# week numbers are not counted. Subtracting 1 moves the launch week to 0.
df['weeks_since_launch'] = (
    df.groupby(['ID', 'id_season'])['num_week_iso']
    .rank(method='dense')
    .sub(1)
    .astype(int)
)

In [14]:
import os
import warnings

# Fix for Windows threadpoolctl/OpenBLAS issue: ask OpenMP for a single thread.
# NOTE(review): numpy and sklearn are imported earlier in this file, so a
# library that reads this variable only when it is loaded may not see the
# change — confirm it takes effect.
os.environ['OMP_NUM_THREADS'] = '1'

# Best-effort workaround for the OpenBLAS inspection error on Windows:
# replace sklearn.utils.fixes.threadpool_limits with a function that returns a
# do-nothing context manager, so callers of that attribute apply no limits.
try:
    from sklearn import config_context  # imported but not used below
    import sklearn.utils.fixes

    # Keep a handle on the implementation being replaced.
    original_threadpool_limits = sklearn.utils.fixes.threadpool_limits

    class NoOpContext:
        """Context manager that does nothing on enter or exit."""

        def __enter__(self):
            return self

        def __exit__(self, *args):
            # Returning False means exceptions raised inside the `with` body
            # are never suppressed.
            return False

    def patched_threadpool_limits(*args, **kwargs):
        """Accept any arguments and return a NoOpContext (no limits applied)."""
        return NoOpContext()

    sklearn.utils.fixes.threadpool_limits = patched_threadpool_limits
except Exception as patch_error:
    # The patch stays optional — the notebook continues either way — but the
    # failure is reported instead of being swallowed silently.
    warnings.warn(
        f"Could not patch sklearn.utils.fixes.threadpool_limits ({patch_error!r}); "
        "continuing without the workaround."
    )

def parse_rgb(x):
    """Parse an "R,G,B" string such as "255,215,0" into three floats.

    Parameters
    ----------
    x : str or missing value
        Comma-separated channel values. Only the first three components are
        read; any further components are ignored.

    Returns
    -------
    list of float
        [R, G, B]. A missing input (None, NaN, pd.NA) yields three NaNs, so a
        row without a color stays missing instead of aborting the whole parse.

    Raises
    ------
    IndexError
        If a non-missing input has fewer than three comma-separated parts.
    ValueError
        If one of the first three parts is not a valid number.
    """
    # Without this guard a missing value is stringified to "nan"/"None",
    # splits into a single part, and fails with IndexError.
    if pd.api.types.is_scalar(x) and pd.isna(x):
        return [float("nan")] * 3

    parts = str(x).split(",")
    return [float(parts[0]), float(parts[1]), float(parts[2])]

# Parse every color_rgb value once, then build the three channel columns from
# the resulting list in a single DataFrame construction. Wrapping each row's
# result in its own pd.Series inside apply produces the same frame but is much
# slower on large inputs.
df[["R", "G", "B"]] = pd.DataFrame(
    df["color_rgb"].map(parse_rgb).tolist(),
    index=df.index,  # keep row alignment with df
)

In [15]:
# Scale the channels from 0-255 down to 0-1.
# NOTE: re-running this cell divides the already scaled values by 255 again.
df[["R", "G", "B"]] = df[["R", "G", "B"]].div(255.0)

In [16]:
# Remove the raw color string (R/G/B were derived from it above) and the
# *_applicable flags, which were only read by the clean_* functions.
df.drop(
    labels=[
        "color_rgb",
        "waist_applicable",
        "length_applicable",
        "silhouette_applicable",
        "neck_lapel_applicable",
        "sleeve_length_applicable",
        "woven_structure_applicable",
        "knit_structure_applicable",
    ],
    axis="columns",
    inplace=True,
)

In [21]:
# Negative weekly_sales / weekly_demand values are raised to 0.
# A column that is absent from df is skipped rather than treated as an error.
for target in ('weekly_sales', 'weekly_demand'):
    if target not in df.columns:
        continue
    df[target] = df[target].clip(lower=0)


In [17]:
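# CLUSTERING CODE - COMMENTED OUT
# NOTE(review): the code below uses `rgb_scaled`, which none of the cells shown
# in this notebook define; it has to be re-created before this is re-enabled.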
# # Determine optimal number of clusters using multiple methods
# from sklearn.metrics import silhouette_score
# import matplotlib.pyplot as plt
# 
# # Range of k values to test
# k_range = range(2, 21)  # Test k from 2 to 20
# inertias = []
# silhouette_scores = []
# 
# print("Evaluating different numbers of clusters...")
# for k in k_range:
#     kmeans_temp = KMeans(n_clusters=k, random_state=42, n_init=10)
#     labels = kmeans_temp.fit_predict(rgb_scaled)
#     
#     inertias.append(kmeans_temp.inertia_)
#     silhouette_scores.append(silhouette_score(rgb_scaled, labels))
#     
#     if k % 5 == 0:
#         print(f"  Completed k={k}...")
# 
# # Find optimal k based on silhouette score
# optimal_k_silhouette = k_range[np.argmax(silhouette_scores)]
# 
# print(f"\nOptimal k based on Silhouette Score: {optimal_k_silhouette} (score: {max(silhouette_scores):.3f})")
# 
# # Plot the results
# fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# 
# # Elbow Method (Inertia)
# axes[0].plot(k_range, inertias, 'bo-')
# axes[0].set_xlabel('Number of Clusters (k)')
# axes[0].set_ylabel('Within-Cluster Sum of Squares (Inertia)')
# axes[0].set_title('Elbow Method')
# axes[0].grid(True)
# 
# # Silhouette Score
# axes[1].plot(k_range, silhouette_scores, 'ro-')
# axes[1].axvline(x=optimal_k_silhouette, color='g', linestyle='--', label=f'Optimal k={optimal_k_silhouette}')
# axes[1].set_xlabel('Number of Clusters (k)')
# axes[1].set_ylabel('Silhouette Score')
# axes[1].set_title('Silhouette Score (higher is better)')
# axes[1].legend()
# axes[1].grid(True)
# 
# plt.tight_layout()
# plt.show()
# 
# # Print detailed scores for manual inspection
# print("\nDetailed scores:")
# print(f"{'k':<5} {'Inertia':<12} {'Silhouette':<12}")
# print("-" * 30)
# for i, k in enumerate(k_range):
#     print(f"{k:<5} {inertias[i]:<12.2f} {silhouette_scores[i]:<12.3f}")


In [18]:
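# CLUSTERING CODE - COMMENTED OUT
# NOTE(review): the code below uses `rgb_scaled` and `scaler`, which none of
# the cells shown in this notebook define; re-create both before re-enabling.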
# import matplotlib.pyplot as plt
# import numpy as np
# from sklearn.cluster import KMeans
# 
# # Convert centroids back to original RGB for plotting
# def get_centroids(k):
#     kmeans_temp = KMeans(n_clusters=k, random_state=42, n_init=10)
#     kmeans_temp.fit(rgb_scaled)
#     # inverse-transform to original RGB scale
#     return scaler.inverse_transform(kmeans_temp.cluster_centers_)
# 
# # Plot
# Ks_to_plot = [10, 11, 12, 13, 14, 15, 16]   # choose any set of k values
# n_rows = len(Ks_to_plot)
# 
# fig, axes = plt.subplots(n_rows, 1, figsize=(14, 2*n_rows))
# 
# if n_rows == 1:
#     axes = [axes]
# 
# for idx, k in enumerate(Ks_to_plot):
#     centroids = get_centroids(k)
#     
#     # Clip values to RGB range
#     centroids = np.clip(centroids, 0, 255).astype(int)
#     
#     # Build an image strip where each square is a centroid
#     color_strip = np.zeros((50, 50*k, 3), dtype=np.uint8)
#     for i, color in enumerate(centroids):
#         color_strip[:, i*50:(i+1)*50, :] = color
#     
#     axes[idx].imshow(color_strip)
#     axes[idx].set_title(f"Centroids for k={k}", fontsize=12)
#     axes[idx].axis('off')
# 
# plt.tight_layout()
# plt.show()

In [19]:
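# CLUSTERING CODE - COMMENTED OUT
# NOTE(review): the code below uses `rgb_scaled`, which none of the cells shown
# in this notebook define; it has to be re-created before this is re-enabled.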
# # Perform clustering with optimal k (or choose manually based on the plots above)
# # You can use optimal_k_silhouette or choose your own value
# N_CLUSTERS = 15
# 
# print(f"Using k={N_CLUSTERS} clusters")
# 
# kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42, n_init=10)  
# df["color_cluster"] = kmeans.fit_predict(rgb_scaled)

In [None]:
# ============================================
# PROCESS TEST DATA
# Apply same preprocessing as training data
# ============================================

# Drop columns (same as training)
cols_to_remove = ["heel_shape_type", "toecap_type", "archetype"]
df_test.drop(columns=cols_to_remove, inplace=True)

# Placeholders that the clean_* functions write into the attribute columns.
# They are not real attribute values and must be ignored when deciding whether
# an attribute applies to a family.
_PLACEHOLDER_VALUES = ("MISSING_VALUE", "NOT_APPLICABLE")


def _train_family_applicability(column):
    """Map each training family to True when it has a real value in `column`.

    The training column has already been cleaned when this cell runs, so its
    nulls were replaced by placeholders. Testing only for non-null values would
    mark every family as applicable, and null test values would then never be
    labelled "NOT_APPLICABLE". Only non-null, non-placeholder entries count.
    """
    has_real_value = df[column].notna() & ~df[column].isin(_PLACEHOLDER_VALUES)
    return has_real_value.groupby(df["family"]).any().to_dict()


def _test_applicable_flag(family_applicable):
    """Return a 0/1 Series for df_test from a {family: bool} mapping.

    Families that are missing from the mapping (not seen in training, or null)
    are flagged 0. `.eq(True)` turns the NaN that `.map` yields for them into
    False; a direct `.astype(int)` would raise on those NaNs.
    """
    return df_test["family"].map(family_applicable).eq(True).astype(int)


# Apply same cleaning functions to test data
# Waist type cleaning (fixed family list, no training lookup needed)
df_test["waist_applicable"] = df_test["family"].isin(WAIST_APPLIES_TO).astype(int)
df_test["waist_type"] = df_test.apply(clean_waist_type, axis=1)

# Length type cleaning
family_length_applicable = _train_family_applicability("length_type")
df_test["length_applicable"] = _test_applicable_flag(family_length_applicable)
df_test["length_type"] = df_test.apply(clean_length_type, axis=1)

# Silhouette type cleaning
family_silhouette_applicable = _train_family_applicability("silhouette_type")
df_test["silhouette_applicable"] = _test_applicable_flag(family_silhouette_applicable)
df_test["silhouette_type"] = df_test.apply(clean_silhouette_type, axis=1)

# Neck/lapel type cleaning
family_neck_lapel_applicable = _train_family_applicability("neck_lapel_type")
df_test["neck_lapel_applicable"] = _test_applicable_flag(family_neck_lapel_applicable)
df_test["neck_lapel_type"] = df_test.apply(clean_neck_lapel_type, axis=1)

# Sleeve length type cleaning
family_sleeve_length_applicable = _train_family_applicability("sleeve_length_type")
df_test["sleeve_length_applicable"] = _test_applicable_flag(family_sleeve_length_applicable)
df_test["sleeve_length_type"] = df_test.apply(clean_sleeve_length_type, axis=1)

# Woven structure cleaning
family_woven_structure_applicable = _train_family_applicability("woven_structure")
df_test["woven_structure_applicable"] = _test_applicable_flag(family_woven_structure_applicable)
df_test["woven_structure"] = df_test.apply(clean_woven_structure, axis=1)

# Knit structure cleaning (using training data's applicability mapping)
family_knit_structure_applicable = _train_family_applicability("knit_structure")
df_test["knit_structure_applicable"] = _test_applicable_flag(family_knit_structure_applicable)
df_test["knit_structure"] = df_test.apply(clean_knit_structure, axis=1)

print("Test data cleaning completed")


In [None]:
# Same parity rule as for the training data:
# even id_season -> 1 (fall), odd id_season -> 0 (not fall).
df_test['is_fall'] = df_test['id_season'].mod(2).eq(0).astype(int)


In [None]:
# Null print_type entries in the test data are replaced by the literal
# "Sin Estampado" (Spanish for "no print"); existing values are left untouched.
df_test["print_type"] = df_test["print_type"].mask(
    df_test["print_type"].isna(), "Sin Estampado"
)


In [None]:
# Parse RGB from color_rgb for test data.
# Every value is parsed once and the three channel columns are built from the
# resulting list in a single DataFrame construction. Wrapping each row's
# result in its own pd.Series inside apply produces the same frame but is much
# slower on large inputs.
df_test[["R", "G", "B"]] = pd.DataFrame(
    df_test["color_rgb"].map(parse_rgb).tolist(),
    index=df_test.index,  # keep row alignment with df_test
)

# Normalize RGB values to 0-1 range (divide by 255)
df_test[["R", "G", "B"]] = df_test[["R", "G", "B"]] / 255.0


In [None]:
# Remove the same helper columns that were removed from the training frame:
# the raw color string and the *_applicable flags.
df_test.drop(
    labels=[
        "color_rgb",
        "waist_applicable",
        "length_applicable",
        "silhouette_applicable",
        "neck_lapel_applicable",
        "sleeve_length_applicable",
        "woven_structure_applicable",
        "knit_structure_applicable",
    ],
    axis="columns",
    inplace=True,
)

print("Test data preprocessing completed")


In [None]:
# Write the processed test frame to disk without the index column.
# No weeks_since_launch column is computed for the test data in this notebook;
# per the original note it is meant to be created at prediction time, week by week.
df_test.to_csv(path_or_buf='test_processed.csv', index=False)
print("Processed test data saved to test_processed.csv")


In [22]:
# Write the processed training frame to disk without the index column.
df.to_csv(path_or_buf='train_processed.csv', index=False)