In [1]:
# Load train data
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

df = pd.read_csv('data/train.csv', sep=";", header=0)
df_test = pd.read_csv('data/test.csv', sep=";", header=0)

In [2]:
# Drop columns
cols_to_remove = ["heel_shape_type", "toecap_type", "archetype"]
df.drop(columns=cols_to_remove, inplace=True)

In [3]:
# Determine if waist_type is applicable to each family
# A family is applicable if it has at least one non-null waist_type value
family_waist_applicable = df.groupby("family")["waist_type"].apply(
    lambda x: x.notna().any()
).to_dict()

df["waist_applicable"] = df["family"].map(family_waist_applicable).astype(int)

def clean_waist_type(row):
    wt = row["waist_type"]
    applicable = row["waist_applicable"]

    if pd.notnull(wt):
        return wt                           # Real waist type from metadata
    else:
        if applicable == 1:
            return "MISSING_VALUE"          # Should exist but missing
        else:
            return "NOT_APPLICABLE"         # Attribute irrelevant for this category

df["waist_type"] = df.apply(clean_waist_type, axis=1)
df.drop(columns=["waist_applicable"], inplace=True)

In [4]:
# Determine if length_type is applicable to each family
# A family is applicable if it has at least one non-null length_type value
family_length_applicable = df.groupby("family")["length_type"].apply(
    lambda x: x.notna().any()
).to_dict()

df["length_applicable"] = df["family"].map(family_length_applicable).astype(int)

def clean_length_type(row):
    lt = row["length_type"]
    applicable = row["length_applicable"]

    if pd.notnull(lt):
        return lt                           # Real length type from metadata
    else:
        if applicable == 1:
            return "MISSING_VALUE"          # Should exist but missing
        else:
            return "NOT_APPLICABLE"         # Attribute irrelevant for this category

df["length_type"] = df.apply(clean_length_type, axis=1)
df.drop(columns=["length_applicable"], inplace=True)


In [5]:
# Determine if silhouette_type is applicable to each family
# A family is applicable if it has at least one non-null silhouette_type value
family_silhouette_applicable = df.groupby("family")["silhouette_type"].apply(
    lambda x: x.notna().any()
).to_dict()

df["silhouette_applicable"] = df["family"].map(family_silhouette_applicable).astype(int)

def clean_silhouette_type(row):
    st = row["silhouette_type"]
    applicable = row["silhouette_applicable"]

    if pd.notnull(st):
        return st                          # Real silhouette type from metadata
    else:
        if applicable == 1:
            return "MISSING_VALUE"         # Should exist but missing
        else:
            return "NOT_APPLICABLE"        # Attribute irrelevant for this category

df["silhouette_type"] = df.apply(clean_silhouette_type, axis=1)
df.drop(columns=["silhouette_applicable"], inplace=True)


In [6]:
# Determine if neck_lapel_type is applicable to each family
# A family is applicable if it has at least one non-null neck_lapel_type value
family_neck_lapel_applicable = df.groupby("family")["neck_lapel_type"].apply(
    lambda x: x.notna().any()
).to_dict()

df["neck_lapel_applicable"] = df["family"].map(family_neck_lapel_applicable).astype(int)

def clean_neck_lapel_type(row):
    nlt = row["neck_lapel_type"]
    applicable = row["neck_lapel_applicable"]

    if pd.notnull(nlt):
        return nlt                         # Real neck/lapel type from metadata
    else:
        if applicable == 1:
            return "MISSING_VALUE"         # Should exist but missing
        else:
            return "NOT_APPLICABLE"        # Attribute irrelevant for this category

df["neck_lapel_type"] = df.apply(clean_neck_lapel_type, axis=1)
df.drop(columns=["neck_lapel_applicable"], inplace=True)


In [7]:
# Determine if sleeve_length_type is applicable to each family
# A family is applicable if it has at least one non-null sleeve_length_type value
family_sleeve_length_applicable = df.groupby("family")["sleeve_length_type"].apply(
    lambda x: x.notna().any()
).to_dict()

df["sleeve_length_applicable"] = df["family"].map(family_sleeve_length_applicable).astype(int)

def clean_sleeve_length_type(row):
    slt = row["sleeve_length_type"]
    applicable = row["sleeve_length_applicable"]

    if pd.notnull(slt):
        return slt                         # Real sleeve length type from metadata
    else:
        if applicable == 1:
            return "MISSING_VALUE"         # Should exist but missing
        else:
            return "NOT_APPLICABLE"        # Attribute irrelevant for this category

df["sleeve_length_type"] = df.apply(clean_sleeve_length_type, axis=1)
df.drop(columns=["sleeve_length_applicable"], inplace=True)


In [8]:
# Determine if woven_structure is applicable to each family
# A family is applicable if it has at least one non-null woven_structure value
family_woven_structure_applicable = df.groupby("family")["woven_structure"].apply(
    lambda x: x.notna().any()
).to_dict()

df["woven_structure_applicable"] = df["family"].map(family_woven_structure_applicable).astype(int)

def clean_woven_structure(row):
    ws = row["woven_structure"]
    applicable = row["woven_structure_applicable"]

    if pd.notnull(ws):
        return ws                         # Real woven structure from metadata
    else:
        if applicable == 1:
            return "MISSING_VALUE"        # Should exist but missing
        else:
            return "NOT_APPLICABLE"       # Attribute irrelevant for this category

df["woven_structure"] = df.apply(clean_woven_structure, axis=1)
df.drop(columns=["woven_structure_applicable"], inplace=True)

In [9]:
# Determine if knit_structure is applicable to each family
# A family is applicable if it has at least one non-null knit_structure value
family_knit_structure_applicable = df.groupby("family")["knit_structure"].apply(
    lambda x: x.notna().any()
).to_dict()

df["knit_structure_applicable"] = df["family"].map(family_knit_structure_applicable).astype(int)

def clean_knit_structure(row):
    ks = row["knit_structure"]
    applicable = row["knit_structure_applicable"]

    if pd.notnull(ks):
        return ks                         # Real knit structure from metadata
    else:
        if applicable == 1:
            return "MISSING_VALUE"       # Should exist but missing
        else:
            return "NOT_APPLICABLE"      # Attribute irrelevant for this category

df["knit_structure"] = df.apply(clean_knit_structure, axis=1)
df.drop(columns=["knit_structure_applicable"], inplace=True)


In [10]:
# Create is_fall boolean column based on id_season
# Pattern: even id_season values are fall (1), odd values are not fall (0)
# Examples: 89->0, 88->1, 87->0, 86->1
df['is_fall'] = (df['id_season'] % 2 == 0).astype(int)


In [11]:
# Replace all missing values in print_type with "Sin Estampado"
df["print_type"] = df["print_type"].fillna("Sin Estampado")

In [12]:
# Create weeks_since_launch column
# Group by ID and id_season, then rank by num_week_iso (0 = launch week, 1 = second week, etc.)
df = df.sort_values(['ID', 'id_season', 'year', 'num_week_iso'])

# Works because the difference in num_week_iso is always 1 (same year) or 51 (different year)
# Create weeks_since_launch: rank within each (ID, id_season) group
# The smallest num_week_iso gets 0, next gets 1, etc.
df['weeks_since_launch'] = (df.groupby(['ID', 'id_season'])['num_week_iso'].rank(method='dense', ascending=True) - 1).astype(int)

In [None]:
import os
# Fix for Windows threadpoolctl/OpenBLAS issue
os.environ['OMP_NUM_THREADS'] = '1'

# Disable threadpoolctl to avoid OpenBLAS inspection error on Windows
try:
    from sklearn import config_context
    # Use config_context to disable threadpoolctl
    import sklearn.utils.fixes
    # Monkey patch threadpool_limits to be a no-op
    original_threadpool_limits = sklearn.utils.fixes.threadpool_limits
    
    class NoOpContext:
        def __enter__(self):
            return self
        def __exit__(self, *args):
            return False
    
    def patched_threadpool_limits(*args, **kwargs):
        return NoOpContext()
    
    sklearn.utils.fixes.threadpool_limits = patched_threadpool_limits
except Exception:
    pass  # If patching fails, continue anyway

# Color features - using color_name with k-means clustering instead of RGB
# We'll cluster color names using label encoding and k-means (k=15)
from sklearn.preprocessing import LabelEncoder

# Store color_name before dropping (we'll use it for clustering)
color_names_train = df["color_name"].copy()

In [None]:
# Color name clustering with k-means (k=15)
# Label encode color names for k-means
print("Creating color clusters from color_name...")
le_color = LabelEncoder()
color_encoded = le_color.fit_transform(color_names_train.fillna("UNKNOWN"))

# Reshape for k-means (1D array needs to be 2D)
color_encoded_2d = color_encoded.reshape(-1, 1)

# Apply k-means clustering on encoded color names
n_color_clusters = 15
kmeans_color = KMeans(n_clusters=n_color_clusters, n_init=10, random_state=42)
color_clusters = kmeans_color.fit_predict(color_encoded_2d)

# Add color cluster to dataframe
df["color_cluster"] = color_clusters.astype(int)

# Calculate distance to color cluster centroid
color_dists = kmeans_color.transform(color_encoded_2d)
df["color_cluster_dist"] = color_dists.min(axis=1)

print(f"Created {n_color_clusters} color clusters from color names")
print(f"Color clusters: {df['color_cluster'].nunique()} unique clusters")

In [None]:
# Drop RGB-related columns (R, G, B, color_rgb) - we're using color_name clustering instead
cols_to_drop = ["color_rgb"]
for col in ["R", "G", "B"]:
    if col in df.columns:
        cols_to_drop.append(col)
if len(cols_to_drop) > 1:
    df.drop(columns=cols_to_drop, inplace=True)
    print(f"Dropped RGB columns: {cols_to_drop}")
else:
    df.drop(columns=["color_rgb"], inplace=True)
    print("Dropped color_rgb column")

In [None]:
# Change all negative values for weekly_sales and weekly_demand to 0
df.loc[df["weekly_sales"] < 0, "weekly_sales"] = 0
df.loc[df["weekly_demand"] < 0, "weekly_demand"] = 0

# Add seasonality features: week 23 and Black Friday
# Black Friday is typically around week 47-48 in ISO week numbering (late November)
# Week 23 is typically around late May/early June (end of spring/start of summer sales)
print("\nAdding seasonality features...")

# Week 23 indicator (specific shopping period)
df["is_week_23"] = (df["num_week_iso"] == 23).astype(int)

# Black Friday indicator (typically weeks 47-48, we'll use 47 as it's more common)
# Also check if it's around late November (week 47-48 in most years)
df["is_black_friday"] = (df["num_week_iso"].isin([47, 48])).astype(int)

print(f"Week 23 occurrences: {df['is_week_23'].sum()}")
print(f"Black Friday occurrences: {df['is_black_friday'].sum()}")

# Remove low importance features based on feature_importance.csv
# Low importance features to remove:
low_importance_features = [
    "cluster_velocity_1_6",      # importance: 125204640.0
    "cluster_peak_week",          # importance: 1397215083.0
    "family_demand_trend",        # importance: 1442911771.0
    "cluster_popularity",         # importance: 1717375991.0
    "cluster_demand_slope",       # importance: 2034475197.0
    "cluster_season_growth",      # importance: 2698429847.0
    "aggregated_family",          # importance: 3451807769.0 (if exists)
    "cluster_yoy_change",         # importance: 3807712455.0
]

# Remove these features if they exist (they may be created later in the notebook)
features_to_remove = [f for f in low_importance_features if f in df.columns]
if features_to_remove:
    df.drop(columns=features_to_remove, inplace=True)
    print(f"\nRemoved low importance features: {features_to_remove}")
else:
    print("\nLow importance features will be removed after they are created (later in notebook)")


In [17]:
# Save will be done at the end of the notebook
# df.to_csv('train_processed.csv', index=False)


In [18]:
# CLUSTERING CODE - COMMENTED OUT
# # Determine optimal number of clusters using multiple methods
# from sklearn.metrics import silhouette_score
# import matplotlib.pyplot as plt
# 
# # Range of k values to test
# k_range = range(2, 21)  # Test k from 2 to 20
# inertias = []
# silhouette_scores = []
# 
# print("Evaluating different numbers of clusters...")
# for k in k_range:
#     kmeans_temp = KMeans(n_clusters=k, random_state=42, n_init=10)
#     labels = kmeans_temp.fit_predict(rgb_scaled)
#     
#     inertias.append(kmeans_temp.inertia_)
#     silhouette_scores.append(silhouette_score(rgb_scaled, labels))
#     
#     if k % 5 == 0:
#         print(f"  Completed k={k}...")
# 
# # Find optimal k based on silhouette score
# optimal_k_silhouette = k_range[np.argmax(silhouette_scores)]
# 
# print(f"\nOptimal k based on Silhouette Score: {optimal_k_silhouette} (score: {max(silhouette_scores):.3f})")
# 
# # Plot the results
# fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# 
# # Elbow Method (Inertia)
# axes[0].plot(k_range, inertias, 'bo-')
# axes[0].set_xlabel('Number of Clusters (k)')
# axes[0].set_ylabel('Within-Cluster Sum of Squares (Inertia)')
# axes[0].set_title('Elbow Method')
# axes[0].grid(True)
# 
# # Silhouette Score
# axes[1].plot(k_range, silhouette_scores, 'ro-')
# axes[1].axvline(x=optimal_k_silhouette, color='g', linestyle='--', label=f'Optimal k={optimal_k_silhouette}')
# axes[1].set_xlabel('Number of Clusters (k)')
# axes[1].set_ylabel('Silhouette Score')
# axes[1].set_title('Silhouette Score (higher is better)')
# axes[1].legend()
# axes[1].grid(True)
# 
# plt.tight_layout()
# plt.show()
# 
# # Print detailed scores for manual inspection
# print("\nDetailed scores:")
# print(f"{'k':<5} {'Inertia':<12} {'Silhouette':<12}")
# print("-" * 30)
# for i, k in enumerate(k_range):
#     print(f"{k:<5} {inertias[i]:<12.2f} {silhouette_scores[i]:<12.3f}")


In [19]:
# CLUSTERING CODE - COMMENTED OUT
# import matplotlib.pyplot as plt
# import numpy as np
# from sklearn.cluster import KMeans
# 
# # Convert centroids back to original RGB for plotting
# def get_centroids(k):
#     kmeans_temp = KMeans(n_clusters=k, random_state=42, n_init=10)
#     kmeans_temp.fit(rgb_scaled)
#     # inverse-transform to original RGB scale
#     return scaler.inverse_transform(kmeans_temp.cluster_centers_)
# 
# # Plot
# Ks_to_plot = [10, 11, 12, 13, 14, 15, 16]   # choose any set of k values
# n_rows = len(Ks_to_plot)
# 
# fig, axes = plt.subplots(n_rows, 1, figsize=(14, 2*n_rows))
# 
# if n_rows == 1:
#     axes = [axes]
# 
# for idx, k in enumerate(Ks_to_plot):
#     centroids = get_centroids(k)
#     
#     # Clip values to RGB range
#     centroids = np.clip(centroids, 0, 255).astype(int)
#     
#     # Build an image strip where each square is a centroid
#     color_strip = np.zeros((50, 50*k, 3), dtype=np.uint8)
#     for i, color in enumerate(centroids):
#         color_strip[:, i*50:(i+1)*50, :] = color
#     
#     axes[idx].imshow(color_strip)
#     axes[idx].set_title(f"Centroids for k={k}", fontsize=12)
#     axes[idx].axis('off')
# 
# plt.tight_layout()
# plt.show()

In [20]:
# CLUSTERING CODE - COMMENTED OUT
# # Perform clustering with optimal k (or choose manually based on the plots above)
# # You can use optimal_k_silhouette or choose your own value
# N_CLUSTERS = 15
# 
# print(f"Using k={N_CLUSTERS} clusters")
# 
# kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42, n_init=10)  
# df["color_cluster"] = kmeans.fit_predict(rgb_scaled)

In [None]:
cols_to_remove = ["heel_shape_type", "toecap_type", "archetype"]
df_test.drop(columns=cols_to_remove, inplace=True)

family_waist_applicable = df.groupby("family")["waist_type"].apply(
    lambda x: x.notna().any()
).to_dict()
df_test["waist_applicable"] = df_test["family"].map(family_waist_applicable).astype(int)
df_test["waist_type"] = df_test.apply(clean_waist_type, axis=1)
df_test.drop(columns=["waist_applicable"], inplace=True)

family_length_applicable = df.groupby("family")["length_type"].apply(
    lambda x: x.notna().any()
).to_dict()
df_test["length_applicable"] = df_test["family"].map(family_length_applicable).astype(int)
df_test["length_type"] = df_test.apply(clean_length_type, axis=1)
df_test.drop(columns=["length_applicable"], inplace=True)

family_silhouette_applicable = df.groupby("family")["silhouette_type"].apply(
    lambda x: x.notna().any()
).to_dict()
df_test["silhouette_applicable"] = df_test["family"].map(family_silhouette_applicable).astype(int)
df_test["silhouette_type"] = df_test.apply(clean_silhouette_type, axis=1)
df_test.drop(columns=["silhouette_applicable"], inplace=True)

family_neck_lapel_applicable = df.groupby("family")["neck_lapel_type"].apply(
    lambda x: x.notna().any()
).to_dict()
df_test["neck_lapel_applicable"] = df_test["family"].map(family_neck_lapel_applicable).astype(int)
df_test["neck_lapel_type"] = df_test.apply(clean_neck_lapel_type, axis=1)
df_test.drop(columns=["neck_lapel_applicable"], inplace=True)

family_sleeve_length_applicable = df.groupby("family")["sleeve_length_type"].apply(
    lambda x: x.notna().any()
).to_dict()
df_test["sleeve_length_applicable"] = df_test["family"].map(family_sleeve_length_applicable).astype(int)
df_test["sleeve_length_type"] = df_test.apply(clean_sleeve_length_type, axis=1)
df_test.drop(columns=["sleeve_length_applicable"], inplace=True)

family_woven_structure_applicable = df.groupby("family")["woven_structure"].apply(
    lambda x: x.notna().any()
).to_dict()
df_test["woven_structure_applicable"] = df_test["family"].map(family_woven_structure_applicable).astype(int)
df_test["woven_structure"] = df_test.apply(clean_woven_structure, axis=1)
df_test.drop(columns=["woven_structure_applicable"], inplace=True)

family_knit_structure_applicable = df.groupby("family")["knit_structure"].apply(
    lambda x: x.notna().any()
).to_dict()
df_test["knit_structure_applicable"] = df_test["family"].map(family_knit_structure_applicable).astype(int)
df_test["knit_structure"] = df_test.apply(clean_knit_structure, axis=1)
df_test.drop(columns=["knit_structure_applicable"], inplace=True)

df_test['is_fall'] = (df_test['id_season'] % 2 == 0).astype(int)

df_test["print_type"] = df_test["print_type"].fillna("Sin Estampado")

# Color clustering for test data - use the same encoder and kmeans from train
print("Applying color clustering to test data...")
test_color_names = df_test["color_name"].fillna("UNKNOWN")

# Transform test colors using the same label encoder from train
# Handle unseen color names (not in train) by assigning to "UNKNOWN"
test_color_encoded = []
for color in test_color_names:
    try:
        test_color_encoded.append(le_color.transform([color])[0])
    except ValueError:
        # Color name not seen in training - assign to "UNKNOWN" encoding
        try:
            test_color_encoded.append(le_color.transform(["UNKNOWN"])[0])
        except ValueError:
            # If UNKNOWN doesn't exist, use 0 as fallback
            test_color_encoded.append(0)

test_color_encoded = np.array(test_color_encoded)
test_color_encoded_2d = test_color_encoded.reshape(-1, 1)

# Assign test items to color clusters using trained kmeans
test_color_clusters = kmeans_color.predict(test_color_encoded_2d)
df_test["color_cluster"] = test_color_clusters.astype(int)

# Calculate distance to color cluster centroid
test_color_dists = kmeans_color.transform(test_color_encoded_2d)
df_test["color_cluster_dist"] = test_color_dists.min(axis=1)

print(f"Assigned {len(df_test)} test items to color clusters")

# Drop RGB-related columns (R, G, B, color_rgb) - we're using color_name clustering instead
cols_to_drop = ["color_rgb"]
for col in ["R", "G", "B"]:
    if col in df_test.columns:
        cols_to_drop.append(col)
if len(cols_to_drop) > 1:
    df_test.drop(columns=cols_to_drop, inplace=True)
    print(f"Dropped RGB columns from test: {cols_to_drop}")
else:
    if "color_rgb" in df_test.columns:
        df_test.drop(columns=["color_rgb"], inplace=True)
        print("Dropped color_rgb column from test")

# Add seasonality features to test data (week 23 and Black Friday)
# Note: Test data doesn't have num_week_iso directly, but we'll add these features
# They will be set to 0 for now and can be calculated during prediction time based on actual weeks
print("\nAdding seasonality features to test data...")
# Note: For test data, these will be calculated during prediction when we know the actual week
# We'll set them to 0 for now (they'll be filled during prediction time in train_model.py)
# But let's check if num_week_iso exists in test data
if 'num_week_iso' in df_test.columns:
    df_test["is_week_23"] = (df_test["num_week_iso"] == 23).astype(int)
    df_test["is_black_friday"] = (df_test["num_week_iso"].isin([47, 48])).astype(int)
    print(f"Test - Week 23 occurrences: {df_test['is_week_23'].sum()}")
    print(f"Test - Black Friday occurrences: {df_test['is_black_friday'].sum()}")
else:
    # Will be added during prediction time in train_model.py
    df_test["is_week_23"] = 0
    df_test["is_black_friday"] = 0
    print("num_week_iso not in test data - seasonality features will be added during prediction")

In [22]:
# Save processed test data
# Note: Test data doesn't have weekly_sales, so we don't need weeks_since_launch for test
# weeks_since_launch will be created during prediction time for each week
# Save will be done at the end of the notebook
# df_test.to_csv('test_processed.csv', index=False)
# print("Processed test data saved to test_processed.csv")

In [23]:
import ast

def parse_embedding(x):
    """Robustly convert string/list/array embedding into np.ndarray."""
    if isinstance(x, np.ndarray):
        return x
    if isinstance(x, list):
        return np.array(x)
    if x is None:
        return None

    try:
        parsed = ast.literal_eval(str(x))
        return np.array(parsed)
    except Exception:
        return None


df["embedding_array"] = df["image_embedding"].apply(parse_embedding)
valid = df["embedding_array"].notna()

emb_matrix = np.vstack(df.loc[valid, "embedding_array"].values)
print("Embeddings shape:", emb_matrix.shape)

Embeddings shape: (95339, 512)


In [24]:
from sklearn.decomposition import PCA

# Standardize embeddings
scaler = StandardScaler()
emb_scaled = scaler.fit_transform(emb_matrix)

# Fit PCA with 83 components
n_components = 83
pca = PCA(n_components=n_components)
emb_pca = pca.fit_transform(emb_scaled)

# Add PCA features to dataframe
for i in range(n_components):
    df.loc[valid, f"emb_pca_{i+1}"] = emb_pca[:, i]
    df.loc[~valid, f"emb_pca_{i+1}"] = 0

print(f"Created {n_components} PCA features")

Created 83 PCA features


In [25]:
# Clustering with PCA
n_clusters = 22 # optimal number with elbow method and silhouette score
print(f"Using {n_clusters} clusters")

kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
clusters = kmeans.fit_predict(emb_pca)

df.loc[valid, "emb_cluster"] = clusters
df.loc[~valid, "emb_cluster"] = -1
df["emb_cluster"] = df["emb_cluster"].astype(int)

# Distance to centroid
dists = kmeans.transform(emb_pca)
df.loc[valid, "emb_dist"] = dists.min(axis=1)
df.loc[~valid, "emb_dist"] = -1

Using 22 clusters


In [26]:
# Step 1: Compute cluster-level trend features using train data only
# velocity_1_3: average sales per week during first 3 weeks of product life
print("Computing cluster-level trend features from train data...")

# Filter train data for first 3 weeks (weeks_since_launch: 0, 1, 2)
train_first_3_weeks = df[df['weeks_since_launch'] < 3].copy()

# Compute velocity_1_3 for each cluster
cluster_velocity = train_first_3_weeks.groupby('emb_cluster')['weekly_sales'].mean().reset_index()
cluster_velocity.columns = ['emb_cluster', 'velocity_1_3']

# Add cluster -1 with overall average if it doesn't exist (for invalid embeddings)
overall_velocity = train_first_3_weeks['weekly_sales'].mean()
if -1 not in cluster_velocity['emb_cluster'].values:
    cluster_velocity = pd.concat([
        cluster_velocity,
        pd.DataFrame([{'emb_cluster': -1, 'velocity_1_3': overall_velocity}])
    ], ignore_index=True)

print(f"Cluster trend features computed for {len(cluster_velocity)} clusters")
print(f"Velocity range: {cluster_velocity['velocity_1_3'].min():.2f} to {cluster_velocity['velocity_1_3'].max():.2f}")

# Step 2: Transform test embeddings using the same PCA and assign to clusters
print("\nProcessing test embeddings with trained PCA and KMeans...")

# Parse test embeddings
df_test["embedding_array"] = df_test["image_embedding"].apply(parse_embedding)
test_valid = df_test["embedding_array"].notna()

if test_valid.sum() > 0:
    # Stack valid test embeddings
    test_emb_matrix = np.vstack(df_test.loc[test_valid, "embedding_array"].values)
    
    # Apply the same scaler and PCA from train
    test_emb_scaled = scaler.transform(test_emb_matrix)
    test_emb_pca = pca.transform(test_emb_scaled)
    
    # Assign test items to clusters using trained KMeans
    test_clusters = kmeans.predict(test_emb_pca)
    
    # Add PCA features to test dataframe
    for i in range(n_components):
        df_test.loc[test_valid, f"emb_pca_{i+1}"] = test_emb_pca[:, i]
        df_test.loc[~test_valid, f"emb_pca_{i+1}"] = 0
    
    # Add cluster assignments to test dataframe
    df_test.loc[test_valid, "emb_cluster"] = test_clusters
    df_test.loc[~test_valid, "emb_cluster"] = -1
    df_test["emb_cluster"] = df_test["emb_cluster"].astype(int)
    
    # Compute distance to centroid for test items
    test_dists = kmeans.transform(test_emb_pca)
    df_test.loc[test_valid, "emb_dist"] = test_dists.min(axis=1)
    df_test.loc[~test_valid, "emb_dist"] = -1
    
    print(f"Assigned {test_valid.sum()} test items to clusters")
else:
    print("Warning: No valid embeddings found in test data")
    # Add PCA features with zeros
    for i in range(n_components):
        df_test[f"emb_pca_{i+1}"] = 0
    df_test["emb_cluster"] = -1
    df_test["emb_dist"] = -1

# Step 3: Attach cluster trend features to both train and test
print("\nAttaching cluster trend features to train and test data...")

# Merge velocity_1_3 to train
# If velocity_1_3 already exists in df, drop it first to avoid merge conflicts
if 'velocity_1_3' in df.columns:
    df = df.drop(columns=['velocity_1_3'])

df = df.merge(cluster_velocity, on='emb_cluster', how='left')
# Check for any missing clusters (shouldn't happen if all embeddings are valid)
if df['velocity_1_3'].isna().any():
    overall_velocity = train_first_3_weeks['weekly_sales'].mean()
    df['velocity_1_3'] = df['velocity_1_3'].fillna(overall_velocity)
    print(f"Warning: Some clusters missing velocity, filled with overall average: {overall_velocity:.2f}")

# Merge velocity_1_3 to test
# If velocity_1_3 already exists in df_test, drop it first to avoid merge conflicts
if 'velocity_1_3' in df_test.columns:
    df_test = df_test.drop(columns=['velocity_1_3'])

df_test = df_test.merge(cluster_velocity, on='emb_cluster', how='left')

# Check for any missing clusters (may happen if test has new clusters)
if df_test['velocity_1_3'].isna().any():
    overall_velocity = train_first_3_weeks['weekly_sales'].mean()
    missing_count = df_test['velocity_1_3'].isna().sum()
    print(f"Warning: {missing_count} test items have missing velocity, filling with overall average: {overall_velocity:.2f}")
    df_test['velocity_1_3'] = df_test['velocity_1_3'].fillna(overall_velocity)

print(f"Train data: velocity_1_3 range {df['velocity_1_3'].min():.2f} to {df['velocity_1_3'].max():.2f}")
print(f"Test data: velocity_1_3 range {df_test['velocity_1_3'].min():.2f} to {df_test['velocity_1_3'].max():.2f}")

print("\nCluster trend feature implementation complete!")
print(f"Added feature: velocity_1_3 (average sales per week during first 3 weeks)")
print(f"This feature reflects how similar style groups (clusters) performed historically.")


Computing cluster-level trend features from train data...
Cluster trend features computed for 23 clusters
Velocity range: 459.79 to 1469.42

Processing test embeddings with trained PCA and KMeans...
Assigned 2250 test items to clusters

Attaching cluster trend features to train and test data...
Train data: velocity_1_3 range 459.79 to 1469.42
Test data: velocity_1_3 range 459.79 to 1469.42

Cluster trend feature implementation complete!
Added feature: velocity_1_3 (average sales per week during first 3 weeks)
This feature reflects how similar style groups (clusters) performed historically.


  df_test.loc[test_valid, f"emb_pca_{i+1}"] = test_emb_pca[:, i]
  df_test.loc[test_valid, "emb_cluster"] = test_clusters
  df_test.loc[test_valid, "emb_dist"] = test_dists.min(axis=1)


In [27]:
# Engineer trend score feature based on similarity to top/bottom performers
# This captures whether a product's style is similar to historically successful or unsuccessful products

from sklearn.metrics.pairwise import cosine_similarity

print("Engineering trend score feature...")

# Step 1: Compute total weekly_sales per product (sum across all weeks and seasons)
print("Computing total sales per product...")
product_sales = df.groupby('ID')['weekly_sales'].sum().reset_index()
product_sales.columns = ['ID', 'total_weekly_sales']

# Step 2: Identify top 20% and bottom 20% products by total sales
top_threshold = product_sales['total_weekly_sales'].quantile(0.8)
bottom_threshold = product_sales['total_weekly_sales'].quantile(0.2)

top_product_ids = product_sales[product_sales['total_weekly_sales'] >= top_threshold]['ID'].values
bottom_product_ids = product_sales[product_sales['total_weekly_sales'] <= bottom_threshold]['ID'].values

print(f"Top 20% threshold: {top_threshold:.2f} ({len(top_product_ids)} products)")
print(f"Bottom 20% threshold: {bottom_threshold:.2f} ({len(bottom_product_ids)} products)")

# Step 3: Get unique product rows (one per ID) with their PCA embeddings
# Take first occurrence of each product to get its embedding
train_products = df.groupby('ID').first().reset_index()

# Step 4: Compute centroid embeddings for top and bottom performers
pca_cols = [f'emb_pca_{i+1}' for i in range(n_components)]

top_train_products = train_products[train_products['ID'].isin(top_product_ids)]
bottom_train_products = train_products[train_products['ID'].isin(bottom_product_ids)]

# Compute mean embedding for top performers
top_centroid = top_train_products[pca_cols].mean().values
bottom_centroid = bottom_train_products[pca_cols].mean().values

print(f"Top centroid computed from {len(top_train_products)} products")
print(f"Bottom centroid computed from {len(bottom_train_products)} products")

# Step 5: Compute similarity to centroids for all train products
print("\nComputing similarities for train products...")

def compute_similarity_to_centroid(embedding_values, centroid):
    """Compute cosine similarity between product embedding and centroid."""
    # Convert to numpy array if it's a pandas Series
    if hasattr(embedding_values, 'values'):
        embedding_arr = embedding_values.values
    else:
        embedding_arr = np.array(embedding_values)
    
    if embedding_arr is None or np.isnan(embedding_arr).any():
        return 0.0
    # Reshape to 2D arrays for sklearn
    embedding_2d = embedding_arr.reshape(1, -1)
    centroid_2d = centroid.reshape(1, -1)
    return cosine_similarity(embedding_2d, centroid_2d)[0][0]

# Get embeddings for train products
train_embeddings = train_products[pca_cols]

# Compute similarities
train_products['sim_to_top'] = train_embeddings.apply(
    lambda row: compute_similarity_to_centroid(row, top_centroid), axis=1
)
train_products['sim_to_bottom'] = train_embeddings.apply(
    lambda row: compute_similarity_to_centroid(row, bottom_centroid), axis=1
)

# Step 6: Merge similarities back to full train dataframe
# Drop existing columns if they exist to avoid merge conflicts
for col in ['sim_to_top', 'sim_to_bottom', 'trend_score']:
    if col in df.columns:
        df = df.drop(columns=[col])

df = df.merge(train_products[['ID', 'sim_to_top', 'sim_to_bottom']], on='ID', how='left')

# Fill any missing values with 0 (shouldn't happen)
df['sim_to_top'] = df['sim_to_top'].fillna(0)
df['sim_to_bottom'] = df['sim_to_bottom'].fillna(0)

# Step 7: Compute trend_score for train
df['trend_score'] = df['sim_to_top'] - df['sim_to_bottom']

print(f"Train: trend_score range {df['trend_score'].min():.3f} to {df['trend_score'].max():.3f}")
print(f"Train: Positive trend_score (emerging) count: {(df['trend_score'] > 0).sum()}")
print(f"Train: Negative trend_score (declining) count: {(df['trend_score'] < 0).sum()}")

# Step 8: Compute similarities for test products
print("\nComputing similarities for test products...")

# Get unique test products (one per ID)
test_products = df_test.groupby('ID').first().reset_index()
test_embeddings = test_products[pca_cols]

# Compute similarities for test
test_products['sim_to_top'] = test_embeddings.apply(
    lambda row: compute_similarity_to_centroid(row, top_centroid), axis=1
)
test_products['sim_to_bottom'] = test_embeddings.apply(
    lambda row: compute_similarity_to_centroid(row, bottom_centroid), axis=1
)

# Merge back to full test dataframe
# Drop existing columns if they exist to avoid merge conflicts
for col in ['sim_to_top', 'sim_to_bottom', 'trend_score']:
    if col in df_test.columns:
        df_test = df_test.drop(columns=[col])

df_test = df_test.merge(test_products[['ID', 'sim_to_top', 'sim_to_bottom']], on='ID', how='left')

# Ensure columns exist and fill any missing values with 0
if 'sim_to_top' not in df_test.columns:
    df_test['sim_to_top'] = 0
else:
    df_test['sim_to_top'] = df_test['sim_to_top'].fillna(0)

if 'sim_to_bottom' not in df_test.columns:
    df_test['sim_to_bottom'] = 0
else:
    df_test['sim_to_bottom'] = df_test['sim_to_bottom'].fillna(0)

# Compute trend_score for test
df_test['trend_score'] = df_test['sim_to_top'] - df_test['sim_to_bottom']

print(f"Test: trend_score range {df_test['trend_score'].min():.3f} to {df_test['trend_score'].max():.3f}")
print(f"Test: Positive trend_score (emerging) count: {(df_test['trend_score'] > 0).sum()}")
print(f"Test: Negative trend_score (declining) count: {(df_test['trend_score'] < 0).sum()}")

print("\nTrend score feature engineering complete!")
print("Added features: sim_to_top, sim_to_bottom, trend_score")
print("trend_score = sim_to_top - sim_to_bottom")
print("  Positive → emerging trend (similar to top performers)")
print("  Negative → declining trend (similar to bottom performers)")


Engineering trend score feature...
Computing total sales per product...
Top 20% threshold: 14666.80 (1969 products)
Bottom 20% threshold: 1112.00 (1973 products)
Top centroid computed from 1969 products
Bottom centroid computed from 1973 products

Computing similarities for train products...
Train: trend_score range -1.212 to 1.162
Train: Positive trend_score (emerging) count: 50609
Train: Negative trend_score (declining) count: 44730

Computing similarities for test products...
Test: trend_score range -1.161 to 0.986
Test: Positive trend_score (emerging) count: 1037
Test: Negative trend_score (declining) count: 1213

Trend score feature engineering complete!
Added features: sim_to_top, sim_to_bottom, trend_score
trend_score = sim_to_top - sim_to_bottom
  Positive → emerging trend (similar to top performers)
  Negative → declining trend (similar to bottom performers)


In [28]:
# Advanced feature engineering
# Cluster, similarity, and family-level features

from scipy import stats
from sklearn.neighbors import NearestNeighbors

print("=" * 60)
print("Engineering Advanced Features")
print("=" * 60)

# ========== CLUSTER-LEVEL FEATURES ==========
print("\n1. Cluster-level features...")

# Cluster velocity 1-6 (mean sales per week during first 6 weeks)
train_first_6_weeks = df[df['weeks_since_launch'] < 6].copy()
cluster_velocity_1_6 = train_first_6_weeks.groupby('emb_cluster')['weekly_sales'].mean().reset_index()
cluster_velocity_1_6.columns = ['emb_cluster', 'cluster_velocity_1_6']

# Cluster demand last season (mean weekly_sales for each cluster in previous season) - MEMORY OPTIMIZED
# For each row, get demand of same cluster in previous season (id_season - 1)
print("Computing cluster demand last season...")

# Compute means per cluster and season
cluster_season_means = df.groupby(['emb_cluster', 'id_season'])['weekly_sales'].mean().reset_index()
cluster_season_means.columns = ['emb_cluster', 'id_season', 'cluster_season_mean']

# Compute overall cluster means as fallback
cluster_overall_means = df.groupby('emb_cluster')['weekly_sales'].mean().reset_index()
cluster_overall_means.columns = ['emb_cluster', 'cluster_overall_mean']

# Create previous season mapping
cluster_season_prev = cluster_season_means.copy()
cluster_season_prev['id_season'] = cluster_season_prev['id_season'] + 1
cluster_season_prev.columns = ['emb_cluster', 'id_season', 'cluster_demand_last_season']

# Merge to get previous season demand
cluster_demand_last_season_df = df[['emb_cluster', 'id_season']].drop_duplicates().merge(
    cluster_season_prev[['emb_cluster', 'id_season', 'cluster_demand_last_season']],
    on=['emb_cluster', 'id_season'],
    how='left'
)

# Fill missing with cluster overall mean
cluster_demand_last_season_df = cluster_demand_last_season_df.merge(
    cluster_overall_means,
    on='emb_cluster',
    how='left'
)
cluster_demand_last_season_df['cluster_demand_last_season'] = cluster_demand_last_season_df['cluster_demand_last_season'].fillna(
    cluster_demand_last_season_df['cluster_overall_mean']
)
cluster_demand_last_season_df = cluster_demand_last_season_df[['emb_cluster', 'id_season', 'cluster_demand_last_season']]
# Drop existing column if it exists to avoid merge conflicts
if 'cluster_demand_last_season' in df.columns:
    df = df.drop(columns=['cluster_demand_last_season'])

df = df.merge(cluster_demand_last_season_df, on=['emb_cluster', 'id_season'], how='left')
# Fill missing with cluster overall mean - VECTORIZED (no apply)
if 'cluster_demand_last_season' in df.columns:
    # Fill missing values using vectorized operations
    cluster_overall_mean = df.groupby('emb_cluster')['weekly_sales'].mean().to_dict()
    overall_mean_value = df['weekly_sales'].mean()
    df['cluster_demand_last_season'] = df['cluster_demand_last_season'].fillna(
        df['emb_cluster'].map(cluster_overall_mean).fillna(overall_mean_value)
    )
else:
    # If column doesn't exist after merge, create it
    cluster_overall_mean = df.groupby('emb_cluster')['weekly_sales'].mean().to_dict()
    overall_mean_value = df['weekly_sales'].mean()
    df['cluster_demand_last_season'] = df['emb_cluster'].map(cluster_overall_mean).fillna(overall_mean_value)

# Cluster demand slope across weeks (linear regression slope)
# For each cluster, compute slope of weekly_sales vs weeks_since_launch
cluster_slopes = []
for cluster in df['emb_cluster'].unique():
    cluster_data = df[df['emb_cluster'] == cluster].copy()
    if len(cluster_data) > 1:
        weeks = cluster_data['weeks_since_launch'].values
        sales = cluster_data['weekly_sales'].values
        if len(weeks) > 1 and weeks.std() > 0:
            slope, _, _, _, _ = stats.linregress(weeks, sales)
        else:
            slope = 0
    else:
        slope = 0
    cluster_slopes.append({'emb_cluster': cluster, 'cluster_demand_slope': slope})

cluster_slopes_df = pd.DataFrame(cluster_slopes)
# Drop existing column if it exists
if 'cluster_demand_slope' in df.columns:
    df = df.drop(columns=['cluster_demand_slope'])

df = df.merge(cluster_slopes_df, on='emb_cluster', how='left')
if 'cluster_demand_slope' in df.columns:
    df['cluster_demand_slope'] = df['cluster_demand_slope'].fillna(0)
else:
    df['cluster_demand_slope'] = 0

# Cluster popularity (# unique products per cluster)
cluster_popularity = df.groupby('emb_cluster')['ID'].nunique().reset_index()
cluster_popularity.columns = ['emb_cluster', 'cluster_popularity']
# Drop existing column if it exists
if 'cluster_popularity' in df.columns:
    df = df.drop(columns=['cluster_popularity'])

df = df.merge(cluster_popularity, on='emb_cluster', how='left')

# Cluster season-on-season growth - MEMORY OPTIMIZED
# Compute growth rate between consecutive seasons for each cluster
print("Computing cluster season-on-season growth...")

# Use the cluster_season_means we already computed
cluster_season_means = df.groupby(['emb_cluster', 'id_season'])['weekly_sales'].mean().reset_index()
cluster_season_means.columns = ['emb_cluster', 'id_season', 'cluster_season_mean']
cluster_season_means = cluster_season_means.sort_values(['emb_cluster', 'id_season'])

# Compute previous season mean by shifting within each cluster
cluster_season_means['prev_season_mean'] = cluster_season_means.groupby('emb_cluster')['cluster_season_mean'].shift(1)

# Compute growth
cluster_season_means['cluster_season_growth'] = np.where(
    cluster_season_means['prev_season_mean'] > 0,
    (cluster_season_means['cluster_season_mean'] - cluster_season_means['prev_season_mean']) / 
    cluster_season_means['prev_season_mean'],
    np.where(cluster_season_means['cluster_season_mean'] > 0, 1.0, 0)
)

# Fill first season in each cluster with 0 (no previous season)
cluster_season_means['cluster_season_growth'] = cluster_season_means['cluster_season_growth'].fillna(0)

cluster_growth_df = cluster_season_means[['emb_cluster', 'id_season', 'cluster_season_growth']].copy()
# Drop existing column if it exists
if 'cluster_season_growth' in df.columns:
    df = df.drop(columns=['cluster_season_growth'])

df = df.merge(cluster_growth_df, on=['emb_cluster', 'id_season'], how='left')
if 'cluster_season_growth' in df.columns:
    df['cluster_season_growth'] = df['cluster_season_growth'].fillna(0)
else:
    df['cluster_season_growth'] = 0

# Cluster demand Y/Y change (year-over-year trend) - MEMORY OPTIMIZED
# Compare same season across different years
print("Computing cluster Y/Y change...")
df['season_type'] = (df['id_season'] % 2).astype(int)  # 0 or 1

# Compute season_type means per cluster, year, and season_type
cluster_yoy_means = df.groupby(['emb_cluster', 'year', 'season_type'])['weekly_sales'].mean().reset_index()
cluster_yoy_means.columns = ['emb_cluster', 'year', 'season_type', 'cluster_season_mean']

# Merge current year means
cluster_yoy_computed = df[['emb_cluster', 'ID', 'id_season', 'year', 'season_type']].copy()
cluster_yoy_computed = cluster_yoy_computed.merge(
    cluster_yoy_means, 
    on=['emb_cluster', 'year', 'season_type'], 
    how='left'
)

# Merge previous year means
cluster_yoy_means_prev = cluster_yoy_means.copy()
cluster_yoy_means_prev['year'] = cluster_yoy_means_prev['year'] + 1
cluster_yoy_means_prev.columns = ['emb_cluster', 'year', 'season_type', 'cluster_season_mean_prev']
cluster_yoy_computed = cluster_yoy_computed.merge(
    cluster_yoy_means_prev[['emb_cluster', 'year', 'season_type', 'cluster_season_mean_prev']],
    on=['emb_cluster', 'year', 'season_type'],
    how='left'
)

# Fill missing previous year means with current year mean
cluster_yoy_computed['cluster_season_mean_prev'] = cluster_yoy_computed['cluster_season_mean_prev'].fillna(
    cluster_yoy_computed['cluster_season_mean']
)

# Compute Y/Y change
cluster_yoy_computed['cluster_yoy_change'] = np.where(
    cluster_yoy_computed['cluster_season_mean_prev'] > 0,
    (cluster_yoy_computed['cluster_season_mean'] - cluster_yoy_computed['cluster_season_mean_prev']) / 
    cluster_yoy_computed['cluster_season_mean_prev'],
    0
)

cluster_yoy_df = cluster_yoy_computed[['emb_cluster', 'ID', 'id_season', 'cluster_yoy_change']].copy()

# Clean up temporary columns
df = df.drop(columns=['season_type'])
# Drop existing column if it exists
if 'cluster_yoy_change' in df.columns:
    df = df.drop(columns=['cluster_yoy_change'])

df = df.merge(cluster_yoy_df, on=['emb_cluster', 'ID', 'id_season'], how='left')
if 'cluster_yoy_change' in df.columns:
    df['cluster_yoy_change'] = df['cluster_yoy_change'].fillna(0)
else:
    df['cluster_yoy_change'] = 0

# Cluster sell-through curve (peak week)
# For each cluster, find the week with highest average sales
cluster_peak_week = df.groupby(['emb_cluster', 'weeks_since_launch'])['weekly_sales'].mean().reset_index()
cluster_peak_week = cluster_peak_week.loc[cluster_peak_week.groupby('emb_cluster')['weekly_sales'].idxmax()]
cluster_peak_week = cluster_peak_week[['emb_cluster', 'weeks_since_launch']].copy()
cluster_peak_week.columns = ['emb_cluster', 'cluster_peak_week']
# Drop existing column if it exists
if 'cluster_peak_week' in df.columns:
    df = df.drop(columns=['cluster_peak_week'])

df = df.merge(cluster_peak_week, on='emb_cluster', how='left')
if 'cluster_peak_week' in df.columns:
    df['cluster_peak_week'] = df['cluster_peak_week'].fillna(df['weeks_since_launch'].median())
else:
    df['cluster_peak_week'] = df['weeks_since_launch'].median()

# Merge velocity_1_6 to df
# Drop existing column if it exists
if 'cluster_velocity_1_6' in df.columns:
    df = df.drop(columns=['cluster_velocity_1_6'])

df = df.merge(cluster_velocity_1_6, on='emb_cluster', how='left')
overall_velocity_1_6 = train_first_6_weeks['weekly_sales'].mean()
if 'cluster_velocity_1_6' in df.columns:
    df['cluster_velocity_1_6'] = df['cluster_velocity_1_6'].fillna(overall_velocity_1_6)
else:
    df['cluster_velocity_1_6'] = overall_velocity_1_6

print(f"  ✓ Cluster velocity 1-6")
print(f"  ✓ Cluster demand last season")
print(f"  ✓ Cluster demand slope")
print(f"  ✓ Cluster popularity")
print(f"  ✓ Cluster season-on-season growth")
print(f"  ✓ Cluster Y/Y change")
print(f"  ✓ Cluster peak week")

# ========== SIMILARITY-TO-PREVIOUS-PRODUCTS FEATURES ==========
print("\n2. Similarity-to-previous-products features...")

# Get unique products with their embeddings (PCA features) - MEMORY OPTIMIZED
pca_cols = [f'emb_pca_{i+1}' for i in range(n_components)]
train_products_unique = df.groupby('ID').first().reset_index()

# Prepare embeddings for similarity computation - use float32 to save memory
train_products_embeddings = train_products_unique[pca_cols].fillna(0).astype(np.float32).values

# Pre-compute product demand statistics to avoid repeated queries
print("Pre-computing product statistics...")
product_stats = df.groupby('ID').agg({
    'weekly_sales': ['mean', 'median']
}).reset_index()
product_stats.columns = ['ID', 'product_demand_mean', 'product_demand_median']

product_velocity = df[df['weeks_since_launch'] < 3].groupby('ID')['weekly_sales'].mean().reset_index()
product_velocity.columns = ['ID', 'product_velocity_1_3']

# Get overall stats for fallback
overall_mean = df['weekly_sales'].mean()
overall_median = df['weekly_sales'].median()
overall_velocity = df[df['weeks_since_launch'] < 3]['weekly_sales'].mean()

# For each product, find similar products (using KNN on embeddings) - MEMORY OPTIMIZED
# Use 10 nearest neighbors, process in batches
n_neighbors = min(10, len(train_products_unique) - 1)
batch_size = 100  # Process in batches to save memory

if n_neighbors > 0 and len(train_products_unique) > 1:
    print(f"Computing similarities for {len(train_products_unique)} products in batches of {batch_size}...")
    knn = NearestNeighbors(n_neighbors=n_neighbors + 1, metric='cosine', algorithm='brute')
    knn.fit(train_products_embeddings)
    
    # Get similar products for each product in batches
    similar_product_features = []
    product_ids_array = train_products_unique['ID'].values  # Pre-extract for faster access
    
    for batch_start in range(0, len(train_products_unique), batch_size):
        batch_end = min(batch_start + batch_size, len(train_products_unique))
        batch_embeddings = train_products_embeddings[batch_start:batch_end]
        
        # Get neighbors for this batch
        distances, indices = knn.kneighbors(batch_embeddings)
        
        for i, idx in enumerate(range(batch_start, batch_end)):
            product_id = product_ids_array[idx]
            
            # Exclude self (first neighbor is itself)
            similar_indices = indices[i][1:]
            similar_product_ids = product_ids_array[similar_indices]
            
            # Get demand statistics from pre-computed stats
            similar_stats = product_stats[product_stats['ID'].isin(similar_product_ids)]
            similar_velocities = product_velocity[product_velocity['ID'].isin(similar_product_ids)]
            
            if len(similar_stats) > 0:
                similar_demand_mean = similar_stats['product_demand_mean'].mean()
                similar_demand_median = similar_stats['product_demand_median'].median()
                
                if len(similar_velocities) > 0:
                    similar_velocity_1_3 = similar_velocities['product_velocity_1_3'].mean()
                else:
                    similar_velocity_1_3 = similar_demand_mean
            else:
                similar_demand_mean = overall_mean
                similar_demand_median = overall_median
                similar_velocity_1_3 = overall_velocity
            
            similar_product_features.append({
                'ID': product_id,
                'similar_product_demand_mean': similar_demand_mean,
                'similar_product_demand_median': similar_demand_median,
                'similar_product_velocity_1_3': similar_velocity_1_3
            })
        
        if (batch_start // batch_size + 1) % 10 == 0:
            print(f"  Processed {batch_end}/{len(train_products_unique)} products...")
    
    similar_features_df = pd.DataFrame(similar_product_features)
    
    # Merge to full dataframe
    # Drop existing columns if they exist
    for col in ['similar_product_demand_mean', 'similar_product_demand_median', 'similar_product_velocity_1_3']:
        if col in df.columns:
            df = df.drop(columns=[col])
    
    df = df.merge(similar_features_df, on='ID', how='left')
    # Fill missing with overall means
    if 'similar_product_demand_mean' in df.columns:
        df['similar_product_demand_mean'] = df['similar_product_demand_mean'].fillna(df['weekly_sales'].mean())
    else:
        df['similar_product_demand_mean'] = df['weekly_sales'].mean()
    
    if 'similar_product_demand_median' in df.columns:
        df['similar_product_demand_median'] = df['similar_product_demand_median'].fillna(df['weekly_sales'].median())
    else:
        df['similar_product_demand_median'] = df['weekly_sales'].median()
    
    if 'similar_product_velocity_1_3' in df.columns:
        df['similar_product_velocity_1_3'] = df['similar_product_velocity_1_3'].fillna(
            df[df['weeks_since_launch'] < 3]['weekly_sales'].mean()
        )
    else:
        df['similar_product_velocity_1_3'] = df[df['weeks_since_launch'] < 3]['weekly_sales'].mean()
else:
    # Fallback if not enough products
    overall_mean = df['weekly_sales'].mean()
    overall_median = df['weekly_sales'].median()
    overall_velocity = df[df['weeks_since_launch'] < 3]['weekly_sales'].mean()
    df['similar_product_demand_mean'] = overall_mean
    df['similar_product_demand_median'] = overall_median
    df['similar_product_velocity_1_3'] = overall_velocity

print(f"  ✓ Similar product demand mean")
print(f"  ✓ Similar product demand median")
print(f"  ✓ Similar product velocity 1-3")

# ========== FAMILY-LEVEL TREND FEATURES ==========
print("\n3. Family-level trend features...")

# Family velocity 1-3 last season - MEMORY OPTIMIZED
print("Computing family velocity 1-3 last season...")

# Compute family velocity for first 3 weeks per family and season
family_first_3 = df[df['weeks_since_launch'] < 3]
family_season_velocity = family_first_3.groupby(['family', 'id_season'])['weekly_sales'].mean().reset_index()
family_season_velocity.columns = ['family', 'id_season', 'family_velocity_1_3']

# Compute overall family velocity for first 3 weeks as fallback
family_overall_velocity_stats = family_first_3.groupby('family')['weekly_sales'].mean().reset_index()
family_overall_velocity_stats.columns = ['family', 'family_overall_velocity']

# Create previous season mapping
family_velocity_prev = family_season_velocity.copy()
family_velocity_prev['id_season'] = family_velocity_prev['id_season'] + 1
family_velocity_prev.columns = ['family', 'id_season', 'family_velocity_1_3_last_season']

# Merge to get previous season velocity
family_velocity_last_season_df = df[['family', 'id_season']].drop_duplicates().merge(
    family_velocity_prev[['family', 'id_season', 'family_velocity_1_3_last_season']],
    on=['family', 'id_season'],
    how='left'
)

# Fill missing with family overall velocity
family_velocity_last_season_df = family_velocity_last_season_df.merge(
    family_overall_velocity_stats,
    on='family',
    how='left'
)
family_velocity_last_season_df['family_velocity_1_3_last_season'] = family_velocity_last_season_df['family_velocity_1_3_last_season'].fillna(
    family_velocity_last_season_df['family_overall_velocity']
)
family_velocity_last_season_df = family_velocity_last_season_df[['family', 'id_season', 'family_velocity_1_3_last_season']]
# Drop existing column if it exists
if 'family_velocity_1_3_last_season' in df.columns:
    df = df.drop(columns=['family_velocity_1_3_last_season'])

df = df.merge(family_velocity_last_season_df, on=['family', 'id_season'], how='left')
# Fill missing with family overall mean for first 3 weeks - VECTORIZED (no apply)
if 'family_velocity_1_3_last_season' in df.columns:
    # Fill missing values using vectorized operations
    family_overall_velocity = df[df['weeks_since_launch'] < 3].groupby('family')['weekly_sales'].mean().to_dict()
    overall_mean_value = df['weekly_sales'].mean()
    df['family_velocity_1_3_last_season'] = df['family_velocity_1_3_last_season'].fillna(
        df['family'].map(family_overall_velocity).fillna(overall_mean_value)
    )
else:
    family_overall_velocity = df[df['weeks_since_launch'] < 3].groupby('family')['weekly_sales'].mean().to_dict()
    overall_mean_value = df['weekly_sales'].mean()
    df['family_velocity_1_3_last_season'] = df['family'].map(family_overall_velocity).fillna(overall_mean_value)

# Family demand mean last season - MEMORY OPTIMIZED
print("Computing family demand mean last season...")

# Compute family means per season
family_season_means = df.groupby(['family', 'id_season'])['weekly_sales'].mean().reset_index()
family_season_means.columns = ['family', 'id_season', 'family_season_mean']

# Compute overall family means as fallback
family_overall_means = df.groupby('family')['weekly_sales'].mean().reset_index()
family_overall_means.columns = ['family', 'family_overall_mean']

# Create previous season mapping
family_season_prev = family_season_means.copy()
family_season_prev['id_season'] = family_season_prev['id_season'] + 1
family_season_prev.columns = ['family', 'id_season', 'family_demand_mean_last_season']

# Merge to get previous season demand
family_demand_last_season_df = df[['family', 'id_season']].drop_duplicates().merge(
    family_season_prev[['family', 'id_season', 'family_demand_mean_last_season']],
    on=['family', 'id_season'],
    how='left'
)

# Fill missing with family overall mean
family_demand_last_season_df = family_demand_last_season_df.merge(
    family_overall_means,
    on='family',
    how='left'
)
family_demand_last_season_df['family_demand_mean_last_season'] = family_demand_last_season_df['family_demand_mean_last_season'].fillna(
    family_demand_last_season_df['family_overall_mean']
)
family_demand_last_season_df = family_demand_last_season_df[['family', 'id_season', 'family_demand_mean_last_season']]
# Drop existing column if it exists
if 'family_demand_mean_last_season' in df.columns:
    df = df.drop(columns=['family_demand_mean_last_season'])

df = df.merge(family_demand_last_season_df, on=['family', 'id_season'], how='left')
# Fill missing with family overall mean - VECTORIZED (no apply)
if 'family_demand_mean_last_season' in df.columns:
    # Fill missing values using vectorized operations
    family_overall_mean = df.groupby('family')['weekly_sales'].mean().to_dict()
    overall_mean_value = df['weekly_sales'].mean()
    df['family_demand_mean_last_season'] = df['family_demand_mean_last_season'].fillna(
        df['family'].map(family_overall_mean).fillna(overall_mean_value)
    )
else:
    family_overall_mean = df.groupby('family')['weekly_sales'].mean().to_dict()
    overall_mean_value = df['weekly_sales'].mean()
    df['family_demand_mean_last_season'] = df['family'].map(family_overall_mean).fillna(overall_mean_value)

# Family demand trend (slope over last 4 seasons) - MEMORY OPTIMIZED
print("Computing family demand trend...")

# Use the family_season_means we already computed
family_season_means = df.groupby(['family', 'id_season'])['weekly_sales'].mean().reset_index()
family_season_means.columns = ['family', 'id_season', 'family_season_mean']
family_season_means = family_season_means.sort_values(['family', 'id_season'])

# Compute trend for each family using last 4 seasons
family_trends = []
for family in family_season_means['family'].unique():
    family_seasons = family_season_means[family_season_means['family'] == family].sort_values('id_season')
    recent_seasons = family_seasons.tail(4)  # Last 4 seasons
    
    if len(recent_seasons) > 1:
        x = np.arange(len(recent_seasons))
        y = recent_seasons['family_season_mean'].values
        slope, _, _, _, _ = stats.linregress(x, y)
    else:
        slope = 0
    
    family_trends.append({'family': family, 'family_demand_trend': slope})

family_trend_map = pd.DataFrame(family_trends)

# Merge trend to all family-season combinations
family_trend_df = df[['family', 'id_season']].drop_duplicates().merge(
    family_trend_map,
    on='family',
    how='left'
)
family_trend_df['family_demand_trend'] = family_trend_df['family_demand_trend'].fillna(0)
# Drop existing column if it exists
if 'family_demand_trend' in df.columns:
    df = df.drop(columns=['family_demand_trend'])

df = df.merge(family_trend_df, on=['family', 'id_season'], how='left')
if 'family_demand_trend' in df.columns:
    df['family_demand_trend'] = df['family_demand_trend'].fillna(0)
else:
    df['family_demand_trend'] = 0

print(f"  ✓ Family velocity 1-3 last season")
print(f"  ✓ Family demand mean last season")
print(f"  ✓ Family demand trend")

print("\n" + "=" * 60)
print("Advanced feature engineering complete!")
print("=" * 60)


Engineering Advanced Features

1. Cluster-level features...
Computing cluster demand last season...
Computing cluster season-on-season growth...
Computing cluster Y/Y change...
  ✓ Cluster velocity 1-6
  ✓ Cluster demand last season
  ✓ Cluster demand slope
  ✓ Cluster popularity
  ✓ Cluster season-on-season growth
  ✓ Cluster Y/Y change
  ✓ Cluster peak week

2. Similarity-to-previous-products features...
Pre-computing product statistics...
Computing similarities for 9843 products in batches of 100...
  Processed 1000/9843 products...
  Processed 2000/9843 products...
  Processed 3000/9843 products...
  Processed 4000/9843 products...
  Processed 5000/9843 products...
  Processed 6000/9843 products...
  Processed 7000/9843 products...
  Processed 8000/9843 products...
  Processed 9000/9843 products...
  ✓ Similar product demand mean
  ✓ Similar product demand median
  ✓ Similar product velocity 1-3

3. Family-level trend features...
Computing family velocity 1-3 last season...
Computi

In [None]:
# Compute features for test data using train-based statistics
# Test data doesn't have sales, so we use train data for cluster/family features and similarity

print("=" * 60)
print("Computing Features for Test Data")
print("=" * 60)

# ========== CLUSTER FEATURES FOR TEST ==========
print("\n1. Adding cluster features to test data...")

# Cluster velocity 1-6 (already computed from train, merge by cluster)
df_test = df_test.merge(cluster_velocity_1_6, on='emb_cluster', how='left')
overall_velocity_1_6 = train_first_6_weeks['weekly_sales'].mean()
df_test['cluster_velocity_1_6'] = df_test['cluster_velocity_1_6'].fillna(overall_velocity_1_6)

# Cluster demand last season - use last season from train for test clusters
# For test, we'll use the most recent season's cluster stats
latest_season = df['id_season'].max()
cluster_latest_season_demand = df[df['id_season'] == latest_season].groupby('emb_cluster')['weekly_sales'].mean().reset_index()
cluster_latest_season_demand.columns = ['emb_cluster', 'cluster_demand_last_season']
df_test = df_test.merge(cluster_latest_season_demand, on='emb_cluster', how='left')
cluster_overall_mean = df.groupby('emb_cluster')['weekly_sales'].mean().to_dict()
df_test['cluster_demand_last_season'] = df_test.apply(
    lambda row: cluster_overall_mean.get(row['emb_cluster'], df['weekly_sales'].mean()) 
    if pd.isna(row['cluster_demand_last_season']) 
    else row['cluster_demand_last_season'], axis=1
)

# Cluster demand slope (from train)
df_test = df_test.merge(cluster_slopes_df, on='emb_cluster', how='left')
df_test['cluster_demand_slope'] = df_test['cluster_demand_slope'].fillna(0)

# Cluster popularity (from train)
df_test = df_test.merge(cluster_popularity, on='emb_cluster', how='left')
df_test['cluster_popularity'] = df_test['cluster_popularity'].fillna(0)

# Cluster season-on-season growth - use latest growth from train
latest_growth = cluster_growth_df[cluster_growth_df['id_season'] == latest_season].copy()
if len(latest_growth) > 0:
    latest_growth = latest_growth[['emb_cluster', 'cluster_season_growth']]
    df_test = df_test.merge(latest_growth, on='emb_cluster', how='left')
else:
    df_test['cluster_season_growth'] = 0
df_test['cluster_season_growth'] = df_test['cluster_season_growth'].fillna(0)

# Cluster Y/Y change - use latest from train
latest_yoy = cluster_yoy_df[cluster_yoy_df['id_season'] == latest_season].groupby('emb_cluster')['cluster_yoy_change'].mean().reset_index()
df_test = df_test.merge(latest_yoy, on='emb_cluster', how='left')
df_test['cluster_yoy_change'] = df_test['cluster_yoy_change'].fillna(0)

# Cluster peak week (from train)
df_test = df_test.merge(cluster_peak_week, on='emb_cluster', how='left')
df_test['cluster_peak_week'] = df_test['cluster_peak_week'].fillna(df['weeks_since_launch'].median() if 'weeks_since_launch' in df_test.columns else 10)

print(f"  ✓ All cluster features added")

# ========== SIMILARITY FEATURES FOR TEST ==========
print("\n2. Computing similarity features for test products...")

# Get unique test products
test_products_unique = df_test.groupby('ID').first().reset_index()
test_products_embeddings = test_products_unique[pca_cols].fillna(0).values

# Find similar products from TRAIN data for each test product
if len(test_products_unique) > 0 and len(train_products_unique) > 0:
    # Use KNN on train embeddings to find similar train products for test products
    knn_test = NearestNeighbors(n_neighbors=min(10, len(train_products_unique)), metric='cosine')
    knn_test.fit(train_products_embeddings)  # Fit on train embeddings
    
    test_similar_features = []
    
    for idx, test_product_row in test_products_unique.iterrows():
        test_product_id = test_product_row['ID']
        test_product_embedding = test_products_embeddings[idx:idx+1]
        distances, indices = knn_test.kneighbors(test_product_embedding)
        
        # Get similar product IDs from train
        similar_indices = indices[0]
        similar_product_ids = train_products_unique.iloc[similar_indices]['ID'].values
        
        # Get demand statistics for similar products from TRAIN data
        similar_products_data = df[df['ID'].isin(similar_product_ids)]
        
        if len(similar_products_data) > 0:
            similar_demand_mean = similar_products_data['weekly_sales'].mean()
            similar_demand_median = similar_products_data['weekly_sales'].median()
            
            # Velocity 1-3 for similar products
            similar_first_3_weeks = similar_products_data[similar_products_data['weeks_since_launch'] < 3]
            if len(similar_first_3_weeks) > 0:
                similar_velocity_1_3 = similar_first_3_weeks['weekly_sales'].mean()
            else:
                similar_velocity_1_3 = similar_demand_mean
        else:
            similar_demand_mean = df['weekly_sales'].mean()
            similar_demand_median = df['weekly_sales'].median()
            similar_velocity_1_3 = df[df['weeks_since_launch'] < 3]['weekly_sales'].mean()
        
        test_similar_features.append({
            'ID': test_product_id,
            'similar_product_demand_mean': similar_demand_mean,
            'similar_product_demand_median': similar_demand_median,
            'similar_product_velocity_1_3': similar_velocity_1_3
        })
    
    test_similar_features_df = pd.DataFrame(test_similar_features)
    df_test = df_test.merge(test_similar_features_df, on='ID', how='left')
    
    # Fill missing with overall means from train
    df_test['similar_product_demand_mean'] = df_test['similar_product_demand_mean'].fillna(df['weekly_sales'].mean())
    df_test['similar_product_demand_median'] = df_test['similar_product_demand_median'].fillna(df['weekly_sales'].median())
    df_test['similar_product_velocity_1_3'] = df_test['similar_product_velocity_1_3'].fillna(
        df[df['weeks_since_launch'] < 3]['weekly_sales'].mean()
    )
else:
    # Fallback
    overall_mean = df['weekly_sales'].mean()
    overall_median = df['weekly_sales'].median()
    overall_velocity = df[df['weeks_since_launch'] < 3]['weekly_sales'].mean()
    df_test['similar_product_demand_mean'] = overall_mean
    df_test['similar_product_demand_median'] = overall_median
    df_test['similar_product_velocity_1_3'] = overall_velocity

print(f"  ✓ Similarity features computed using train data")

# ========== FAMILY FEATURES FOR TEST ==========
print("\n3. Adding family features to test data...")

# Family velocity 1-3 last season - use latest season from train
family_latest_velocity = family_velocity_last_season_df[family_velocity_last_season_df['id_season'] == latest_season].copy()
if len(family_latest_velocity) > 0:
    family_latest_velocity = family_latest_velocity[['family', 'family_velocity_1_3_last_season']]
    df_test = df_test.merge(family_latest_velocity, on='family', how='left')
else:
    df_test['family_velocity_1_3_last_season'] = df_test['family'].map(family_overall_velocity).fillna(df['weekly_sales'].mean())
df_test['family_velocity_1_3_last_season'] = df_test['family_velocity_1_3_last_season'].fillna(
    df_test['family'].map(family_overall_velocity).fillna(df['weekly_sales'].mean())
)

# Family demand mean last season - use latest season from train
family_latest_demand = family_demand_last_season_df[family_demand_last_season_df['id_season'] == latest_season].copy()
if len(family_latest_demand) > 0:
    family_latest_demand = family_latest_demand[['family', 'family_demand_mean_last_season']]
    df_test = df_test.merge(family_latest_demand, on='family', how='left')
else:
    df_test['family_demand_mean_last_season'] = df_test['family'].map(family_overall_mean).fillna(df['weekly_sales'].mean())
df_test['family_demand_mean_last_season'] = df_test['family_demand_mean_last_season'].fillna(
    df_test['family'].map(family_overall_mean).fillna(df['weekly_sales'].mean())
)

# Family demand trend - use latest trend from train
family_latest_trend = family_trend_df[family_trend_df['id_season'] == latest_season].copy()
if len(family_latest_trend) > 0:
    family_latest_trend = family_latest_trend[['family', 'family_demand_trend']]
    df_test = df_test.merge(family_latest_trend, on='family', how='left')
else:
    df_test['family_demand_trend'] = 0
df_test['family_demand_trend'] = df_test['family_demand_trend'].fillna(0)

print(f"  ✓ All family features added")

# Save updated dataframes (excluding embedding columns for faster save)
print("\n" + "=" * 60)
print("Saving final processed dataframes (excluding embeddings)...")

# Exclude embedding columns
embedding_cols_to_exclude = ['image_embedding', 'embedding_array'] + [f'emb_pca_{i+1}' for i in range(n_components)]
train_cols_to_save = [col for col in df.columns if col not in embedding_cols_to_exclude]
test_cols_to_save = [col for col in df_test.columns if col not in embedding_cols_to_exclude]

df[train_cols_to_save].to_csv('train_processed.csv', index=False)
df_test[test_cols_to_save].to_csv('test_processed.csv', index=False)

print("✓ train_processed.csv saved successfully!")
print("✓ test_processed.csv saved successfully!")
print(f"  Excluded {len(embedding_cols_to_exclude)} embedding columns for faster processing")

print("\n" + "=" * 60)
print("All feature engineering complete!")
print("=" * 60)


Computing Features for Test Data

1. Adding cluster features to test data...
  ✓ All cluster features added

2. Computing similarity features for test products...
Computing test similarities in batches of 100...
  Processed 1000/2250 test products...
  Processed 2000/2250 test products...
  ✓ Similarity features computed using train data

3. Adding family features to test data...
  ✓ All family features added

Saving final processed dataframes (excluding embeddings)...
✓ train_processed.csv saved successfully!
✓ test_processed.csv saved successfully!
  Excluded 85 embedding columns for faster processing

All feature engineering complete!
