# In [None]:
import pandas as pd
import numpy as np
import pickle
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from copy import deepcopy
import warnings

# Silence every warning category for the rest of the run.
warnings.filterwarnings('ignore')

# --- Configuration ---
# Input artefacts (absolute paths on the author's machine).
TRAIN_CSV_PATH = '/Users/visheshbishnoi/Desktop/amazon/code/train_with_engineered_features.csv'
TRAIN_TEXT_EMBEDDINGS_PATH = '/Users/visheshbishnoi/Desktop/amazon/code/train_text_embeddings_CUSTOM.npy'
TRAIN_IMG_EMBEDDINGS_PATH = '/Users/visheshbishnoi/Desktop/amazon/code/train_image_embeddings_siglip_with_id .npy'

# Cross-validation layout and PCA sizes for the two embedding families.
N_SPLITS = 5
TEXT_PCA_COMPONENTS = 128
IMG_PCA_COMPONENTS = 128

# Prefer CUDA when PyTorch can see a GPU, otherwise stay on the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# --- Model Hyperparameters ---
# LightGBM: L1 objective, always trained on the CPU.
lgbm_params = {
    'objective': 'regression_l1',
    'metric': 'mae',
    'n_estimators': 2500,
    'learning_rate': 0.01,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'lambda_l1': 0.1,
    'lambda_l2': 0.1,
    'num_leaves': 40,
    'verbose': -1,
    'n_jobs': -1,
    'seed': 42,
    'boosting_type': 'gbdt',
    'device': 'cpu',
}

# XGBoost: squared-error objective monitored with MAE; device follows DEVICE.
xgb_params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'mae',
    'n_estimators': 2500,
    'learning_rate': 0.01,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0.1,
    'lambda': 0.1,
    'alpha': 0.1,
    'max_depth': 7,
    'n_jobs': -1,
    'seed': 42,
    'tree_method': 'hist',
    'early_stopping_rounds': 100,
    'device': 'cuda' if DEVICE.type != 'cpu' else 'cpu',
}

# CatBoost: MAE loss; task type follows DEVICE.
cat_params = {
    'iterations': 2500,
    'learning_rate': 0.01,
    'depth': 8,
    'loss_function': 'MAE',
    'eval_metric': 'MAE',
    'random_seed': 42,
    'verbose': 0,
    'allow_writing_files': False,
    'task_type': 'GPU' if DEVICE.type != 'cpu' else 'CPU',
}

# MLP training loop settings (optimizer, early stopping, LR scheduler).
mlp_params = {
    'batch_size': 256,
    'epochs': 100,
    'learning_rate': 1e-3,
    'weight_decay': 0.01,
    'patience': 5,
    'scheduler_patience': 3,
}


# --- SMAPE Metric ---
def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Each element contributes ``|y_pred - y_true| / ((|y_true| + |y_pred|) / 2)``;
    elements where both values are zero contribute 0 instead of 0/0.

    Args:
        y_true: Array-like of ground-truth values.
        y_pred: Array-like of predictions, broadcastable against ``y_true``.

    Returns:
        The mean of the per-element ratios multiplied by 100.
    """
    # Coerce up front so plain lists/tuples work as well as ndarrays.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    # Divide only where the denominator is non-zero; the remaining slots keep
    # the 0 from `out`, so no divide-by-zero warning is ever raised.
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return np.mean(ratio) * 100

# --- Load and Prepare All Data ---
print("Loading and preparing all data...")
train_df = pd.read_csv(TRAIN_CSV_PATH)

# Text embeddings are a plain array; only the row count can be checked here,
# so the rows are assumed to be in the same order as the CSV.
print("Loading text embeddings from .npy file...")
aligned_text_embed = np.load(TRAIN_TEXT_EMBEDDINGS_PATH)

if len(train_df) != len(aligned_text_embed):
    raise ValueError(f"Row count mismatch between CSV ({len(train_df)}) and text embedding .npy ({len(aligned_text_embed)}).")

# Load and align image embeddings
# SECURITY NOTE: allow_pickle=True unpickles arbitrary objects; only load
# embedding files that come from a trusted source.
print("Loading image embeddings from .npy file...")
train_img_embed_data = np.load(TRAIN_IMG_EMBEDDINGS_PATH, allow_pickle=True)
# Keep only well-formed records: dicts carrying both an id and an embedding.
id_to_img_embedding_map = {
    item['sample_id']: item['embedding']
    for item in train_img_embed_data
    if isinstance(item, dict) and 'sample_id' in item and 'embedding' in item
}
if not id_to_img_embedding_map:
    raise ValueError(f"Could not find any valid image embeddings in '{TRAIN_IMG_EMBEDDINGS_PATH}'.")
img_embedding_dim = next(iter(id_to_img_embedding_map.values())).shape[0]

# Align embeddings to the CSV row order. Rows without an embedding fall back
# to a zero vector; count them so an id mismatch does not go unnoticed.
zero_img_embedding = np.zeros(img_embedding_dim)
aligned_img_rows = []
n_missing_img = 0
for sid in train_df['sample_id']:
    img_vec = id_to_img_embedding_map.get(sid)
    if img_vec is None:
        img_vec = zero_img_embedding
        n_missing_img += 1
    aligned_img_rows.append(img_vec)
aligned_img_embed = np.array(aligned_img_rows)
if n_missing_img:
    print(f"Warning: {n_missing_img} of {len(train_df)} rows have no image embedding; using zero vectors for them.")

# Data preparation
# Accept either spelling of the target column, normalising to 'log_price'.
if 'log_price' not in train_df.columns and 'price_log' in train_df.columns:
    train_df.rename(columns={'price_log': 'log_price'}, inplace=True)
if 'log_price' not in train_df.columns:
    raise KeyError("Target column ('log_price' or 'price_log') not found.")

# Drop rows without a target from the frame and from both embedding arrays.
is_valid_mask = train_df['log_price'].notna()
valid_rows = is_valid_mask.to_numpy()  # plain boolean array for NumPy indexing
train_df = train_df[is_valid_mask].reset_index(drop=True)
train_text_embed = aligned_text_embed[valid_rows]
train_img_embed = aligned_img_embed[valid_rows]

numerical_features = ['value', 'ipq', 'value_per_item','is_organic', 'is_sugar_free','is_premium_keyword',
    'is_dietary_specific']
categorical_features = ['unit_standardized', 'brand_cleaned']

# Median-impute numerical gaps (median is taken over all remaining rows).
for col in numerical_features:
    train_df[col] = train_df[col].fillna(train_df[col].median())
train_numerical_df = train_df[numerical_features]
train_categorical_df = train_df[categorical_features].fillna('unknown').astype(str)
y = train_df['log_price'].values
# NOTE(review): expm1 assumes log_price was produced with log1p(price) - confirm.
y_true_price = np.expm1(y)

# Label Encode before CV loop
print("Label Encoding all categorical features before cross-validation...")
train_categorical_le_list = []
for col in categorical_features:
    le = LabelEncoder()
    encoded_col = le.fit_transform(train_categorical_df[col]).reshape(-1, 1)
    train_categorical_le_list.append(encoded_col)
# One integer column per categorical feature, in `categorical_features` order.
train_categorical_le = np.hstack(train_categorical_le_list)
print("Data loading and preparation complete.")


# --- PyTorch MLP Model Definition ---
class TabularDataset(Dataset):
    """Map-style dataset pairing feature rows with their targets.

    Both inputs are converted to float32 tensors once, at construction.
    """

    def __init__(self, X, y):
        """Store ``X`` (features) and ``y`` (targets) as float32 tensors."""
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        """Return the number of targets."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair at position ``idx``."""
        features = self.X[idx]
        target = self.y[idx]
        return features, target

class MLPModel(nn.Module):
    """Feed-forward regressor with two hidden blocks (512 and 256 units).

    Each hidden block is Linear -> BatchNorm1d -> ReLU -> Dropout(0.3); a
    final Linear layer maps to a single output per row.
    """

    def __init__(self, input_size):
        """Build the network for inputs with ``input_size`` features."""
        super().__init__()
        layers = []
        in_features = input_size
        for hidden_units in (512, 256):
            layers.append(nn.Linear(in_features, hidden_units))
            layers.append(nn.BatchNorm1d(hidden_units))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(0.3))
            in_features = hidden_units
        layers.append(nn.Linear(in_features, 1))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return predictions with the trailing size-1 dimension removed."""
        raw_output = self.model(x)
        return raw_output.squeeze(-1)


# --- Cross-Validation and Ensembling Loop ---
# Each model writes its validation-fold predictions (log-price space) into its
# own out-of-fold array, so every row is predicted exactly once.
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
oof_lgbm, oof_xgb, oof_cat, oof_mlp = [np.zeros(len(train_df)) for _ in range(4)]
print(f"\nStarting {N_SPLITS}-Fold CV for 4 models...")

for fold, (train_idx, val_idx) in enumerate(kf.split(train_df)):
    print(f"\n--- Fold {fold+1}/{N_SPLITS} ---")

    # Data Prep for this Fold
    # PCA is fitted on the training rows only and then applied to the
    # validation rows, so no validation information enters the projection.
    text_pca = PCA(n_components=TEXT_PCA_COMPONENTS, random_state=42)
    train_text_pca = text_pca.fit_transform(train_text_embed[train_idx])
    val_text_pca = text_pca.transform(train_text_embed[val_idx])
    img_pca = PCA(n_components=IMG_PCA_COMPONENTS, random_state=42)
    train_img_pca = img_pca.fit_transform(train_img_embed[train_idx])
    val_img_pca = img_pca.transform(train_img_embed[val_idx])
    train_cat_le_fold, val_cat_le_fold = train_categorical_le[train_idx], train_categorical_le[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Column layout: numerical | label-encoded categorical | text PCA | image PCA.
    # Concatenating separate frames keeps each group's dtype, so the encoded
    # categorical columns stay integer-typed for CatBoost's cat_features.
    X_train_tree = pd.concat([pd.DataFrame(train_numerical_df.iloc[train_idx].values), pd.DataFrame(train_cat_le_fold), pd.DataFrame(train_text_pca), pd.DataFrame(train_img_pca)], axis=1)
    X_val_tree = pd.concat([pd.DataFrame(train_numerical_df.iloc[val_idx].values), pd.DataFrame(val_cat_le_fold), pd.DataFrame(val_text_pca), pd.DataFrame(val_img_pca)], axis=1)
    # Positional string names replace the duplicate integer labels from concat.
    X_train_tree.columns = [str(i) for i in range(X_train_tree.shape[1])]
    X_val_tree.columns = [str(i) for i in range(X_val_tree.shape[1])]

    # Train LightGBM
    print("Training LightGBM...")
    model_lgbm = lgb.LGBMRegressor(**lgbm_params)
    model_lgbm.fit(X_train_tree, y_train, eval_set=[(X_val_tree, y_val)], callbacks=[lgb.early_stopping(100, verbose=False)])
    oof_lgbm[val_idx] = model_lgbm.predict(X_val_tree)

    # Train XGBoost (early stopping is configured through xgb_params)
    print("Training XGBoost...")
    model_xgb = xgb.XGBRegressor(**xgb_params)
    model_xgb.fit(X_train_tree, y_train, eval_set=[(X_val_tree, y_val)], verbose=False)
    oof_xgb[val_idx] = model_xgb.predict(X_val_tree)

    # Train CatBoost
    print("Training CatBoost...")
    # Categorical columns sit directly after the numerical ones.
    cat_feature_indices = list(range(train_numerical_df.shape[1], train_numerical_df.shape[1] + train_cat_le_fold.shape[1]))
    model_cat = cb.CatBoostRegressor(**cat_params)
    model_cat.fit(X_train_tree, y_train, cat_features=cat_feature_indices, eval_set=[(X_val_tree, y_val)], early_stopping_rounds=100, verbose=0)
    oof_cat[val_idx] = model_cat.predict(X_val_tree)

    # Train MLP
    print("Training MLP...")
    # Seed per fold so weight init, dropout and batch shuffling are
    # reproducible, matching the fixed seeds used by the tree models.
    torch.manual_seed(42 + fold)
    scaler = StandardScaler()
    X_train_nn = scaler.fit_transform(X_train_tree.values)
    X_val_nn = scaler.transform(X_val_tree.values)
    # BatchNorm1d raises in training mode on a single-sample batch, so drop
    # the trailing batch only when it would contain exactly one row.
    drop_single_tail = len(train_idx) % mlp_params['batch_size'] == 1
    train_loader = DataLoader(TabularDataset(X_train_nn, y_train), batch_size=mlp_params['batch_size'], shuffle=True, drop_last=drop_single_tail)
    val_loader = DataLoader(TabularDataset(X_val_nn, y_val), batch_size=mlp_params['batch_size'], shuffle=False)
    model_mlp = MLPModel(X_train_nn.shape[1]).to(DEVICE)
    criterion = nn.MSELoss()
    optimizer = torch.optim.AdamW(model_mlp.parameters(), lr=mlp_params['learning_rate'], weight_decay=mlp_params['weight_decay'])
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=mlp_params['scheduler_patience'])
    best_val_loss, patience_counter, best_model_state = float('inf'), 0, None
    for epoch in range(mlp_params['epochs']):
        # One optimisation pass over the training fold
        model_mlp.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model_mlp(batch_X.to(DEVICE))
            loss = criterion(outputs, batch_y.to(DEVICE))
            loss.backward()
            optimizer.step()

        # Validation loss: mean of the per-batch MSE values
        model_mlp.eval()
        current_val_loss = 0.0
        with torch.no_grad():
            for batch_X, batch_y in val_loader:
                outputs = model_mlp(batch_X.to(DEVICE))
                loss = criterion(outputs, batch_y.to(DEVICE))
                current_val_loss += loss.item()
        avg_val_loss = current_val_loss / len(val_loader)
        scheduler.step(avg_val_loss)

        # Early stopping: snapshot the best weights, stop after `patience`
        # consecutive epochs without improvement.
        if avg_val_loss < best_val_loss:
            best_val_loss, patience_counter, best_model_state = avg_val_loss, 0, deepcopy(model_mlp.state_dict())
        else:
            patience_counter += 1
        if patience_counter >= mlp_params['patience']:
            break

    # Restore the best snapshot (none exists if no epoch ever improved).
    if best_model_state is not None:
        model_mlp.load_state_dict(best_model_state)
    model_mlp.eval()
    with torch.no_grad():
        val_preds = [model_mlp(batch_X.to(DEVICE)).cpu().numpy() for batch_X, _ in val_loader]
    # val_loader is unshuffled, so batch order matches val_idx order.
    oof_mlp[val_idx] = np.concatenate(val_preds)

    print(f"Fold {fold+1} complete.")

# --- Final Results ---
_rule = "=" * 50
print("\n" + _rule)
print("Cross-Validation Complete. Calculating final scores...")
print(_rule)

# Predictions are in log space: map them back with expm1 before scoring
# against the expm1-transformed targets.
score_lgbm = smape(y_true_price, np.expm1(oof_lgbm))
score_xgb = smape(y_true_price, np.expm1(oof_xgb))
score_cat = smape(y_true_price, np.expm1(oof_cat))
score_mlp = smape(y_true_price, np.expm1(oof_mlp))

print("--- Individual Model OOF Scores ---")
print(f"LightGBM SMAPE: {score_lgbm:.4f}")
print(f"XGBoost SMAPE:  {score_xgb:.4f}")
print(f"CatBoost SMAPE: {score_cat:.4f}")
print(f"MLP SMAPE:      {score_mlp:.4f}")

# Equal-weight blend, averaged in log space before the inverse transform.
print("\n--- Ensemble Score ---")
ensemble_preds = (oof_lgbm + oof_xgb + oof_cat + oof_mlp) / 4
score_ensemble = smape(y_true_price, np.expm1(ensemble_preds))
print(f"Simple Averaging Ensemble SMAPE: {score_ensemble:.4f}")
print(_rule)

# --- NEW: Save OOF Predictions for Stacking ---
print("\n--- Saving OOF Predictions for Stacking ---")

# Each model's out-of-fold array goes to its own .npy file in the current
# working directory, in the same order as before.
_oof_outputs = (
    ('oof_lgbm_preds.npy', oof_lgbm),
    ('oof_xgb_preds.npy', oof_xgb),
    ('oof_cat_preds.npy', oof_cat),
    ('oof_mlp_preds.npy', oof_mlp),
)
for _oof_path, _oof_values in _oof_outputs:
    np.save(_oof_path, _oof_values)

print("OOF predictions saved successfully.")
print("Files created: oof_lgbm_preds.npy, oof_xgb_preds.npy, oof_cat_preds.npy, oof_mlp_preds.npy")
# --- END NEW SECTION ---