# In [None]:
import pandas as pd
import numpy as np
import pickle
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from copy import deepcopy
import joblib # Used for saving tree-based models and transformers
import warnings

# Silence every warning category for the rest of the run.
warnings.filterwarnings('ignore')

# --- Configuration (Must match your original script) ---
TRAIN_CSV_PATH = '/Users/visheshbishnoi/Desktop/amazon/code/train_with_engineered_features.csv'
TRAIN_TEXT_EMBEDDINGS_PATH = '/Users/visheshbishnoi/Desktop/amazon/code/train_text_embeddings_CUSTOM.npy'
TRAIN_IMG_EMBEDDINGS_PATH = '/Users/visheshbishnoi/Desktop/amazon/code/train_image_embeddings_siglip_with_id .npy'
TEXT_PCA_COMPONENTS = 128
IMG_PCA_COMPONENTS = 128

# Use CUDA when torch reports it as available, otherwise fall back to CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')

# Single flag reused below so XGBoost and CatBoost follow the same device choice.
_RUNS_ON_CPU = DEVICE.type == 'cpu'

# --- Model Hyperparameters (Must match your original script) ---
# Note: early-stopping related params were removed because training now uses the full data.
lgbm_params = {
    'objective': 'regression_l1',
    'metric': 'mae',
    'n_estimators': 2500,
    'learning_rate': 0.01,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'lambda_l1': 0.1,
    'lambda_l2': 0.1,
    'num_leaves': 40,
    'verbose': -1,
    'n_jobs': -1,
    'seed': 42,
    'boosting_type': 'gbdt',
    'device': 'cpu',
}
xgb_params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'mae',
    'n_estimators': 2500,
    'learning_rate': 0.01,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0.1,
    'lambda': 0.1,
    'alpha': 0.1,
    'max_depth': 7,
    'n_jobs': -1,
    'seed': 42,
    'tree_method': 'hist',
    'device': 'cpu' if _RUNS_ON_CPU else 'cuda',
}
cat_params = {
    'iterations': 2500,
    'learning_rate': 0.01,
    'depth': 8,
    'loss_function': 'MAE',
    'eval_metric': 'MAE',
    'random_seed': 42,
    'verbose': 0,
    'allow_writing_files': False,
    'task_type': 'CPU' if _RUNS_ON_CPU else 'GPU',
}
# Reduced epochs for full training.
mlp_params = {
    'batch_size': 256,
    'epochs': 15,
    'learning_rate': 1e-3,
    'weight_decay': 0.01,
}
# Note: For full training, MLP training complexity is reduced or patience removed.

# --- PyTorch MLP Model Definition (Reused) ---
class TabularDataset(Dataset):
    """In-memory dataset pairing a feature matrix with its regression targets.

    Both inputs are copied into float32 tensors at construction time, so
    indexing returns tensors rather than the original array rows.
    """

    def __init__(self, X, y):
        """Store features `X` and targets `y` as float32 tensors."""
        features = torch.tensor(X, dtype=torch.float32)
        targets = torch.tensor(y, dtype=torch.float32)
        self.X = features
        self.y = targets

    def __len__(self):
        """Return the number of samples, taken from the target tensor."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the `(features, target)` pair stored at position `idx`."""
        sample_features = self.X[idx]
        sample_target = self.y[idx]
        return sample_features, sample_target

class MLPModel(nn.Module):
    """Feed-forward regressor: two hidden blocks (512 and 256 units) and a scalar head.

    Each hidden block is Linear -> BatchNorm1d -> ReLU -> Dropout(0.3).
    The layers live in `self.model` in a fixed order; the attribute name and
    layer positions determine the state_dict keys (`model.0.weight`, ...),
    so they must not change if saved weights are to remain loadable.
    """

    def __init__(self, input_size):
        """Build the network for inputs with `input_size` features."""
        super().__init__()
        layers = []
        in_features = input_size
        for hidden_units in (512, 256):
            layers.extend([
                nn.Linear(in_features, hidden_units),
                nn.BatchNorm1d(hidden_units),
                nn.ReLU(),
                nn.Dropout(0.3),
            ])
            in_features = hidden_units
        layers.append(nn.Linear(in_features, 1))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return one prediction per row of `x`, with the trailing size-1 dim removed."""
        predictions = self.model(x)
        return predictions.squeeze(-1)

# --- 1. Load and Preprocess ALL Data (Reused logic) ---
print("Loading and preparing ALL data for final training...")
train_df = pd.read_csv(TRAIN_CSV_PATH)
aligned_text_embed = np.load(TRAIN_TEXT_EMBEDDINGS_PATH)
# NOTE: allow_pickle=True unpickles arbitrary objects; only load files you trust.
train_img_embed_data = np.load(TRAIN_IMG_EMBEDDINGS_PATH, allow_pickle=True)

# Text embeddings are matched to CSV rows purely by position, so a length
# mismatch means the rows cannot be aligned; fail early with a clear message.
if len(aligned_text_embed) != len(train_df):
    raise ValueError(
        f"Text embeddings have {len(aligned_text_embed)} rows but the training CSV has "
        f"{len(train_df)}; they must be aligned row-for-row."
    )

# Image embeddings are keyed by sample_id; entries that are not dicts or that
# lack either key are skipped.
id_to_img_embedding_map = {
    item['sample_id']: item['embedding']
    for item in train_img_embed_data
    if isinstance(item, dict) and 'sample_id' in item and 'embedding' in item
}
if not id_to_img_embedding_map:
    # Without this guard the next() call below fails with a bare StopIteration.
    raise ValueError(
        "No entries with 'sample_id' and 'embedding' keys were found in "
        f"{TRAIN_IMG_EMBEDDINGS_PATH}"
    )
img_embedding_dim = next(iter(id_to_img_embedding_map.values())).shape[0]

# Rows without an image embedding get a zero vector. Build it once and report
# how many rows are affected instead of substituting silently.
missing_img_embedding = np.zeros(img_embedding_dim)
num_missing_img = sum(sid not in id_to_img_embedding_map for sid in train_df['sample_id'])
if num_missing_img:
    print(f"Warning: {num_missing_img} of {len(train_df)} rows have no image embedding; using zero vectors.")
aligned_img_embed = np.array(
    [id_to_img_embedding_map.get(sid, missing_img_embedding) for sid in train_df['sample_id']]
)

# Accept the alternative target column name.
if 'log_price' not in train_df.columns and 'price_log' in train_df.columns:
    train_df.rename(columns={'price_log': 'log_price'}, inplace=True)

# Keep only rows with a target, applying the same mask to both embedding arrays.
is_valid_mask = train_df['log_price'].notna()
valid_rows = is_valid_mask.to_numpy()  # plain boolean ndarray for indexing the numpy arrays
train_df = train_df[is_valid_mask].reset_index(drop=True)
train_text_embed = aligned_text_embed[valid_rows]
train_img_embed = aligned_img_embed[valid_rows]

numerical_features = ['value', 'ipq', 'value_per_item', 'is_organic', 'is_sugar_free', 'is_premium_keyword',
                      'is_dietary_specific']
categorical_features = ['unit_standardized', 'brand_cleaned']
# Impute missing numerical values with the per-column training median.
# NOTE(review): these medians are not persisted anywhere in this script; confirm
# how the prediction script imputes the same columns.
for col in numerical_features:
    train_df[col] = train_df[col].fillna(train_df[col].median())
train_numerical_df = train_df[numerical_features]
train_categorical_df = train_df[categorical_features].fillna('unknown').astype(str)
y = train_df['log_price'].values

# Fit and Transform Preprocessing Objects on ALL data
print("Fitting PCA and Label Encoders on ALL data...")
text_pca = PCA(n_components=TEXT_PCA_COMPONENTS, random_state=42)
train_text_pca = text_pca.fit_transform(train_text_embed)
img_pca = PCA(n_components=IMG_PCA_COMPONENTS, random_state=42)
train_img_pca = img_pca.fit_transform(train_img_embed)

# Fit one LabelEncoder per categorical column and keep it for later reuse.
train_categorical_le_list = []
le_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    encoded_col = le.fit_transform(train_categorical_df[col]).reshape(-1, 1)
    train_categorical_le_list.append(encoded_col)
    le_encoders[col] = le
train_categorical_le = np.hstack(train_categorical_le_list)

# Combine ALL features into the final training matrix (X_train_full).
# Column order: numerical, label-encoded categorical, text PCA, image PCA.
# pd.concat (rather than np.hstack) keeps each part's own dtype, so the encoded
# categorical columns stay integer-typed.
X_train_full = pd.concat([pd.DataFrame(train_numerical_df.values),
                          pd.DataFrame(train_categorical_le),
                          pd.DataFrame(train_text_pca),
                          pd.DataFrame(train_img_pca)], axis=1)
X_train_full.columns = [str(i) for i in range(X_train_full.shape[1])]
# Positions of the categorical columns: they sit directly after the numerical block.
num_numerical = train_numerical_df.shape[1]
cat_feature_indices = list(range(num_numerical, num_numerical + train_categorical_le.shape[1]))
print(f"Final training matrix shape: {X_train_full.shape}")
print("-" * 50)

# --- 2. Train and Save Final Base Models (Level 1) ---

def _fit_and_dump(estimator, artifact_path, **fit_kwargs):
    """Fit `estimator` on the full training matrix, then persist it with joblib.

    Extra keyword arguments are forwarded unchanged to `estimator.fit`.
    """
    estimator.fit(X_train_full, y, **fit_kwargs)
    joblib.dump(estimator, artifact_path)


# --- A. LightGBM ---
print("Training and Saving Final LightGBM Model...")
final_model_lgbm = lgb.LGBMRegressor(**lgbm_params)
_fit_and_dump(final_model_lgbm, 'final_model_lgbm.joblib')
print("LightGBM saved.")

# --- B. XGBoost ---
print("Training and Saving Final XGBoost Model...")
final_model_xgb = xgb.XGBRegressor(**xgb_params)
_fit_and_dump(final_model_xgb, 'final_model_xgb.joblib', verbose=False)
print("XGBoost saved.")

# --- C. CatBoost ---
# Only CatBoost is told which columns are categorical; the other two models
# receive the label-encoded columns as ordinary numeric features.
print("Training and Saving Final CatBoost Model...")
final_model_cat = cb.CatBoostRegressor(**cat_params)
_fit_and_dump(final_model_cat, 'final_model_cat.joblib', cat_features=cat_feature_indices, verbose=0)
print("CatBoost saved.")

# --- D. MLP (Neural Network) ---
print("Training and Saving Final MLP Model and StandardScaler...")
# Fit StandardScaler on the *full* final feature matrix
scaler_nn = StandardScaler()
X_train_nn = scaler_nn.fit_transform(X_train_full.values)

# BatchNorm1d cannot compute batch statistics from a single sample in training
# mode and raises. With shuffling, the leftover batch has exactly one sample
# whenever len(y) % batch_size == 1, so drop the trailing batch in that case only.
mlp_batch_size = mlp_params['batch_size']
drop_singleton_batch = len(y) > mlp_batch_size and len(y) % mlp_batch_size == 1
train_loader = DataLoader(
    TabularDataset(X_train_nn, y),
    batch_size=mlp_batch_size,
    shuffle=True,
    drop_last=drop_singleton_batch,
)

final_model_mlp = MLPModel(X_train_nn.shape[1]).to(DEVICE)
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(
    final_model_mlp.parameters(),
    lr=mlp_params['learning_rate'],
    weight_decay=mlp_params['weight_decay'],
)

# Simple training loop for full data (no validation/early stopping)
for epoch in range(mlp_params['epochs']):
    final_model_mlp.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = final_model_mlp(batch_X.to(DEVICE))
        loss = criterion(outputs, batch_y.to(DEVICE))
        loss.backward()
        optimizer.step()
    print(f"MLP Epoch {epoch+1}/{mlp_params['epochs']} finished.")

# Save only the weights; loading them requires rebuilding MLPModel with the
# same input size. The scaler is saved alongside so inputs can be scaled identically.
torch.save(final_model_mlp.state_dict(), 'final_model_mlp.pt')
joblib.dump(scaler_nn, 'final_scaler_nn.joblib')
print("MLP model state dict and StandardScaler saved.")
print("-" * 50)

# --- 3. Save Preprocessing Objects ---
print("Saving final PCA transformers and Label Encoders...")
# Output filename -> fitted object, written in the listed order.
preprocessing_artifacts = {
    'final_text_pca.joblib': text_pca,
    'final_img_pca.joblib': img_pca,
    'final_label_encoders.joblib': le_encoders,
}
for artifact_path, fitted_object in preprocessing_artifacts.items():
    joblib.dump(fitted_object, artifact_path)
print("Preprocessing objects saved.")
print("\nALL FINAL BASE MODELS AND TRANSFORMERS ARE READY FOR TEST PREDICTION.")