In [None]:
# SCRIPT: final_prediction_pipeline.py - FINAL VERSION

import pandas as pd
import numpy as np
import joblib
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import StandardScaler
import warnings

# Silence every warning category for the remainder of the process.
warnings.filterwarnings('ignore')

# --- Input / output locations ---
# The three input paths are absolute and machine-specific: check them against
# your own setup before running.
TEST_CSV_PATH = '/Users/visheshbishnoi/Desktop/amazon/code/test_with_engineered_features.csv'
TEST_TEXT_EMBEDDINGS_PATH = '/Users/visheshbishnoi/Desktop/amazon/code/test_text_embeddings_CUSTOM.npy'
TEST_IMG_EMBEDDINGS_PATH = '/Users/visheshbishnoi/Desktop/amazon/code/test_image_embeddings_siglip_with_id (1).npy'
# Name of the CSV written at the end of the pipeline.
SUBMISSION_FILE_PATH = 'test_out1.csv'

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# --- Serialized models and fitted transformers ---
# Keep this block free of non-breaking spaces (U+00A0): they were the cause of
# an earlier SyntaxError here.
BASE_MODEL_PATHS = dict(
    lgbm='final_model_lgbm.joblib',
    xgb='final_model_xgb.joblib',
    cat='final_model_cat.joblib',
    mlp_state='final_model_mlp.pt',
    mlp_scaler='final_scaler_nn.joblib',
)
META_MODEL_PATH = 'final_meta_model.joblib'

TRANSFORMER_PATHS = dict(
    text_pca='final_text_pca.joblib',
    img_pca='final_img_pca.joblib',
    label_encoders='final_label_encoders.joblib',
)

# --- Tabular columns read from the test CSV ---
numerical_features = [
    'value',
    'ipq',
    'value_per_item',
    'is_organic',
    'is_sugar_free',
    'is_premium_keyword',
    'is_dietary_specific',
]
categorical_features = [
    'unit_standardized',
    'brand_cleaned',
]


# --- PyTorch MLP Model Definition (MUST be identical to training) ---
class TabularDataset(Dataset):
    """Dataset that serves entries of a feature matrix as float32 tensors."""

    def __init__(self, X):
        """Convert ``X`` to a float32 tensor once and keep it on ``self.X``."""
        self.X = torch.tensor(X, dtype=torch.float32)

    def __len__(self):
        """Return the size of the first dimension of the stored tensor."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the stored tensor indexed by ``idx``."""
        return self.X[idx]

class MLPModel(nn.Module):
    def __init__(self, input_size):
        super(MLPModel, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, 512), nn.BatchNorm1d(512), nn.ReLU(), nn.Dropout(0.3),
            nn.Linear(512, 256), nn.BatchNorm1d(256), nn.ReLU(), nn.Dropout(0.3),
            nn.Linear(256, 1))
    def forward(self, x): return self.model(x).squeeze(-1)


def _align_image_embeddings(records, sample_ids, default_dim=768):
    """Return one image-embedding row per entry of ``sample_ids``.

    Only ``dict`` items of ``records`` that carry both a ``'sample_id'`` and an
    ``'embedding'`` key are used. Ids without an embedding get an all-zero
    vector whose length is taken from the first usable embedding, or
    ``default_dim`` when there is none. The number of zero-filled rows is
    printed so that a broken embedding file does not go unnoticed.
    """
    id_to_embedding = {
        item['sample_id']: item['embedding']
        for item in records
        if isinstance(item, dict) and 'sample_id' in item and 'embedding' in item
    }

    if id_to_embedding:
        embedding_dim = next(iter(id_to_embedding.values())).shape[0]
    else:
        embedding_dim = default_dim
        print(f"Warning: Image embedding map is empty. Using default IMG_EMBEDDING_DIM={embedding_dim}")

    zero_vector = np.zeros(embedding_dim)
    rows = []
    n_missing = 0
    for sid in sample_ids:
        embedding = id_to_embedding.get(sid)
        if embedding is None:
            embedding = zero_vector
            n_missing += 1
        rows.append(embedding)

    if n_missing:
        print(f"Warning: {n_missing} of {len(rows)} sample_ids have no image embedding; zero vectors were used.")

    return np.array(rows)


def _encode_categoricals(categorical_df, encoders, columns):
    """Label-encode ``columns`` of ``categorical_df`` with the fitted ``encoders``.

    A label found in ``encoders[col].classes_`` maps to its position there; any
    other label maps to ``len(classes_)``. Returns an int64 array with one
    column per entry of ``columns``.
    """
    encoded_columns = []
    for col in columns:
        classes = encoders[col].classes_
        code_of = {label: code for code, label in enumerate(classes)}
        unknown_code = len(classes)
        codes = np.fromiter(
            (code_of.get(value, unknown_code) for value in categorical_df[col]),
            dtype=np.int64,
            count=len(categorical_df),
        )
        encoded_columns.append(codes.reshape(-1, 1))
    return np.hstack(encoded_columns)


def _predict_mlp(model, features, batch_size=256):
    """Run ``model`` over ``features`` in batches and return a 1-D array of outputs.

    The caller is responsible for putting ``model`` in eval mode.
    """
    loader = DataLoader(TabularDataset(features), batch_size=batch_size, shuffle=False)
    chunks = []
    with torch.no_grad():
        for batch_X in loader:
            chunks.append(model(batch_X.to(DEVICE)).cpu().numpy())
    if not chunks:
        return np.empty(0, dtype=np.float32)
    return np.concatenate(chunks)


def generate_submission():
    """Run the stacked-ensemble inference pipeline and write the submission CSV.

    Steps:
      1. Load the test CSV, the text/image embeddings, the fitted transformers
         and every saved model.
      2. Impute, PCA-transform and label-encode the test features.
      3. Predict with the three tree models and the MLP (level 1).
      4. Feed those four predictions to the meta-model (level 2).
      5. Apply ``expm1`` to the meta-model output, clip at zero and write
         ``sample_id``/``price`` to ``SUBMISSION_FILE_PATH``.

    Returns:
        None. When a required file is missing, a message is printed and the
        function returns early without writing anything.

    Raises:
        ValueError: if the text-embedding array does not have one row per row
            of the test CSV.
    """
    print("--- Starting Final Prediction Pipeline ---")

    # --- 1. Load Data and Saved Components ---
    print("Loading Data and Saved Objects...")
    try:
        test_df = pd.read_csv(TEST_CSV_PATH)
        test_text_embed = np.load(TEST_TEXT_EMBEDDINGS_PATH)
        # NOTE(security): allow_pickle=True (like joblib.load below) can run
        # arbitrary code while loading; only use files from a trusted source.
        test_img_embed_data = np.load(TEST_IMG_EMBEDDINGS_PATH, allow_pickle=True)

        # Transformers and models
        text_pca = joblib.load(TRANSFORMER_PATHS['text_pca'])
        img_pca = joblib.load(TRANSFORMER_PATHS['img_pca'])
        le_encoders = joblib.load(TRANSFORMER_PATHS['label_encoders'])
        mlp_scaler = joblib.load(BASE_MODEL_PATHS['mlp_scaler'])

        final_meta_model = joblib.load(META_MODEL_PATH)
        final_model_lgbm = joblib.load(BASE_MODEL_PATHS['lgbm'])
        final_model_xgb = joblib.load(BASE_MODEL_PATHS['xgb'])
        final_model_cat = joblib.load(BASE_MODEL_PATHS['cat'])

        # Only the weights are read here; the network is built once the real
        # feature width is known (see section 3).
        mlp_state_dict = torch.load(BASE_MODEL_PATHS['mlp_state'], map_location=DEVICE)
    except FileNotFoundError as e:
        print(f"Error: Required file not found: {e}")
        print("Please ensure all model and transformer files are in the current directory.")
        return

    original_test_ids = test_df['sample_id']

    # Text embeddings are matched to the CSV purely by row position. Without
    # this check a length mismatch would be index-aligned by pd.concat below
    # and silently padded with NaN.
    if len(test_text_embed) != len(test_df):
        raise ValueError(
            f"Text embeddings have {len(test_text_embed)} rows but the test CSV has "
            f"{len(test_df)} rows; they must be aligned row-for-row."
        )

    # Image embeddings are matched by sample_id; missing ids are zero-filled.
    aligned_img_embed = _align_image_embeddings(test_img_embed_data, test_df['sample_id'])

    print(" All data and objects loaded successfully.")

    # --- 2. Test Data Preprocessing (Using TRAIN-fitted transformers) ---
    print("\n--- Preprocessing Test Data ---")

    # Imputation.
    # NOTE(review): the fill value is the median of the *test* column, not a
    # statistic saved from training - confirm this matches the training setup.
    for col in numerical_features:
        test_df[col] = test_df[col].fillna(test_df[col].median())

    test_numerical_df = test_df[numerical_features]
    test_categorical_df = test_df[categorical_features].fillna('unknown').astype(str)

    # Apply PCA (using fitted transformers)
    test_text_pca = text_pca.transform(test_text_embed)
    test_img_pca = img_pca.transform(aligned_img_embed)

    # Apply Label Encoding (unseen labels get the code len(classes_))
    print("Applying Label Encoding (handling unseen labels)...")
    test_categorical_le = _encode_categoricals(test_categorical_df, le_encoders, categorical_features)

    # Combine all features for tree models and for NN scaling. Column order is
    # numerical, categorical, text PCA, image PCA; names are "0", "1", ...
    X_test_full = pd.concat([pd.DataFrame(test_numerical_df.values),
                             pd.DataFrame(test_categorical_le),
                             pd.DataFrame(test_text_pca),
                             pd.DataFrame(test_img_pca)], axis=1)
    X_test_full.columns = [str(i) for i in range(X_test_full.shape[1])]
    print(f"Final test feature matrix shape: {X_test_full.shape}")

    # --- 3. Level 1: Generate 4 Base Predictions (Meta-Features) ---
    print("\n--- Level 1: Generating 4 Base Predictions (Log-Price) ---")
    meta_features_test = pd.DataFrame()

    # Tree Models
    meta_features_test['lgbm_pred'] = final_model_lgbm.predict(X_test_full)
    meta_features_test['xgb_pred'] = final_model_xgb.predict(X_test_full)
    meta_features_test['cat_pred'] = final_model_cat.predict(X_test_full)

    # MLP (NN). The input width comes from the matrix that is actually fed to
    # the network: PCA's `n_components` constructor argument may be a float,
    # None or 'mle', so it is not a reliable source for the layer size. A
    # mismatch with the saved weights surfaces in load_state_dict.
    final_model_mlp = MLPModel(X_test_full.shape[1]).to(DEVICE)
    final_model_mlp.load_state_dict(mlp_state_dict)
    final_model_mlp.eval()

    X_test_nn = mlp_scaler.transform(X_test_full.values)  # Use saved StandardScaler
    meta_features_test['mlp_pred'] = _predict_mlp(final_model_mlp, X_test_nn)

    print(f"Meta-features (Level 1 Predictions) shape: {meta_features_test.shape}")

    # --- 4. Level 2: Final Prediction using Meta-Model ---
    print("\n--- Level 2: Applying LightGBM Stacker (Final Log-Price Prediction) ---")
    final_log_price_preds = final_meta_model.predict(meta_features_test)

    # --- 5. Reverse Transformation and Submission ---
    # price = exp(log_price) - 1, then floor at 0 because a price cannot be negative.
    final_price_preds = np.clip(np.expm1(final_log_price_preds), 0, None)

    submission_df = pd.DataFrame({
        'sample_id': original_test_ids,
        'price': final_price_preds
    })

    submission_df.to_csv(SUBMISSION_FILE_PATH, index=False)

    print("\n" + "="*50)
    print("**Final Submission File Created Successfully!**")
    print(f"File: {SUBMISSION_FILE_PATH}")
    print(f"Total predictions made: {len(submission_df)}")
    print("="*50)

# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    generate_submission()