# In [None]:

import os
import re
import gc
import urllib
import multiprocessing
import numpy as np
import pandas as pd
from tqdm import tqdm
from functools import partial
from PIL import Image
import joblib

# Machine Learning and Deep Learning Libraries
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, TensorDataset
from torchvision import models, transforms
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor, early_stopping, log_evaluation
from sklearn.metrics import mean_absolute_error

# Import Optuna for hyperparameter tuning
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)


# Configuration
class Config:
    """Paths, model identifiers and runtime settings shared by the whole script."""

    # --- Locations on disk ---
    DATA_PATH = "/kaggle/input/amazon-ml-final"  # directory containing train.csv / test.csv
    WORK_DIR = "/kaggle/working/work_dir"  # directory where output files are written
    TRAIN_IMG_DIR = "./train_images"
    TEST_IMG_DIR = "./test_images"

    # --- Models ---
    TEXT_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # produces 384-dim embeddings
    IMG_SIZE = 224  # side length (pixels) images are resized to before the CNN

    # --- Runtime ---
    # Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Create the output directory up front; exist_ok=True makes re-running the script harmless.
os.makedirs(Config.WORK_DIR, exist_ok=True)
# Report which device was selected in Config.
print(f"Using device: {Config.DEVICE}")

# Load Data
def load_data():
    """Read ``train.csv`` and ``test.csv`` from ``Config.DATA_PATH``.

    Prints the shape of both frames and the summary statistics of the
    training ``price`` column.

    Returns:
        tuple: ``(train_df, test_df)`` as pandas DataFrames.
    """
    train_csv = os.path.join(Config.DATA_PATH, "train.csv")
    test_csv = os.path.join(Config.DATA_PATH, "test.csv")

    train_df = pd.read_csv(train_csv)
    test_df = pd.read_csv(test_csv)

    print(f"Train shape: {train_df.shape}")
    print(f"Test shape: {test_df.shape}")

    price_summary = train_df["price"].describe()
    print(f"\nPrice statistics:\n{price_summary}")

    return train_df, test_df

# Read both CSVs once; everything below works on these two frames.
train_df, test_df = load_data()

# Expected local image paths
# NOTE: nothing is downloaded here. Each row is only mapped to
# "<image dir>/<sample_id>.jpg"; whether that file exists is checked later,
# when the image is actually loaded.
train_df["image_path"] = train_df["sample_id"].apply(lambda x: os.path.join(Config.TRAIN_IMG_DIR, f"{x}.jpg"))
test_df["image_path"] = test_df["sample_id"].apply(lambda x: os.path.join(Config.TEST_IMG_DIR, f"{x}.jpg"))

# Text Preprocessing & Feature Extraction
def preprocess_text(text):
    """Normalise a raw text value for embedding.

    Missing values (``None`` / NaN) become the empty string. Everything else
    is converted to ``str``, lower-cased, stripped of every character that is
    not ``a-z``, ``0-9`` or whitespace (each replaced by a space), and has
    runs of whitespace collapsed to a single space.

    Args:
        text: Any scalar value; typically a string or NaN.

    Returns:
        str: The cleaned text, without leading or trailing whitespace.
    """
    if pd.isna(text):
        return ""

    lowered = str(text).lower()
    # Replace anything that is not a lowercase letter, digit or whitespace.
    alnum_only = re.sub(r"[^a-z0-9\s]", " ", lowered)
    # Collapse the whitespace runs created above, then trim the ends.
    collapsed = re.sub(r"\s+", " ", alnum_only)
    return collapsed.strip()

class TextFeatureExtractor:
    """Turns raw text into dense sentence embeddings with a SentenceTransformer model."""

    # The constructor must be named ``__init__``; a plain ``init`` method is
    # never invoked by ``TextFeatureExtractor()`` and would leave ``self.model``
    # undefined.
    def __init__(self, model_name=Config.TEXT_MODEL):
        """Load the embedding model and move it to ``Config.DEVICE``.

        Args:
            model_name: Name or path passed to ``SentenceTransformer``.
        """
        print(f"Loading text model: {model_name}")
        self.model = SentenceTransformer(model_name)
        self.model.to(Config.DEVICE)

    def extract_features(self, texts, batch_size=32):
        """Embed an iterable of raw text values.

        Each value is first cleaned with ``preprocess_text`` (so missing
        values are embedded as the empty string).

        Args:
            texts: Iterable of raw text values.
            batch_size: Number of texts encoded per forward pass.

        Returns:
            The value returned by ``SentenceTransformer.encode`` with
            ``convert_to_numpy=True`` (one embedding row per input text).
        """
        cleaned = [preprocess_text(t) for t in texts]
        return self.model.encode(
            cleaned,
            batch_size=batch_size,
            show_progress_bar=True,
            convert_to_numpy=True,
        )

print("\nExtracting text embeddings...")
text_extractor = TextFeatureExtractor()
# Embed the catalog text of both splits with the same model instance.
train_text_features = text_extractor.extract_features(train_df["catalog_content"].values)
test_text_features = text_extractor.extract_features(test_df["catalog_content"].values)
print(f"Text features shape: {train_text_features.shape}")
# The text model is not used again: drop it, collect garbage and release
# cached CUDA memory before the next stage.
del text_extractor; gc.collect(); torch.cuda.empty_cache()

# Compress Features using Autoencoder
class AutoEncoder(nn.Module):
    """Two-layer MLP autoencoder used to compress feature vectors.

    The encoder maps ``input_dim -> 512 -> bottleneck_dim`` and the decoder
    mirrors it back to ``input_dim``; both use a single ReLU in between.
    """

    # ``__init__`` / ``super().__init__()`` are required: without them the
    # constructor arguments are rejected and nn.Module is never initialised,
    # so sub-modules cannot be registered.
    def __init__(self, input_dim, bottleneck_dim=128):
        """Build the encoder and decoder stacks.

        Args:
            input_dim: Size of the input (and reconstructed) feature vector.
            bottleneck_dim: Size of the compressed representation.
        """
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Linear(512, bottleneck_dim),
        )
        self.decoder = nn.Sequential(
            nn.Linear(bottleneck_dim, 512),
            nn.ReLU(),
            nn.Linear(512, input_dim),
        )

    def forward(self, x):
        """Encode ``x`` and reconstruct it.

        Args:
            x: Tensor whose last dimension is ``input_dim``.

        Returns:
            tuple: ``(encoded, decoded)`` — the bottleneck representation and
            the reconstruction of ``x``.
        """
        enc = self.encoder(x)
        dec = self.decoder(enc)
        return enc, dec

def train_autoencoder(features, input_dim, bottleneck_dim=128, epochs=8, batch_size=512):
    """Fit an ``AutoEncoder`` to reconstruct ``features``.

    Uses Adam (lr=1e-3) with an MSE reconstruction loss on shuffled
    mini-batches and prints the mean batch loss after every epoch.

    Args:
        features: Array-like of feature rows; converted to a float32 tensor.
        input_dim: Width of each feature row.
        bottleneck_dim: Width of the compressed representation.
        epochs: Number of passes over the data.
        batch_size: Mini-batch size.

    Returns:
        The trained ``AutoEncoder`` (left in training mode, on ``Config.DEVICE``).
    """
    device = Config.DEVICE
    net = AutoEncoder(input_dim, bottleneck_dim).to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)
    criterion = nn.MSELoss()

    inputs = torch.tensor(features, dtype=torch.float32)
    batches = DataLoader(TensorDataset(inputs), batch_size=batch_size, shuffle=True)

    print("\nTraining Autoencoder...")
    net.train()
    for epoch_no in range(1, epochs + 1):
        running_loss = 0
        for (x_cpu,) in batches:
            x = x_cpu.to(device)
            # The model returns (encoded, decoded); only the reconstruction is trained on.
            reconstruction = net(x)[1]
            loss = criterion(reconstruction, x)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f"Epoch {epoch_no}/{epochs} - Loss: {running_loss/len(batches):.6f}")
    return net

def apply_autoencoder(model, features, batch_size=512):
    """Compress ``features`` with the encoder half of a trained autoencoder.

    The model is switched to eval mode and run without gradient tracking,
    batch by batch and in the original row order.

    Args:
        model: Module whose call returns ``(encoded, decoded)``.
        features: Array-like of feature rows; converted to a float32 tensor.
        batch_size: Number of rows processed per forward pass.

    Returns:
        numpy.ndarray: The encoded rows stacked vertically.
    """
    device = Config.DEVICE
    model.eval()

    inputs = torch.tensor(features, dtype=torch.float32)
    loader = DataLoader(TensorDataset(inputs), batch_size=batch_size, shuffle=False)

    with torch.no_grad():
        encoded_chunks = [
            model(chunk.to(device))[0].cpu().numpy()
            for (chunk,) in loader
        ]
    return np.vstack(encoded_chunks)

# The autoencoder's input width is whatever the text model produced.
embed_dim = train_text_features.shape[1]
# The autoencoder is fit on the training embeddings only and then applied to both splits.
autoencoder_model = train_autoencoder(train_text_features, input_dim=embed_dim, bottleneck_dim=128)
print("Applying autoencoder to train and test data...")
train_text_features_small = apply_autoencoder(autoencoder_model, train_text_features)
test_text_features_small = apply_autoencoder(autoencoder_model, test_text_features)
print(f"Compressed text shape: {train_text_features_small.shape}")
# NOTE(review): the trained autoencoder is deleted here and never saved, so the
# artifacts written at the end of the script are not enough on their own to
# featurise new data — confirm whether that is intended.
del train_text_features, test_text_features, autoencoder_model; gc.collect(); torch.cuda.empty_cache()

# Image Feature Extraction
class ImageDataset(Dataset):
    """Dataset that loads RGB images from a sequence of file paths.

    A path that is empty, does not exist, or cannot be opened as an image
    yields a black ``Config.IMG_SIZE`` x ``Config.IMG_SIZE`` placeholder
    instead of raising, so one bad file cannot abort feature extraction.
    """

    # The special methods need their double underscores: ``DataLoader`` relies
    # on ``__len__`` / ``__getitem__``, and ``ImageDataset(paths, transform)``
    # only works through ``__init__``.
    def __init__(self, image_paths, transform=None):
        """Store the paths and the optional per-image transform.

        Args:
            image_paths: Indexable sequence of image file paths.
            transform: Optional callable applied to each loaded PIL image.
        """
        self.image_paths = image_paths
        self.transform = transform

    def __len__(self):
        """Return the number of image paths."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load image ``idx`` and return it, transformed if a transform was given."""
        image = self._load(self.image_paths[idx])
        return self.transform(image) if self.transform else image

    @staticmethod
    def _blank():
        """Return the black placeholder image used for missing or unreadable files."""
        return Image.new("RGB", (Config.IMG_SIZE, Config.IMG_SIZE), "black")

    def _load(self, path):
        """Open ``path`` as an RGB image, falling back to the placeholder."""
        if not path or not os.path.exists(path):
            return self._blank()
        try:
            # ``convert`` returns a new, fully loaded image, so the file handle
            # can be closed as soon as the ``with`` block ends.
            with Image.open(path) as img:
                return img.convert("RGB")
        except Exception:
            # Deliberate best-effort: any corrupt or unsupported file is
            # replaced by the placeholder rather than stopping the run.
            return self._blank()

class ImageFeatureExtractor:
    """Extracts image embeddings with an ImageNet-pretrained EfficientNet-B5 backbone."""

    # Must be ``__init__``; a method called ``init`` is never run by
    # ``ImageFeatureExtractor()``, leaving ``self.model`` / ``self.transform`` unset.
    def __init__(self):
        """Load the backbone, remove its classifier head and build the preprocessing pipeline."""
        print("\nLoading EfficientNet-B5 backbone...")
        self.model = models.efficientnet_b5(weights='IMAGENET1K_V1')
        # Replace the classification head so the forward pass returns the
        # features that would have been fed to it.
        self.model.classifier = nn.Identity()
        self.model.to(Config.DEVICE).eval()
        self.transform = transforms.Compose([
            transforms.Resize((Config.IMG_SIZE, Config.IMG_SIZE)),
            transforms.ToTensor(),
            # Standard ImageNet channel statistics.
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

    def extract_features(self, paths, batch_size=32):
        """Compute one feature vector per image path.

        Args:
            paths: Indexable sequence of image file paths (handed to ``ImageDataset``).
            batch_size: Number of images per forward pass.

        Returns:
            numpy.ndarray: Backbone outputs stacked vertically, in the order of ``paths``.
        """
        ds = ImageDataset(paths, self.transform)
        dl = DataLoader(
            ds,
            batch_size=batch_size,
            shuffle=False,
            num_workers=2,
            # Pinned host memory only helps host-to-GPU copies; skip it on CPU runs.
            pin_memory=(Config.DEVICE == "cuda"),
        )
        feats = []
        with torch.no_grad():
            for batch in tqdm(dl, desc="Processing images"):
                batch = batch.to(Config.DEVICE)
                feats.append(self.model(batch).cpu().numpy())
        return np.vstack(feats)

# Extract image features for both splits with the same backbone instance.
image_extractor = ImageFeatureExtractor()
train_image_features = image_extractor.extract_features(train_df["image_path"].values)
test_image_features = image_extractor.extract_features(test_df["image_path"].values)
print(f"Image features shape: {train_image_features.shape}")
# The backbone is not used again: drop it and release cached CUDA memory.
del image_extractor; gc.collect(); torch.cuda.empty_cache()

# Combine & Scale
print("\nCombining and scaling features...")
# Column-wise concatenation: compressed text features first, then image features.
train_features = np.concatenate([train_text_features_small, train_image_features], axis=1)
test_features = np.concatenate([test_text_features_small, test_image_features], axis=1)
print(f"Combined feature shape: {train_features.shape}")

# The scaler is fit on the training rows and then reused unchanged for the test rows.
scaler = StandardScaler()
train_features_scaled = scaler.fit_transform(train_features)
test_features_scaled = scaler.transform(test_features)
# Only the scaled matrices are needed from here on; free the intermediates.
del train_features, test_features, train_text_features_small, test_text_features_small, train_image_features, test_image_features; gc.collect()

# Train/Validation Split
# 10% of the training rows are held out for early stopping and hyperparameter search.
# NOTE(review): the scaler above was fit on all training rows, including the ones
# that end up in the validation split — confirm this is acceptable.
X_train, X_val, y_train, y_val = train_test_split(
    train_features_scaled, train_df["price"].values, test_size=0.1, random_state=42
)


# ### MODIFICATION ###: Model Training with Optuna Hyperparameter Tuning
def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    For each pair the error is ``|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)``;
    pairs where both values are zero contribute an error of 0.

    Args:
        y_true: Ground-truth values (scalar, list or array).
        y_pred: Predicted values, broadcastable against ``y_true``.

    Returns:
        float: Mean of the per-pair errors multiplied by 100.
    """
    # Accept lists and scalars as well as arrays.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    denom = (np.abs(y_true) + np.abs(y_pred)) / 2
    # Start from zeros and divide only where the denominator is non-zero, so
    # 0/0 pairs stay at 0 without triggering divide-by-zero warnings.
    diff = np.zeros_like(denom)
    np.divide(np.abs(y_true - y_pred), denom, out=diff, where=denom != 0)
    return np.mean(diff) * 100

def objective(trial):
    """Optuna objective: train one LightGBM model and return its validation MAE.

    The model is fit on ``X_train`` / ``y_train`` with early stopping
    (patience 100) monitored on ``X_val`` / ``y_val``.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        float: Mean absolute error of the fitted model on the validation split.
    """
    # Sampled hyperparameters (kept in this order so the suggest calls run
    # in the same sequence every trial).
    # NOTE(review): LightGBM documents that row subsampling is only active when
    # subsample_freq (bagging_freq) is non-zero, which is not set here, so
    # 'subsample' may have no effect — confirm against the LightGBM docs.
    tuned = {
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 32, 512),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-2, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-2, 10.0, log=True),
    }
    # Settings shared by every trial.
    fixed = {
        'objective': 'regression_l1',  # MAE
        'metric': 'mae',
        'n_estimators': 3000,
        'random_state': 42,
        'n_jobs': -1,
        'verbose': -1,
    }

    regressor = LGBMRegressor(**fixed, **tuned)
    stop_early = early_stopping(100, verbose=False)
    regressor.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        eval_metric="mae",
        callbacks=[stop_early],
    )

    return mean_absolute_error(y_val, regressor.predict(X_val))

# --- Hyperparameter search ---
# Each trial trains on X_train and is scored by MAE on the held-out validation split.
print("\nRunning Optuna to find best hyperparameters...")
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50, show_progress_bar=True)

print(f"Best trial found: MAE = {study.best_value:.4f}")
print("Best hyperparameters:")
for key, value in study.best_params.items():
    print(f"  {key}: {value}")

print("\nRetraining on full training data with best hyperparameters...")

# study.best_params only contains the values suggested inside objective(), so
# the fixed settings used during tuning are added back here.
best_params = dict(study.best_params)
best_params['random_state'] = 42
best_params['n_jobs'] = -1
best_params['objective'] = 'regression_l1'
best_params['metric'] = 'mae'
best_params['verbose'] = -1  # keep LightGBM as quiet as it was during tuning

# First, train a temporary model on the train/val split to find the optimal number of estimators
temp_model = LGBMRegressor(**best_params, n_estimators=4000)
temp_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric="mae",
    callbacks=[early_stopping(100, verbose=False)]
)
# Use the best iteration from this temporary model for the final model;
# fall back to the full budget if early stopping recorded none.
best_iteration = temp_model.best_iteration_ or 4000
print(f"Optimal number of estimators found: {best_iteration}")

# Now, create and train the final model on the ENTIRE training dataset.
# There is no validation set at this point, so the tree count is fixed instead
# of relying on early stopping.
final_model = LGBMRegressor(**best_params, n_estimators=best_iteration)
final_model.fit(train_features_scaled, train_df["price"].values)
print("✅ Final model training complete.")

# Make predictions on the test set; negative predictions are clipped to 0.
test_preds = np.clip(final_model.predict(test_features_scaled), 0, None)

# ### MODIFICATION ###: Save Model, Scaler, and Submission

# --- Save the scaler ---
scaler_path = os.path.join(Config.WORK_DIR, "standard_scaler.bin")
joblib.dump(scaler, scaler_path)
print(f"✅ Scaler saved to: {scaler_path}")

# --- Save the LightGBM model ---
# The scikit-learn wrapper has no save_model(); the native text format is
# written through the underlying Booster exposed as ``booster_``.
model_path = os.path.join(Config.WORK_DIR, "lgbm_final_model.txt")
final_model.booster_.save_model(model_path)
print(f"✅ Final model saved to: {model_path}")

# --- Create and save the submission file ---
submission = pd.DataFrame({"sample_id": test_df["sample_id"], "price": test_preds})
out_path = os.path.join(Config.WORK_DIR, "submission1.csv")
submission.to_csv(out_path, index=False)
print(f"\n✅ Submission file saved: {out_path}")
print(submission.head())