# Final Ensemble Model

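This notebook combines TF-IDF text features, numerical features, and (when available) image features, trains LightGBM and XGBoost regressors on them, and averages their predictions to create the final ensemble model.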


In [None]:
import os
import re
import warnings

import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')


In [None]:
# SMAPE metric function
def smape(y_true, y_pred):
    """Symmetric Mean Absolute Percentage Error, expressed in percent.

    Computes ``mean(|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)) * 100``.

    Args:
        y_true: Array-like of ground-truth values.
        y_pred: Array-like of predicted values, same shape as ``y_true``.

    Returns:
        The SMAPE as a float in the range [0, 200] for finite inputs.
        Pairs where both the target and the prediction are exactly zero
        count as a perfect prediction (0% error) instead of producing a
        0/0 NaN that would turn the whole mean into NaN.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    abs_error = np.abs(y_true - y_pred)
    scale = (np.abs(y_true) + np.abs(y_pred)) / 2

    # scale == 0 only when both values are 0, in which case abs_error is 0 too;
    # dividing by 1 there yields the intended 0 contribution without a 0/0.
    safe_scale = np.where(scale == 0, 1.0, scale)
    return np.mean(abs_error / safe_scale) * 100

# Read the preprocessed training table
train_df = pd.read_csv('../dataset/train_preprocessed.csv')
print(f"Training data shape: {train_df.shape}")

# Image features are optional: start from "absent" and flip the flag only
# when the CSV (indexed by sample_id) is actually found and read.
has_image_features = False
try:
    image_features_df = pd.read_csv('../dataset/image_features.csv', index_col='sample_id')
except FileNotFoundError:
    print("Image features not found, using text features only")
else:
    has_image_features = True
    print(f"Image features shape: {image_features_df.shape}")


In [None]:
# Prepare text features
print("Preparing text features...")

# pandas reads empty CSV fields back as NaN, and TfidfVectorizer rejects NaN
# documents, so treat a missing cleaned_text as an empty document.
text_corpus = train_df['cleaned_text'].fillna('')

# TF-IDF over unigrams and bigrams with English stop words removed, capped at
# 5000 terms; terms in fewer than 5 documents or more than 95% of documents
# are dropped.
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    stop_words='english',
    min_df=5,
    max_df=0.95
)

tfidf_features = tfidf.fit_transform(text_corpus)
print(f"TF-IDF features shape: {tfidf_features.shape}")

# Numerical features
numerical_features = ['ipq', 'text_length', 'word_count']
X_numerical = train_df[numerical_features].values

# Combine text features: TF-IDF columns first, numerical columns appended last
X_text = hstack([tfidf_features, X_numerical])
print(f"Combined text features shape: {X_text.shape}")

# Prepare image features if available
if has_image_features:
    # Positions of the training rows that have a matching image-feature row.
    # The same positions are used to filter the text matrix, the image matrix
    # and the target so that all three stay row-aligned.
    has_image_mask = train_df['sample_id'].isin(image_features_df.index).to_numpy()
    rows_with_images = np.flatnonzero(has_image_mask)

    train_with_images = train_df.iloc[rows_with_images]
    image_features_aligned = image_features_df.loc[train_with_images['sample_id']].values

    print(f"Training samples with images: {len(train_with_images)}")

    # .loc returns every row for a repeated index label, so a longer result
    # means image_features.csv contains duplicate sample_id entries.
    if image_features_aligned.shape[0] != len(train_with_images):
        raise ValueError(
            "image_features.csv has duplicate sample_id rows; "
            "expected exactly one image-feature row per training sample"
        )

    # hstack() produced a COO matrix, which cannot be row-indexed; convert to
    # CSR and keep only the rows that have image features. Without this the
    # text block keeps all training rows and the hstack below fails with a
    # row-count mismatch whenever some samples have no image features.
    X_text_with_images = X_text.tocsr()[rows_with_images]

    # Combine text and image features
    X_combined = hstack([X_text_with_images, image_features_aligned])
    y = train_with_images['price'].values

    print(f"Final combined features shape: {X_combined.shape}")
else:
    X_combined = X_text
    y = train_df['price'].values
    print(f"Using text features only, shape: {X_combined.shape}")


In [None]:
# Hold out 20% of the rows for validation, with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    X_combined,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"Train set: {X_train.shape}, Validation set: {X_val.shape}")

# Fit the final LightGBM regressor with the chosen hyperparameters
print("Training final LightGBM model...")
lgb_params = {
    'n_estimators': 2000,
    'learning_rate': 0.05,
    'max_depth': 10,
    'num_leaves': 63,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'random_state': 42,
    'verbose': -1,
}
final_lgb = lgb.LGBMRegressor(**lgb_params)
final_lgb.fit(X_train, y_train)

# Score the fitted model on the held-out validation rows
lgb_pred = final_lgb.predict(X_val)
lgb_mae = mean_absolute_error(y_val, lgb_pred)
lgb_smape = smape(y_val, lgb_pred)

print(f"Final LightGBM - MAE: {lgb_mae:.2f}, SMAPE: {lgb_smape:.2f}%")


In [None]:
# Fit the final XGBoost regressor on the same train split
print("Training final XGBoost model...")
xgb_params = {
    'n_estimators': 2000,
    'learning_rate': 0.05,
    'max_depth': 10,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'random_state': 42,
    'verbosity': 0,
}
final_xgb = xgb.XGBRegressor(**xgb_params)
final_xgb.fit(X_train, y_train)

# Score the fitted model on the held-out validation rows
xgb_pred = final_xgb.predict(X_val)
xgb_mae = mean_absolute_error(y_val, xgb_pred)
xgb_smape = smape(y_val, xgb_pred)

print(f"Final XGBoost - MAE: {xgb_mae:.2f}, SMAPE: {xgb_smape:.2f}%")


In [None]:
# Blend the two models with an unweighted average of their validation predictions
print("Creating final ensemble...")
prediction_sum = lgb_pred + xgb_pred
ensemble_pred = prediction_sum / 2

ensemble_mae = mean_absolute_error(y_val, ensemble_pred)
ensemble_smape = smape(y_val, ensemble_pred)

print(f"Final Ensemble - MAE: {ensemble_mae:.2f}, SMAPE: {ensemble_smape:.2f}%")

# Side-by-side validation metrics for each model and the blend
print("\nFinal Model Performance Summary:")
summary_rows = (
    ("LightGBM", lgb_mae, lgb_smape),
    ("XGBoost", xgb_mae, xgb_smape),
    ("Ensemble", ensemble_mae, ensemble_smape),
)
for model_label, model_mae, model_smape in summary_rows:
    print(f"{model_label}: MAE={model_mae:.2f}, SMAPE={model_smape:.2f}%")


In [None]:
# Save final models and preprocessing objects
# (os is imported with the other modules in the notebook's import cell)
os.makedirs('../models', exist_ok=True)

# Save models and the fitted TF-IDF vectorizer
joblib.dump(final_lgb, '../models/final_lgb.pkl')
joblib.dump(final_xgb, '../models/final_xgb.pkl')
joblib.dump(tfidf, '../models/final_tfidf.pkl')

# Save feature information describing how the feature matrix was built
feature_info = {
    'numerical_features': numerical_features,
    'has_image_features': has_image_features,
    # Read from the vectorizer itself so this metadata cannot drift from the
    # value the vectorizer was constructed with.
    'tfidf_max_features': tfidf.max_features
}

joblib.dump(feature_info, '../models/feature_info.pkl')

print("Final models and preprocessing objects saved!")
