In [19]:
# Importing necessary libraries for modeling
import numpy as np
import pandas as pd
from sklearn.base import clone # For cloning estimators in cross-validation
from sklearn.compose import ColumnTransformer # To apply different transformers to different columns
from sklearn.pipeline import Pipeline # To chain multiple processing steps and a final estimator
from sklearn.preprocessing import OrdinalEncoder, KBinsDiscretizer, OneHotEncoder # Various encoding/discretization methods
from sklearn.model_selection import StratifiedKFold # Cross-validation strategy
from xgboost import XGBRegressor # Gradient Boosting Machine from XGBoost
from lightgbm import LGBMRegressor # Gradient Boosting Machine from LightGBM
from catboost import CatBoostRegressor # Gradient Boosting Machine from CatBoost
import category_encoders as ce # Advanced categorical encoders (install with: pip install category-encoders)

# Run-wide settings: a fixed random seed and the number of cross-validation folds.
SEED, N_SPLITS = 92, 10

# Column the models are trained against (the log-transformed price).
TARGET = 'log_price'

# Columns stripped from the feature matrix (X) before training, listed one per
# line so that individual entries are easy to toggle and diff.
DROP_COLS = [
    # Identifiers and target-related columns
    'id',
    'plate',
    'price',
    'log_price',
    'is_train',
    # Number-pattern flags excluded from the model
    "is_number_000",
    "is_number_444",
    "is_number_222",
    "is_number_700",
    "is_number_555",
    # Time-based features
    "quarter",
    "day_of_week",
    "is_weekend",
    # Combined score, dropped in favour of using its components directly
    "prestige_score",
    # Further number-pattern flags considered redundant
    "is_number_300",
    "is_number_333",
    "is_number_400",
]

In [20]:
# Load the processed train and test frames from their pickle files.
# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
train_df, test_df = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('data/processed_train.pkl', 'data/processed_test.pkl')
)

In [21]:
# Build the modelling inputs: the target vector (y) from the training frame,
# then the feature matrices for train (X) and test (X_test).
# errors='ignore' makes drop() skip any DROP_COLS entry that a frame does not
# contain instead of raising.
y = train_df[TARGET].copy()
X, X_test = (
    frame.drop(columns=DROP_COLS, errors='ignore')
    for frame in (train_df, test_df)
)

print("Data split into training and testing sets.")
print(f"Training features (X) shape: {X.shape}")
print(f"Training target (y) shape: {y.shape}")
print(f"Test features (X_test) shape: {X_test.shape}")

Data split into training and testing sets.
Training features (X) shape: (51640, 50)
Training target (y) shape: (51640,)
Test features (X_test) shape: (7695, 50)


#### Automatic Column Detection

In [22]:
# This section is crucial for handling different data types dynamically.
# It automatically identifies numerical, boolean, and categorical columns,
# and further segments categorical columns by their cardinality to apply
# appropriate encoding strategies.

def detect_columns(X):
    """
    Split the columns of ``X`` into groups by dtype and cardinality.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame to inspect. It is only read, never modified.

    Returns
    -------
    tuple of five lists of column names, each in the frame's column order:
        num_cols  - signed-integer, unsigned-integer and float columns
        bool_cols - columns whose dtype is ``bool``
        cat_low   - remaining columns with at most 20 distinct values
        cat_mid   - remaining columns with 21 to 200 distinct values
        cat_high  - remaining columns with more than 200 distinct values

    A one-line summary of the group sizes is printed as a side effect.
    """
    # NumPy dtype kind codes: 'i' signed int, 'u' unsigned int, 'f' float.
    # 'u' is included so that uint8/uint16/... columns are treated as numbers
    # instead of falling through to the categorical encoders.
    numeric_kinds = frozenset('iuf')

    bool_cols, num_cols, cat_cols = [], [], []
    for col in X.columns:
        dtype = X[col].dtype
        if dtype == 'bool':
            bool_cols.append(col)
        elif dtype.kind in numeric_kinds:
            num_cols.append(col)
        else:
            # Everything that is neither boolean nor numeric is treated as categorical.
            cat_cols.append(col)

    # Count distinct values once per categorical column; the three cardinality
    # buckets below all reuse this lookup.
    n_unique = {col: X[col].nunique() for col in cat_cols}
    cat_low = [c for c in cat_cols if n_unique[c] <= 20]         # Low cardinality for One-Hot Encoding
    cat_mid = [c for c in cat_cols if 20 < n_unique[c] <= 200]   # Medium cardinality for Ordinal Encoding
    cat_high = [c for c in cat_cols if n_unique[c] > 200]        # High cardinality for Target Encoding

    print('\nColumn Summary ➜ Numerical:', len(num_cols),
          '| Boolean:', len(bool_cols),
          '| Low Cardinality Categorical:', len(cat_low),
          '| Medium Cardinality Categorical:', len(cat_mid),
          '| High Cardinality Categorical:', len(cat_high))

    return num_cols, bool_cols, cat_low, cat_mid, cat_high

# Run the detection on the training features only. The five resulting lists of
# column names become module-level variables; they are what the ColumnTransformer
# in the preprocessing cell uses to route each column group to its encoder.
num_cols, bool_cols, cat_low, cat_mid, cat_high = detect_columns(X)


Column Summary ➜ Numerical: 24 | Boolean: 16 | Low Cardinality Categorical: 3 | Medium Cardinality Categorical: 3 | High Cardinality Categorical: 4


#### Preprocessing Pipeline

In [23]:
# Column-wise preprocessing. Each detected column group is routed to its own
# transformer and the results are concatenated into a single feature matrix
# that is handed to the model.
preprocess = ColumnTransformer(
    [
        # Numerical and boolean columns are forwarded unchanged.
        ('num', 'passthrough', num_cols),
        ('bool', 'passthrough', bool_cols),
        # Low cardinality -> one-hot encoding. Categories not seen during fit
        # are ignored rather than raising, and the output is a dense array.
        (
            'low',
            OneHotEncoder(handle_unknown='ignore', sparse_output=False),
            cat_low,
        ),
        # Medium cardinality -> one integer code per category; categories not
        # seen during fit are encoded as -1.
        (
            'mid',
            OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
            cat_mid,
        ),
        # High cardinality -> target encoding (category replaced by a smoothed
        # mean of the target).
        (
            'high',
            ce.TargetEncoder(cols=cat_high, smoothing=0.2),
            cat_high,
        ),
    ],
    remainder='drop',  # Columns not listed above are discarded
    n_jobs=-1,         # Run the transformers in parallel on all available cores
)
print("\nPreprocessing pipeline (ColumnTransformer) defined.")


Preprocessing pipeline (ColumnTransformer) defined.


#### MODELS AND PARAMETERS (Optimized and Modular)

In [27]:
# XGBoost hyperparameters.
# NOTE(review): the Tweedie objective is being applied to TARGET, i.e. the
# log-transformed price, not the raw price - confirm this is intended.
xgb_params = dict(
    n_estimators=1433,
    max_depth=12,
    learning_rate=0.01852160907217988,
    subsample=0.6786672470738663,
    colsample_bytree=0.46208650739218005,
    reg_alpha=0.017519138973638618,
    reg_lambda=0.2839310763317462,
    gamma=0.0033995958574628547,
    tweedie_variance_power=1.0869464555654937,
    objective='reg:tweedie',
    n_jobs=-1,
    random_state=SEED,
)

# LightGBM hyperparameters (only used if the LGBM entry below is enabled).
lgb_params = dict(
    n_estimators=999,
    max_depth=11,
    learning_rate=0.07607568555547708,
    subsample=0.6363036032688429,
    colsample_bytree=0.5072021102992719,
    min_child_samples=97,
    reg_alpha=0.16671454380081874,
    reg_lambda=0.6455320711051608,
    n_jobs=-1,
    random_state=SEED,
)

# CatBoost hyperparameters (only used if the CatBoost entry below is enabled).
cat_params = dict(
    iterations=991,
    depth=10,
    learning_rate=0.06462213707942074,
    l2_leaf_reg=1.9289204888270515,
    subsample=0.7213225292844163,
    bagging_temperature=0.4361642090192932,
    random_strength=6.443179917768372,
    min_data_in_leaf=72,
    loss_function='RMSE',  # Root mean squared error
    verbose=0,             # Keep the training log quiet
    random_state=SEED,
)

# Estimators taking part in the run, keyed by a short display name.
# Enable the commented entries to add LightGBM and CatBoost to the ensemble.
models = dict(
    XGB=XGBRegressor(**xgb_params),
    # LGBM=LGBMRegressor(**lgb_params),
    # CatBoost=CatBoostRegressor(**cat_params),
)
print("\nModels and their optimized hyperparameters defined.")

# One end-to-end pipeline per estimator: the shared preprocessing step
# followed by the model itself.
pipelines = {
    name: Pipeline([('prep', preprocess), ('model', estimator)])
    for name, estimator in models.items()
}
print("Pipelines constructed: Preprocessing -> Model.")


Models and their optimized hyperparameters defined.
Pipelines constructed: Preprocessing -> Model.


#### SMAPE METRIC Definition

In [28]:
# The Symmetric Mean Absolute Percentage Error (SMAPE) is often used in forecasting
# and is robust to zero values in the actuals. It's defined once to ensure consistency.

def smape(y_true, y_pred):
    """
    Calculates the Symmetric Mean Absolute Percentage Error (SMAPE), in percent.

    Formula: (1/n) * Sum(|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)) * 100

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values. Both are converted to float NumPy arrays,
        so lists, tuples, arrays and pandas Series are accepted and are always
        compared element by element *by position* (a pandas index is ignored).

    Returns
    -------
    float
        Mean of the per-element SMAPE terms multiplied by 100. Elements where
        both the actual and the predicted value are zero contribute 0.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    diff = np.abs(actual - predicted)
    denominator = (np.abs(actual) + np.abs(predicted)) / 2.0

    # Divide only where the denominator is non-zero. Positions that are skipped
    # keep the 0.0 they were initialised with in `out`, which avoids NaN/Inf
    # when both values are zero.
    smape_term = np.divide(
        diff,
        denominator,
        out=np.zeros_like(diff),
        where=denominator != 0,
    )

    return np.mean(smape_term) * 100

print("\nSMAPE evaluation metric defined.")


SMAPE evaluation metric defined.


In [29]:
# Cross-validated training of every pipeline in `pipelines`.
#
# Stratified K-Fold cross-validation is used so that each fold has a
# representative distribution of the target variable. StratifiedKFold needs
# discrete labels, so the continuous target is first cut into quantile bins
# that act as the "strata"; the bins are used for splitting only and the
# models are still fitted on the continuous `y`.

# Bin the target variable ('log_price') into 10 quantile bins.
y_bins = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile') \
    .fit_transform(y.values.reshape(-1, 1)).astype(int).ravel()
print(f"\nTarget variable ('{TARGET}') binned into {y_bins.max() + 1} strata for stratification.")

# Shuffled, seeded splitter: every model sees the same fold assignment.
kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

# Per-model result containers, all keyed by model name:
#   oof_preds  - out-of-fold predictions, one per training row (log-price scale)
#   test_preds - test-set predictions averaged over the folds (log-price scale)
# Both start as zeros and are filled in fold by fold.
oof_preds = {name: np.zeros(len(y)) for name in models}
test_preds = {name: np.zeros(len(X_test)) for name in models}
feature_importances = {} # Feature importances per model (None when the estimator exposes none)

print('\n===== CROSS-VALIDATION TRAINING =====')
# Iterate through each defined model and perform cross-validation
for model_name, pipeline in pipelines.items():
    print(f"\nInitiating training for model: {model_name}...")
    try:
        # Loop through each fold generated by StratifiedKFold (folds numbered from 1)
        for fold, (train_idx, val_idx) in enumerate(kf.split(X, y_bins), 1):
            print(f"  Fold {fold:02d}/{N_SPLITS}")
            X_tr, y_tr = X.iloc[train_idx], y.iloc[train_idx] # Training data for the current fold
            X_val, y_val = X.iloc[val_idx], y.iloc[val_idx] # Validation data for the current fold

            # Refit the whole pipeline (preprocessing + model) on this fold's
            # training rows only, so the validation rows play no part in fitting.
            # The same pipeline object is reused and refitted on every fold.
            pipeline.fit(X_tr, y_tr)

            # Store out-of-fold (OOF) predictions for the validation rows
            oof_preds[model_name][val_idx] = pipeline.predict(X_val)

            # Each fold contributes 1/N_SPLITS of its test prediction, so after
            # the last fold test_preds holds the mean over all fold models.
            test_preds[model_name] += pipeline.predict(X_test) / N_SPLITS

        # Overall cross-validation SMAPE for the current model, computed on the
        # back-transformed scale.
        # NOTE(review): np.exp assumes log_price = log(price); if the feature
        # engineering used log1p, the matching inverse is np.expm1 - TODO confirm.
        cv_smape = smape(np.exp(y), np.exp(oof_preds[model_name]))
        print(f'⮕  Overall CV SMAPE for {model_name}: {cv_smape:.2f}%')

        # Extract feature importances (if the model supports it). At this point
        # `pipeline` holds the fit from the LAST fold only, so the importances
        # describe that single fold's model, not an average over folds.
        if hasattr(pipeline['model'], 'feature_importances_'):
            # Estimators exposing the scikit-learn style attribute
            feature_importances[model_name] = pipeline['model'].feature_importances_
        elif hasattr(pipeline['model'], 'get_booster'):
            # Fallback through the booster object.
            # NOTE(review): XGBRegressor presumably also exposes
            # feature_importances_, in which case this branch is not reached
            # for it - confirm.
            feature_importances[model_name] = pipeline['model'].get_booster().get_score(importance_type='weight')
            # 'importance_type' can be 'weight' (number of times a feature is used in splits), 'gain' (average gain across splits), 'cover', etc.
        else:
            feature_importances[model_name] = None # No direct importance available

    except Exception as e:
        # NOTE(review): the failure is only printed. oof_preds/test_preds for this
        # model keep whatever was written before the error (zeros or a partial
        # fill), so any later code that reads them will use those values silently.
        print(f"Error during training of model {model_name}: {str(e)}")
        continue  # Skip to the next model in case of an error


Target variable ('log_price') binned into 10 strata for stratification.

===== CROSS-VALIDATION TRAINING =====

Initiating training for model: XGB...
  Fold 01/10
  Fold 02/10
  Fold 03/10
  Fold 04/10
  Fold 05/10
  Fold 06/10
  Fold 07/10
  Fold 08/10
  Fold 09/10
  Fold 10/10
⮕  Overall CV SMAPE for XGB: 35.60%


#### ENSEMBLE PREDICTIONS (Inverse Error Weighting)

In [30]:
# Inverse-error blending: every model is weighted in proportion to
# 1 / (its out-of-fold SMAPE), so more accurate models count for more in the
# final prediction.

# Out-of-fold SMAPE per model, measured on the back-transformed (exp) scale.
errors = dict(
    (name, smape(np.exp(y), np.exp(oof_preds[name])))
    for name in models
)

# Reciprocal of each error: the smaller the SMAPE, the larger this value.
inv_errors = {name: 1 / err for name, err in errors.items()}

# Rescale the reciprocals so that the weights add up to 1.
norm_weights = {
    name: inv / sum(inv_errors.values())
    for name, inv in inv_errors.items()
}

# Weighted blend of the test predictions, done on the log-price scale and
# only then mapped back to the price scale.
ensemble_preds_log = sum(
    test_preds[name] * norm_weights[name] for name in test_preds
)
ensemble_preds = np.exp(ensemble_preds_log)

# The same blend applied to the out-of-fold predictions gives a
# cross-validated error estimate for the ensemble itself.
ensemble_oof_log = sum(
    oof_preds[name] * norm_weights[name] for name in oof_preds
)
ensemble_smape = smape(np.exp(y), np.exp(ensemble_oof_log))
print(f"\n⮕  Final Ensemble Model CV SMAPE: {ensemble_smape:.2f}%")

# Report each model's weight next to its individual out-of-fold SMAPE.
print("\nModel Weights in Ensemble:")
for model_name, weight in norm_weights.items():
    print(f"- {model_name}: {weight:.4f} (Individual OOF SMAPE: {errors[model_name]:.2f}%)")


⮕  Final Ensemble Model CV SMAPE: 35.60%

Model Weights in Ensemble:
- XGB: 1.0000 (Individual OOF SMAPE: 35.60%)


In [31]:
# Build and write the submission file.

# Floor the blended predictions at 0 (no upper bound) so that no negative
# price can end up in the submission.
ensemble_preds = np.clip(ensemble_preds, 0, None)

# Two-column frame: the 'id' column of the original test frame plus the
# predicted 'price'.
submission = test_df[['id']].assign(price=ensemble_preds)

# Write the CSV without the index column.
submission.to_csv('submission.csv', index=False)
print('\n✅  Submission file "submission.csv" saved successfully.')


✅  Submission file "submission.csv" saved successfully.
