# Model Training

## 1. Preparing Environment

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import yaml
from sklearn.model_selection import train_test_split

# Silence every warning for the rest of the session so cell output stays short.
# NOTE(review): the 'ignore' filter is global — it hides warnings from all
# libraries used below, not just cosmetic ones; consider narrowing it by category.
import warnings 
warnings.filterwarnings('ignore')

## 2. Splitting the Dataset

In [2]:
import sys

# Put the parent directory on the import path so the `src` package imports
# below resolve. The membership check keeps this cell idempotent: re-running
# it does not append duplicate '..' entries to sys.path.
if '..' not in sys.path:
    sys.path.append('..')

from src.utils.utils import load_config
from src.data.load_data import load_data

config = load_config('../configs/config.yaml')

# Load data
train_df, test_df = load_data(
    train_path="../data/processed/train_encoded.csv",
    test_path="../data/processed/test_encoded.csv",
)

# Split features and target. The target column name is read from the config
# once so the feature frame and the target series cannot drift apart.
target_column = config["model"]["target_column"]
X = train_df.drop(columns=[target_column])
y = train_df[target_column]

# Split into train/validation; the hold-out fraction and the seed both come
# from the config so the split is reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=config["test_size"],
    random_state=config["random_seed"],
)

Loading training data from ../data/processed/train_encoded.csv
Loading test data from ../data/processed/test_encoded.csv
Train shape: (61609, 97)
Test shape: (41074, 96)


## 3. Model Training and Baseline

### 3.1 Baseline model - Random Forest Regressor

In [3]:
from src.data.model_training import train_and_evaluate

# Train the baseline on the training split and score it on the validation
# split. The keyword options are collected in one place and unpacked into the
# call; the two results are bound to `model` and `val_rmse`.
baseline_options = {
    "config_path": "../configs/config.yaml",
    "params_path": "../configs/model_params.yaml",
    "suffix": 3,
}
model, val_rmse = train_and_evaluate(
    X_train, y_train, X_val, y_val, **baseline_options
)




Model saved to ../outputs/models/random_forest_3.joblib
Validation RMSE: 4.3169


In [4]:
from src.utils.io import load_model
from src.data.model_training import predict_and_save

# Get predictions for the test data: the model is reloaded from disk rather
# than reusing an in-memory object, so this cell does not depend on the
# training cell having run in the current session.
X_test = test_df
model = load_model("../outputs/models/random_forest_3.joblib")
predict_and_save(
    model,
    X_test,
    config_path="../configs/config.yaml",
    suffix="_6",
)

Predictions saved to ../outputs/predictions/random_forest_predictions_6.csv


### 3.2 Optuna-Optimized XGBoost

In [3]:
from src.data.model_training import optimize_xgboost_with_optuna

# Run the Optuna search for XGBoost on the same train/validation split.
# NOTE: `val_rmse` is rebound here, replacing the value from the baseline cell.
optuna_options = {
    "config_path": "../configs/config.yaml",
    "suffix": "optuna",
    "n_trials": 100,
}
xgbmodel, val_rmse, best_params = optimize_xgboost_with_optuna(
    X_train, y_train, X_val, y_val, **optuna_options
)

[I 2025-05-30 05:05:49,944] A new study created in memory with name: no-name-4cbffd4f-1b1b-4be1-92c8-70935652a6c1
[I 2025-05-30 05:05:53,233] Trial 0 finished with value: 4.212680774284977 and parameters: {'n_estimators': 691, 'max_depth': 3, 'learning_rate': 0.15150489807299353, 'subsample': 0.914508599767698, 'colsample_bytree': 0.9764012506727722, 'gamma': 3.371930207152678, 'reg_alpha': 1.748238792224276, 'reg_lambda': 4.060478307764652}. Best is trial 0 with value: 4.212680774284977.
[I 2025-05-30 05:06:03,926] Trial 1 finished with value: 2.891508060323374 and parameters: {'n_estimators': 510, 'max_depth': 9, 'learning_rate': 0.036728524384104885, 'subsample': 0.6056040575131305, 'colsample_bytree': 0.6838508296626272, 'gamma': 4.8994169500433085, 'reg_alpha': 0.42883951744567994, 'reg_lambda': 0.8070364504519806}. Best is trial 1 with value: 2.891508060323374.
[I 2025-05-30 05:06:32,257] Trial 2 finished with value: 2.932844002686095 and parameters: {'n_estimators': 673, 'max_de

Model saved to ../outputs/models/xgboost_optuna_optuna.joblib
Best Validation RMSE: 2.6896


In [5]:
import sys

# Put the parent directory on the import path so the `src` imports resolve;
# guarded so re-running the cell does not add duplicate '..' entries.
if '..' not in sys.path:
    sys.path.append('..')

from src.utils.io import load_model
from src.data.model_training import predict_and_save

# Get predictions for the test data using the saved Optuna-tuned XGBoost model.
model = load_model("../outputs/models/xgboost_optuna_optuna.joblib")
X_test = test_df
predict_and_save(model, X_test, config_path="../configs/config.yaml", suffix="xgb")

Predictions saved to ../outputs/predictions/predictions_xgb.csv


#### 3.2.1 Feature importance of XGB model

In [4]:
import sys

# Put the parent directory on the import path so the `src` import resolves;
# guarded so re-running the cell does not add duplicate '..' entries.
if '..' not in sys.path:
    sys.path.append('..')

from src.utils.io import load_model

model = load_model("../outputs/models/xgboost_optuna_optuna.joblib")

# Get feature importances
importances = model.feature_importances_

# Prefer the feature names recorded on the model itself: the model is loaded
# from disk, so pairing its importances with the in-memory X_train columns
# would silently mislabel features if the column order ever differed.
# Fall back to X_train.columns when the model exposes no names.
feature_names = getattr(model, "feature_names_in_", None)
if feature_names is None:
    feature_names = X_train.columns

# Create and sort DataFrame (most important feature first)
importance_df = pd.DataFrame({
    "feature": list(feature_names),
    "importance": importances,
}).sort_values(by="importance", ascending=False)

# Display top 20 features
print(importance_df.head(20))



                               feature  importance
74         composition_label_1_encoded    0.286913
77            track_identifier_encoded    0.215141
75         composition_label_2_encoded    0.208035
76          creator_collective_encoded    0.046694
73         composition_label_0_encoded    0.046539
64                 is_new_year_release    0.009245
53                        release_year    0.007129
69            songs_released_that_year    0.006730
72  release_dayofweek_popularity_score    0.005196
82         weekday_of_release_Thursday    0.005132
63                  is_valentines_week    0.004976
85            season_of_release_autumn    0.004070
20                    time_signature_1    0.003899
70           songs_released_that_month    0.003679
56                   release_dayofweek    0.003550
84        weekday_of_release_Wednesday    0.003127
13               album_component_count    0.002856
29                 organic_immersion_0    0.002834
81           weekday_of_release