# Model Training

## 1. Preparing Environment

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import yaml
from sklearn.model_selection import train_test_split

import warnings 
warnings.filterwarnings('ignore')

## 2. Spliting data set

In [2]:
import sys
sys.path.append('..')

from src.utils.utils import load_config
from src.data.load_data import load_data

config = load_config('../configs/config.yaml')

# Load data
train_df, test_df = load_data(train_path="../data/processed/train_encoded.csv",
                              test_path="../data/processed/test_encoded.csv"
                              )

# Split features and target
X = train_df.drop(columns=[config["model"]["target_column"]])
y = train_df[config["model"]["target_column"]]

# Split into train/validation
X_train, X_val, y_train, y_val = train_test_split(
    X, y, 
    test_size=config["test_size"], 
    random_state=config["random_seed"]
)

Loading training data from ../data/processed/train_encoded.csv
Loading test data from ../data/processed/test_encoded.csv
Train shape: (61609, 96)
Test shape: (41074, 95)


## 3. Model Tranining and Baseline

### 3.1 Baseline model - Random Forest Regressor

In [3]:
from src.data.model_training import train_and_evaluate

model, val_rmse = train_and_evaluate(
    X_train, y_train, X_val, y_val, 
    config_path="../configs/config.yaml", 
    params_path="../configs/model_params.yaml",
    suffix=3
)




Model saved to ../outputs/models/random_forest_3.joblib
Validation RMSE: 10.9913, Accuracy: 0.8665


In [4]:
from src.utils.io import load_model
from src.data.model_training import predict_and_save
# get predictions for the test datafrom src.utils.io import load_model
model = load_model("../outputs/models/random_forest_3.joblib")
X_test = test_df  
predict_and_save(model, X_test, config_path="../configs/config.yaml", suffix="_9")

Predictions saved to ../outputs/predictions/predictions__9.csv


### 3.2 Optuna-Optimized XGBoost

In [5]:
from src.data.model_training import optimize_xgboost_with_optuna

xgbmodel, val_rmse, best_params = optimize_xgboost_with_optuna(
    X_train, y_train, X_val, y_val,
    config_path="../configs/config.yaml",
    suffix="optuna_2",
    n_trials=100
)

[I 2025-05-30 22:44:57,858] A new study created in memory with name: no-name-bc9fccd7-b078-4840-a06e-246341339dec


[I 2025-05-30 22:46:04,303] Trial 0 finished with value: 8.827370076267274 and parameters: {'n_estimators': 938, 'max_depth': 14, 'learning_rate': 0.022159870866153442, 'subsample': 0.9271302455293515, 'colsample_bytree': 0.6049982728152821, 'gamma': 4.783577254550101, 'reg_alpha': 2.3543367117766003, 'reg_lambda': 3.95768310249324}. Best is trial 0 with value: 8.827370076267274.
[I 2025-05-30 22:47:05,257] Trial 1 finished with value: 9.239634711901493 and parameters: {'n_estimators': 282, 'max_depth': 13, 'learning_rate': 0.014244913514137616, 'subsample': 0.8639925678633418, 'colsample_bytree': 0.9359593779147231, 'gamma': 0.12069617667305688, 'reg_alpha': 3.041601331272177, 'reg_lambda': 4.336286297997884}. Best is trial 0 with value: 8.827370076267274.
[I 2025-05-30 22:47:22,743] Trial 2 finished with value: 8.934380260251162 and parameters: {'n_estimators': 715, 'max_depth': 8, 'learning_rate': 0.07754999462692534, 'subsample': 0.9864510324358711, 'colsample_bytree': 0.8678945605

Model saved to ../outputs/models/xgboost_optuna_optuna_2.joblib
Best Validation RMSE: 8.6246, Accuracy: 0.9335


In [6]:
import sys
sys.path.append('..')
from src.utils.io import load_model
from src.data.model_training import predict_and_save
# get predictions for the test datafrom src.utils.io import load_model
model = load_model("../outputs/models/xgboost_optuna_optuna.joblib")
X_test = test_df  
predict_and_save(model, X_test, config_path="../configs/config.yaml", suffix="xgb_2")

Predictions saved to ../outputs/predictions/predictions_xgb_2.csv


#### 3.2.1 Feature importance of XGB model

In [4]:
import sys
sys.path.append('..')
from src.utils.io import load_model

model = load_model("../outputs/models/xgboost_optuna_optuna.joblib")

# Get feature importances
importances = model.feature_importances_
feature_names = X_train.columns

# Create and sort DataFrame
importance_df = pd.DataFrame({
    "feature": feature_names,
    "importance": importances
}).sort_values(by="importance", ascending=False)

# Display top 20 features
print(importance_df.head(20))



                               feature  importance
74         composition_label_1_encoded    0.286913
77            track_identifier_encoded    0.215141
75         composition_label_2_encoded    0.208035
76          creator_collective_encoded    0.046694
73         composition_label_0_encoded    0.046539
64                 is_new_year_release    0.009245
53                        release_year    0.007129
69            songs_released_that_year    0.006730
72  release_dayofweek_popularity_score    0.005196
82         weekday_of_release_Thursday    0.005132
63                  is_valentines_week    0.004976
85            season_of_release_autumn    0.004070
20                    time_signature_1    0.003899
70           songs_released_that_month    0.003679
56                   release_dayofweek    0.003550
84        weekday_of_release_Wednesday    0.003127
13               album_component_count    0.002856
29                 organic_immersion_0    0.002834
81           weekday_of_release