# Model Training

## 1. Preparing Environment

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import yaml
from sklearn.model_selection import train_test_split

import warnings 
warnings.filterwarnings('ignore')

## 2. Splitting the data set

In [2]:
import sys

# Put the parent directory on the import path so the `src` package can be imported.
sys.path.append('..')

from src.utils.utils import load_config
from src.data.load_data import load_data

# Project configuration; later cells read settings from this object too.
config = load_config('../configs/config.yaml')

# Read the encoded train and test tables.
train_csv = "../data/processed/train_encoded.csv"
test_csv = "../data/processed/test_encoded.csv"
train_df, test_df = load_data(train_path=train_csv, test_path=test_csv)

# Separate the target column (named in the config) from the feature columns.
target_col = config["model"]["target_column"]
y = train_df[target_col]
X = train_df.drop(columns=[target_col])

# Hold out part of the training rows for validation; the hold-out fraction and
# the random seed both come from the config.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=config["test_size"],
    random_state=config["random_seed"],
)

Loading training data from ../data/processed/train_encoded.csv
Loading test data from ../data/processed/test_encoded.csv
Train shape: (61609, 74)
Test shape: (41074, 73)


## 3. Model Training and Baseline

### 3.1 Baseline model - Random Forest Regressor

In [3]:
from src.data.model_training import predict_and_save, train_and_evaluate

# Options for this training run. Per the recorded output of this cell, the
# suffix ends up in the saved model's file name (random_forest2.joblib).
training_options = dict(
    config_path="../configs/config.yaml",
    params_path="../configs/model_params.yaml",
    suffix=2,
)

# Fit on the training split and evaluate on the validation split; the call
# hands back the model together with its validation RMSE.
model, val_rmse = train_and_evaluate(
    X_train, y_train, X_val, y_val, **training_options
)




Model saved to ../outputs/models/random_forest2.joblib
Validation RMSE: 4.3149


In [4]:
from src.utils.io import load_model
from src.data.model_training import predict_and_save

# Get predictions for the test data.
# Load the artifact written by the training cell in section 3.1, whose recorded
# output reports "../outputs/models/random_forest2.joblib". Loading the
# un-suffixed "random_forest.joblib" would silently score a model from a
# different run (or fail if that file does not exist).
model = load_model("../outputs/models/random_forest2.joblib")

# The test frame has one column fewer than the training frame (see the shapes
# printed when the data was loaded), so it is passed through unchanged.
# NOTE(review): assumes the missing column is the target -- confirm.
X_test = test_df
predict_and_save(model, X_test, config_path="../configs/config.yaml", suffix="_2")

Predictions saved to ../outputs/predictions/random_forest_predictions_2.csv
