# 04. Model Training

Notebook này huấn luyện mô hình, bao gồm các nội dung sau:
- Decision Tree
- Random Forest
- Hyperparameter tuning
- Model evaluation

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Import project modules
import sys
from pathlib import Path

# Locate the project root: the nearest directory, starting at the working
# directory and walking upward, that contains both config/ and src/.
current_path = Path().resolve()

# Search the working directory plus at most this many ancestors.
max_levels = 5
search_dirs = [current_path, *current_path.parents][: max_levels + 1]
project_root = next(
    (
        candidate
        for candidate in search_dirs
        if (candidate / 'config').exists() and (candidate / 'src').exists()
    ),
    None,
)

if project_root is None:
    # Nothing matched. Fall back explicitly rather than keeping whichever
    # ancestor the search stopped on (possibly the filesystem root): use the
    # parent directory when the path mentions 'notebooks', otherwise the
    # working directory itself.
    project_root = current_path.parent if 'notebooks' in str(current_path) else current_path

# Add src to Python path so the project modules below can be imported.
src_path = project_root / 'src'
if not src_path.exists():
    # Report the problem here instead of leaving only a later import error.
    print(f"Warning: {src_path} not found; project module imports may fail.", file=sys.stderr)
elif str(src_path) not in sys.path:
    # Skip the insert when already present so re-running the cell is idempotent.
    sys.path.insert(0, str(src_path))

from models.decision_tree import create_decision_tree_model, tune_decision_tree, save_model
from models.random_forest import create_random_forest_model, tune_random_forest, save_model as save_rf_model
from models.train import train_model

## 1. Load Data

In [2]:
# Load the processed train / validation / test splits from data/processed.
processed_dir = project_root / "data" / "processed"

X_train = pd.read_csv(processed_dir / "X_train.csv")
X_val = pd.read_csv(processed_dir / "X_val.csv")
X_test = pd.read_csv(processed_dir / "X_test.csv")

# squeeze("columns") turns a single-column frame into a Series but, unlike a
# bare squeeze(), never collapses a one-row file into a scalar.
y_train = pd.read_csv(processed_dir / "y_train.csv").squeeze("columns")
y_val = pd.read_csv(processed_dir / "y_val.csv").squeeze("columns")
y_test = pd.read_csv(processed_dir / "y_test.csv").squeeze("columns")

# Each feature matrix must line up row-for-row with its target.
assert len(X_train) == len(y_train), "X_train / y_train row count mismatch"
assert len(X_val) == len(y_val), "X_val / y_val row count mismatch"
assert len(X_test) == len(y_test), "X_test / y_test row count mismatch"

print(f"Training set: {X_train.shape}")
print(f"Validation set: {X_val.shape}")
print(f"Test set: {X_test.shape}")

Training set: (8631, 17)
Validation set: (1233, 17)
Test set: (2466, 17)


## 2. Train Decision Tree

In [4]:
# Decision Tree settings, collected in one place so they are easy to tweak.
dt_params = dict(
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    class_weight='balanced',
)

# Create the model and hand it to train_model together with the training data.
dt_model = train_model(create_decision_tree_model(**dt_params), X_train, y_train)

# Persist the result via the project's save helper.
save_model(dt_model, 'decision_tree.pkl')
print("Decision Tree model trained and saved!")

Model saved to D:\code\machine\bai-cuoi-ky\models\decision_tree.pkl
Decision Tree model trained and saved!


## 3. Train Random Forest

In [5]:
# Random Forest settings, collected in one place so they are easy to tweak.
rf_params = dict(
    n_estimators=100,
    max_depth=None,
    max_features='sqrt',
    class_weight='balanced',
)

# Create the model and hand it to train_model together with the training data.
rf_model = train_model(create_random_forest_model(**rf_params), X_train, y_train)

# Persist the result via the project's save helper.
save_rf_model(rf_model, 'random_forest.pkl')
print("Random Forest model trained and saved!")

Model saved to D:\code\machine\bai-cuoi-ky\models\random_forest.pkl
Random Forest model trained and saved!


## 4. Hyperparameter Tuning (Optional)

In [None]:
# Hyperparameter tuning is optional and skipped by default.
# Set RUN_TUNING = True to run it; no code needs to be uncommented.
RUN_TUNING = False

if RUN_TUNING:
    print("Tuning Decision Tree...")
    dt_grid = tune_decision_tree(X_train, y_train, cv=5, scoring='f1')
    # NOTE(review): assumes the tuner returns an object exposing
    # best_params_ / best_score_ (e.g. a fitted search) — confirm.
    print(f"Best parameters: {dt_grid.best_params_}")
    print(f"Best score: {dt_grid.best_score_}")

    print("\nTuning Random Forest...")
    rf_grid = tune_random_forest(X_train, y_train, cv=5, scoring='f1')
    print(f"Best parameters: {rf_grid.best_params_}")
    print(f"Best score: {rf_grid.best_score_}")