In [2]:
# Train price models (RF, HGBR), compare them with 5-fold CV, and save the best by MAE

import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import KFold, cross_validate
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from scipy import sparse
import joblib

# Transformer to convert sparse to dense (needed for HistGradientBoostingRegressor)
class SparseToDense(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that densifies SciPy sparse input.

    Anything that is not a SciPy sparse container is passed through
    unchanged, so the step is safe to place after a preprocessor whose
    output may be either sparse or dense.
    """

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X.toarray()`` for sparse ``X``, otherwise ``X`` itself."""
        return X.toarray() if sparse.issparse(X) else X

# Locations of the cleaned dataset, the fitted-preprocessor artifact and the
# directory that receives trained models / CV reports.
DATA_DIR = Path('/Users/doananh/Documents/đồ án DS')
CLEAN_FILE = DATA_DIR / 'data_motobikes_clean.csv'
ARTIFACT_DIR = DATA_DIR / 'artifacts'
MODEL_DIR = DATA_DIR / 'models'

# Check dependencies first, so a missing input produces the actionable
# "please run ..." message rather than an unrelated mkdir failure.
if not CLEAN_FILE.exists():
    raise FileNotFoundError(f"File not found: {CLEAN_FILE}\nPlease run 'preprocess_validate.ipynb' first.")
preprocessor_path = ARTIFACT_DIR / 'preprocessor.joblib'
if not preprocessor_path.exists():
    raise FileNotFoundError(f"File not found: {preprocessor_path}\nPlease run 'prep_preprocessor.ipynb' first.")

# Inputs are present, so DATA_DIR exists; create the output directory now.
MODEL_DIR.mkdir(parents=True, exist_ok=True)

print(f"Loading cleaned data from: {CLEAN_FILE}")
Df = pd.read_csv(CLEAN_FILE, low_memory=False)
print(f"  Loaded {len(Df)} rows")

print(f"Loading preprocessor from: {preprocessor_path}")
# NOTE: joblib.load unpickles arbitrary objects; only load artifacts that
# were produced by this project.
pre = joblib.load(preprocessor_path)
preprocessor = pre['preprocessor']
NUMERIC_FEATURES = pre['numeric_features']
CATEGORICAL_FEATURES = pre['categorical_features']
TARGET = pre['target']

# Normalise to lists so `+` is always concatenation, whatever sequence type
# the artifact stored.
feature_columns = list(NUMERIC_FEATURES) + list(CATEGORICAL_FEATURES)

# Fail early, with the offending names, if the CSV and the artifact disagree.
missing_columns = [col for col in feature_columns + [TARGET] if col not in Df.columns]
if missing_columns:
    raise KeyError(f"Columns missing from {CLEAN_FILE.name}: {missing_columns}")

X = Df[feature_columns]
y = Df[TARGET].astype(float)

# Check target statistics
print(f"\nTarget ({TARGET}) statistics:")
print(f"  Total rows: {len(y)}")
print(f"  Missing (NaN): {y.isna().sum()}")
print(f"  Zero or negative: {(y <= 0).sum()}")
print(f"  Valid (> 0): {(y > 0).sum()}")

# Remove rows with missing/invalid target (required for training).
# X and y are filtered with the same mask so they stay row-aligned.
mask_valid = y.notna() & (y > 0)
X = X[mask_valid].copy()
y = y[mask_valid].copy()
print(f"\nAfter filtering: {len(X)} rows (removed {len(Df) - len(X)} rows with missing/invalid target)")

# Internal sanity checks on the filter above (not input validation).
assert y.notna().all(), f"Still found {y.isna().sum()} NaN values in y after filtering!"
assert (y > 0).all(), f"Still found {(y <= 0).sum()} non-positive values in y after filtering!"

# Helpers

def mape(y_true, y_pred):
    """Return the mean absolute percentage error as a fraction (not x100).

    Only positions whose true value is strictly positive contribute; the
    remaining positions are dropped from both arrays before averaging.
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    positive = actual > 0
    actual_pos = actual[positive]
    relative_error = (actual_pos - predicted[positive]) / actual_pos
    return np.abs(relative_error).mean()

# Shuffled 5-fold split with a fixed seed so every candidate is scored on the
# same folds and reruns are reproducible.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

candidates = {
    'rf': RandomForestRegressor(
        n_estimators=600,
        max_depth=25,
        min_samples_leaf=2,
        random_state=42,
        n_jobs=-1
    ),
    'hgbr': HistGradientBoostingRegressor(
        learning_rate=0.05,
        max_depth=10,
        max_iter=800,
        min_samples_leaf=15,
        random_state=42
    )
}


def _build_pipeline(name, model):
    """Return the preprocessing + model pipeline for candidate *name*.

    HGBR requires dense data, so a SparseToDense step is inserted for it;
    other candidates receive the preprocessor output directly.
    """
    steps = [('pre', preprocessor)]
    if name == 'hgbr':
        steps.append(('sparse_to_dense', SparseToDense()))
    steps.append(('model', model))
    return Pipeline(steps)


rows = []
results_per_model = {}

print(f"\nTraining {len(candidates)} models with {cv.n_splits}-fold CV...")
for name, model in candidates.items():
    print(f"\nTraining {name}...")
    pipe = _build_pipeline(name, model)
    # All four metrics come from one manual pass over the folds. A separate
    # cross_validate() call used to run here, but its scores were never read,
    # so it only doubled the training cost.
    maes, rmses, r2s, mapes = [], [], [], []
    for tr_idx, te_idx in cv.split(X):
        Xtr, Xte = X.iloc[tr_idx], X.iloc[te_idx]
        ytr, yte = y.iloc[tr_idx], y.iloc[te_idx]
        # Refit on the training fold only, so preprocessing never sees the
        # held-out rows.
        pipe.fit(Xtr, ytr)
        yhat = pipe.predict(Xte)
        maes.append(mean_absolute_error(yte, yhat))
        rmses.append(np.sqrt(mean_squared_error(yte, yhat)))
        r2s.append(r2_score(yte, yhat))
        mapes.append(mape(yte, yhat))
    row = {
        'model': name,
        'MAE_mean': np.mean(maes), 'MAE_std': np.std(maes),
        'RMSE_mean': np.mean(rmses), 'RMSE_std': np.std(rmses),
        'R2_mean': np.mean(r2s), 'R2_std': np.std(r2s),
        'MAPE_mean': np.mean(mapes), 'MAPE_std': np.std(mapes)
    }
    rows.append(row)
    results_per_model[name] = row
    print(f"  {name} - MAE: {row['MAE_mean']:.2f}, R²: {row['R2_mean']:.4f}")

# NOTE(review): the recorded run reports strongly negative R² and MAPE in the
# thousands for both candidates, with RMSE far above MAE. That looks like a
# few extreme target values dominating the error - consider auditing the
# target or training on a log-transformed target; confirm before relying on
# the saved model.

print("\n" + "="*50)
cv_df = pd.DataFrame(rows).sort_values('MAE_mean')
cv_path = MODEL_DIR / 'price_cv_results.csv'
cv_df.to_csv(cv_path, index=False)
print('Saved CV results to:', cv_path)
print(cv_df)

# Select best by MAE (cv_df is sorted ascending, so the first row wins).
best_name = cv_df.iloc[0]['model']
print(f"\nBest model (lowest MAE): {best_name}")
best_model = candidates[best_name]
final_pipe = _build_pipeline(best_name, best_model)
print("Fitting final model on full dataset...")
final_pipe.fit(X, y)
print("Done!")

# Persist the fitted pipeline together with the CV table that justified it.
best_path = MODEL_DIR / 'price_model.joblib'
joblib.dump({'model': final_pipe, 'cv_results': cv_df.to_dict(orient='records'), 'best_name': best_name}, best_path)
print(f'\nSaved best price model to: {best_path}')



Loading cleaned data from: /Users/doananh/Documents/đồ án DS/data_motobikes_clean.csv
  Loaded 7208 rows
Loading preprocessor from: /Users/doananh/Documents/đồ án DS/artifacts/preprocessor.joblib

Target (gia_vnd_final) statistics:
  Total rows: 7208
  Missing (NaN): 2
  Zero or negative: 1
  Valid (> 0): 7205

After filtering: 7205 rows (removed 3 rows with missing/invalid target)

Training 2 models with 5-fold CV...

Training rf...
  rf - MAE: 60672658.90, R²: -145.0436

Training hgbr...
  hgbr - MAE: 134754490.84, R²: -238.4657

Saved CV results to: /Users/doananh/Documents/đồ án DS/models/price_cv_results.csv
  model      MAE_mean       MAE_std     RMSE_mean      RMSE_std     R2_mean  \
0    rf  6.067266e+07  2.586947e+07  1.381737e+09  1.132382e+09 -145.043598   
1  hgbr  1.347545e+08  1.643667e+07  1.492230e+09  1.061491e+09 -238.465709   

       R2_std    MAPE_mean      MAPE_std  
0   87.582280  6966.775861  12104.440811  
1  191.555846  9096.418655  10500.565985  

Best model 