In [3]:
# Train Anomaly Models: Residual-based and IsolationForest

import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.ensemble import IsolationForest
from sklearn.pipeline import Pipeline
import joblib

# Project folder; every other location below is resolved relative to it.
DATA_DIR = Path('/Users/doananh/Documents/đồ án DS')

# Cleaned dataset and the folder holding the saved preprocessor.
CLEAN_FILE = DATA_DIR.joinpath('data_motobikes_clean.csv')
ARTIFACT_DIR = DATA_DIR.joinpath('artifacts')

# Model and report folders are created on demand. Only the leaf folder is
# created, so DATA_DIR itself must already exist.
MODEL_DIR = DATA_DIR.joinpath('models')
MODEL_DIR.mkdir(exist_ok=True)
OUT_DIR = DATA_DIR.joinpath('anomaly_outputs')
OUT_DIR.mkdir(exist_ok=True)

# Fail fast with an actionable message when the price model that the
# residual-based step relies on has not been produced yet.
price_model_path = MODEL_DIR.joinpath('price_model.joblib')
if not price_model_path.exists():
    raise FileNotFoundError(
        f"File not found: {price_model_path}\n"
        "Please run 'train_price_models.ipynb' first to create the price model."
    )

# Cleaned dataset; low_memory=False makes pandas read each column in one pass.
Df = pd.read_csv(CLEAN_FILE, low_memory=False)

# The preprocessor bundle is a mapping holding the transformer together with
# the feature column lists and the target column name.
pre = joblib.load(ARTIFACT_DIR / 'preprocessor.joblib')
preprocessor, NUMERIC_FEATURES, CATEGORICAL_FEATURES, TARGET = (
    pre[key]
    for key in ('preprocessor', 'numeric_features', 'categorical_features', 'target')
)

# Feature matrix (numeric columns first, then categorical) and float target.
X = Df.loc[:, NUMERIC_FEATURES + CATEGORICAL_FEATURES]
y = Df[TARGET].astype('float64')

# 1) Residual-based anomalies: rows whose actual price deviates most, in
#    relative terms, from what the saved price model predicts.
print(f"Loading price model from: {price_model_path}")
price_art = joblib.load(price_model_path)
price_model = price_art['model']
print(f"Price model loaded. Best model: {price_art.get('best_name', 'unknown')}")

print("\n1) Computing residual-based anomalies...")
print("   Predicting prices...")
y_pred = price_model.predict(X)
residual = y - y_pred
# The relative error is kept only where the prediction is positive;
# every other row gets NaN and is therefore never flagged below.
pct_err = np.where(y_pred > 0, residual / y_pred, np.nan)

# Copy of the dataset extended with the prediction and both error measures.
Df_rb = Df.assign(price_pred=y_pred, residual=residual, pct_err=pct_err)

# Flag the rows whose |pct_err| reaches the (1 - k_pct) quantile.
# With no valid errors the threshold is NaN, so nothing gets flagged.
k_pct = 0.05
valid_pct_err = Df_rb['pct_err'].abs().dropna()
threshold = np.nanquantile(valid_pct_err, 1 - k_pct) if len(valid_pct_err) else np.nan
Df_rb['is_anomaly_residual'] = Df_rb['pct_err'].abs().ge(threshold)
num_anomalies_rb = Df_rb['is_anomaly_residual'].sum()
print(f"   Found {num_anomalies_rb} anomalies (top {k_pct*100}% by |pct_err|)")

# Every row is exported (not just the flagged ones), largest |pct_err| first.
rb_path = OUT_DIR / 'anomalies_residual.csv'
(
    Df_rb
    .sort_values(by='pct_err', key=np.abs, ascending=False)
    .to_csv(rb_path, index=False)
)
print(f'   Saved residual-based anomalies to: {rb_path}')

# 2) IsolationForest on transformed features + log price
#
# The detector is fitted on the preprocessed feature matrix with one extra
# column, log1p(price), so that unusual feature/price combinations stand out.
print("\n2) Training IsolationForest anomaly detector...")
from scipy import sparse

# Expected share of anomalies. This single setting drives the estimator's
# `contamination`, the score quantile used for flagging, and the log
# messages, so the three can no longer drift apart.
iso_contamination = 0.05

print("   Transforming features...")
Xt = preprocessor.transform(X)

# Use actual price when valid, otherwise fallback to predicted price
price_for_iso = np.where(~np.isnan(y.values) & (y.values > 0), y.values, y_pred)
# Negative fallback predictions are clamped to 0 so log1p stays defined.
price_for_iso = np.clip(price_for_iso, a_min=0, a_max=None)
log_price = np.log1p(price_for_iso).reshape(-1, 1)

# Remove rows where transformation still produced NaN/inf
mask_finite = np.isfinite(log_price).ravel()
if not mask_finite.all():
    removed = (~mask_finite).sum()
    print(f"   Removing {removed} rows with invalid log_price values before IsolationForest")
    log_price = log_price[mask_finite]
    # Boolean row selection is written identically for sparse and dense
    # matrices, so no sparse/dense branching is needed here.
    Xt = Xt[mask_finite]
    Df_iso_base = Df.loc[mask_finite].copy()
else:
    Df_iso_base = Df.copy()

# Append log_price as the last feature column, keeping Xt's sparsity.
if sparse.issparse(Xt):
    # Request CSR explicitly instead of relying on hstack's default output format.
    Xt_aug = sparse.hstack([Xt, sparse.csr_matrix(log_price)], format='csr')
else:
    Xt_aug = np.hstack([Xt, log_price])

print(f"   Fitting IsolationForest (contamination={iso_contamination})...")
iso = IsolationForest(
    n_estimators=300,
    contamination=iso_contamination,
    random_state=42,
    n_jobs=-1,
)
iso.fit(Xt_aug)
scores = -iso.score_samples(Xt_aug)  # higher => more anomalous

# Df_iso_base is left untouched; the scores go onto a separate copy.
Df_iso = Df_iso_base.copy()
Df_iso['iso_score'] = scores

# Flag the rows whose score reaches the (1 - contamination) quantile.
iso_threshold = np.quantile(scores, 1 - iso_contamination)
Df_iso['is_anomaly_iso'] = Df_iso['iso_score'] >= iso_threshold
num_anomalies_iso = Df_iso['is_anomaly_iso'].sum()
print(f"   Found {num_anomalies_iso} anomalies (top {iso_contamination:.0%} by iso_score)")

iso_model_path = MODEL_DIR / 'iso_model.joblib'
joblib.dump({'model': iso, 'features': {'use_log_price': True}}, iso_model_path)
print(f'   Saved IsolationForest model to: {iso_model_path}')

# Every scored row is exported (not just the flagged ones), highest score first.
iso_path = OUT_DIR / 'anomalies_isolation.csv'
Df_iso.sort_values('iso_score', ascending=False).to_csv(iso_path, index=False)
print(f'   Saved IsolationForest anomalies to: {iso_path}')

print("\n" + "="*50)
print("Anomaly detection completed!")



Loading price model from: /Users/doananh/Documents/đồ án DS/models/price_model.joblib
Price model loaded. Best model: rf

1) Computing residual-based anomalies...
   Predicting prices...
   Found 361 anomalies (top 5.0% by |pct_err|)
   Saved residual-based anomalies to: /Users/doananh/Documents/đồ án DS/anomaly_outputs/anomalies_residual.csv

2) Training IsolationForest anomaly detector...
   Transforming features...
   Fitting IsolationForest (contamination=0.05)...
   Found 361 anomalies (top 5% by iso_score)
   Saved IsolationForest model to: /Users/doananh/Documents/đồ án DS/models/iso_model.joblib
   Saved IsolationForest anomalies to: /Users/doananh/Documents/đồ án DS/anomaly_outputs/anomalies_isolation.csv

Anomaly detection completed!
