In [1]:
# Anomaly Evaluation and Reporting

import pandas as pd
import numpy as np
from pathlib import Path

# --- Locations of the inputs and outputs ------------------------------------
DATA_DIR = Path('/Users/doananh/Documents/đồ án DS')
CLEAN = DATA_DIR / 'data_motobikes_clean.csv'
ANOM_DIR = DATA_DIR / 'anomaly_outputs'
OUT_SUMMARY = DATA_DIR / 'anomaly_summary.csv'

# Load the cleaned listings and the per-detector anomaly outputs.
Df = pd.read_csv(CLEAN, low_memory=False)
rb = pd.read_csv(ANOM_DIR / 'anomalies_residual.csv')
iso = pd.read_csv(ANOM_DIR / 'anomalies_isolation.csv')

# Join key: prefer 'href', fall back to 'id'. The key has to exist in all
# three frames, otherwise the column selections / merges below cannot work.
key = next(
    (
        candidate
        for candidate in ('href', 'id')
        if all(candidate in frame.columns for frame in (Df, rb, iso))
    ),
    None,
)
if key is None:
    raise KeyError(
        "No common join key: expected 'href' or 'id' in the clean data "
        "and in both anomaly files"
    )

# Keep only the detector columns that are reported downstream.
rb_sel = rb[[key, 'price_pred', 'residual', 'pct_err', 'is_anomaly_residual']].copy()
iso_sel = iso[[key, 'iso_score', 'is_anomaly_iso']].copy()

# Descriptive listing columns carried into every report.
LISTING_COLS = [
    'tieu_de', 'thuong_hieu', 'dong_xe', 'nam_dang_ky',
    'so_km', 'tinh_thanh', 'quan', 'gia_vnd_final',
]

# Left joins keep every listing; rows without a detector result get NaN in
# that detector's columns.
# NOTE(review): duplicate key values in either anomaly file would multiply
# rows here -- confirm the keys are unique upstream.
merged = (
    Df[[key, *LISTING_COLS]]
    .merge(rb_sel, on=key, how='left')
    .merge(iso_sel, on=key, how='left')
)

# Explicit boolean view of the two flags: a missing value (no detector result
# after the left join) counts as "not flagged". The original flag columns in
# `merged` are left untouched so the saved CSV still shows the missing values.
FLAG_COLS = ['is_anomaly_residual', 'is_anomaly_iso']
flags = pd.DataFrame(
    {col: merged[col].notna() & merged[col].astype(bool) for col in FLAG_COLS}
)

# Combine anomaly flags: flagged by at least one detector.
merged['is_anomaly_any'] = flags.any(axis=1)

# Basic summaries
summary = {
    'total_rows': len(merged),
    'num_residual_flags': int(flags['is_anomaly_residual'].sum()),
    'num_iso_flags': int(flags['is_anomaly_iso'].sum()),
    'num_any_flags': int(merged['is_anomaly_any'].sum()),
}
print(summary)

# Top anomalies lists
TOP_K = 100

# By absolute pct_err. The ranking is computed on the rows that actually have
# a pct_err; reindexing with the index of the *full* frame would re-insert the
# dropped rows as all-NaN rows and upcast integer columns to float.
with_pct_err = merged.dropna(subset=['pct_err'])
pct_err_order = with_pct_err['pct_err'].abs().sort_values(ascending=False).index
top_resid = with_pct_err.loc[pct_err_order].head(TOP_K)
top_resid.to_csv(DATA_DIR / 'top_anomalies_residual.csv', index=False)

# By iso_score, largest first.
# NOTE(review): assumes a larger iso_score means "more anomalous" -- confirm
# against the script that produced anomalies_isolation.csv.
top_iso = merged.dropna(subset=['iso_score']).sort_values('iso_score', ascending=False).head(TOP_K)
top_iso.to_csv(DATA_DIR / 'top_anomalies_isolation.csv', index=False)

# Save overall merged for reference
merged.to_csv(OUT_SUMMARY, index=False)
print('Saved anomaly summary to:', OUT_SUMMARY)



{'total_rows': 7352, 'num_residual_flags': 361, 'num_iso_flags': 385, 'num_any_flags': 703}
Saved anomaly summary to: /Users/doananh/Documents/đồ án DS/anomaly_summary.csv
