In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from catboost import CatBoostClassifier
import sklearn.metrics as metrics
import pickle
from time_series_split import *
print(type(metrics))
from imblearn.over_sampling import SMOTE

<class 'module'>


In [2]:
def calculate_aqi(pm25):
    ranges = [
        (0.0, 12.0, 0, 50),
        (12.1, 35.4, 51, 100),
        (35.5, 55.4, 101, 150),
        (55.5, 150.4, 151, 200),
        (150.5, 250.4, 201, 300),
        (250.5, 500.0, 301, 500),
    ]
    for c_low, c_high, aqi_low, aqi_high in ranges:
        if c_low <= pm25 <= c_high:
            return round((aqi_high - aqi_low) / (c_high - c_low) * (pm25 - c_low) + aqi_low)
    return 500  # default nếu vượt ngưỡng

def aqi_class(aqi):
    if aqi <= 50: return 0
    elif aqi <= 100: return 1
    elif aqi <= 150: return 2
    elif aqi <= 200: return 3
    elif aqi <= 300: return 4
    else: return 5

In [3]:
combined_data = pd.read_csv('/home/thu/INT3041E_AI_PM2.5-Concentration-Estimation/data/add_AQI.csv')
combined_data['AQI'] = combined_data['pm25'].apply(calculate_aqi)
combined_data['AQI_Class'] = combined_data['AQI'].apply(aqi_class)
folds = split_consolidated_data()
print(f"Number of folds: {len(folds)}")

Number of folds: 3


In [4]:
params = {
    'iterations': 500,  # Tuning iterations for CatBoost
    'depth': 8,
    'learning_rate': 0.3,
    "l2_leaf_reg": 3,
    'loss_function': 'MultiClass',
    'custom_metric': ['AUC'],
    'cat_features': []  # List of categorical features if any
}

In [5]:
# Biến lưu nhãn thật và dự đoán trên toàn bộ test sets
all_y_true = []
all_y_pred = []
val_accuracies = []
test_accuracies = []
test_classification_reports = []

best_val_accuracy = 0
best_model = None
all_target_names = ['Good', 'Moderate', 'Unhealthy for Sensitive', 'Unhealthy', 'Very Unhealthy', 'Hazardous']


In [8]:
for i, fold in enumerate(folds):
    print(f"\nProcessing Fold {i+1}/{len(folds)}")

    train_data = fold['train']
    val_data = fold['validation']
    test_data = fold['test']

    feature_columns = train_data.columns[3:-2]
    X_train = train_data[feature_columns]
    y_train = train_data['AQI_Class']
    X_val = val_data[feature_columns]
    y_val = val_data['AQI_Class']
    X_test = test_data[feature_columns]
    y_test = test_data['AQI_Class']
    
    smote = SMOTE(random_state=42)
    X_train, y_train = smote.fit_resample(X_train, y_train)

    clf = CatBoostClassifier(**params, random_seed=43)
    clf.fit(X_train, y_train, verbose=100)

    train_acc = clf.score(X_train, y_train)
    val_acc = clf.score(X_val, y_val)
    test_acc = clf.score(X_test, y_test)

    print(f"Train Accuracy: {train_acc:.4f} | Validation Accuracy: {val_acc:.4f} | Test Accuracy: {test_acc:.4f}")

    val_accuracies.append(val_acc)
    test_accuracies.append(test_acc)

    if val_acc > best_val_accuracy:
        best_val_accuracy = val_acc
        best_model = clf

    y_pred = clf.predict(X_test)
    
    # Lưu nhãn thật và nhãn dự đoán
    all_y_true.extend(y_test.tolist())
    all_y_pred.extend(y_pred.tolist())

    report = metrics.classification_report(
        y_test, y_pred, target_names=all_target_names, labels=[0, 1, 2, 3, 4, 5], output_dict=True, zero_division=1
    )
    test_classification_reports.append(report)


Processing Fold 1/3
0:	learn: 1.5042845	total: 266ms	remaining: 2m 12s
100:	learn: 0.2894852	total: 16.8s	remaining: 1m 6s
200:	learn: 0.1653375	total: 30s	remaining: 44.6s
300:	learn: 0.1114797	total: 42.8s	remaining: 28.3s
400:	learn: 0.0796780	total: 57.1s	remaining: 14.1s
499:	learn: 0.0602929	total: 1m 10s	remaining: 0us
Train Accuracy: 0.9987 | Validation Accuracy: 0.6885 | Test Accuracy: 0.4251

Processing Fold 2/3
0:	learn: 1.5001792	total: 173ms	remaining: 1m 26s
100:	learn: 0.2828136	total: 12.5s	remaining: 49.5s
200:	learn: 0.1663182	total: 26.3s	remaining: 39.1s
300:	learn: 0.1126487	total: 40.5s	remaining: 26.8s
400:	learn: 0.0797486	total: 57.8s	remaining: 14.3s
499:	learn: 0.0610071	total: 1m 12s	remaining: 0us
Train Accuracy: 0.9986 | Validation Accuracy: 0.4709 | Test Accuracy: 0.4967

Processing Fold 3/3
0:	learn: 1.5019723	total: 122ms	remaining: 1m
100:	learn: 0.2933683	total: 16.1s	remaining: 1m 3s
200:	learn: 0.1766715	total: 32.6s	remaining: 48.5s
300:	learn: 0.

In [9]:
mean_val_accuracy = np.mean(val_accuracies)
mean_test_accuracy = np.mean(test_accuracies)
print(f"\nAverage Validation Accuracy: {mean_val_accuracy:.4f}")
print(f"Average Test Accuracy: {mean_test_accuracy:.4f}")


Average Validation Accuracy: 0.6106
Average Test Accuracy: 0.4506


In [10]:
# Báo cáo trung bình từng lớp
avg_report = {}
for label in all_target_names:
    precisions = [r[label]['precision'] for r in test_classification_reports if label in r]
    recalls = [r[label]['recall'] for r in test_classification_reports if label in r]
    f1_scores = [r[label]['f1-score'] for r in test_classification_reports if label in r]
    supports = [r[label]['support'] for r in test_classification_reports if label in r]

    avg_report[label] = {
        'precision': np.mean(precisions),
        'recall': np.mean(recalls),
        'f1-score': np.mean(f1_scores),
        'support': sum(supports),
        'num_folds': len(precisions)
    }

In [11]:
# Weighted avg
avg_report['weighted avg'] = {
    'precision': np.mean([r['weighted avg']['precision'] for r in test_classification_reports]),
    'recall': np.mean([r['weighted avg']['recall'] for r in test_classification_reports]),
    'f1-score': np.mean([r['weighted avg']['f1-score'] for r in test_classification_reports]),
    'support': sum([r['weighted avg']['support'] for r in test_classification_reports])
}


In [12]:
# In ra report trung bình
print("\n📋 Average Classification Report:")
for label in sorted(avg_report.keys(), key=lambda x: x if x != 'weighted avg' else 'zzz'):
    print(f"{label}:")
    print(f"  Precision: {avg_report[label]['precision']:.4f}")
    print(f"  Recall: {avg_report[label]['recall']:.4f}")
    print(f"  F1-score: {avg_report[label]['f1-score']:.4f}")
    print(f"  Support: {avg_report[label]['support']:.0f}")
    if label != 'weighted avg':
        print(f"  Number of folds with this class: {avg_report[label]['num_folds']}")



📋 Average Classification Report:
Good:
  Precision: 0.5879
  Recall: 0.3771
  F1-score: 0.4590
  Support: 436
  Number of folds with this class: 5
Hazardous:
  Precision: 0.6000
  Recall: 0.4000
  F1-score: 0.2000
  Support: 3
  Number of folds with this class: 5
Moderate:
  Precision: 0.4606
  Recall: 0.6406
  F1-score: 0.5265
  Support: 614
  Number of folds with this class: 5
Unhealthy:
  Precision: 0.3784
  Recall: 0.3586
  F1-score: 0.3598
  Support: 283
  Number of folds with this class: 5
Unhealthy for Sensitive:
  Precision: 0.1894
  Recall: 0.0899
  F1-score: 0.1177
  Support: 263
  Number of folds with this class: 5
Very Unhealthy:
  Precision: 0.6000
  Recall: 0.6000
  F1-score: 0.6000
  Support: 4
  Number of folds with this class: 5
weighted avg:
  Precision: 0.4580
  Recall: 0.4506
  F1-score: 0.4325
  Support: 1603


In [13]:
# Tính và in classification report tổng tất cả sample
final_report = metrics.classification_report(
    all_y_true, all_y_pred,
    target_names=all_target_names,
    labels=[0, 1, 2, 3, 4, 5]
)

print("\n" + "="*60)
print("Overall Classification Report for ALL Samples")
print("="*60)
print(final_report)


Overall Classification Report for ALL Samples
                         precision    recall  f1-score   support

                   Good       0.63      0.40      0.49       436
               Moderate       0.44      0.64      0.52       614
Unhealthy for Sensitive       0.17      0.09      0.12       263
              Unhealthy       0.44      0.45      0.45       283
         Very Unhealthy       0.00      0.00      0.00         4
              Hazardous       0.00      0.00      0.00         3

               accuracy                           0.45      1603
              macro avg       0.28      0.26      0.26      1603
           weighted avg       0.45      0.45      0.43      1603



In [14]:
# Báo cáo dạng dict để trích F1 tổng thể
report_dict = metrics.classification_report(
    all_y_true, all_y_pred,
    target_names=all_target_names,
    labels=[0,1,2,3,4,5],
    output_dict=True
)

overall_f1 = report_dict["weighted avg"]["f1-score"]
overall_support = report_dict["weighted avg"]["support"]
print(f"Weighted F1-score: {overall_f1:.4f} | Total Samples: {int(overall_support)}")


Weighted F1-score: 0.4326 | Total Samples: 1603


In [17]:
# Lưu mô hình tốt nhất
pickle.dump(best_model, open('metadata/checkpoint/catboost-aqi-classifier.pkl', 'wb'))
print("\nBest model saved as 'catboost-aqi-classifier.pkl'")


Best model saved as 'catboost-aqi-classifier.pkl'
