In [None]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.metrics as metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, RandomizedSearchCV

from time_series_split import *

In [None]:
# Debug probe: show what kind of object the name `metrics` is bound to.
# NOTE(review): presumably checks that the wildcard import from
# `time_series_split` did not rebind `metrics` away from `sklearn.metrics` —
# confirm, and drop this cell once that is settled.
print(type(metrics))

In [None]:
def calculate_aqi(pm25):
    """Convert a PM2.5 concentration into an AQI value by piecewise-linear interpolation.

    Each row of the breakpoint table is ``(c_low, c_high, aqi_low, aqi_high)``;
    a concentration inside a row is mapped linearly onto that row's AQI span
    and rounded to the nearest integer.
    NOTE(review): the table looks like the pre-2024 US EPA PM2.5 breakpoints —
    confirm this is the intended standard.

    The table is printed to one decimal place, so it has small gaps between
    rows (e.g. 12.0 < x < 12.1). A reading in a gap is assigned to the lower
    row and capped at that row's ``aqi_high`` instead of falling through to
    the "off the scale" default.

    Args:
        pm25: PM2.5 concentration (a real number; may be NaN).

    Returns:
        The AQI as an integer in ``[0, 500]``. Readings at or below zero give
        0, readings above the last breakpoint give 500, and NaN is returned
        unchanged as NaN so that missing data is not labelled as hazardous.
    """
    breakpoints = [
        (0.0, 12.0, 0, 50),
        (12.1, 35.4, 51, 100),
        (35.5, 55.4, 101, 150),
        (55.5, 150.4, 151, 200),
        (150.5, 250.4, 201, 300),
        (250.5, 500.0, 301, 500),
    ]

    # NaN is the only value that is not equal to itself.
    if pm25 != pm25:
        return float('nan')
    # A non-positive concentration is at (or, for sensor noise, below) the bottom of the scale.
    if pm25 <= breakpoints[0][0]:
        return breakpoints[0][2]
    # Above the last breakpoint (including +inf): saturate at the maximum AQI.
    if pm25 > breakpoints[-1][1]:
        return 500

    # Walk from the highest row down and take the first row whose lower bound
    # the reading reaches; this also catches readings that sit in a gap.
    for c_low, c_high, aqi_low, aqi_high in reversed(breakpoints):
        if pm25 >= c_low:
            aqi = (aqi_high - aqi_low) / (c_high - c_low) * (pm25 - c_low) + aqi_low
            # The cap only matters for gap readings slightly above c_high.
            return min(round(aqi), aqi_high)

def aqi_class(aqi):
    """Return the ordinal AQI category index for an AQI value.

    Categories are numbered 0-5 in increasing severity; the upper AQI bound
    of categories 0-4 is 50, 100, 150, 200 and 300 (inclusive). Anything that
    is not below or equal to one of those bounds falls into category 5.
    """
    upper_bounds = (50, 100, 150, 200, 300)
    for index, bound in enumerate(upper_bounds):
        if aqi <= bound:
            return index
    return len(upper_bounds)

def aqi_bucket(aqi):
    """Return the human-readable AQI category name for an AQI value.

    Uses the same inclusive upper bounds as the numeric category index
    (50, 100, 150, 200, 300); values beyond the last bound are 'Hazardous'.
    """
    tiers = (
        (50, 'Good'),
        (100, 'Moderate'),
        (150, 'Unhealthy for Sensitive'),
        (200, 'Unhealthy'),
        (300, 'Very Unhealthy'),
    )
    return next((name for ceiling, name in tiers if aqi <= ceiling), 'Hazardous')

In [None]:
# Location of the input CSV.
# NOTE(review): this is an absolute path on one machine; the notebook will not
# run elsewhere until it is made relative or configurable.
AQI_CSV_PATH = '/home/thu/INT3041E_AI_PM2.5-Concentration-Estimation/data/add_AQI.csv'

combined_data = pd.read_csv(AQI_CSV_PATH)
combined_data.head()

In [None]:
# Compute the AQI column from the PM2.5 readings
combined_data['AQI'] = combined_data['pm25'].apply(calculate_aqi)

# Derive both label encodings (numeric class and text bucket) from the AQI
for column, labeler in (('AQI_Class', aqi_class), ('AQI_Bucket', aqi_bucket)):
    combined_data[column] = combined_data['AQI'].apply(labeler)


In [None]:
# Preview the frame again to check the AQI, AQI_Class and AQI_Bucket columns.
combined_data.head()

In [None]:
# Build the folds consumed by the training loop; each fold is indexed there
# with the keys 'train', 'validation' and 'test'.
# NOTE(review): both splitters are presumably provided by the wildcard import
# from `time_series_split` — confirm.
# Alternative splitter, kept for reference:
# folds = split_original_data()
folds = split_consolidated_data()
print(f"Number of folds: {len(folds)}")

In [None]:
# Candidate feature columns: everything from the 4th column onward, minus all
# three AQI-derived target columns. 'AQI_Bucket' is the text form of the label
# and must be excluded together with 'AQI' and 'AQI_Class'.
# NOTE(review): if 'pm25' is at position 3 or later it remains in this list
# even though the AQI targets are computed from it — confirm that is intended.
# NOTE: the training loop recomputes `feature_columns` from each fold's own
# frame, so this value is only used for inspection here.
target_columns = ['AQI', 'AQI_Class', 'AQI_Bucket']
feature_columns = combined_data.columns[3:].drop(target_columns)
feature_columns

**Best parameters**

In [None]:
# Hyperparameters passed unchanged to RandomForestClassifier in the training loop.
params = dict(
    n_estimators=100,
    max_features='log2',
    max_depth=16,
    min_samples_split=2,
    min_samples_leaf=2,
    random_state=123,  # fixed seed so repeated runs build the same forest
    bootstrap=True,
    class_weight='balanced',
    criterion='gini',
)

In [None]:
# True labels and predicted labels pooled across every fold's test set
all_y_true, all_y_pred = [], []

In [None]:
# Per-fold results collected by the training loop
val_accuracies, test_accuracies, test_classification_reports = [], [], []

# Best model so far, chosen by validation accuracy
best_model = None
best_val_accuracy = 0

In [None]:
# Display names for class indices 0-5, in increasing order of severity
all_target_names = [
    'Good',
    'Moderate',
    'Unhealthy for Sensitive',
    'Unhealthy',
    'Very Unhealthy',
    'Hazardous',
]

In [None]:
# Reset every accumulator in this cell so that re-running it on a live kernel
# does not append a second copy of each fold's results.
all_y_true, all_y_pred = [], []
val_accuracies, test_accuracies = [], []
test_classification_reports = []
best_val_accuracy = 0
best_model = None

# Train one Random Forest per fold, record its accuracies and per-class report,
# and keep the model with the highest validation accuracy.
for i, fold in enumerate(folds):
    print(f"\nProcessing Fold {i+1}/{len(folds)}")

    train_data = fold['train']
    val_data = fold['validation']
    test_data = fold['test']

    # Features are selected by position: the 4th column up to, but excluding,
    # the last two columns.
    # NOTE(review): this depends on the column order produced by the splitter
    # — confirm the slice excludes every target column.
    feature_columns = train_data.columns[3:-2]
    print(f"Feature columns: {feature_columns}")

    X_train = train_data[feature_columns]
    y_train = train_data['AQI_Class']
    X_val = val_data[feature_columns]
    y_val = val_data['AQI_Class']
    X_test = test_data[feature_columns]
    y_test = test_data['AQI_Class']

    # Train the Random Forest model
    clf = RandomForestClassifier(**params)
    clf.fit(X_train, y_train)

    # Predict the test set once and reuse the result for both the accuracy
    # and the classification report.
    y_pred = clf.predict(X_test)

    train_acc = clf.score(X_train, y_train)
    val_acc = clf.score(X_val, y_val)
    test_acc = metrics.accuracy_score(y_test, y_pred)

    print(f"Train Accuracy: {train_acc:.4f} | Validation Accuracy: {val_acc:.4f} | Test Accuracy: {test_acc:.4f}")

    val_accuracies.append(val_acc)
    test_accuracies.append(test_acc)

    if val_acc > best_val_accuracy:
        best_val_accuracy = val_acc
        best_model = clf

    # Keep true and predicted labels so a pooled report can be computed later
    all_y_true.extend(y_test.tolist())
    all_y_pred.extend(y_pred.tolist())

    # Passing `labels` explicitly gives every fold's report the same six rows,
    # even when a class does not occur in that fold.
    report = metrics.classification_report(y_test, y_pred, target_names=all_target_names, labels=[0,1,2,3,4,5], output_dict=True)
    test_classification_reports.append(report)

    # Histogram of (true class index - predicted class index)
    sns.histplot(y_test - y_pred, kde=True)
    plt.title(f'Fold {i+1} - Distribution of Prediction Errors')
    plt.xlabel('Prediction Error')
    plt.ylabel('Count')
    plt.show()

In [None]:
# Average the per-fold validation and test accuracies
mean_val_accuracy, mean_test_accuracy = (
    np.mean(scores) for scores in (val_accuracies, test_accuracies)
)
print(f"\nAverage Validation Accuracy: {mean_val_accuracy:.4f}")
print(f"Average Test Accuracy: {mean_test_accuracy:.4f}")

In [None]:
# Per-class report averaged over folds
avg_report = {}
for label in all_target_names:
    # Every fold's report contains a row for every class (the reports were
    # built with an explicit `labels=` list), so key membership alone cannot
    # tell whether the class occurred. Average only over folds where the class
    # has test samples; a zero-support fold would otherwise contribute a
    # recall of 0 that says nothing about the model.
    fold_stats = [
        r[label] for r in test_classification_reports
        if label in r and r[label]['support'] > 0
    ]

    avg_report[label] = {
        # NaN (not 0) when the class never occurs, so "undefined" stays visible.
        'precision': np.mean([s['precision'] for s in fold_stats]) if fold_stats else np.nan,
        'recall': np.mean([s['recall'] for s in fold_stats]) if fold_stats else np.nan,
        'f1-score': np.mean([s['f1-score'] for s in fold_stats]) if fold_stats else np.nan,
        'support': sum(s['support'] for s in fold_stats),
        'num_folds': len(fold_stats)
    }

# Weighted avg: unweighted mean across folds of each fold's weighted average
avg_report['weighted avg'] = {
    'precision': np.mean([r['weighted avg']['precision'] for r in test_classification_reports]),
    'recall': np.mean([r['weighted avg']['recall'] for r in test_classification_reports]),
    'f1-score': np.mean([r['weighted avg']['f1-score'] for r in test_classification_reports]),
    'support': sum(r['weighted avg']['support'] for r in test_classification_reports)
}

In [None]:
# Print the fold-averaged report: classes in alphabetical order, 'weighted avg' last
print("\n📋 Average Classification Report:")
class_labels = sorted(name for name in avg_report if name != 'weighted avg')
for label in class_labels + ['weighted avg']:
    stats = avg_report[label]
    print(f"{label}:")
    print(f"  Precision: {stats['precision']:.4f}")
    print(f"  Recall: {stats['recall']:.4f}")
    print(f"  F1-score: {stats['f1-score']:.4f}")
    print(f"  Support: {stats['support']:.0f}")
    # The fold count is only recorded for individual classes
    if label == 'weighted avg':
        continue
    print(f"  Number of folds with this class: {stats['num_folds']}")


In [None]:
# Compute and print one classification report over the test samples of all folds pooled together
final_report = metrics.classification_report(
    all_y_true,
    all_y_pred,
    labels=[0, 1, 2, 3, 4, 5],
    target_names=all_target_names,
)

banner = "=" * 60
print(
    "\n" + banner,
    "Overall Classification Report for ALL Samples",
    banner,
    final_report,
    sep="\n",
)

In [None]:
# Same pooled report as a dict, used to pull out the overall weighted F1
report_dict = metrics.classification_report(
    all_y_true,
    all_y_pred,
    labels=list(range(6)),
    target_names=all_target_names,
    output_dict=True,
)

weighted_summary = report_dict["weighted avg"]
overall_f1 = weighted_summary["f1-score"]
overall_support = weighted_summary["support"]
print(f"Weighted F1-score: {overall_f1:.4f} | Total Samples: {int(overall_support)}")


In [None]:
# Save the best model (the one with the highest validation accuracy)
if best_model is None:
    # Happens when the training loop has not run or no fold scored above 0;
    # fail loudly instead of writing a checkpoint that contains None.
    raise RuntimeError("No model was selected; run the training loop before saving.")

checkpoint_path = 'checkpoint/randomforest-aqi-classifier.pkl'
# Create the target directory on first use so a finished training run is not lost.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
# The context manager closes (and flushes) the file even if pickling fails.
with open(checkpoint_path, 'wb') as checkpoint_file:
    pickle.dump(best_model, checkpoint_file)
print("\nBest model saved as 'randomforest-aqi-classifier.pkl'")