## 0. Import libraries

In [1]:
# 1
import pandas as pd

# 2
from sklearn.preprocessing import StandardScaler
# !pip install scikit-learn==1.2.2
# !pip install imbalanced-learn==0.10.1
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# 3
from sklearn.metrics import classification_report
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from xgboost import XGBClassifier

# 4
import joblib

## 1. Load data

In [2]:
df = pd.read_csv('air_quality_health_impact_data.csv')
df

Unnamed: 0,RecordID,AQI,PM10,PM2_5,NO2,SO2,O3,Temperature,Humidity,WindSpeed,RespiratoryCases,CardiovascularCases,HospitalAdmissions,HealthImpactScore,HealthImpactClass
0,1,187.270059,295.853039,13.038560,6.639263,66.161150,54.624280,5.150335,84.424344,6.137755,7,5,1,97.244041,0.0
1,2,475.357153,246.254703,9.984497,16.318326,90.499523,169.621728,1.543378,46.851415,4.521422,10,2,0,100.000000,0.0
2,3,365.996971,84.443191,23.111340,96.317811,17.875850,9.006794,1.169483,17.806977,11.157384,13,3,0,100.000000,0.0
3,4,299.329242,21.020609,14.273403,81.234403,48.323616,93.161033,21.925276,99.473373,15.302500,8,8,1,100.000000,0.0
4,5,78.009320,16.987667,152.111623,121.235461,90.866167,241.795138,9.217517,24.906837,14.534733,9,0,1,95.182643,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5806,5807,171.112762,11.246387,197.984628,158.643107,17.743678,280.370909,37.359323,61.707640,4.097129,14,5,2,100.000000,4.0
5807,5808,490.691667,275.340762,55.774170,132.336871,29.334724,108.043492,34.532542,21.528555,6.682549,8,6,2,100.000000,3.0
5808,5809,314.841798,41.892699,184.708551,82.105823,68.334578,105.568503,22.975564,92.725625,2.889698,12,2,3,100.000000,1.0
5809,5810,208.080473,165.533785,199.177255,100.796385,87.586488,166.469537,36.090620,25.836286,10.722393,6,2,3,100.000000,4.0


## 2. Prepare data

We have no missing values.

In [3]:
df.isnull().sum()

RecordID               0
AQI                    0
PM10                   0
PM2_5                  0
NO2                    0
SO2                    0
O3                     0
Temperature            0
Humidity               0
WindSpeed              0
RespiratoryCases       0
CardiovascularCases    0
HospitalAdmissions     0
HealthImpactScore      0
HealthImpactClass      0
dtype: int64

We modify the target class type to `int`, due to it being a ordinal categorical variable.

In [4]:
type(df['HealthImpactClass'][0])

numpy.float64

In [5]:
df['HealthImpactClass'] = df['HealthImpactClass'].astype(int)
type(df['HealthImpactClass'][0])

numpy.int32

We drop the `RecordID` field, as it does not provide any relevant information to the classification task.

In [6]:
df.drop(columns=['RecordID'], axis=1, inplace=True)
df

Unnamed: 0,AQI,PM10,PM2_5,NO2,SO2,O3,Temperature,Humidity,WindSpeed,RespiratoryCases,CardiovascularCases,HospitalAdmissions,HealthImpactScore,HealthImpactClass
0,187.270059,295.853039,13.038560,6.639263,66.161150,54.624280,5.150335,84.424344,6.137755,7,5,1,97.244041,0
1,475.357153,246.254703,9.984497,16.318326,90.499523,169.621728,1.543378,46.851415,4.521422,10,2,0,100.000000,0
2,365.996971,84.443191,23.111340,96.317811,17.875850,9.006794,1.169483,17.806977,11.157384,13,3,0,100.000000,0
3,299.329242,21.020609,14.273403,81.234403,48.323616,93.161033,21.925276,99.473373,15.302500,8,8,1,100.000000,0
4,78.009320,16.987667,152.111623,121.235461,90.866167,241.795138,9.217517,24.906837,14.534733,9,0,1,95.182643,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5806,171.112762,11.246387,197.984628,158.643107,17.743678,280.370909,37.359323,61.707640,4.097129,14,5,2,100.000000,4
5807,490.691667,275.340762,55.774170,132.336871,29.334724,108.043492,34.532542,21.528555,6.682549,8,6,2,100.000000,3
5808,314.841798,41.892699,184.708551,82.105823,68.334578,105.568503,22.975564,92.725625,2.889698,12,2,3,100.000000,1
5809,208.080473,165.533785,199.177255,100.796385,87.586488,166.469537,36.090620,25.836286,10.722393,6,2,3,100.000000,4


## 2. Split data

In [7]:
X = df.drop(columns=['HealthImpactScore', 'HealthImpactClass'], axis=1)
y = df['HealthImpactClass']

We scale the data because they are not measured in the same units and/or magnitude.

In [8]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [9]:
df['HealthImpactClass'].value_counts()

0    4808
1     579
2     273
3      95
4      56
Name: HealthImpactClass, dtype: int64

We have imbalanced classes, having some with few instances. We, therefore, use SMOTE (Synthetic Minority Over-Sampling Technique) to generate synthetic samples for these minority classes, which helps balance the dataset and improves the model's ability to learn and predict the minority classes effectively.

In [10]:
smote = SMOTE(sampling_strategy={2:546, 3: 285, 4: 224}, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

We use `stratify` in the splitting to ensure that all classes appear in train and test sets, by maintaining the same class distribution as the original dataset, after applying SMOTE.

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

## 3. Train and evaluate data

We try various ensemble methods to train the data: AdaBoost, Random Forest, XGBoost.

### 3.1. AdaBoost

In [12]:
# Create and train AdaBoost classifier
adaboost_clf = AdaBoostClassifier(n_estimators=100, random_state=42)
adaboost_clf.fit(X_train, y_train)

# Predict on test set
y_pred_adaboost = adaboost_clf.predict(X_test)

# Evaluate model
print("Classification Report:")
print(classification_report(y_test, y_pred_adaboost))

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.94      0.91       962
           1       0.40      0.43      0.41       116
           2       0.54      0.41      0.47       109
           3       0.15      0.05      0.08        57
           4       0.12      0.09      0.10        45

    accuracy                           0.78      1289
   macro avg       0.42      0.39      0.39      1289
weighted avg       0.75      0.78      0.76      1289



### 3.2. Random Forest

In [13]:
# Create and train Random Forest classifier
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train, y_train)

# Predict on test set
y_pred_rf = rf_clf.predict(X_test)

# Evaluate model
print("Classification Report:")
print(classification_report(y_test, y_pred_rf))

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.99      0.94       962
           1       0.81      0.56      0.66       116
           2       0.88      0.78      0.83       109
           3       1.00      0.39      0.56        57
           4       1.00      0.51      0.68        45

    accuracy                           0.89      1289
   macro avg       0.92      0.65      0.73      1289
weighted avg       0.90      0.89      0.88      1289



### 3.3. XGBoost

In [14]:
# Create and train XGBoost classifier
xgb_clf = XGBClassifier(n_estimators=100, objective='multi:softmax')
xgb_clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred_xgb = xgb_clf.predict(X_test)

# Evaluate model
print("Classification Report:")
print(classification_report(y_test, y_pred_xgb))

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.99      0.95       962
           1       0.79      0.63      0.70       116
           2       0.81      0.81      0.81       109
           3       1.00      0.51      0.67        57
           4       0.96      0.56      0.70        45

    accuracy                           0.90      1289
   macro avg       0.90      0.70      0.77      1289
weighted avg       0.90      0.90      0.90      1289



## 4. Save best model

Overall, the model that gave us the best performance was XGBoost. We save it.

In [15]:
joblib.dump(xgb_clf, 'xgboost_model.joblib')

['xgboost_model.joblib']