# 1. Preparation

* Import libraries

In [18]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import label_binarize
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline, make_pipeline
import matplotlib.pyplot as plt
import joblib

* Prepare Data

In [19]:
# Read the pre-processed train and test splits (train first, then test).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./processed-data/processed_data_train.csv",
        "./processed-data/processed_data_test.csv",
    )
)

# Peek at the first rows to sanity-check the columns.
train_df.head()

Unnamed: 0,temp_c,condition,wind_kph,pressure_in,precip_mm,snow_cm,humidity,cloud,vis_km,uv,hour_sin,hour_cos,month_sin,month_cos,wind_degree_sin,wind_degree_cos,dewpoint_depression
0,0.90142,7,-0.544269,-1.702582,-0.205408,-0.032943,0.653087,0.116381,0.23672,-0.906157,0.366506,1.364709,-0.701803,-1.213762,-1.414583,-0.029929,-0.182125
1,0.495247,9,1.392931,0.033613,-0.205408,-0.032943,-0.360737,-0.99366,0.23672,1.310283,1.000829,-1.000022,-1.220815,-0.697459,0.534746,1.216967,0.186417
2,0.698334,1,0.431839,-0.559234,-0.205408,-0.032943,0.399631,1.036902,0.23672,-0.906157,-1.225136,0.706151,-1.410787,0.007825,-1.103593,-0.920732,-0.115367
3,-1.632327,1,-1.35519,0.584113,-0.205408,-0.032943,-0.614193,0.576641,0.23672,-0.589523,-1.225136,-0.707289,0.716166,1.229412,1.303438,-0.803327,-0.17411
4,0.263148,7,-0.334031,-0.178118,-0.205408,-0.032943,0.805161,0.224677,0.23672,0.993649,1.415269,-0.000569,-1.410787,0.007825,1.199124,0.784002,-0.306822


In [20]:
# Columns that must never reach the model:
#   - 'condition' is the target label.
#   - 'Unnamed: 0' is the first column shown by train_df.head(); it looks like
#     the row index that was written into the CSV, so it carries no weather
#     signal and would otherwise be fed to the forest as a feature.
# errors='ignore' keeps this cell working if the CSVs are later re-exported
# without that index column.
non_feature_cols = ['condition', 'Unnamed: 0']

X_train = train_df.drop(columns=non_feature_cols, errors='ignore')
y_train = train_df['condition']
X_test = test_df.drop(columns=non_feature_cols, errors='ignore')
y_test = test_df['condition']

In [43]:
# Class distribution of the training labels (shows how imbalanced they are).
y_train.value_counts()

condition
0     16399
9     15949
1     14476
7     10633
2      7226
6      4986
4       282
5       126
8       105
10       35
3        35
Name: count, dtype: int64

In [21]:
# Load the object stored in label_encoder.pkl.
# NOTE(review): presumably the encoder that maps weather-condition names to the
# integer codes in `condition` -- confirm against the preprocessing notebook.
# joblib.load unpickles the file, so only load files from a trusted source.
label_encoder = joblib.load("label_encoder.pkl")

# 2. Baseline Model

## 2.1. Baseline Model & Cross Validation

In [22]:
# One CV splitter reused by every experiment below: 5 stratified folds,
# no shuffling.
kf = StratifiedKFold(
    n_splits=5,
    shuffle=False,
)

# Baseline estimator: a random forest with class_weight='balanced'.
rf = RandomForestClassifier(
    class_weight='balanced',
)

## 2.2. Hyperparameter Tuning Using GridSearchCV

In [23]:
# Search space for the baseline forest. random_state is listed as a
# single-value entry so that every candidate is fitted with seed 42.
params = dict(
    n_estimators=[50, 100, 200],
    max_depth=[6, 10, 12],
    random_state=[42],
)

# Exhaustive search over the grid, scored by macro-averaged F1 on the shared folds.
grid_rf = GridSearchCV(
    estimator=rf,
    param_grid=params,
    scoring='f1_macro',
    cv=kf,
    n_jobs=-1,
    verbose=2,
)
grid_rf.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [24]:
# Report the winning configuration and its mean cross-validated macro-F1.
best_params = grid_rf.best_params_
best_score = grid_rf.best_score_
print('Best parameters:', best_params)
print('Best score:', best_score)

Best parameters: {'max_depth': 12, 'n_estimators': 200, 'random_state': 42}
Best score: 0.8295711153934835


# 3. Random Resampling for Imbalanced Datasets

* Random Oversampling: Randomly duplicate examples in the minority class.
* Random Undersampling: Randomly delete examples in the majority class.

## 3.1. Random Oversampling

In [25]:
# Random oversampler with a fixed seed so the resampling is reproducible.
ros = RandomOverSampler(random_state=42)

In [26]:
# Oversample the full training set to inspect the resulting class balance.
X_over, y_over = ros.fit_resample(X_train, y_train)

In [27]:
# Class counts after oversampling.
y_over.value_counts()

condition
7     16399
9     16399
1     16399
0     16399
2     16399
4     16399
6     16399
5     16399
8     16399
10    16399
3     16399
Name: count, dtype: int64

In [28]:
# Oversampler + forest in one imblearn pipeline, so the resampling is part of
# the estimator that cross-validation fits on each fold.
# The forest uses the grid-search winner (200 trees, depth 12); no
# class_weight is set here.
random_overs_pipeline = make_pipeline(
    RandomOverSampler(random_state=42),
    RandomForestClassifier(
        n_estimators=200,
        max_depth=12,
        random_state=42,
    ),
)

In [29]:
# Score the random-oversampling pipeline on the shared stratified folds.
# The metric is macro-averaged F1 (scoring='f1_macro'), so the printed labels
# say F1 rather than Recall.
score2 = cross_val_score(random_overs_pipeline, X_train, y_train, scoring='f1_macro', cv=kf)
print("Cross Validation F1 Scores with Random Oversampling: {}".format(score2))
print("Average F1 score with Random Oversampling: {}".format(score2.mean()))

Cross Validation Recall Scores are: [0.83614056 0.81871405 0.82069196 0.83138079 0.82515329]
Average Cross Validation Recall score: 0.8264161294123438


## 3.2. Random Undersampling

In [30]:
# Random undersampler with a fixed seed so the resampling is reproducible.
rus = RandomUnderSampler(random_state=42)

In [31]:
# Undersample the full training set to inspect the resulting class balance.
X_under, y_under = rus.fit_resample(X_train, y_train)

In [32]:
# Class counts after undersampling; compare with y_train.value_counts() to see
# how many rows were discarded.
y_under.value_counts()

condition
0     35
1     35
2     35
3     35
4     35
5     35
6     35
7     35
8     35
9     35
10    35
Name: count, dtype: int64

In [33]:
# Undersampler + forest in one imblearn pipeline, so the resampling is part of
# the estimator that cross-validation fits on each fold.
# The forest uses the grid-search winner (200 trees, depth 12); no
# class_weight is set here.
random_unders_pipeline = make_pipeline(
    RandomUnderSampler(random_state=42),
    RandomForestClassifier(
        n_estimators=200,
        max_depth=12,
        random_state=42,
    ),
)

In [34]:
# Score the random-undersampling pipeline on the shared stratified folds.
# The metric is macro-averaged F1 (scoring='f1_macro'), so the printed labels
# say F1 rather than Recall.
score3 = cross_val_score(random_unders_pipeline, X_train, y_train, scoring='f1_macro', cv=kf)
print("Cross Validation F1 Scores with Random Undersampling: {}".format(score3))
print("Average F1 score with Random Undersampling: {}".format(score3.mean()))

Cross Validation Recall Scores are: [0.63089817 0.66838693 0.63221671 0.68171412 0.67502144]
Average Cross Validation Recall score: 0.6576474736553531


# 4. SMOTE (Synthetic Minority Oversampling Technique)

SMOTE (Synthetic Minority Oversampling Technique) synthesizes new examples for the minority class. It works by selecting examples that are close in the feature space, drawing a line between them, and generating a new sample at a point along that line.

In [35]:
# SMOTE + forest in one imblearn pipeline, so the synthetic oversampling is
# part of the estimator that cross-validation fits on each fold.
# The forest uses the grid-search winner (200 trees, depth 12); no
# class_weight is set here.
smote_pipeline = make_pipeline(
    SMOTE(random_state=42),
    RandomForestClassifier(
        n_estimators=200,
        max_depth=12,
        random_state=42,
    ),
)

In [38]:
# Score the SMOTE pipeline on the shared stratified folds.
# The metric is macro-averaged F1 (scoring='f1_macro'), so the printed labels
# say F1 rather than Recall.
score4 = cross_val_score(smote_pipeline, X_train, y_train, scoring='f1_macro', cv=kf)
print("Cross Validation F1 Scores with SMOTE: {}".format(score4))
print("Average F1 score with SMOTE: {}".format(score4.mean()))

Cross Validation Recall Scores are: [0.85687343 0.81902126 0.83610346 0.8422512  0.81948212]
Average Cross Validation Recall score: 0.8347462944358126


# 5. Combine SMOTE + Tomek Links

A combination of over-sampling the minority (abnormal) class and under-sampling the majority (normal) class can achieve better classifier performance than only under-sampling the majority class. This method was first introduced by Batista et al. (2003).

The process of SMOTE-Tomek Links is as follows.

1. Start of SMOTE: choose random data from the minority class.
2. Calculate the distance between the random data and its k nearest neighbors.
3. Multiply the difference with a random number between 0 and 1, then add the result to the minority class as a synthetic sample.
4. Repeat steps 2–3 until the desired proportion of the minority class is met (End of SMOTE).
5. Start of Tomek Links: choose random data from the majority class.
6. If that sample's nearest neighbor belongs to the minority class (i.e., the pair forms a Tomek link), remove the Tomek link.

In [46]:
# SMOTETomek is already imported in the notebook's import cell, so it is not
# re-imported here.
# Resample the full training set once to inspect the class balance produced by
# SMOTE followed by Tomek-link removal.
smote_tomek = SMOTETomek(
    sampling_strategy='auto',
    random_state=42
)
X_train_resampled, y_train_resampled = smote_tomek.fit_resample(X_train, y_train)
y_train_resampled.value_counts()

condition
8     16399
10    16399
4     16399
3     16399
5     16399
6     16386
9     16231
0     16184
2     16103
1     15943
7     15829
Name: count, dtype: int64

In [48]:

# SMOTE + Tomek-link cleaning followed by the forest, in one imblearn pipeline
# so the resampling is part of the estimator that cross-validation fits.
# The forest uses the grid-search winner (200 trees, depth 12).
SMOTETomek_pipeline = make_pipeline(
    SMOTETomek(
        sampling_strategy='auto',
        random_state=42,
    ),
    RandomForestClassifier(
        n_estimators=200,
        max_depth=12,
        random_state=42,
    ),
)

# Macro-averaged F1 on the shared stratified folds.
score5 = cross_val_score(SMOTETomek_pipeline, X_train, y_train, cv=kf, scoring='f1_macro')
print("Cross Validation F1 Scores with SMOTE + TomekLinks: {}".format(score5))
print("Average F1 score with SMOTE + TomekLinks: {:.4f}".format(score5.mean()))


Cross Validation F1 Scores with SMOTE + TomekLinks: [0.84543735 0.81302492 0.83837874 0.84175996 0.84146548]
Average F1 score with SMOTE + TomekLinks: 0.8360
