In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, f1_score, fbeta_score, classification_report, confusion_matrix
from sklearn.model_selection import ParameterGrid
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.svm import SVC

In [2]:
# Load the pre-processed dataset; column 'y' is the target, every other column is a feature.
data = pd.read_csv('./data/processed_data.csv')
y = data['y']
X = data.drop(columns='y')

# Stage 1: hold out 10% of all rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=2024, test_size=0.1
)
# Stage 2: hold out 10% of the remaining rows as the validation set.
# (X_train / y_train are deliberately rebound to the smaller training portion.)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, random_state=2024, test_size=0.1
)

In [3]:
def evaluate_model(pipeline, best_params=None):
    """Print test-set metrics for an already fitted pipeline.

    Predicts on the notebook-level ``X_test`` and compares against ``y_test``,
    printing accuracy, the F2 score (``fbeta_score`` with ``beta=2``), the
    classification report, the confusion-matrix counts and the
    hyper-parameters that were selected.

    Parameters
    ----------
    pipeline : fitted estimator exposing ``predict``.
    best_params : dict, optional
        Hyper-parameters to report. When omitted, the notebook-level
        ``best_params`` variable is used if it exists (this keeps existing
        ``evaluate_model(pipeline)`` calls printing the same thing); if no
        such variable exists, ``None`` is printed instead of raising
        ``NameError``.

    Returns
    -------
    None. All results are written to stdout.
    """
    # evaluate model
    y_test_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_test_pred)
    f2_score = fbeta_score(y_test, y_test_pred, beta=2)
    print(f'Accuracy: {accuracy*100:.2f}%')
    print(f'F2 Score: {f2_score*100:.2f}%')
    print(classification_report(y_test, y_test_pred))

    # confusion matrix; unpacking into four counts requires a 2x2 matrix,
    # i.e. a binary target
    conf_matrix = confusion_matrix(y_test, y_test_pred)
    TN, FP, FN, TP = conf_matrix.ravel()
    print(f'TP: {TP}, TN: {TN}, FP: {FP}, FN: {FN}')

    # best params: prefer the explicit argument; fall back to the module-level
    # variable only for backward compatibility with one-argument calls
    if best_params is None:
        best_params = globals().get('best_params')
    print(f'Best parameters: {best_params}')

def search_best_params(pipeline, param_grid):
    """Exhaustively search ``param_grid`` and return the best combination.

    Each combination produced by ``ParameterGrid(param_grid)`` is applied to
    ``pipeline``, which is then fitted on the notebook-level
    ``X_train`` / ``y_train`` and scored on ``X_val`` / ``y_val`` using the
    F2 score (``fbeta_score`` with ``beta=2``).

    Parameters
    ----------
    pipeline : estimator exposing ``set_params``, ``fit`` and ``predict``.
        Modified in place: on return it holds the *last* combination tried,
        not the best one, so callers must re-apply the returned parameters
        and refit.
    param_grid : dict or list of dict
        Anything accepted by ``ParameterGrid``.

    Returns
    -------
    dict
        The first combination (in grid order) that reaches the highest
        validation F2 score.
    """
    # Start below any attainable score so the first candidate is always kept.
    # Starting at 0 with a strict '>' returned None when every candidate
    # scored 0.0, which made callers fail on ``set_params(**None)``.
    best_score = float('-inf')
    best_params = None

    for params in ParameterGrid(param_grid):
        pipeline.set_params(**params)
        pipeline.fit(X_train, y_train)
        y_val_pred = pipeline.predict(X_val)
        score = fbeta_score(y_val, y_val_pred, beta=2)  # F2 selects the best params

        # Strict comparison: on ties the earliest combination wins.
        if score > best_score:
            best_score = score
            best_params = params
    return best_params

### Decision Tree without SMOTE

In [8]:
# Single-step pipeline: a decision tree with no resampling step.
pipeline = Pipeline([('classifier', DecisionTreeClassifier(random_state=2024))])

# Hyper-parameter candidates for the tree.
param_grid = dict(
    classifier__criterion=['gini', 'entropy'],
    classifier__max_depth=[10, 20, 30],
    classifier__min_samples_split=[5, 10],
    classifier__min_samples_leaf=[1, 5, 10],
)

# Pick the combination with the best validation F2 score.
best_params = search_best_params(pipeline, param_grid)

# The search leaves the pipeline on the last combination tried,
# so re-apply the winner and refit before evaluating on the test set.
pipeline.set_params(**best_params).fit(X_train, y_train)
evaluate_model(pipeline)

Accuracy: 87.62%
F2 Score: 45.52%
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      3987
           1       0.48      0.45      0.46       535

    accuracy                           0.88      4522
   macro avg       0.70      0.69      0.70      4522
weighted avg       0.87      0.88      0.87      4522

TP: 241, TN: 3721, FP: 266, FN: 294
Best parameters: {'classifier__criterion': 'gini', 'classifier__max_depth': 30, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 5}


### Decision Tree with SMOTE

In [9]:
# Two-step pipeline: SMOTE oversampling followed by a decision tree.
pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=2024)),
    ('classifier', DecisionTreeClassifier(random_state=2024)),
])

# Hyper-parameter candidates for the tree (the SMOTE step is not tuned).
param_grid = dict(
    classifier__criterion=['gini', 'entropy'],
    classifier__max_depth=[10, 20, 30],
    classifier__min_samples_split=[5, 10],
    classifier__min_samples_leaf=[1, 5, 10],
)

# Pick the combination with the best validation F2 score.
best_params = search_best_params(pipeline, param_grid)

# The search leaves the pipeline on the last combination tried,
# so re-apply the winner and refit before evaluating on the test set.
pipeline.set_params(**best_params).fit(X_train, y_train)
evaluate_model(pipeline)

Accuracy: 84.34%
F2 Score: 64.37%
              precision    recall  f1-score   support

           0       0.96      0.86      0.91      3987
           1       0.41      0.75      0.53       535

    accuracy                           0.84      4522
   macro avg       0.69      0.80      0.72      4522
weighted avg       0.90      0.84      0.86      4522

TP: 401, TN: 3413, FP: 574, FN: 134
Best parameters: {'classifier__criterion': 'gini', 'classifier__max_depth': 10, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 5}


### Random Forest without SMOTE

In [13]:
# Single-step pipeline: a random forest with no resampling step.
pipeline = Pipeline([('classifier', RandomForestClassifier(random_state=2024))])

# Hyper-parameter candidates for the forest.
param_grid = dict(
    classifier__n_estimators=[100, 200, 300],
    classifier__max_depth=[10, 20, 30],
    classifier__min_samples_split=[2, 10, 20],
    classifier__min_samples_leaf=[1, 5, 10],
    classifier__bootstrap=[True, False],
)

# Pick the combination with the best validation F2 score.
best_params = search_best_params(pipeline, param_grid)

# The search leaves the pipeline on the last combination tried,
# so re-apply the winner and refit before evaluating on the test set.
pipeline.set_params(**best_params).fit(X_train, y_train)
evaluate_model(pipeline)

Accuracy: 90.23%
F2 Score: 43.23%
              precision    recall  f1-score   support

           0       0.92      0.97      0.95      3987
           1       0.64      0.40      0.49       535

    accuracy                           0.90      4522
   macro avg       0.78      0.68      0.72      4522
weighted avg       0.89      0.90      0.89      4522

TP: 214, TN: 3866, FP: 121, FN: 321
Best parameters: {'classifier__bootstrap': False, 'classifier__max_depth': 30, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}


### Random Forest with SMOTE

In [14]:
# Two-step pipeline: SMOTE oversampling followed by a random forest.
pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=2024)),
    ('classifier', RandomForestClassifier(random_state=2024)),
])

# Hyper-parameter candidates for the forest (the SMOTE step is not tuned).
# Unlike the run without SMOTE, max_depth also includes None here.
param_grid = dict(
    classifier__n_estimators=[100, 200, 300],
    classifier__max_depth=[None, 10, 20, 30],
    classifier__min_samples_split=[2, 10, 20],
    classifier__min_samples_leaf=[1, 5, 10],
    classifier__bootstrap=[True, False],
)

# Pick the combination with the best validation F2 score.
best_params = search_best_params(pipeline, param_grid)

# The search leaves the pipeline on the last combination tried,
# so re-apply the winner and refit before evaluating on the test set.
pipeline.set_params(**best_params).fit(X_train, y_train)
evaluate_model(pipeline)

Accuracy: 85.63%
F2 Score: 68.55%
              precision    recall  f1-score   support

           0       0.97      0.86      0.91      3987
           1       0.44      0.80      0.57       535

    accuracy                           0.86      4522
   macro avg       0.70      0.83      0.74      4522
weighted avg       0.91      0.86      0.87      4522

TP: 426, TN: 3446, FP: 541, FN: 109
Best parameters: {'classifier__bootstrap': False, 'classifier__max_depth': 10, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 20, 'classifier__n_estimators': 200}


### XGBoost without SMOTE

In [11]:
# Single-step pipeline: an XGBoost classifier with no resampling step.
xgb_classifier = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=2024,
)
pipeline = Pipeline([('classifier', xgb_classifier)])

# Hyper-parameter candidates for the booster.
param_grid = dict(
    classifier__n_estimators=[100, 200, 300],
    classifier__max_depth=[3, 5, 7],
    classifier__learning_rate=[0.01, 0.1, 0.2],
    classifier__subsample=[0.6, 0.8, 1.0],
    classifier__colsample_bytree=[0.6, 0.8, 1.0],
)

# Pick the combination with the best validation F2 score.
best_params = search_best_params(pipeline, param_grid)

# The search leaves the pipeline on the last combination tried,
# so re-apply the winner and refit before evaluating on the test set.
pipeline.set_params(**best_params).fit(X_train, y_train)
evaluate_model(pipeline)

Accuracy: 90.18%
F2 Score: 52.30%
              precision    recall  f1-score   support

           0       0.94      0.95      0.94      3987
           1       0.60      0.51      0.55       535

    accuracy                           0.90      4522
   macro avg       0.77      0.73      0.75      4522
weighted avg       0.90      0.90      0.90      4522

TP: 271, TN: 3807, FP: 180, FN: 264
Best parameters: {'classifier__colsample_bytree': 0.8, 'classifier__learning_rate': 0.2, 'classifier__max_depth': 7, 'classifier__n_estimators': 300, 'classifier__subsample': 0.8}


### XGBoost with SMOTE

In [12]:
# Two-step pipeline: SMOTE oversampling followed by an XGBoost classifier.
xgb_classifier = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=2024,
)
pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=2024)),
    ('classifier', xgb_classifier),
])

# Hyper-parameter candidates for the booster (the SMOTE step is not tuned).
param_grid = dict(
    classifier__n_estimators=[100, 200, 300],
    classifier__max_depth=[3, 5, 7],
    classifier__learning_rate=[0.01, 0.1, 0.2],
    classifier__subsample=[0.6, 0.8, 1.0],
    classifier__colsample_bytree=[0.6, 0.8, 1.0],
)

# Pick the combination with the best validation F2 score.
best_params = search_best_params(pipeline, param_grid)

# The search leaves the pipeline on the last combination tried,
# so re-apply the winner and refit before evaluating on the test set.
pipeline.set_params(**best_params).fit(X_train, y_train)
evaluate_model(pipeline)

Accuracy: 84.83%
F2 Score: 68.23%
              precision    recall  f1-score   support

           0       0.97      0.85      0.91      3987
           1       0.43      0.80      0.56       535

    accuracy                           0.85      4522
   macro avg       0.70      0.83      0.73      4522
weighted avg       0.91      0.85      0.87      4522

TP: 430, TN: 3406, FP: 581, FN: 105
Best parameters: {'classifier__colsample_bytree': 1.0, 'classifier__learning_rate': 0.01, 'classifier__max_depth': 7, 'classifier__n_estimators': 200, 'classifier__subsample': 0.6}


### SVM without SMOTE

In [15]:
# Single-step pipeline: a support vector classifier with no resampling step.
pipeline = Pipeline(steps=[
    ('classifier', SVC(random_state=2024))
])

# One sub-grid per kernel. SVC only uses `gamma` for the rbf/poly/sigmoid
# kernels, so crossing it with kernel='linear' fitted each linear model twice
# with identical results. Splitting the grid drops those duplicates
# (12 fits instead of 16). As a consequence, a winning linear configuration
# is reported without a 'classifier__gamma' entry.
param_grid = [
    {
        'classifier__kernel': ['linear'],
        'classifier__C': [0.1, 1, 10, 100],
    },
    {
        'classifier__kernel': ['rbf'],
        'classifier__C': [0.1, 1, 10, 100],
        'classifier__gamma': ['scale', 'auto'],
    },
]

#search best params
best_params=search_best_params(pipeline,param_grid)

#use best params retrain model
pipeline.set_params(**best_params)
pipeline.fit(X_train, y_train)

#evaluate
evaluate_model(pipeline)

Accuracy: 88.85%
F2 Score: 50.66%
              precision    recall  f1-score   support

           0       0.93      0.94      0.94      3987
           1       0.53      0.50      0.52       535

    accuracy                           0.89      4522
   macro avg       0.73      0.72      0.73      4522
weighted avg       0.89      0.89      0.89      4522

TP: 268, TN: 3750, FP: 237, FN: 267
Best parameters: {'classifier__C': 100, 'classifier__gamma': 'scale', 'classifier__kernel': 'rbf'}


### SVM with SMOTE

In [16]:
# Two-step pipeline: SMOTE oversampling followed by a support vector classifier.
pipeline = ImbPipeline(steps=[
    ('smote', SMOTE(random_state=2024)),
    ('classifier', SVC(random_state=2024))
])

# One sub-grid per kernel. SVC only uses `gamma` for the rbf/poly/sigmoid
# kernels, so crossing it with kernel='linear' fitted each linear model twice
# with identical results. Splitting the grid drops those duplicates
# (12 fits instead of 16). As a consequence, a winning linear configuration
# is reported without a 'classifier__gamma' entry.
param_grid = [
    {
        'classifier__kernel': ['linear'],
        'classifier__C': [0.1, 1, 10, 100],
    },
    {
        'classifier__kernel': ['rbf'],
        'classifier__C': [0.1, 1, 10, 100],
        'classifier__gamma': ['scale', 'auto'],
    },
]

#search best params
best_params=search_best_params(pipeline,param_grid)

#use best params retrain model
pipeline.set_params(**best_params)
pipeline.fit(X_train, y_train)

#evaluate
evaluate_model(pipeline)

Accuracy: 86.33%
F2 Score: 70.78%
              precision    recall  f1-score   support

           0       0.97      0.87      0.92      3987
           1       0.46      0.82      0.59       535

    accuracy                           0.86      4522
   macro avg       0.71      0.84      0.75      4522
weighted avg       0.91      0.86      0.88      4522

TP: 439, TN: 3465, FP: 522, FN: 96
Best parameters: {'classifier__C': 10, 'classifier__gamma': 'auto', 'classifier__kernel': 'rbf'}
