In [5]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, classification_report


# Parent of the current working directory; data/ and models/ are resolved
# relative to it.
path_dir = os.path.dirname(os.getcwd())

# Pass each component separately so os.path.join chooses the separator.
# A literal backslash inside the string only resolves on Windows and makes
# "\p" an invalid escape sequence.
path_data = os.path.join(path_dir, "data", "prepared_data.csv")
df = pd.read_csv(path_data)

df.shape

(4464, 77)

## Split the data

In [6]:
# Hold out 20% of the rows for validation. The split is stratified on 'Label'
# and seeded so the same partition is produced on every run.
label_frame = df[['Label']]
feature_frame = df.drop(columns='Label')

X_train, X_val, y_train, y_val = train_test_split(
    feature_frame,
    label_frame,
    test_size=0.2,
    random_state=59,
    stratify=label_frame,
)

for split_name, split_features in (("Train", X_train), ("Validation", X_val)):
    print(f"{split_name} shape: {split_features.shape}")

Train shape: (3571, 76)
Validation shape: (893, 76)


### Baseline

In [7]:
# Naive baseline: predict 1 for every validation row and measure accuracy.
y_pred = np.ones_like(y_val)
accuracy_score(y_true=y_val, y_pred=y_pred)

0.7984322508398656

In [12]:
# Per-class metrics for the all-ones baseline. zero_division=np.nan reports
# undefined metrics (a class that is never predicted) as nan.
baseline_report = classification_report(
    y_true=y_val,
    y_pred=y_pred,
    zero_division=np.nan,
)
print(baseline_report)

              precision    recall  f1-score   support

         0.0        nan      0.00       nan       180
         1.0       0.80      1.00      0.89       713

    accuracy                           0.80       893
   macro avg       0.80      0.50      0.89       893
weighted avg       0.80      0.80      0.89       893



## Logistic Regression

In [70]:
# Logistic regression with library-default hyperparameters and a fixed seed.
lr_model = LogisticRegression(random_state=59)

# The single-column label frame is flattened to 1-D before fitting.
lr_model.fit(X=X_train, y=y_train.values.ravel())

In [71]:
# Predict on the validation split with the logistic regression and score it.
y_pred = lr_model.predict(X=X_val)
accuracy_score(y_true=y_val, y_pred=y_pred)

0.9832026875699889

In [72]:
# Per-class precision / recall / F1 for the most recent predictions in y_pred.
lr_report = classification_report(y_true=y_val, y_pred=y_pred)
print(lr_report)

              precision    recall  f1-score   support

         0.0       0.96      0.96      0.96       180
         1.0       0.99      0.99      0.99       713

    accuracy                           0.98       893
   macro avg       0.97      0.97      0.97       893
weighted avg       0.98      0.98      0.98       893



## DecisionTreeClassifier

In [73]:
# Decision tree capped at depth 11, with a fixed seed for repeatable fits.
tree_model = DecisionTreeClassifier(max_depth=11, random_state=59)

# Flatten the single-column label frame to 1-D, the same target form that the
# logistic regression and random forest fits in this notebook receive.
tree_model.fit(X_train, y_train.values.ravel())

In [74]:
# Predict on the validation split with the decision tree and score it.
y_pred = tree_model.predict(X=X_val)
accuracy_score(y_true=y_val, y_pred=y_pred)

0.9876819708846585

In [75]:
# Per-class precision / recall / F1 for the most recent predictions in y_pred.
tree_report = classification_report(y_true=y_val, y_pred=y_pred)
print(tree_report)

              precision    recall  f1-score   support

         0.0       0.99      0.94      0.97       180
         1.0       0.99      1.00      0.99       713

    accuracy                           0.99       893
   macro avg       0.99      0.97      0.98       893
weighted avg       0.99      0.99      0.99       893



In [58]:
# Depth the fitted tree actually reached (max_depth is only an upper bound).
# NOTE(review): this cell's execution count is lower than the fit cell's, so
# the saved output may come from an earlier tree — re-run after fitting.
tree_model.get_depth()

2

## RandomForestClassifier

In [64]:
# Random forest of 100 trees with a fixed seed.
forest_model = RandomForestClassifier(n_estimators=100, random_state=59)

# The single-column label frame is flattened to 1-D before fitting.
forest_model.fit(X=X_train, y=y_train.values.ravel())

In [65]:
# Predict on the validation split with the random forest and score it.
y_pred = forest_model.predict(X=X_val)
accuracy_score(y_true=y_val, y_pred=y_pred)

0.9966405375139977

In [66]:
# Per-class precision / recall / F1 for the most recent predictions in y_pred.
forest_report = classification_report(y_true=y_val, y_pred=y_pred)
print(forest_report)

              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99       180
         1.0       1.00      1.00      1.00       713

    accuracy                           1.00       893
   macro avg       1.00      0.99      0.99       893
weighted avg       1.00      1.00      1.00       893



## Export the models

In [76]:
import pickle

# Output file name for each fitted estimator; paired positionally with `models`.
model_names = [
    "lr_model.pkl",
    "tree_model.pkl",
    "forest_model.pkl"
]

models = [
    lr_model,
    tree_model,
    forest_model
]

# Make sure <path_dir>/models exists so open() does not fail when the
# directory has not been created yet.
path_models = os.path.join(path_dir, "models")
os.makedirs(path_models, exist_ok=True)

for model_name, model in zip(model_names, models):
    path_model = os.path.join(path_models, model_name)
    # The context manager closes (and therefore flushes) the file even if
    # pickle.dump raises part-way through.
    with open(path_model, 'wb') as model_file:
        pickle.dump(model, model_file)