In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
# Read the CSV named "cleaned dataset.csv" (path is relative to the working
# directory) and display its (rows, columns) shape as the cell output.
df = pd.read_csv("cleaned dataset.csv")
df.shape

(73074, 48)

In [4]:
# Feature matrix: every column of the frame except the 'y' label column.
X = df.drop(columns='y')

In [5]:
# Label vector: the 'y' column, later passed as the target to the split and to model.fit.
y = df.loc[:, 'y']

In [6]:
scaler = StandardScaler()
X =scaler.fit_transform(X)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape

((58459, 47), (14615, 47))

In [19]:
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB

In [38]:
# Seed shared by every estimator that uses randomness internally, so the
# scores and the final ranking are reproducible across kernel restarts.
RANDOM_STATE = 42

# Candidate classifiers keyed by the display name used in the printed report
# and in the comparison table. Estimators without a random_state argument
# below are left at their defaults.
models = {
    "Logistic Regression": LogisticRegression(),
    "SVC": SVC(),
    "Decision Tree Classifier": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Random Forest Classifier": RandomForestClassifier(random_state=RANDOM_STATE),
    "Ada Boost Classifier": AdaBoostClassifier(random_state=RANDOM_STATE),
    "Gradient Boosting Classifier": GradientBoostingClassifier(random_state=RANDOM_STATE),
    "Bagging Classifier": BaggingClassifier(random_state=RANDOM_STATE),
    "GaussianNB": GaussianNB()
}

In [39]:
def evaluate_model(true, predicted):
    """Score a set of predictions against the true labels.

    Args:
        true: Ground-truth labels.
        predicted: Labels predicted by a model, aligned with ``true``.

    Returns:
        A 4-tuple ``(precision, recall, f1, accuracy)``, each rounded to
        four decimal places.
    """
    # Order matters: callers unpack the tuple positionally.
    scorers = (precision_score, recall_score, f1_score, accuracy_score)
    return tuple(round(scorer(true, predicted), 4) for scorer in scorers)

In [40]:
# Parallel accumulators filled by the evaluation loop: model display names
# and their matching test-set F1 scores.
model_list, f1_list = [], []

In [41]:
# Reset the result lists here so that re-running this cell never appends
# duplicate rows to the comparison table built from them.
model_list = []
f1_list = []

# Fit each candidate, score it on both splits, print a report, and record the
# test-set F1. Iterating over (name, estimator) pairs avoids rebuilding the
# key/value lists on every pass.
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_precision, train_recall, train_f1, train_accuracy = evaluate_model(y_train, y_train_pred)
    test_precision, test_recall, test_f1, test_accuracy = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model performance for Training set')
    print(f"Precision:{train_precision} Recall:{train_recall} f1-score:{train_f1} accuracy:{train_accuracy}")
    print("------------------------------------------------")
    print('Model performance for Test set')
    print(f"Precision:{test_precision} Recall:{test_recall} f1-score:{test_f1} accuracy:{test_accuracy}")
    # Only the held-out F1 feeds the final ranking.
    f1_list.append(test_f1)
    print("=================================================")
    print('\n')


Logistic Regression
Model performance for Training set
Precision:0.9396 Recall:0.9283 f1-score:0.9339 accuracy:0.9342
------------------------------------------------
Model performance for Test set
Precision:0.9402 Recall:0.9267 f1-score:0.9334 accuracy:0.9342


SVC
Model performance for Training set
Precision:0.9641 Recall:0.9455 f1-score:0.9547 accuracy:0.9551
------------------------------------------------
Model performance for Test set
Precision:0.9514 Recall:0.9264 f1-score:0.9387 accuracy:0.9399


Decision Tree Classifier
Model performance for Training set
Precision:1.0 Recall:1.0 f1-score:1.0 accuracy:1.0
------------------------------------------------
Model performance for Test set
Precision:0.9244 Recall:0.9342 f1-score:0.9293 accuracy:0.9293


Random Forest Classifier
Model performance for Training set
Precision:1.0 Recall:1.0 f1-score:1.0 accuracy:1.0
------------------------------------------------
Model performance for Test set
Precision:0.9503 Recall:0.9496 f1-score:0.9

In [44]:
# Pair each model name with its test-set F1, then rank best-first.
score_rows = list(zip(model_list, f1_list))
pred_df = pd.DataFrame(score_rows, columns=['Model Name', 'f1_score'])
pred_df = pred_df.sort_values(by=['f1_score'], ascending=False)
pred_df

Unnamed: 0,Model Name,f1_score
3,Random Forest Classifier,0.95
6,Bagging Classifier,0.9436
1,SVC,0.9387
5,Gradient Boosting Classifier,0.9368
0,Logistic Regression,0.9334
4,Ada Boost Classifier,0.9328
2,Decision Tree Classifier,0.9293
7,GaussianNB,0.8392
