# modeling.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint
from sklearn import feature_selection, metrics, model_selection
from sklearn.model_selection import (KFold, ShuffleSplit, train_test_split,
                                     cross_val_score, GridSearchCV,
                                     RandomizedSearchCV)
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (confusion_matrix, accuracy_score, f1_score,
                             precision_score, recall_score, fbeta_score,
                             classification_report, precision_recall_curve,
                             auc, roc_auc_score, roc_curve, make_scorer,
                             log_loss, average_precision_score)
from sklearn.multiclass import OneVsRestClassifier
from statsmodels.stats.outliers_influence import variance_inflation_factor
from utils.utils_modeling import scaling_x_data, print_full
from cleaning_data import X_train_scaled, X_test_scaled, y_train, y_test
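# The cleaning_data import above assumes that module exposes pre-scaled
# train/test splits. For reference, a minimal sketch of equivalent
# preparation (the file name 'data.csv' and the 'target' column are
# assumptions, not the project's actual cleaning pipeline):
#
#     df = pd.read_csv('data.csv')
#     X, y = df.drop(columns='target'), df['target']
#     X_train, X_test, y_train, y_test = train_test_split(
#         X, y, test_size=0.2, random_state=0, stratify=y)
#     scaler = StandardScaler().fit(X_train)
#     X_train_scaled = scaler.transform(X_train)
#     X_test_scaled = scaler.transform(X_test)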
# Candidate classifiers: a list of (name, estimator) tuples
models = []
models.append(('Logistic Regression', LogisticRegression(solver='liblinear', random_state=0,
                                                          class_weight='balanced')))
models.append(('SVC', SVC(kernel='linear', random_state=0)))
models.append(('Kernel SVM', SVC(kernel='rbf', random_state=0)))
models.append(('KNN', KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)))
# models.append(('Gaussian NB', GaussianNB()))
models.append(('Decision Tree Classifier',
               DecisionTreeClassifier(criterion='entropy', random_state=0)))
models.append(('Random Forest', RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0)))
models.append(('XGB', XGBClassifier()))
models.append(('GBC', GradientBoostingClassifier()))
# Per-model cross-validation scores, kept for the box plots below
acc_results = []
auc_results = []
names = []
# Empty DataFrame to collect summary statistics per algorithm
col = ['Algorithm', 'ROC_AUC_score_mean(10-foldCV)', 'ROC_AUC_std',
       'Accuracy_score_mean(10-foldCV)', 'Accuracy_std']
model_results = pd.DataFrame(columns=col)
# Row index for the results DataFrame
i = 0
# Evaluate each model with 10-fold cross-validation on the training set.
# shuffle=True is required when random_state is set (newer scikit-learn
# versions raise a ValueError otherwise).
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=0)
for name, model in models:
    # accuracy scores across the 10 folds
    cv_acc_results = model_selection.cross_val_score(model, X_train_scaled, y_train, cv=kfold, scoring='accuracy')
    # ROC AUC scores across the 10 folds
    cv_auc_results = model_selection.cross_val_score(model, X_train_scaled, y_train, cv=kfold, scoring='roc_auc')
    # keep the raw fold scores for plotting
    acc_results.append(cv_acc_results)
    auc_results.append(cv_auc_results)
    names.append(name)
    # summarize as mean/std percentages, rounded to two decimals
    model_results.loc[i] = [name,
                            round(cv_auc_results.mean() * 100, 2),
                            round(cv_auc_results.std() * 100, 2),
                            round(cv_acc_results.mean() * 100, 2),
                            round(cv_acc_results.std() * 100, 2)]
    i += 1
# Rank models by mean ROC AUC, best first, and print the full table
model_results_df = model_results.sort_values(by=['ROC_AUC_score_mean(10-foldCV)'], ascending=False)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(model_results_df)
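# X_test_scaled, y_test and several imported metrics are not otherwise used
# in this script; a minimal sketch of a held-out evaluation for one
# candidate (picking Random Forest here is illustrative, not the project's
# final selection):
rf_candidate = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0)
rf_candidate.fit(X_train_scaled, y_train)
y_pred = rf_candidate.predict(X_test_scaled)
print('Test accuracy:', round(accuracy_score(y_test, y_pred), 4))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))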
# Box plot of accuracy scores across folds
fig = plt.figure(figsize=(15, 7))
ax = fig.add_subplot(111)
plt.boxplot(acc_results)
ax.set_xticklabels(names)
plt.title('Accuracy Score Comparison \n', horizontalalignment='center',
          fontstyle='normal', fontsize=22, fontfamily='sans-serif')
plt.xticks(rotation=0, horizontalalignment='center')
plt.yticks(rotation=0, horizontalalignment='right')
plt.show()
# Box plot of ROC AUC scores across folds
fig = plt.figure(figsize=(15, 7))
ax = fig.add_subplot(111)
plt.boxplot(auc_results)
ax.set_xticklabels(names)
plt.title('ROC AUC Comparison \n', horizontalalignment='center',
          fontstyle='normal', fontsize=22, fontfamily='sans-serif')
plt.xticks(rotation=0, horizontalalignment='center')
plt.yticks(rotation=0, horizontalalignment='right')
plt.show()
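# GridSearchCV is imported above but never called; a minimal sketch of how
# the strongest model from the comparison might be tuned (the parameter
# grid below is an illustrative assumption, not a chosen search space):
param_grid = {'n_estimators': [100, 300], 'max_depth': [None, 5, 10]}
grid = GridSearchCV(RandomForestClassifier(random_state=0), param_grid,
                    scoring='roc_auc', cv=kfold, n_jobs=-1)
grid.fit(X_train_scaled, y_train)
print('Best params:', grid.best_params_)
print('Best CV ROC AUC:', round(grid.best_score_ * 100, 2))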