In [67]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix ,f1_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier 
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
import plotly.express as px

In [2]:
data = pd.read_csv("Data/HTRU_2.csv")
x = data.iloc[:, 0:7]
y = data.iloc[:, -1]

In [4]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [6]:
standard_scale = StandardScaler()
x_train_scaled = standard_scale.fit_transform(x_train)

In [9]:
smt = SMOTE(sampling_strategy=0.5, random_state=42)
x_train_sm, y_train_sm = smt.fit_sample(x_train_scaled, y_train)

# Random Forest Classifier

In [11]:
random_forest=RandomForestClassifier()
random_forest.fit(x_train_sm, y_train_sm)

RandomForestClassifier()

In [12]:
random_forest.feature_importances_

array([0.19258324, 0.03999028, 0.33858919, 0.1715771 , 0.06283627,
       0.13973173, 0.05469218])

In [17]:
y_train_pred_random_forest=cross_val_predict(random_forest, x_train_sm, y_train_sm, cv=5)

In [18]:
confusion_matrix_random_forest = confusion_matrix(y_train_sm, y_train_pred_random_forest)

confusion_matrix_random_forest

array([[12840,   163],
       [  327,  6174]], dtype=int64)

In [19]:
# precision, accuracy and recall for random forest classifier

print(accuracy_score(y_train_sm, y_train_pred_random_forest))
print(precision_score(y_train_sm, y_train_pred_random_forest))
print(recall_score(y_train_sm, y_train_pred_random_forest))
print(f1_score(y_train_sm, y_train_pred_random_forest))

0.9748769483182936
0.9742780495502604
0.9497000461467466
0.9618320610687022


In [20]:
y_test_pred_random_forest=random_forest.predict(standard_scale.fit_transform(x_test))

In [21]:
# precision, accuracy and recall for random forest classifier

print(accuracy_score(y_test, y_test_pred_random_forest))
print(precision_score(y_test, y_test_pred_random_forest))
print(recall_score(y_test, y_test_pred_random_forest))
print(f1_score(y_test, y_test_pred_random_forest))

0.9798882681564246
0.8913043478260869
0.8858024691358025
0.8885448916408668


# Support Vector Machines

In [24]:
gaussian_svm = SVC(kernel="poly",degree=3, C=5, probability=True)
gaussian_svm.fit(x_train_sm, y_train_sm)

SVC(C=5, kernel='poly', probability=True)

In [25]:
y_train_pred_svm=cross_val_predict(gaussian_svm, x_train_sm, y_train_sm, cv=5)

In [26]:
print(accuracy_score(y_train_sm, y_train_pred_svm))
print(precision_score(y_train_sm, y_train_pred_svm))
print(recall_score(y_train_sm, y_train_pred_svm))
print(f1_score(y_train_sm, y_train_pred_svm))

0.9467288761279737
0.9766143106457242
0.8607906475926781
0.9150519172594228


In [27]:
y_gaussian_svm=gaussian_svm.predict(standard_scale.fit_transform(x_test))

print(accuracy_score(y_test, y_gaussian_svm))
print(precision_score(y_test, y_gaussian_svm))
print(recall_score(y_test, y_gaussian_svm))
print(f1_score(y_test, y_gaussian_svm))

0.9793296089385475
0.9084967320261438
0.8580246913580247
0.8825396825396825


# Logistic Regression

In [34]:
logistic_regression=LogisticRegression()
logistic_regression.fit(x_train_sm, y_train_sm)

LogisticRegression()

In [35]:
y_train_pred_log=cross_val_predict(logistic_regression, x_train_sm, y_train_sm, cv=5)

In [36]:
# log confusion matrix

confusion_matrix_log = confusion_matrix(y_train_sm, y_train_pred_log)

confusion_matrix_log

array([[12825,   178],
       [  721,  5780]], dtype=int64)

In [37]:
# precision, accuracy and recall for log classifier

print(accuracy_score(y_train_sm, y_train_pred_log))
print(precision_score(y_train_sm, y_train_pred_log))
print(recall_score(y_train_sm, y_train_pred_log))
print(f1_score(y_train_sm, y_train_pred_log))

0.9539068908941756
0.9701242027526016
0.8890939855406861
0.9278433261096397


In [38]:
y_logistic_regression=logistic_regression.predict(standard_scale.fit_transform(x_test))

print(accuracy_score(y_test, y_logistic_regression))
print(precision_score(y_test, y_logistic_regression))
print(recall_score(y_test, y_logistic_regression))
print(f1_score(y_test, y_logistic_regression))

0.979608938547486
0.8814589665653495
0.8950617283950617
0.888208269525268


# Adaboost

In [40]:
adaboost_classifier=AdaBoostClassifier(
    RandomForestClassifier(), n_estimators=200, algorithm="SAMME.R", learning_rate=0.5
)

adaboost_classifier.fit(x_train_sm, y_train_sm)

AdaBoostClassifier(base_estimator=RandomForestClassifier(), learning_rate=0.5,
                   n_estimators=200)

In [41]:
# adaboost metrics

y_train_pred_adaboost=cross_val_predict(adaboost_classifier, x_train_sm, y_train_sm, cv=5)

In [42]:
# adaboost confusion matrix

confusion_matrix_adaboost = confusion_matrix(y_train_sm, y_train_pred_adaboost)

confusion_matrix_adaboost

array([[12832,   171],
       [  339,  6162]], dtype=int64)

In [43]:
# precision, accuracy and recall for adaboost classifier

print(accuracy_score(y_train_sm, y_train_pred_adaboost))
print(precision_score(y_train_sm, y_train_pred_adaboost))
print(recall_score(y_train_sm, y_train_pred_adaboost))
print(f1_score(y_train_sm, y_train_pred_adaboost))

0.9738515176374077
0.9729985788725722
0.9478541762805722
0.9602618045815802


In [44]:
y_adaboost_classifier=adaboost_classifier.predict(standard_scale.fit_transform(x_test))

print(accuracy_score(y_test, y_adaboost_classifier))
print(precision_score(y_test, y_adaboost_classifier))
print(recall_score(y_test, y_adaboost_classifier))
print(f1_score(y_test, y_adaboost_classifier))

0.9807262569832402
0.8899082568807339
0.8981481481481481
0.8940092165898618


# SGDC

In [46]:
sgd_clf=SGDClassifier(random_state=42)
sgd_clf.fit(x_train_sm, y_train_sm)

SGDClassifier(random_state=42)

In [47]:
y_train_pred_sgd_clf=cross_val_predict(sgd_clf, x_train_sm, y_train_sm, cv=5)

In [48]:
# sgd_clf confusion matrix

confusion_matrix_sgd_clf = confusion_matrix(y_train_sm, y_train_pred_sgd_clf)

confusion_matrix_sgd_clf

array([[12802,   201],
       [  724,  5777]], dtype=int64)

In [49]:
# precision, accuracy and recall for sgd_clf classifier

print(accuracy_score(y_train_sm, y_train_pred_sgd_clf))
print(precision_score(y_train_sm, y_train_pred_sgd_clf))
print(recall_score(y_train_sm, y_train_pred_sgd_clf))
print(f1_score(y_train_sm, y_train_pred_sgd_clf))

0.9525738310090238
0.9663767146202743
0.8886325180741425
0.9258754707909287


In [50]:
y_sgd_clf=sgd_clf.predict(standard_scale.fit_transform(x_test))

print(accuracy_score(y_test, y_sgd_clf))
print(precision_score(y_test, y_sgd_clf))
print(recall_score(y_test, y_sgd_clf))
print(f1_score(y_test, y_sgd_clf))

0.9810055865921787
0.9102564102564102
0.8765432098765432
0.8930817610062893


# K Neighbors

In [53]:
k_neig=KNeighborsClassifier(n_neighbors=15)
k_neig.fit(x_train_sm, y_train_sm)

KNeighborsClassifier(n_neighbors=15)

In [54]:
y_train_pred_k_neig=cross_val_predict(k_neig, x_train_sm, y_train_sm, cv=5)

In [55]:
# k_neig confusion matrix

confusion_matrix_k_neig = confusion_matrix(y_train_sm, y_train_pred_k_neig)

confusion_matrix_k_neig

array([[12728,   275],
       [  442,  6059]], dtype=int64)

In [56]:
# precision, accuracy and recall for k_neig classifier

print(accuracy_score(y_train_sm, y_train_pred_k_neig))
print(precision_score(y_train_sm, y_train_pred_k_neig))
print(recall_score(y_train_sm, y_train_pred_k_neig))
print(f1_score(y_train_sm, y_train_pred_k_neig))

0.9632383100902379
0.9565835175244711
0.9320104599292417
0.9441371250486951


In [57]:
y_neig=k_neig.predict(standard_scale.fit_transform(x_test))

print(accuracy_score(y_test, y_neig))
print(precision_score(y_test, y_neig))
print(recall_score(y_test, y_neig))
print(f1_score(y_test, y_neig))

0.973463687150838
0.8225352112676056
0.9012345679012346
0.8600883652430044


# Voting Classification

In [61]:
voting_clf=VotingClassifier(
    estimators=[('lr', logistic_regression), ('rf', random_forest), ('ab', adaboost_classifier), ('kn', k_neig), ('sm', gaussian_svm)], voting='soft'
)

voting_clf.fit(x_train_sm, y_train_sm)

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()),
                             ('ab',
                              AdaBoostClassifier(base_estimator=RandomForestClassifier(),
                                                 learning_rate=0.5,
                                                 n_estimators=200)),
                             ('kn', KNeighborsClassifier(n_neighbors=15)),
                             ('sm', SVC(C=5, kernel='poly', probability=True))],
                 voting='soft')

In [62]:
y_train_pred_voting_clf=cross_val_predict(voting_clf, x_train_sm, y_train_sm, cv=5)

In [63]:
# voting_clf confusion matrix

confusion_matrix_voting_clf = confusion_matrix(y_train_sm, y_train_pred_voting_clf)

confusion_matrix_voting_clf

array([[12850,   153],
       [  545,  5956]], dtype=int64)

In [64]:
# precision, accuracy and recall for voting_clf classifier

print(accuracy_score(y_train_sm, y_train_pred_voting_clf))
print(precision_score(y_train_sm, y_train_pred_voting_clf))
print(recall_score(y_train_sm, y_train_pred_voting_clf))
print(f1_score(y_train_sm, y_train_pred_voting_clf))

0.9642124692370796
0.9749549844491734
0.9161667435779111
0.9446471054718478


In [65]:
y_test_pred_voting=voting_clf.predict(standard_scale.fit_transform(x_test))

In [66]:
print(accuracy_score(y_test, y_test_pred_voting))
print(precision_score(y_test, y_test_pred_voting))
print(recall_score(y_test, y_test_pred_voting))
print(f1_score(y_test, y_test_pred_voting))

0.9798882681564246
0.8865030674846626
0.8919753086419753
0.8892307692307693


# saving the model

In [2]:
import pickle 

pickle_out = open("classifier.pkl", mode = "wb") 
pickle.dump(adaboost_classifier, pickle_out) 
pickle_out.close()

NameError: name 'adaboost_classifier' is not defined