In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix ,f1_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier 
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
import plotly.express as px

In [11]:
data = pd.read_csv("../Data/HTRU_2.csv")
x = data.iloc[:, 0:8]
y = data.iloc[:, -1]
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17898 entries, 0 to 17897
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   MIP     17898 non-null  float64
 1   SDIP    17898 non-null  float64
 2   KIP     17898 non-null  float64
 3   SIP     17898 non-null  float64
 4   MDM     17898 non-null  float64
 5   SDDM    17898 non-null  float64
 6   KDM     17898 non-null  float64
 7   SDM     17898 non-null  float64
dtypes: float64(8)
memory usage: 1.1 MB


In [12]:
y

0        0
1        0
2        0
3        0
4        0
        ..
17893    0
17894    0
17895    0
17896    0
17897    0
Name: CLASS, Length: 17898, dtype: int64

In [13]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [14]:
standard_scale = StandardScaler()
x_train_scaled = standard_scale.fit_transform(x_train)

In [15]:
smt = SMOTE(sampling_strategy=0.5, random_state=42)
x_train_sm, y_train_sm = smt.fit_sample(x_train_scaled, y_train)

# Random Forest Classifier

In [16]:
random_forest=RandomForestClassifier()
random_forest.fit(x_train_sm, y_train_sm)

RandomForestClassifier()

In [17]:
random_forest.feature_importances_

array([0.18622726, 0.03417498, 0.27607481, 0.20094187, 0.08066645,
       0.09969421, 0.06608994, 0.05613047])

In [18]:
y_train_pred_random_forest=cross_val_predict(random_forest, x_train_sm, y_train_sm, cv=5)

In [19]:
confusion_matrix_random_forest = confusion_matrix(y_train_sm, y_train_pred_random_forest)

confusion_matrix_random_forest

array([[12826,   168],
       [  360,  6137]], dtype=int64)

In [20]:
# precision, accuracy and recall for random forest classifier

print(accuracy_score(y_train_sm, y_train_pred_random_forest))
print(precision_score(y_train_sm, y_train_pred_random_forest))
print(recall_score(y_train_sm, y_train_pred_random_forest))
print(f1_score(y_train_sm, y_train_pred_random_forest))

0.9729105741111282
0.9733544805709754
0.9445898106818532
0.9587564443055773


In [21]:
y_test_pred_random_forest=random_forest.predict(standard_scale.fit_transform(x_test))

In [22]:
# precision, accuracy and recall for random forest classifier

print(accuracy_score(y_test, y_test_pred_random_forest))
print(precision_score(y_test, y_test_pred_random_forest))
print(recall_score(y_test, y_test_pred_random_forest))
print(f1_score(y_test, y_test_pred_random_forest))

0.9787709497206704
0.8722741433021807
0.8888888888888888
0.880503144654088


# Support Vector Machines

In [28]:
gaussian_svm = SVC(kernel="poly",degree=3, C=5, probability=True)
gaussian_svm.fit(x_train_sm, y_train_sm)

SVC(C=5, kernel='poly', probability=True)

In [29]:
y_train_pred_svm=cross_val_predict(gaussian_svm, x_train_sm, y_train_sm, cv=5)

In [30]:
print(accuracy_score(y_train_sm, y_train_pred_svm))
print(precision_score(y_train_sm, y_train_pred_svm))
print(recall_score(y_train_sm, y_train_pred_svm))
print(f1_score(y_train_sm, y_train_pred_svm))

0.9442306705659023
0.9780841286673736
0.8517777435739572
0.9105717811600165


In [31]:
y_gaussian_svm=gaussian_svm.predict(standard_scale.fit_transform(x_test))

print(accuracy_score(y_test, y_gaussian_svm))
print(precision_score(y_test, y_gaussian_svm))
print(recall_score(y_test, y_gaussian_svm))
print(f1_score(y_test, y_gaussian_svm))

0.9804469273743017
0.8990228013029316
0.8761904761904762
0.8874598070739549


# Logistic Regression

In [32]:
logistic_regression=LogisticRegression()
logistic_regression.fit(x_train_sm, y_train_sm)

LogisticRegression()

In [33]:
y_train_pred_log=cross_val_predict(logistic_regression, x_train_sm, y_train_sm, cv=5)

In [34]:
# log confusion matrix

confusion_matrix_log = confusion_matrix(y_train_sm, y_train_pred_log)

confusion_matrix_log

array([[12812,   182],
       [  714,  5783]], dtype=int64)

In [35]:
# precision, accuracy and recall for log classifier

print(accuracy_score(y_train_sm, y_train_pred_log))
print(precision_score(y_train_sm, y_train_pred_log))
print(recall_score(y_train_sm, y_train_pred_log))
print(f1_score(y_train_sm, y_train_pred_log))

0.9540300651582782
0.9694886839899414
0.8901031245190087
0.9281014283421603


In [36]:
y_logistic_regression=logistic_regression.predict(standard_scale.fit_transform(x_test))

print(accuracy_score(y_test, y_logistic_regression))
print(precision_score(y_test, y_logistic_regression))
print(recall_score(y_test, y_logistic_regression))
print(f1_score(y_test, y_logistic_regression))

0.9768156424581006
0.8473053892215568
0.8984126984126984
0.8721109399075501


# Adaboost

In [23]:
adaboost_classifier=AdaBoostClassifier(
    RandomForestClassifier(), n_estimators=200, algorithm="SAMME.R", learning_rate=0.5
)

adaboost_classifier.fit(x_train_sm, y_train_sm)

AdaBoostClassifier(base_estimator=RandomForestClassifier(), learning_rate=0.5,
                   n_estimators=200)

In [24]:
# adaboost metrics

y_train_pred_adaboost=cross_val_predict(adaboost_classifier, x_train_sm, y_train_sm, cv=5)

In [25]:
# adaboost confusion matrix

confusion_matrix_adaboost = confusion_matrix(y_train_sm, y_train_pred_adaboost)

confusion_matrix_adaboost

array([[12822,   172],
       [  357,  6140]], dtype=int64)

In [26]:
# precision, accuracy and recall for adaboost classifier

print(accuracy_score(y_train_sm, y_train_pred_adaboost))
print(precision_score(y_train_sm, y_train_pred_adaboost))
print(recall_score(y_train_sm, y_train_pred_adaboost))
print(f1_score(y_train_sm, y_train_pred_adaboost))

0.972859268380278
0.9727503168567807
0.9450515622595044
0.9587009134202514


In [27]:
y_adaboost_classifier=adaboost_classifier.predict(standard_scale.fit_transform(x_test))

print(accuracy_score(y_test, y_adaboost_classifier))
print(precision_score(y_test, y_adaboost_classifier))
print(recall_score(y_test, y_adaboost_classifier))
print(f1_score(y_test, y_adaboost_classifier))

0.9787709497206704
0.8676923076923077
0.8952380952380953
0.88125


# SGDC

In [37]:
sgd_clf=SGDClassifier(random_state=42)
sgd_clf.fit(x_train_sm, y_train_sm)

SGDClassifier(random_state=42)

In [38]:
y_train_pred_sgd_clf=cross_val_predict(sgd_clf, x_train_sm, y_train_sm, cv=5)

In [39]:
# sgd_clf confusion matrix

confusion_matrix_sgd_clf = confusion_matrix(y_train_sm, y_train_pred_sgd_clf)

confusion_matrix_sgd_clf

array([[12790,   204],
       [  719,  5778]], dtype=int64)

In [40]:
# precision, accuracy and recall for sgd_clf classifier

print(accuracy_score(y_train_sm, y_train_pred_sgd_clf))
print(precision_score(y_train_sm, y_train_pred_sgd_clf))
print(recall_score(y_train_sm, y_train_pred_sgd_clf))
print(f1_score(y_train_sm, y_train_pred_sgd_clf))

0.9526448104253246
0.9658976930792377
0.8893335385562567
0.9260357400432727


In [41]:
y_sgd_clf=sgd_clf.predict(standard_scale.fit_transform(x_test))

print(accuracy_score(y_test, y_sgd_clf))
print(precision_score(y_test, y_sgd_clf))
print(recall_score(y_test, y_sgd_clf))
print(f1_score(y_test, y_sgd_clf))

0.9782122905027933
0.8691588785046729
0.8857142857142857
0.8773584905660377


# K Neighbors

In [42]:
k_neig=KNeighborsClassifier(n_neighbors=15)
k_neig.fit(x_train_sm, y_train_sm)

KNeighborsClassifier(n_neighbors=15)

In [43]:
y_train_pred_k_neig=cross_val_predict(k_neig, x_train_sm, y_train_sm, cv=5)

In [44]:
# k_neig confusion matrix

confusion_matrix_k_neig = confusion_matrix(y_train_sm, y_train_pred_k_neig)

confusion_matrix_k_neig

array([[12732,   262],
       [  469,  6028]], dtype=int64)

In [45]:
# precision, accuracy and recall for k_neig classifier

print(accuracy_score(y_train_sm, y_train_pred_k_neig))
print(precision_score(y_train_sm, y_train_pred_k_neig))
print(recall_score(y_train_sm, y_train_pred_k_neig))
print(f1_score(y_train_sm, y_train_pred_k_neig))

0.9624955107485507
0.9583465818759936
0.9278128366938587
0.9428325643231407


In [46]:
y_neig=k_neig.predict(standard_scale.fit_transform(x_test))

print(accuracy_score(y_test, y_neig))
print(precision_score(y_test, y_neig))
print(recall_score(y_test, y_neig))
print(f1_score(y_test, y_neig))

0.9692737430167597
0.7887323943661971
0.8888888888888888
0.835820895522388


# Voting Classification

In [47]:
voting_clf=VotingClassifier(
    estimators=[('lr', logistic_regression), ('rf', random_forest), ('ab', adaboost_classifier), ('kn', k_neig), ('sm', gaussian_svm)], voting='soft'
)

voting_clf.fit(x_train_sm, y_train_sm)

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()),
                             ('ab',
                              AdaBoostClassifier(base_estimator=RandomForestClassifier(),
                                                 learning_rate=0.5,
                                                 n_estimators=200)),
                             ('kn', KNeighborsClassifier(n_neighbors=15)),
                             ('sm', SVC(C=5, kernel='poly', probability=True))],
                 voting='soft')

In [48]:
y_train_pred_voting_clf=cross_val_predict(voting_clf, x_train_sm, y_train_sm, cv=5)

In [49]:
# voting_clf confusion matrix

confusion_matrix_voting_clf = confusion_matrix(y_train_sm, y_train_pred_voting_clf)

confusion_matrix_voting_clf

array([[12851,   143],
       [  576,  5921]], dtype=int64)

In [50]:
# precision, accuracy and recall for voting_clf classifier

print(accuracy_score(y_train_sm, y_train_pred_voting_clf))
print(precision_score(y_train_sm, y_train_pred_voting_clf))
print(recall_score(y_train_sm, y_train_pred_voting_clf))
print(f1_score(y_train_sm, y_train_pred_voting_clf))

0.9631111795187522
0.9764182058047494
0.9113436970909651
0.9427593344478943


In [51]:
y_test_pred_voting=voting_clf.predict(standard_scale.fit_transform(x_test))

In [52]:
print(accuracy_score(y_test, y_test_pred_voting))
print(precision_score(y_test, y_test_pred_voting))
print(recall_score(y_test, y_test_pred_voting))
print(f1_score(y_test, y_test_pred_voting))

0.9787709497206704
0.8699690402476781
0.8920634920634921
0.8808777429467084


# saving the model

In [53]:
import pickle 

pickle_out = open("classifier.pkl", mode = "wb") 
pickle.dump(voting_clf, pickle_out) 
pickle_out.close()