# Import

In [72]:
import warnings
warnings.filterwarnings('ignore')

# data
import pandas as pd
import numpy as np
import random as rnd
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from collections import Counter

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [2]:
data = pd.read_csv('HTRU_2.csv', names=['IP_mean', 'IP_deviation', 'IP_kurtosis', 'IP_skew', 'DMSNR_mean', 'DMSNR_deviation',
                                       'DMSNR_kurtosis', 'DMSNR_skew', 'Class'])

In [3]:
data

Unnamed: 0,IP_mean,IP_deviation,IP_kurtosis,IP_skew,DMSNR_mean,DMSNR_deviation,DMSNR_kurtosis,DMSNR_skew,Class
0,140.562500,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.882430,0.465318,-0.515088,1.677258,14.860146,10.576487,127.393580,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.750000,57.178449,-0.068415,-0.636238,3.642977,20.959280,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.178930,11.468720,14.269573,252.567306,0
...,...,...,...,...,...,...,...,...,...
17893,136.429688,59.847421,-0.187846,-0.738123,1.296823,12.166062,15.450260,285.931022,0
17894,122.554688,49.485605,0.127978,0.323061,16.409699,44.626893,2.945244,8.297092,0
17895,119.335938,59.935939,0.159363,-0.743025,21.430602,58.872000,2.499517,4.595173,0
17896,114.507812,53.902400,0.201161,-0.024789,1.946488,13.381731,10.007967,134.238910,0


In [4]:
# check class distribution

data['Class'].value_counts()

0    16259
1     1639
Name: Class, dtype: int64

In [3]:
data.dtypes

IP_mean            float64
IP_deviation       float64
IP_kurtosis        float64
IP_skew            float64
DMSNR_mean         float64
DMSNR_deviation    float64
DMSNR_kurtosis     float64
DMSNR_skew         float64
Class                int64
dtype: object

# Data Prep

According to the original SMOTE paper in 2011 (https://arxiv.org/abs/1106.1813), SMOTE is best used when combined with random undersampling. I chose to begin with random undersampling in order to limit the danger of my minority class being lost in the volume of majority instances at the time of SMOTE implemenation. My data class ratio was initially at 1:10 which I trimmed to 1:3 using random undersampling.

After undersampling, I used SMOTE to complete the process of evening my class instances by bringing the ratio down to 1:1.

I chose not to use pipelines here to allow for neat compartmentalization and quick verification of each step.

In [5]:
x = data[['IP_mean', 'IP_deviation', 'IP_kurtosis', 'IP_skew', 'DMSNR_mean',
       'DMSNR_deviation', 'DMSNR_kurtosis', 'DMSNR_skew']]
y = data['Class']

In [6]:
# train test split

xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [8]:
# random undersampling to trim majority class, class imbalance will go from 1:10 to 1:3.

rus = RandomUnderSampler(sampling_strategy=0.3, random_state=2)
xTrain, yTrain = rus.fit_resample(x, y)

print('Resampled dataset shape %s' % Counter(yTrain))

Resampled dataset shape Counter({0: 5463, 1: 1639})


In [12]:
# smote to beef up minority class (1), class imbalance will go from 1:3 to 1:1.

smote = SMOTE(sampling_strategy=0.5, random_state=2)
xTrain, yTrain = smote.fit_sample(xTrain, yTrain)

print('Resampled dataset shape %s' % Counter(yTrain))

Resampled dataset shape Counter({0: 5463, 1: 2731})


In [13]:
# scale our resampled training data

scaler = MinMaxScaler()
X = scaler.fit_transform(xTrain)

# Validation

In [47]:
# standard kfold validation on

Y = yTrain

kfd = KFold(10)

def kfold(model, score_type) :
    kfold_scores = []
    
    for train_index, test_index in kfd.split(X, Y):
        # print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        
        clf = model()
        clf.fit(X_train, Y_train)
        Y_pred = clf.predict(X_test)
        score = score_type(Y_test, Y_pred)
        kfold_scores.append(score)
        
    
    print(sum(kfold_scores)/len(kfold_scores))
          

# Modeling

In [85]:
# Logisitic Regression

lr_rcScore = kfold(LogisticRegression, recall_score)
lr_pcScore = kfold(LogisticRegression, precision_score)
lr_acScore = kfold(LogisticRegression, accuracy_score)
lr_f1Score = kfold(LogisticRegression, f1_score)

0.3337718242827732
0.39565217391304347
0.9345629709044344
0.3620804999042013


In [84]:
# Support Vector Machines

svc_rcScore = kfold(SVC, recall_score)
svc_pcScore = kfold(SVC, precision_score)
svc_acScore = kfold(SVC, accuracy_score)
svc_f1Score = kfold(SVC, f1_score)

0.3292554566277194
0.3969026548672566
0.9337057387057388
0.35991544568196615


In [83]:
# Naive Bayes

gnb_rcScore = kfold(GaussianNB, recall_score)
gnb_pcScore = kfold(GaussianNB, precision_score)
gnb_acScore = kfold(GaussianNB, accuracy_score)
gnb_f1Score = kfold(GaussianNB, f1_score)

0.3387725818382753
0.3888888888888889
0.9166300366300367
0.36201623359200974


In [82]:
# KNN

knn_rcScore = kfold(KNeighborsClassifier, recall_score)
knn_pcScore = kfold(KNeighborsClassifier, precision_score)
knn_acScore = kfold(KNeighborsClassifier, accuracy_score)
knn_f1Score = kfold(KNeighborsClassifier, f1_score)

0.3565804835877829
0.3952
0.9427523154352423
0.3748908747495742


In [81]:
# Linear SVC

lsvc_rcScore = kfold(LinearSVC, recall_score)
lsvc_pcScore = kfold(LinearSVC, precision_score)
lsvc_acScore = kfold(LinearSVC, accuracy_score)
lsvc_f1Score = kfold(LinearSVC, f1_score)

0.34048064668502626
0.3982532751091703
0.9440842490842491
0.3670838419581651


In [80]:
# Stochastic Gradient Descent

sgd_rcScore = kfold(SGDClassifier, recall_score)
sgd_pcScore = kfold(SGDClassifier, precision_score)
sgd_acScore = kfold(SGDClassifier, accuracy_score)
sgd_f1Score = kfold(SGDClassifier, f1_score)

0.3408375890127715
0.3940711462450593
0.9412756484707705
0.36442572438857423


In [86]:
# Decision Tree

dtc_rcScore = kfold(DecisionTreeClassifier, recall_score)
dtc_pcScore = kfold(DecisionTreeClassifier, precision_score)
dtc_acScore = kfold(DecisionTreeClassifier, accuracy_score)
dtc_f1Score = kfold(DecisionTreeClassifier, f1_score)


0.3603602399222837
0.38992537313432835
0.931529527383186
0.3735790597803885


In [87]:
# Random Forest

rfc_rcScore = kfold(RandomForestClassifier, recall_score)
rfc_pcScore = kfold(RandomForestClassifier, precision_score)
rfc_acScore = kfold(RandomForestClassifier, accuracy_score)
rfc_f1Score = kfold(RandomForestClassifier, f1_score)

0.35402306533693395
0.3983333333333333
0.9529991959260252
0.3760752240058812


# Model Examination

In [None]:
# use recall score and accuracy score

In [66]:
models = pd.DataFrame({'Model': ['Logistic Regression', 'Support Vector Machines', 'Naive Bayes','KNN', 
                                 'Linear SVC', 'Stochastic Gradient Decent', 'Decision Tree', 'Random Forest'], 
                       'Recall_Score' : [lr_rcScore, svc_rcScore, gnb_rcScore, knn_rcScore, lsvc_rcScore, sgd_rcScore,
                                 dtc_rcScore, rfc_rcScore],
                       'Precision_Score' : [lr_pcScore, svc_pcScore, gnb_pcScore, knn_pcScore, lsvc_pcScore, sgd_pcScore,
                                 dtc_pcScore, rfc_pcScore]})

models

Unnamed: 0,Model,Recall_Score,Precision_Score
0,Logistic Regression,,
1,Support Vector Machines,,
2,Naive Bayes,,
3,KNN,,
4,Linear SVC,,
5,Stochastic Gradient Decent,,
6,Decision Tree,,
7,Random Forest,,


# Final Model

# Visualizations