# Import

In [1]:
import warnings
warnings.filterwarnings('ignore')

# data
import pandas as pd
import numpy as np
import random as rnd
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from collections import Counter

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier



In [2]:
data = pd.read_csv('HTRU_2.csv', names=['IP_mean', 'IP_deviation', 'IP_kurtosis', 'IP_skew', 'DMSNR_mean', 'DMSNR_deviation',
                                       'DMSNR_kurtosis', 'DMSNR_skew', 'Class'])

In [3]:
data

Unnamed: 0,IP_mean,IP_deviation,IP_kurtosis,IP_skew,DMSNR_mean,DMSNR_deviation,DMSNR_kurtosis,DMSNR_skew,Class
0,140.562500,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.882430,0.465318,-0.515088,1.677258,14.860146,10.576487,127.393580,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.750000,57.178449,-0.068415,-0.636238,3.642977,20.959280,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.178930,11.468720,14.269573,252.567306,0
...,...,...,...,...,...,...,...,...,...
17893,136.429688,59.847421,-0.187846,-0.738123,1.296823,12.166062,15.450260,285.931022,0
17894,122.554688,49.485605,0.127978,0.323061,16.409699,44.626893,2.945244,8.297092,0
17895,119.335938,59.935939,0.159363,-0.743025,21.430602,58.872000,2.499517,4.595173,0
17896,114.507812,53.902400,0.201161,-0.024789,1.946488,13.381731,10.007967,134.238910,0


In [4]:
# check class distribution

data['Class'].value_counts()

0    16259
1     1639
Name: Class, dtype: int64

In [3]:
data.dtypes

IP_mean            float64
IP_deviation       float64
IP_kurtosis        float64
IP_skew            float64
DMSNR_mean         float64
DMSNR_deviation    float64
DMSNR_kurtosis     float64
DMSNR_skew         float64
Class                int64
dtype: object

# Data Prep

According to the original SMOTE paper in 2011 (https://arxiv.org/abs/1106.1813), SMOTE is best used when combined with random undersampling. I chose to begin with random undersampling in order to limit the danger of my minority class being lost in the volume of majority instances at the time of SMOTE implemenation. My data class ratio was initially at 1:10 which I trimmed to 1:3 using random undersampling.

After undersampling, I used SMOTE to complete the process of evening my class instances by bringing the ratio down to 1:1.

I chose not to use pipelines here to allow for neat compartmentalization and quick verification of each step.

In [5]:
x = data[['IP_mean', 'IP_deviation', 'IP_kurtosis', 'IP_skew', 'DMSNR_mean',
       'DMSNR_deviation', 'DMSNR_kurtosis', 'DMSNR_skew']]
y = data['Class']

In [6]:
# train test split

xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [8]:
# random undersampling to trim majority class, class imbalance will go from 1:10 to 1:3.

rus = RandomUnderSampler(sampling_strategy=0.3, random_state=2)
xTrain, yTrain = rus.fit_resample(x, y)

print('Resampled dataset shape %s' % Counter(yTrain))

Resampled dataset shape Counter({0: 5463, 1: 1639})


In [12]:
# smote to beef up minority class (1), class imbalance will go from 1:3 to 1:1.

smote = SMOTE(sampling_strategy=0.5, random_state=2)
xTrain, yTrain = smote.fit_sample(xTrain, yTrain)

print('Resampled dataset shape %s' % Counter(yTrain))

Resampled dataset shape Counter({0: 5463, 1: 2731})


In [13]:
# scale our resampled training data

scaler = MinMaxScaler()
X = scaler.fit_transform(xTrain)

# Validation

In [None]:
# standard kfold validation on

skf = KFold(n_splits=10)
skf.get_n_splits(X, Y)

def kfold(model, score_type) :
    kfold_scores = []
    
    for train_index, test_index in skf.split(X, Y):
        print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        
        clf = model()
        clf.fit(X_train, Y_train)
        Y_pred = clf.predict(X_test)
        score = score_type(Y_test, Y_pred)
        kfold_scores.append(score)
        
    
    print(f'{score_type} mean score: {sum(kfold_scores)/len(kfold_scores)}')
          
    

# Modeling

In [None]:
# parameters, lasso, ridge, get into those on all models, pruning, grid search, parameter tuning for each

# be able to desribe whihc metrics matter to me and why

# run ROC_AUC

In [None]:
# Logisitc Regression

from sklearn.metrics import recall_score

lr = LogisticRegression()
lr.fit(X_train, Y_train)

Y_pred = lr.predict(X_test)

lr_score = lr.score(X_test, Y_test)

recall_score(Y_test, Y_pred)

In [None]:
kfold(LogisticRegression, recall_score)

In [None]:
# Support Vector Machines

svc = SVC()
svc.fit(X_train, Y_train)

Y_pred = svc.predict(X_test)

svc_score = svc.score(X_test, Y_test)

recall_score(Y_test, Y_pred)

In [None]:
# Naive Bayes

gnb = GaussianNB()
gnb.fit(X_train, Y_train)

Y_pred = gnb.predict(X_test)

gnb_score = gnb.score(X_test, Y_test)

recall_score(Y_test, Y_pred)

In [None]:
# KNN

knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)

Y_pred = knn.predict(X_test)

knn_score = knn.score(X_test, Y_test)
recall_score(Y_test, Y_pred)


In [None]:
# Linear SVC

lsvc = LinearSVC()
lsvc.fit(X_train, Y_train)

Y_pred = lsvc.predict(X_test)

lsvc_score = lsvc.score(X_test, Y_test)

recall_score(Y_test, Y_pred)

In [None]:
# Stochastic Gradient Descent

sgd = SGDClassifier()
sgd.fit(X_train, Y_train)

Y_pred = sgd.predict(X_test)

sgd_score = sgd.score(X_test, Y_test)

recall_score(Y_test, Y_pred)

In [None]:
# Decision Tree

dtc = DecisionTreeClassifier()
dtc.fit(X_train, Y_train)

Y_pred = dtc.predict(X_test)

dtc_score = dtc.score(X_test, Y_test)

recall_score(Y_test, Y_pred)

In [None]:
# Random Forest

rfc = RandomForestClassifier()
rfc.fit(X_train, Y_train)

Y_pred = rfc.predict(X_test)

rfc_score = rfc.score(X_test, Y_test)

recall_score(Y_test, Y_pred)

# Model Examination

In [None]:
# use recall score instead of accuracy score

In [None]:
models = pd.DataFrame({'Model': ['Logistic Regression', 'Support Vector Machines', 'Naive Bayes','KNN', 
                                 'Linear SVC', 'Stochastic Gradient Decent', 'Decision Tree', 'Random Forest'], 
                       'Score' : [lr_score, svc_score, gnb_score, knn_score, lsvc_score, sgd_score,
                                 dtc_score, rfc_score]})

models['Score'] = round(models.Score * 100, 2)
models.sort_values(by='Score', ascending=False)

# Final Model

# Visualizations