# Import

In [1]:
import warnings
warnings.filterwarnings('ignore')

# data
import pandas as pd
import numpy as np
import random as rnd
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from collections import Counter

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score



In [2]:
data = pd.read_csv('HTRU_2.csv', names=['IP_mean', 'IP_deviation', 'IP_kurtosis', 'IP_skew', 'DMSNR_mean', 'DMSNR_deviation',
                                       'DMSNR_kurtosis', 'DMSNR_skew', 'Class'])

In [None]:
data

In [None]:
# check class distribution

data['Class'].value_counts()

In [None]:
data.dtypes

# Data Prep

According to the original SMOTE paper in 2011 (https://arxiv.org/abs/1106.1813), SMOTE is best used when combined with random undersampling. I chose to begin with random undersampling in order to limit the danger of my minority class being lost in the volume of majority instances at the time of SMOTE implemenation. My data class ratio was initially at 1:10 which I trimmed to 1:3 using random undersampling.

After undersampling, I used SMOTE to complete the process of evening my class instances by bringing the ratio down to 1:1.

I chose not to use pipelines here to allow for neat compartmentalization and quick verification of each step.

In [3]:
x = data[['IP_mean', 'IP_deviation', 'IP_kurtosis', 'IP_skew', 'DMSNR_mean',
       'DMSNR_deviation', 'DMSNR_kurtosis', 'DMSNR_skew']]
y = data['Class']

In [4]:
# train test split

xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [6]:
# random undersampling to trim majority class, class imbalance will go from 1:10 to 1:3.

rus = RandomUnderSampler(sampling_strategy=0.3, random_state=2)
xTrain, yTrain = rus.fit_resample(x, y)

print('Resampled dataset shape %s' % Counter(yTrain))

Resampled dataset shape Counter({0: 5463, 1: 1639})


In [7]:
# smote to beef up minority class (1), class imbalance will go from 1:3 to 1:1.

smote = SMOTE(sampling_strategy=0.5, random_state=2)
xTrain, yTrain = smote.fit_sample(xTrain, yTrain)

print('Resampled dataset shape %s' % Counter(yTrain))

Resampled dataset shape Counter({0: 5463, 1: 2731})


In [None]:
# scale our resampled training data

scaler = MinMaxScaler()
X = scaler.fit_transform(xTrain)

# Validation

In [None]:
# standard kfold validation on

Y = yTrain

kfd = KFold(10)

def kfold(model, score_type) :
    kfold_scores = []
    
    for train_index, test_index in kfd.split(X, Y):
        # print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        
        clf = model()
        clf.fit(X_train, Y_train)
        Y_pred = clf.predict(X_test)
        score = score_type(Y_test, Y_pred)
        kfold_scores.append(score)
        
    
    print(sum(kfold_scores)/len(kfold_scores))
          

# Modeling

In [None]:
# Logisitic Regression

lr_rcScore = kfold(LogisticRegression, recall_score)
lr_pcScore = kfold(LogisticRegression, precision_score)
lr_acScore = kfold(LogisticRegression, accuracy_score)
lr_f1Score = kfold(LogisticRegression, f1_score)

In [None]:
# Support Vector Machines

svc_rcScore = kfold(SVC, recall_score)
svc_pcScore = kfold(SVC, precision_score)
svc_acScore = kfold(SVC, accuracy_score)
svc_f1Score = kfold(SVC, f1_score)

In [None]:
# Naive Bayes

gnb_rcScore = kfold(GaussianNB, recall_score)
gnb_pcScore = kfold(GaussianNB, precision_score)
gnb_acScore = kfold(GaussianNB, accuracy_score)
gnb_f1Score = kfold(GaussianNB, f1_score)

In [None]:
# KNN

knn_rcScore = kfold(KNeighborsClassifier, recall_score)
knn_pcScore = kfold(KNeighborsClassifier, precision_score)
knn_acScore = kfold(KNeighborsClassifier, accuracy_score)
knn_f1Score = kfold(KNeighborsClassifier, f1_score)

In [None]:
# Linear SVC

lsvc_rcScore = kfold(LinearSVC, recall_score)
lsvc_pcScore = kfold(LinearSVC, precision_score)
lsvc_acScore = kfold(LinearSVC, accuracy_score)
lsvc_f1Score = kfold(LinearSVC, f1_score)

In [None]:
# Stochastic Gradient Descent

sgd_rcScore = kfold(SGDClassifier, recall_score)
sgd_pcScore = kfold(SGDClassifier, precision_score)
sgd_acScore = kfold(SGDClassifier, accuracy_score)
sgd_f1Score = kfold(SGDClassifier, f1_score)

In [None]:
# Decision Tree

dtc_rcScore = kfold(DecisionTreeClassifier, recall_score)
dtc_pcScore = kfold(DecisionTreeClassifier, precision_score)
dtc_acScore = kfold(DecisionTreeClassifier, accuracy_score)
dtc_f1Score = kfold(DecisionTreeClassifier, f1_score)


In [None]:
# Random Forest

rfc_rcScore = kfold(RandomForestClassifier, recall_score)
rfc_pcScore = kfold(RandomForestClassifier, precision_score)
rfc_acScore = kfold(RandomForestClassifier, accuracy_score)
rfc_f1Score = kfold(RandomForestClassifier, f1_score)

# Model Examination

In [None]:
# 

In [None]:
models_1 = pd.DataFrame({'Model': ['Logistic Regression', 'Support Vector Machines', 'Naive Bayes','KNN', 
                                 'Linear SVC', 'Stochastic Gradient Decent', 'Decision Tree', 'Random Forest'], 
                       'Recall_Score' : [lr_rcScore, svc_rcScore, gnb_rcScore, knn_rcScore, lsvc_rcScore, sgd_rcScore,
                                 dtc_rcScore, rfc_rcScore],
                       'Precision_Score' : [lr_pcScore, svc_pcScore, gnb_pcScore, knn_pcScore, lsvc_pcScore, sgd_pcScore,
                                 dtc_pcScore, rfc_pcScore],
                       'Accuracy_Score' : [lr_acScore, svc_acScore, gnb_acScore, knn_acScore, lsvc_acScore, sgd_acScore,
                                 dtc_acScore, rfc_acScore],
                       'F1_Score' : [lr_f1Score, svc_f1Score, gnb_f1Score, knn_f1Score, lsvc_f1Score, sgd_f1Score,
                                 dtc_f1Score, rfc_f1Score]})

models_1

# Final Model

In [None]:
# dont ferogt all the satudd you said in the notes etc

# Visualizations