In [9]:
import numpy as np
import pandas as pd
import time

from imblearn.over_sampling import SMOTE, BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV 
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from scipy import stats

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


# Loading Datasets:

In [2]:
# Training Set:
X_train = pd.read_csv('Data/X_train.csv')
y_train = pd.read_csv('Data/y_train.csv', header=None)

In [3]:
print("X_train Length: {:,}".format(len(X_train)))
print("y_train Length: {:,}".format(len(y_train)))

X_train Length: 464,809
y_train Length: 464,809


In [4]:
# Definining Scaler:
scaler = MinMaxScaler()

# Fitting scaler on X_train (which also converts to an array):
X_train = scaler.fit_transform(X_train)

In [5]:
# Converting y_train into an array:
y_train = np.array(y_train[0])

In [6]:
seed = 42

# Define Params to Search:

In [16]:
# RANDOM SEARCH FOR COMBINATIONS OF PARAMETERS
rand_list = {"C": stats.expon(scale=50),
             "gamma": stats.uniform(0.1, 0.99)}

# Sampling Strategies:

In [25]:
# Sampling strategies to loop through:
tomek = [TomekLinks(sampling_strategy=[1,2]), 'None']

# Undersampling the two majority classes - Types 1 & 2:
unders = [(0.85, 0.85),                   # 15 % undersampling of majority classes
          (0.75, 0.75),                   # 25 %
          (0.60, 0.60),                   # 40 %
          (0.40, 0.40),                   # 60 %
          (0.20, 0.20),                   # 80 %
          (0.55, 0.40)  ]                 # Imbalanced undersampling

# Oversampling the 5 minority classes - Types 3, 4, 5, 6 & 7:
overs = [(1.2, 1.2, 1.2, 1.2, 1.2),       # SMOTE - 20% oversampling
         (1.5, 1.5, 1.5, 1.5, 1.5),       # 50% oversampling
         (2.0, 2.0, 2.0, 2.0, 2.0),       # 100% oversampling
         (4.0, 4.0, 4.0, 4.0, 4.0),       # 300% oversampling
         (6.0, 6.0, 6.0, 6.0, 6.0),       # 500% oversampling
         (2.0, 10.0, 10.0, 5.0, 4.0),     # Imbalanced oversampling
         (0),                             # Equalizing minority classes with majority
         
         (1.2, 1.2, 1.2, 1.2, 1.2),       # Borderline SMOTE - 20% oversampling
         (1.5, 1.5, 1.5, 1.5, 1.5),       # 50% oversampling
         (2.0, 2.0, 2.0, 2.0, 2.0),       # 100% oversampling
         (4.0, 4.0, 4.0, 4.0, 4.0),       # 300% oversampling
         (6.0, 6.0, 6.0, 6.0, 6.0),       # 500% oversampling
         (2.0, 10.0, 10.0, 5.0, 4.0),     # Imbalanced oversampling
         (0),  ]                          # Equalizing minority classes with majority

# Gridsearch:

In [26]:
# Function to return index
def get_count(array, cov_type, percent):
        # Count the number of a specific cover type in the current split array to define sampling bounds:
        count = round(array.tolist().count(cov_type)*percent)
        return count

In [27]:
######## RANDOM SEARCH ########
  
iters = 10
eval_metric = make_scorer(accuracy_score)

test_results = np.zeros(((iters*len(tomek)*len(unders)*len(overs)), 7))


loop_counter = 0

print("TOTAL LOOPS = ", (len(tomek)*len(unders)*len(overs)))


for t in range(len(tomek)):

    for u in range(len(unders)):

        for o in range(len(overs)):

            # Defining the model, looping through the list of available kernels:
            svm_model = svm.SVC(kernel='rbf')


            # Using train-test split to split out a random, stratified 10% portion of the training data to grid search:
            X_gs, X_leave, y_gs, y_leave = train_test_split(X_train, y_train, 
                                                    test_size = 0.9,     # Inverse of the size of the data to search
                                                    stratify = y_train,  # Keeping same proportion of target classes
                                                   random_state = seed)  


             ###### APPLYING SAMPLING STRATEGY ######
            # Appling sampling strategies to X_train/y_train after splitting for Cross-val

            # Only applying Tomek Links on half the loops (t = 0 or 1):
            if t == 0:
                resample = tomek[t]
                X_gs, y_gs = resample.fit_resample(X_gs, y_gs)

            # Undersampling applied on majority classes according to methods above:
            under_strategy = {1:get_count(y_gs, 1, unders[u][0]),
                              2:get_count(y_gs, 2, unders[u][1])}

            undersample = RandomUnderSampler(sampling_strategy = under_strategy)
            X_gs, y_gs = undersample.fit_resample(X_gs, y_gs)

            # Oversampling applied on minority classes according to methods above:
            if o == 6:
                oversample = SMOTE()
            elif o == 13:
                oversample = BorderlineSMOTE()
            elif o < 6:
                over_strategy = {3:get_count(y_gs, 3, overs[o][0]),
                                 4:get_count(y_gs, 4, overs[o][1]),
                                 5:get_count(y_gs, 5, overs[o][2]),
                                 6:get_count(y_gs, 6, overs[o][3]),
                                 7:get_count(y_gs, 7, overs[o][4]),
                                }
                oversample = SMOTE(sampling_strategy = over_strategy)
            else:
                over_strategy = {3:get_count(y_gs, 3, overs[o][0]),
                                 4:get_count(y_gs, 4, overs[o][1]),
                                 5:get_count(y_gs, 5, overs[o][2]),
                                 6:get_count(y_gs, 6, overs[o][3]),
                                 7:get_count(y_gs, 7, overs[o][4]),
                                }
                oversample = BorderlineSMOTE(sampling_strategy = over_strategy)

            X_gs, y_gs = oversample.fit_resample(X_gs, y_gs)


            print("Starting Loop {:.0f}".format(loop_counter))
            loop_counter += 1

            start = time.time()
            # Setting the random search parameters:
            rand_search = RandomizedSearchCV(svm_model, 
                                             param_distributions = rand_list, # Sets to params specified above 
                                             n_iter = iters,                  # No. sampled parameter settings, 10 = default
                                             n_jobs = -1,                     # Set to -1 to use all available CPU processors
                                             cv = 4,                          # Cross-validation folds
                                             scoring = eval_metric)           # Scoring metric chosen

            # Fitting the random search to the sampled data:
            rand_search.fit(X_gs, y_gs)

            # CV Results
            cv_res = rand_search.cv_results_

            # Appending Results:
            idx = o+(u*len(overs))+(t*(len(unders)*len(overs)))
            test_results[idx*iters: (idx+1)*iters, 0] = cv_res['mean_test_score']
            test_results[idx*iters: (idx+1)*iters, 1] = cv_res['mean_fit_time']
            test_results[idx*iters: (idx+1)*iters, 2] = [d['C'] for d in rand_search.cv_results_['params']]
            test_results[idx*iters: (idx+1)*iters, 3] = [d['gamma'] for d in rand_search.cv_results_['params']]
            test_results[idx*iters: (idx+1)*iters, 4] = t
            test_results[idx*iters: (idx+1)*iters, 5] = u
            test_results[idx*iters: (idx+1)*iters, 6] = o


            end = time.time()
            time_taken = end - start

            print("Time taken for Grid Search loop: ", time_taken)



TOTAL LOOPS =  168
Starting Loop 0
Time taken for Grid Search loop:  315.9313654899597
Starting Loop 1
Time taken for Grid Search loop:  369.21482014656067
Starting Loop 2
Time taken for Grid Search loop:  345.05234360694885
Starting Loop 3
Time taken for Grid Search loop:  543.0894181728363
Starting Loop 4
Time taken for Grid Search loop:  639.5210440158844
Starting Loop 5
Time taken for Grid Search loop:  522.3280820846558
Starting Loop 6
Time taken for Grid Search loop:  1401.7313549518585
Starting Loop 7
Time taken for Grid Search loop:  305.7166426181793
Starting Loop 8
Time taken for Grid Search loop:  344.96580624580383
Starting Loop 9
Time taken for Grid Search loop:  388.34884762763977
Starting Loop 10
Time taken for Grid Search loop:  536.6972608566284
Starting Loop 11
Time taken for Grid Search loop:  702.2991688251495
Starting Loop 12
Time taken for Grid Search loop:  558.3931982517242
Starting Loop 13
Time taken for Grid Search loop:  1765.0403351783752
Starting Loop 14
Ti

Starting Loop 118
Time taken for Grid Search loop:  767.5072240829468
Starting Loop 119
Time taken for Grid Search loop:  193.00302362442017
Starting Loop 120
Time taken for Grid Search loop:  202.04849767684937
Starting Loop 121
Time taken for Grid Search loop:  221.36763215065002
Starting Loop 122
Time taken for Grid Search loop:  361.6896622180939
Starting Loop 123
Time taken for Grid Search loop:  525.229161977768
Starting Loop 124
Time taken for Grid Search loop:  364.89438700675964
Starting Loop 125
Time taken for Grid Search loop:  993.9411137104034
Starting Loop 126
Time taken for Grid Search loop:  85.0226092338562
Starting Loop 127
Time taken for Grid Search loop:  92.39948511123657
Starting Loop 128
Time taken for Grid Search loop:  104.89396095275879
Starting Loop 129
Time taken for Grid Search loop:  198.73712491989136
Starting Loop 130
Time taken for Grid Search loop:  372.6553318500519
Starting Loop 131
Time taken for Grid Search loop:  193.83226084709167
Starting Loop 1

# Mapping Results to Dataframe and Saving as Csv:

In [28]:
test_res_df_samp = pd.DataFrame(test_results, columns= ['mean_test_score', "mean_fit_time", 'C', 'Gamma',
                                                       'tomek', 'under_strat', 'over_strat'])

In [30]:
tomek_map = {0: 'TL', 1: 'No TL'}
under_map = {0: '-15%', 1: '-25%', 2: '-40%', 3: '-60%', 4: '-80%', 5: 'Imbal'}
over_map = {0: '+20% S', 1: '+50% S', 2: '+100% S', 3: '+300% S', 4: '+500% S', 5: 'Imbal S', 6: 'SMOTE',
            7: '+20% BS', 8: '+50% BS', 9: '+100% BS', 10: '+300% BS', 11: '+500% BS', 12: 'Imbal BS', 13: 'BoSMOTE'}

In [31]:
test_res_df_samp['tomek'] = test_res_df_samp['tomek'].map(tomek_map)
test_res_df_samp['under_strat'] = test_res_df_samp['under_strat'].map(under_map)
test_res_df_samp['over_strat'] = test_res_df_samp['over_strat'].map(over_map)

In [32]:
test_res_df_samp.sort_values(by = 'mean_test_score', ascending=False)

Unnamed: 0,mean_test_score,mean_fit_time,C,Gamma,tomek,under_strat,over_strat
62,0.926540,304.765204,142.452992,1.005504,TL,-15%,SMOTE
901,0.920481,388.256793,268.301587,0.782780,No TL,-15%,SMOTE
347,0.915996,139.882393,92.085811,1.079928,TL,-40%,SMOTE
202,0.914451,190.281967,112.736727,0.754321,TL,-25%,SMOTE
346,0.913354,128.682077,88.801957,1.003014,TL,-40%,SMOTE
69,0.913298,161.436364,73.781407,0.849985,TL,-15%,SMOTE
65,0.912656,258.278818,70.626966,0.849978,TL,-15%,SMOTE
1048,0.910964,136.228243,64.892039,1.056299,No TL,-25%,SMOTE
762,0.910460,62.134983,148.652672,1.089462,TL,Imbal,SMOTE
488,0.909125,46.025634,130.499545,0.995317,TL,-60%,SMOTE


In [33]:
test_res_df_samp.to_csv('Data/sampling_SVM_gridsearch.csv')