In [1]:
import numpy as np
import pandas as pd
import time

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV 
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from scipy import stats

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Hyperparameter Search:

In [2]:
# Training Set:
X_train = pd.read_csv('Data/X_train.csv')
y_train = pd.read_csv('Data/y_train.csv', header=None)

In [3]:
X_train.head(3)

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
0,3171,179,20,270,48,2758,223,248,147,3194,...,0,0,0,0,0,0,0,0,0,0
1,3067,203,26,0,0,3396,198,252,176,792,...,0,0,1,0,0,0,0,0,0,0
2,3382,335,4,162,23,3445,211,234,162,2259,...,0,0,0,0,1,0,0,0,0,0


In [4]:
# Definining Scaler:
scaler = MinMaxScaler()

# Fitting scaler on X_train (which also converts to an array):
X_train = scaler.fit_transform(X_train)

In [5]:
y_train[0].value_counts()

2    226640
1    169472
3     28603
7     16408
6     13894
5      7594
4      2198
Name: 0, dtype: int64

In [6]:
# Converting y_train into an array:
y_train = np.array(y_train[0])

In [7]:
len(X_train)

464809

In [8]:
len(y_train)

464809

# Define Classifier:

In [12]:
seed = 42

In [10]:
# RANDOM SEARCH FOR COMBINATIONS OF PARAMETERS
rand_list = {"C": stats.expon(scale=50),
             "gamma": stats.uniform(0.1, 0.99)}

In [110]:
######## RANDOM SEARCH ########

kern_types = ['linear','sigmoid', 'poly', 'rbf']    
iters = 20

eval_metric = make_scorer(accuracy_score)


# Creating an empty array to hold results:
test_results = np.zeros((iters*len(kern_types), 5))


# Using train-test split to split out a random, stratified 10% portion of the training data to grid search:
X_gs, X_leave, y_gs, y_leave = train_test_split(X_train, y_train, 
                                                test_size = 0.9,    # Inverse of the size of the data to search
                                                stratify = y_train,  # Keeping same proportion of target classes
                                               random_state = seed) 

for i in range(len(kern_types)):
    # Defining the model, looping through the list of available kernels:
    svm_model = svm.SVC(kernel=kern_types[i])

    start = time.time()
    # Setting the random search parameters:
    rand_search = RandomizedSearchCV(svm_model, 
                                     param_distributions = rand_list, # Sets to params specified above 
                                     n_iter = iters,                  # No. sampled parameter settings, 10 = default
                                     n_jobs = -1,                     # Set to -1 to use all available CPU processors
                                     cv = 4,                          # Cross-validation folds
                                     scoring = eval_metric)           # Scoring metric chosen

    # Fitting the random search to the subsetted data:
    rand_search.fit(X_gs, y_gs)

    # CV Results
    cv_res = rand_search.cv_results_


    # Appending Results:
    test_results[i*iters: (i+1)*iters, 1] = cv_res['mean_test_score']
    test_results[i*iters: (i+1)*iters, 2] = cv_res['mean_fit_time']
    test_results[i*iters: (i+1)*iters, 3] = [d['C'] for d in rand_search.cv_results_['params']]
    test_results[i*iters: (i+1)*iters, 4] = [d['gamma'] for d in rand_search.cv_results_['params']]

    end = time.time()
    time_taken = end - start

    print("Time taken for Grid Search loop: ", time_taken)



Time taken for Grid Search loop:  2308.882013320923
Time taken for Grid Search loop:  1110.4007375240326
Time taken for Grid Search loop:  9625.278460502625
Time taken for Grid Search loop:  1149.2336256504059


# Converting Results to Dataframe and Saving Csv:

In [107]:
def convert_res(results):
    df = pd.DataFrame(results, columns=['name', 'mean_score', 'mean_fit_time', 'param_C', 'param_gamma'])
    for i in range(len(kern_types)):
        df['name'][i*iters:(i+1)*iters] = kern_types[i]

    return df

In [111]:
results_df = convert_res(test_results)

In [15]:
results_df.sort_values(by = 'mean_score', ascending=False).head(10)

Unnamed: 0,name,mean_score,mean_fit_time,param_C,param_gamma
66,rbf,0.819836,213.773285,108.922825,0.987901
67,rbf,0.81605,208.640711,106.959446,0.851896
60,rbf,0.812801,226.951448,177.991492,0.610313
40,poly,0.811768,4074.293481,183.404385,1.010411
62,rbf,0.81136,185.002533,74.745104,0.829911
75,rbf,0.809897,186.730004,75.428172,0.774827
73,rbf,0.80966,189.94274,73.857453,0.773336
42,poly,0.808842,2182.817651,61.774699,1.088295
72,rbf,0.808391,179.107991,60.835432,0.794658
63,rbf,0.806411,172.883391,61.956224,0.74372


In [114]:
#results_df.to_csv('Data/SVM_Gridsearch_Results.csv')

In [17]:
len(results_df)

80

# Comparing Linear SVC:

In [11]:
# RANDOM SEARCH FOR C PARAM - No Gamma param for linear kernels:
rand_list_lin = {"C": stats.expon(scale=50)}

In [19]:
# Comparison of Linear SVC to the Gridsearch times and accuracies above:

# Setting Static Variables
iters = 20
eval_metric = make_scorer(accuracy_score)

# Creating an empty array to hold results:
test_results = np.zeros((iters, 3))

# Using train-test split to split out a random, stratified 10% portion of the training data to grid search:
X_gs, X_leave, y_gs, y_leave = train_test_split(X_train, y_train, 
                                                test_size = 0.9,    # Inverse of the size of the data to search
                                                stratify = y_train,  # Keeping same proportion of target classes
                                               random_state = seed) 

# Defining the model
svm_lin_model = svm.LinearSVC()

start = time.time()
# Setting the random search parameters:
rand_search = RandomizedSearchCV(svm_lin_model, 
                                 param_distributions = rand_list_lin, # Sets to params specified above 
                                 n_iter = iters,                  # No. sampled parameter settings, 10 = default
                                 n_jobs = -1,                     # Set to -1 to use all available CPU processors
                                 cv = 4,                          # Cross-validation folds
                                 scoring = eval_metric)           # Scoring metric chosen

# Fitting the random search to the subsetted data:
rand_search.fit(X_gs, y_gs)

# CV Results
cv_res = rand_search.cv_results_

# Appending Results:
test_results[0: iters, 0] = cv_res['mean_test_score']
test_results[0: iters, 1] = cv_res['mean_fit_time']
test_results[0: iters, 2] = [d['C'] for d in rand_search.cv_results_['params']]

end = time.time()
time_taken = end - start

print("Time taken for Grid Search loop: ", time_taken)

Time taken for Grid Search loop:  248.7047634124756


In [21]:
linearSVC_df = pd.DataFrame(test_results, columns=['mean_score', 'mean_fit_time', 'param_C'])

In [23]:
linearSVC_df.sort_values(by = 'mean_score', ascending=False).head(10)

Unnamed: 0,mean_score,mean_fit_time,param_C
7,0.714393,32.686381,13.789707
16,0.714049,33.319493,13.452727
1,0.714028,35.11981,21.855043
0,0.713834,36.482925,27.935964
2,0.713769,38.151093,31.716857
12,0.713705,31.240626,8.563421
8,0.713597,35.687535,27.27531
13,0.713576,24.162505,4.4014
5,0.71349,23.315981,4.289423
4,0.712995,36.640827,32.181676


In [24]:
linearSVC_df.to_csv('Data/SVM_LinearSVC_Results.csv')