## This file covers code for Support Vector Machines

Importing relevant libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, scale 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, roc_auc_score


In [2]:
# Source CSV for the analysis (the file name suggests outliers were removed upstream).
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run on another machine without editing it.
filepath = r'/Users/ureemjames/Downloads/Output_NoOutliers.csv'
data = pd.read_csv(filepath_or_buffer=filepath)

In [3]:
# Label of the first column in the CSV, looked up by position; it is removed below.
columns_to_drop = data.columns[0]

In [4]:
# Remove the selected column from `data` itself (the frame is modified in place).
# NOTE(review): because of the in-place mutation, re-running this cell on its own
# raises KeyError, since the column is already gone.
data.drop(labels=[columns_to_drop], axis="columns", inplace=True)

In [6]:
# Columns treated as categorical; these are the ones passed to pd.get_dummies.
object_variables = [
    "RIAGENDR", "RIDRETH3", "DMDBORN4", "DMDEDUC2", "DMDMARTZ",
    "BPQ020", "BPQ080",
    "CDQ001", "CDQ010",
    "HSQ590",
    "FSD162",
    "HEQ010", "HEQ030",
    "IMQ011", "IMQ020",
    "KIQ022", "KIQ044",
    "DPQ020",
    "MCQ010", "MCQ080", "MCQ092",
    "PAQ620", "PAQ665",
    "RHQ131",
    "SLQ050",
    "SMQ020",
]

In [7]:
# One-hot encode the categorical columns. Each indicator column is prefixed with
# its source column name (the default), and the first level of every variable is
# dropped so the indicators are not perfectly collinear.
encoded_data = pd.get_dummies(
    data,
    columns=object_variables,
    drop_first=True,
)

Now that we have dropped the Sequence ID number and one-hot encoded the categorical variables, we have our final data set. Let's now partition the data into training and testing sets.

In [8]:
# Name of the column the classifier is trained to predict.
target_variable = "DIQ010"

In [9]:
# Feature matrix: every encoded column except the target.
predictors_df = encoded_data.drop(labels=target_variable, axis="columns")

In [10]:
# Target vector: the single column named by `target_variable`.
target_df = encoded_data.loc[:, target_variable]

In [11]:
from sklearn.model_selection import train_test_split

# A single seed is used for NumPy's global RNG and for the split itself.
random_number = 1000
np.random.seed(random_number)

# Hold out 20% of the rows for testing; the remaining 80% are used for training.
X_train, X_test, Y_train, Y_test = train_test_split(
    predictors_df,
    target_df,
    test_size=0.2,
    train_size=0.8,
    random_state=random_number,
)

In [12]:
# Share of each class in the training split (class counts divided by the number of rows).
Y_train.value_counts().div(len(Y_train))

0    0.820272
1    0.179728
Name: DIQ010, dtype: float64

In [13]:
# Share of each class in the test split (class counts divided by the number of rows).
Y_test.value_counts().div(len(Y_test))

0    0.828224
1    0.171776
Name: DIQ010, dtype: float64

In [16]:
# Fit the scaler on the training split only, then apply the same fitted
# transformation to both splits so no test-set statistics leak into training.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [17]:
from sklearn.svm import SVC

In [18]:
# Baseline: an SVC with all-default settings, trained on the scaled training split.
model = SVC().fit(X_train_scaled, Y_train)
model

SVC()

In [19]:
# Score of the baseline model on the held-out (scaled) test split.
model.score(X=X_test_scaled, y=Y_test)

0.8979915433403806

## Now let's try to optimize this model.

In [20]:
# Search space for the RBF-kernel SVC: every combination of C and gamma below.
param_grid = [
    {
        'C': [0.5, 1, 10, 100],
        'gamma': ['scale', 1, 0.1, 0.01, 0.001, 1e-4, 1e-6],
        'kernel': ['rbf'],
    },
]

In [21]:
# Exhaustive 5-fold cross-validated search over `param_grid`, ranked by accuracy.
# n_jobs=-1 runs the independent (parameter, fold) fits on all available CPU
# cores; it only shortens wall-clock time and does not affect which parameters win.
# NOTE(review): the classes are imbalanced (roughly 82% / 18% in the training
# split), so accuracy may not be the most informative selection metric. Confirm
# that this is intended.
original_params = GridSearchCV(
    model,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=0,
)
original_params.fit(X_train_scaled, Y_train)

# Best hyper-parameter combination found, and the estimator built with it.
best_params = original_params.best_params_
best_estimator = original_params.best_estimator_

print(best_params)

{'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}


In [26]:
from sklearn.metrics import confusion_matrix

# Evaluate the tuned estimator on the held-out test split.
y_pred = best_estimator.predict(X_test_scaled)
test_accuracy = best_estimator.score(X_test_scaled, Y_test)
precision_test = precision_score(Y_test, y_pred)
recall_test = recall_score(Y_test, y_pred)
f1_score_test = f1_score(Y_test, y_pred)

# Row 0 of the confusion matrix holds the true-negative and false-positive counts
# (rows are true labels, columns are predicted labels).
conf_matrix = confusion_matrix(Y_test, y_pred)
true_negatives, false_positives = conf_matrix[0, 0], conf_matrix[0, 1]
# Specificity = TN / (TN + FP): the share of actual negatives predicted as negative.
specificity = true_negatives / (true_negatives + false_positives)

# Every metric is separated by "; " so the summary line reads consistently.
print(
    f"Test Accuracy: {test_accuracy:.2f}; Precision: {precision_test}; "
    f"Recall: {recall_test}; Specificity: {specificity}; F1 Score: {f1_score_test}."
)

Test Accuracy: 0.91; Precision: 0.8387096774193549: Recall: 0.56; Specificity: 0.9776643267389917 F1 Score: 0.6715867158671587.
