## Model Selection

#### 1. Cross Validation
#### 2. Grid Search

In [1]:
# --- Libraries ---
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# --- Data ---
# Columns at positions 2 and 3 become the feature matrix; the column at
# position 4 becomes the target vector.
dataset = pd.read_csv('Social_Network_Ads.csv')
X = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, 4].values

# --- Train/test split ---
# Hold out 25% of the rows; random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

# --- Feature scaling ---
# The scaler is fitted on the training rows only and then reused, unchanged,
# to transform the test rows.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

#### Original Process
- Fitting Kernel SVM to the Training set
- Predicting the Test set results
- Making the Confusion Matrix (can be used to validate performance, but it is not the best method)

In [8]:

from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

# Kernel SVM (RBF) trained on the scaled training set; fit() returns the
# estimator itself, so construction and training are chained.
classifier = SVC(kernel='rbf', random_state=0).fit(X_train, y_train)

# Predicted labels for the held-out test set.
y_pred = classifier.predict(X_test)

# Confusion matrix of true vs. predicted test labels, shown as the cell output.
cm = confusion_matrix(y_test, y_pred)
cm


array([[64,  4],
       [ 3, 29]], dtype=int64)

#### Applying k-Fold Cross Validation

- Use different folds to test the model's performance, which is more robust than a single train/test split
- Evaluate both average and standard deviation

In [7]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the classifier on the training set; the result
# holds one score per fold.
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)

# Report the mean and the spread of the fold scores. The spread is computed
# with .std(), so it is labelled as a standard deviation, not a variance.
print("Average  = ", accuracies.mean())
print("Std Dev  = ", accuracies.std())

Average  =  0.9005302187615868
Variance =  0.06388957356626285


#### Apply Grid Search to find the best model and the best parameters

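- C: regularization (penalty) parameter. Regularization strength is inversely proportional to C, so smaller values of C regularize more strongly and help prevent overfitting, while larger values fit the training data more closely.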

In [12]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameter grids; each dict is searched independently and
# every combination of values inside a dict is evaluated.
parameters = [
    # First option: linear kernel, varying only C.
    {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
    # Second option: RBF kernel, varying both C and gamma.
    {'C': [1, 10, 100, 1000],
     'kernel': ['rbf'],
     'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]},
]

# Score each candidate by accuracy with 10-fold cross-validation on the
# training set; n_jobs=-1 is passed to run the search in parallel.
grid_search = GridSearchCV(
    estimator=classifier,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)

# fit() returns the fitted GridSearchCV itself, so rebinding the same name
# keeps the cell free of output without creating a second, misspelled variable.
grid_search = grid_search.fit(X_train, y_train)


In [13]:
# Pull the best cross-validated score and the parameter combination that
# achieved it from the fitted grid search, then print both.
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_

print("Best Accuracy", best_accuracy)
print("Best Parameters", best_parameters)


Best Accuracy 0.9033333333333333
Best Parameters {'gamma': 0.7, 'C': 1, 'kernel': 'rbf'}
