# Part 10 Model Selection

In [1]:
# Importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## 10.1 k-Fold Cross Validation
- **STEP 1**: Split the training set into $k$ folds.
- **STEP 2**: Train the model on $k-1$ folds and evaluate it on the one remaining (held-out) fold.
- **STEP 3**: Repeat this $k$ times so that each fold serves as the held-out fold exactly once, giving $k$ train/evaluation combinations in total.
- **STEP 4**: Take the mean of the $k$ accuracies and compute their standard deviation.

In [2]:
# Read the Social Network Ads CSV into a DataFrame and preview its first five rows
dataset = pd.read_csv(filepath_or_buffer='Social_Network_Ads.csv')
dataset.head(n=5)

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19.0,19000.0,0
1,15810944,Male,35.0,20000.0,0
2,15668575,Female,26.0,43000.0,0
3,15603246,Female,27.0,57000.0,0
4,15804002,Male,19.0,76000.0,0


In [3]:
# Features (X): every column from position 2 up to, but excluding, the last one.
# Target (y): the last column.
X, y = (
    dataset.iloc[:, 2:-1].to_numpy(),
    dataset.iloc[:, -1].to_numpy(),
)

In [4]:
# Hold out 20% of the samples as a test set; the fixed seed keeps the split reproducible
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [5]:
# Standardise the features: the scaler is fitted on the training set only,
# and the same fitted scaler is then applied to both the training and the test set
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [6]:
# Train an RBF-kernel support vector classifier on the scaled training data
from sklearn.svm import SVC
classifier = SVC(kernel='rbf', random_state=0).fit(X_train, y_train)
# Show the fitted estimator (fit() returns the estimator itself)
classifier

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=0,
    shrinking=True, tol=0.001, verbose=False)

For the `cross_val_score` function, we need to specify:
- `estimator`: The object to use to fit the data.
- `X`: The data to fit.
- `y`: The target variable to try to predict in the case of supervised learning.
- `cv`: Determines the cross-validation splitting strategy. 

In [7]:
# Score the classifier with 10-fold cross validation on the training set,
# then summarise the ten fold accuracies by their mean and standard deviation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)
print('Mean Accuracy:', np.mean(accuracies))
print('Standard Deviation:', np.std(accuracies))

Mean Accuracy: 0.897146871945259
Standard Deviation: 0.0478476880711251




## 10.2 Grid Search 

For the `GridSearchCV` class, we need to specify the following parameters:
- `estimator`: This is assumed to implement the scikit-learn estimator interface.
- `param_grid`: Dictionary with parameters names (string) as keys and lists of parameter settings to try as values, or a list of such dictionaries, in which case the grids spanned by each dictionary in the list are explored.
- `scoring`: A single string or a callable to evaluate the predictions on the test set.
- `cv`: Determines the cross-validation splitting strategy.
- `n_jobs`: Number of jobs to run in parallel. `-1` means using all processors.

After fitting, the results are read from the following attributes:
- `best_score_`: Mean cross-validated score of the `best_estimator_`.
- `best_params_`: Parameter setting that gave the best results on the hold out data.

In [8]:
# Grid search over SVM hyper-parameters, scored by 10-fold cross-validated accuracy
from sklearn.model_selection import GridSearchCV

# Two sub-grids: a linear kernel (C only) and an RBF kernel (C combined with gamma = 0.1 ... 0.9)
parameters = [
    {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
    {'C': [1, 10, 100, 1000], 'kernel': ['rbf'], 'gamma': np.arange(0.1, 1, 0.1).tolist()},
]

# n_jobs=-1 runs the candidate fits on all available processors
grid_search = GridSearchCV(
    estimator=classifier,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
).fit(X_train, y_train)

# Best mean CV score and the parameter combination that produced it
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print('Best Accuracy:', best_accuracy)
print('Best Parameters:', best_parameters)

Best Accuracy: 0.90625
Best Parameters: {'C': 1, 'gamma': 0.9, 'kernel': 'rbf'}




## 10.3 XGBoost
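For `XGBoost`, feature scaling is not necessary: its tree-based learners split on feature thresholds, so rescaling a feature does not change the splits they can find.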

In [9]:
# Read the Churn Modelling CSV into a DataFrame and preview its first five rows
dataset = pd.read_csv(filepath_or_buffer='Churn_Modelling.csv')
dataset.head(n=5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [10]:
# Features (X): every column from position 3 up to, but excluding, the last one.
# Target (y): the last column.
X, y = (
    dataset.iloc[:, 3:-1].to_numpy(),
    dataset.iloc[:, -1].to_numpy(),
)

In [11]:
# Encoding the categorical variables
# Per the dataset preview, X[:, 1] holds Geography and X[:, 2] holds Gender (both strings).
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Integer-code the two string columns in place.
labelEncoder_X_1 = LabelEncoder()
X[:, 1] = labelEncoder_X_1.fit_transform(X[:, 1])
labelEncoder_X_2 = LabelEncoder()
X[:, 2] = labelEncoder_X_2.fit_transform(X[:, 2])

# One-hot encode column 1 only.
# OneHotEncoder's `categorical_features` argument was deprecated in scikit-learn 0.20
# and removed in 0.22; ColumnTransformer is the supported way to encode selected columns.
# ColumnTransformer fits a clone, so the fitted encoder is available afterwards as
# columnTransformer.named_transformers_['geography'].
oneHotEncoder = OneHotEncoder()
columnTransformer = ColumnTransformer(
    transformers=[('geography', oneHotEncoder, [1])],
    remainder='passthrough',  # keep the other columns, placed after the dummy columns
    sparse_threshold=0,       # always return a dense array
)

# X is an object array (mixed strings/numbers), so the stacked result is object too;
# cast to float so downstream estimators receive a numeric matrix.
X = np.asarray(columnTransformer.fit_transform(X), dtype=float)

# Drop the first dummy column to avoid the dummy variable trap.
X = X[:, 1:]

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [12]:
# Hold out 20% of the samples as a test set; the fixed seed keeps the split reproducible
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [16]:
# Train a gradient-boosted tree classifier (default hyper-parameters) on the training set
from xgboost import XGBClassifier
classifier = XGBClassifier().fit(X_train, y_train)
# Show the fitted estimator (fit() returns the estimator itself)
classifier

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [18]:
# Predicting the test set results
# `classifier` is the XGBoost model fitted in the preceding cell; the bare
# expression on the last line makes the notebook display the predicted labels.
y_pred = classifier.predict(X_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [20]:
# Build the confusion matrix: rows are the true classes, columns the predicted classes
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[1521,   74],
       [ 197,  208]])

In [21]:
# Score the XGBoost classifier with 10-fold cross validation on the training set,
# then summarise the ten fold accuracies by their mean and standard deviation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)
print('Accuracy Mean:', np.mean(accuracies))
print('Accuracy Deviation:', np.std(accuracies))

Accuracy Mean: 0.8629994451163204
Accuracy Deviation: 0.010677872171663988
