## Section 2.1

### The book discusses these results on the Titanic dataset; for simplicity, this notebook uses the Iris dataset instead

In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [2]:
# Load the Iris dataset, then pull out its `data` and `target` arrays.
iris = load_iris()
X = iris.data
y = iris.target

In [3]:
# Hold out 20% of the samples for testing. The seed is fixed so that re-running
# the notebook from a fresh kernel produces the same shuffled split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

#### 1. Scratch Implementation

In [4]:
# Defining Search Space
# Each hyperparameter gets 13 values: its base multiplied by 10**1 ... 10**13.

c = 0.001
gamma = 1e-10
param_grid = {
    hp_name: [base * (10**exponent) for exponent in range(1, 14)]
    for hp_name, base in (("C", c), ("gamma", gamma))
}

In [5]:
from itertools import product

def make_sets(grid):
    """Expand a hyperparameter grid into every possible combination.

    Parameters
    ----------
    grid : dict
        Maps each hyperparameter name to an iterable of candidate values.

    Returns
    -------
    list of dict
        One dict per combination, mapping every hyperparameter name to a
        single value. Combinations follow ``itertools.product`` order, so the
        last key in ``grid`` varies fastest.
    """
    # Use the `grid` argument itself (not a module-level grid) so the function
    # works for any search space passed in.
    hp_keys = list(grid)
    return [dict(zip(hp_keys, values)) for values in product(*grid.values())]

# Preview the first five hyperparameter combinations generated from the grid.
make_sets(param_grid)[:5]

[{'C': 0.01, 'gamma': 1e-09},
 {'C': 0.01, 'gamma': 1e-08},
 {'C': 0.01, 'gamma': 1.0000000000000001e-07},
 {'C': 0.01, 'gamma': 1e-06},
 {'C': 0.01, 'gamma': 1e-05}]

In [6]:
def grid_search(clf, grid, X_train, y_train, X_test, y_test):
    """Evaluate every hyperparameter combination in ``grid`` and pick the best.

    For each combination returned by ``make_sets(grid)``, a new model is built
    with ``clf(**hp_set)``, fitted on the training data, and scored on both the
    training and the test data through ``model.score``.

    Parameters
    ----------
    clf : callable
        Estimator class (or factory) invoked as ``clf(**hp_set)``; the object
        it returns must provide ``fit(X, y)`` and ``score(X, y)``.
    grid : dict
        Maps each hyperparameter name to the values to try.
    X_train, y_train
        Data passed to ``model.fit`` and used for the train score.
    X_test, y_test
        Data used for the test score that ranks the combinations.

    Returns
    -------
    logs : list of dict
        One entry per combination, in evaluation order, with the keys
        ``"hp"``, ``"train_score"`` and ``"test_score"``.
    best_hp_set : dict
        ``"best_test_score"`` and ``"hp_set"`` of the first combination that
        reached the highest test score. When no combination was evaluated the
        values are ``0.0`` and ``None``.

    Notes
    -----
    The winning combination is chosen using the same test data whose score is
    reported, so ``best_test_score`` is an optimistic estimate.
    """
    logs = list()
    best_hp_set = {
        "best_test_score": 0.0,
        "hp_set": None,
    }
    # iterates over all the sets
    for hp_set in make_sets(grid):
        model = clf(**hp_set)
        model.fit(X_train, y_train)
        train_score = model.score(X_train, y_train)
        test_score = model.score(X_test, y_test)

        logs.append(
            {
                "hp": hp_set,
                "train_score": train_score,
                "test_score": test_score,
            }
        )

        # Always accept the first evaluated combination so "hp_set" is filled
        # in even when no score exceeds the 0.0 starting value (e.g. scores
        # that are zero or negative); afterwards only a strictly better score
        # replaces the current best.
        is_first = best_hp_set["hp_set"] is None
        if is_first or test_score > best_hp_set["best_test_score"]:
            best_hp_set["best_test_score"] = test_score
            best_hp_set["hp_set"] = hp_set

    return logs, best_hp_set

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [8]:
# Run the scratch grid search with SVC over the whole search space.
# Note: grid_search ranks the combinations by their score on the test split.
logs, best = grid_search(
    clf=SVC,
    grid=param_grid,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

In [9]:
# Show the best test score and the hyperparameter set that achieved it.
print(best)

{'best_test_score': 0.9666666666666667, 'hp_set': {'C': 0.1, 'gamma': 0.1}}


#### 2. Using GridSearchCV from Scikit-Learn

In [10]:
from sklearn.model_selection import GridSearchCV

In [11]:
# Grid search with k-fold cross validation (k=3) over the same search space

clf = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=3)
clf.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=SVC(),
             param_grid={'C': [0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0,
                               100000.0, 1000000.0, 10000000.0, 100000000.0,
                               1000000000.0, 10000000000.0],
                         'gamma': [1e-09, 1e-08, 1.0000000000000001e-07, 1e-06,
                                   1e-05, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0,
                                   100.0, 1000.0]})

In [12]:
# Display the estimator that GridSearchCV selected as best.
clf.best_estimator_

SVC(gamma=1.0)

In [13]:
# print(f'Test Score: {clf.score(X_test, y_test)}')
# print(f'Train Score: {clf.score(X_train, y_train)}')