SIB - P7
A Jupyter Notebook with examples of how to use cross validation and grid search. 7/11/2022

In [1]:
# imports
from si.io.csv import read_csv
from si.linear_model.logistic_regression import LogisticRegression
from si.model_selection.cross_validate import cross_validate
from si.model_selection.grid_search import grid_search_cv

In [2]:
# Load the breast-bin dataset from its CSV file.
# NOTE(review): features=False / label=True presumably mean "no feature-name
# header row" and "the file contains a label column" - confirm against read_csv.
breast_bin_dataset = read_csv(
    '../datasets/breast-bin.csv',
    features=False,
    label=True,
)

In [3]:
# Standardization of the feature matrix
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the full feature matrix and overwrite the dataset's X
# with the transformed values (the original, unscaled X is not kept).
scaler = StandardScaler()
breast_bin_dataset.X = scaler.fit_transform(breast_bin_dataset.X)

# Display the scaled features as the cell output.
breast_bin_dataset.X

array([[ 0.20885295, -0.69912815, -0.74242297, ..., -1.0000359 ,
        -0.61132565, -0.34418721],
       [-0.8578253 , -0.69912815, -0.74242297, ..., -0.58991542,
        -0.61132565, -0.34418721],
       [-0.8578253 , -0.69912815, -0.74242297, ..., -0.17979494,
        -0.61132565, -0.34418721],
       ...,
       [ 0.20885295, -0.37139715, -0.40592217, ..., -1.0000359 ,
        -0.61132565,  0.23892607],
       [-0.8578253 , -0.04366616, -0.40592217, ..., -0.17979494,
        -0.61132565, -0.34418721],
       [ 0.91997179,  0.93952681,  0.94008103, ...,  1.46068699,
        -0.61132565, -0.34418721]])

In [4]:
# Cross validation of a LogisticRegression built with its default arguments.
# The result shown below is a dict with 'seeds', 'train' and 'test' lists,
# each holding one entry per run (cv=5 -> five entries).
lg = LogisticRegression()
scores = cross_validate(
    lg,
    breast_bin_dataset,
    cv=5,
)
scores

{'seeds': [146, 971, 533, 132, 390],
 'train': [0.9660107334525939,
  0.9677996422182469,
  0.9695885509838998,
  0.9677996422182469,
  0.964221824686941],
 'test': [0.9712230215827338,
  0.9640287769784173,
  0.9568345323741008,
  0.9640287769784173,
  0.9784172661870504]}

In [24]:
# Grid search with cross validation

# Candidate values for each hyperparameter. Judging by the displayed results,
# every combination of these values gets its own entry in the output.
parameter_grid = dict(
    l2_penalty=(1, 10),
    alpha=(0.001, 0.0001, 0.00001),
    max_iter=(1000, 2000, 3000, 4000, 5000, 6000),
)

# Model whose hyperparameters are being searched.
lg = LogisticRegression()

# Run the search with cv=3. The result is a list of dicts, each carrying the
# 'seeds', 'train' and 'test' lists plus the 'parameters' that produced them.
scores = grid_search_cv(
    lg,
    breast_bin_dataset,
    parameter_grid=parameter_grid,
    cv=3,
)

scores

[{'seeds': [45, 612, 421],
  'train': [0.9713774597495528, 0.964221824686941, 0.9731663685152058],
  'test': [0.9496402877697842, 0.9784172661870504, 0.9424460431654677],
  'parameters': {'l2_penalty': 1, 'alpha': 0.001, 'max_iter': 1000}},
 {'seeds': [700, 555, 572],
  'train': [0.9660107334525939, 0.9713774597495528, 0.9749552772808586],
  'test': [0.9784172661870504, 0.9568345323741008, 0.9424460431654677],
  'parameters': {'l2_penalty': 1, 'alpha': 0.001, 'max_iter': 2000}},
 {'seeds': [70, 768, 128],
  'train': [0.9660107334525939, 0.9660107334525939, 0.9695885509838998],
  'test': [0.9784172661870504, 0.9784172661870504, 0.9568345323741008],
  'parameters': {'l2_penalty': 1, 'alpha': 0.001, 'max_iter': 3000}},
 {'seeds': [827, 331, 433],
  'train': [0.9695885509838998, 0.9660107334525939, 0.964221824686941],
  'test': [0.9640287769784173, 0.9784172661870504, 0.9928057553956835],
  'parameters': {'l2_penalty': 1, 'alpha': 0.001, 'max_iter': 4000}},
 {'seeds': [141, 550, 965],
  't

In [25]:
import pandas as pd

# Column layout: the searched hyperparameters first (taken from the first
# result), followed by the train score, test score and run index.
cols = [*scores[0]['parameters'], 'train', 'test', 'cv']

# Flatten the grid search results into long format: one row per
# (parameter combination, run) pair, stored as a dict of column lists.
dict_df = {col: [] for col in cols}
for score in scores:
    fold_scores = zip(score['train'], score['test'])
    for fold, (train_score, test_score) in enumerate(fold_scores):
        # Every row repeats the parameters of the result it belongs to.
        entries = [
            ('cv', fold),
            ('train', train_score),
            ('test', test_score),
            *score['parameters'].items(),
        ]
        for col, value in entries:
            dict_df[col].append(value)

df = pd.DataFrame(dict_df)
df

Unnamed: 0,l2_penalty,alpha,max_iter,train,test,cv
0,1,0.00100,1000,0.971377,0.949640,0
1,1,0.00100,1000,0.964222,0.978417,1
2,1,0.00100,1000,0.973166,0.942446,2
3,1,0.00100,2000,0.966011,0.978417,0
4,1,0.00100,2000,0.971377,0.956835,1
...,...,...,...,...,...,...
103,10,0.00001,5000,0.964222,0.978417,1
104,10,0.00001,5000,0.969589,0.956835,2
105,10,0.00001,6000,0.969589,0.956835,0
106,10,0.00001,6000,0.964222,0.978417,1
