In [1]:
# cross validation: checks that the model fits the data accurately and does not overfit (very common with neural networks)
# overfitting: the model fits the training set almost perfectly but performs badly on test data,
# i.e. the training loss keeps decreasing while the test loss keeps increasing
# always monitor both training and testing loss when training neural networks
# ^ stop training when the validation loss reaches its minimum value


In [2]:
# Occam's razor: do not complicate things that can be solved in a simple manner

In [3]:
# type of cross validation depends upon type of dataset at hand 

In [5]:
# hold-out set cross validation: train on one part of the data and test on another
# k-fold cross validation: shuffle the rows, then split the data into k folds that are mutually exclusive
# stratified k-fold cross validation (default choice for classification problems; not directly usable for regression)
    #: data - stratified sampling (the ratio of classes stays the same in every fold)
    # to use it for regression, first divide the target into bins and stratify on the bins
    # Sturges' rule: number of bins = 1 + log2(N) (for small datasets)
    
# hold-out based validation: e.g. create 10 stratified folds and keep a single one as the hold-out on which all metrics are computed
# leave-one-out cross validation: k-fold cross validation where k = n, which results in n folds,
    # where the model is trained on all samples except one, and that single held-out sample is used for validation
# group k-fold cross validation: all samples of a group stay in the same fold, e.g., considering a patient as a single group

In [6]:
# split your data first, whether you are performing feature engineering or model training 

In [1]:
# examples

In [14]:
import pandas as pd 
from sklearn import model_selection

In [9]:
# Load the label CSV and preview the first rows (a single 'class' column,
# as the rendered output of this cell shows).
labels_csv = 'data/01_mnist_lbl.csv'
df = pd.read_csv(labels_csv)
df.head()

Unnamed: 0,class
0,5
1,0
2,4
3,1
4,9


In [10]:
# Add the fold column with -1 as a placeholder meaning "not assigned to a fold yet".
df['kfold']=-1

In [11]:
# Shuffle all rows before splitting, then drop the old index so the row
# labels run 0..n-1 again.
# random_state pins the permutation: without it every kernel restart yields a
# different fold assignment and the results cannot be reproduced.
df=df.sample(frac=1, random_state=42).reset_index(drop=True)

In [12]:
# Inspect the shuffled frame; every row still carries the -1 placeholder in 'kfold'.
df

Unnamed: 0,class,kfold
0,8,-1
1,1,-1
2,8,-1
3,4,-1
4,6,-1
...,...,...
69995,8,-1
69996,0,-1
69997,4,-1
69998,1,-1


In [15]:
# Plain 5-fold splitter. No shuffle option is passed here; the rows were
# already randomised with df.sample(frac=1) in an earlier cell.
kf = model_selection.KFold(n_splits=5)

In [18]:
# Record, for every row, the fold in which it serves as validation data.
# kf.split yields *positional* indices, so the write goes through .iloc:
# the assignment stays correct even if df does not carry a default 0..n-1
# index (label-based .loc only matched because of the earlier reset_index).
kfold_col = df.columns.get_loc('kfold')
for fold, (_, val_idx) in enumerate(kf.split(X=df)):
    df.iloc[val_idx, kfold_col] = fold

In [23]:
# Sanity check: only real fold ids should remain, i.e. no -1 placeholder left over.
df['kfold'].unique()

array([0, 1, 2, 3, 4], dtype=int64)

In [28]:
# Count the rows that landed in each fold, using the non-null 'class' entries.
fold_sizes = df.groupby('kfold', as_index=False).agg(rows=('class', 'count'))
fold_sizes

Unnamed: 0,kfold,rows
0,0,14000
1,1,14000
2,2,14000
3,3,14000
4,4,14000


In [29]:
# Load the image CSV.
# NOTE: this rebinds `df`; the label frame with the 'kfold' column built in the
# earlier cells is no longer reachable under this name.
df=pd.read_csv('data/01_mnist_img.csv')

In [30]:
# Load the label CSV again into its own frame; it is used as the
# stratification target and later stacked next to the image columns.
df_2=pd.read_csv('data/01_mnist_lbl.csv')

In [32]:
# Stratified 5-fold splitter: the split is driven by the class labels in y.
ksf=model_selection.StratifiedKFold(n_splits=5)

# Pass the label column itself (1-D) as y instead of the whole frame, so the
# split does not depend on df_2 happening to contain exactly one column.
# The train/validation indices are not used here; only the fold number is shown.
for fold, _ in enumerate(ksf.split(X=df, y=df_2['class'])):
    print(fold)

0
1
2
3
4


In [33]:
# for regression 

In [36]:
import numpy as np

# Sturges' rule: number of bins = 1 + log2(N), rounded down to a whole number.
n_samples = len(df)
num_bins = int(np.floor(1 + np.log2(n_samples)))

In [37]:
# Display the bin count computed from Sturges' rule.
num_bins

17

In [41]:
# Put the label column to the right of the image columns. Going through a raw
# NumPy array drops the original column names, so the resulting frame is
# labelled by position (0, 1, 2, ...), with the label in the last column.
stacked = np.column_stack([df, df_2])
out = pd.DataFrame(stacked)

In [47]:
# Column 784 is the last stacked column, i.e. the label values taken from df_2.
out[784].unique()

array([5., 0., 4., 1., 9., 2., 3., 6., 7., 8.])

In [44]:
# Discretise the target (column 784) into `num_bins` equal-width intervals.
# labels=False keeps just the integer bin index, which can then serve as the
# stratification key for a regression-style target.
target_bins = pd.cut(out[784], bins=num_bins, labels=False)
out["bins"] = target_bins


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,776,777,778,779,780,781,782,783,784,bins
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,9
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,7
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,3
69996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,5
69997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,7
69998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,9
