In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split as tts
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn import tree

### Raw data, predictors and target

In [2]:
# Load the 'PlayGolf' sheet of the sample workbook.
# `sheet_name` is the supported keyword; the older `sheetname` spelling was
# deprecated in pandas 0.21 and is rejected by later releases.
df = pd.read_excel('IG_sample.xlsx', sheet_name='PlayGolf')
display(df)

# Predictors only: the target column is deliberately kept out of X so that
# anything fitted on X cannot simply read the answer off a feature.
feat_col = ['Temperature', 'Outlook', 'Humidity', 'Windy']
targ_col = ['PlayGolf']
X = df[feat_col]
y = df[targ_col]

# Class balance of the target: number of rows per PlayGolf value.
display(df.groupby(by=targ_col).size().to_frame())

Unnamed: 0,Day,Temperature,Outlook,Humidity,Windy,PlayGolf
0,2017-07-05,hot,sunny,high,False,no
1,2017-07-06,hot,sunny,high,True,no
2,2017-07-07,hot,overcast,high,False,yes
3,2017-07-09,cool,rain,normal,False,yes
4,2017-07-10,cool,overcast,normal,True,yes
5,2017-07-12,mild,sunny,high,False,no
6,2017-07-14,cool,sunny,normal,False,yes
7,2017-07-15,mild,rain,normal,False,yes
8,2017-07-20,mild,sunny,normal,True,yes
9,2017-07-21,mild,overcast,high,True,yes


Unnamed: 0_level_0,0
PlayGolf,Unnamed: 1_level_1
no,5
yes,9


### Stratified sampling based on the target, using *train_test_split*

#### random sampling generating undesired splits

In [3]:
# Plain random hold-out (test_size=0.33, fixed seed) with no stratification,
# so the class mix of the test set is left to chance.
X_train, X_test, y_train, y_test = tts(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Show which rows landed in each part, then the labels of the test rows.
display(X_train.index, X_test.index, y_test)

Int64Index([8, 2, 1, 13, 4, 7, 10, 3, 6], dtype='int64')

Int64Index([9, 11, 0, 12, 5], dtype='int64')

Unnamed: 0,PlayGolf
9,yes
11,no
0,no
12,no
5,no


#### stratify option in train_test_split, generating desired splits

In [4]:
# Same hold-out as before, but stratified on the target y.
X_train, X_test, y_train, y_test = tts(
    X,
    y,
    test_size=0.33,
    random_state=42,
    stratify=y,
)

# Row membership of each part, followed by the test-set labels.
display(X_train.index, X_test.index, y_test)

Int64Index([1, 6, 9, 12, 3, 5, 8, 2, 4], dtype='int64')

Int64Index([10, 13, 0, 7, 11], dtype='int64')

Unnamed: 0,PlayGolf
10,yes
13,yes
0,no
7,yes
11,no


#### stratify using one predictor + response

In [5]:
# Stratify on the joint (Windy, PlayGolf) combination rather than on the
# target alone.
y_split = df[['Windy', 'PlayGolf']]
X_train, X_test, y_train, y_test = tts(
    X,
    y,
    test_size=0.33,
    random_state=42,
    stratify=y_split,
)
display(X_train.index, X_test.index)

# Overall row count per Windy value, then the stratification columns and the
# labels of the rows that ended up in the test set.
windy_counts = df.groupby(by='Windy').apply(len).to_frame()
display(windy_counts, y_split.loc[y_test.index], y_test)

Int64Index([0, 1, 11, 3, 4, 13, 2, 7, 8], dtype='int64')

Int64Index([9, 10, 5, 12, 6], dtype='int64')

Unnamed: 0_level_0,0
Windy,Unnamed: 1_level_1
False,8
True,6


Unnamed: 0,Windy,PlayGolf
9,True,yes
10,False,yes
5,False,no
12,True,no
6,False,yes


Unnamed: 0,PlayGolf
9,yes
10,yes
5,no
12,no
6,yes


### K-fold using *StratifiedKFold* 

#### SKF using response, without shuffling

In [6]:
# Stratified 5-fold split on the target, without shuffling.
# `random_state` is intentionally not passed: it has no effect when
# shuffle=False, and recent scikit-learn releases raise a ValueError if it is
# supplied in that case.
skf = StratifiedKFold(n_splits=5)
print(skf.get_n_splits(X, y))
for train_index, test_index in skf.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    # split() yields positional indices, so rows are selected with .iloc
    # (label-based .loc only matches while the index is 0..n-1).
    print(y.iloc[test_index])

5
('TRAIN:', array([ 1,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13]), 'TEST:', array([0, 2, 3]))
  PlayGolf
0       no
2      yes
3      yes
('TRAIN:', array([ 0,  2,  3,  5,  7,  8,  9, 10, 11, 12, 13]), 'TEST:', array([1, 4, 6]))
  PlayGolf
1       no
4      yes
6      yes
('TRAIN:', array([ 0,  1,  2,  3,  4,  6,  9, 10, 11, 12, 13]), 'TEST:', array([5, 7, 8]))
  PlayGolf
5       no
7      yes
8      yes
('TRAIN:', array([ 0,  1,  2,  3,  4,  5,  6,  7,  8, 12, 13]), 'TEST:', array([ 9, 10, 11]))
   PlayGolf
9       yes
10      yes
11       no
('TRAIN:', array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11]), 'TEST:', array([12, 13]))
   PlayGolf
12       no
13      yes


#### SKF using response, with shuffling

In [7]:
# Stratified 5-fold split on the target with shuffle=True and a fixed seed,
# so the fold membership is randomised but repeatable.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
print(skf.get_n_splits(X, y))
for train_index, test_index in skf.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    # split() yields positional indices, so rows are selected with .iloc
    # (label-based .loc only matches while the index is 0..n-1).
    print(y.iloc[test_index])

5
('TRAIN:', array([ 0,  2,  4,  5,  6,  7,  8,  9, 11, 12, 13]), 'TEST:', array([ 1,  3, 10]))
   PlayGolf
1        no
3       yes
10      yes
('TRAIN:', array([ 0,  1,  3,  4,  5,  6,  7,  9, 10, 11, 13]), 'TEST:', array([ 2,  8, 12]))
   PlayGolf
2       yes
8       yes
12       no
('TRAIN:', array([ 0,  1,  2,  3,  6,  7,  8,  9, 10, 11, 12]), 'TEST:', array([ 4,  5, 13]))
   PlayGolf
4       yes
5        no
13      yes
('TRAIN:', array([ 1,  2,  3,  4,  5,  8,  9, 10, 11, 12, 13]), 'TEST:', array([0, 6, 7]))
  PlayGolf
0       no
6      yes
7      yes
('TRAIN:', array([ 0,  1,  2,  3,  4,  5,  6,  7,  8, 10, 12, 13]), 'TEST:', array([ 9, 11]))
   PlayGolf
9       yes
11       no


#### SKF using a predictor + response

In [8]:
# Attempt to stratify K-fold on two columns at once (Windy + PlayGolf).
# The failure is the point of this cell: the resulting error is caught and
# printed rather than left to abort the notebook.
y_split = df[['Windy', 'PlayGolf']]

# No random_state here: it has no effect when shuffle=False, and recent
# scikit-learn releases raise a ValueError if it is supplied in that case,
# which would fail before the demonstration below is even reached.
skf = StratifiedKFold(n_splits=3)

print(skf.get_n_splits(X, y_split))

try:
    for train_index, test_index in skf.split(X, y_split):
        print("TRAIN:", train_index, "TEST:", test_index)
        # split() yields positional indices, hence .iloc rather than .loc.
        print(y.iloc[test_index])
except ValueError as e:
    # Only the expected "unsupported target type" error is reported; any
    # other exception is a genuine problem and is allowed to surface.
    print(e)

3
Supported target types are: ('binary', 'multiclass'). Got 'unknown' instead.


#### SKF using a predictor

In [9]:
# Stratify the folds on a single predictor (Windy) instead of the target.
y_split = df['Windy']

# No random_state: it has no effect when shuffle=False, and recent
# scikit-learn releases raise a ValueError if it is supplied in that case.
skf = StratifiedKFold(n_splits=3)

print(skf.get_n_splits(X, y_split))

try:
    for train_index, test_index in skf.split(X, y_split):
        print("TRAIN:", train_index, "TEST:", test_index)
        # split() yields positional indices, hence .iloc rather than .loc.
        # First the stratification column, then the target, for the test rows.
        print(y_split.iloc[test_index])
        print(y.iloc[test_index])
except ValueError as e:
    # Report a rejected stratification target; anything else should surface.
    print(e)

3
('TRAIN:', array([ 5,  6,  7,  8,  9, 10, 11, 12, 13]), 'TEST:', array([0, 1, 2, 3, 4]))
0    False
1     True
2    False
3    False
4     True
Name: Windy, dtype: bool
  PlayGolf
0       no
1       no
2      yes
3      yes
4      yes
('TRAIN:', array([ 0,  1,  2,  3,  4, 10, 11, 12, 13]), 'TEST:', array([5, 6, 7, 8, 9]))
5    False
6    False
7    False
8     True
9     True
Name: Windy, dtype: bool
  PlayGolf
5       no
6      yes
7      yes
8      yes
9      yes
('TRAIN:', array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), 'TEST:', array([10, 11, 12, 13]))
10    False
11     True
12     True
13    False
Name: Windy, dtype: bool
   PlayGolf
10      yes
11       no
12       no
13      yes


### using *StratifiedShuffleSplit*

In [10]:
# Five independent stratified random splits (test_size=0.2, fixed seed).
# Unlike K-fold, the test sets of different splits are drawn independently.
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
print(sss.get_n_splits(X, y))
for train_index, test_index in sss.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    # split() yields positional indices, so rows are selected with .iloc
    # (label-based .loc only matches while the index is 0..n-1).
    print(y.iloc[test_index])

5
('TRAIN:', array([ 1,  6,  4, 12,  3,  2,  5, 13,  0,  8,  9]), 'TEST:', array([11, 10,  7]))
   PlayGolf
11       no
10      yes
7       yes
('TRAIN:', array([ 1, 13,  4, 10,  3,  6,  5,  0,  9, 12,  2]), 'TEST:', array([ 7,  8, 11]))
   PlayGolf
7       yes
8       yes
11       no
('TRAIN:', array([ 2,  7,  9, 10, 12, 11,  5,  6,  0,  8,  4]), 'TEST:', array([ 1,  3, 13]))
   PlayGolf
1        no
3       yes
13      yes
('TRAIN:', array([11,  6,  5, 12,  0, 10,  4,  2,  8,  9,  7]), 'TEST:', array([ 3,  1, 13]))
   PlayGolf
3       yes
1        no
13      yes
('TRAIN:', array([ 3, 12,  5,  7,  4, 10,  8,  9,  6,  1,  0]), 'TEST:', array([ 2, 11, 13]))
   PlayGolf
2       yes
11       no
13      yes


>stratified random splits do not guarantee that all folds will be different, although this is still very likely for sizeable datasets.

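Note that the 'no' sample 11 appears in three of the five test sets and the 'no' sample 1 in two; splits 3 and 4 even select exactly the same test rows (1, 3, 13).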