In [3]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn import model_selection

In [24]:
def create_folds(data, n_splits=5, random_state=None, target_col="target"):
    """Shuffle a regression dataset and tag every row with a stratified fold id.

    The continuous target is cut into equal-width bins (bin count chosen by
    Sturges' rule) and the bin labels are passed to ``StratifiedKFold`` as the
    stratification classes instead of the raw target values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains the column named by ``target_col``. The caller's
        frame is not modified; all work happens on a shuffled copy.
    n_splits : int, default 5
        Number of folds handed to ``StratifiedKFold``.
    random_state : int or None, default None
        Seed for the row shuffle. Pass an int to get the same fold
        assignment on every run.
    target_col : str, default "target"
        Name of the continuous target column to bin.

    Returns
    -------
    pandas.DataFrame
        Shuffled copy of ``data`` with a fresh ``0..n-1`` index and an integer
        ``kfold`` column holding the validation-fold number of each row.

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of ``data``.
    ValueError
        If ``data`` has no rows.
    """
    if target_col not in data.columns:
        raise KeyError(f"column {target_col!r} not found in data")
    if len(data) == 0:
        # log2(0) is -inf and would otherwise fail with an obscure OverflowError
        raise ValueError("cannot create folds for an empty DataFrame")

    # randomize the rows; sample() returns a new frame, so the caller's
    # DataFrame never receives the kfold column
    shuffled = data.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # calculate the number of bins according to Sturges' rule
    num_bins = int(np.floor(1 + np.log2(len(shuffled))))

    # bin the targets; kept in a local variable rather than a temporary
    # column so an existing column called "bins" is never clobbered
    bins = pd.cut(shuffled[target_col], bins=num_bins, labels=False)

    # initiate the kfold class
    kf = model_selection.StratifiedKFold(n_splits=n_splits)

    # fill the fold ids; note that, instead of targets, we stratify on bins.
    # -1 marks "not assigned" until the loop has visited every validation split
    fold_ids = np.full(len(shuffled), -1, dtype=np.int64)
    for fold, (_, valid_idx) in enumerate(kf.split(X=shuffled, y=bins.values)):
        fold_ids[valid_idx] = fold

    shuffled["kfold"] = fold_ids

    # return dataframe with folds
    return shuffled


In [25]:
# Synthetic regression problem: 15000 samples, 100 features, one target.
# random_state pins the generator so the data (and every output below)
# is reproduced by Restart Kernel -> Run All.
X, y = datasets.make_regression(
    n_samples=15000,
    n_features=100,
    n_targets=1,
    random_state=42,
)

In [26]:
# Show the (truncated) feature matrix followed by its dimensions
print(X, X.shape, sep="\n")

[[-0.52514617 -1.58323332 -1.43778369 ...  0.65537192  1.76848482
   0.7401146 ]
 [ 0.48496506  1.09979441  0.7207697  ... -0.13877175  1.72465706
  -0.58259598]
 [ 0.78714359 -1.67808539  1.46423674 ... -2.3715171  -0.33204419
   1.6827237 ]
 ...
 [ 0.50781215  0.27712871  0.12551835 ... -1.72855507  1.29264977
   0.84369696]
 [-0.11143979  0.71241241 -0.86562477 ...  2.2083026   1.19361853
   0.89756745]
 [-0.74901233  0.62030026 -0.25133201 ...  0.73104292 -2.32733379
  -0.91830139]]
(15000, 100)


In [27]:
# Show the (truncated) target vector followed by its dimensions
print(y, y.shape, sep="\n")

[ -9.82498903  42.58817207 -69.07224704 ... -72.06332824 205.1503445
  66.42826982]
(15000,)


In [28]:
# Wrap the feature matrix in a DataFrame with columns f_0 .. f_{n-1}
# and append the regression target as the last column
df = pd.DataFrame(
    X,
    columns=["f_" + str(i) for i in range(X.shape[1])],
).assign(target=y)

In [29]:
# Preview the first five rows of the feature/target frame
df.head(n=5)

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_91,f_92,f_93,f_94,f_95,f_96,f_97,f_98,f_99,target
0,-0.525146,-1.583233,-1.437784,-0.404797,1.267593,1.151981,-0.343519,1.07339,0.769832,0.073751,...,-0.013781,-0.848906,0.066854,-1.121562,-1.057065,-0.979351,0.655372,1.768485,0.740115,-9.824989
1,0.484965,1.099794,0.72077,0.442704,-1.368589,-1.450244,1.859426,0.78615,0.398399,-0.073412,...,0.04252,0.645926,-0.419887,0.427496,0.31015,-0.602732,-0.138772,1.724657,-0.582596,42.588172
2,0.787144,-1.678085,1.464237,1.440619,-0.455875,-1.532614,-0.136488,1.475794,-0.045262,-0.368138,...,1.731526,-0.382279,-0.788279,-1.173537,-1.925679,1.59898,-2.371517,-0.332044,1.682724,-69.072247
3,-0.178982,-0.620203,1.36668,-0.839253,0.18325,-1.940284,1.74053,0.558849,-0.365508,-1.847663,...,-1.179801,-0.409681,-0.571533,0.330379,1.141662,0.221841,-0.786844,1.513891,-0.186875,103.041569
4,0.134712,2.008959,-0.431136,1.98779,-0.132229,0.190238,-1.321777,-0.509031,-1.361029,0.883055,...,-0.064697,-0.529883,1.577711,-0.390695,-0.950241,-0.255888,-1.096533,0.298198,-1.009772,158.204852


In [30]:

# Create folds: shuffled frame with a stratified `kfold` column
df_st = create_folds(df)

# Sanity checks: no rows lost, and every row landed in some validation fold
# (-1 is the "unassigned" placeholder used inside create_folds)
assert len(df_st) == len(df)
assert df_st["kfold"].ge(0).all()

In [31]:
# First five rows of the shuffled frame; the trailing column is the fold id
df_st.head(n=5)

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_92,f_93,f_94,f_95,f_96,f_97,f_98,f_99,target,kfold
0,0.517839,0.409258,0.732007,-1.182794,1.587454,-2.07952,1.077034,0.278979,-0.136377,0.624497,...,-0.250536,0.945134,-0.405119,1.00696,0.910378,-1.789283,0.237473,0.51611,278.066988,0
1,1.871763,0.430821,0.934271,0.339543,-0.403445,-0.418436,1.357253,-0.267333,0.41138,0.10143,...,-0.478462,2.768034,0.167528,-1.011148,1.376815,1.733582,0.645658,0.687921,302.119446,0
2,0.272876,-1.35509,1.693375,-0.292866,0.097803,0.255379,0.041877,-1.498651,1.745947,-0.803296,...,0.282071,0.713609,-0.459195,0.989653,-0.947846,-0.42882,-0.606726,1.157346,182.924573,0
3,1.235102,-1.19477,-0.650388,-0.034194,-1.677949,0.299435,0.857251,-2.130943,0.224631,0.580559,...,-0.687842,0.08372,-0.377703,0.05286,-0.671939,0.143435,-0.36304,-0.52344,24.5889,0
4,-2.01741,-0.158221,1.066353,0.988039,-0.673414,0.865759,0.534661,-0.050036,-1.433564,-0.820706,...,2.037634,0.076843,-0.28239,-0.177194,0.368502,-0.1103,0.108331,1.962559,-130.556946,0


In [33]:
# Last five rows of the shuffled frame; the trailing column is the fold id
df_st.tail(n=5)

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_92,f_93,f_94,f_95,f_96,f_97,f_98,f_99,target,kfold
14995,-0.662555,0.582861,0.140807,1.678898,-0.043034,-0.292997,-1.069002,-0.620422,0.604045,0.50198,...,0.828612,-1.112285,0.196999,1.39308,-0.744753,0.451289,-0.734825,1.029291,-52.783051,4
14996,1.224667,0.59263,-1.121594,-0.473496,-0.607403,1.416974,-0.467593,0.046514,-1.940107,-1.382416,...,0.879633,0.081104,2.049364,-0.55063,0.577855,0.176801,1.447427,-2.570015,-105.007291,4
14997,-0.123634,-0.245881,-1.350969,1.06171,2.3974,1.013382,-0.025259,0.580556,0.8137,-1.512352,...,-1.022981,0.219057,0.563498,1.131489,-0.460648,-0.401203,1.044803,0.757891,271.12757,4
14998,-0.37126,0.933496,-0.042701,0.408207,-0.688375,0.598518,-2.625791,0.844566,-1.62049,-1.501574,...,-0.575139,0.367044,0.367397,-0.131572,-0.861964,-1.299779,-0.382614,-0.20779,32.269283,4
14999,-0.010065,-0.254261,1.172477,-0.86943,1.386375,-0.167726,-0.001724,1.127236,-2.095647,0.036314,...,0.629562,-1.073114,-0.367243,0.743268,0.851255,-0.456528,-0.378508,-0.092878,24.630474,4
