In [1]:
import numpy as np
import pandas as pd

from sklearn import datasets
from sklearn import model_selection

In [5]:
def create_folds(data, n_splits = 5, random_state = None):
    """Shuffle a regression dataset and label every row with a stratified fold.

    The continuous ``target`` column is discretised into equal-width bins
    (bin count chosen by Sturges' rule) and those bins are used as the
    stratification labels for ``StratifiedKFold``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``target`` column. It is not modified; the
        function works on a shuffled copy.
    n_splits : int, default 5
        Number of folds passed to ``StratifiedKFold``.
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample`` for the row shuffle. ``None``
        keeps the shuffle non-deterministic.

    Returns
    -------
    pandas.DataFrame
        Shuffled copy of ``data`` with a fresh 0..n-1 index and a ``kfold``
        column holding, for each row, the index of the split in which that
        row is part of the validation set.
    """
    # Shuffle into a new frame first, so the caller's DataFrame is never touched
    data = data.sample(frac = 1, random_state = random_state).reset_index(drop = True)
    # Calculate #. bins by Sturges' rule
    num_bins = int(np.floor(1 + np.log2(len(data))))
    # Bin targets; kept as a local array so no temporary column has to be dropped
    bins = pd.cut(data['target'], bins = num_bins, labels = False).to_numpy()
    # Initialize the stratified splitter
    kf = model_selection.StratifiedKFold(n_splits = n_splits)
    # -1 marks "not assigned yet"; each split overwrites only its validation rows
    fold_ids = np.full(len(data), -1, dtype = int)
    for f, (t_, v_) in enumerate(kf.split(X = data, y = bins)):
        # v_ holds positional indices of this split's validation rows
        fold_ids[v_] = f

    data['kfold'] = fold_ids
    return data

In [6]:
if __name__ == '__main__':
    # Synthetic regression problem: 15000 rows, 100 features, a single target
    X, y = datasets.make_regression(n_samples=15000, n_features=100, n_targets=1)
    # Wrap the feature matrix in a frame (columns f_0, f_1, ...) and attach the target
    df = pd.DataFrame(
        X,
        columns=[f'f_{i}' for i in range(X.shape[1])],
    ).assign(target=y)
    # Shuffle the rows and add the kfold column
    df = create_folds(df)

