<a href="https://colab.research.google.com/github/tensorblack/ikfold/blob/master/IKFold.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from sklearn.model_selection import train_test_split
import numpy as np

# Incremental K-Fold Cross Validation Dataset Splitter for Pandas Dataframes
# Given number of folds k, create k groups of incremental size S
# For each increment i = 1..k (inclusive), the group size is S = i/k * len(df)
# Also supports creating a holdout set of data for later validation

class IKFold():
  """Incremental K-Fold splitter for pandas DataFrames.

  For i = 1..k, `split()` draws a random sample holding i/k of the rows of
  the internal DataFrame and passes it through `train_test_split`, so each
  successive fold is larger than the previous one.  `holdout()` can be used
  first to carve out rows that are never seen by `split()`.

  All randomness that the class draws itself (initial shuffle, holdout mask,
  per-fold sampling) comes from one generator seeded with `random_state`, so
  the same sequence of calls on the same data gives the same result.
  """

  def __init__(self, df, k=5, test_size=.33, random_state=42, shuffle=True,
               target='Class', holdout_size=.1, verbose=False):
    """Store the configuration and optionally shuffle the rows.

    Args:
      df: pandas DataFrame containing the feature columns and `target`.
      k: number of incremental folds produced by `split()`.
      test_size: forwarded to `train_test_split` as `test_size`.
      random_state: seed (or `np.random.RandomState`) used for the shuffle,
        the holdout mask and the fold sampling; also forwarded to
        `train_test_split`.  `None` gives non-reproducible results.
      shuffle: when true, the rows are shuffled once at construction.
      target: name of the label column.
      holdout_size: default fraction used by `holdout()` when it is called
        without an explicit `p`.
      verbose: when true, print size information.
    """
    if verbose:
      print("DF Shape: {}".format(df.shape))
    self.verbose      = verbose
    self.df           = df
    self.k            = k
    self.test_size    = test_size
    self.target       = target
    self.random_state = random_state
    self.shuffle      = shuffle
    self.holdout_size = holdout_size
    # A single generator shared by every random draw made by this object, so
    # that `random_state` actually controls the shuffle, holdout and sampling.
    if isinstance(random_state, np.random.RandomState):
      self._rng = random_state
    else:
      self._rng = np.random.RandomState(random_state)
    if self.shuffle:
      self.df = self.df.sample(frac=1, random_state=self._rng)

  def holdout(self, p=None):
    """Remove a random holdout set from the internal DataFrame and return it.

    Each row is selected independently with probability `p`, so the holdout
    holds approximately (not exactly) `p * len(df)` rows.

    Args:
      p: selection probability per row.  When omitted, the `holdout_size`
        given to the constructor is used (0.1 by default).

    Returns:
      DataFrame with the held-out rows; they are no longer part of `self.df`.
    """
    if p is None:
      p = self.holdout_size
    msk = self._rng.rand(len(self.df)) < p
    holdout = self.df[msk]
    # Update df to remove holdout
    self.df = self.df[~msk]
    if self.verbose:
      print("Holdout length: {}".format(len(holdout)))
      print("New DF Length: {}".format(len(self.df)))
    return holdout

  def split(self):
    """Generate the k incremental train/test splits.

    Yields:
      Tuples `(X_train, X_test, y_train, y_test)` as returned by
      `train_test_split`, computed on a random sample holding i/k of the
      current rows for i = 1..k.  X holds every column except `target`;
      y holds the `target` column.
    """
    for i in range(1, self.k + 1):
      # `sample` returns a new frame, so the internal DataFrame is untouched.
      tdf = self.df.sample(frac=i / self.k, random_state=self._rng)
      y = tdf[self.target].values
      X = tdf.drop([self.target], axis=1).values

      X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=self.test_size, random_state=self.random_state)

      yield (X_train, X_test, y_train, y_test)


In [21]:
from sklearn.datasets import load_iris
import pandas as pd

# Build a DataFrame from the iris data, with the label stored in 'Class'.
data = load_iris()
X, y = data.data, data.target
df = pd.DataFrame(X, columns=data.feature_names)
df['Class'] = y

# Set aside a holdout sample, then walk through the incrementally sized folds.
ikf = IKFold(df, verbose=False)
holdout = ikf.holdout(.1)
for fold_idx, (X_train, X_test, y_train, y_test) in enumerate(ikf.split()):
  t = {
    'fold': fold_idx,
    'X_train': X_train,
    'y_train': y_train,
    'X_test': X_test,
    'y_test': y_test,
  }
  for key, value in t.items():
    # The fold index is printed as-is; every other entry is an array whose
    # shape is reported instead of its contents.
    print(key, value if key == 'fold' else value.shape)
  print("\tDo some processing and training here...\n")

fold 0
X_train (18, 4)
y_train (18,)
X_test (9, 4)
y_test (9,)
	Do some processing and training here...

fold 1
X_train (36, 4)
y_train (36,)
X_test (18, 4)
y_test (18,)
	Do some processing and training here...

fold 2
X_train (54, 4)
y_train (54,)
X_test (27, 4)
y_test (27,)
	Do some processing and training here...

fold 3
X_train (72, 4)
y_train (72,)
X_test (36, 4)
y_test (36,)
	Do some processing and training here...

fold 4
X_train (90, 4)
y_train (90,)
X_test (45, 4)
y_test (45,)
	Do some processing and training here...

