# Iterator class

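`Iterator` is a class that implements three cross-validation strategies:
* cross-validation by user: folds are built over the values of `col_client`, so all rows of one user stay together (RepeatedKFold over users)
* stratified cross-validation by the target column `col_target` (RepeatedStratifiedKFold)
* simple cross-validation over all rows (RepeatedKFold)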

## Imports

In [1]:
import pandas as pd
import os
import requests
import numpy as np
import yaml
import logging
from sklearn.metrics import roc_auc_score
from crosspredict.iterator import Iterator
from crosspredict.crossval import CrossLightgbmModel
logging.basicConfig(level=logging.INFO)

## Load data

In [2]:
# Download the challenge training set once and cache it on disk; later runs
# reuse the local copy instead of hitting the network again.
file_url = 'https://boosters.pro/api/ch/files/pub/onetwotrip_challenge_train.csv'
file_path = '../tests/onetwotrip_challenge_train.csv'
if not os.path.isfile(file_path):
    # A timeout keeps Run-All from hanging forever on a stalled connection.
    response = requests.get(file_url, timeout=60)
    # Fail loudly on an HTTP error: otherwise the error page would be cached
    # as the CSV and the isfile() guard would skip the download from then on.
    response.raise_for_status()
    # The context manager closes (and flushes) the file before it is read back.
    with open(file_path, 'wb') as csv_file:
        csv_file.write(response.content)

# Read from the same path that was just checked/written.
df = pd.read_csv(file_path)

# Hold out 20% of the unique users (fixed seed for reproducibility). The split
# is made on user ids, not rows, so a user is entirely in `test` or in `train`.
unique_clients = pd.Series(df['userid'].unique())
test_users = unique_clients.sample(frac=0.2, random_state=0)
val_idx = df['userid'].isin(test_users)  # boolean row mask: True -> held-out user
test = df[val_idx].copy()
train = df[~val_idx].copy()

## Cross-validation by the `col_client` column (all rows of a given user go into the same fold)

In [3]:
# Group-wise cross-validation: cv_byclient=True makes the splitter work on the
# values of `col_client` ('userid') rather than on individual rows.
iter_df = Iterator(
    cv_byclient=True,
    col_client='userid',
    n_splits=10,
    n_repeats=3,
    random_state=0,
)

# The loop body is intentionally empty: the cell only walks through every
# (train, validation) pair that the iterator yields for `train`.
for i, (X_train, X_val) in enumerate(iter_df.split(train)):
    pass

Using RepeatedKFold by column group "userid"


## Stratified CrossValidation by `col_target`

In [4]:
# Stratified cross-validation: with cv_byclient=False and a `col_target` given,
# the iterator reports using RepeatedStratifiedKFold on the 'goal1' column.
iter_df = Iterator(
    cv_byclient=False,
    col_target='goal1',
    n_splits=10,
    n_repeats=3,
    random_state=0,
)

# The loop body is intentionally empty: the cell only walks through every
# (train, validation) pair that the iterator yields for `train`.
for i, (X_train, X_val) in enumerate(iter_df.split(train)):
    pass

Using RepeatedStratifiedKFold by column group "goal1"


## Simple CrossValidation

In [5]:
# Plain cross-validation: neither a client column nor a target column is
# passed, and the iterator reports using RepeatedKFold on all data.
iter_df = Iterator(
    cv_byclient=False,
    n_splits=10,
    n_repeats=3,
    random_state=0,
)

# The loop body is intentionally empty: the cell only walks through every
# (train, validation) pair that the iterator yields for `train`.
for i, (X_train, X_val) in enumerate(iter_df.split(train)):
    pass

Using RepeatedKFold by all data
