In [1]:
import sys
sys.path.append("./src")
import pandas as pd
import numpy as np
import upgini.utils.blocked_time_series as ts
from sklearn.model_selection import KFold, TimeSeriesSplit, cross_val_predict, cross_val_score
from sklearn.linear_model import LogisticRegression

In [4]:
# Pick up local edits to the splitter module without restarting the kernel.
# The trailing semicolon suppresses the echoed module repr, which would
# otherwise write a machine-specific absolute path into the saved output.
import importlib
importlib.reload(ts);

<module 'upgini.utils.blocked_time_series' from '/Users/artemy.gusev/Documents/projects_upgini/upgini/./src/upgini/utils/blocked_time_series.py'>

In [5]:
# Read the gzip-compressed binary-classification fixture, then print its
# column summary (names, non-null counts, dtypes).
df = pd.read_csv(
    "tests/test_data/binary/data2.csv.gz",
    compression="gzip",
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15555 entries, 0 to 15554
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   SystemRecordId_473310000  15555 non-null  object 
 1   phone_num                 15555 non-null  int64  
 2   rep_date                  15555 non-null  int64  
 3   target                    15555 non-null  int64  
 4   feature1                  15555 non-null  float64
 5   feature2                  15555 non-null  float64
dtypes: float64(2), int64(3), object(1)
memory usage: 729.3+ KB


In [6]:
# Feature matrix and label shared by every cross-validation run below.
X = df[["feature1", "feature2"]]
y = df["target"]

# Estimator under evaluation; the seed is fixed for repeatable fits.
model = LogisticRegression(random_state=0)

# Three 5-fold splitters to compare: shuffled K-fold, sklearn's expanding
# time-series split, and the project's blocked time-series split.
kfold = KFold(n_splits=5, shuffle=True, random_state=0)
simple_ts = TimeSeriesSplit(n_splits=5)
blocked_ts = ts.BlockedTimeSeriesSplit(n_splits=5, test_size=0.25)

# Echo both shapes as a quick sanity check of the column selection.
X.shape, y.shape

((15555, 2), (15555,))

### 1) Проверка логики разбиения на фолды

In [7]:
# Print the size and the first/last row index of every train and test fold
# produced by TimeSeriesSplit.
for fold_no, (train_rows, test_rows) in enumerate(simple_ts.split(X, y)):
    fold_report = {
        "Train fold:": train_rows,
        "Test fold:": test_rows,
    }
    print("Iteration:", fold_no)
    for label, rows in fold_report.items():
        print(label, len(rows), min(rows), max(rows))
    print()

Iteration: 0
Train fold: 2595 0 2594
Test fold: 2592 2595 5186

Iteration: 1
Train fold: 5187 0 5186
Test fold: 2592 5187 7778

Iteration: 2
Train fold: 7779 0 7778
Test fold: 2592 7779 10370

Iteration: 3
Train fold: 10371 0 10370
Test fold: 2592 10371 12962

Iteration: 4
Train fold: 12963 0 12962
Test fold: 2592 12963 15554



In [8]:
# Collect and print the size and first/last row index of every fold produced
# by BlockedTimeSeriesSplit. The list is rebuilt from scratch on each run.
cv_split_stat = []
for fold_no, (train_rows, test_rows) in enumerate(blocked_ts.split(X, y)):
    train_summary = [len(train_rows), min(train_rows), max(train_rows)]
    test_summary = [len(test_rows), min(test_rows), max(test_rows)]
    # One flat row per fold: train stats followed by test stats.
    cv_split_stat.append(train_summary + test_summary)
    print("Iteration:", fold_no)
    print("Train fold:", *train_summary)
    print("Test fold:", *test_summary)
    print()

len(cv_split_stat)

Iteration: 0
Train fold: 2333 0 2332
Test fold: 778 2333 3110

Iteration: 1
Train fold: 2333 3111 5443
Test fold: 778 5444 6221

Iteration: 2
Train fold: 2333 6222 8554
Test fold: 778 8555 9332

Iteration: 3
Train fold: 2333 9333 11665
Test fold: 778 11666 12443

Iteration: 4
Train fold: 2333 12444 14776
Test fold: 778 14777 15554



5

In [87]:
# Tabulate the per-fold statistics (one row per fold) and write them to CSV
# without the index column.
stat_columns = [
    "train_len",
    "train_min",
    "train_max",
    "test_len",
    "test_min",
    "test_max",
]
df_stat = pd.DataFrame(cv_split_stat, columns=stat_columns)
df_stat.to_csv("tests/test_data/binary/blocked_ts_logic.csv", index=False)

### 2) Прогон cross_val_score с данной кросс-валидацией

In [9]:
# KFold: score the model on each fold of the shuffled K-fold splitter.
cross_val_score(estimator=model, X=X, y=y, cv=kfold)

array([0.50048216, 0.48505304, 0.50466088, 0.52137576, 0.50144648])

In [10]:
# TimeSeriesSplit: score the model on each fold of sklearn's time-series splitter.
cross_val_score(estimator=model, X=X, y=y, cv=simple_ts)

array([0.50308642, 0.50115741, 0.49614198, 0.49729938, 0.49807099])

In [11]:
# BlockedTimeSeriesSplit: score the model on each fold of the project's blocked splitter.
cross_val_score(estimator=model, X=X, y=y, cv=blocked_ts)

array([0.50128535, 0.48843188, 0.46658098, 0.47557841, 0.47043702])

### 3) Проверка исключений

In [12]:
# Exception check: with only 10 rows split into 5 blocks, the test part of
# each block is too small, so the splitter is expected to raise ValueError.
X_short, y_short = X.iloc[:10, :], y.iloc[:10]
try:
    # Use a cell-local name so the splitter configured in the setup cell is
    # not overwritten by this negative test.
    too_few_rows_ts = ts.BlockedTimeSeriesSplit(n_splits=5)
    cross_val_score(model, X_short, y_short, cv=too_few_rows_ts)
except ValueError as exc:
    # Display the expected error instead of aborting a "Run All" execution.
    print(f"{type(exc).__name__}: {exc}")
else:
    raise AssertionError("expected ValueError for a 10-row sample")

ValueError: Cannot have number of samples in test fold (test_size * n_samples / n_splits) <= 1

In [71]:
# Exception check: test_size=2 lies outside the (0, 1) range, so the splitter
# is expected to raise ValueError.
try:
    # Use a cell-local name so the splitter configured in the setup cell is
    # not overwritten by this negative test.
    bad_test_size_ts = ts.BlockedTimeSeriesSplit(n_splits=5, test_size=2)
    cross_val_score(model, X, y, cv=bad_test_size_ts)
except ValueError as exc:
    # Display the expected error instead of aborting a "Run All" execution.
    print(f"{type(exc).__name__}: {exc}")
else:
    raise AssertionError("expected ValueError for test_size=2")

ValueError: test_size=2 should be a float in the (0, 1) range

In [72]:
# Exception check: a non-integer number of folds (5.5) is expected to raise
# ValueError.
try:
    # Use a cell-local name so the splitter configured in the setup cell is
    # not overwritten by this negative test.
    bad_n_splits_ts = ts.BlockedTimeSeriesSplit(n_splits=5.5, test_size=0.5)
    cross_val_score(model, X, y, cv=bad_n_splits_ts)
except ValueError as exc:
    # Display the expected error instead of aborting a "Run All" execution.
    print(f"{type(exc).__name__}: {exc}")
else:
    raise AssertionError("expected ValueError for n_splits=5.5")

ValueError: The number of folds must be of Integral type. 5.5 of type <class 'float'> was passed.