In [1]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor

  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)


In [2]:
# Load the training frame once; later cells read its 'era' column, the
# 'feature*' columns and the 'target_kazutsugi' column from it.
data = pd.read_feather(path='data/train.f')

### Repeated Holdout (60% train / 10% validation)

In [3]:
%%time
# 60% = 72
# last 10% = 108

results_val = []

for seed in range(20):

    np.random.seed(seed)
    split_point = np.random.randint(72, 108)
    print(f'Repetition: {seed} - Split point: {split_point}')

    train = data[(data['era'] > split_point - 72) & (data['era'] <= split_point)]
    val   = data[(data['era'] > split_point) & data['era'] <= split_point + 12]


    X_train = train.filter(regex=r'feature')
    X_val   = val.filter(regex=r'feature')

    y_train = train['target_kazutsugi']
    y_val   = val['target_kazutsugi']

    model = LGBMRegressor(max_depth        = 5,
                          num_leaves       = 2**5,
                          learning_rate    = .01,
                          n_estimators     = 2000,
                          colsample_bytree = .1,
                          random_state     = 0)

    model.fit(X_train, y_train)

    probs = pd.Series(model.predict(X_val))

    # Ranked Correlation
    ranked_probs = probs.rank(pct=True, method='first')
    corr = np.corrcoef(y_val, ranked_probs)[0, 1]
    
    results_val.append(corr)

    print(f'Correlation: {corr}')
    print()

Repetition: 0 - Split point: 72
Correlation: 0.1664234610296449

Repetition: 1 - Split point: 84
Correlation: 0.1704076269566628

Repetition: 2 - Split point: 87
Correlation: 0.17322038455527933

Repetition: 3 - Split point: 96
Correlation: 0.17490775757202837

Repetition: 4 - Split point: 77
Correlation: 0.16834082477689338

Repetition: 5 - Split point: 107
Correlation: 0.1779451949412366

Repetition: 6 - Split point: 82
Correlation: 0.16978066887563617

Repetition: 7 - Split point: 76
Correlation: 0.168271441199296

Repetition: 8 - Split point: 75
Correlation: 0.16921564358974228

Repetition: 9 - Split point: 100
Correlation: 0.17540266315579997

Repetition: 10 - Split point: 81
Correlation: 0.1692641583220416

Repetition: 11 - Split point: 97
Correlation: 0.17486501705486673

Repetition: 12 - Split point: 83
Correlation: 0.16954264918998116

Repetition: 13 - Split point: 90
Correlation: 0.17453694334091385

Repetition: 14 - Split point: 96
Correlation: 0.17490775757202837

Repetitio

In [4]:
# Central tendency of the per-repetition correlations: (mean, median).
tuple(stat(results_val) for stat in (np.mean, np.median))

(0.17198929913630923, 0.17181400575597106)

In [5]:
# Spread of the per-repetition correlations: (worst, best).
tuple(extreme(results_val) for extreme in (np.min, np.max))

(0.1664234610296449, 0.1779451949412366)

In [6]:
# Sanity check: the loop appends one correlation per repetition, so this
# should equal the number of repetitions (20).
len(results_val)

20