In [9]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor

In [2]:
data = pd.read_feather('data/train.f')

In [4]:
train = data[data['era'] <= 60]
val   = data[data['era'] >= 60]

In [5]:
X_train = train.filter(regex=r'feature')
X_val   = val.filter(regex=r'feature')

y_train = train['target_kazutsugi']
y_val   = val['target_kazutsugi']

In [11]:
%%time

model = LGBMRegressor(max_depth        = 5,
                      num_leaves       = 2**5,
                      learning_rate    = .01,
                      n_estimators     = 2000,
                      colsample_bytree = .1,
                      random_state     = 0)

model.fit(X_train, y_train)

Wall time: 23 s


LGBMRegressor(colsample_bytree=0.1, learning_rate=0.01, max_depth=5,
              n_estimators=2000, num_leaves=32, random_state=0)

In [15]:
probs = pd.Series(model.predict(X_val))

# Ranked Correlation
ranked_probs = probs.rank(pct=True, method='first')
corr = np.corrcoef(y_val, ranked_probs)[0, 1]
corr

0.04905838625099541

In [16]:
test = pd.read_feather('data/test.f')

X_test = test.filter(regex=r'feature')
y_test = test['target_kazutsugi']

probs = pd.Series(model.predict(X_test))

# Ranked Correlation
ranked_probs = probs.rank(pct=True, method='first')
corr = np.corrcoef(y_test, ranked_probs)[0, 1]
corr

0.029597436454573203

### Holdout 70/30

In [22]:
train = data[data['era'] <= 84]
val   = data[data['era'] >= 84]

X_train = train.filter(regex=r'feature')
X_val   = val.filter(regex=r'feature')

y_train = train['target_kazutsugi']
y_val   = val['target_kazutsugi']

In [23]:
%%time

model = LGBMRegressor(max_depth        = 5,
                      num_leaves       = 2**5,
                      learning_rate    = .01,
                      n_estimators     = 2000,
                      colsample_bytree = .1,
                      random_state     = 0)

model.fit(X_train, y_train)

Wall time: 36.6 s


LGBMRegressor(colsample_bytree=0.1, learning_rate=0.01, max_depth=5,
              n_estimators=2000, num_leaves=32, random_state=0)

In [24]:
probs = pd.Series(model.predict(X_val))

# Ranked Correlation
ranked_probs = probs.rank(pct=True, method='first')
corr = np.corrcoef(y_val, ranked_probs)[0, 1]
corr

0.05183432285694725

In [25]:
X_test = test.filter(regex=r'feature')
y_test = test['target_kazutsugi']

probs = pd.Series(model.predict(X_test))

# Ranked Correlation
ranked_probs = probs.rank(pct=True, method='first')
corr = np.corrcoef(y_test, ranked_probs)[0, 1]
corr

0.028505636083722287

### Retrain Score

In [26]:
X_train = data.filter(regex=r'feature')
y_train = data['target_kazutsugi']


In [27]:
%%time

model = LGBMRegressor(max_depth        = 5,
                      num_leaves       = 2**5,
                      learning_rate    = .01,
                      n_estimators     = 2000,
                      colsample_bytree = .1,
                      random_state     = 0)

model.fit(X_train, y_train)

Wall time: 31.6 s


LGBMRegressor(colsample_bytree=0.1, learning_rate=0.01, max_depth=5,
              n_estimators=2000, num_leaves=32, random_state=0)

In [28]:
X_test = test.filter(regex=r'feature')
y_test = test['target_kazutsugi']

probs = pd.Series(model.predict(X_test))

# Ranked Correlation
ranked_probs = probs.rank(pct=True, method='first')
corr = np.corrcoef(y_test, ranked_probs)[0, 1]
corr

0.029616141679292902