In [1]:
import numpy as np
import pandas as pd
import optuna
import lightgbm as lgb
import xgboost as xgb
import janestreet
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Dataset

In [2]:
%%time

train  = pd.read_csv('/kaggle/input/jane-street-market-prediction/train.csv', nrows=10**5)

print(train.shape)
train.head()

# all train shape (2390491, 138)

(100000, 138)
CPU times: user 4.05 s, sys: 643 ms, total: 4.69 s
Wall time: 6.94 s


Unnamed: 0,date,weight,resp_1,resp_2,resp_3,resp_4,resp,feature_0,feature_1,feature_2,...,feature_121,feature_122,feature_123,feature_124,feature_125,feature_126,feature_127,feature_128,feature_129,ts_id
0,0,0.0,0.009916,0.014079,0.008773,0.00139,0.00627,1,-1.872746,-2.191242,...,,1.168391,8.313583,1.782433,14.018213,2.653056,12.600292,2.301488,11.445807,0
1,0,16.673515,-0.002828,-0.003226,-0.007319,-0.011114,-0.009792,-1,-1.349537,-1.704709,...,,-1.17885,1.777472,-0.915458,2.831612,-1.41701,2.297459,-1.304614,1.898684,1
2,0,0.0,0.025134,0.027607,0.033406,0.03438,0.02397,-1,0.81278,-0.256156,...,,6.115747,9.667908,5.542871,11.671595,7.281757,10.060014,6.638248,9.427299,2
3,0,0.0,-0.00473,-0.003273,-0.000461,-0.000476,-0.0032,-1,1.174378,0.34464,...,,2.838853,0.499251,3.033732,1.513488,4.397532,1.266037,3.856384,1.013469,3
4,0,0.138531,0.001252,0.002165,-0.001215,-0.006219,-0.002604,1,-3.172026,-3.093182,...,,0.34485,4.101145,0.614252,6.623456,0.800129,5.233243,0.362636,3.926633,4


In [3]:
# Load the example test file that ships with the competition data.
example_test = pd.read_csv(
    '../input/jane-street-market-prediction/example_test.csv'
)

print(example_test.shape)
example_test.head()

(15219, 133)


Unnamed: 0,weight,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_122,feature_123,feature_124,feature_125,feature_126,feature_127,feature_128,feature_129,date,ts_id
0,0.0,1,-1.872746,-2.191242,-0.474163,-0.323046,0.014688,-0.002484,,,...,1.168391,8.313583,1.782433,14.018213,2.653056,12.600292,2.301488,11.445807,0,0
1,16.673515,-1,-1.349537,-1.704709,0.068058,0.028432,0.193794,0.138212,,,...,-1.17885,1.777472,-0.915458,2.831612,-1.41701,2.297459,-1.304614,1.898684,0,1
2,0.0,-1,0.81278,-0.256156,0.806463,0.400221,-0.614188,-0.3548,,,...,6.115747,9.667908,5.542871,11.671595,7.281757,10.060014,6.638248,9.427299,0,2
3,0.0,-1,1.174378,0.34464,0.066872,0.009357,-1.006373,-0.676458,,,...,2.838853,0.499251,3.033732,1.513488,4.397532,1.266037,3.856384,1.013469,0,3
4,0.138531,1,-3.172026,-3.093182,-0.161518,-0.128149,-0.195006,-0.14378,,,...,0.34485,4.101145,0.614252,6.623456,0.800129,5.233243,0.362636,3.926633,0,4


example_test には、train にある5つの resp カラム(resp_1〜resp_4, resp)が含まれていない → resp カラムが目的変数に関係している。

In [4]:
# Load the example submission file to see the expected output format
# (one `action` value per `ts_id`).
sample_prediction_df = pd.read_csv(
    '../input/jane-street-market-prediction/example_sample_submission.csv'
)

print(sample_prediction_df.shape)
sample_prediction_df.head()

(15219, 2)


Unnamed: 0,ts_id,action
0,0,1
1,1,1
2,2,1
3,3,1
4,4,1


# Preprocess

In [5]:
# Drop rows whose weight is zero. Take an explicit copy so that adding the
# `action` column below writes to an independent frame instead of a slice of
# the loaded data (this is what triggers pandas' SettingWithCopyWarning).
train = train[train['weight'] != 0].copy()

# Binary target: 1 only when every response column is strictly positive,
# otherwise 0. (A simpler alternative would be to use `resp > 0` alone.)
resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp']
train['action'] = (train[resp_cols] > 0).all(axis=1).astype('int')

# Model inputs: every column whose name contains the substring "feature".
features = [c for c in train.columns if 'feature' in c]
target = 'action'

print(train.shape)
train.head()

(77759, 139)


Unnamed: 0,date,weight,resp_1,resp_2,resp_3,resp_4,resp,feature_0,feature_1,feature_2,...,feature_122,feature_123,feature_124,feature_125,feature_126,feature_127,feature_128,feature_129,ts_id,action
1,0,16.673515,-0.002828,-0.003226,-0.007319,-0.011114,-0.009792,-1,-1.349537,-1.704709,...,-1.17885,1.777472,-0.915458,2.831612,-1.41701,2.297459,-1.304614,1.898684,1,0
4,0,0.138531,0.001252,0.002165,-0.001215,-0.006219,-0.002604,1,-3.172026,-3.093182,...,0.34485,4.101145,0.614252,6.623456,0.800129,5.233243,0.362636,3.926633,4,0
6,0,0.190575,-0.001939,-0.002301,0.001088,0.005963,0.000709,-1,-3.172026,-3.093182,...,0.336873,4.076447,0.614783,6.622176,0.800618,5.231595,0.361506,3.921714,6,0
7,0,3.820844,0.017395,0.021361,0.031163,0.03697,0.033473,-1,0.44605,-0.46621,...,2.101997,4.846202,1.479875,5.261328,2.305066,4.571762,2.201537,4.429745,7,1
8,0,0.116557,-0.00546,-0.007301,-0.009085,-0.003546,-0.001677,1,-3.172026,-3.093182,...,1.537913,4.785838,1.637435,6.968002,2.354338,5.825499,1.778029,4.740577,8,0


In [6]:
# Replace every missing value in the frame with the sentinel -999,
# then print the remaining null count as a sanity check (expected: 0).
train = train.fillna(value=-999)

print(train.isna().sum().sum())

0


In [7]:
# Hold out 20% of the rows for validation, using a fixed seed for the split.
# NOTE(review): this rebinds `train` to the 80% portion, so re-running the cell
# would split the already-reduced frame again.
train, test = train_test_split(train, test_size=0.2, random_state=666)

print(train.shape, test.shape, sep='\n')

(62207, 139)
(15552, 139)


In [8]:
# Disabled Optuna hyper-parameter search, kept as a bare string literal so that
# it is not executed; as the last expression of the cell the string is echoed.
# The search maximises ROC AUC on the hold-out `test` frame over 40 trials.
# NOTE(review): `bagging_fraction` is sampled in create_model but never passed
# to LGBMClassifier, so its sampled value cannot influence the score.
# NOTE(review): both `min_child_samples` and `min_data_in_leaf` are passed;
# LightGBM's parameter docs list these as aliases of one setting - confirm
# which value takes effect before re-enabling this search.
'''
def create_model(trial):
    num_leaves = trial.suggest_int("num_leaves", 2, 31)
    n_estimators = trial.suggest_int("n_estimators", 50, 300)
    max_depth = trial.suggest_int('max_depth', 3, 8)
    min_child_samples = trial.suggest_int('min_child_samples', 100, 1200)
    learning_rate = trial.suggest_uniform('learning_rate', 0.0001, 0.99)
    min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 5, 90)
    bagging_fraction = trial.suggest_uniform('bagging_fraction', 0.0001, 1.0)
    feature_fraction = trial.suggest_uniform('feature_fraction', 0.0001, 1.0)
    
    model = lgb.LGBMClassifier(
        num_leaves=num_leaves,
        n_estimators=n_estimators, 
        max_depth=max_depth, 
        min_child_samples=min_child_samples, 
        min_data_in_leaf=min_data_in_leaf,
        learning_rate=learning_rate,
        feature_fraction=feature_fraction,
        random_state=666
    )
    return model

def objective(trial):
    model = create_model(trial)
    model.fit(train[features], train[target])
    score = roc_auc_score(
        test[target].values, 
        model.predict_proba(test[features])[:,1]
    )
    return score

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=40)
params = study.best_params

print(params)

'''

'\ndef create_model(trial):\n    num_leaves = trial.suggest_int("num_leaves", 2, 31)\n    n_estimators = trial.suggest_int("n_estimators", 50, 300)\n    max_depth = trial.suggest_int(\'max_depth\', 3, 8)\n    min_child_samples = trial.suggest_int(\'min_child_samples\', 100, 1200)\n    learning_rate = trial.suggest_uniform(\'learning_rate\', 0.0001, 0.99)\n    min_data_in_leaf = trial.suggest_int(\'min_data_in_leaf\', 5, 90)\n    bagging_fraction = trial.suggest_uniform(\'bagging_fraction\', 0.0001, 1.0)\n    feature_fraction = trial.suggest_uniform(\'feature_fraction\', 0.0001, 1.0)\n    \n    model = lgb.LGBMClassifier(\n        num_leaves=num_leaves,\n        n_estimators=n_estimators, \n        max_depth=max_depth, \n        min_child_samples=min_child_samples, \n        min_data_in_leaf=min_data_in_leaf,\n        learning_rate=learning_rate,\n        feature_fraction=feature_fraction,\n        random_state=666\n    )\n    return model\n\ndef objective(trial):\n    model = create_mo

In [9]:
# Fixed LightGBM hyper-parameters used for the final model.
# NOTE(review): the keys match the names sampled by the disabled Optuna search,
# so these are presumably the best parameters from an earlier run - confirm.
params = dict(
    num_leaves=28,
    n_estimators=237,
    max_depth=8,
    min_child_samples=951,
    learning_rate=0.20668643739310866,
    min_data_in_leaf=64,
    bagging_fraction=0.861523972755874,
    feature_fraction=0.8748562209840771,
)

In [10]:
# Fit the LightGBM classifier on the training split with the fixed parameters.
model = lgb.LGBMClassifier(**params)
model.fit(train[features], train[target])

# Score on the hold-out split: ROC AUC computed from the second column of
# predict_proba (the probability assigned to the positive class).
holdout_proba = model.predict_proba(test[features])[:, 1]
holdout_auc = roc_auc_score(test[target].values, holdout_proba)
print('score: ', holdout_auc)

score:  0.6919544617003373


# Submit

In [11]:
# Look at the example test frame again before building the submission.
print(example_test.shape)
example_test.head(n=5)

(15219, 133)


Unnamed: 0,weight,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_122,feature_123,feature_124,feature_125,feature_126,feature_127,feature_128,feature_129,date,ts_id
0,0.0,1,-1.872746,-2.191242,-0.474163,-0.323046,0.014688,-0.002484,,,...,1.168391,8.313583,1.782433,14.018213,2.653056,12.600292,2.301488,11.445807,0,0
1,16.673515,-1,-1.349537,-1.704709,0.068058,0.028432,0.193794,0.138212,,,...,-1.17885,1.777472,-0.915458,2.831612,-1.41701,2.297459,-1.304614,1.898684,0,1
2,0.0,-1,0.81278,-0.256156,0.806463,0.400221,-0.614188,-0.3548,,,...,6.115747,9.667908,5.542871,11.671595,7.281757,10.060014,6.638248,9.427299,0,2
3,0.0,-1,1.174378,0.34464,0.066872,0.009357,-1.006373,-0.676458,,,...,2.838853,0.499251,3.033732,1.513488,4.397532,1.266037,3.856384,1.013469,0,3
4,0.138531,1,-3.172026,-3.093182,-0.161518,-0.128149,-0.195006,-0.14378,,,...,0.34485,4.101145,0.614252,6.623456,0.800129,5.233243,0.362636,3.926633,0,4


In [12]:
# Look at the expected submission layout again (ts_id, action).
sample_prediction_df.head(n=5)

Unnamed: 0,ts_id,action
0,0,1
1,1,1
2,2,1
3,3,1
4,4,1


In [13]:
# Create the janestreet environment and obtain its test iterator; the iterator
# is consumed by the prediction loop in the next cell.
env = janestreet.make_env()
iter_test = env.iter_test()

In [14]:
%%time

for (test, sample_prediction_df) in iter_test:
    test = test.fillna(-999)
    sample_prediction_df['action'] = model.predict(test[features])
    env.predict(sample_prediction_df)

CPU times: user 19min 54s, sys: 37.5 s, total: 20min 32s
Wall time: 5min 21s
