# Module

In [1]:
import gc
import optuna
import janestreet
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Dataset

In [2]:
%%time

train  = pd.read_csv('/kaggle/input/jane-street-market-prediction/train.csv')

print(train.shape)
train.head()

(2390491, 138)
CPU times: user 1min 28s, sys: 6.06 s, total: 1min 34s
Wall time: 2min 27s


Unnamed: 0,date,weight,resp_1,resp_2,resp_3,resp_4,resp,feature_0,feature_1,feature_2,...,feature_121,feature_122,feature_123,feature_124,feature_125,feature_126,feature_127,feature_128,feature_129,ts_id
0,0,0.0,0.009916,0.014079,0.008773,0.00139,0.00627,1,-1.872746,-2.191242,...,,1.168391,8.313583,1.782433,14.018213,2.653056,12.600292,2.301488,11.445807,0
1,0,16.673515,-0.002828,-0.003226,-0.007319,-0.011114,-0.009792,-1,-1.349537,-1.704709,...,,-1.17885,1.777472,-0.915458,2.831612,-1.41701,2.297459,-1.304614,1.898684,1
2,0,0.0,0.025134,0.027607,0.033406,0.03438,0.02397,-1,0.81278,-0.256156,...,,6.115747,9.667908,5.542871,11.671595,7.281757,10.060014,6.638248,9.427299,2
3,0,0.0,-0.00473,-0.003273,-0.000461,-0.000476,-0.0032,-1,1.174378,0.34464,...,,2.838853,0.499251,3.033732,1.513488,4.397532,1.266037,3.856384,1.013469,3
4,0,0.138531,0.001252,0.002165,-0.001215,-0.006219,-0.002604,1,-3.172026,-3.093182,...,,0.34485,4.101145,0.614252,6.623456,0.800129,5.233243,0.362636,3.926633,4


# Preprocess

In [3]:
train = train[train['weight'] != 0]

print(train.isnull().sum()[1:7])

weight    0
resp_1    0
resp_2    0
resp_3    0
resp_4    0
resp      0
dtype: int64


In [4]:
# train['action'] = ((train['weight'].values * train['resp'].values) > 0).astype('int')
train['action'] = (train['resp'] > 0) * 1

train['action'].head()

1    0
4    0
6    1
7    1
8    0
Name: action, dtype: int64

In [5]:
train = train.fillna(0.5)

print('Total Null: ', train.isnull().sum().sum())

Total Null:  0


In [6]:
features = train.columns[train.columns.str.contains('feature')]
target = 'action'

print(len(features))
print(features[:2], '...', features[128:])

130
Index(['feature_0', 'feature_1'], dtype='object') ... Index(['feature_128', 'feature_129'], dtype='object')


In [7]:
X_train = train[features]
y_train = train[target]

del(train, features, target)
gc.collect()

89

# Modeling

In [8]:
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, random_state=666, test_size=0.2)

print(X_tr.shape)
print(y_val.shape)
print(X_tr.shape)
print(y_val.shape)

(1585029, 130)
(396258,)
(1585029, 130)
(396258,)


In [9]:
'''
%%time

def create_model(trial):
    n_estimators = trial.suggest_int("n_estimators", 50, 500)
    max_depth = trial.suggest_int('max_depth', 3, 8)
    learning_rate = trial.suggest_uniform('learning_rate', 0.0001, 0.99)
    subsample = trial.suggest_uniform('subsample', 0.1, 1.0)
    colsample_bytree = trial.suggest_uniform('colsample_bytree', 0.1, 1.0)
    tree_method = 'gpu_hist'
    missing = -999
    random_state = 666
    
    model = xgb.XGBClassifier(
        n_estimators=n_estimators, 
        max_depth=max_depth, 
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        tree_method='gpu_hist',
        missing=-999,
        random_state=666)
        
    return model

def objective(trial):
    model = create_model(trial)
    model.fit(X_tr, y_tr)
    y_proba = model.predict_proba(X_val)[:,1]
    score = roc_auc_score(y_val, y_proba)
    return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=40)
params = study.best_params

print(params)

'''

'\n%%time\n\ndef create_model(trial):\n    n_estimators = trial.suggest_int("n_estimators", 50, 500)\n    max_depth = trial.suggest_int(\'max_depth\', 3, 8)\n    learning_rate = trial.suggest_uniform(\'learning_rate\', 0.0001, 0.99)\n    subsample = trial.suggest_uniform(\'subsample\', 0.1, 1.0)\n    colsample_bytree = trial.suggest_uniform(\'colsample_bytree\', 0.1, 1.0)\n    tree_method = \'gpu_hist\'\n    missing = -999\n    random_state = 666\n    \n    model = xgb.XGBClassifier(\n        n_estimators=n_estimators, \n        max_depth=max_depth, \n        learning_rate=learning_rate,\n        subsample=subsample,\n        colsample_bytree=colsample_bytree,\n        tree_method=\'gpu_hist\',\n        missing=-999,\n        random_state=666)\n        \n    return model\n\ndef objective(trial):\n    model = create_model(trial)\n    model.fit(X_tr, y_tr)\n    y_proba = model.predict_proba(X_val)[:,1]\n    score = roc_auc_score(y_val, y_proba)\n    return score\n\nstudy = optuna.create_

In [10]:
params = {'n_estimators': 483,
          'max_depth': 7,
          'learning_rate': 0.36530710758634694,
          'subsample': 0.9285431842666421,
          'colsample_bytree': 0.9936719249552045,
          'tree_method': 'gpu_hist',
          'missing': -999,
          'random_state': 666}

In [11]:
del(X_tr, X_val, y_tr, y_val)
gc.collect()

80

In [12]:
%%time

cls = xgb.XGBClassifier(**params)
cls.fit(X_train, y_train)

CPU times: user 33.7 s, sys: 4.45 s, total: 38.1 s
Wall time: 38.9 s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.9936719249552045, gamma=0,
              gpu_id=0, importance_type='gain', interaction_constraints='',
              learning_rate=0.36530710758634694, max_delta_step=0, max_depth=7,
              min_child_weight=1, missing=-999,
              monotone_constraints='(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)',
              n_estimators=483, n_jobs=0, num_parallel_tree=1, random_state=666,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=0.9285431842666421, tree_method='gpu_hist',
              validate_parameters=1, verbosity=None)

In [13]:
del(X_train, y_train, params)
gc.collect()

20

# Submit

In [14]:
env = janestreet.make_env()
iter_test = env.iter_test()

In [15]:
'''
%%time

for (test, sample_prediction) in iter_test:
    test = test.fillna(0.5)
    sample_prediction['action'] = cls.predict(test[features])
    env.predict(sample_prediction)
    
'''

"\n%%time\n\nfor (test, sample_prediction) in iter_test:\n    test = test.fillna(0.5)\n    sample_prediction['action'] = cls.predict(test[features])\n    env.predict(sample_prediction)\n    \n"

In [16]:
%%time

for (test, sample_prediction) in iter_test:
    test = test.fillna(0.5)
    
    features = test.columns[test.columns.str.contains('feature')]
    X_test = test[features]
    
    if test['weight'].item() > 0:
        sample_prediction['action'] = cls.predict(X_test)
    else:
        sample_prediction['action'] = 0
    
    env.predict(sample_prediction)

CPU times: user 3min 59s, sys: 420 ms, total: 4min
Wall time: 4min 1s
