### JaneStreet
### Plain NN


In [None]:
# Network for Jane Street Market Prediction on Kaggle
# https://www.kaggle.com/c/jane-street-market-prediction
# https://www.kaggle.com/wrinkledtime
# https://github.com/timestocome

In [None]:
# The Jane Street competition has blinded data and the goal is to predict stock market winners 6 months from the start

In [1]:
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, Activation
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers.experimental.preprocessing import Normalization
import tensorflow as tf
import numpy as np
import pandas as pd

from sklearn.metrics import roc_auc_score, confusion_matrix

In [2]:

TRAINING = True
SUBMISSION = True
TESTING = False
KAGGLE = True


In [3]:

if not KAGGLE:
    from google.colab import drive 
    drive.mount('/content/drive')

In [4]:
if not KAGGLE:
    train = pd.read_csv('/content/drive/MyDrive/Kaggle Stock Prediction/train.csv')
else:
    train = pd.read_csv('/kaggle/input/jane-street-market-prediction/train.csv')


In [5]:
features = [c for c in train.columns if 'feature' in c]
resps = [c for c in train.columns if 'resp_' in c]


# remove 0 weight rows, they will always be zero for buys and it decreases accuracy
train = train[train['weight'] > 0]


# ffill is too slow, nan probably represents non-trading days
#train.fillna(train.mean(),inplace=True)
#f_mean = np.mean(train[features[1:]].values,axis=0)
train.fillna(0, inplace=True)


# reduce memory use
train = train.astype({c: np.float32 for c in train.select_dtypes(include='float64').columns}) 

# setup targets
train['action'] =  (  (train['resp_1'] > 0 ) & (train['resp_2'] > 0 ) & (train['resp_3'] > 0 ) & (train['resp_4'] > 0 ) ).astype('int')


In [6]:
print(train.columns)

Index(['date', 'weight', 'resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp',
       'feature_0', 'feature_1', 'feature_2',
       ...
       'feature_122', 'feature_123', 'feature_124', 'feature_125',
       'feature_126', 'feature_127', 'feature_128', 'feature_129', 'ts_id',
       'action'],
      dtype='object', length=139)


In [7]:
# date shuffle
# These are stock purchases in time with days running from 0..499
# to prevent information bleed days must be grouped when shuffling data
# if data is not shuffled then the current market trends will be built in and 
#    market trends may not hold


max_date = train['date'].max()


# create a range from first date to last, shuffle then split into two parts
def get_dates():

    dates = np.arange(0, max_date+1)
    np.random.shuffle(dates)

    train_dates = dates[0:440]
    test_dates = dates[440:-1]

    return train_dates, test_dates




# collect shuffled dates and split data based on date
def get_batch(features, df, shuffle):

    if shuffle:
        tr_dates, test_dates = get_dates()

        test = df[df['date'].isin(test_dates)]
        tr = df[df['date'].isin(tr_dates)]

    else:
        test = df[df['date'] > 420]
        tr = df[df['date'] <= 420]


    #0.0001 to cover rounding errors on 0. floats
    X_train = tr[features] 
    y_train = np.stack([(tr[c] > 0.).astype('int') for c in resps]).T

    X_val = test[features]
    y_val = np.stack([(test[c] > 0.).astype('int') for c in resps]).T
    
    print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

    return X_train, y_train, X_val, y_val


In [8]:

def check_scores(reg, X_tr, X_val, y_tr, y_val, th=0.5):

    predict_train = reg.predict(X_tr)
    predict_train = np.where(predict_train.mean(axis=1) > th, 1, 0)
    y_tr = np.where(y_tr.mean(axis=1) > th, 1, 0)

    score = roc_auc_score(y_tr, predict_train)
    print('\nTrain score %.2f' % (score * 100.))


    predict_val = reg.predict(X_val)
    predict_val = np.where(predict_val.mean(axis=1) > th, 1, 0)
    y_val = np.where(y_val.mean(axis=1) > th, 1, 0)



    score = roc_auc_score(y_val, predict_val)
    print('Validation score %.2f' % (score* 100.))


    print('\ntotal buys train %d, validate %d' % (predict_val.sum(), y_val.sum()))
    cm = confusion_matrix(y_val, predict_val)
    print('true n, false p, false n, true p')
    print(cm)
    correct = cm[0][0] + cm[1][1]
    print('\ncorrect %d %.2f%% \n' % (correct, correct/len(predict_val)* 100.))

    print('val', y_tr[0:20])
    print('predicted', predict_val[0:30])

    return predict_val

In [9]:



def create_model(num_columns=130, num_labels=5, hidden_units=160, dropout_rates=.25, label_smoothing=1e-2, learning_rate=1e-3):

    inp = tf.keras.layers.Input(shape=(num_columns,))
    x = tf.keras.layers.BatchNormalization()(inp)
    x = tf.keras.layers.Dropout(dropout_rates)(x)

    x = tf.keras.layers.Dense(hidden_units)(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Activation(tf.keras.activations.swish)(x)
    x = tf.keras.layers.Dropout(dropout_rates)(x)

    x = tf.keras.layers.Dense(hidden_units)(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Activation(tf.keras.activations.swish)(x)
    x = tf.keras.layers.Dropout(dropout_rates)(x)

    x = tf.keras.layers.Dense(hidden_units)(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Activation(tf.keras.activations.swish)(x)
    x = tf.keras.layers.Dropout(dropout_rates)(x)


    x = tf.keras.layers.Dense(num_labels)(x)
    out = tf.keras.layers.Activation("sigmoid")(x)

    model = tf.keras.models.Model(inputs=inp, outputs=out)

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing),
        metrics=tf.keras.metrics.AUC(name="AUC"),
    )

    return model



In [10]:
X_train, y_train, X_val, y_val = get_batch(features, train, True)
n_features = len(features)
n_targets = len(resps)

(1746766, 130) (1746766, 4) (230957, 130) (230957, 4)


In [11]:


def run_model(features, resps, df):

    X_train, y_train, X_val, y_val = get_batch(features, df, False)

    n_features = X_train.shape[-1]
    n_resps = y_train.shape[-1]

    model = create_model(n_features, n_resps)

    #print('before training')
    #check_scores(model, X_train, X_val, y_train, y_val)
    #print('----------------------------------------------------')



    history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=200, batch_size=1024, 
                callbacks=[EarlyStopping(monitor='val_AUC', mode='max', patience=20, restore_best_weights=True)])
        


    print('\nbefore fine tuning',)
    check_scores(model, X_train, X_val, y_train, y_val)
    print('----------------------------------------------------')



    # rebuild and fine tune model
    model.compile(Adam(lr=0.0001), loss=BinaryCrossentropy(), metrics=[tf.keras.metrics.AUC(name='auc')])
    ft_history = model.fit(X_val, y_val, epochs=3, batch_size=4096)
    model_fname = ('fine_tuned_model.hdf5')
    print(model_fname)
    model.save(model_fname)


    print('\nfinished scores...')
    check_scores(model, X_train, X_val, y_train, y_val)
    print('----------------------------------------------------')


    return model


In [12]:
%%time

model = run_model(features, resps, train)

(1625019, 130) (1625019, 4) (356268, 130) (356268, 4)
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200

before fine tuning

Train score 54.85
Validation score 54.31

total buys train 201741, validate 146797
true n, false p, false n, true p
[[ 98292 111179]
 [ 56235  90562]]

correct 188854 53.01% 

val [0 0 0 1 0 1 1 0 0 0 0 0 1 0 1 0 1 1 1 1]
predicted [1 1 0 1 0 1 1 1 0 0 1 0 0 0 0 0 0 1 0 1 0 0 1 0 1 0 0 0 1 0]
----------------------------------------------------
Epoch 1/3
Epoch

In [13]:
def check_thresholds():
    print('\n0.49')
    check_scores(model, X_train, X_val, y_train, y_val, th=0.490)
    print('\n0.495')
    check_scores(model, X_train, X_val, y_train, y_val, th=0.495)
    print('\n0.50')
    check_scores(model, X_train, X_val, y_train, y_val, th=0.500)
    print('\n0.505')
    check_scores(model, X_train, X_val, y_train, y_val, th=0.505)
    print('\n0.510')
    check_scores(model, X_train, X_val, y_train, y_val, th=0.510)
    print('\n0.515')
    check_scores(model, X_train, X_val, y_train, y_val, th=0.515)

if TESTING:
    check_thresholds()

In [14]:
if not KAGGLE:
    model.save('/content/drive/MyDrive/Kaggle Stock Prediction/model.hdf5')
else:
    model.save('model.hdf5')


In [15]:

if not KAGGLE:
    saved_model = tf.keras.models.load_model('/content/drive/MyDrive/Kaggle Stock Prediction/model.hdf5')
else:
    saved_model = tf.keras.models.load_model('model.hdf5')


print(saved_model.summary())

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 130)]             0         
_________________________________________________________________
batch_normalization (BatchNo (None, 130)               520       
_________________________________________________________________
dropout (Dropout)            (None, 130)               0         
_________________________________________________________________
dense (Dense)                (None, 160)               20960     
_________________________________________________________________
batch_normalization_1 (Batch (None, 160)               640       
_________________________________________________________________
activation (Activation)      (None, 160)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 160)              

In [16]:
if TESTING:
    model = saved_model
    check_scores(model, X_train, X_val, y_train, y_val, th=0.495)

In [17]:

# if training, pull out unseen test data and see if it looks okay ( not all 1s or 0s) and is close to training accuracy
if TRAINING:

    # split into test/train for sanity checking
    n_train = len(train)
    n_test = int(n_train * .1)
    train_idx = 0
    test_idx = n_train - n_test

    print(X_train.shape, y_train.shape)
    print(X_val.shape, y_val.shape)

    n_inputs = X_train.shape[1]
    n_outputs = y_train.shape[1]


    check_scores(model, X_train, X_val, y_train, y_val)


(1746766, 130) (1746766, 4)
(230957, 130) (230957, 4)

Train score 54.78
Validation score 54.91

total buys train 120921, validate 95836
true n, false p, false n, true p
[[69881 65240]
 [40155 55681]]

correct 125562 54.37% 

val [0 0 0 1 0 1 1 0 0 0 0 0 1 0 1 0 1 1 1 1]
predicted [1 1 1 1 1 0 0 1 0 1 1 1 1 0 1 0 0 0 0 1 1 1 0 1 0 1 1 0 0 1]


In [18]:
%%time

if TESTING:
    
    test_df = pd.read_csv('/kaggle/input/jane-street-market-prediction/example_test.csv')
    test_df = test_df.astype({c: np.float32 for c in train.select_dtypes(include='float64').columns})

    
    
    import janestreet
    env = janestreet.make_env()
    th = 0.515

    for (test_df, pred_df) in env.iter_test():

        if test_df['weight'].item() > 0:
            # fetch a row, convert to numpy array
            x_tt = test_df.loc[:, features]

            # remove nan
            x_tt.fillna(0, inplace=True)

            # use model to make prediction
            preds = model(x_tt.values)
           
            # compare predicted return to threshold and buy if over threshold
            pred_df['action'] = np.where(preds.mean(axis=1) >= th, 1, 0).astype(int)

        else:
            # if weight 0 pass on this one
            pred_df.action = 0

        env.predict(pred_df)

CPU times: user 6 µs, sys: 0 ns, total: 6 µs
Wall time: 10.7 µs


In [19]:


print('submission...')
def fetch_model():
    if not KAGGLE:
        saved_model = tf.keras.models.load_model('/content/drive/MyDrive/Kaggle Stock Prediction/model.hdf5')
    else:
        saved_model = tf.keras.models.load_model('model.hdf5')

    print(saved_model.summary())

    return saved_model


th = 0.515

if SUBMISSION:
        
    #model = fetch_model()

    import janestreet
    env = janestreet.make_env()
    
    
    for (test_df, pred_df) in env.iter_test():

        if test_df['weight'].item() > 0:
    
            # fetch row 
            x_tt = test_df.loc[:, features]
                
            # remove nan
            x_tt.fillna(0, inplace=True)
            
            # use model to make a prediction
            pred = model(x_tt.values)
            
            # convrt the targets down to one and see if overthreshold
            pred = np.mean(pred, axis=1)
            pred_df['action'] = np.where(pred > th, 1, 0).astype(int)

                
                
        else:
            pred_df['action'] = 0
            
            
        env.predict(pred_df)
print('finished')

submission...
finished
