### adapted from https://www.kaggle.com/aimind/bottleneck-encoder-mlp-keras-tuner-8601c5


### *** todo fix train/submission switches

In [None]:
# Network for Jane Street Market Prediction on Kaggle
# https://www.kaggle.com/c/jane-street-market-prediction
# https://www.kaggle.com/wrinkledtime
# https://github.com/timestocome

In [None]:
# The Jane Street competition has blinded data and the goal is to predict stock market winners 6 months from the start

In [1]:
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, Concatenate, Lambda, GaussianNoise, Activation
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import AUC
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers.experimental.preprocessing import Normalization
import tensorflow as tf

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 150)


from sklearn.model_selection import GroupKFold, KFold, TimeSeriesSplit
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix


from tqdm import tqdm
from random import choices


import matplotlib.pyplot as plt



In [2]:

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Read the competition training table into a DataFrame.
train = pd.read_csv('/kaggle/input/jane-street-market-prediction/train.csv')

/kaggle/input/jane-street-market-prediction/example_sample_submission.csv
/kaggle/input/jane-street-market-prediction/features.csv
/kaggle/input/jane-street-market-prediction/example_test.csv
/kaggle/input/jane-street-market-prediction/train.csv
/kaggle/input/jane-street-market-prediction/janestreet/competition.cpython-37m-x86_64-linux-gnu.so
/kaggle/input/jane-street-market-prediction/janestreet/__init__.py


In [3]:

# ---- run configuration ----
TRAINING = True   # True: fit the autoencoder and save its weights; False: load 'encoder.hdf5' instead
FOLDS = 5         # NOTE(review): not referenced by any later cell in this notebook
SEED = 42

# Seed the NumPy and TensorFlow global generators so that repeated runs start
# from the same random state (SEED was previously defined but never applied).
np.random.seed(SEED)
tf.random.set_seed(SEED)


# drop rows that might skew the data
train = train.query('date > 85').reset_index(drop=True)

# reduce memory footprint: store every float64 column as float32
float64_cols = train.select_dtypes(include='float64').columns
train = train.astype({c: np.float32 for c in float64_cols})

# replace missing values with the column mean
# (the means are taken here, i.e. before the zero-weight rows are removed)
train = train.fillna(train.mean())

# drop rows with a weight of 0
train = train.query('weight > 0').reset_index(drop=True)


# return columns used as targets
resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp', 'resp_4']

# only set action to buy if all five returns are above the threshold
# NOTE(review): this uses 1e-5 while the multi-target labels below use 1e-6 - confirm this is intended
train['action'] = (train[resp_cols] > 0.00001).all(axis=1).astype('int')


# collect feature columns, should probably drop feature_0
features = [c for c in train.columns if 'feature' in c]


# convert inputs to numpy
X = train[features].values


# one binary target per return column (1 when that return is above the threshold);
# columns follow the order of resp_cols
y = (train[resp_cols] > 0.000001).astype('int').values   # Multitarget


# Per-feature means (every feature except the first) used to fill NaNs at inference time
f_mean = np.mean(train[features[1:]].values, axis=0)

In [4]:
# Deep Bottleneck Classifiers in Supervised Dimension Reduction
# https://projet.liris.cnrs.fr/imagine/pub/proceedings/ICANN-2010/papers/6354/63540001.pdf

def create_autoencoder(input_dim, output_dim, noise=0.05):
    """Build the supervised (bottleneck) autoencoder and its encoder sub-model.

    Layer stack:
      * encoder : BatchNormalization -> GaussianNoise(noise) -> Dense(640, relu)
      * decoder : Dropout(0.2) -> Dense(input_dim), output named 'decoded'
      * label head, fed by the reconstruction:
                  Dense(320, relu) -> BatchNormalization -> Dropout(0.2)
                  -> Dense(output_dim, sigmoid), output named 'label_output'

    Args:
        input_dim: number of input features (also the width of the reconstruction).
        output_dim: number of binary targets predicted by the label head.
        noise: standard deviation passed to the GaussianNoise layer.

    Returns:
        (autoencoder, encoder) where ``autoencoder`` maps the input to
        ``[decoded, label_output]`` and is compiled with MSE on 'decoded' and
        binary cross-entropy on 'label_output', and ``encoder`` maps the input
        to the 640-unit encoded representation. Both models share layers, so
        fitting ``autoencoder`` also updates ``encoder``.
    """
    # Explicit shape tuple, matching create_model, instead of relying on
    # Keras coercing a bare int into a shape.
    inputs = Input(shape=(input_dim,))

    # encoder: normalise, corrupt with noise, project to the encoded width
    encoded = BatchNormalization()(inputs)
    encoded = GaussianNoise(noise)(encoded)
    encoded = Dense(640, activation='relu')(encoded)

    # decoder: reconstruct the original input from the encoding
    decoded = Dropout(0.2)(encoded)
    decoded = Dense(input_dim, name='decoded')(decoded)

    # label head: predict the targets from the reconstruction
    x = Dense(320, activation='relu')(decoded)
    x = BatchNormalization()(x)
    x = Dropout(0.2)(x)
    x = Dense(output_dim, activation='sigmoid', name='label_output')(x)

    encoder = Model(inputs=inputs, outputs=encoded)
    autoencoder = Model(inputs=inputs, outputs=[decoded, x])

    autoencoder.compile(
        optimizer=Adam(learning_rate=0.001),
        loss={'decoded': 'mse', 'label_output': 'binary_crossentropy'},
    )
    return autoencoder, encoder

In [5]:
# Fit the autoencoder once and persist the encoder; later runs with
# TRAINING switched off only reload the saved encoder weights.

autoencoder, encoder = create_autoencoder(X.shape[-1], y.shape[-1], noise=0.1)

if not TRAINING:
    encoder.load_weights('encoder.hdf5')
else:
    # Stop once the validation loss has not improved for 10 epochs and
    # roll back to the best weights seen.
    stop_early = EarlyStopping('val_loss', patience=10, restore_best_weights=True)
    autoencoder.fit(
        X,
        (X, y),
        epochs=1002,
        batch_size=16384,
        validation_split=0.1,
        callbacks=[stop_early],
    )
    encoder.save_weights('encoder.hdf5')
    encoder.save('saved_model_encoder.hdf5')

# The downstream classifier uses the encoder as a fixed feature extractor.
encoder.trainable = False

Epoch 1/1002
Epoch 2/1002
Epoch 3/1002
Epoch 4/1002
Epoch 5/1002
Epoch 6/1002
Epoch 7/1002
Epoch 8/1002
Epoch 9/1002
Epoch 10/1002
Epoch 11/1002
Epoch 12/1002
Epoch 13/1002
Epoch 14/1002
Epoch 15/1002
Epoch 16/1002
Epoch 17/1002
Epoch 18/1002
Epoch 19/1002
Epoch 20/1002
Epoch 21/1002
Epoch 22/1002
Epoch 23/1002
Epoch 24/1002
Epoch 25/1002
Epoch 26/1002
Epoch 27/1002
Epoch 28/1002
Epoch 29/1002
Epoch 30/1002
Epoch 31/1002
Epoch 32/1002
Epoch 33/1002
Epoch 34/1002
Epoch 35/1002
Epoch 36/1002
Epoch 37/1002
Epoch 38/1002
Epoch 39/1002
Epoch 40/1002
Epoch 41/1002
Epoch 42/1002
Epoch 43/1002
Epoch 44/1002
Epoch 45/1002
Epoch 46/1002
Epoch 47/1002
Epoch 48/1002
Epoch 49/1002
Epoch 50/1002
Epoch 51/1002
Epoch 52/1002
Epoch 53/1002
Epoch 54/1002
Epoch 55/1002
Epoch 56/1002
Epoch 57/1002


In [6]:
# encode inputs using the trained bottleneck encoder
# concatenate the encoded and the raw input features
# batch normalization, dropout
# dense layer, batch normalization, dropout - swish is a smooth, non-monotonic alternative to relu
# 5 targets (one per return column), each a sigmoid output trained with binary cross-entropy
#   *** not sure classification is the right framing - try regressing the raw returns with an mse loss
# *** add label smoothing?

def create_model(input_dim, output_dim, encoder, lr=0.0001,
                 hidden_units=256, dropout=0.5, n_hidden_layers=1,
                 label_smoothing=0.0):
    """Build and compile the multi-label classifier that sits on top of the encoder.

    The raw inputs are passed through ``encoder``; its output is concatenated
    with the raw inputs, batch-normalised and regularised with dropout, then
    fed through ``n_hidden_layers`` blocks of
    Dense(swish) -> BatchNormalization -> Dropout and a sigmoid output layer.

    With the default keyword values the network is identical to the original
    fixed architecture (one 256-unit block, dropout 0.5, no label smoothing).

    Args:
        input_dim: number of raw input features.
        output_dim: number of binary targets (one sigmoid unit each).
        encoder: Keras model applied to the inputs; whether it is trained
            depends on its ``trainable`` flag, which this function does not change.
        lr: Adam learning rate.
        hidden_units: width of each hidden Dense layer.
        dropout: dropout rate used after the input block and every hidden block.
        n_hidden_layers: number of Dense/BatchNormalization/Dropout blocks.
        label_smoothing: value forwarded to ``BinaryCrossentropy``.

    Returns:
        A compiled Keras ``Model`` tracking an AUC metric named 'auc'.
    """
    inputs = Input(shape=(input_dim, ))

    x = encoder(inputs)
    x = Concatenate()([x, inputs])         # use both raw and encoded features
    x = BatchNormalization()(x)
    x = Dropout(dropout)(x)

    for _ in range(n_hidden_layers):
        x = Dense(hidden_units, activation='swish')(x)
        x = BatchNormalization()(x)
        x = Dropout(dropout)(x)

    outputs = Dense(output_dim, activation='sigmoid')(x)

    model = Model(inputs=inputs, outputs=outputs)

    model.compile(Adam(learning_rate=lr),
                  loss=BinaryCrossentropy(label_smoothing=label_smoothing),
                  metrics=[tf.keras.metrics.AUC(name='auc')])

    return model

In [7]:
# Walk-forward training: split the rows in time order, fit one model per fold,
# then fine-tune that model on the fold's validation rows.

# Splitter settings. These are now actually passed to TimeSeriesSplit; before,
# the splitter was created with defaults ahead of these assignments, so
# editing them (e.g. raising `gap`) had no effect.
# test_size and gap require scikit-learn >= 0.24.
n_splits = 5
max_train_size = None
test_size = None
gap = 0     # *** ? raise this to prevent info bleed

tscv = TimeSeriesSplit(n_splits=n_splits,
                       max_train_size=max_train_size,
                       test_size=test_size,
                       gap=gap)

seed = SEED   # only used to tag the saved file names

# The data is cut into n_splits + 1 equal chunks: fold 0 trains on the first
# chunk and validates on the second, fold 1 trains on the first two chunks, ...
# note that by splitting the data this way the network will skew towards newer information
for fold, (train_index, test_index) in enumerate(tscv.split(X)):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    n_inputs = X_train.shape[1]
    n_outputs = y_train.shape[1]

    print('train idx, test idx', train_index, test_index)
    print('train/test', X_train.shape, X_test.shape, y_train.shape, y_test.shape)

    # A fresh classifier per fold; early stopping keeps the weights with the best validation AUC.
    model = create_model(n_inputs, n_outputs, encoder)
    history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, batch_size=4096,
              callbacks=[EarlyStopping(monitor='val_auc', mode='max', patience=5, restore_best_weights=True)])
    model.save(f'model_{seed}_{fold}.hdf5')

    # Recompile with a 10x smaller learning rate and fine tune on the fold's
    # validation rows (the most recent data this fold has seen).
    # `learning_rate=` replaces the deprecated `lr=` alias, matching create_model.
    model.compile(Adam(learning_rate=0.00001), loss=BinaryCrossentropy(), metrics=[tf.keras.metrics.AUC(name='auc')])
    ft_history = model.fit(X_test, y_test, epochs=3, batch_size=4096)
    model.save(f'fine_tuned_model{seed}_{fold}.hdf5')

    # TODO: if training, hold out the last batch of data to check accuracy
    #if TRAINING:
    #    if fold >= (n_splits):
    #        print('Stopping on fold %d to preserve test set' % fold)

# After the loop, `model` is the fine-tuned model of the last fold.
    

train idx, test idx [     0      1      2 ... 261902 261903 261904] [261905 261906 261907 ... 523804 523805 523806]
train/test (261905, 130) (261902, 130) (261905, 5) (261902, 5)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/10

In [8]:
# List everything written to the Kaggle working directory to confirm
# that the model files were saved.

saved_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/working')
    for name in names
)
for path in saved_paths:
    print(path)

/kaggle/working/model_42_2.hdf5
/kaggle/working/fine_tuned_model42_0.hdf5
/kaggle/working/model_42_0.hdf5
/kaggle/working/model_42_3.hdf5
/kaggle/working/__notebook__.ipynb
/kaggle/working/fine_tuned_model42_4.hdf5
/kaggle/working/fine_tuned_model42_1.hdf5
/kaggle/working/model_42_4.hdf5
/kaggle/working/fine_tuned_model42_2.hdf5
/kaggle/working/saved_model_encoder.hdf5
/kaggle/working/fine_tuned_model42_3.hdf5
/kaggle/working/model_42_1.hdf5
/kaggle/working/encoder.hdf5


In [9]:
threshold = 0.5


def _auc_or_nan(labels, scores):
    """Return ROC-AUC, or NaN when `labels` holds a single class (AUC is undefined there)."""
    if np.unique(labels).size < 2:
        return float('nan')
    return roc_auc_score(labels, scores)


def check_scores(model, X_tr, X_val, y_tr, y_val):
    """Print train/validation ROC-AUC and the validation confusion matrix.

    The per-target model outputs are averaged into one score per row. The
    multi-target labels are collapsed the same way: a row counts as positive
    when the mean of its targets exceeds the module-level ``threshold``.

    ROC-AUC is computed from the averaged probabilities (not from thresholded
    0/1 predictions, which would throw away the ranking the metric measures).
    The confusion matrix uses the predictions thresholded at ``threshold``.

    Args:
        model: object whose ``predict(X)`` returns an array of shape (rows, targets).
        X_tr, X_val: training / validation inputs.
        y_tr, y_val: training / validation targets, one column per target.

    Returns:
        dict with keys 'train_auc', 'val_auc' and 'confusion_matrix'
        (2x2, rows = true class, columns = predicted class).
    """
    prob_val = model.predict(X_val).mean(axis=1)
    prob_train = model.predict(X_tr).mean(axis=1)

    # hard decisions are only needed for the confusion matrix
    predict_val = np.where(prob_val > threshold, 1, 0).astype('int')

    y_tr = np.where(y_tr.mean(axis=1) > threshold, 1, 0).astype('int')
    y_val = np.where(y_val.mean(axis=1) > threshold, 1, 0).astype('int')

    train_auc = _auc_or_nan(y_tr, prob_train)
    print('\n\nTrain score', train_auc)

    val_auc = _auc_or_nan(y_val, prob_val)
    print('Validation score', val_auc)

    # labels=[0, 1] keeps the matrix 2x2 even if one class is missing,
    # so the indexing below cannot go out of range.
    cm = confusion_matrix(y_val, predict_val, labels=[0, 1])
    print('\n\ntrue n %d, false p %d, false n %d, true p %d' %(cm[0][0], cm[0][1], cm[1][0], cm[1][1]))
    print(cm)

    return {'train_auc': train_auc, 'val_auc': val_auc, 'confusion_matrix': cm}
    

In [10]:
# Disabled sanity check: the code below is wrapped in a bare string literal, so
# none of it runs; the cell only evaluates to (and displays) the string itself.
# NOTE(review): inside the string, n_train / n_test / train_idx / test_idx are
# computed but never used to slice the data - if re-enabled as written it would
# score whatever X_train / X_test / y_train / y_test are currently in scope.
'''
# if training, pull out unseen test data and see if it looks okay ( not all 1s or 0s) and is close to training accuracy
if TRAINING:

    # split into test/train for sanity checking
    n_train = len(train)
    n_test = int(n_train * .1)
    train_idx = 0
    test_idx = n_train - n_test

    print(X_train.shape, y_train.shape)
    print(X_test.shape, y_test.shape)

    n_inputs = X_train.shape[1]
    n_outputs = y_train.shape[1]


    check_scores(model, X_train, X_test, y_train, y_test)
'''

'\n# if training, pull out unseen test data and see if it looks okay ( not all 1s or 0s) and is close to training accuracy\nif TRAINING:\n\n    # split into test/train for sanity checking\n    n_train = len(train)\n    n_test = int(n_train * .1)\n    train_idx = 0\n    test_idx = n_train - n_test\n\n    print(X_train.shape, y_train.shape)\n    print(X_test.shape, y_test.shape)\n\n    n_inputs = X_train.shape[1]\n    n_outputs = y_train.shape[1]\n\n\n    check_scores(model, X_train, X_test, y_train, y_test)\n'

In [11]:
# *** load saved model here? 
# Disabled copy of the submission loop: everything below is inside a bare
# string literal, so it does not execute; the cell only displays the string.
# NOTE(review): inside the string, the example_test.csv frame assigned to
# test_df is immediately replaced by the loop variable of env.iter_test().


'''

if TRAINING:
    
    test_df = pd.read_csv('/kaggle/input/jane-street-market-prediction/example_test.csv')
    test_df = test_df.astype({c: np.float32 for c in train.select_dtypes(include='float64').columns}) #limit memory use

    
    
    import janestreet
    env = janestreet.make_env()
    th = 0.5

    for (test_df, pred_df) in env.iter_test():

        if test_df['weight'].item() > 0:
            # fetch a row, convert to numpy array
            x_tt = test_df.loc[:, features].values

            # if nan use mean
            if np.isnan(x_tt[:, 1:].sum()):
                x_tt[:, 1:] = np.nan_to_num(x_tt[:, 1:]) + np.isnan(x_tt[:, 1:]) * f_mean

            # use model to make prediction
            preds = model(x_tt)

            # model targets all 5 returns, get median return prediction
            preds = np.median(preds, axis=1)  

            # compare predicted return to threshold and buy if over threshold
            pred_df.action = np.where(preds >= th, 1, 0).astype(int)
        else:
            # if weight 0 pass on this one
            pred_df.action = 0

        env.predict(pred_df)
'''

"\n\nif TRAINING:\n    \n    test_df = pd.read_csv('/kaggle/input/jane-street-market-prediction/example_test.csv')\n    test_df = test_df.astype({c: np.float32 for c in train.select_dtypes(include='float64').columns}) #limit memory use\n\n    \n    \n    import janestreet\n    env = janestreet.make_env()\n    th = 0.5\n\n    for (test_df, pred_df) in env.iter_test():\n\n        if test_df['weight'].item() > 0:\n            # fetch a row, convert to numpy array\n            x_tt = test_df.loc[:, features].values\n\n            # if nan use mean\n            if np.isnan(x_tt[:, 1:].sum()):\n                x_tt[:, 1:] = np.nan_to_num(x_tt[:, 1:]) + np.isnan(x_tt[:, 1:]) * f_mean\n\n            # use model to make prediction\n            preds = model(x_tt)\n\n            # model targets all 5 returns, get median return prediction\n            preds = np.median(preds, axis=1)  \n\n            # compare predicted return to threshold and buy if over threshold\n            pred_df.action = n

In [12]:
# *** load saved model here?
# encoder = tf.keras.models.load_model('saved_model_encoder.hdf5')
# model = tf.keras.models.load_model('fine_tuned_model42_4.hdf5')   # folds are numbered 0-4
#
# As written, `model` is whatever the training loop left in scope,
# i.e. the fine-tuned model of the last fold.

Submission = True

if Submission:

    # competition API module shipped with the Kaggle input data
    import janestreet
    env = janestreet.make_env()
    th = 0.5   # buy when the median predicted probability reaches this value

    for (test_df, pred_df) in env.iter_test():

        if test_df['weight'].item() > 0:
            # fetch a row, convert to numpy array
            x_tt = test_df.loc[:, features].values

            # fill missing values (every feature except the first) with the training means
            tail = x_tt[:, 1:]
            if np.isnan(tail.sum()):
                x_tt[:, 1:] = np.where(np.isnan(tail), f_mean, tail)

            # use model to make prediction; training=False makes inference mode
            # explicit (dropout and Gaussian noise off, batch-norm moving statistics)
            preds = model(x_tt, training=False)

            # model targets all 5 returns, take the median of the 5 predicted probabilities
            preds = np.median(preds, axis=1)

            # compare the median probability to the threshold and buy if it is reached
            pred_df.action = np.where(preds >= th, 1, 0).astype(int)
        else:
            # if weight 0 pass on this one
            pred_df.action = 0

        env.predict(pred_df)
