In [None]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import gc

import tensorflow as tf
import keras
from keras import backend as K
from keras import layers
from keras.layers.core import Activation
from keras.utils.generic_utils import get_custom_objects

# from IPython.display import display
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

### Reading data

The original data file in CSV format is too large to load into memory in one go. If you still want to load the data from the CSV file, you can use the commented-out code snippet below, which reads it through an iterator with compact dtypes. Here we read the data from a Parquet file instead. Refer to Fast Data Loading and Low Mem with Parquet Files: https://www.kaggle.com/robikscube/fast-data-loading-and-low-mem-with-parquet-files for more details.

In [None]:
# Alternative loader (kept for reference, disabled): read the CSV through an
# iterator with explicit compact dtypes for the id, feature and target columns.
# dtypes = {'row_id': 'object', 'time_id':'int16', 'investment_id':'int16'}
# for i in range(300):
#     dtypes['f_' + str(i)] = 'float16'
# dtypes['target'] = 'float32'

# reader = pd.read_csv('../input/ubiquant-market-prediction/train.csv', dtype=dtypes, iterator=True)
# train = reader.get_chunk()

# Active loader: read the training frame from the Parquet copy of the data.
train = pd.read_parquet('../input/ubiquant-parquet/train_low_mem.parquet')

### Take a look at the data

In [None]:
# Print the column dtypes / memory summary, then show the first rows as the
# cell output.
train.info()
train.head()

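To reduce memory usage, it is possible to convert float32 columns to float16. Be careful, though: check the range of values in each column before converting, because float16 can only represent magnitudes up to about 65504 and has much lower precision than float32.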

In [None]:
def get_colums_by_dtype(df, dtype):
    """Return a list with the labels of every column of ``df`` whose dtype equals ``dtype``.

    ``dtype`` may be anything that compares equal to a column dtype, e.g. the
    string ``'float32'``. Column order is preserved; an empty list is returned
    when no column matches.
    """
    # Boolean flag per column: True where the column dtype matches.
    is_match = df.dtypes.eq(dtype)
    return list(df.columns[is_match.to_numpy()])

# Downcast every float32 column of `train` to float16, one column at a time,
# replacing the column in place. Values outside the float16 range would
# overflow, so the column ranges should be checked before relying on this.
for col in get_colums_by_dtype(train, 'float32'):
    train[col] = train[col].astype(np.float16)

### Splitting data

In [None]:
# Remove the id and target columns from `train` in place; whatever columns are
# left in `train` are later passed as the `features` input of the dataset.
# NOTE: each pop mutates `train`, so re-running this cell raises KeyError.
train.pop("row_id")  # discarded: the popped values are not kept
time_id = train.pop("time_id")
investment_id = train.pop("investment_id")
y = train.pop("target")

Making dataset for multiple input models

In [None]:
def make_train_test_dataset(time_id, investment_id, features, y=None, train_split=0.8, test_split=0.2, batch_size=1024, shuffle=False):
    """Build batched train / test ``tf.data`` pipelines for the multi-input model.

    The rows are split in their existing order: the first ``train_split``
    fraction goes to the train dataset and the rows after it to the test
    dataset.

    Args:
        time_id: per-row time ids (first model input).
        investment_id: per-row investment ids (second model input).
        features: per-row feature values (third model input).
        y: optional per-row targets. When given, each element is
            ``((time_id, investment_id, features), y)``; otherwise it is
            ``(time_id, investment_id, features)``.
        train_split: fraction of rows used for the train dataset.
        test_split: fraction of rows used for the test dataset. When
            ``train_split + test_split`` covers the whole data (the default),
            the test dataset is every row after the train rows; when the two
            fractions sum to less than 1, only ``int(test_split * n_rows)``
            rows after the train rows are used.
        batch_size: number of rows per batch.
        shuffle: if True, shuffle the train rows (buffer of 1024, seed 12).

    Returns:
        ``(train_ds, test_ds)``, both batched and prefetched.
    """
    if y is not None:
        slices = ((time_id, investment_id, features), y)
    else:
        slices = (time_id, investment_id, features)
    ds = tf.data.Dataset.from_tensor_slices(slices)

    n_rows = len(time_id)
    train_size = int(train_split * n_rows)
    train_ds = ds.take(train_size)
    test_ds = ds.skip(train_size)
    # Honour `test_split` when the caller asks for less than the remainder.
    # The small tolerance keeps splits such as 0.8 + 0.2 on the "use every
    # remaining row" path despite floating-point rounding.
    if train_split + test_split < 1.0 - 1e-9:
        test_ds = test_ds.take(int(test_split * n_rows))

    if shuffle:
        # Do not cache after shuffling: a cache placed behind `shuffle` stores
        # the first epoch's order and replays it, so later epochs would never
        # be reshuffled.
        train_ds = train_ds.shuffle(buffer_size=1024, seed=12).batch(batch_size)
    else:
        train_ds = train_ds.batch(batch_size).cache()
    train_ds = train_ds.prefetch(tf.data.experimental.AUTOTUNE)
    test_ds = test_ds.batch(batch_size).cache().prefetch(tf.data.experimental.AUTOTUNE)

    return train_ds, test_ds

### Building models
#### Define models

In [None]:
time_id_unique = time_id.unique()
time_id_size = len(time_id_unique)
time_id_lookup_layer = layers.IntegerLookup(max_tokens=time_id_size)

investment_id_unique = investment_id.unique()
investment_id_size = len(time_id_unique)
investment_id_lookup_layer = layers.IntegerLookup(max_tokens=investment_id_size)

with tf.device("cpu"):
    time_id_lookup_layer.adapt(time_id)
    investment_id_lookup_layer.adapt(investment_id)

In [None]:
def get_bert_model():
    time_id_inputs = tf.keras.Input((1, ), dtype=tf.uint16)
    time_id_x = investment_id_lookup_layer(time_id_inputs)
    time_id_x = layers.Embedding(time_id_size, 32, input_length=1)(time_id_x)
    time_id_x = layers.Reshape((-1, ))(time_id_x)
    time_id_x = layers.Dense(32, activation='swish')(time_id_x)
    time_id_x = layers.Dense(32, activation='swish')(time_id_x)

    investment_id_inputs = tf.keras.Input((1, ), dtype=tf.uint16)
    investment_id_x = investment_id_lookup_layer(investment_id_inputs)
    investment_id_x = layers.Embedding(investment_id_size, 32, input_length=1)(investment_id_x)
    investment_id_x = layers.Reshape((-1, ))(investment_id_x)
    investment_id_x = layers.Dense(32, activation='swish')(investment_id_x)
    investment_id_x = layers.Dense(32, activation='swish')(investment_id_x)
    
    features_inputs = tf.keras.Input((300, ), dtype=tf.float16)
    feature_x = layers.Dense(256, activation='swish')(features_inputs)
    feature_x = layers.Dense(128, activation='swish')(feature_x)
    feature_x = layers.Dense(64, activation='swish')(feature_x)
    
    x = layers.Concatenate(axis=1)([time_id_x, investment_id_x, feature_x])
    x = layers.BatchNormalization(name='batch_norm1')(x)
    x = layers.Dense(256, activation='swish', name='dense1')(x)
    x = layers.Dropout(0.1, name='dropout1')(x)
    x = layers.Reshape((1, -1), name='reshape1')(x)
    x = layers.BatchNormalization(name='batch_norm2')(x)
    x = layers.LSTM(128, return_sequences=True, activation='relu', name='lstm1')(x)
    x = layers.LSTM(16, return_sequences=False, activation='relu', name='lstm2')(x)
    
    output = layers.Dense(1)(x)
    rmse = keras.metrics.RootMeanSquaredError(name="rmse")
    model = tf.keras.Model(inputs=[time_id_inputs, investment_id_inputs, features_inputs], outputs=[output])
    model.compile(optimizer=tf.optimizers.Adam(0.001), loss='mse', metrics=['mse', rmse])
    return model


In [None]:
# train_size = int(len(time_id) * 0.8)
# X_train = (time_id.iloc[0:train_size],investment_id.iloc[0:train_size],train.iloc[0:train_size])
# X_test = (time_id[train_size:],investment_id[train_size:],train[train_size:])
# y_train = y[0:train_size]
# y_test = y[train_size:]

# model = get_bert_model()
# mc = keras.callbacks.ModelCheckpoint("best_bert_model.tf", save_format='tf', monitor="val_mse", save_best_only=True, mode="min", verbose=1)
# model.fit(X_train,y_train,validation_data=(X_test,y_test), epochs=2, callbacks=[mc])

In [None]:
# Build the train / test datasets with the default split and batch size; the
# remaining columns of `train` are the feature input and `y` is the target.
train_ds, test_ds = make_train_test_dataset(time_id, investment_id, train, y)
# Drop the notebook's reference to the large frame and run a garbage-collection pass.
del train
gc.collect()

In [None]:
# Build the model and render its architecture diagram.
# NOTE(review): plot_model is not the last expression of the cell, so its
# return value is presumably not displayed inline — confirm.
model = get_bert_model()
tf.keras.utils.plot_model(model, show_shapes=True)
# Checkpoint only when the validation MSE improves (lower is better), writing
# to "best_bert_model.tf".
mc = keras.callbacks.ModelCheckpoint("best_bert_model.tf", save_format='tf', monitor="val_mse", save_best_only=True, mode="min", verbose=1)
# Train for 15 epochs, validating on the test dataset after each epoch.
model.fit(train_ds, validation_data=test_ds, epochs=15, callbacks=[mc])
#model.fit(X_train,y_train,validation_data=(X_test,y_test), epochs=2, callbacks=[mc])

### Making prediction and Submitting result

The best checkpoint saved during training above is `best_bert_model.tf`, so we will load this model to make the predictions for our submission.

In [None]:
# Read the example test frame from the Parquet copy of the data.
df_test = pd.read_parquet('../input/ubiquant-parquet/example_test.parquet')

In [None]:
# Display the test frame as the cell output.
df_test

In [None]:
# Keep the row / time identifiers for the output frame before they are removed
# from `df_test` below.
sample_prediction_df = df_test[['row_id','time_id']]

# Remove the id columns from `df_test` in place; the columns that remain are
# used as the model's feature input. Re-running this cell raises KeyError.
df_test.pop('row_id')  # discarded: the popped values are not kept
test_time_id = df_test.pop('time_id')
test_investment_id = df_test.pop('investment_id')

In [None]:
# Reload the checkpoint written by the ModelCheckpoint callback during training.
model_loaded = keras.models.load_model('best_bert_model.tf')

# Inputs are passed in the same order as the model's inputs:
# (time_id, investment_id, features).
preds = model_loaded.predict((test_time_id, test_investment_id, df_test))

In [None]:
# Attach the predictions as the `target` column by position. Unlike
# `pd.concat(..., axis=1)`, `assign` with an array does not align on index
# labels (so a non-default index on the test frame cannot introduce NaN rows),
# and re-running this cell overwrites `target` instead of appending a second
# column with the same name.
sample_prediction_df = sample_prediction_df.assign(target=np.ravel(preds))

In [None]:
# Preview the first rows of the prediction frame.
sample_prediction_df.head()

In [None]:
# Write the prediction frame to CSV. With pandas' default `index=True`, the
# frame's index is written as an extra first column.
sample_prediction_df.to_csv('submission_sample_prediction_df.csv')