In [None]:
import gc
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import StratifiedShuffleSplit

import tensorflow as tf
import keras
from keras import backend as K
from keras import layers
from keras.layers.core import Activation
from keras.utils.generic_utils import get_custom_objects

# from IPython.display import display
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [None]:
# gpu_devices = tf.config.experimental.list_physical_devices('GPU')
# for device in gpu_devices:
#     tf.config.experimental.set_memory_growth(device, True)

### Reading data

The data is read from a Parquet file. See "Fast Data Loading and Low Mem with Parquet Files": https://www.kaggle.com/robikscube/fast-data-loading-and-low-mem-with-parquet-files

In [None]:
# Load the training data from the low-memory Parquet export referenced above.
train = pd.read_parquet('../input/ubiquant-parquet/train_low_mem.parquet')

### Take a look at the data

In [None]:
# Print the column dtypes and memory usage of the loaded frame.
train.info()

You will find that row_id is of type string, time_id and investment_id are of type uint16, and others are of type float32.

In [None]:
def get_colums_by_dtype(df, dtype):
    """Return the names of the columns of ``df`` whose dtype equals ``dtype``.

    Args:
        df: DataFrame whose columns are inspected.
        dtype: Dtype (or dtype string such as ``'float32'``) to match against.

    Returns:
        A list of matching column labels, in the frame's column order.
    """
    matches = df.dtypes.eq(dtype)
    # Index the dtype Series by its own boolean mask and keep only the labels.
    return list(matches.index[matches.to_numpy()])

# List the non-float columns grouped by dtype; each label names the dtype it reports.
print('object columns:', get_colums_by_dtype(train, 'object'))
print('uint16 columns:', get_colums_by_dtype(train, 'uint16'))

Because the dataset is so large, you can convert the float32 columns to float16 to save memory.

In [None]:
# Downcast every float32 column to float16, one column at a time.
float32_columns = get_colums_by_dtype(train, 'float32')
for column_name in float32_columns:
    train[column_name] = train[column_name].astype(np.float16)

### Handling outliers

In [None]:
# Number of target observations recorded for each investment.
investment_count = train.groupby('investment_id')['target'].count()

# Histogram of how many rows each investment contributes.
investment_count.hist(figsize=(10, 5), bins=60, alpha=0.5)
plt.title('Frequency of investment count')
plt.xlabel('Investment Count')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

In [None]:
# Mean target value of each investment.
investment_mean = train.groupby('investment_id')['target'].mean()

# Histogram of the per-investment mean target.
investment_mean.hist(figsize=(10, 5), bins=60, alpha=0.5)
plt.title("Frequency of investment mean by target")
plt.xlabel('Investment mean by target')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

In [None]:
# Joint distribution of observation count vs. mean target with a fitted regression line.
# sns.jointplot returns a JointGrid (not an Axes), hence the name.
joint_grid = sns.jointplot(x=investment_count, y=investment_mean, kind='reg', height=8, color='blue')
joint_grid.ax_joint.set_xlabel('Investment count')
joint_grid.ax_joint.set_ylabel('Target mean')
plt.show()

In [None]:
# Investments whose mean target has a magnitude above 0.4 are treated as outliers.
outlier_investment = investment_mean.loc[investment_mean.abs() > 0.4].index.tolist()

# Keep only the rows that belong to the remaining investments.
is_outlier_row = train['investment_id'].isin(outlier_investment)
X = train[~is_outlier_row]

In [None]:
# Drop the references that are no longer needed, then force a collection pass.
del train, investment_mean, investment_count, outlier_investment
gc.collect()

In [None]:
# Print dtypes and memory usage of the frame that remains after outlier removal.
X.info()

### Splitting data

How should X be split into a training set and a test set? Generally, you can do it like this:

In [None]:
# y = X.pop("target")
# _ = X.pop('row_id')
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# # X_train, y_train, X_test, y_test = X[0:2122479], y[0:2122479], X[2122479:], y[2122479:]

However, if you want to achieve better predictions, you may need to consider the distributions of time_id and investment_id. Let's take a look at their histograms.

In [None]:
# Side-by-side histograms of the two id columns.
X[['investment_id', 'time_id']].hist(bins=50, figsize=(12,5))
# plt.show must be called; a bare `plt.show` only references the function.
plt.show()

You can see from the above that investment_id is distributed relatively uniformly, while the counts for time_id increase steadily. To prevent sampling bias, we will split the data on time_id using stratified sampling.

In [None]:
# One stratified 80/20 shuffle split keyed on time_id, seeded for reproducibility.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the first (train, test) index pair is the only one; take it
# without materialising the generator into a list.
train_index, test_index = next(split.split(X, X['time_id']))

In [None]:
# Materialise both partitions from the positional indices produced by the splitter.
X_train, X_test = X.iloc[train_index], X.iloc[test_index]

In [None]:
# The unsplit frame is no longer referenced below; drop the name and run a collection pass.
del X
gc.collect()

Verify the sampling result.

In [None]:
# Show the share of rows per time_id in each partition so the two can be compared.
for partition in (X_train, X_test):
    time_id_share = partition['time_id'].value_counts() / len(partition)
    display(time_id_share)

In [None]:
# Training partition: separate the label and remove the non-feature columns.
# NOTE: pop() mutates the frame, so this cell cannot be re-run on its own.
y_train = X_train.pop('target')
X_train_time_id = X_train.pop('time_id')
_ = X_train.pop('row_id')

# Test partition: same treatment.
y_test = X_test.pop('target')
X_test_time_id = X_test.pop('time_id')
_ = X_test.pop('row_id')

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" after each report.
X_train.info()
X_test.info()

Making dataset for multiple input models

In [None]:
def make_dataset(investment_id, features, y=None, batch_size=1024):
    """Wrap the two model inputs (and optionally the labels) in a ``tf.data`` pipeline.

    Args:
        investment_id: Values for the ``investment_id`` input.
        features: Values for the ``features`` input.
        y: Optional labels. When given, elements are ``((investment_id, features), y)``;
            otherwise they are ``(investment_id, features)``.
        batch_size: Number of rows per batch.

    Returns:
        A batched, cached and prefetched ``tf.data.Dataset``.
    """
    inputs = (investment_id, features)
    # NOTE(review): without labels the element is a plain 2-tuple; confirm that the
    # consumer does not interpret it as an (inputs, targets) pair.
    slices = inputs if y is None else (inputs, y)

    return (
        tf.data.Dataset.from_tensor_slices(slices)
        .batch(batch_size)
        .cache()
        .prefetch(tf.data.experimental.AUTOTUNE)
    )

### Building models
#### Define models

In [None]:
def get_model(hidden_layer_nodes=[256, 128, 32], activation='swish', batchNormalization=True, dropOut=None):
    """Build and compile a fully connected regression network.

    Every hidden block is ``Dense -> [BatchNormalization] -> Activation -> [Dropout]``
    and the network ends in a single linear unit.

    Args:
        hidden_layer_nodes: Sequence with the number of units of each hidden layer.
        activation: Activation identifier handed to ``keras.layers.Activation``.
        batchNormalization: Either one flag applied to every hidden layer, or a
            list/tuple with one flag per hidden layer. Falsy values disable it.
        dropOut: Either one rate applied to every hidden layer, or a list/tuple
            with one rate per hidden layer. Falsy values disable dropout.

    Returns:
        A ``keras.Sequential`` model compiled with mean-squared-error loss, the
        Adam optimizer and the ``mse`` / ``rmse`` metrics.

    Raises:
        ValueError: If a per-layer list/tuple does not contain exactly one entry
            per hidden layer.
    """
    hidden_layer_nodes = list(hidden_layer_nodes)
    n_layers = len(hidden_layer_nodes)

    def _per_layer(setting, disabled, name):
        # Expand a scalar setting to one value per hidden layer; validate explicit sequences.
        if not setting:
            # None, 0, False or an empty sequence all mean "off for every layer".
            setting = disabled
        if isinstance(setting, (list, tuple)):
            # zip() below would otherwise silently drop hidden layers on a mismatch.
            if len(setting) != n_layers:
                raise ValueError(
                    f"{name} has {len(setting)} entries but there are {n_layers} hidden layers"
                )
            return list(setting)
        return [setting] * n_layers

    bn_flags = _per_layer(batchNormalization, False, 'batchNormalization')
    dropout_rates = _per_layer(dropOut, 0.0, 'dropOut')

    model = keras.Sequential()
    for node_num, use_bn, dropout_rate in zip(hidden_layer_nodes, bn_flags, dropout_rates):
        model.add(keras.layers.Dense(node_num))
        if use_bn:
            model.add(keras.layers.BatchNormalization())
        model.add(keras.layers.Activation(activation))
        # A per-layer rate of None or 0 means no dropout for that layer.
        if dropout_rate and dropout_rate > 0:
            model.add(keras.layers.Dropout(dropout_rate))

    # Regression head: a single unit with a linear (identity) activation.
    model.add(keras.layers.Dense(1, activation='linear'))

    rmse = tf.keras.metrics.RootMeanSquaredError(name="rmse")
    model.compile(loss="mean_squared_error", optimizer="adam", metrics=["mse", rmse])
    return model

In [None]:
def get_multi_input_model(n_features=300):
    """Build and compile the two-branch regression model.

    One branch feeds the scalar ``investment_id`` input (uint16) through a
    64-unit ReLU layer; the other feeds the ``features`` input (float16) through
    512- and 256-unit swish layers. Both branches are concatenated, reduced by a
    32-unit ReLU layer and mapped to a single output unit.

    Args:
        n_features: Width of the ``features`` input. Defaults to 300, the value
            that used to be hard-coded.

    Returns:
        A compiled Keras functional model named ``model_with_multi_input``
        (Adam optimizer, MSE loss, ``mse`` and ``rmse`` metrics).
    """
    # Branch 1: the investment id on its own.
    investment_id_input = tf.keras.Input(shape=(1,), dtype=tf.uint16, name='investment_id')
    investment_id_x = layers.Dense(64, activation='relu')(investment_id_input)

    # Branch 2: the anonymised feature vector.
    features_input = tf.keras.Input(shape=(n_features,), dtype=tf.float16, name='features')
    features_x = layers.Dense(512, activation='swish')(features_input)
    features_x = layers.Dense(256, activation='swish')(features_x)

    # Merge both branches along the feature axis and regress to one value.
    concatenated = layers.concatenate([investment_id_x, features_x], axis=-1)
    mixed_x = layers.Dense(32, activation='relu')(concatenated)
    output = layers.Dense(1)(mixed_x)

    model = keras.models.Model([investment_id_input, features_input], output, name='model_with_multi_input')
    rmse = tf.keras.metrics.RootMeanSquaredError(name="rmse")
    model.compile(optimizer='adam', loss='mse', metrics=['mse', rmse])

    return model

### First: A basic model

In [None]:
# Baseline run: default architecture, 15 epochs, checkpointing the model
# whenever the validation MSE improves.
model = get_model(activation='swish')
mc = keras.callbacks.ModelCheckpoint(
    "best_model_basic.h5",
    monitor="val_mse",
    mode="min",
    save_best_only=True,
    verbose=1,
)
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=15,
    callbacks=[mc],
)

### Second: A basic model with early stop callback

In [None]:
# Same architecture, up to 50 epochs; training stops once the validation MSE
# has not improved for 4 consecutive epochs.
model = get_model(activation='swish')
es = keras.callbacks.EarlyStopping(monitor="val_mse", mode="min", patience=4)
mc = keras.callbacks.ModelCheckpoint(
    "best_model_basic_with_early_stop.h5",
    monitor="val_mse",
    mode="min",
    save_best_only=True,
    verbose=1,
)
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=50,
    callbacks=[es, mc],
)

### Third: Model with custom defined activation

In [None]:
def custom_activation(x):
    """Sine activation: returns ``sin(pi * x / 2)`` computed with the Keras backend."""

    # Earlier sigmoid variant, kept for reference:
    #return (1/(1 + K.exp(-x)))
    return K.sin(math.pi * x /2)

# Register the activation under the string name 'custom_activation' so it can be
# requested by name.
# NOTE(review): the registered object is an Activation layer wrapping the function,
# not the bare function -- confirm this round-trips through model save/load.
get_custom_objects().update({'custom_activation': Activation(custom_activation)})

In [None]:
# Default architecture with the registered sine activation, 15 epochs,
# checkpointing the model whenever the validation MSE improves.
model = get_model(activation='custom_activation')
mc = keras.callbacks.ModelCheckpoint(
    "best_model_with_custom_activation.h5",
    monitor="val_mse",
    mode="min",
    save_best_only=True,
    verbose=1,
)
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=15,
    callbacks=[mc],
)

### Fourth: Model with multiple inputs

Like BERT, the investment_id here is encoded separately.

In [None]:
# Move investment_id out of both feature frames so it can feed its own model input.
# NOTE: pop() mutates the frames, so this cell cannot be re-run on its own.
investment_id = X_train.pop('investment_id')
val_investment_id = X_test.pop('investment_id')

# Training pipeline built from the id column, the remaining features and the labels.
train_ds = make_dataset(investment_id, X_train, y=y_train)

In [None]:
model = get_multi_input_model()
mc = keras.callbacks.ModelCheckpoint("best_model_with_multi_input.h5", monitor="val_mse", save_best_only=True, mode="min", verbose=1)
# Kept as the final expression of the cell: a notebook only renders the value of
# the last expression, so the diagram was not displayed while this call sat mid-cell.
tf.keras.utils.plot_model(model, show_shapes=True)

In [None]:
# Train on the tf.data pipeline; validation data is passed as ((id, features), labels)
# to mirror the two named model inputs.
model.fit(train_ds, validation_data=((val_investment_id, X_test), y_test), epochs=15, callbacks=[mc])

### Making prediction and Submitting result

We found that the best model is best_model_with_custom_activation.h5, so we will load that model and use its predictions as our submission.

In [None]:
# Load the example test set shipped alongside the training data.
df_test = pd.read_parquet('../input/ubiquant-parquet/example_test.parquet')

In [None]:
# Keep the identifier columns; the predictions are attached to them further down.
sample_prediction_df = df_test[['row_id','time_id']]

In [None]:
# Remove the identifier columns so that only model inputs remain.
df_test = df_test.drop(columns=['row_id', 'time_id'])

In [None]:
# Reload the checkpoint written by the custom-activation run and predict on the test frame.
# NOTE(review): loading presumably relies on 'custom_activation' still being registered
# via get_custom_objects() in this kernel -- confirm before running in a fresh session.
model = keras.models.load_model('best_model_with_custom_activation.h5')
preds = model.predict(df_test)

In [None]:
# Wrap the predictions in a one-column 'target' frame and join it column-wise
# (pd.concat with axis=1 aligns rows on the index).
sample_prediction_df = pd.concat([sample_prediction_df,pd.DataFrame(preds, columns=['target'])], axis=1)

In [None]:
# Preview the first rows of the assembled prediction table.
sample_prediction_df.head()

In [None]:
# index=False: the frame carries a default integer index with no meaning, which
# would otherwise be written as an extra unnamed first column.
sample_prediction_df.to_csv('submission_sample_prediction_df.csv', index=False)