# 03_tml_pca

We will stack PCA onto the 02 series notebooks to try and reduce the dimensions of our data. We think that not all features are important

In [13]:
''' data and math '''
import pandas as pd
import numpy as np

''' plotting images '''
from matplotlib import pyplot as plt
%matplotlib inline

''' traversing directories '''
import os
from pathlib import Path

''' utilities '''
from tqdm import tqdm

''' metrics '''
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

In [14]:
''' used to reference the root directory, for directory traversal ''' 
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
mount_dir = '/content/gdrive'
root_dir = Path('/content/gdrive/My Drive/it3011_project')

Mounted at /content/gdrive


# Loading data

In [15]:
# load data
train = pd.read_csv(root_dir/"data/train.csv")
test = pd.read_csv(root_dir/"data/test.csv")
print("data loaded")

data loaded


In [16]:
# check shape
print(train.shape)
print(test.shape)

(280145, 138)
(120504, 138)


In [17]:
# create train/test sets
features = [feature for feature in test.keys() if "feature" in feature]
x_train = train.loc[:, features].values
y_train = train.loc[:,['action']].values.flatten()
x_test = test.loc[:, features].values
y_test = test.loc[:,['action']].values.flatten()
print("train/test set created")

train/test set created


In [18]:
# scaling data to make it easier for models to train
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)

# test set scaled on the same scaler as train, because models are fitted on the train distributions and not test distributions
x_test = scaler.transform(x_test)

# Helper functions

In [19]:
# constants
SEED = 42

In [20]:
# create the utility score, which takes in the prediction value and the ground truth action and generates a score
# link: https://www.kaggle.com/c/jane-street-market-prediction/overview/evaluation

# data: original train/test data    action: the y-value. can either be y_pred or original values too, if we want the max score attainable
def utility_score(data, action): 
  dates_set = set(data.date.values)
  dates = data.loc[:, ['date']].values.flatten()
  weights = data.loc[:, ['weight']].values.flatten()
  resps = data.loc[:, ['resp']].values.flatten()
  actions = action.flatten()

  i = len(dates_set)
  p_i = []

  for date in dates_set:
    indices = np.where(dates == date)[0]
    p_i_temp = 0
    for j in indices:
      p_i_temp = p_i_temp + weights[j] * resps[j] * actions[j]
    p_i.append(p_i_temp)
  
  p_i_squared = [p_i1*p_i2 for p_i1,p_i2 in zip(p_i,p_i)]
  t = ( sum(p_i) / np.sqrt(sum(p_i_squared)) ) * np.sqrt(250/i)
  u = min(max(t, 0), 6) * sum(p_i)

  return u

def max_train_utility_score(data=train, action=y_train):
  return utility_score(data, action)

def max_test_utility_score(data=test, action=y_test):
  return utility_score(data, action)

In [21]:
def model_scores(model, test=test, x_test=x_test, y_test=y_test):
  y_pred = model.predict(x_test) 

  print("Utility score: ", utility_score(test, y_pred))
  print("Accuracy: ", accuracy_score(y_test, y_pred))
  
  #print("Confusion matrix")
  #cm = confusion_matrix(y_test, y_pred)
  #fig, ax = plt.subplots(figsize=(3, 3))
  #ax.imshow(cm)
  #ax.grid(False)
  #ax.xaxis.set(ticks=(0, 1), ticklabels=('Predicted 0s', 'Predicted 1s'))
  #ax.yaxis.set(ticks=(0, 1), ticklabels=('Actual 0s', 'Actual 1s'))
  #ax.set_ylim(1.5, -0.5)
  #for i in range(2):
  #    for j in range(2):
  #        ax.text(j, i, cm[i, j], ha='center', va='center', color='red')
  #plt.show()  

  #print("AUC_ROC")
  #logit_roc_auc = roc_auc_score(y_test, model.predict(x_test))
  #fpr, tpr, thresholds = roc_curve(y_test, model.predict_proba(x_test)[:,1])
  #plt.figure()
  #plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
  #plt.plot([0, 1], [0, 1],'r--')
  #plt.xlim([0.0, 1.0])
  #plt.ylim([0.0, 1.05])
  #plt.xlabel('False Positive Rate')
  #plt.ylabel('True Positive Rate')
  #plt.title('Receiver operating characteristic')
  #plt.legend(loc="lower right")
  #plt.show()

# Neural network

In [22]:
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, Concatenate, Lambda, GaussianNoise, Activation
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers.experimental.preprocessing import Normalization
import tensorflow as tf

!pip install tensorflow-addons
import tensorflow_addons as tfa
import numpy as np
import pandas as pd
from tqdm import tqdm
from random import choices



In [23]:
tf.random.set_seed(SEED)
np.random.seed(SEED)

def create_mlp(num_columns, num_labels, hidden_units, dropout_rates, label_smoothing, learning_rate):

    inp = tf.keras.layers.Input(shape=(num_columns,))
    x = tf.keras.layers.BatchNormalization()(inp)
    x = tf.keras.layers.Dropout(dropout_rates[0])(x)
    for i in range(len(hidden_units)):
        x = tf.keras.layers.Dense(hidden_units[i])(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.Activation(tf.keras.activations.swish)(x)
        x = tf.keras.layers.Dropout(dropout_rates[i + 1])(x)

    x = tf.keras.layers.Dense(num_labels)(x)
    out = tf.keras.layers.Activation("sigmoid")(x)

    model = tf.keras.models.Model(inputs=inp, outputs=out)
    model.compile(
        optimizer=tfa.optimizers.RectifiedAdam(learning_rate=learning_rate),
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing),
        metrics=tf.keras.metrics.AUC(name="AUC"),
    )

    return model

In [24]:
# iterate over settings for the model
batch_size = 4096
hidden_units = [160, 160, 160]
dropout_rates = [0.2, 0.2, 0.2, 0.2]
label_smoothing = 1e-2
learning_rate = 1e-3

model = create_mlp(len(features), 1, hidden_units, dropout_rates, label_smoothing, learning_rate)

model.fit(x_train, y_train, epochs=100, batch_size=4096,verbose=2)

print(f"\n\nKerasNN: batch_size = 4096, epoch = 100, learning_rate = 1e-3")    

model_scores(model)

Epoch 1/100
69/69 - 8s - loss: 0.7376 - AUC: 0.5077
Epoch 2/100
69/69 - 1s - loss: 0.7149 - AUC: 0.5155
Epoch 3/100
69/69 - 1s - loss: 0.7064 - AUC: 0.5206
Epoch 4/100
69/69 - 1s - loss: 0.7021 - AUC: 0.5236
Epoch 5/100
69/69 - 1s - loss: 0.6974 - AUC: 0.5301
Epoch 6/100
69/69 - 1s - loss: 0.6950 - AUC: 0.5338
Epoch 7/100
69/69 - 1s - loss: 0.6934 - AUC: 0.5370
Epoch 8/100
69/69 - 1s - loss: 0.6919 - AUC: 0.5399
Epoch 9/100
69/69 - 1s - loss: 0.6908 - AUC: 0.5434
Epoch 10/100
69/69 - 1s - loss: 0.6899 - AUC: 0.5462
Epoch 11/100
69/69 - 1s - loss: 0.6892 - AUC: 0.5493
Epoch 12/100
69/69 - 1s - loss: 0.6885 - AUC: 0.5518
Epoch 13/100
69/69 - 1s - loss: 0.6880 - AUC: 0.5547
Epoch 14/100
69/69 - 1s - loss: 0.6878 - AUC: 0.5547
Epoch 15/100
69/69 - 1s - loss: 0.6871 - AUC: 0.5583
Epoch 16/100
69/69 - 1s - loss: 0.6866 - AUC: 0.5607
Epoch 17/100
69/69 - 1s - loss: 0.6862 - AUC: 0.5621
Epoch 18/100
69/69 - 1s - loss: 0.6853 - AUC: 0.5652
Epoch 19/100
69/69 - 1s - loss: 0.6850 - AUC: 0.5659
Ep

ValueError: ignored

# Conclusion

* var_smoothing <= 1e-03 gives stable results
* utility score generally around 32.65379488679639
* accuracy generally around 0.4958424616610237