# 06_tml_test

In the 04/05 series of notebooks, we used 10-fold cross-validation to determine the best-performing models and their hyperparameters for this problem.

Now we evaluate the top three models on the test set and select the winner. We also run a purely random (RNG) model as a baseline for comparison.

In [None]:
''' data and math '''
import pandas as pd
import numpy as np

''' plotting images '''
from matplotlib import pyplot as plt
%matplotlib inline

''' traversing directories '''
import os
from pathlib import Path

''' utilities '''
from tqdm import tqdm

''' metrics '''
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

In [None]:
# Mount Google Drive inside the Colab runtime and remember the project root,
# which the rest of the notebook uses for directory traversal.
from google.colab import drive

mount_dir = '/content/gdrive'
drive.mount(mount_dir, force_remount=True)
root_dir = Path('/content/gdrive/My Drive/it3011_project')

Mounted at /content/gdrive


# Loading data

In [None]:
# Read the NA-free train and test CSV files from the project's data folder.
train, test = (
  pd.read_csv(root_dir / relative_path)
  for relative_path in ("data/train_no_na.csv", "data/test_no_na.csv")
)
print("data loaded")

data loaded


In [None]:
# Show (rows, columns) for the train set, then the test set, one per line.
print(train.shape, test.shape, sep="\n")

(279331, 138)
(120163, 138)


In [None]:
# Build the train/test feature matrices and label vectors.
# Feature columns are those whose name contains "feature"; the label is the
# "action" column, flattened to a 1-D array.
features = [column for column in test.columns if "feature" in column]
x_train, x_test = train[features].to_numpy(), test[features].to_numpy()
y_train = train["action"].to_numpy().flatten()
y_test = test["action"].to_numpy().flatten()
print("train/test set created")

train/test set created


# Helper functions

In [None]:
# constants
# SEED: fixed seed value, presumably meant for seeding random number generators
# so that runs are reproducible
SEED = 42

In [None]:
# create the utility score, which takes in the prediction value and the ground truth action and generates a score
# link: https://www.kaggle.com/c/jane-street-market-prediction/overview/evaluation

# data: the original train/test DataFrame    action: the per-row decisions to score. Pass y_pred to score a model, or the ground-truth actions to get the maximum attainable score
def utility_score(data, action):
  """Compute the Jane Street utility score for a set of trade decisions.

  For each distinct date i the daily profit is
    p_i = sum_j(weight_j * resp_j * action_j)
  and the score is
    t = sum(p_i) / sqrt(sum(p_i ** 2)) * sqrt(250 / number_of_dates)
    u = min(max(t, 0), 6) * sum(p_i)
  Reference: https://www.kaggle.com/c/jane-street-market-prediction/overview/evaluation

  Args:
    data: DataFrame with `date`, `weight` and `resp` columns, one row per trade.
    action: array-like with one decision per row of `data`, in the same row
      order. Any shape is accepted; it is flattened before use.

  Returns:
    The utility score as a float. Returns 0.0 when `data` has no rows or when
    every daily profit is zero (the ratio `t` is undefined in that case).

  Raises:
    ValueError: if `action` does not contain exactly one entry per row.
  """
  dates = data['date'].to_numpy()
  weights = data['weight'].to_numpy()
  resps = data['resp'].to_numpy()
  # np.asarray lets callers pass plain lists as well as numpy arrays.
  actions = np.asarray(action).ravel()

  if actions.shape[0] != dates.shape[0]:
    raise ValueError(
      f"action has {actions.shape[0]} entries but data has {dates.shape[0]} rows"
    )

  # Map every row to the index of its date so that daily sums can be computed
  # in a single pass instead of scanning all rows once per date.
  unique_dates, day_index = np.unique(dates, return_inverse=True)
  n_days = unique_dates.size
  if n_days == 0:
    return 0.0

  p_i = np.bincount(day_index, weights=weights * resps * actions, minlength=n_days)

  total_profit = p_i.sum()
  profit_norm = np.sqrt(np.square(p_i).sum())
  if profit_norm == 0:
    # No day made or lost anything: avoid 0/0 and report a zero score.
    return 0.0

  t = (total_profit / profit_norm) * np.sqrt(250 / n_days)
  u = min(max(t, 0), 6) * total_profit

  return float(u)

def max_train_utility_score(data=train, action=y_train):
  """Score the training data against its own ground-truth actions.

  The defaults are the `train` frame and `y_train` labels as they exist when
  this function is defined. The result serves as the maximum-score reference
  for the train set.
  """
  best_score = utility_score(data, action)
  return best_score

def max_test_utility_score(data=test, action=y_test):
  """Score the test data against its own ground-truth actions.

  The defaults are the `test` frame and `y_test` labels as they exist when
  this function is defined. The result serves as the maximum-score reference
  for the test set.
  """
  best_score = utility_score(data, action)
  return best_score

In [None]:
def model_scores(model, test, x_test, y_test):
  """Evaluate a fitted classifier on the test set.

  Args:
    model: fitted estimator exposing `predict(x)`.
    test: original test DataFrame, passed to `utility_score` together with the
      predictions.
    x_test: feature matrix given to `model.predict`.
    y_test: ground-truth 0/1 actions for `x_test`.

  Returns:
    Tuple `(utility, accuracy, logit_roc_auc, true_pos, true_neg, false_pos,
    false_neg)`.

  No plots are produced; only the numeric metrics are returned.
  """
  # Predict once and reuse the result for every metric.
  y_pred = model.predict(x_test)

  utility = utility_score(test, y_pred)
  accuracy = accuracy_score(y_test, y_pred)

  # labels=[0, 1] keeps the matrix 2x2 even if one class never occurs in
  # y_test / y_pred; without it a single-class result is 1x1 and indexing
  # the missing cells fails.
  cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
  true_neg, false_pos, false_neg, true_pos = cm.ravel()

  # AUC is computed from the hard 0/1 predictions (not probabilities), so it
  # stays comparable with scores recorded previously.
  logit_roc_auc = roc_auc_score(y_test, y_pred)

  return utility, accuracy, logit_roc_auc, true_pos, true_neg, false_pos, false_neg

In [None]:
import datetime
import csv

def save_scores(output_filename, workbook_name, model_name, model_params, utility, accuracy, logit_roc_auc, true_pos, true_neg, false_pos, false_neg):
  """Append one row of model metrics to a CSV file under `root_dir`.

  The header row is written first when the file does not exist yet or is
  empty. A `timestamp` column (dd/mm/YYYY HH:MM:SS, local time) is appended to
  every row. Metrics passed as None are written as empty cells.

  Args:
    output_filename: path of the CSV file, relative to `root_dir`.
    workbook_name: name of the notebook that produced the scores.
    model_name: human-readable model name.
    model_params: description of the hyperparameters used.
    utility, accuracy, logit_roc_auc: metric values, or None if not computed.
    true_pos, true_neg, false_pos, false_neg: confusion-matrix counts, or None
      if not computed.
  """
  headers = ["workbook_name", "model_name", "model_params", "utility", "accuracy", "logit_roc_auc", "true_pos", "true_neg", "false_pos", "false_neg", "timestamp"]
  output_path = root_dir / output_filename

  # Decide up front whether this call starts a fresh file; no probing open()
  # is needed, so no file handle is left unclosed.
  needs_header = not output_path.exists() or output_path.stat().st_size == 0

  timestamp = datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S")
  row = [workbook_name, model_name, model_params, utility, accuracy, logit_roc_auc, true_pos, true_neg, false_pos, false_neg, timestamp]

  # newline='' leaves line endings entirely to the csv writer.
  with open(output_path, 'a', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', lineterminator='\n')
    if needs_header:
      writer.writerow(headers)
      print("created output file")
    writer.writerow(row)

  print("saved model metrics")

# RNG model
We want a baseline to confirm that our models perform no worse than randomly generated predictions.

In [None]:
# Baseline: score uniformly random 0/1 actions on the test set several times
# and save the average utility and accuracy.
all_scores = []

output_filename = "TEST_SCORES.csv"
workbook_name = "06_tml_test_rng"
model_name = "RNG model"
model_params = "random"
n_iterations = 50

# Seeded generator so the baseline is reproducible from run to run.
rng = np.random.default_rng(SEED)

print("")
print("model_name: ", model_name)
print("model_params: ", model_params)

for i in range(n_iterations):
  print(f"random iteration = {i+1}")

  # one random 0/1 decision per test row
  y_pred = rng.integers(2, size=y_test.shape[0])

  utility = utility_score(test, y_pred)
  accuracy = accuracy_score(y_test, y_pred)
  all_scores.append((utility, accuracy))

# column-wise mean over all iterations: [mean utility, mean accuracy]
all_scores_ave = [sum(ele) / len(all_scores) for ele in zip(*all_scores)]
# The remaining five metrics (ROC AUC and the four confusion-matrix counts)
# are not computed for the random baseline; pad with None so the values line
# up with save_scores' parameters.
filler_scores = [None]*5
all_scores_ave += filler_scores
print("all_scores key: utility, accuracy, logit_roc_auc, true_pos, true_neg, false_pos, false_neg")
print("all_scores_ave: ", all_scores_ave)

# save average scores
save_scores(output_filename, workbook_name, model_name, model_params, *all_scores_ave)


model_name:  RNG model
model_params:  random
random iteration = 1
random iteration = 2
random iteration = 3
random iteration = 4
random iteration = 5
random iteration = 6
random iteration = 7
random iteration = 8
random iteration = 9
random iteration = 10
random iteration = 11
random iteration = 12
random iteration = 13
random iteration = 14
random iteration = 15
random iteration = 16
random iteration = 17
random iteration = 18
random iteration = 19
random iteration = 20
random iteration = 21
random iteration = 22
random iteration = 23
random iteration = 24
random iteration = 25
random iteration = 26
random iteration = 27
random iteration = 28
random iteration = 29
random iteration = 30
random iteration = 31
random iteration = 32
random iteration = 33
random iteration = 34
random iteration = 35
random iteration = 36
random iteration = 37
random iteration = 38
random iteration = 39
random iteration = 40
random iteration = 41
random iteration = 42
random iteration = 43
random iteration 

# Conclusion

* Utility score: 
* Accuracy: 