# 02_logistic_regression.ipynb

We will use the train/test data generated from *00_create_dataset.ipynb* and fit a basic **logistic regression** model.

In [None]:
''' data and math '''
import pandas as pd
import numpy as np

''' plotting images '''
from matplotlib import pyplot as plt
%matplotlib inline

''' traversing directories '''
import os
from pathlib import Path

''' utilities '''
from tqdm import tqdm

''' machine learning '''
from sklearn.linear_model import LogisticRegression

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): the following cell mounts Drive again at /content/gdrive and
# builds root_dir from that second mount, so this mount looks redundant — confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Mount Google Drive (forcing a remount) and derive the project root directory
# that the rest of the notebook uses for directory traversal.
from google.colab import drive

mount_dir = '/content/gdrive'
drive.mount(mount_dir, force_remount=True)
root_dir = Path(mount_dir) / 'My Drive' / 'it3011_project'

Mounted at /content/gdrive


# Helper functions

In [None]:
# create the utility score, which takes in the prediction value and the ground truth action and generates a score
# link: https://www.kaggle.com/c/jane-street-market-prediction/overview/evaluation

# data: original train/test data    action: the y-value. can either be y_pred or original values too, if we want the max score attainable
def utility_score(data, action):
  """Compute the Jane Street utility score for a set of trading actions.

  For every distinct date i the daily P&L is
      p_i = sum_j(weight_ij * resp_ij * action_ij)
  and the score is
      t = sum(p_i) / sqrt(sum(p_i ** 2)) * sqrt(250 / number_of_dates)
      u = min(max(t, 0), 6) * sum(p_i)

  Args:
    data: DataFrame with 'date', 'weight' and 'resp' columns (the original
      train/test data).
    action: array-like with one entry per row of `data` (any shape that
      flattens to that length, e.g. (n,) or (n, 1)). Can be predictions or
      the ground-truth actions (for the maximum attainable score).

  Returns:
    The utility score as a float. Returns 0.0 when `data` is empty or when
    every daily P&L is zero (the ratio t is undefined in those cases).

  Raises:
    ValueError: if `action` does not have exactly one entry per row of `data`.
  """
  dates = np.asarray(data['date'])
  weights = np.asarray(data['weight'])
  resps = np.asarray(data['resp'])
  actions = np.asarray(action).ravel()

  if actions.shape[0] != dates.shape[0]:
    raise ValueError(
        'action has {} entries but data has {} rows'.format(
            actions.shape[0], dates.shape[0]))

  # No rows means no dates: nothing was traded, so the score is zero.
  if dates.shape[0] == 0:
    return 0.0

  # Map each row to the index of its date, then sum the per-row P&L per date
  # in a single pass instead of rescanning the whole date column for every date.
  unique_dates, day_index = np.unique(dates, return_inverse=True)
  n_days = len(unique_dates)
  p_i = np.bincount(day_index.ravel(), weights=weights * resps * actions,
                    minlength=n_days)

  p_total = p_i.sum()
  p_norm = np.sqrt(np.dot(p_i, p_i))

  # All daily P&Ls are zero (e.g. no trade was taken): avoid 0/0 -> NaN.
  if p_norm == 0:
    return 0.0

  t = p_total / p_norm * np.sqrt(250 / n_days)
  return float(min(max(t, 0.0), 6.0) * p_total)

def max_train_utility_score():
  """Return the maximum achievable utility score on the training set.

  The constant was obtained from notebook 01_dataset_understanding.ipynb.
  """
  return 38666.152212179244

def max_test_utility_score():
  """Return the maximum achievable utility score on the test set.

  The constant was obtained from notebook 01_dataset_understanding.ipynb.
  """
  return 15405.02761054398

# Loading data

In [None]:
# Read the train/test CSV splits from the project's data directory.
data_dir = root_dir / "data"
train = pd.read_csv(data_dir / "train.csv")
test = pd.read_csv(data_dir / "test.csv")
print("data loaded")

data loaded


In [None]:
# Show (rows, columns) for the train and test frames, one per line.
print(train.shape, test.shape, sep="\n")

(280145, 139)
(120504, 139)


In [None]:
# Build the model inputs: every column whose name contains "feature" is a
# predictor, and the 'action' column is the target.
features = [column for column in test.columns if "feature" in column]

# Feature matrices, one row per sample.
x_train = train[features].to_numpy()
x_test = test[features].to_numpy()

# Targets are selected with a list so they stay 2-D column vectors of shape (n, 1).
y_train = train[['action']].to_numpy()
y_test = test[['action']].to_numpy()

print("train/test set created")

train/test set created


# Model

In [None]:
# Create an instance of the Logistic Regression classifier.
# C is scikit-learn's inverse regularization strength, so C=1e20 makes the
# L2 penalty negligible (the model is effectively unregularized).
model = LogisticRegression(C=1e20, solver = 'liblinear')

# Train the model. y_train is a column vector of shape (n, 1), so it is
# flattened to 1-D before fitting. fit() is deliberately the last expression
# of the cell so that the fitted estimator's repr is displayed.
model.fit(x_train, y_train.ravel())

LogisticRegression(C=1e+20, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# Report the fitted intercept and per-feature coefficients.
print(f'The learned weights are {model.intercept_} {model.coef_}')

# Predict on the held-out test set and measure the fraction of labels matched.
preds = model.predict(x_test)
accuracy = (preds == y_test.ravel()).mean()

print(f'The classification accuracy: {accuracy}')

The learned weights are [-0.02797704] [[-0.14796289  0.02722212 -0.01663411 -0.02466658  0.01061685  0.01919168
  -0.01937307  0.08590228 -0.06826533  0.01685758 -0.00316646  0.02980999
  -0.01305163 -0.01631075  0.01844183 -0.01286032  0.01353556 -0.06668352
   0.05621711 -0.04126954  0.04725339 -0.00579766  0.01107847  0.04365576
  -0.03741793  0.03742271 -0.03664424 -0.08887066  0.08886653 -0.04087686
   0.0077605   0.03157906 -0.03173844  0.00162607  0.01171705  0.03168376
  -0.03565757  0.00887427 -0.00784462  0.05069127 -0.04738121 -0.00357195
  -0.01394831 -0.00988387  0.01367834 -0.00985621 -0.01617558 -0.04938509
   0.07464404 -0.0478322  -0.0017317   0.00892451  0.0019583   0.01838361
  -0.00899073  0.00233544  0.00051904 -0.00317258  0.00128933  0.00044526
   0.0322852  -0.01107593 -0.0502786   0.05341674  0.00423454  0.0534036
  -0.04557376 -0.02407869  0.02394942 -0.01676263  0.00533988  0.021619
   0.02547805 -0.00217501 -0.03175647  0.01420044 -0.00639388  0.00567477
  -

Well... the accuracy is not very good, so let's not use logistic regression. Perhaps there are so many features that the resulting high-dimensional decision boundary overfits the noise in the data; in other words, the model's complexity results in high variance.