In [1]:
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
import xgboost
from xgboost import  XGBClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *

import pandas as pd
import numpy as np

tf.get_logger().setLevel('ERROR')
tf.test.is_gpu_available()


2024-03-01 10:15:52.194853: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-01 10:15:52.216225: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-01 10:15:52.216248: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-01 10:15:52.216776: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-01 10:15:52.220681: I tensorflow/core/platform/cpu_feature_guar

True

In [2]:
apple_data = pd.read_csv("apple_processed_data.csv")
tesla_data = pd.read_csv("tesla_processed_data.csv")

In [3]:
def train_valid_test(data, target, split):
    train, valid, test = split
    n = data.shape[0]
    train_end = int(train * n)
    valid_end = int(n * (train + valid))
    x_train, y_train = data[:train_end], target[:train_end]
    x_val, y_val = data[train_end:valid_end], target[train_end:valid_end]
    x_test, y_test = data[valid_end:], target[valid_end:]

    return x_train, y_train, x_val, y_val, x_test, y_test

In [4]:
data = apple_data

In [5]:
predictors = data.drop(['is_up', 'Date', 'news'], axis=1)
target = data['is_up']

x_train, y_train, x_val, y_val, x_test, y_test = train_valid_test(predictors, target, (0.6, 0.2, 0.2))

## Transform data

In [6]:
sc = StandardScaler()
sc.fit(x_train)
x_train_scaled = sc.transform(x_train)
x_val_scaled = sc.transform(x_val)
x_test_scaled = sc.transform(x_test)

## Baseline model

In [8]:
counting = data['is_up'].value_counts()
counting[1] / np.sum(counting)

0.5263157894736842

In [9]:
accuracy_score(y_val, [1] * len(y_val))

0.5393258426966292

## Default hyperparameters with some Algo

Logistic regression

In [88]:
lr = LogisticRegression(random_state = 12)
lr.fit(x_train_scaled, y_train)

print("Training accuracy:", lr.score(x_train_scaled, y_train))
print("Validation accuracy:", lr.score(x_val_scaled, y_val))

Training accuracy: 0.5302056555269923
Validation accuracy: 0.5397301349325337


Random Forest Classifier

In [89]:
rf = RandomForestClassifier(bootstrap = False, random_state = 12)
rf.fit(x_train_scaled, y_train)

print("Training accuracy:", rf.score(x_train_scaled, y_train))
print("Validation accuracy:", rf.score(x_val_scaled, y_val))

Training accuracy: 1.0
Validation accuracy: 0.45577211394302847


CatBoostClassifier

In [90]:
cat = CatBoostClassifier(silent=True, random_state = 12)
cat.fit(x_train_scaled, y_train)

print("Training accuracy:", cat.score(x_train_scaled, y_train))
print("Validation accuracy:", cat.score(x_val_scaled, y_val))

Training accuracy: 0.8046272493573264
Validation accuracy: 0.4662668665667166


XGBoost Classifier 

In [91]:
xg = XGBClassifier()
xg.fit(x_train_scaled, y_train)

print("Training accuracy:", xg.score(x_train_scaled, y_train))
print("Validation accuracy:", xg.score(x_val_scaled, y_val))

Training accuracy: 0.9794344473007712
Validation accuracy: 0.4572713643178411


Simple Deep Learning

In [92]:
model = tf.keras.Sequential()
model.add(Dense(1, activation = 'tanh'))
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=['accuracy'])

In [93]:
hist = model.fit(x_train_scaled, y_train, epochs=40, verbose=1)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [94]:
threshold = 0
y_val_pred = (model.predict(x_val_scaled) > threshold).astype(int)
print("Training accuracy:", accuracy_score(y_train, (model.predict(x_train_scaled) > threshold).astype(int)))
print("Validation accuracy:", accuracy_score(y_val, y_val_pred))

Training accuracy: 0.5282776349614395
Validation accuracy: 0.5217391304347826
