In [109]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

In [111]:
##############################################
# Load the dataset, scaling features, creating sequences
ticker_of_interest = 'TSLA'
# Set sequence length (e.g., 10 days of data for each prediction)
SEQ_LENGTH = 4
##############################################
df = pd.read_csv("stock_data.csv").dropna()

df = df.loc[df['ticker'] == ticker_of_interest]
df = df[[
 'Open',
 'High',
 'Low',
 'Close',
 'Adj Close',
 'Volume',
 'neutral-count-finbert',
 'positive-count-finbert',
 'negative-count-finbert',
 'average-confidence-finbert',
 'average-neutral-score-gemini',
 'average-positive-score-gemini',
 'average-negative-score-gemini',
 'prediction-label',
 ]]
df.reset_index(inplace=True)

# Scale the features
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(df[[
 'Open',
 'High',
 'Low',
 'Close',
 'Adj Close',
 'Volume',
 'neutral-count-finbert',
 'positive-count-finbert',
 'negative-count-finbert',
 'average-confidence-finbert',
 'average-neutral-score-gemini',
 'average-positive-score-gemini',
 'average-negative-score-gemini',
 ]])

# Create a new DataFrame with scaled features
df_scaled = pd.DataFrame(scaled_features, columns=[
 'Open',
 'High',
 'Low',
 'Close',
 'Adj Close',
 'Volume',
 'neutral-count-finbert',
 'positive-count-finbert',
 'negative-count-finbert',
 'average-confidence-finbert',
 'average-neutral-score-gemini',
 'average-positive-score-gemini',
 'average-negative-score-gemini',
 ])

# Add the target label back to the DataFrame
df_scaled['prediction-label'] = df['prediction-label']

# Create sequences for LSTM input
def create_sequences(data, seq_length):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data.iloc[i:i+seq_length, :-1].values)  # All features, excluding the label column
        y.append(data.iloc[i+seq_length, -1])  # The label at the next timestep
    return np.array(X), np.array(y)

# Create sequences
X, y = create_sequences(df_scaled, SEQ_LENGTH)

# Convert to PyTorch tensors
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32).view(-1, 1)  # Ensure y is of shape [batch_size, 1]
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.4, shuffle=False)

# Create DataLoader for batching
train_data = TensorDataset(X_train, y_train)
test_data = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_data, batch_size=32, shuffle=False)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

In [113]:
###################################
# This class defines the LSTM neural network 
# This does not need to be changed
###################################

class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(LSTMModel, self).__init__()
        
        # Define the LSTM layer
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        
        # Define a fully connected (linear) layer for output
        self.fc = nn.Linear(hidden_size, output_size)

        # Sigmoid activation function because this is a binary classification problem
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        # LSTM outputs
        lstm_out, (hn, cn) = self.lstm(x)  # hn is the hidden state from the last LSTM layer
        
        # We take the output from the last time step
        out = self.fc(hn[-1])  # hn[-1] is the last hidden state (representing the entire sequence)
        out = self.sigmoid(out)  # Apply sigmoid to get a probability
        
        return out


In [115]:
############################################################
# Hyperparameters
input_size = 13  # Number of features in each time step (open, close, sentiment_score)
hidden_size = 50  # Number of LSTM units in each layer
output_size = 1  # Output size (1 for binary classification)
num_layers = 1  # Number of LSTM layers
batch_size = 16
num_epochs = 100
learning_rate = 0.001
# Hyperparameters that can be tuned to see how the accuracy of the model changes.
#######################################################################
# Instantiate the model
model = LSTMModel(input_size, hidden_size, output_size, num_layers)

# Loss function and optimizer
criterion = nn.BCELoss()  # Using binary cross entropy loss
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Training loop
for epoch in range(num_epochs):
    model.train()
    
    # Loop over batches of data using train_loader
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        # Ensure that the inputs are of the correct shape [batch_size, seq_length, input_size]
        inputs = inputs.float()  # Ensure input is of type float
        targets = targets.float().view(-1, 1)  # Ensure target shape is [batch_size, 1]

        # Forward pass
        outputs = model(inputs)  # Get model predictions for the current batch
        loss = criterion(outputs, targets)  # Calculate the loss

        # Backward pass and optimization
        optimizer.zero_grad()  # Clear previous gradients
        loss.backward()  # Compute gradients
        optimizer.step()  # Update model parameters
        
    # Print loss every 10 epochs
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")
    
print("We want the losses to converge to zero here at the last epoch.")
# After training, evaluate the model
model.eval()  # Set model to evaluation mode
correct = 0
total = 0

with torch.no_grad():  # Disable gradient calculation during evaluation
    for inputs, targets in test_loader:
        inputs = inputs.float()
        targets = targets.float().view(-1, 1)
        
        outputs = model(inputs)  # Get predictions
        predicted_class = (outputs >= 0.5).float()  # Classify as 1 if probability >= 0.5, else 0
        
        total += targets.size(0)
        correct += (predicted_class == targets).sum().item()

accuracy = correct / total
print(f"Accuracy on test data: {accuracy * 100:.2f}%")

Epoch [10/100], Loss: 0.6856
Epoch [20/100], Loss: 0.6759
Epoch [30/100], Loss: 0.6608
Epoch [40/100], Loss: 0.6348
Epoch [50/100], Loss: 0.5893
Epoch [60/100], Loss: 0.5158
Epoch [70/100], Loss: 0.4149
Epoch [80/100], Loss: 0.2831
Epoch [90/100], Loss: 0.1218
Epoch [100/100], Loss: 0.0332
We want the losses to converge to zero here at the last epoch.
Accuracy on test data: 60.00%


In [95]:
#################################################
# With fewer data points, it is better to perform cross validation on the data to 
# get a more stable measurement of accuracy. This code block performs 
# k_fold_cross validation
#################################################
def k_fold_cross_validation(X, y, k=4):
    kf = KFold(n_splits=k, shuffle=True)  # Split data into k folds
    fold_accuracies = []  # Store accuracy for each fold

    for fold, (train_index, val_index) in enumerate(kf.split(X)):
        # Split data into training and validation sets
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]
        
        # Convert to PyTorch tensors
        train_data = torch.tensor(X_train, dtype=torch.float32)
        val_data = torch.tensor(X_val, dtype=torch.float32)
        train_labels = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
        val_labels = torch.tensor(y_val, dtype=torch.float32).view(-1, 1)
        
        # Initialize the model
        model = LSTMModel(input_size=input_size, hidden_size=hidden_size, output_size=output_size, num_layers=num_layers)
        
        # Optimizer
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
        
        # Training loop for the current fold
        for epoch in range(num_epochs):
            model.train()
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(train_data)
            loss = criterion(outputs, train_labels)
            
            # Backward pass and optimization
            loss.backward()
            optimizer.step()
        
        # Validation phase
        model.eval()
        with torch.no_grad():
            val_outputs = model(val_data)
            val_predictions = (val_outputs > 0.5).float()  # Convert to binary predictions (0 or 1)
            val_accuracy = accuracy_score(val_labels, val_predictions)
            fold_accuracies.append(val_accuracy)

        # Print progress for each fold
        print(f"Fold {fold + 1}, Validation Accuracy: {val_accuracy:.4f}")
    
    # Calculate and return the average accuracy across all folds
    average_accuracy = sum(fold_accuracies) / k
    print(f"Average Accuracy across {k}-folds: {average_accuracy:.4f}\n")
    return average_accuracy

In [117]:
#####################################
#This code runs the k-fold cross validation k number of times. 
#You can set a value for k here. The larger the k, the more you train the model.
k = 4
#####################################

print('Ticker: ' + ticker_of_interest)
total = 0
num_times = 5
for i in range(num_times):
     total += k_fold_cross_validation(X, y)

print("Average accuracy across all runs: " + str(total/num_times))

Ticker: TSLA
Fold 1, Validation Accuracy: 0.3333
Fold 2, Validation Accuracy: 0.0000
Fold 3, Validation Accuracy: 0.3333
Fold 4, Validation Accuracy: 0.5000
Average Accuracy across 4-folds: 0.2917

Fold 1, Validation Accuracy: 0.3333
Fold 2, Validation Accuracy: 0.6667
Fold 3, Validation Accuracy: 0.6667
Fold 4, Validation Accuracy: 0.5000
Average Accuracy across 4-folds: 0.5417

Fold 1, Validation Accuracy: 0.3333
Fold 2, Validation Accuracy: 0.3333
Fold 3, Validation Accuracy: 0.3333
Fold 4, Validation Accuracy: 0.5000
Average Accuracy across 4-folds: 0.3750

Fold 1, Validation Accuracy: 0.6667
Fold 2, Validation Accuracy: 0.3333
Fold 3, Validation Accuracy: 0.3333
Fold 4, Validation Accuracy: 1.0000
Average Accuracy across 4-folds: 0.5833

Fold 1, Validation Accuracy: 0.6667
Fold 2, Validation Accuracy: 0.0000
Fold 3, Validation Accuracy: 0.6667
Fold 4, Validation Accuracy: 0.5000
Average Accuracy across 4-folds: 0.4583

Average accuracy across all runs: 0.45
