In [None]:
#@title Imports
!pip install transformers

import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input
import kagglehub
import pandas as pd
from transformers import pipeline
import random
from tqdm.keras import TqdmCallback



In [None]:
#@title Import data
# Load the headline dataset into `df`.
# NOTE(review): the path points under /content/drive, so it presumably needs
# Google Drive to be mounted before this cell runs — confirm.
csv_file_path = '/content/drive/MyDrive/theory_of_ml/sp500_headlines_2008_2024.csv'
df = pd.read_csv(filepath_or_buffer=csv_file_path)

# Print the first five rows as a quick sanity check of the load.
print(df.head(n=5))

                                               Title        Date       CP
0   JPMorgan Predicts 2008 Will Be "Nothing But Net"  2008-01-02  1447.16
1  Dow Tallies Biggest First-session-of-year Poin...  2008-01-02  1447.16
2                   2008 predictions for the S&P 500  2008-01-02  1447.16
3  U.S. Stocks Higher After Economic Data, Monsan...  2008-01-03  1447.16
4  U.S. Stocks Climb As Hopes Increase For More F...  2008-01-07  1416.18


In [None]:
#@title Apply sentiment analysis
# Score every headline with a pretrained sentiment classifier.
# The model and revision are pinned explicitly (they are the ones the default
# pipeline resolved to), so results do not change if the library default does.
sentiment_analyzer = pipeline(
    'sentiment-analysis',
    model='distilbert/distilbert-base-uncased-finetuned-sst-2-english',
    revision='714eb0f',
)

# Report progress roughly every 1% of the rows. max(1, ...) keeps the modulus
# non-zero, which would otherwise raise ZeroDivisionError for frames with fewer
# than 100 rows.
progress_step = max(1, len(df) // 100)

scores, sentiments = [], []
# Iterate over the column values by position so the loop does not rely on `df`
# having a default 0..N-1 index.
for i, headline in enumerate(df['Title']):
  if i % progress_step == 0:
    print(f'{i}/{len(df)} ({i*100/len(df):.2f})%')
  # The pipeline returns a list with one result dict per input text.
  analysis = sentiment_analyzer(headline)[0]
  score, sentiment = analysis['score'], analysis['label']
  scores.append(score)
  sentiments.append(sentiment)
  # Occasionally (about 1% of rows) echo a scored headline for a spot check.
  if random.random() > 0.99:
    print(f'\nSample')
    print(f'Headline: {headline}')
    print(f'Score: {score}')
    print(f'Sentiment: {sentiment}\n')

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
14387/19127 (75.22)%
14388/19127 (75.22)%
14389/19127 (75.23)%
14390/19127 (75.23)%
14391/19127 (75.24)%
14392/19127 (75.24)%
14393/19127 (75.25)%
14394/19127 (75.25)%
14395/19127 (75.26)%
14396/19127 (75.27)%
14397/19127 (75.27)%
14398/19127 (75.28)%
14399/19127 (75.28)%
14400/19127 (75.29)%
14401/19127 (75.29)%
14402/19127 (75.30)%
14403/19127 (75.30)%
14404/19127 (75.31)%
14405/19127 (75.31)%
14406/19127 (75.32)%
14407/19127 (75.32)%
14408/19127 (75.33)%
14409/19127 (75.33)%
14410/19127 (75.34)%
14411/19127 (75.34)%
14412/19127 (75.35)%
14413/19127 (75.35)%
14414/19127 (75.36)%
14415/19127 (75.36)%
14416/19127 (75.37)%
14417/19127 (75.38)%
14418/19127 (75.38)%
14419/19127 (75.39)%
14420/19127 (75.39)%
14421/19127 (75.40)%
14422/19127 (75.40)%
14423/19127 (75.41)%
14424/19127 (75.41)%
14425/19127 (75.42)%
14426/19127 (75.42)%
14427/19127 (75.43)%
14428/19127 (75.43)%
14429/19127 (75.44)%
14430/19127 (75.44)%
14431/19127

In [None]:
#@title Prepare data

# Parameters.
n = 10  # number of consecutive rows (days) in each input window
split_ratio = 0.8  # fraction of the windows used for training
days_ahead = 0  # extra offset of the target past the row right after the window

# Add sentiment columns to the df.
df['Sentiment'] = sentiments
df['Score'] = scores
# Encode the label as 1 for 'POSITIVE' and 0 for anything else.
df['Sentiment'] = (df['Sentiment'] == 'POSITIVE').astype(int)

# Convert date to days since Jan 1 2000.
df['Day'] = (pd.to_datetime(df['Date']) - pd.Timestamp('2000-01-01')).dt.days

# Average sentiment, score and CP per day. The agg spec already selects only
# the columns that are needed, so no separate column drop is required.
grouped_df = (
    df.groupby('Day')
    .agg({'Sentiment': 'mean', 'Score': 'mean', 'CP': 'mean'})
    .reset_index()
)


# Create X and y.
# Capture the raw CP statistics before the column is rescaled in place below.
stock_mean = grouped_df['CP'].mean()
stock_std = grouped_df['CP'].std()
# Rescale every column: z-score, clip to [-2, 2] standard deviations, then
# min-max scale the clipped values to [0, 1].
# NOTE(review): values beyond 2 standard deviations are flattened by the clip
# and cannot be recovered from the scaled column; stock_mean/stock_std alone
# also do not undo the final min-max step.
for column in ['Day', 'Score', 'Sentiment', 'CP']:
  column_values = grouped_df[column].astype(float)
  z_scores = (column_values - column_values.mean()) / column_values.std()
  clipped = np.clip(z_scores, -2, 2)
  grouped_df[column] = (clipped - clipped.min()) / (clipped.max() - clipped.min())
X = grouped_df[['Day', 'Score', 'Sentiment', 'CP']]
y = grouped_df['CP']

# Create sequences for training.
def create_sequences(X, y, n, days_ahead):
  """Build sliding windows of length n over X with one target per window.

  Window i covers rows i .. i + n - 1 of X, and its target is row
  i + n + days_ahead of y. Rows are always addressed by position, so pandas
  inputs behave the same regardless of their index labels.

  Args:
    X: array-like of rows (e.g. a DataFrame or 2-D ndarray of features).
    y: array-like of targets with at least as many rows as X.
    n: window length in rows.
    days_ahead: additional row offset of the target past the end of the window.

  Returns:
    A tuple (new_X, new_y) of ndarrays with shapes (count, n, ...) and
    (count, ...), where count = max(len(X) - n - days_ahead, 0).
  """
  # Convert once up front: avoids label-based Series lookups and repeated
  # DataFrame slicing inside the loop.
  X_values = np.asarray(X)
  y_values = np.asarray(y)
  count = max(len(X_values) - n - days_ahead, 0)

  # Preallocate so the outputs keep their full shape even when the input is
  # too short to produce a single window.
  new_X = np.empty((count, n) + X_values.shape[1:], dtype=X_values.dtype)
  new_y = np.empty((count,) + y_values.shape[1:], dtype=y_values.dtype)
  for i in range(count):
    new_X[i] = X_values[i:i + n]
    new_y[i] = y_values[i + n + days_ahead]
  return new_X, new_y
# Turn the per-day frame into (window, target) arrays.
X, y = create_sequences(X, y, n, days_ahead)

# Split training and testing.
# A seeded generator makes the shuffle, and therefore the train/test
# membership, repeatable from run to run.
# NOTE(review): neighbouring windows share n - 1 rows, so a shuffled split
# places windows in the test set that overlap training windows; a chronological
# split would avoid that — confirm which is intended.
split_seed = 42
split_indices = np.random.default_rng(split_seed).permutation(len(X))
split_index = int(split_ratio * len(X))
train_indices = split_indices[:split_index]
test_indices = split_indices[split_index:]
X_train, y_train = X[train_indices], y[train_indices]
X_test, y_test = X[test_indices], y[test_indices]
print(f'X_train shape: {X_train.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_test shape: {y_test.shape}')

X_train shape: (2797, 10, 4)
y_train shape: (2797,)
X_test shape: (700, 10, 4)
y_test shape: (700,)


In [None]:
#@title Build the LSTM model
# Single-layer LSTM regressor: each input is a window of n time steps with
# X.shape[-1] features per step, and the output is one scalar per window.
model = Sequential([
    Input(shape=(n, X.shape[-1])),
    LSTM(units=50, activation='relu'),  # 50 LSTM units with ReLU activation
    Dense(units=1),  # one output value: the predicted next target
])

# Compile with the Adam optimizer and a mean-squared-error loss.
model.compile(loss='mse', optimizer='adam')

In [None]:
#@title Train model
# Train for 10 epochs. Keras' own logging is silenced (verbose=0); progress is
# reported through the tqdm callback instead.
progress_callback = TqdmCallback(verbose=1)
model.fit(
    X_train,
    y_train,
    epochs=10,
    verbose=0,
    callbacks=[progress_callback],
)

0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

<keras.src.callbacks.history.History at 0x7f86c94a0190>

In [None]:
#@title Test model
# Predict on both splits. The predictions are subtracted from the targets
# reshaped to a column so the two line up element-wise.
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# The reported "loss" is the mean absolute error on the scaled targets
# (the model itself is compiled with an MSE loss).
train_abs_errors = np.abs(y_pred_train - y_train.reshape(-1, 1))
test_abs_errors = np.abs(y_pred_test - y_test.reshape(-1, 1))
train_loss = train_abs_errors.mean()
test_loss = test_abs_errors.mean()

print(f'Train loss: {train_loss}')
print(f'Test loss: {test_loss}')

[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
Train loss: 0.010116064179796052
Test loss: 0.01048076289316726


In [None]:
# Predict a few samples
# For a random handful of test windows, compare the predicted move with the
# actual move, both expressed in price units relative to the window's last CP.
n_samples = 50
verbose = False

# Invert the CP scaling. CP was z-scored with stock_mean/stock_std, clipped to
# [-2, 2] and then min-max scaled to [0, 1], so the min-max step has to be
# undone before stock_std/stock_mean are applied. The clipped z-score range is
# rebuilt here from the raw per-day CP means and must mirror that forward
# scaling.
# NOTE(review): prices that were clipped cannot be recovered; they map back to
# the clip boundary.
daily_cp = df.groupby('Day')['CP'].mean()
clipped_z = ((daily_cp - stock_mean) / stock_std).clip(-2, 2)
z_min, z_max = clipped_z.min(), clipped_z.max()


def _scaled_to_price(scaled):
  """Map a [0, 1]-scaled CP value (scalar or array) back to price units."""
  return (scaled * (z_max - z_min) + z_min) * stock_std + stock_mean


# Draw the sample windows (with replacement) and run them through the model in
# a single batch instead of one predict call per sample.
sample_indices = [random.randint(0, len(X_test) - 1) for _ in range(n_samples)]
sample_windows = X_test[sample_indices]
scaled_predictions = model.predict(sample_windows, verbose=0)[:, 0]

# The last feature of the last time step is the window's most recent CP.
previous_closings = _scaled_to_price(sample_windows[:, -1, -1])
actuals = _scaled_to_price(y_test[sample_indices])
predictions = _scaled_to_price(scaled_predictions.astype(float))

# Plain lists of floats, one entry per sampled window.
actual_changes = (actuals - previous_closings).tolist()
prediction_changes = (predictions - previous_closings).tolist()

if verbose:
  for i in range(n_samples):
    print(f'\nSample {i + 1}:')
    print(f'Previous closing: ${previous_closings[i]:.1f}')
    print(f'Predicted closing: ${predictions[i]:.1f}')
    print(f'Actual Change: ${actual_changes[i]:.1f}')
    print(f'Prediction Change: ${prediction_changes[i]:.1f}')

0/700 (0.00)%
7/700 (1.00)%
14/700 (2.00)%
21/700 (3.00)%
28/700 (4.00)%
35/700 (5.00)%
42/700 (6.00)%
49/700 (7.00)%


In [None]:
# Tabulate the sampled moves side by side.
changes = pd.DataFrame({'Actual Change': actual_changes, 'Prediction Change': prediction_changes})

# Directional accuracy: a sample counts as a hit when the actual and predicted
# changes share a sign, i.e. their product is strictly positive.
same_signs = [
    int(actual * predicted > 0)
    for actual, predicted in zip(actual_changes, prediction_changes)
]
hit_rate = sum(same_signs) / len(same_signs)
print(f'Accuracy: {hit_rate}')

Accuracy: 0.52


In [None]:
# Display the table of sampled actual vs. predicted changes.
changes

Unnamed: 0,Actual Change,Prediction Change
0,3.757891,5.543389
1,0.01902,-14.394542
2,14.094128,6.104596
3,-1.298823,-4.29431
4,3.067721,49.48108
5,-15.629348,-16.002415
6,-14.297919,-7.693262
7,1.092315,4.096015
8,2.526998,-10.359113
9,2.763395,-3.406767
