In [1]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px


In [2]:
df = pd.read_csv('../data/trump_tweets.csv')
df

Unnamed: 0,id,link,content,date,retweets,favorites,mentions,hashtags
0,1698308935,https://twitter.com/realDonaldTrump/status/169...,Be sure to tune in and watch Donald Trump on L...,2009-05-04 13:54:25,510,917,,
1,1701461182,https://twitter.com/realDonaldTrump/status/170...,Donald Trump will be appearing on The View tom...,2009-05-04 20:00:10,34,267,,
2,1737479987,https://twitter.com/realDonaldTrump/status/173...,Donald Trump reads Top Ten Financial Tips on L...,2009-05-08 08:38:08,13,19,,
3,1741160716,https://twitter.com/realDonaldTrump/status/174...,New Blog Post: Celebrity Apprentice Finale and...,2009-05-08 15:40:15,11,26,,
4,1773561338,https://twitter.com/realDonaldTrump/status/177...,"""My persona will never be that of a wallflower...",2009-05-12 09:07:28,1375,1945,,
...,...,...,...,...,...,...,...,...
43347,1273405198698975232,https://twitter.com/realDonaldTrump/status/127...,Joe Biden was a TOTAL FAILURE in Government. H...,2020-06-17 19:00:32,23402,116377,,
43348,1273408026968457216,https://twitter.com/realDonaldTrump/status/127...,Will be interviewed on @ seanhannity tonight a...,2020-06-17 19:11:47,11810,56659,@seanhannity,
43349,1273442195161387008,https://twitter.com/realDonaldTrump/status/127...,pic.twitter.com/3lm1spbU8X,2020-06-17 21:27:33,4959,19344,,
43350,1273442469066276864,https://twitter.com/realDonaldTrump/status/127...,pic.twitter.com/vpCE5MadUz,2020-06-17 21:28:38,4627,17022,,


In [3]:
# narrow to only when trump was president (we'll try both later on)

df['date'] = pd.to_datetime(df['date'])
df = df[df['date'] >= '2017-01-20']

In [4]:
pip install tf-keras

Note: you may need to restart the kernel to use updated packages.


In [19]:
from sentence_transformers import SentenceTransformer

df = df.sort_values(by='date')


sentences = df.content.to_list()
dates = df.date.to_list()

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = model.encode(sentences)


In [20]:
# create a new df with the embeddings and dates
tweets_df = pd.DataFrame(embeddings)
tweets_df['date'] = dates

In [34]:
# get intraday data for S&P 500

# https://www.kaggle.com/datasets/gratefuldata/intraday-stock-data-1-min-sp-500-200821?resource=download

spy = pd.read_parquet('../data/spy.parquet')
spy.date = pd.to_datetime(spy.date)

In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset

In [28]:
# Load and preprocess data

def create_training_data(df, embeddings, tweets_df):

    df['price_change'] = df['close'].pct_change().fillna(0)  # Calculate % change in price
    df['volume_change'] = df['volume'].pct_change().fillna(0)

    # Normalize price and volume data
    scaler = StandardScaler()
    
    df[['price_change', 'volume_change']] = scaler.fit_transform(df[['close', 'volume']])

    # covert timestamp to seconds for proper sequence ordering
    df['timestamp'] = pd.to_datetime(df['date']).astype(int) // 10**9
    tweets_df['timestamp'] = pd.to_datetime(tweets_df['date']).astype(int) // 10**9
    
    
    # merge the dataframes
    X = pd.merge_asof(df.sort_values("timestamp"),
                   tweets_df.sort_values("timestamp"),
                   on="timestamp",
                   direction="backward")  # backward fill tweets (important for time series)

    # reduce to only the columns we want
    X = X[range(0, 384)]

    # targe var 
    y = df['price_change'].shift(-1).fillna(0).values  # next minute's price change
    
    return X, y

X, y = create_training_data(spy, embeddings, tweets_df)

In [32]:


# Convert to tensors
X_tensor = torch.from_numpy(X.values).float()
y_tensor = torch.from_numpy(y).float().unsqueeze(1)


# dataloader 
dataset = TensorDataset(X_tensor, y_tensor)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)


# create basic class for trans model 
class TweetImpactTransformer(nn.Module):
    def __init__(self, input_dim, model_dim=64, num_heads=4, num_layers=2, dropout=0.1):
        super(TweetImpactTransformer, self).__init__()
        self.embedding = nn.Linear(input_dim, model_dim)  
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=model_dim, nhead=num_heads, dropout=dropout
        )
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(model_dim, 1)  # Final layer

    def forward(self, x):
        x = self.embedding(x)  # Project to trans dim
        x = x.unsqueeze(1)  #  sequence length dim
        x = self.transformer_encoder(x)  # trough Trans 
        return self.fc(x[:, -1, :])  # Predict

# Instantiate 
input_dim = X.shape[1]  # Market data + tweet embedding dim 
model = TweetImpactTransformer(input_dim)

# loss & optimiz
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# train 
num_epochs = 10
for epoch in range(num_epochs):
    for batch_X, batch_y in dataloader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

# Save model
#torch.save(model.state_dict(), "tweet_impact_transformer.pth")





Epoch 1, Loss: nan
Epoch 2, Loss: nan


KeyboardInterrupt: 