In [1]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px


In [4]:
google = '/content/my580-applied-language-models/data/trump_tweets.csv'
local = '..data/trump_tweets.csv'

df = pd.read_csv(local)
df

Unnamed: 0,id,link,content,date,retweets,favorites,mentions,hashtags
0,1698308935,https://twitter.com/realDonaldTrump/status/169...,Be sure to tune in and watch Donald Trump on L...,2009-05-04 13:54:25,510,917,,
1,1701461182,https://twitter.com/realDonaldTrump/status/170...,Donald Trump will be appearing on The View tom...,2009-05-04 20:00:10,34,267,,
2,1737479987,https://twitter.com/realDonaldTrump/status/173...,Donald Trump reads Top Ten Financial Tips on L...,2009-05-08 08:38:08,13,19,,
3,1741160716,https://twitter.com/realDonaldTrump/status/174...,New Blog Post: Celebrity Apprentice Finale and...,2009-05-08 15:40:15,11,26,,
4,1773561338,https://twitter.com/realDonaldTrump/status/177...,"""My persona will never be that of a wallflower...",2009-05-12 09:07:28,1375,1945,,
...,...,...,...,...,...,...,...,...
43347,1273405198698975232,https://twitter.com/realDonaldTrump/status/127...,Joe Biden was a TOTAL FAILURE in Government. H...,2020-06-17 19:00:32,23402,116377,,
43348,1273408026968457216,https://twitter.com/realDonaldTrump/status/127...,Will be interviewed on @ seanhannity tonight a...,2020-06-17 19:11:47,11810,56659,@seanhannity,
43349,1273442195161387008,https://twitter.com/realDonaldTrump/status/127...,pic.twitter.com/3lm1spbU8X,2020-06-17 21:27:33,4959,19344,,
43350,1273442469066276864,https://twitter.com/realDonaldTrump/status/127...,pic.twitter.com/vpCE5MadUz,2020-06-17 21:28:38,4627,17022,,


In [5]:
# narrow to only when trump was president (we'll try both later on)

df['date'] = pd.to_datetime(df['date'])
df = df[df['date'] >= '2017-01-20']

In [6]:
pip install tf-keras



In [7]:
from sentence_transformers import SentenceTransformer

df = df.sort_values(by='date')


sentences = df.content.to_list()
dates = df.date.to_list()

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = model.encode(sentences)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [8]:
# create a new df with the embeddings and dates
tweets_df = pd.DataFrame(embeddings)
tweets_df['date'] = dates

In [9]:
# get intraday data for S&P 500

# https://www.kaggle.com/datasets/gratefuldata/intraday-stock-data-1-min-sp-500-200821?resource=download

google = '/content/my580-applied-language-models/data/spy.parquet'
local = '../data/spy.parquet'

spy = pd.read_parquet(google)
spy.date = pd.to_datetime(spy.date)

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset

In [12]:
# Load and preprocess data

def create_training_data(df, embeddings, tweets_df):

    df['price_change'] = df['close'].pct_change().fillna(0)  # Calculate % change in price
    df['volume_change'] = df['volume'].pct_change().fillna(0)

    # Normalize price and volume data
    scaler = StandardScaler()

    df[['price_change', 'volume_change']] = scaler.fit_transform(df[['close', 'volume']])

    # covert timestamp to seconds for proper sequence ordering
    df['timestamp'] = pd.to_datetime(df['date']).astype(int) // 10**9
    tweets_df['timestamp'] = pd.to_datetime(tweets_df['date']).astype(int) // 10**9


    # merge the dataframes
    df = pd.merge_asof(df.sort_values("timestamp"),
                   tweets_df.sort_values("timestamp"),
                   on="timestamp",
                   direction="backward")  # backward fill tweets (important for time series)


    #  replace NaNs with zeros
    df = df.fillna(0)

    X = df[range(0, 384)]

    # targe var
    y = df['price_change'].shift(-1).fillna(0).values  # next minute's price change

    return X, y

X, y = create_training_data(spy, embeddings, tweets_df)

In [14]:
# fit an LSTM nn


# Convert to tensors
X_tensor = torch.from_numpy(X.values).float()
y_tensor = torch.from_numpy(y).float().unsqueeze(1)


# dataloader
dataset = TensorDataset(X_tensor, y_tensor)
dataloader = DataLoader(dataset, batch_size=256, shuffle=True)


# Define LSTM Model
class TweetImpactLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim=32, num_layers=2):
        super(TweetImpactLSTM, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        lstm_out, _ = self.lstm(x.unsqueeze(1))  # Adding batch dimension
        return self.fc(lstm_out[:, -1, :])  # Predict next price change

# Instantiate model
input_dim = X.shape[1]  # Market data + tweet embedding size
model = TweetImpactLSTM(input_dim)

# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train model
num_epochs = 10
for epoch in range(num_epochs):
    for batch_X, batch_y in dataloader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

Epoch 1, Loss: 0.14983174204826355
Epoch 2, Loss: 0.1988566815853119
Epoch 3, Loss: 0.2032514363527298
Epoch 4, Loss: 0.16456535458564758
Epoch 5, Loss: 0.1969897449016571
Epoch 6, Loss: 0.12334292382001877
Epoch 7, Loss: 0.21468760073184967
Epoch 8, Loss: 0.14040429890155792
Epoch 9, Loss: 0.1530974805355072
Epoch 10, Loss: 0.1593034565448761


In [16]:

# transformer model (takes forever)

# Convert to tensors
X_tensor = torch.from_numpy(X.values).float()
y_tensor = torch.from_numpy(y).float().unsqueeze(1)


# dataloader
dataset = TensorDataset(X_tensor, y_tensor)
dataloader = DataLoader(dataset, batch_size=256, shuffle=True)


# create basic class for trans model
class Tweet_Transformer(nn.Module):
    def __init__(self, input_dim, model_dim=64, num_heads=4, num_layers=2, dropout=0.1):
        super(Tweet_Transformer, self).__init__()
        self.embedding = nn.Linear(input_dim, model_dim)
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=model_dim, nhead=num_heads, dropout=dropout
        )
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(model_dim, 1)  # Final layer

    def forward(self, x):
        x = self.embedding(x)  # Project to trans dim
        x = x.unsqueeze(1)  #  sequence length dim
        x = self.transformer_encoder(x)  # trough Trans
        return self.fc(x[:, -1, :])  # Predict

# Instantiate
input_dim = X.shape[1]  # Market data + tweet embedding dim
model = Tweet_Transformer(input_dim)

# loss & optimiz
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# train
num_epochs = 5
for epoch in range(num_epochs):
    for batch_X, batch_y in dataloader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

# Save model
#torch.save(model.state_dict(), "tweet_impact_transformer.pth")





Epoch 1, Loss: 0.15517407655715942
Epoch 2, Loss: 0.17112144827842712
Epoch 3, Loss: 0.14139056205749512
Epoch 4, Loss: 0.14171840250492096
Epoch 5, Loss: 0.17687277495861053


In [27]:
# Save model
torch.save(model.state_dict(), "tweet_impact_transformer.pth")

In [35]:
model

Tweet_Transformer(
  (embedding): Linear(in_features=384, out_features=64, bias=True)
  (encoder_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
    )
    (linear1): Linear(in_features=64, out_features=2048, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=2048, out_features=64, bias=True)
    (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
        )
        (linear1): Linear(in_features=64, out_features=2048, b

In [48]:
import torch
import numpy as np
import pandas as pd

# Load trained model
model = Tweet_Transformer(input_dim)  # Recreate model architecture
model.load_state_dict(torch.load("tweet_impact_transformer.pth"))
model.eval()  # Set to evaluation mode

# Function to process a new tweet
def predict_stock_change(new_tweet, embed_function):
    """
    Predicts stock price change given a new tweet.

    Args:
        new_tweet (str): The new tweet from Trump.
        embed_function (function): Function to generate tweet embedding.

    Returns:
        float: Predicted price change.
    """

    # Step 1: Embed the new tweet
    tweet_embedding = embed_function(new_tweet)  # Ensure output is same format as training

    # Step 2: Convert to PyTorch tensor
    input_tensor = torch.tensor(tweet_embedding, dtype=torch.float32)#.unsqueeze(0)  # Add batch dim
    #input_tensor = torch.tensor(tweet_embedding, dtype=torch.float32).unsqueeze(1)  # Add batch dim
    #input_tensor = torch.tensor(tweet_embedding, dtype=torch.float32).unsqueeze(0).unsqueeze(1)  # (B, T, D)

    # Step 3: Predict using the trained model
    with torch.no_grad():
        predicted_change = model(input_tensor).item()

    return predicted_change







# Example: Predict stock movement after a tweet
new_tweet = "We are gonna go after the deep state and all the other loser and haters. Drain the swamp!!"
latest_price_change = 0.0021  # Last minute price change
latest_volume_change = 0.0  # Last minute volume change






# Assume embed_function(new_tweet) returns the same format used in training
def embed_function(text):
    embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    if isinstance(text, str):
        x = [new_tweet]
    else:
        x = text
    tweet_embedding = embedding_model.encode(x)

    return torch.from_numpy(tweet_embedding).float()






predicted_change = predict_stock_change(new_tweet, embed_function)
print(f"Predicted stock price change: {predicted_change:.6f}")


  model.load_state_dict(torch.load("tweet_impact_transformer.pth"))


Predicted stock price change: -0.652239


  input_tensor = torch.tensor(tweet_embedding, dtype=torch.float32)#.unsqueeze(0)  # Add batch dim


# Modify Transformer to Output Multiple Steps

In [None]:
import torch
import torch.nn as nn




# transformer model (takes forever)

# Convert to tensors
X_tensor = torch.from_numpy(X.values).float()
y_tensor = torch.from_numpy(y).float().unsqueeze(1)


# dataloader
dataset = TensorDataset(X_tensor, y_tensor)
dataloader = DataLoader(dataset, batch_size=256, shuffle=True)



class MultiStepTransformer(nn.Module):
    def __init__(self, input_dim, model_dim=64, num_heads=4, num_layers=2, forecast_steps=5, dropout=0.1):
        super(MultiStepTransformer, self).__init__()
        self.forecast_steps = forecast_steps
        self.embedding = nn.Linear(input_dim, model_dim)  # Project input to model dimension
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=model_dim, nhead=num_heads, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(model_dim, forecast_steps)  # Output multiple steps

    def forward(self, x):
        x = self.embedding(x)  # Map input to transformer dimension
        x = x.unsqueeze(1)  # Add sequence dimension (B, 1, D)
        x = self.transformer_encoder(x)  # Pass through Transformer
        return self.fc(x[:, -1, :])  # Predict `forecast_steps` minutes ahead

# Instantiate
input_dim = X.shape[1]  # Market data + tweet embedding dim
model = Tweet_Transformer(input_dim)

# loss & optimiz
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# train
num_epochs = 5
for epoch in range(num_epochs):
    for batch_X, batch_y in dataloader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")


In [None]:
forecast_steps = 5  # Number of minutes to predict ahead

# Shift data to create `forecast_steps` targets
y = np.column_stack([df['price_change'].shift(-i) for i in range(1, forecast_steps + 1)])

# Convert to PyTorch tensors
y_tensor = torch.tensor(y, dtype=torch.float32)  # Now shape is (samples, forecast_steps)


In [None]:
def predict_stock_changes(new_tweet, embed_function, forecast_steps=5):
    """
    Predicts stock price changes for the next `forecast_steps` minutes based on a new tweet.

    Args:
        new_tweet (str): The new tweet from Trump.
        embed_function (function): Function to generate tweet embedding.
        forecast_steps (int): How many minutes ahead to predict.

    Returns:
        list: Predicted price changes for the next `forecast_steps` minutes.
    """

    # Step 1: Embed the new tweet
    tweet_embedding = embed_function(new_tweet)

    # Step 2: Convert to PyTorch tensor
    input_tensor = torch.tensor(tweet_embedding, dtype=torch.float32).unsqueeze(0).unsqueeze(1)

    # Step 3: Predict using the trained model
    with torch.no_grad():
        predicted_changes = model(input_tensor).cpu().numpy().flatten()

    return predicted_changes

# Example Usage
new_tweet = "The stock market is booming! America is winning!"
predicted_changes = predict_stock_changes(new_tweet, embed_function, forecast_steps=5)

print(f"Predicted stock price changes for the next 5 minutes: {predicted_changes}")
