# Fine-Tuned RoBERTa-Twitter on TweetFinSent for Stock Sentiment Analysis
This notebook fine-tunes the RoBERTa-Twitter model on the TweetFinSent dataset to predict stock-specific sentiment (expected return) for tweets. It also extracts ticker symbols from Trump's tweets to enable future linking to stock prices via yfinance.

In [24]:
# Install dependencies and download dataset
!pip install -q transformers datasets pandas scikit-learn tweepy kagglehub
# !git clone https://github.com/jpmcair/tweetfinsent.git

In [25]:
from tweepy import Client
import time, json, pandas as pd

bearer_token = 'AAAAAAAAAAAAAAAAAAAAAM1r2gEAAAAA1Dupgy6nr1OsJWO0R4BYQq%2Fkwec%3D9f33N9AGt6xfEOy98ATMG3uHUooyejko89AUZJk5wxQ2eIirdb'
client = Client(bearer_token=bearer_token)

# Load TweetFinSent training JSON (Tweet_ID, Sentiment, Ticker)
with open('tweetfinsent/TweetFinSent_Train.json') as f:
    train_meta = json.load(f)

# Get list of IDs
tweet_ids = [t['Tweet_ID'] for t in train_meta if 'Tweet_ID' in t]

def fetch_tweets(ids):
    results = []
    for i in range(0, len(ids), 100):
        batch = ids[i:i+100]
        try:
            tweets = client.get_tweets(ids=batch, tweet_fields=["created_at", "text"])
            if tweets.data:
                results.extend(tweets.data)
        except Exception as e:
            print(f"Błąd przy paczce {i}-{i+100}: {e}")
        time.sleep(1.2)
    return results

tweets = fetch_tweets(tweet_ids)

# Create lookup dictionary
id_to_text = {str(t.id): t.text for t in tweets if t and hasattr(t, 'text')}

# # Build training dataframe
# train_records = []
# label_map = {'POSITIVE': 2, 'NEUTRAL': 1, 'NEGATIVE': 0}
# for row in train_meta:
#     tweet_id = row['Tweet_ID']
#     if tweet_id in id_to_text and row['Sentiment'] in label_map:
#         train_records.append({
#             'text': id_to_text[tweet_id],
#             'label': label_map[row['Sentiment']],
#             'target': row['Target_Ticker']
#         })

# train_df = pd.DataFrame(train_records)


Błąd przy paczce 100-200: 429 Too Many Requests
Too Many Requests
Błąd przy paczce 200-300: 429 Too Many Requests
Too Many Requests
Błąd przy paczce 300-400: 429 Too Many Requests
Too Many Requests
Błąd przy paczce 400-500: 429 Too Many Requests
Too Many Requests
Błąd przy paczce 500-600: 429 Too Many Requests
Too Many Requests
Błąd przy paczce 600-700: 429 Too Many Requests
Too Many Requests
Błąd przy paczce 700-800: 429 Too Many Requests
Too Many Requests
Błąd przy paczce 800-900: 429 Too Many Requests
Too Many Requests
Błąd przy paczce 900-1000: 429 Too Many Requests
Too Many Requests
Błąd przy paczce 1000-1100: 429 Too Many Requests
Too Many Requests
Błąd przy paczce 1100-1200: 429 Too Many Requests
Too Many Requests


In [26]:
len(tweets)


67

In [27]:
import json

with open("hydrated_tweets_67.json", "w") as f:
    json.dump([tweet.data for tweet in tweets if tweet], f)


In [38]:
id_to_text = {str(t.id): t.text for t in tweets if t and hasattr(t, 'text')}

train_records = []
label_map = {'POSITIVE': 2, 'NEUTRAL': 1, 'NEGATIVE': 0}

for row in train_meta:
    tweet_id = row['Tweet_ID']
    if tweet_id in id_to_text and row['Sentiment'] in label_map:
        train_records.append({
            'text': id_to_text[tweet_id],
            'label': label_map[row['Sentiment']],
            'target': row['Target_Ticker']
        })

train_df = pd.DataFrame(train_records)
print(train_df.shape)
# train_df.head()


(67, 3)


In [39]:
train_df.to_csv("stock_sentiment_train_sample.csv")
train_df.head()


Unnamed: 0,text,label,target
0,$TSLA Long: DayTradePlan\n\n07/02 695 call @ $...,2,TSLA
1,$TSLA 👀👀👀👀,1,TSLA
2,$CLOV apes strong!\n#CLOV #squeezeclov #TrustT...,2,CLOV
3,@omedyentral @GordonJohnson19 @CNBC Lol…He’s b...,1,TSLA
4,Could $GME close above 200 hundred today thus ...,2,GME


In [44]:
# Tokenize and prepare datasets
from transformers import AutoTokenizer
from datasets import Dataset
tokenizer = AutoTokenizer.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment')

train_ds = Dataset.from_pandas(train_df[['text', 'label']])
# val_ds = Dataset.from_pandas(val_df[['text', 'label']])

def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True, max_length=128)


train_ds = train_ds.map(tokenize, batched=True)
# val_ds = val_ds.map(tokenize, batched=True)

Map:   0%|          | 0/67 [00:00<?, ? examples/s]

In [45]:
# Fine-tune model
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
model = AutoModelForSequenceClassification.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment', num_labels=3)

training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    # eval_dataset=val_ds
)
trainer.train()

Step,Training Loss


TrainOutput(global_step=15, training_loss=0.7675496419270833, metrics={'train_runtime': 85.2966, 'train_samples_per_second': 2.356, 'train_steps_per_second': 0.176, 'total_flos': 13221449240832.0, 'train_loss': 0.7675496419270833, 'epoch': 3.0})

In [47]:
# Save model and tokenizer
model.save_pretrained('./stock_sentiment_model')
tokenizer.save_pretrained('./stock_sentiment_model')

KeyboardInterrupt: 

In [48]:
# Load Trump's tweets and extract tickers
import kagglehub
import os, glob, re
path_trump = kagglehub.dataset_download('codebreaker619/donald-trump-tweets-dataset')
csv_files = glob.glob(os.path.join(path_trump, '*.csv'))
df = pd.read_csv(csv_files[0])
df = df.dropna(subset=['text'])
df['text'] = df['text'].astype(str)
df['datetime'] = pd.to_datetime(df['date'])

def extract_tickers(text):
    return re.findall(r'\$\w+', text)

df['tickers'] = df['text'].apply(extract_tickers)
df = df.explode('tickers')
df = df[['text', 'datetime', 'tickers']].dropna()
df = df.sort_values(by='datetime')
df.head()

Downloading from https://www.kaggle.com/api/v1/datasets/download/codebreaker619/donald-trump-tweets-dataset?dataset_version_number=1...


100%|██████████| 4.25M/4.25M [00:00<00:00, 203MB/s]

Extracting files...





Unnamed: 0,text,datetime,tickers
13467,Trump Tycoon App for iPhone & iPod Touch - It'...,2010-01-15 16:28:02,$2
13401,Great job on the Larry King Live Gulf Telethon...,2010-06-22 16:09:28,$1
13400,"The Eric Trump Foundation has raised over $1,0...",2010-06-24 18:44:46,$1
13390,I'm proud to accept the 2010 HollyRod Foundati...,2010-07-20 15:10:28,$700
14111,ObamaCare Tragedy Primed to Further Explode th...,2011-07-08 16:22:30,$500


In [51]:
# Apply fine-tuned model to get stock sentiment
from transformers import pipeline
clf = pipeline('text-classification', model='./stock_sentiment_model', tokenizer='./stock_sentiment_model')
df['sentiment'] = df['text'].apply(lambda x: clf(x[:512])[0]['label'])
df.head()

Device set to use cuda:0


Unnamed: 0,text,datetime,tickers,predicted_stock_sentiment,sentiment
13467,Trump Tycoon App for iPhone & iPod Touch - It'...,2010-01-15 16:28:02,$2,LABEL_2,LABEL_2
13401,Great job on the Larry King Live Gulf Telethon...,2010-06-22 16:09:28,$1,LABEL_2,LABEL_2
13400,"The Eric Trump Foundation has raised over $1,0...",2010-06-24 18:44:46,$1,LABEL_2,LABEL_2
13390,I'm proud to accept the 2010 HollyRod Foundati...,2010-07-20 15:10:28,$700,LABEL_2,LABEL_2
14111,ObamaCare Tragedy Primed to Further Explode th...,2011-07-08 16:22:30,$500,LABEL_1,LABEL_1


In [52]:
# prompt: dowload this file up to csv

train_df.to_csv("stock_sentiment_train_sample.csv")