In [22]:
# Repair records that the raw export split across two physical lines.
# A stripped row that begins with a digit and is immediately followed by a
# stripped row beginning with "," is glued to that follower; every other row
# is passed through unchanged (after whitespace stripping).
with open("../data/full_dataset-release.csv", "r", encoding="utf-8") as src:
    lines = src.readlines()

stripped_rows = [raw.strip() for raw in lines]
final_index = len(stripped_rows) - 1

fixed_lines = []
consume_next = False
for idx, row in enumerate(stripped_rows):
    if consume_next:
        # This row was already appended to its predecessor.
        consume_next = False
        continue

    begins_with_digit = bool(row) and row[0].isdigit()
    if begins_with_digit and idx < final_index and stripped_rows[idx + 1].startswith(","):
        fixed_lines.append(row + stripped_rows[idx + 1])
        consume_next = True
    else:
        fixed_lines.append(row)

# Write one repaired record per line.
with open("../data/fixed_dataset.csv", "w", encoding="utf-8") as dst:
    dst.writelines(row + "\n" for row in fixed_lines)

In [2]:
import pandas as pd

# Load the repaired CSV.
# NOTE(review): the recorded output of this cell shows a warning raised from
# the read_csv call — presumably a mixed-type DtypeWarning. low_memory=False
# makes pandas infer each column's dtype from the whole file instead of
# chunk by chunk, which is the documented remedy for that warning.
df = pd.read_csv('../data/fixed_dataset.csv', low_memory=False)

# Replace the header with explicit names; this assignment raises if the file
# does not have exactly 14 columns.
df.columns = [
    'id', 'text', 'ticker', 'date', 'price',
    'return_1d', 'return_2d', 'return_3d', 'return_7d',
    'volume', 'volatility_10d', 'volatility_30d',
    'lstm_sentiment', 'textblob_sentiment'
]

# Parse dates day-first; unparseable values become NaT (errors='coerce') and
# those rows are dropped.
df['date'] = pd.to_datetime(df['date'], errors='coerce', dayfirst=True)
df = df.dropna(subset=['date'])

# Drop the row id and the two sentiment columns shipped with the dataset.
df = df.drop(columns=['id', 'lstm_sentiment', 'textblob_sentiment'])

  df = pd.read_csv('../data/fixed_dataset.csv')


In [4]:
# Preview the first five rows of the current frame.
df.head(n=5)

Unnamed: 0,text,ticker,date,price,return_1d,return_2d,return_3d,return_7d,volume,volatility_10d,volatility_30d
0,RT @robertoglezcano: @amazon #Patents Show Fl...,Amazon,2017-01-31,823.48,0.008379,0.014924,0.014924,-0.001263,3137196.0,13.447,16.992
1,@FAME95FM1 Jamaicans make money with @Payoneer...,PayPal,2017-01-31,39.78,0.002011,0.012318,0.012318,0.054801,9100057.0,18.769,16.099
2,@CBSi Jamaicans make money with @Payoneer @Pay...,PayPal,2017-01-31,39.78,0.002011,0.012318,0.012318,0.054801,9100057.0,18.769,16.099
3,@Hitz92fm Jamaicans make money with @Payoneer ...,PayPal,2017-01-31,39.78,0.002011,0.012318,0.012318,0.054801,9100057.0,18.769,16.099
4,RT @loadsofvans: Retweet this post &amp; follo...,Amazon,2017-01-31,823.48,0.008379,0.014924,0.014924,-0.001263,3137196.0,13.447,16.992


In [5]:
from pathlib import Path

# Shuffle all rows with a fixed seed and renumber the index 0..n-1 so the
# positional slices below are well defined.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

chunk_size = 50000
num_chunks = 7

# NOTE(review): only the first chunk_size * num_chunks (350,000) shuffled rows
# are written; any rows beyond that are silently left out, and a shorter frame
# yields empty trailing chunks — confirm this is intended.
chunks = [df.iloc[i * chunk_size : (i + 1) * chunk_size] for i in range(num_chunks)]

# DataFrame.to_csv does not create missing directories, so make sure the
# output directory exists before writing.
Path("./data").mkdir(parents=True, exist_ok=True)

# Chunk files are numbered from 1.
for i, chunk in enumerate(chunks):
    chunk.to_csv(f"./data/df_chunk_{i+1}.csv", index=False)

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.nn.functional import softmax
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [27]:
# The classifier weights and the tokenizer are both read from the same local
# checkpoint directory.
checkpoint_dir = "../models/finbert-finetuned1"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Move the model to the chosen device and switch it to evaluation mode.
model = model.to(device)
model = model.eval()

In [30]:
# Score chunk files 4-7 with the loaded classifier and write one
# "<chunk>_with_sentiment.csv" per input chunk.
# NOTE(review): only chunks 4-7 are processed here, yet the later merge cell
# globs every ./data/df_chunk_*_with_sentiment.csv. Files for chunks 1-3 would
# have to exist on disk already (presumably from an earlier run of this cell
# with a different range) — confirm, or widen the range for a clean re-run.
chunk_paths = [f"./data/df_chunk_{i}.csv" for i in range(4, 8)]

# Loop-invariant, so defined once instead of on every chunk.
batch_size = 64

# Start the counter at 3 so that i + 1 is the chunk number (4..7) without
# rebinding the loop variable inside the body.
for i, path in enumerate(chunk_paths, start=3):
    df = pd.read_csv(path)
    texts = df['text'].tolist()

    preds, probs = [], []

    for j in tqdm(range(0, len(texts), batch_size), desc=f"Classifying Chunk {i+1}"):
        batch_texts = texts[j:j + batch_size]
        # Pad to the longest text in the batch and truncate at 128 tokens.
        enc = tokenizer(batch_texts, return_tensors='pt', truncation=True, padding=True, max_length=128)
        enc = {k: v.to(device) for k, v in enc.items()}

        # Inference only: no gradients are needed.
        with torch.no_grad():
            logits = model(**enc).logits
            # Move the probabilities to the CPU once and reuse them for both
            # the predicted class and the per-class scores.
            batch_probs = softmax(logits, dim=1).cpu()

        preds.extend(batch_probs.argmax(dim=1).tolist())
        probs.extend(batch_probs.tolist())

    # Every row of the chunk was scored, so preds/probs line up with df
    # one-to-one and no trimming of df is needed.
    df['sentiment_label'] = preds
    # NOTE(review): assumes class indices 0/1/2 mean positive/neutral/negative
    # for this checkpoint — confirm against model.config.id2label.
    df['positive'] = [p[0] for p in probs]
    df['neutral'] = [p[1] for p in probs]
    df['negative'] = [p[2] for p in probs]

    df.to_csv(f"./data/df_chunk_{i+1}_with_sentiment.csv", index=False)

Classifying Chunk 4: 100%|██████████| 782/782 [49:32<00:00,  3.80s/it] 
Classifying Chunk 5: 100%|██████████| 782/782 [50:52<00:00,  3.90s/it]
Classifying Chunk 6: 100%|██████████| 782/782 [45:24<00:00,  3.48s/it]
Classifying Chunk 7: 100%|██████████| 782/782 [44:24<00:00,  3.41s/it]


In [18]:
import glob

# Read every scored chunk file (in sorted filename order) and stack them into
# a single frame with a fresh 0..n-1 index.
sentiment_files = sorted(glob.glob("./data/df_chunk_*_with_sentiment.csv"))

chunk_frames = []
for csv_path in sentiment_files:
    chunk_frames.append(pd.read_csv(csv_path))

all_chunks = pd.concat(chunk_frames, ignore_index=True)

In [19]:
def _as_list(values):
    """Collect one group's values into a plain Python list."""
    return list(values)


group_keys = ['ticker', 'date']

# `grouped`: one row per (ticker, date), every listed column kept as the full
# list of per-tweet values for that group.
list_columns = [
    'text', 'sentiment_label', 'positive', 'neutral', 'negative',
    'price', 'volume', 'volatility_10d', 'volatility_30d',
    'return_1d', 'return_2d', 'return_3d',
]
grouped = (
    all_chunks
    .groupby(group_keys)
    .agg({column: _as_list for column in list_columns})
    .reset_index()
)

# `daily_sentiment`: one row per (ticker, date) with the tweet texts as a
# list, the three class probabilities averaged, and the "first" aggregate of
# each market column.
mean_columns = ["positive", "neutral", "negative"]
first_columns = [
    'price', 'volume', 'volatility_10d', 'volatility_30d',
    "return_1d", "return_2d", "return_3d",
]

daily_spec = {'text': _as_list}
for column in mean_columns:
    daily_spec[column] = "mean"
for column in first_columns:
    daily_spec[column] = "first"

daily_sentiment = all_chunks.groupby(group_keys).agg(daily_spec).reset_index()

In [20]:
# Drop rows missing any class probability or the 1-day return, order by
# ticker then date, and finally convert the date column to datetime.
daily_sentiment = (
    daily_sentiment
    .dropna(subset=['positive', 'neutral', 'negative', 'return_1d'])
    .sort_values(by=['ticker', 'date'])
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

In [21]:
from sklearn.preprocessing import LabelEncoder

In [22]:
# Map each ticker string to an integer code; `le` keeps the fitted mapping.
le = LabelEncoder()
ticker_codes = le.fit_transform(daily_sentiment['ticker'])

# Attach the codes, then order by ticker and date with a fresh 0..n-1 index.
daily_sentiment = (
    daily_sentiment
    .assign(ticker_encoded=ticker_codes)
    .sort_values(['ticker', 'date'])
    .reset_index(drop=True)
)

# Preview the first rows.
daily_sentiment.head()

Unnamed: 0,ticker,date,text,positive,neutral,negative,price,volume,volatility_10d,volatility_30d,return_1d,return_2d,return_3d,ticker_encoded
0,21CF,2017-01-31,[RT @21CF: 21CF internal memo from Executive C...,0.000167,0.999619,0.000214,31.38,5170587.0,16.864,14.768,-0.000319,-0.002868,-0.002868,0
1,21CF,2017-04-29,[RT @21CF: Read what @Gotham star @ben_mckenzi...,0.000117,0.999737,0.000147,30.54,6681951.0,17.751,16.189,0.0,0.000655,0.006549,0
2,ASOS,2017-01-31,[RT @n76seary: RT @StudentBunker: #FreebieFrid...,0.000176,0.999649,0.000175,5266.0,342823.0,32.807,28.367,-0.012533,0.008355,0.008355,1
3,ASOS,2017-02-01,[ASOS SALON Pretty Floral Soft Midi with Embel...,0.000231,0.999595,0.000175,5267.0,301346.0,26.819,28.35,-0.00019,-0.012721,0.008164,1
4,ASOS,2017-02-28,"[GUADALUPE PASS AMOS,TX (GDP) ASOS reports gus...",0.222588,0.626411,0.151001,5432.0,608408.0,10.72,21.31,-0.002577,-0.018962,-0.018962,1


In [23]:
# Rescale the 1- and 2-day returns by 100 and derive ratio, spread, and
# weekday features.
# NOTE(review): return_3d is left unscaled while return_1d/return_2d are
# multiplied by 100 — confirm this asymmetry is intended.
# NOTE(review): this cell reads and rebinds daily_sentiment, so running it a
# second time scales the returns again.
daily_sentiment = daily_sentiment.assign(
    return_1d=lambda frame: frame['return_1d'] * 100,
    return_2d=lambda frame: frame['return_2d'] * 100,
    price_volatility_ratio=lambda frame: frame['price'] / frame['volatility_30d'],
    volume_volatility_ratio=lambda frame: frame['volume'] / frame['volatility_30d'],
    volatility_diff=lambda frame: frame['volatility_10d'] - frame['volatility_30d'],
    day_of_week=lambda frame: pd.to_datetime(frame['date']).dt.dayofweek,
)

In [24]:
# Persist both aggregated frames as CSV without the index column.
export_targets = (
    (grouped, "../data/full_sentiment_dataset.csv"),
    (daily_sentiment, "../data/full_sentiment_dataset_cleaned.csv"),
)
for frame, destination in export_targets:
    frame.to_csv(destination, index=False)