In [23]:
# Repair records that were split across two physical lines in the raw CSV:
# a line that begins with a digit, immediately followed by a line that begins
# with a comma, is treated as one record and the two pieces are concatenated.
fixed_lines = []

with open("../data/full_dataset-release.csv", "r", encoding="utf-8") as source:
    # Surrounding whitespace (including the trailing newline) is removed
    # from every line before any comparison is made.
    stripped = [raw.strip() for raw in source]

total = len(stripped)
position = 0

while position < total:
    current = stripped[position]
    has_next = position + 1 < total
    begins_with_digit = bool(current) and current[0].isdigit()

    if begins_with_digit and has_next and stripped[position + 1].startswith(","):
        # Glue the continuation onto the current record and skip both lines.
        fixed_lines.append(current + stripped[position + 1])
        position += 2
    else:
        fixed_lines.append(current)
        position += 1

# Every (possibly merged) record is written back on its own line.
with open("../data/fixed_dataset.csv", "w", encoding="utf-8") as target:
    target.writelines(f"{record}\n" for record in fixed_lines)

In [58]:
import pandas as pd

# Names applied positionally to the columns of the repaired CSV.
column_names = [
    'id', 'text', 'ticker', 'date', 'price',
    'return_1d', 'return_2d', 'return_3d', 'return_7d',
    'volume', 'volatility_10d', 'volatility_30d',
    'lstm_sentiment', 'textblob_sentiment',
]
# Columns that are discarded once the dates have been cleaned.
unused_columns = ['id', 'lstm_sentiment', 'textblob_sentiment']

df = pd.read_csv('../data/fixed_dataset.csv')
df.columns = column_names

# Values that cannot be parsed as dates become NaT (errors='coerce');
# the rows holding them are removed in the next step.
df = df.assign(date=pd.to_datetime(df['date'], errors='coerce', dayfirst=True))

df = (
    df
    .dropna(subset=['date'])
    .drop(columns=unused_columns)
)

  df = pd.read_csv('../data/fixed_dataset.csv')


In [59]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)

Unnamed: 0,text,ticker,date,price,return_1d,return_2d,return_3d,return_7d,volume,volatility_10d,volatility_30d
0,RT @robertoglezcano: @amazon #Patents Show Fl...,Amazon,2017-01-31,823.48,0.008379,0.014924,0.014924,-0.001263,3137196.0,13.447,16.992
1,@FAME95FM1 Jamaicans make money with @Payoneer...,PayPal,2017-01-31,39.78,0.002011,0.012318,0.012318,0.054801,9100057.0,18.769,16.099
2,@CBSi Jamaicans make money with @Payoneer @Pay...,PayPal,2017-01-31,39.78,0.002011,0.012318,0.012318,0.054801,9100057.0,18.769,16.099
3,@Hitz92fm Jamaicans make money with @Payoneer ...,PayPal,2017-01-31,39.78,0.002011,0.012318,0.012318,0.054801,9100057.0,18.769,16.099
4,RT @loadsofvans: Retweet this post &amp; follo...,Amazon,2017-01-31,823.48,0.008379,0.014924,0.014924,-0.001263,3137196.0,13.447,16.992


In [60]:
# Shuffle all rows with a fixed seed, then write fixed-size slices to disk.
chunk_size = 50000
num_chunks = 7

df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# NOTE(review): only the first num_chunks * chunk_size rows are written; any
# rows beyond that are not saved, and slices past the end of the frame come
# out empty. Confirm that 7 x 50000 covers the dataset as intended.
chunks = []
for chunk_index in range(num_chunks):
    start = chunk_index * chunk_size
    chunks.append(df.iloc[start:start + chunk_size])

# Output files are numbered from 1.
for chunk_number, chunk in enumerate(chunks, start=1):
    chunk.to_csv(f"./data/df_chunk_{chunk_number}.csv", index=False)

In [61]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.nn.functional import softmax
from tqdm import tqdm

In [62]:
# Load the sequence-classification checkpoint and its tokenizer from the same
# local directory.
tokenizer = AutoTokenizer.from_pretrained("../models/finbert-finetuned1")
model = AutoModelForSequenceClassification.from_pretrained("../models/finbert-finetuned1")

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Move the weights to the chosen device and switch to evaluation mode.
model = model.to(device)
model = model.eval()

In [68]:
# Run the classifier over chunk files 4-7 and save each one with the predicted
# label and per-class probabilities appended.
# NOTE(review): chunks 1-3 are not processed here -- presumably they were
# scored in an earlier run; confirm before relying on the combined output.
first_chunk, last_chunk = 4, 7
chunk_paths = [f"./data/df_chunk_{n}.csv" for n in range(first_chunk, last_chunk + 1)]

batch_size = 64

# enumerate(start=first_chunk) yields the real chunk number directly, instead
# of rebinding the loop variable inside the loop body.
for chunk_number, path in enumerate(chunk_paths, start=first_chunk):
    df = pd.read_csv(path)
    # Empty text cells come back from read_csv as NaN (a float); coerce every
    # entry to str so the tokenizer never receives a non-string value.
    texts = df['text'].fillna("").astype(str).tolist()

    preds, probs = [], []

    # Batches must start at offset 0. Starting at 3 skipped the first three
    # rows, leaving preds/probs shorter than df and making the column
    # assignments below fail with a length mismatch.
    for j in tqdm(range(0, len(texts), batch_size), desc=f"Classifying Chunk {chunk_number}"):
        batch_texts = texts[j:j + batch_size]
        enc = tokenizer(batch_texts, return_tensors='pt', truncation=True, padding=True, max_length=128)
        enc = {k: v.to(device) for k, v in enc.items()}

        # Inference only: no gradients are needed.
        with torch.no_grad():
            logits = model(**enc).logits
            batch_probs = softmax(logits, dim=1).cpu()

        preds.extend(batch_probs.argmax(dim=1).tolist())
        probs.extend(batch_probs.tolist())

    # One prediction per input row is required for the assignments below.
    assert len(preds) == len(df), "prediction count does not match row count"

    df['sentiment_label'] = preds
    # NOTE(review): the column names assume class indices 0/1/2 mean
    # positive/neutral/negative -- verify against model.config.id2label.
    df['positive'] = [p[0] for p in probs]
    df['neutral'] = [p[1] for p in probs]
    df['negative'] = [p[2] for p in probs]

    df.to_csv(f"./data/df_chunk_{chunk_number}_with_sentiment.csv", index=False)

Classifying Chunk 5:   0%|          | 0/782 [00:00<?, ?it/s]

Classifying Chunk 5:   0%|          | 0/782 [00:04<?, ?it/s]


KeyboardInterrupt: 

In [45]:
import glob

# Collect every scored chunk file (in sorted filename order) and stack them
# into a single frame with a fresh 0..n-1 index.
scored_paths = sorted(glob.glob("./data/df_chunk_*_with_sentiment.csv"))
scored_frames = [pd.read_csv(scored_path) for scored_path in scored_paths]

all_chunks = pd.concat(scored_frames, ignore_index=True)

In [None]:
# Build two per-(ticker, date) aggregates from the scored rows.
group_keys = ['ticker', 'date']


def _as_list(values):
    """Collect all values of one group into a plain Python list."""
    return list(values)


# `grouped`: every listed column is kept as the full list of per-row values.
list_columns = [
    'text', 'sentiment_label',
    'positive', 'neutral', 'negative',
    'price', 'volume', 'volatility_10d', 'volatility_30d',
    'return_1d', 'return_2d', 'return_3d',
]
grouped = (
    all_chunks
    .groupby(group_keys)
    .agg({column: _as_list for column in list_columns})
    .reset_index()
)

# `daily_sentiment`: class probabilities are averaged per day, returns take
# the first value seen in the group, and the remaining columns stay as lists.
# The spec is assembled in the order the output columns should appear.
daily_spec = {'text': _as_list}
daily_spec.update({column: "mean" for column in ("positive", "neutral", "negative")})
daily_spec.update(
    {column: _as_list for column in ('price', 'volume', 'volatility_10d', 'volatility_30d')}
)
daily_spec.update({column: "first" for column in ("return_1d", "return_2d", "return_3d")})

daily_sentiment = all_chunks.groupby(group_keys).agg(daily_spec).reset_index()

In [47]:
# Drop rows missing any class probability or the 1-day return, order by
# ticker and date, then convert `date` to datetime.
# NOTE(review): the sort runs before the datetime conversion; if `date` is
# still text at this point the ordering is lexical, which matches
# chronological order only for ISO-formatted dates -- confirm.
daily_sentiment = (
    daily_sentiment
    .dropna(subset=['positive', 'neutral', 'negative', 'return_1d'])
    .sort_values(by=['ticker', 'date'])
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

In [52]:
from sklearn.preprocessing import LabelEncoder

In [55]:
# Expand each ticker to a continuous daily calendar between its first and
# last observed date, so that days without any rows appear explicitly.

filled_df = []

for ticker, ticker_rows in daily_sentiment.groupby('ticker'):
    ordered = ticker_rows.sort_values('date').reset_index(drop=True)
    calendar = pd.date_range(
        start=ordered['date'].min(),
        end=ordered['date'].max(),
        freq='D',
    )
    expanded = (
        ordered
        .set_index('date')
        .reindex(calendar)                   # inserted days are all-NaN rows
        .reset_index()
        .rename(columns={'index': 'date'})   # reset_index names the calendar column 'index'
        .assign(ticker=ticker)               # restore the ticker on inserted rows
    )
    filled_df.append(expanded)

df_full = pd.concat(filled_df, ignore_index=True)

# Inserted days get 0 for the three class probabilities; the other columns
# stay NaN on those days.
sentiment_columns = ['positive', 'neutral', 'negative']
df_full[sentiment_columns] = df_full[sentiment_columns].fillna(0)

# Integer code per ticker; `le` is kept so the mapping can be reused.
le = LabelEncoder()
df_full['ticker_encoded'] = le.fit_transform(df_full['ticker'])

df_full = df_full.sort_values(['ticker', 'date']).reset_index(drop=True)

df_full.head()

Unnamed: 0,date,ticker,positive,neutral,negative,return_1d,return_2d,return_3d,ticker_encoded
0,2017-01-31,21CF,0.000167,0.999619,0.000214,-0.000319,-0.002868,-0.002868,0
1,2017-02-01,21CF,0.0,0.0,0.0,,,,0
2,2017-02-02,21CF,0.0,0.0,0.0,,,,0
3,2017-02-03,21CF,0.0,0.0,0.0,,,,0
4,2017-02-04,21CF,0.0,0.0,0.0,,,,0


In [57]:
# Persist both aggregates as CSV files without the index column.
exports = (
    (grouped, "../data/full_sentiment_dataset.csv"),
    (df_full, "../data/full_sentiment_dataset_cleaned.csv"),
)
for frame, destination in exports:
    frame.to_csv(destination, index=False)