In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.nn.functional import softmax
import pandas as pd

In [None]:
# Data preprocessing - fix dates

# NOTE(security): unpickling can execute arbitrary code, so this file must
# come from a trusted source.
df = pd.read_pickle('../data/motley-fool-data.pkl')

# Show which Python types occur in the `date` column. This statement is not
# the last expression of the cell, so without an explicit print its result
# would be computed and silently discarded.
print(df['date'].apply(type).value_counts())

def normalize_date(val):
    """Collapse a list-valued date into a single value.

    Args:
        val: A raw ``date`` cell; either a list or any other value.

    Returns:
        ``val`` unchanged when it is not a list, the first element when it
        is a non-empty list, and ``None`` when it is an empty list.
    """
    # Anything that is not a list passes straight through.
    if not isinstance(val, list):
        return val
    # An empty list carries no date at all.
    if not val:
        return None
    return val[0]

# Unwrap list-valued dates, parse them (unparseable values become NaT via
# errors="coerce"), keep only the calendar date, then drop rows left
# without a date.
df = (
    df.assign(
        date=lambda frame: pd.to_datetime(
            frame["date"].apply(normalize_date), errors="coerce"
        ).dt.date
    )
    .dropna(subset=["date"])
)

In [None]:
# Data Preprocessing - sentences

from tqdm import tqdm
from multiprocessing import Pool, cpu_count
from utils.preprocessing import process_row

# Keep only rows whose transcript is an actual string.
has_text_transcript = df["transcript"].apply(lambda value: isinstance(value, str))
df = df.loc[has_text_transcript]

# One plain dict per row.
rows = df.to_dict(orient="records")

def run_parallel_processing():
    """Apply ``process_row`` to every entry of the module-level ``rows``.

    Work is spread over a process pool with one worker per CPU reported by
    ``cpu_count()``. ``Pool.imap`` yields results in input order, and the
    iterator is wrapped in a tqdm progress bar while it is consumed.

    Returns:
        list: One ``process_row`` result per entry of ``rows``.
    """
    n_rows = len(rows)
    with Pool(processes=cpu_count()) as workers:
        ordered_results = workers.imap(process_row, rows)
        progress = tqdm(ordered_results, total=n_rows, desc="Processing")
        # Drain the iterator while the pool is still alive.
        collected = list(progress)

    return collected


results = run_parallel_processing()

In [None]:
# Store data in csv - Part 1

# Flatten the per-row results into one flat list of items.
sentence_data = []
for row_items in results:
    sentence_data.extend(row_items)

sentence_df = pd.DataFrame(sentence_data)
sentence_df.to_csv("../data/parsed_sentences.csv", index=False)

# Preview the first rows as the cell output.
sentence_df.head()

In [None]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime

def scrape_news(ticker):
    """Scrape the news headlines listed on the Finviz quote page of ``ticker``.

    Args:
        ticker: Symbol interpolated into the Finviz quote URL.

    Returns:
        list[dict]: One dict per headline with the keys ``ticker``, ``date``
        (``str`` of a ``datetime.date``), ``time`` (the time text as shown on
        the page) and ``text`` (the headline). An empty list is returned when
        the request fails, the response status is not 200, or the page has no
        news table.

    Raises:
        ValueError: If a date cell is neither "Today", "Yesterday" nor in
            ``%b-%d-%y`` format.
    """
    # Local import: the notebook only imports `datetime` from this module,
    # but the "Yesterday" branch also needs `timedelta`.
    from datetime import timedelta

    url = f"https://finviz.com/quote.ashx?t={ticker}"
    headers = {"User-Agent": "Mozilla/5.0"}

    # A timeout keeps one stalled connection from hanging a long scraping
    # loop; network errors are reported like any other failed fetch.
    try:
        res = requests.get(url, headers=headers, timeout=15)
    except requests.RequestException:
        print("Failed to fetch data")
        return []

    # Check the status before spending time parsing an error page.
    if res.status_code != 200:
        print("Failed to fetch data")
        return []

    soup = BeautifulSoup(res.text, "html.parser")
    table = soup.find('table', class_='fullview-news-outer')

    if table is None:
        print("No news table found")
        return []

    data = []
    # Date of the most recent row that carried one; rows showing only a time
    # inherit it. Starts as None so leading time-only rows are skipped
    # instead of hitting an unbound variable.
    curr_date = None

    for row in table.find_all('tr'):
        # Skip rows that lack either the timestamp cell or the headline link.
        if row.td is None or row.a is None:
            continue

        dt_raw = row.td.text.strip()
        title = row.a.text.strip()
        parts = dt_raw.split()

        if len(parts) == 2:
            # "<date> <time>": this row starts a new day.
            day_text, time_text = parts
            if day_text == "Today":
                curr_date = datetime.now().date()
            elif day_text == "Yesterday":
                curr_date = (datetime.now() - timedelta(days=1)).date()
            else:
                curr_date = datetime.strptime(day_text, '%b-%d-%y').date()
        else:
            # Time only: same day as the previous dated row.
            time_text = dt_raw

        if curr_date:
            data.append({'ticker': ticker, 'date': str(curr_date), 'time': time_text, 'text': title})

    return data

In [None]:
# Read every HTML table from the Wikipedia S&P 500 page and use the first one.
url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
tables = pd.read_html(url)
sp500_table = tables[0]

# Replace '.' with '-' in each symbol.
# NOTE(review): presumably to match the ticker format Finviz expects — confirm.
raw_symbols = sp500_table['Symbol'].tolist()
sp500_tickers = [symbol.replace('.', '-') for symbol in raw_symbols]

In [None]:
import time
import random

# Collect headlines for every ticker, pausing a random 1.0-2.5 seconds
# before each request.
all_news = []

for ticker in sp500_tickers:
    pause_seconds = random.uniform(1.0, 2.5)
    time.sleep(pause_seconds)
    ticker_news = scrape_news(ticker)
    all_news += ticker_news

In [None]:
# Persist the scraped headlines.
# NOTE(review): this writes 'finviz_headlines2.csv', while the later cell that
# builds combined.csv reads 'finviz_headlines.csv' — confirm which file is intended.
headlines_frame = pd.DataFrame(all_news)
headlines_frame.to_csv("../data/finviz_headlines2.csv", index=False)

In [None]:
# Build one text corpus from transcript sentences and scraped headlines.
transcripts = pd.read_csv('../data/parsed_sentences.csv')
# NOTE(review): the scraping cell writes '../data/finviz_headlines2.csv', but
# this reads '../data/finviz_headlines.csv' — confirm which file is intended.
headlines = pd.read_csv('../data/finviz_headlines.csv')

# Both sources are reduced to the same three columns so they can be stacked.
shared_columns = ["text", "date", "ticker"]
transcripts = transcripts[shared_columns]
headlines = headlines[shared_columns]

# Down-sample the transcripts to at most 25,000 sentences. Capping at the
# available row count avoids the ValueError that DataFrame.sample raises
# when asked for more rows than exist (sampling is without replacement).
n_transcripts = min(25000, len(transcripts))
transcripts = transcripts.sample(n=n_transcripts, random_state=42)

combine = pd.concat([transcripts, headlines], ignore_index=True)
# Rows missing any of the three fields are dropped.
combine = combine.dropna(subset=shared_columns)
combine.to_csv('../data/combined.csv', index=False)

In [None]:
# Load model

# Tokenizer and classifier are loaded from the same local directory.
model_dir = "../models/finbert-finetuned1"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)

# Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the device and switch it to evaluation mode.
model = model.to(device)
model = model.eval()

In [None]:
# Classify every text of the combined corpus in batches.
df = pd.read_csv('../data/combined.csv')
texts = df['text'].tolist()

batch_size = 64
preds, probs = [], []

batch_starts = range(0, len(texts), batch_size)
for start in tqdm(batch_starts, desc="Classifying"):
    batch_texts = texts[start:start + batch_size]

    # Pad within the batch and truncate to 128 tokens.
    enc = tokenizer(batch_texts, return_tensors='pt', truncation=True, padding=True, max_length=128)
    enc = {name: tensor.to(device) for name, tensor in enc.items()}

    # Forward pass without gradient tracking.
    with torch.no_grad():
        logits = model(**enc).logits
        batch_probs = softmax(logits, dim=1)

    # Predicted class index and the full probability row for each text.
    batch_labels = batch_probs.argmax(dim=1)
    preds.extend(batch_labels.cpu().tolist())
    probs.extend(batch_probs.cpu().tolist())

In [None]:
# Attach the per-text predictions to the frame they were computed from.
df['sentiment_label'] = preds
# NOTE(review): the column names assume class indices 0/1/2 mean
# positive/neutral/negative — confirm against model.config.id2label.
# Passing df's index keeps the probability rows aligned with df's rows.
df[['positive', 'neutral', 'negative']] = pd.DataFrame(probs, index=df.index)

# Collect all values of each (ticker, date) group into lists. The grouping
# keys themselves are NOT aggregated: doing so creates 'ticker'/'date'
# columns that collide with the index levels restored by reset_index()
# ("cannot insert ..., already exists"), and would turn the keys into lists.
value_columns = ['text', 'sentiment_label', 'positive', 'neutral', 'negative']
grouped = (
    df.groupby(['ticker', 'date'])
      .agg({column: list for column in value_columns})
      .reset_index()
)

# The list cells are written to the CSV as their string representation.
grouped.to_csv("../data/grouped_by_date.csv", index=False)

In [None]:
# Reload the grouped data and parse the `date` column into datetimes.
grouped_path = '../data/grouped_by_date.csv'
df = pd.read_csv(grouped_path)
df = df.assign(date=pd.to_datetime(df['date']))