In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install transformers[torch]

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Checkpoint id shared by the tokenizer and the sequence-classification model,
# kept in one place so the two can never drift apart.
MODEL_NAME = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
import urllib3

# Print the installed urllib3 package metadata (including its version).
# NOTE(review): presumably here to confirm that the module-level
# `urllib3.request()` helper used by the scraper is available — TODO confirm.
!pip show urllib3

In [4]:
# Mount Google Drive into the Colab filesystem; the CSV files read and written
# further down in this notebook live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Utilities for Web Scraping and Sentiment Analysis

In [5]:
def get_article_text(url, title):
    """Download an article and return its title followed by its paragraph text.

    The page at ``url`` is fetched and parsed with BeautifulSoup. When the URL
    points at Google News, the ``href`` of the first ``<a>`` tag on that page
    is treated as the real article address and fetched with a second request.

    Args:
        url: Address of the article (or of a Google News redirect page).
        title: Headline; it is placed in front of the scraped text, followed
            by a single space.

    Returns:
        ``title + " "`` followed by the text of every ``<p>`` element,
        concatenated without a separator, or ``None`` when the page could not
        be fetched or parsed (a message is printed in that case).
    """
    try:
        html = urllib3.request("GET", url).data
        soup = BeautifulSoup(html, "html.parser")

        if "https://news.google.com" in url:
            # A missing <a> tag or href raises here and is reported by the
            # handler below like any other scraping failure.
            redirected_url = soup.find('a')['href']

            # Make another HTTP GET request to the extracted link
            html = urllib3.request("GET", redirected_url).data
            soup = BeautifulSoup(html, "html.parser")

        # Title first, then the text of all paragraphs in document order.
        paragraphs = "".join(p.get_text() for p in soup.find_all('p'))
        return title + " " + paragraphs
    except Exception:
        # Catch Exception rather than everything: a bare `except:` would also
        # swallow KeyboardInterrupt/SystemExit, making a long scraping run
        # impossible to stop.
        print(f"Could not open page: {url}")
        return None

from tqdm.auto import tqdm
def get_sentiment_results(df):
    """Scrape every article listed in ``df`` and classify its sentiment.

    Args:
        df: DataFrame with ``url`` and ``title`` columns, one article per row.

    Returns:
        A ``(sentiment_scores, sentiment_probs)`` tuple of lists with one
        entry per row of ``df``, in row order. For an article whose text was
        retrieved, the score is the index of the most probable class and the
        probs entry is the softmax output converted to a nested list (one
        inner list, because a single text is classified at a time). Both
        entries are ``None`` when no text could be retrieved.
    """
    sentiment_scores = []
    sentiment_probs = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        # get_article_text already prepends the title to the scraped body.
        article_text = get_article_text(row['url'], row['title'])
        if not article_text:
            # Keep the outputs aligned with df when an article is unavailable.
            sentiment_scores.append(None)
            sentiment_probs.append(None)
            continue

        # Inputs longer than 512 tokens are truncated.
        inputs = tokenizer(article_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        # Inference only: disable autograd so no graph is built per article.
        with torch.no_grad():
            outputs = model(**inputs)
            probs = torch.softmax(outputs.logits, dim=-1)
        sentiment_scores.append(probs.argmax(dim=-1).item())
        sentiment_probs.append(probs.cpu().numpy().tolist())
    return sentiment_scores, sentiment_probs

# Target Stocks and News

In [6]:
# Ticker symbols processed by this notebook.
stocks = ['AAPL', 'META', 'NVDA']

# Terms related to each ticker: the symbol itself plus one associated name.
related_terms = dict(
    AAPL=['AAPL', 'Apple'],
    META=['META', 'Metaverse'],
    NVDA=['NVDA', 'NVIDIA'],
)

# company url names for Investing.com
url_names = dict(
    AAPL='apple-computer-inc',
    META='facebook-inc',
    NVDA='nvidia-corp',
)

# Start and end dates as YYYY-MM-DD strings.
start, end = "2022-01-01", "2024-01-01"

In [None]:
# Drive folder holding the per-stock news CSVs; the sentiment CSVs are written
# back to the same folder.
DATA_DIR = "/content/drive/My Drive/WPI/Senior Year/CS539 (ML)"

# For each ticker: load its news, drop repeated headlines, score the remaining
# articles and save the result as <ticker>_sentiment.csv.
for stock in stocks:
    df = pd.read_csv(f"{DATA_DIR}/{stock}_news.csv")
    print(len(df))                          # rows before de-duplication
    print(df.duplicated(['title']).sum())   # rows sharing a title with an earlier row
    df = df.drop_duplicates(['title'])
    print(len(df))                          # rows that will actually be scored
    scores, probs = get_sentiment_results(df)
    # .assign builds a new frame instead of writing columns into the frame
    # returned by drop_duplicates, which can trigger SettingWithCopyWarning.
    df = df.assign(sentiment_score=scores, sentiment_probs=probs)
    df.to_csv(f"{DATA_DIR}/{stock}_sentiment.csv", index=False)