# Mounting to Google Drive
We mount the Google Colab file to our Drive and direct it to the correct directory.

In [1]:
### MOUNT DRIVE
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
!ls

drive  sample_data


In [3]:
###Set the path of the folder where your colab file and data exist in Google Drive in the ------ portion

%cd "drive/MyDrive/cse354finalproj"

/content/drive/.shortcut-targets-by-id/1Tw4bh9dvewvoo-24u_J8tIUioi5k1--j/cse354finalproj


# SETING UP THE PROJECT
In this section, we import all necessary libraries such as torch, transformer, and pandas. We also defined constants for later use. These constants hold information on paths to directories, csv filepaths, and numbers used for our model. 

In [4]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.19.0-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 29.6 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.6.0-py3-none-any.whl (84 kB)
[K     |████████████████████████████████| 84 kB 2.9 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 60.7 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 58.6 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled PyYAML-3.13
Successfully installed huggingface-hub-0

In [5]:
###IMPORTS
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AdamW
import os
from sklearn.metrics import precision_score, recall_score, f1_score
torch.manual_seed(42)
np.random.seed(42)

In [6]:
###CONSTANTS
BATCH_SIZE = 16
EPOCHS = 10
TEST_PATH = "data/test_data.csv"
TRAIN_PATH = "data/train_data.csv"
VAL_PATH = "data/val_data.csv"
SAVE_PATH = "models/FinBERT"

In [7]:
def load_dataset(path):
  dataset = pd.read_csv(path)
  return dataset

In [8]:
train_data = load_dataset(TRAIN_PATH)
val_data = load_dataset(VAL_PATH)
test_data = load_dataset(TEST_PATH)

# Initialize the Model
For our model, we will be using the pretrain FinBERT model, which is a model that was trained on financial data.


In [9]:
class FinBERT():

  def __init__(self, model_name='ProsusAI/finbert', num_classes=3):
    self.tokenizer = AutoTokenizer.from_pretrained(model_name)
    self.model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)

  def get_tokenizer_and_model(self):
    return self.model, self.tokenizer

# Create DataLoader Class
To feed our data into our FinBERT model, we need to get the data and tokenize it first. This class is responsible for preparing the data before sending it through the model.


In [10]:
class DatasetLoader(Dataset):

  def __init__(self, data, tokenizer):
    self.data = data
    self.tokenizer = tokenizer

  def tokenize_data(self):
    tokens = []
    labels = []
    label_dict = {'negative': 2, 'positive': 1, 'neutral': 0}

    review_list = self.data['headline'].to_list()
    label_list = self.data['sentiment'].to_list()

    for (review, label) in tqdm(zip(review_list, label_list), total=len(review_list)):
  
      token_review = self.tokenizer(review, max_length=512, truncation=True, return_tensors='pt')
      tokens.append(token_review['input_ids'][0])
      labels.append(label_dict[label])
  
    tokens = pad_sequence(tokens, batch_first=True)
    labels = torch.tensor(labels)
    dataset = TensorDataset(tokens, labels)
    return dataset

  def get_data_loaders(self, batch_size=32, shuffle=True):
    processed_dataset = self.tokenize_data()

    data_loader = DataLoader(
        processed_dataset,
        shuffle=shuffle,
        batch_size=batch_size
    )

    return data_loader

# Building the Trainer Class
This model contains functions that generate performance metrics, train parameters, and evaluate the generated parameters.

In [18]:
class Trainer():

  def __init__(self, options):
    self.device = options['device']
    self.train_data = options['train_data']
    self.val_data = options['val_data']
    self.batch_size = options['batch_size']
    self.epochs = options['epochs']
    self.save_path = options['save_path']
    transformer = FinBERT()
    self.model, self.tokenizer = transformer.get_tokenizer_and_model()
    self.model.to(self.device)

  def get_performance_metrics(self, preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    precision = precision_score(labels_flat, pred_flat, zero_division=0, average='micro')
    recall = recall_score(labels_flat, pred_flat, zero_division=0, average='micro')
    f1 = f1_score(labels_flat, pred_flat, zero_division=0, average='micro')
    return precision, recall, f1

  def train(self, data_loader, optimizer):
    self.model.train()
    total_recall = 0
    total_precision = 0
    total_f1 = 0
    total_loss = 0

    for batch_idx, (reviews, labels) in enumerate(tqdm(data_loader)):
      self.model.zero_grad()

      reviews = reviews.to('cuda' if torch.cuda.is_available() else 'cpu')
      labels = labels.to('cuda' if torch.cuda.is_available() else 'cpu')
      outputs = self.model(reviews, labels=labels)
      loss = outputs.loss
      logits = outputs.logits

      precision, recall, f1 = self.get_performance_metrics(logits.cpu().data.numpy(), labels.cpu().data.numpy())
      total_precision += precision
      total_recall += recall
      total_f1 += f1
      total_loss += loss

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

    precision = total_precision/len(data_loader)
    recall = total_recall/len(data_loader)
    f1 = total_f1/len(data_loader)
    loss = total_loss/len(data_loader)

    return precision, recall, f1, loss

  def eval(self, data_loader):
    self.model.eval()
    total_recall = 0
    total_precision = 0
    total_f1 = 0
    total_loss = 0

    with torch.no_grad():
      for (reviews, labels) in tqdm(data_loader):

        reviews = reviews.to('cuda' if torch.cuda.is_available() else 'cpu')
        labels = labels.to('cuda' if torch.cuda.is_available() else 'cpu')
        outputs = self.model(reviews, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        precision, recall, f1 = self.get_performance_metrics(logits.cpu().data.numpy(), labels.cpu().data.numpy())
        total_precision += precision
        total_recall += recall
        total_f1 += f1
        total_loss += loss

    
    precision = total_precision/len(data_loader)
    recall = total_recall/len(data_loader)
    f1 = total_f1/len(data_loader)
    loss = total_loss/len(data_loader)

    return precision, recall, f1, loss

  def save_transformer(self):
    self.model.save_pretrained(self.save_path)
    self.tokenizer.save_pretrained(self.save_path)

  def execute(self):
    last_best = 0
    train_dataset = DatasetLoader(self.train_data, self.tokenizer)
    train_data_loader = train_dataset.get_data_loaders(self.batch_size)
    val_dataset = DatasetLoader(self.val_data, self.tokenizer)
    val_data_loader = val_dataset.get_data_loaders(self.batch_size)
    optimizer = AdamW(self.model.parameters(), lr = 3e-5, eps = 1e-8)

    if self.epochs == 0:
      self.save_transformer()

    for epoch_i in range(0, self.epochs):
      train_precision, train_recall, train_f1, train_loss = self.train(train_data_loader, optimizer)
      print(f'Epoch {epoch_i + 1}: train_loss: {train_loss:.4f} train_precision: {train_precision:.4f} train_recall: {train_recall:.4f} train_f1: {train_f1:.4f}')
      val_precision, val_recall, val_f1, val_loss = self.eval(val_data_loader)
      print(f'Epoch {epoch_i + 1}: val_loss: {val_loss:.4f} val_precision: {val_precision:.4f} val_recall: {val_recall:.4f} val_f1: {val_f1:.4f}')

      if val_f1 > last_best:
        print("Saving model..")
        self.save_transformer()
        last_best = val_f1
        print("Model saved.")


In [12]:
options = {}
options['batch_size'] = BATCH_SIZE
options['device'] = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
options['train_data'] = train_data
options['val_data'] = val_data
options['save_path'] = SAVE_PATH
options['epochs'] = EPOCHS 
trainer = Trainer(options)
trainer.execute()

Downloading:   0%|          | 0.00/252 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/758 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

100%|██████████| 3391/3391 [00:00<00:00, 4821.04it/s]
100%|██████████| 728/728 [00:00<00:00, 4807.75it/s]
100%|██████████| 212/212 [01:23<00:00,  2.53it/s]


Epoch 1: train_loss: 0.7409 train_precision: 0.6303 train_recall: 0.6303 train_f1: 0.6303


100%|██████████| 46/46 [00:03<00:00, 14.24it/s]


Epoch 1: val_loss: 2.0893 val_precision: 0.3071 val_recall: 0.3071 val_f1: 0.3071
Saving model..
Model saved.


100%|██████████| 212/212 [01:27<00:00,  2.41it/s]


Epoch 2: train_loss: 0.4253 train_precision: 0.8346 train_recall: 0.8346 train_f1: 0.8346


100%|██████████| 46/46 [00:03<00:00, 13.65it/s]


Epoch 2: val_loss: 1.7162 val_precision: 0.3777 val_recall: 0.3777 val_f1: 0.3777
Saving model..
Model saved.


100%|██████████| 212/212 [01:30<00:00,  2.34it/s]


Epoch 3: train_loss: 0.2411 train_precision: 0.9216 train_recall: 0.9216 train_f1: 0.9216


100%|██████████| 46/46 [00:03<00:00, 13.45it/s]


Epoch 3: val_loss: 0.7361 val_precision: 0.7269 val_recall: 0.7269 val_f1: 0.7269
Saving model..
Model saved.


100%|██████████| 212/212 [01:31<00:00,  2.32it/s]


Epoch 4: train_loss: 0.1264 train_precision: 0.9614 train_recall: 0.9614 train_f1: 0.9614


100%|██████████| 46/46 [00:03<00:00, 13.19it/s]


Epoch 4: val_loss: 0.9604 val_precision: 0.7065 val_recall: 0.7065 val_f1: 0.7065


100%|██████████| 212/212 [01:32<00:00,  2.29it/s]


Epoch 5: train_loss: 0.0694 train_precision: 0.9802 train_recall: 0.9802 train_f1: 0.9802


100%|██████████| 46/46 [00:03<00:00, 13.08it/s]


Epoch 5: val_loss: 0.8627 val_precision: 0.7731 val_recall: 0.7731 val_f1: 0.7731
Saving model..
Model saved.


100%|██████████| 212/212 [01:32<00:00,  2.29it/s]


Epoch 6: train_loss: 0.0492 train_precision: 0.9861 train_recall: 0.9861 train_f1: 0.9861


100%|██████████| 46/46 [00:03<00:00, 13.03it/s]


Epoch 6: val_loss: 1.1008 val_precision: 0.6834 val_recall: 0.6834 val_f1: 0.6834


100%|██████████| 212/212 [01:33<00:00,  2.28it/s]


Epoch 7: train_loss: 0.0351 train_precision: 0.9909 train_recall: 0.9909 train_f1: 0.9909


100%|██████████| 46/46 [00:03<00:00, 13.00it/s]


Epoch 7: val_loss: 1.0122 val_precision: 0.7351 val_recall: 0.7351 val_f1: 0.7351


100%|██████████| 212/212 [01:33<00:00,  2.27it/s]


Epoch 8: train_loss: 0.0277 train_precision: 0.9920 train_recall: 0.9920 train_f1: 0.9920


100%|██████████| 46/46 [00:03<00:00, 13.04it/s]


Epoch 8: val_loss: 1.1557 val_precision: 0.7228 val_recall: 0.7228 val_f1: 0.7228


100%|██████████| 212/212 [01:33<00:00,  2.27it/s]


Epoch 9: train_loss: 0.0356 train_precision: 0.9894 train_recall: 0.9894 train_f1: 0.9894


100%|██████████| 46/46 [00:03<00:00, 13.03it/s]


Epoch 9: val_loss: 1.3141 val_precision: 0.6793 val_recall: 0.6793 val_f1: 0.6793


100%|██████████| 212/212 [01:33<00:00,  2.28it/s]


Epoch 10: train_loss: 0.0259 train_precision: 0.9903 train_recall: 0.9903 train_f1: 0.9903


100%|██████████| 46/46 [00:03<00:00, 13.02it/s]

Epoch 10: val_loss: 1.3339 val_precision: 0.7296 val_recall: 0.7296 val_f1: 0.7296





In [19]:
options = {}
options['batch_size'] = BATCH_SIZE
options['device'] = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
options['train_data'] = train_data
options['val_data'] = val_data
options['save_path'] = SAVE_PATH + "-no-fine-tune"
options['epochs'] = 0
trainer = Trainer(options)
trainer.execute()

100%|██████████| 3391/3391 [00:00<00:00, 5107.04it/s]
100%|██████████| 728/728 [00:00<00:00, 5217.33it/s]


# TESTING
In this section, we will be testing our fine-tuned FinBERT model on data that was set aside specifically for testing. We will measure the performance metrics to see how the model did.


In [20]:
class Tester():

  def __init__(self, options):
    self.save_path = options['save_path']
    self.device = options['device']
    self.test_data = options['test_data']
    self.batch_size = options['batch_size']
    transformer = FinBERT(self.save_path)
    self.model, self.tokenizer = transformer.get_tokenizer_and_model()
    self.model.to(self.device)

  def get_performance_metrics(self, preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    precision = precision_score(labels_flat, pred_flat, zero_division=0, average='micro')
    recall = recall_score(labels_flat, pred_flat, zero_division=0, average='micro')
    f1 = f1_score(labels_flat, pred_flat, zero_division=0, average='micro')
    return precision, recall, f1

  def test(self, data_loader):
    self.model.eval()
    total_recall = 0
    total_precision = 0
    total_f1 = 0
    total_loss = 0

    with torch.no_grad():
      for (reviews, labels) in tqdm(data_loader):

        reviews = reviews.to('cuda' if torch.cuda.is_available() else 'cpu')
        labels = labels.to('cuda' if torch.cuda.is_available() else 'cpu')
        outputs = self.model(reviews, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        precision, recall, f1 = self.get_performance_metrics(logits.cpu().data.numpy(), labels.cpu().data.numpy())
        total_precision += precision
        total_recall += recall
        total_f1 += f1
        total_loss += loss
    
    precision = total_precision/len(data_loader)
    recall = total_recall/len(data_loader)
    f1 = total_f1/len(data_loader)
    loss = total_loss/len(data_loader)

    return precision, recall, f1, loss

  def execute(self):
    test_dataset = DatasetLoader(self.test_data, self.tokenizer)
    test_data_loader = test_dataset.get_data_loaders(self.batch_size)

    test_precision, test_recall, test_f1, test_loss = self.test(test_data_loader)

    print()
    print(f'test_loss: {test_loss:.4f} test_precision: {test_precision:.4f} test_recall: {test_recall:.4f} test_f1: {test_f1:.4f}')

In [21]:
options = {}
options['batch_size'] = BATCH_SIZE
options['device'] = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
options['test_data'] = test_data
options['save_path'] = SAVE_PATH
tester = Tester(options)
tester.execute()

100%|██████████| 727/727 [00:00<00:00, 4482.84it/s]
100%|██████████| 46/46 [00:04<00:00,  9.72it/s]


test_loss: 0.7092 test_precision: 0.8230 test_recall: 0.8230 test_f1: 0.8230





In [22]:
options = {}
options['batch_size'] = BATCH_SIZE
options['device'] = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
options['test_data'] = test_data
options['save_path'] = SAVE_PATH + "-no-fine-tune"
tester = Tester(options)
tester.execute()

100%|██████████| 727/727 [00:00<00:00, 4183.15it/s]
100%|██████████| 46/46 [00:04<00:00,  9.78it/s]


test_loss: 2.0410 test_precision: 0.0476 test_recall: 0.0476 test_f1: 0.0476





# WEB SCRAPPING
Below is our web scrapper for gathering realtime stock news headlines to run through our model. The website we are getting our data from is finviz.com, more specifically the url https://finviz.com/news.ashx. The python modules we will be using for this are Selenium and BeautifulSoup.

# Important Note
The code in this section should be run locally on your machine, not in Google Colab due to compatibility issues with certain modules and Colab. After running, a .csv file containing headlines from the url https://finviz.com/news.ashx will be saved into your directory.

In [None]:
!pip install selenium
!pip install webdriver-manager
!pip install requests

In [None]:
# Import necessary modules
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from webdriver_manager.utils import ChromeType
import time

opts = webdriver.ChromeOptions()
opts.add_argument('headless')
cap = DesiredCapabilities.CHROME
cap["pageLoadStrategy"] = "none"
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.set_window_size(1280,720)
driver.get('https://finviz.com/news.ashx')
time.sleep(5)

html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')

hrefs = soup.find_all('a', {"class": "nn-tab-link"})
headlines = []
for href in hrefs:
    headlines.append(href.text)
headlines = headlines[1:]

df = pd.DataFrame(headlines, columns=['headline'])
df.to_csv('headlines.csv')

# CLASSIFYING FINVIZ HEADLINES
Now, after training our sentiment analysis model and scraping headlines from www.finviz.com, it is time to put our model to work. Feeding the csv file we generated from our web scraping code into our fine-tuned FinBERT model, we will be able to classify each headline from www.finviz.com.

In [23]:
# Constants
FINVIZ_PATH = 'data/finviz.csv'
finviz_data = load_dataset(FINVIZ_PATH)

In [24]:
# Makes predictions from the Finviz data
transformer = FinBERT(SAVE_PATH)
model, tokenizer = transformer.get_tokenizer_and_model()
model.to('cuda' if torch.cuda.is_available() else 'cpu')


inputs = tokenizer(list(finviz_data['headline']), padding = True, truncation = True, return_tensors='pt')
inputs.to('cuda' if torch.cuda.is_available() else 'cpu')
outputs = model(**inputs)
preds = torch.nn.functional.softmax(outputs.logits, dim=-1)

In [25]:
# Creates a DataFrame from the predictions for easy viewing
pos = preds[:, 0].tolist()
neg = preds[:, 1].tolist()
neu = preds[:, 2].tolist()
headlines = list(finviz_data['headline'])
cl = []

# Classifying based on highest probability
for i in range(len(pos)):
  if max(pos[i], neg[i], neu[i]) == pos[i]:
    cl.append("Positive")
  elif max(pos[i], neg[i], neu[i]) == neg[i]:
    cl.append("Negative")
  elif max(pos[i], neg[i], neu[i]) == neu[i]:
    cl.append("Neutral")

# Create the DataFrame
results = pd.DataFrame()
results['Headline'] = headlines
results['Positive'] = pos
results['Neutral'] = neu
results['Negative'] = neg
results['Class'] = cl

results

Unnamed: 0,Headline,Positive,Neutral,Negative,Class
0,Bill Gates Says Economy ‘Bears’ Have Strong Ar...,0.055202,0.001305,0.943493,Negative
1,Firm set to buy McColl's urged to take on staf...,0.987570,0.010691,0.001739,Positive
2,‘Please Help’: A Nationwide Baby Formula Short...,0.010702,0.078566,0.910732,Negative
3,Stock Funds Felt the Tech Pain in April,0.054555,0.114166,0.831279,Negative
4,Vanishing Value Trade Puts Emerging Markets at...,0.517871,0.005980,0.476149,Positive
...,...,...,...,...,...
175,Why Am I Losing Money In The Market?,0.015851,0.872644,0.111505,Neutral
176,Making a Fresh Start: Lessons From Molly Ruth,0.921134,0.067908,0.010958,Positive
177,New Q3 Issue Just Released Today,0.717181,0.279741,0.003078,Positive
178,Taking A Break From Blogging And Social Media,0.694926,0.301250,0.003824,Positive


In [26]:
# Exports the generated DataFrame to the results directory
results.to_csv("results/results.csv")