In [None]:
# !pip install datasets
# !pip install transformers

In [None]:
import os
import sys

import torch 
import torch.nn as nn
from tqdm import tqdm

from datasets import load_dataset
from torch.utils.data import DataLoader

from transformers import BertForSequenceClassification, BertTokenizer
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler


In [None]:
import warnings
# Suppress every warning category for the rest of the session. Note that this
# also hides deprecation notices emitted by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Load the 'sentences_50agree' configuration of the 'financial_phrasebank' dataset.
# Only its 'train' split is read in the cells below.
financial_phrasebank = load_dataset('financial_phrasebank', 'sentences_50agree') 
# Bare expression: display the loaded dataset object as the cell output.
financial_phrasebank

#### Training Examples: 
- 0: 'negative'
- 1: 'neutral'
- 2: 'positive'

In [None]:
# Fetch the first training row once and show both of its fields.
first_example = financial_phrasebank['train'][0]
print("Sentence:", first_example['sentence'])
print("Label:", first_example['label'])

In [None]:
# Same inspection for the fourth training row (index 3).
fourth_example = financial_phrasebank['train'][3]
print("Sentence:", fourth_example['sentence'])
print("Label:", fourth_example['label'])

#### Create a validation and test set

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so that Restart & Run All always yields the same three splits
# (and therefore comparable metrics between runs).
SPLIT_SEED = 42

# Materialise both columns once as plain lists instead of re-reading them per argument.
all_sentences = list(financial_phrasebank['train']['sentence'])
all_labels = list(financial_phrasebank['train']['label'])

# 80% train / 20% held out. Stratifying keeps the class proportions of the
# full dataset in every split.
train_inputs, remain_inputs, train_labels, remain_labels = train_test_split(
    all_sentences,
    all_labels,
    train_size=0.8,
    random_state=SPLIT_SEED,
    stratify=all_labels,
)

# Split the held-out 20% in half: 10% test, 10% validation.
test_inputs, validation_inputs, test_labels, validation_labels = train_test_split(
    remain_inputs,
    remain_labels,
    test_size=0.5,
    random_state=SPLIT_SEED,
    stratify=remain_labels,
)



In [None]:
# Gather the three splits under one mapping: split name -> {'sentence': [...], 'label': [...]}.
financial_phrasebank_dict = {
    split_name: {'sentence': split_sentences, 'label': split_labels}
    for split_name, split_sentences, split_labels in (
        ('train', train_inputs, train_labels),
        ('valid', validation_inputs, validation_labels),
        ('test', test_inputs, test_labels),
    )
}

In [None]:
financial_phrasebank_dict['train']['sentence'][0]

#### Tokenize

In [None]:
# The tokenizer has to come from the same checkpoint as the model that is
# fine-tuned later in this notebook ("bert-base-uncased"): token IDs index the
# model's embedding table, so a vocabulary from a different checkpoint
# (previously "bert-base-cased") feeds the model the wrong embeddings.
# Alternative checkpoint that was tried:
# from transformers import AutoTokenizer 
# tokenizer = AutoTokenizer.from_pretrained('ProsusAI/finbert')
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

In [None]:
# Walk one sentence through the tokenizer: raw text -> word pieces -> vocabulary IDs.
sample_sentence = financial_phrasebank['train']['sentence'][0]
sample_tokens = tokenizer.tokenize(sample_sentence)
print("Sentence:", sample_sentence)
print("Tokens:", sample_tokens)
print("Token IDs:", tokenizer.convert_tokens_to_ids(sample_tokens))

Tokenize all of the sentences, pad them to a common length, and map the tokens to their word IDs.
For every sentence, the tokenizer will:
  - (1) Tokenize the sentence.
  - (2) Prepend the `[CLS]` token to the start. - token id 101
  - (3) Append the `[SEP]` token to the end. - token id 102
  - (4) Map tokens to their IDs.
  - (5) Ensure all sentences are equal length. Pad sequences with 0 

In [None]:
financial_phrasebank_dict['train'].keys()

In [None]:
def tokenize_datasets(data, max_length=64, tok=None):
  """Tokenize every split of `data` to fixed-length tensors.

  Args:
    data: mapping of split name -> {'sentence': list of str, 'label': list}.
    max_length: length every sequence is padded/truncated to (default 64,
      the value that used to be hard-coded).
    tok: callable used to encode the sentences. When omitted, the
      notebook-level `tokenizer` is used, as before.

  Returns:
    dict of split name -> the tokenizer's output for that split, with the
    split's labels stored alongside under the 'label' key.
  """
  # The global is only looked up when no tokenizer was passed in.
  encode = tokenizer if tok is None else tok

  tokenized_datasets = {}
  for collection, split in data.items():
    encoded = encode(
        split['sentence'],
        padding='max_length',
        max_length=max_length,
        truncation=True,
        return_tensors='pt',
    )
    # Keep the labels next to the encoded inputs so that later cells can read both from one place.
    encoded['label'] = split['label']
    tokenized_datasets[collection] = encoded

  return tokenized_datasets

In [None]:
# Tokenize every split held in financial_phrasebank_dict (train, valid, test).
tokenized_datasets = tokenize_datasets(financial_phrasebank_dict)

In [None]:
tokenized_datasets['train'].keys()

In [None]:
tokenized_datasets['train']['input_ids'][10]

In [None]:
# NOTE: len() on a string counts characters, so this is the longest training
# sentence in characters, not in tokens.
train_sentence_lengths = [len(sen) for sen in financial_phrasebank_dict['train']['sentence']]
print('Max train sentence length: ', max(train_sentence_lengths))

In [None]:
len(tokenized_datasets['valid']['input_ids'])

#### Create dataset and dataloader


In [None]:
def data_loader(batch_size, tokenized_data):
  """Build one DataLoader per split of `tokenized_data`.

  Each split must provide 'input_ids', 'token_type_ids' and 'attention_mask'
  tensors plus a 'label' sequence. Batches are yielded as
  (input_ids, token_type_ids, attention_mask, labels). The 'train' split is
  drawn in random order; every other split is read sequentially.

  Returns:
    dict of split name -> DataLoader.
  """
  def build_loader(split_name, encoded):
    # Pack the four aligned tensors so that indexing yields one example.
    tensors = TensorDataset(
        encoded['input_ids'],
        encoded['token_type_ids'],
        encoded['attention_mask'],
        torch.tensor(encoded['label']),
    )
    # Shuffle only while training; keep evaluation order fixed.
    sampler_cls = RandomSampler if split_name == 'train' else SequentialSampler
    return DataLoader(tensors, sampler=sampler_cls(tensors), batch_size=batch_size)

  return {
      split_name: build_loader(split_name, encoded)
      for split_name, encoded in tokenized_data.items()
  }

In [None]:
# Build one DataLoader per tokenized split, with 16 examples per batch.
data_loaders = data_loader(batch_size=16, tokenized_data=tokenized_datasets)

#### Loading the pre-trained BERT model from huggingface library: 
`BertForSequenceClassification` is the pretrained BERT model with a single linear classification layer on top. 

In [None]:
# Load the checkpoint with a 3-way classification head (num_labels = 3, one
# output per sentiment class). The tokenizer used to encode the data must come
# from the same checkpoint as the model loaded here, otherwise the token IDs
# do not match the model's embedding table.
# The commented-out lines are alternative checkpoints that were tried.
# model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-pretrain',num_labels=3)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 3)
# model = AutoModelForSequenceClassification.from_pretrained('ProsusAI/finbert',num_labels=3)

In [None]:
# model

In [None]:
# Run on the GPU whenever at least one CUDA device is visible, else fall back to the CPU.
num_gpus = torch.cuda.device_count()
device = 'cuda' if num_gpus > 0 else 'cpu'

In [None]:
torch.cuda.is_available()

In [None]:
# Adam over every model parameter, with a learning rate of 5e-5.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=5e-5,
)

# Cross-entropy between the 3-way logits and the integer class labels,
# moved to the same device as the model.
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

In [None]:
import numpy as np
def accuracy(preds, labels): 
  """Fraction of rows in `preds` whose highest-scoring column equals the label.

  Args:
    preds: tensor of per-class scores, one row per example.
    labels: tensor of integer class ids, one per example.

  Returns:
    Share of correct predictions in the batch, between 0 and 1.
  """
  # Detach from the autograd graph and move to host memory before using NumPy.
  predicted_classes = preds.detach().cpu().numpy().argmax(axis=1).flatten()
  gold_classes = labels.to('cpu').numpy().flatten()
  n_correct = np.sum(predicted_classes == gold_classes)
  return n_correct / len(gold_classes)

# def accuracy(out, labels):
#     out = out.cpu().numpy()
#     labels = labels.to('cpu').numpy()
#     outputs = np.argmax(out, axis=1)
#     return np.sum(outputs == labels) / len(labels)

In [None]:
def _run_batches(model, dataloader, criterion, batch_device, optimizer=None):
    """Run one pass over `dataloader` and return (mean batch loss, mean batch accuracy).

    When `optimizer` is given, every batch also performs a training step
    (backward pass, gradient clipping to norm 1.0, optimizer step). Without
    it the pass is evaluation-only and runs with gradients disabled.
    Returns (nan, nan) for an empty dataloader instead of dividing by zero.
    """
    is_training = optimizer is not None
    loss_total, acc_total, n_batches = 0.0, 0.0, 0

    with torch.set_grad_enabled(is_training):
        for batch in dataloader:
            # Batch layout: (input_ids, token_type_ids, attention_mask, labels).
            b_input_ids = batch[0].to(batch_device)
            b_token_type_ids = batch[1].to(batch_device)
            b_input_mask = batch[2].to(batch_device)
            b_labels = batch[3].to(batch_device)

            if is_training:
                optimizer.zero_grad()

            # The model returns a tuple-like output whose first element holds the logits.
            logits = model(b_input_ids, token_type_ids=b_token_type_ids, attention_mask=b_input_mask)[0]
            loss = criterion(logits.view(-1, logits.size()[1]), b_labels.view(-1))

            if is_training:
                loss.backward()
                # Clip the gradient norm to 1.0 before the update.
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()

            loss_total += loss.item()
            acc_total += accuracy(logits, b_labels)
            n_batches += 1

    if n_batches == 0:
        return float('nan'), float('nan')
    return loss_total / n_batches, acc_total / n_batches


def train(model, optimizer, criterion, train_dataloader, val_dataloader, num_epochs, load_pretrained=False):
    """Fine-tune `model` and validate it after every epoch.

    Args:
        model: classifier called as model(input_ids, token_type_ids=..., attention_mask=...)
            whose output holds the logits at index 0.
        optimizer: optimizer over the model's parameters.
        criterion: loss taking (logits, labels).
        train_dataloader: yields (input_ids, token_type_ids, attention_mask, labels) batches.
        val_dataloader: same layout, used for the per-epoch validation pass.
        num_epochs: number of epochs to run.
        load_pretrained: when True the training pass is skipped and only
            validation runs each epoch.

    Returns:
        dict with per-epoch lists 'train_loss', 'train_acc', 'val_loss' and
        'val_acc'. The two train lists stay empty when `load_pretrained` is
        True, because no training metrics exist in that mode.

    Relies on the notebook-level `accuracy` helper for the batch accuracy.
    """
    plot_cache = {'train_loss':[], 'train_acc': [], 'val_loss':[], 'val_acc': []}

    # Send batches to wherever the model's weights live, so inputs and model always agree.
    first_param = next(model.parameters(), None)
    batch_device = first_param.device if first_param is not None else torch.device('cpu')

    for epoch in range(num_epochs):
        print("")
        print("Epoch:", epoch)

        if not load_pretrained:
            model.train()
            epoch_train_loss, epoch_train_acc = _run_batches(
                model, train_dataloader, criterion, batch_device, optimizer=optimizer)

            # Record training metrics only when a training pass actually ran.
            # Appending them unconditionally raised UnboundLocalError in
            # load_pretrained mode.
            plot_cache['train_loss'].append(epoch_train_loss)
            plot_cache['train_acc'].append(epoch_train_acc)

            print("")
            print("  Average training loss: {0:.2f}".format(epoch_train_loss))
            print("  Average training acc: {0:.2f}".format(epoch_train_acc))

        model.eval()
        epoch_val_loss, epoch_val_acc = _run_batches(model, val_dataloader, criterion, batch_device)

        plot_cache['val_loss'].append(epoch_val_loss)
        plot_cache['val_acc'].append(epoch_val_acc)

        print("")
        print("  Average validation loss: {0:.2f}".format(epoch_val_loss))
        print("  Average validation accuracy: {0:.2f}".format(epoch_val_acc))

    return plot_cache

In [None]:
# Fine-tune for five epochs, validating on the 'valid' split after each one.
plot_cache = train(
    model.to(device),
    optimizer,
    criterion,
    train_dataloader=data_loaders['train'],
    val_dataloader=data_loaders['valid'],
    num_epochs=5,
    load_pretrained=False,
)

#### Evaluation 

In [None]:
#Evaluating our model on the test set

# Prediction on test set

# len() of a DataLoader is the number of batches; the number of sentences is
# the length of the underlying dataset.
print('Predicting labels for {:,} test sentences...'.format(len(data_loaders['test'].dataset)))

# Put model in evaluation mode
model.eval()

# Tracking variables: one array of logits / labels per batch
pred_labels , true_labels = [], []

# Predict 
for batch in data_loaders['test']:
  # Move the whole batch to the model's device
  batch = tuple(t.to(device) for t in batch)
  # Unpack the inputs from our dataloader
  b_input_ids, b_token_type_ids, b_input_mask, b_labels = batch
  
  # Telling the model not to compute or store gradients, saving memory and 
  # speeding up prediction
  with torch.no_grad():
    # Pass the segment ids explicitly, exactly as the training and validation loops do.
    outputs = model(b_input_ids, token_type_ids=b_token_type_ids, attention_mask=b_input_mask)

  logits = outputs[0]

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  
  # Store predictions and true labels
  pred_labels.append(logits)
  true_labels.append(label_ids)

In [None]:
# Label encoding:
#   0: 'negative'
#   1: 'neutral'
#   2: 'positive'
test_label_list = financial_phrasebank_dict['test']['label']
test_size = len(test_label_list)

neutral = sum(1 for label_id in test_label_list if label_id == 1)
positive = sum(1 for label_id in test_label_list if label_id == 2)
# Every label that is neither neutral nor positive is counted as negative.
negative = test_size - neutral - positive

In [None]:
# Share of positive examples in the test split, as a percentage.
positive_pct = positive / test_size * 100.0
print('Positive samples: %d of %d (%.2f%%)' % (positive, test_size, positive_pct))

In [None]:
# Inspect the raw logits collected for the first test batch.
first_batch_logits = pred_labels[0]
print('Length of one batch of predictions:' , len(first_batch_logits), '\n' , first_batch_logits)

In [None]:
# Inspect the gold labels collected for the first test batch.
first_batch_labels = true_labels[0]
print('Length of one batch of true labels:' , len(first_batch_labels), '\n' , first_batch_labels)

In [None]:
# For each input batch the predictions are a 3-column ndarray (one column for "0",
# one column for "1", and one column for "2"). Pick the column with the highest
# value in every row and use its index as the predicted label.
predictions = [
    np.argmax(batch_logits, axis=1).flatten()
    for batch_logits in pred_labels
]

In [None]:
from sklearn.metrics import accuracy_score, precision_score, f1_score

In [None]:
# Join the per-batch arrays end to end into one flat array each.
# `predictions` is rebound here from a list of arrays to a single array.
actual = np.concatenate(true_labels)
predictions = np.concatenate(predictions)

In [None]:
# Overall accuracy, plus macro-averaged precision and F1 on the test split.
test_accuracy = accuracy_score(actual, predictions)
print("Accuracy", test_accuracy)

macro_precision = precision_score(actual, predictions, average='macro')
print("Precision", macro_precision)

macro_f1 = f1_score(actual, predictions, average='macro')
print("F1 Score", macro_f1)

# References: 
 - https://huggingface.co/FinanceInc

# Appendix:

In [None]:
# def train(model, optimizer, criterion, train_dataloader, num_epochs, load_pretrained=False):
#     plot_cache = {'train_loss':[], 'train_acc': [], 'val_loss':[], 'val_acc': []}
#     train_losses = []
#     train_accs = []
#     val_losses = []
#     val_accs = []
    
#     for epoch in range(num_epochs):
#         print("Epoch:", epoch)
#         if not load_pretrained:
#           model.train() 
#             counter = 0
#             train_batch_loss = 0
#             train_batch_acc = 0

#             val_batch_loss = 0
#             val_batch_acc = 0 

            
#             for step, batch in enumerate(train_dataloader):
                
#                 b_input_ids = batch[0].to(device)
#                 b_token_type_ids = batch[1].to(device)
#                 b_input_mask = batch[2].to(device)
#                 b_labels = batch[3].to(device)

#                 optimizer.zero_grad()
#                 counter += 1

#                 logits = model(b_input_ids, token_type_ids=b_token_type_ids,attention_mask=b_input_mask)[0]
#                 # print(logits.size())   
#                 # print(b_labels.size())
#                 loss = criterion(logits.view(-1, logits.size()[1]), b_labels.view(-1))
#                 train_batch_loss += loss.item()

#                 #train batch accuracy: 
#                 train_batch_acc += accuracy(logits, b_labels)

#                 loss.backward()
#                 torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
#                 optimizer.step()
            
#             epoch_train_loss = train_batch_loss / counter
#             print(counter)
#             train_losses.append(epoch_train_loss)

#             epoch_train_acc = train_batch_acc / counter
#             # print(counter)
#             train_accs.append(epoch_train_acc)

#             print("")
#             print("  Average training loss: {0:.2f}".format(epoch_train_loss))
#             print("  Average training acc: {0:.2f}".format(epoch_train_acc))

#         plot_cache['train_loss'].append(epoch_train_loss)
#         plot_cache['train_acc'].append(epoch_train_acc)
    
#     return plot_cache, model

In [None]:
# NOTE: archived call that belongs to the commented-out `train` variant above
# (no validation loader, returns `(plot_cache, model)`). It is kept commented
# out like that definition: the active `train` requires `val_dataloader` and
# returns only `plot_cache`, so executing this line raises TypeError and stops
# a Restart & Run All.
# plot_cache, model = train(model.to(device), optimizer, criterion, train_dataloader=data_loaders['train'], num_epochs=5, load_pretrained=False)

In [None]:
# num_epochs = 1

# val_losses = []
# val_accs = []
# plot_cache = {'val_loss':[], 'val_acc': []}


# for epoch in range(num_epochs): 
#   print("Epoch:", epoch)
#   val_batch_loss = 0
#   val_batch_acc = 0 
    
#   counter = 0 
#   model.eval()
  
#   with torch.no_grad():
#     for step, batch in enumerate(data_loaders['valid']):
#       counter += 1
#       # print("Iteration:", step)
#       b_input_ids = batch[0].to(device)
#       b_token_type_ids = batch[1].to(device)
#       b_input_mask = batch[2].to(device)
#       b_labels = batch[3].to(device)

#       logits = model(b_input_ids, token_type_ids=b_token_type_ids,attention_mask=b_input_mask)[0]
            
#       val_loss = criterion(logits.view(-1, logits.size()[1]), b_labels.view(-1))
#       val_batch_loss += val_loss.item()
#       # print(type(val_batch_loss))

#       #batch accuracy 
#       val_batch_acc += accuracy(logits, b_labels)
#       print("Batch acc:", val_batch_acc)
#       print("Batch loss:", val_batch_loss)

#       # print(type(val_batch_acc))

      

#     # epoch_val_loss = val_batch_loss / counter
#     # print(counter)
#     # val_losses.append(epoch_val_loss)

#     epoch_val_acc = val_batch_acc / counter
#     # # print(counter)
#     # val_accs.append(epoch_val_acc)

#     # print("")
#     # print("  Average validation loss: {0:.2f}".format(epoch_val_loss))
#     print("  Average validation accuracy: {0:.2f}".format(epoch_val_acc))

#     # plot_cache['val_loss'].append(epoch_val_loss)
#     # plot_cache['val_acc'].append(epoch_val_acc)