In [None]:
# Parameters
# Central configuration for the notebook; later cells read these names as globals.

RUNTIME_TYPE = 'COLAB'  # compared against 'COLAB' to gate the Colab-only setup cells
SEED = 42  # seeds `random`, NumPy and PyTorch
N_EPOCHS = 5  # training epochs per fold
BATCH_SIZE = 16  # batch size of the BucketIterators
ed = 'MPQA2.0_v221219_cleaned'  # dataset name; prefix of the per-fold JSON files and of the saved model files
k_fold = 5  # number of folds; folds are numbered 1..k_fold
transformer_name = 'bert-base-uncased'  # Hugging Face checkpoint used for both tokenizer and encoder
max_input_length = 256  # presumably the intended maximum input length in tokens -- TODO confirm
OUTPUT_DIM = 3 # number of classes
HIDDEN_DIM = 256  # hidden size passed to the Transformer model (GRU and linear head)
N_LAYERS = 4  # number of GRU layers
BIDIRECTIONAL = True  # whether the GRU is bidirectional
DROPOUT = 0  # dropout probability passed to the Transformer model
freeze = False  # NOTE(review): not referenced by any visible cell -- confirm whether encoder freezing was intended
best_valid_acc_s = []  # best validation accuracy of each fold, appended by the K-fold cell
best_valid_loss_s = []  # validation loss recorded alongside each fold's best accuracy
path = '[Replace the save location of the Fortorch JSON files link here.]'  # directory holding the per-fold JSON splits
path_models = '[Replace save location link here (to save models).]'  # location under which model checkpoints are saved

In [None]:
# Libraries Required for Google Colab
# torchtext is pinned to 0.10.0; the imports cell uses the `torchtext.legacy` API.
# NOTE(review): `transformers` is installed without a version pin, so reruns may
# pick up a different release -- consider pinning once a known-good version is confirmed.

if RUNTIME_TYPE == 'COLAB':
    %pip install -U torchtext==0.10.0
    %pip install transformers

In [None]:
!pwd

In [None]:
!nvidia-smi

In [None]:
# On Colab, mount Google Drive at /content/drive (presumably where `path` and
# `path_models` point -- confirm against the configured locations).
if RUNTIME_TYPE == 'COLAB':
    from google.colab import drive
    drive.mount('/content/drive')

In [None]:
import os
import random
import time

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchtext
import transformers
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from torchtext.legacy import data
from torchtext.legacy import datasets
from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:
# Seed every random number generator the notebook relies on so runs are repeatable.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Ask cuDNN to pick deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [None]:
# Support for third-party widgets
# Only applies on Colab; other runtimes skip this cell's body.

if RUNTIME_TYPE == 'COLAB':
    from google.colab import output
    output.enable_custom_widget_manager()

In [None]:
# Load the pretrained tokenizer that matches the `transformer_name` checkpoint.
tokenizer = transformers.AutoTokenizer.from_pretrained(transformer_name)

In [None]:
len(tokenizer.vocab)

In [None]:
# Special tokens as exposed by the tokenizer (CLS / SEP / PAD / UNK).
init_token = tokenizer.cls_token
eos_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token

In [None]:
# Vocabulary ids of the same special tokens; these ids are what the MIX field
# is configured with (init / eos / pad / unk).
init_token_idx = tokenizer.cls_token_id
eos_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id

In [None]:
def tokenizee(sentence):
    """Tokenize ``sentence`` with the notebook-level Hugging Face ``tokenizer``.

    Special tokens are not added here: the ``MIX`` field is configured with
    ``init_token`` / ``eos_token`` ids and adds them itself.

    The output is truncated to ``max_input_length - 2`` tokens so that, once
    the field has added those two ids, the sequence is at most
    ``max_input_length`` long. Previously ``truncation=True`` was used without
    ``max_length``, which truncates at the tokenizer's model maximum and left
    ``max_input_length`` unused; after the two extra ids were added, long
    inputs could exceed what the encoder accepts.

    Args:
        sentence: Raw text to tokenize.

    Returns:
        The tokenizer's encoding object; callers read its ``'input_ids'`` entry.
    """
    tokens = tokenizer(sentence,
                       truncation=True,
                       max_length=max_input_length - 2,  # leave room for init + eos ids
                       add_special_tokens=False)
    return tokens

In [None]:
# Text field: examples are tokenized by `tokenizee`, and no torchtext vocabulary
# is built (use_vocab=False) because the special tokens are supplied directly as
# tokenizer ids. Batches are produced batch-first.
MIX = data.Field(batch_first = True,
                  use_vocab = False,
                  tokenize = tokenizee,
                  init_token = init_token_idx,
                  eos_token = eos_token_idx,
                  pad_token = pad_token_idx,
                  unk_token = unk_token_idx)

# Label field; its vocabulary is built from the training split in the K-fold cell.
SENTIMENT = data.LabelField(dtype = torch.long)

# Raw (unprocessed) field used for the example identifier.
ID = data.RawField()

In [None]:
# Maps each JSON key to (attribute name on the example/batch, field): text -> `m`, label -> `s`, id -> `i`.
fields = {'mixAnnot': ('m', MIX), 'sentiment': ('s', SENTIMENT), 'uniqueID': ('i', ID)}

# ***Functions***





In [None]:
def find(text):
    """Locate the second and the last occurrence of token id 101 in ``text``.

    Returns:
        ``(idh, idt)`` where ``idh`` is the index of the second occurrence of
        101 (0 when there are fewer than two) and ``idt`` is the index of the
        last occurrence, provided there are at least three (0 otherwise).
    """
    # Collect every position holding the marker id, in order of appearance.
    marker_positions = [pos for pos in range(len(text)) if text[pos] == 101]

    idh = marker_positions[1] if len(marker_positions) >= 2 else 0
    idt = marker_positions[-1] if len(marker_positions) >= 3 else 0
    return idh, idt

In [None]:
def loss_fn(preds, targets):
    """Score ``preds`` against ``targets`` with the notebook-level ``criterion``.

    ``criterion`` is not a parameter: it is looked up as a global at call time,
    so it must have been assigned before this function is called.
    """
    loss = criterion(preds, targets)
    return loss

In [None]:
def accuracy_fn(preds, y):
    """
    Fraction of positions where ``preds`` equals ``y`` (e.g. 8 of 10 correct -> 0.8, not 8).

    The result is returned as a tensor, not a Python float.
    """
    # Cast the element-wise match mask to float so the sum can be divided.
    hits = (preds == y).float()
    n_items = len(hits)
    return hits.sum() / n_items

In [None]:
def metrics_fn(y_pred, y_true):
    """Print a per-class classification report and return it as a dict.

    The label ids are passed explicitly (0..2, one per entry of
    ``target_names``). Without ``labels``, ``classification_report`` infers the
    classes from the data and raises when a class is missing from both
    ``y_true`` and ``y_pred``, because the number of inferred classes no
    longer matches ``target_names``.

    Args:
        y_pred: Predicted class ids.
        y_true: Ground-truth class ids.

    Returns:
        The report produced with ``output_dict=True``; it is keyed by the
        names in ``target_names`` plus the aggregate rows.
    """
    # NOTE(review): assumes label id 0/1/2 correspond to negative/positive/neutral;
    # the ids come from SENTIMENT.build_vocab -- TODO confirm against SENTIMENT.vocab.stoi.
    target_names = ['negative', 'positive', 'neutral']
    labels = list(range(len(target_names)))
    print(classification_report(y_true, y_pred, labels=labels, target_names=target_names))
    return classification_report(y_true, y_pred, labels=labels, target_names=target_names, output_dict=True)

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch over ``iterator`` and update ``model`` in place.

    The loss is computed with the ``criterion`` argument. Previously the
    argument was ignored and the loss went through ``loss_fn``, which reads a
    global ``criterion``; the function therefore only worked when a matching
    global happened to exist.

    Args:
        model: Module called on ``batch.m`` to produce class scores.
        iterator: Iterable of batches exposing ``m`` (inputs) and ``s`` (labels).
        optimizer: Optimizer stepped once per batch.
        criterion: Callable ``criterion(predictions, labels)`` returning the loss.

    Returns:
        ``(mean batch loss, mean batch accuracy)``, each averaged over the
        number of batches (not weighted by batch size).
    """
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in iterator:

        optimizer.zero_grad()

        predictions = model(batch.m).squeeze(1)

        # Use the criterion that was passed in rather than a global one.
        loss = criterion(predictions, batch.s)

        loss.backward()

        optimizer.step()

        # Accuracy is derived from the scores computed before the update.
        preds = torch.argmax(predictions, dim = 1)
        acc = accuracy_fn(preds, batch.s)

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
def evaluate(model, iterator, criterion):
    """Evaluate ``model`` on ``iterator`` without gradient tracking.

    The loss is computed with the ``criterion`` argument (previously it was
    ignored in favour of a global read through ``loss_fn``). Labels and
    predictions are collected with ``list.extend`` instead of rebuilding the
    lists on every batch.

    Args:
        model: Module called on ``batch.m`` to produce class scores.
        iterator: Iterable of batches exposing ``m`` (inputs) and ``s`` (labels).
        criterion: Callable ``criterion(predictions, labels)`` returning the loss.

    Returns:
        ``(mean batch loss, mean batch accuracy, report)`` where the means are
        taken over the number of batches and ``report`` is the dict returned
        by ``metrics_fn`` for all collected predictions. ``metrics_fn`` also
        prints the report.
    """
    epoch_loss = 0
    epoch_acc = 0
    ys_for_metrics = []
    preds_for_metric = []

    model.eval()

    with torch.no_grad():

        for batch in iterator:

            predictions = model(batch.m).squeeze(1)

            preds = torch.argmax(predictions, dim = 1)

            loss = criterion(predictions, batch.s)

            acc = accuracy_fn(preds, batch.s)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
            ys_for_metrics.extend(batch.s.tolist())
            preds_for_metric.extend(preds.tolist())

    # One report over the whole split, not per batch.
    res = metrics_fn(preds_for_metric, ys_for_metrics)

    return epoch_loss / len(iterator), epoch_acc / len(iterator), res

In [None]:
def test(model, iterator, criterion):
    """Evaluate ``model`` on the test ``iterator``.

    The body used to be a line-for-line copy of ``evaluate``; it now delegates
    to ``evaluate`` so the two cannot drift apart.

    Args:
        model: Module called on ``batch.m`` to produce class scores.
        iterator: Iterable of batches exposing ``m`` (inputs) and ``s`` (labels).
        criterion: Loss callable, forwarded to ``evaluate``.

    Returns:
        Whatever ``evaluate`` returns: ``(mean batch loss, mean batch accuracy,
        classification report dict)``.
    """
    return evaluate(model, iterator, criterion)

In [None]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        ``(minutes, seconds)`` as integers, both truncated toward zero.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    # Whatever is left after removing the whole minutes, truncated to an int.
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [None]:
class Transformer(nn.Module):
    """Sentence classifier: pretrained encoder -> GRU -> stack of linear layers.

    The encoder's first output is fed to a (optionally bidirectional,
    multi-layer) GRU; the final hidden state(s) of the last GRU layer pass
    through dropout and the ``out1`` linear stack to produce class scores.

    Args:
        transformer: Encoder module; must expose ``config.hidden_size`` and
            return its token-level output as element 0 when called.
        hidden_dim: GRU hidden size.
        output_dim: Number of output classes.
        n_layers: Number of stacked GRU layers.
        bidirectional: Whether the GRU is bidirectional.
        dropout: Dropout probability (inside the GRU only when ``n_layers >= 2``,
            and on the final hidden state).
    """

    def __init__(self, transformer,hidden_dim,output_dim,n_layers,bidirectional,dropout):

        super().__init__()

        self.transformer = transformer

        embedding_dim = transformer.config.hidden_size

        self.rnn = nn.GRU(embedding_dim,
                        hidden_dim,
                        num_layers = n_layers,
                        bidirectional = bidirectional,
                        batch_first = True,
                        dropout = 0 if n_layers < 2 else dropout)

        # Width of the vector built in forward(): both directions are
        # concatenated when bidirectional, otherwise just one hidden state.
        rnn_out_dim = hidden_dim * 2 if bidirectional else hidden_dim

        # Not used by forward(); kept so the module's parameters and
        # state_dict keys stay the same as before.
        self.out = nn.Linear(rnn_out_dim, output_dim)  # Linear(in_features, out_features, bias=True)

        # The first layer previously hard-coded hidden_dim * 2, which made
        # forward() fail with a shape mismatch whenever bidirectional=False.
        # NOTE(review): the stack contains only Linear layers (no activation
        # in between) -- confirm this is intended.
        self.out1 = nn.Sequential(
            nn.Linear(rnn_out_dim, hidden_dim),
            nn.Linear(hidden_dim, int(hidden_dim / 2)),
            nn.Linear(int(hidden_dim / 2), int(hidden_dim / 4)),
            nn.Linear(int(hidden_dim / 4), output_dim)
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return class scores of shape [batch size, out dim] for ``text``."""

        #text = [batch size, sent len]

        # NOTE(review): no attention mask is passed to the encoder, so padded
        # positions are presumably treated like real tokens -- confirm.
        embedded = self.transformer(text)[0]
        #embedded = [batch size, sent len, emb dim]

        _, hidden = self.rnn(embedded)
        #hidden = [n layers * n directions, batch size, hid dim]

        if self.rnn.bidirectional:
            # Last layer's forward and backward final states, side by side.
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        else:
            hidden = self.dropout(hidden[-1,:,:])

        #hidden = [batch size, hid dim * n directions]

        output = self.out1(hidden)
        #output = [batch size, out dim]

        return output

# ***K-Fold***


In [None]:
# Per-fold results, keyed by the fold number as a string ('1'..str(k_fold)).
test_loss, test_acc, res, res_val = {}, {}, {}, {}

# Start from empty per-fold histories so that re-running this cell does not
# append a second set of entries to lists created in the parameters cell.
best_valid_acc_s, best_valid_loss_s = [], []

# Setup device (does not depend on the fold, so it is done once)
device_string = 'cuda' if torch.cuda.is_available() else 'cpu'
device_hf = 0 if torch.cuda.is_available() else -1
device = torch.device(device_string)
NUM_WORKERS = 0

for k in range(1, k_fold+1):
    print('-'*100)
    print('k is: ', k)

    # Preparing Data
    train_data, valid_data, test_data = data.TabularDataset.splits(
                                path = path,
                                train = ed + '_fortorch_trainset_fold_{}.json'.format(k),
                                validation = ed + '_fortorch_validationset_fold_{}.json'.format(k),
                                test = ed + '_fortorch_testset_fold_{}.json'.format(k),
                                format = 'json',
                                fields = fields
    )

    print(f"Number of training examples: {len(train_data)}")
    print(f"Number of validation examples: {len(valid_data)}")
    print(f"Number of test examples: {len(test_data)}")

    # The tokenize function returns the tokenizer's full encoding; keep only
    # the token ids on each example.
    for dataset in (train_data, valid_data, test_data):
        for example in dataset.examples:
            example.m = example.m['input_ids']

    # Label vocabulary is rebuilt from this fold's training split.
    SENTIMENT.build_vocab(train_data)

    train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size = BATCH_SIZE,
        device = device,
        sort_key = lambda x: len(x.m),
        sort_within_batch=True)

    # Build the Model (a fresh pretrained encoder for every fold)
    transformer = transformers.AutoModel.from_pretrained(transformer_name)
    model = Transformer(transformer, HIDDEN_DIM,OUTPUT_DIM,N_LAYERS,BIDIRECTIONAL,DROPOUT)

    # Train the Model
    optimizer = optim.Adam(model.parameters(), lr=7e-6, weight_decay=6e-6)
    criterion = nn.CrossEntropyLoss()

    model = model.to(device)
    criterion = criterion.to(device)

    best_valid_loss = float('inf')
    best_valid_acc = float('-inf')

    model_save_name = ed + 'k{}model.pt'.format(k)
    # os.path.join gives the same path as plain concatenation when path_models
    # ends with a separator, and a correct one when it does not.
    path_model_save = os.path.join(path_models, model_save_name)

    for epoch in range(N_EPOCHS):

        print(f'Epoch: {epoch+1:02}')

        start_time = time.time()

        train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
        valid_loss, valid_acc, valid_report = evaluate(model, valid_iterator, criterion)

        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        if valid_acc > best_valid_acc:
            best_valid_acc = valid_acc
            best_valid_loss = valid_loss
            # Keep the report of the epoch whose weights are checkpointed.
            # Storing the report on every epoch left the last epoch's metrics
            # next to the best epoch's accuracy and loss.
            res_val[str(k)] = valid_report
            torch.save(model.state_dict(), path_model_save)

        print(f'Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
        print('-'*50)

    print(f'In {N_EPOCHS} epochs, best accuracy for validation set is {best_valid_acc*100:.2f}% and its loss (not best) is {best_valid_loss:.3f}\n\n')

    best_valid_acc_s.append(best_valid_acc)
    best_valid_loss_s.append(best_valid_loss)

    # Test the Model (using the checkpoint with the best validation accuracy)

    path_model_load = path_model_save
    model.load_state_dict(torch.load(path_model_load))

    test_loss[str(k)], test_acc[str(k)], res[str(k)] = test(model, test_iterator, criterion)
    print(f'\t Test. Loss: {test_loss[str(k)]:.3f} |  Test. Acc: {test_acc[str(k)]*100:.2f}%')

In [None]:
from statistics import mean

# Recap of each fold's best validation accuracy and the loss recorded with it.
for k in range(1, k_fold+1):
    fold_acc = best_valid_acc_s[k-1]
    fold_loss = best_valid_loss_s[k-1]
    print(f'For k = {k}, best accuracy for validation set is {fold_acc*100:.2f}% and its loss (not best) is {fold_loss:.3f}')

# Averages across all recorded folds.
mean_valid_acc = mean(best_valid_acc_s)
mean_valid_loss = mean(best_valid_loss_s)
print(f'\nmean of accuracy is {mean_valid_acc*100:.2f}% and mean of loss is {mean_valid_loss:.3f}')

In [None]:
# Average the per-fold validation reports (precision / recall / F1 per class,
# plus the weighted-average F1) over the k folds.
class_order = ('negative', 'positive', 'neutral')
metric_order = ('precision', 'recall', 'f1-score')

# Running totals start from fold 1; folds 2..k_fold are added in order.
temp = res_val['1']
metric_totals = {metric: [temp[label][metric] for label in class_order] for metric in metric_order}
f1_weighted = temp['weighted avg']['f1-score']

for k in range(2, k_fold+1):
    temp = res_val[str(k)]
    for metric in metric_order:
        for slot, label in enumerate(class_order):
            metric_totals[metric][slot] += temp[label][metric]

    f1_weighted += temp['weighted avg']['f1-score']

precision = [total / k_fold for total in metric_totals['precision']]
recall = [total / k_fold for total in metric_totals['recall']]
# Note: this rebinds the name `f1_score` (also imported from sklearn.metrics) to a list.
f1_score = [total / k_fold for total in metric_totals['f1-score']]
loss = mean(best_valid_loss_s)
acc = mean(best_valid_acc_s)
f1_weighted /= k_fold

print(f'>> Validation Set\nAvg. Accuracy = {acc}\nAvg. Loss = {loss}\nAvg. F1-score(weighted avg): {f1_weighted}\nOrder: [negative, positive, neutral] \nAvg. Precision = {precision}\nAvg. Recall = {recall}\nAvg. F1-score = {f1_score}')

In [None]:
# Average the per-fold test results (loss, accuracy, per-class precision /
# recall / F1 and the weighted-average F1) over the k folds.
class_order = ('negative', 'positive', 'neutral')
metric_order = ('precision', 'recall', 'f1-score')

# Running totals start from fold 1; folds 2..k_fold are added in order.
temp = res['1']
loss, acc = test_loss['1'], test_acc['1']
metric_totals = {metric: [temp[label][metric] for label in class_order] for metric in metric_order}
f1_weighted = temp['weighted avg']['f1-score']

for k in range(2, k_fold+1):
    temp = res[str(k)]
    for metric in metric_order:
        for slot, label in enumerate(class_order):
            metric_totals[metric][slot] += temp[label][metric]

    f1_weighted += temp['weighted avg']['f1-score']

    loss += test_loss[str(k)]
    acc += test_acc[str(k)]

precision = [total / k_fold for total in metric_totals['precision']]
recall = [total / k_fold for total in metric_totals['recall']]
# Note: this rebinds the name `f1_score` (also imported from sklearn.metrics) to a list.
f1_score = [total / k_fold for total in metric_totals['f1-score']]
loss /= k_fold
acc /= k_fold
f1_weighted /= k_fold

print(f'>> Test Set\nAvg. Accuracy = {acc}\nAvg. Loss = {loss}\nAvg. F1-score(weighted avg): {f1_weighted}\n Order: [negative, positive, neutral] \nAvg. Precision = {precision}\nAvg. Recall = {recall}\nAvg. F1-score = {f1_score}')