In [2]:
!pip install transformers
import os
from zipfile import ZipFile
from tqdm.notebook import tqdm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 27.4 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 50.2 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 69.8 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.0 tokenizers-0.13.2 transformers-4.24.0


**Load CSV Data**

train --> queries to be trained

testA --> testing data with topics that have already been seen by the model

testB --> data with unseen topics 

In [3]:
import pandas as pd
import os

dataDir = '../data/semeval'
train = pd.read_csv(os.path.join(dataDir, 'train.csv'))

temp = pd.read_csv(os.path.join(dataDir, 'val.csv'))
train = pd.concat([train, temp])


testA = pd.read_csv(os.path.join(dataDir, 'testA.csv'))
testB = pd.read_csv(os.path.join(dataDir, 'testB.csv'))
len(train), len(testA), len(testB)

(2814, 1249, 707)

**Split data into validation and testing**

Validation data is obtained from the testing data

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(testA.index.values, 
                                                  testA.Stance.values, 
                                                  test_size=0.5, 
                                                  random_state=42, 
                                                  stratify=testA.Stance.values)
testA.loc[X_train, 'data_type'] = 'val'
val = testA[testA.data_type == 'val']
testA = testA[testA.data_type == 'testA']
len(val), len(testA), len(train)

(624, 625, 2814)

In [5]:
possible_labels = testA.Stance.unique()
label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index
label_dict

{0: 0, 1: 1, 2: 2}

In [6]:
import torch
from tqdm.notebook import tqdm
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
from transformers import BertForSequenceClassification

**Initiate Bert Tokeizer**

Generate Tensor dataset after tokenizing train, val, testA and testB

In [47]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          do_lower_case=True)
max_length = 320
encoded_data_train = tokenizer.batch_encode_plus(
    train.Query.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    padding='max_length', 
    max_length=max_length, 
    truncation=True,
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    val.Query.values, 
    add_special_tokens=True, 
    return_attention_mask=True,  
    padding='max_length',
    max_length=max_length, 
    truncation=True,
    return_tensors='pt'
)

encoded_data_test_A = tokenizer.batch_encode_plus(
    testA.Query.values, 
    add_special_tokens=True, 
    return_attention_mask=True,  
    padding='max_length',
    max_length=max_length, 
    truncation=True,
    return_tensors='pt'
)

encoded_data_test_B = tokenizer.batch_encode_plus(
    testB.Query.values, 
    add_special_tokens=True, 
    return_attention_mask=True,  
    padding='max_length',
    max_length=max_length, 
    truncation=True,
    return_tensors='pt'
)



input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(train.Stance.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(val.Stance.values)

input_ids_test_A = encoded_data_test_A['input_ids']
attention_masks_test_A = encoded_data_test_A['attention_mask']
labels_test_A = torch.tensor(testA.Stance.values)


input_ids_test_B = encoded_data_test_B['input_ids']
attention_masks_test_B = encoded_data_test_B['attention_mask']
labels_test_B = torch.tensor(testB.Stance.values)


print(input_ids_train.shape, attention_masks_train.shape, labels_train.shape)
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)
dataset_test_A = TensorDataset(input_ids_test_A, attention_masks_test_A, labels_test_A)
dataset_test_B = TensorDataset(input_ids_test_B, attention_masks_test_B, labels_test_B)

torch.Size([2814, 320]) torch.Size([2814, 320]) torch.Size([2814])


**Obtain Device**

In [49]:
import torch
def get_default_device():
    if torch.cuda.is_available():
        return torch.device('cuda')
    else:
        return torch.device('cpu')

device = get_default_device()
device

device(type='cuda')

**Load a pretrained BERT model**

In [9]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=True,
                                                      output_hidden_states=True)

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

**Dataloader**

Create dataloader for batch training, for faster convergence

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 24

dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)

dataloader_test_A = DataLoader(dataset_test_A, 
                                   sampler=SequentialSampler(dataset_test_A), 
                                   batch_size=batch_size)

dataloader_test_B = DataLoader(dataset_test_B, 
                                   sampler=SequentialSampler(dataset_test_B), 
                                   batch_size=batch_size)


**Initialize optimizer and scheduler**

In case you want to use a saved optimizer uncomment the load lines to load the weights, otherwise initialize new optimizer and store it as the new optimizer and scheduler for future re-training.

In [11]:
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(),
                  lr=1e-5,
                  weight_decay = 0.8,
                  eps=1e-8)
                  
epochs = 12

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)
torch.save(optimizer.state_dict(), '../models/BERTStanceLargeOptimizer')
torch.save(scheduler.state_dict(), '../models/BERTStanceLargeScheduler')
# optimizer.load_state_dict(torch.load('/content/drive/MyDrive/SWM/models/BERTStanceOptimizer'))
# scheduler.load_state_dict(torch.load('/content/drive/MyDrive/SWM/models/BERTStanceScheduler'))
optimizer, scheduler.state_dict()



(AdamW (
 Parameter Group 0
     betas: (0.9, 0.999)
     correct_bias: True
     eps: 1e-08
     initial_lr: 1e-05
     lr: 1e-05
     weight_decay: 0.8
 ),
 {'base_lrs': [1e-05],
  'last_epoch': 0,
  '_step_count': 1,
  'verbose': False,
  '_get_lr_called_within_step': False,
  '_last_lr': [1e-05],
  'lr_lambdas': [None]})

**Model Validation**

In [7]:
import random
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import json
from sklearn.metrics import f1_score

def f1_score_macro(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='macro')

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

        
def saveList(l, f):
    with open(f'../weights/BERTStance{f}.json', "w") as fp:
        json.dump(l, fp)
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    
    return f1_score(labels_flat, preds_flat, average='weighted')

def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

**Model Training**

In [None]:
# Train Model
model.to(device)
lrs = []
validationAccs = []
trainingLoss = []
validationLoss = []
trainingF1 = []
validationF1 = []

for epoch in tqdm(range(1, epochs+1)):
    correct = 0
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       
        outputs = model(**inputs)
        # if torch.argmax(outputs[1][0]) == batch[2]:
        #     correct += 1
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        lrs.append(optimizer.param_groups[0]['lr'])
        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)  

    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    pred_vals = [np.argmax(pred) for pred in predictions]
    val_acc = accuracy_score(pred_vals, true_vals)
    
    if not validationAccs or val_acc > validationAccs[-1]:
        model.save_pretrained('../models/BERTStance')
        torch.save(optimizer.state_dict(), '../models/BERTStanceOptimizer')
        torch.save(scheduler.state_dict(), '../models/BERTStanceScheduler')
        saveList(validationAccs, 'validationAccs')
        saveList(validationLoss, 'validationLoss')
        saveList(validationF1, 'validationF1')
        saveList(lrs, 'lrs')
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')
    tqdm.write(f'Validation accuracy: {val_acc}')

    validationAccs.append(val_acc)
    trainingLoss.append(loss_train_avg)
    validationLoss.append(val_loss)
    validationF1.append(val_f1)

**Load our saved model**

Load our trained model from drive

In [None]:
from transformers import BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained('../models/BERTStance')
model.to(device)

**Model's performance on validation and testing data**

In [None]:
val_loss, predictions, true_vals = evaluate(dataloader_validation)
val_f1 = f1_score_func(predictions, true_vals)
pred_vals = [np.argmax(pred) for pred in predictions]

val_acc = accuracy_score(pred_vals, true_vals)
val_f1_macro = f1_score_macro(predictions, true_vals)

print(f'Validation loss : {val_loss}')
print(f'F1 Score (Weighted) : {val_f1}')
print(f'F1 Score (Macro) : {val_f1_macro}')
print(f'Validation accuracy : {val_acc}')
print('\n')


test_loss_A, predictions, true_vals = evaluate(dataloader_test_A)
test_f1_A = f1_score_func(predictions, true_vals)
test_f1_A_macro = f1_score_macro(predictions, true_vals)
pred_vals = [np.argmax(pred) for pred in predictions]
test_acc_A = accuracy_score(pred_vals, true_vals)

print(f'Test loss A: {test_loss_A}')
print(f'F1 Score (Weighted) A: {test_f1_A}')
print(f'F1 Score (Macro) A: {test_f1_A_macro}')
print(f'Test accuracy A: {test_acc_A}')
print('\n')

test_loss_B, predictions, true_vals = evaluate(dataloader_test_B)
test_f1_B = f1_score_func(predictions, true_vals)
test_f1_B_macro = f1_score_macro(predictions, true_vals)
pred_vals = [np.argmax(pred) for pred in predictions]
test_acc_B = accuracy_score(pred_vals, true_vals)

print(f'Test loss B: {test_loss_B}')
print(f'F1 Score (Weighted) B: {test_f1_B}')
print(f'F1 Score (Macro) B: {test_f1_B_macro}')
print(f'Test accuracy B: {test_acc_B}')
print('\n')

**Targe-wise Metrics**

In [83]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
targets = set(testA.Target)
batch_size = 8
print('tempBERTStance\n')
for target in targets:
    tempTest = testA[testA.Target == target]
    print(len(tempTest))
    encoded_data = tokenizer.batch_encode_plus(
    tempTest.Query.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    padding='max_length', 
    max_length=128, 
    truncation=True,
    return_tensors='pt'
    )

    input_ids = encoded_data['input_ids']
    attention_masks = encoded_data['attention_mask']
    labels = torch.tensor(tempTest.Stance.values)
    dataset = TensorDataset(input_ids, attention_masks, labels)
    dataloader = DataLoader(dataset, 
                              sampler=RandomSampler(dataset), 
                              batch_size=batch_size)
    test_loss, predictions, true_vals = evaluate(dataloader)
    test_f1 = f1_score_func(predictions, true_vals)
    pred_vals = [np.argmax(pred) for pred in predictions]
    test_acc = accuracy_score(pred_vals, true_vals)

    print(f'Test loss {target} : {test_loss}')
    print(f'F1 Score (Weighted) {target} : {test_f1}')
    print(f'Test accuracy A {target} : {test_acc}')
    print('\n')

tempBERTStance

86
Test loss climate change is a real concern : 0.5862284776839343
F1 Score (Weighted) climate change is a real concern : 0.7464971833020366
Test accuracy A climate change is a real concern : 0.7906976744186046


146
Test loss legalization of abortion : 2.2293297236687257
F1 Score (Weighted) legalization of abortion : 0.2425079080064744
Test accuracy A legalization of abortion : 0.2602739726027397


102
Test loss atheism : 1.6393575347386873
F1 Score (Weighted) atheism : 0.5106560765008962
Test accuracy A atheism : 0.4803921568627451


164
Test loss hillary clinton : 1.4782671587807792
F1 Score (Weighted) hillary clinton : 0.4553781442607308
Test accuracy A hillary clinton : 0.4268292682926829


127
Test loss feminist movement : 1.9296414852142334
F1 Score (Weighted) feminist movement : 0.34101568777117147
Test accuracy A feminist movement : 0.3700787401574803




**Load data containing untrained targets**

Load data containing untrained targets and split them into train. validation and test data

In [10]:
from sklearn.model_selection import train_test_split
"""
    against - 0
    favor - 1
    neither - 2
"""

label_dict = {
    'Against' : 0,
    'In support of' : 1,
    'Indifferent towards' : 2
}

pcTest = pd.read_csv('../data/Testing/openai_generated_dataset.csv')
pcTest['Query'] = '[CLS] ' + pcTest['Query'] + ' [SEP] ' + pcTest['Target'] + ' [SEP]'
pcTest = pcTest.replace({'Stance': label_dict})
pcTest.dropna(inplace=True)
pcTest.head()
X_train, X_val, y_train, y_val = train_test_split(pcTest.index.values, 
                                                  pcTest.Stance.values, 
                                                  test_size=0.4, 
                                                  random_state=42, 
                                                  stratify=pcTest.Stance.values)
pcTest['data_type'] = 'not set'
pcTest.loc[X_train, 'data_type'] = 'train'
pcTest.loc[X_val, 'data_type'] = 'val'
pc_train = pcTest[pcTest.data_type == 'train']
pc_val = pcTest[pcTest.data_type == 'val']


X_train, X_val, y_train, y_val = train_test_split(pc_val.index.values, 
                                                  pc_val.Stance.values, 
                                                  test_size=0.4, 
                                                  random_state=42, 
                                                  stratify=pc_val.Stance.values)

pc_val.loc[X_train, 'data_type'] = 'test'
pc_test = pc_val[pc_val.data_type == 'test']
pc_val = pc_val[pc_val.data_type == 'val']
len(pc_val), len(pc_test), len(pc_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


(28, 42, 104)

**Bert Tokenizer**
Tokenize data and create tensor dataset from data

In [11]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          do_lower_case=True)
max_length = 320
encoded_data_train = tokenizer.batch_encode_plus(
    pc_train.Query.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    padding='max_length', 
    max_length=max_length, 
    truncation=True,
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    pc_val.Query.values, 
    add_special_tokens=True, 
    return_attention_mask=True,  
    padding='max_length',
    max_length=max_length, 
    truncation=True,
    return_tensors='pt'
)

encoded_data_test = tokenizer.batch_encode_plus(
    pc_test.Query.values, 
    add_special_tokens=True, 
    return_attention_mask=True,  
    padding='max_length',
    max_length=max_length, 
    truncation=True,
    return_tensors='pt'
)



input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(pc_train.Stance.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(pc_val.Stance.values)

input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']
labels_test = torch.tensor(pc_test.Stance.values)



print(input_ids_train.shape, attention_masks_train.shape, labels_train.shape)
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)
dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

torch.Size([104, 320]) torch.Size([104, 320]) torch.Size([104])


**Dataloader**

Create dataloader for batch training, for faster convergence

In [12]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 8

dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)

dataloader_test = DataLoader(dataset_test, 
                                   sampler=SequentialSampler(dataset_test), 
                                   batch_size=batch_size)

**Initialize optimizer and scheduler**

In case you want to use a saved optimizer uncomment the load lines to load the weights, otherwise initialize new optimizer and store it as the new optimizer and scheduler for future re-training.

In [13]:
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(),
                  lr=1e-5,
                  weight_decay = 0.8,
                  eps=1e-8)
                  
epochs = 12

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)
optimizer, scheduler.state_dict()



(AdamW (
 Parameter Group 0
     betas: (0.9, 0.999)
     correct_bias: True
     eps: 1e-08
     initial_lr: 1e-05
     lr: 1e-05
     weight_decay: 0.8
 ),
 {'base_lrs': [1e-05],
  'last_epoch': 0,
  '_step_count': 1,
  'verbose': False,
  '_get_lr_called_within_step': False,
  '_last_lr': [1e-05],
  'lr_lambdas': [None]})

**Model Training**

In [16]:
# Train Model
device = torch.device('cuda')
model.to(device)
lrs = []
validationAccs = []
trainingLoss = []
validationLoss = []
trainingF1 = []
validationF1 = []
epochs = 8
for epoch in tqdm(range(1, epochs+1)):
    correct = 0
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       
        outputs = model(**inputs)
        # if torch.argmax(outputs[1][0]) == batch[2]:
        #     correct += 1
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        lrs.append(optimizer.param_groups[0]['lr'])
        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)  

    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    pred_vals = [np.argmax(pred) for pred in predictions]
    val_acc = accuracy_score(pred_vals, true_vals)
    
    if not validationAccs or val_acc > validationAccs[-1]:
        model.save_pretrained('../models/ImprovedBERTStance')
        torch.save(optimizer.state_dict(), '../models/ImprovedBERTStanceOptimizer')
        torch.save(scheduler.state_dict(), '../models/ImprovedBERTStanceScheduler')
        saveList(validationAccs, 'validationAccs')
        saveList(validationLoss, 'validationLoss')
        saveList(validationF1, 'validationF1')
        saveList(lrs, 'lrs')
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')
    tqdm.write(f'Validation accuracy: {val_acc}')

    validationAccs.append(val_acc)
    trainingLoss.append(loss_train_avg)
    validationLoss.append(val_loss)
    validationF1.append(val_f1)

  0%|          | 0/8 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/13 [00:00<?, ?it/s]


Epoch 1
Training loss: 1.341118262364314
Validation loss: 1.1552885472774506
F1 Score (Weighted): 0.4475085228599448
Validation accuracy: 0.4642857142857143


Epoch 2:   0%|          | 0/13 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.7003648235247686
Validation loss: 0.8800924196839333
F1 Score (Weighted): 0.549719887955182
Validation accuracy: 0.5714285714285714


Epoch 3:   0%|          | 0/13 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.4384649074994601
Validation loss: 0.6489347741007805
F1 Score (Weighted): 0.7473214285714286
Validation accuracy: 0.75


Epoch 4:   0%|          | 0/13 [00:00<?, ?it/s]


Epoch 4
Training loss: 0.28718507289886475
Validation loss: 0.41298554837703705
F1 Score (Weighted): 0.7806122448979592
Validation accuracy: 0.7857142857142857


Epoch 5:   0%|          | 0/13 [00:00<?, ?it/s]


Epoch 5
Training loss: 0.17987639743548173
Validation loss: 0.38575341925024986
F1 Score (Weighted): 0.8115646258503401
Validation accuracy: 0.8214285714285714


Epoch 6:   0%|          | 0/13 [00:00<?, ?it/s]


Epoch 6
Training loss: 0.1277852557026423
Validation loss: 0.2074052095413208
F1 Score (Weighted): 0.9285714285714286
Validation accuracy: 0.9285714285714286


Epoch 7:   0%|          | 0/13 [00:00<?, ?it/s]


Epoch 7
Training loss: 0.07309959284388103
Validation loss: 0.1711860317736864
F1 Score (Weighted): 0.9285714285714286
Validation accuracy: 0.9285714285714286


Epoch 8:   0%|          | 0/13 [00:00<?, ?it/s]


Epoch 8
Training loss: 0.06922469058862099
Validation loss: 0.15505785681307316
F1 Score (Weighted): 0.9641751437417072
Validation accuracy: 0.9642857142857143


**Metrics on new data**

In [17]:
val_loss, predictions, true_vals = evaluate(dataloader_validation)
val_f1 = f1_score_func(predictions, true_vals)
pred_vals = [np.argmax(pred) for pred in predictions]

val_acc = accuracy_score(pred_vals, true_vals)
val_f1_macro = f1_score_macro(predictions, true_vals)

print(f'Validation loss : {val_loss}')
print(f'F1 Score (Weighted) : {val_f1}')
print(f'F1 Score (Macro) : {val_f1_macro}')
print(f'Validation accuracy : {val_acc}')
print('\n')


test_loss_A, predictions, true_vals = evaluate(dataloader_test)
test_f1_A = f1_score_func(predictions, true_vals)
test_f1_A_macro = f1_score_macro(predictions, true_vals)
pred_vals = [np.argmax(pred) for pred in predictions]
test_acc_A = accuracy_score(pred_vals, true_vals)

print(f'Test loss A: {test_loss_A}')
print(f'F1 Score (Weighted) A: {test_f1_A}')
print(f'F1 Score (Macro) A: {test_f1_A_macro}')
print(f'Test accuracy A: {test_acc_A}')
print('\n')


Validation loss : 0.15505785681307316
F1 Score (Weighted) : 0.9641751437417072
F1 Score (Macro) : 0.9628482972136223
Validation accuracy : 0.9642857142857143


Test loss A: 0.2765205775698026
F1 Score (Weighted) A: 0.9027409372236959
F1 Score (Macro) A: 0.902740937223696
Test accuracy A: 0.9047619047619048




**Target-wise metrics**

In [18]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
targets = set(pc_test.Target)
batch_size = 8

for target in targets:
    tempTest = pc_test[pc_test.Target == target]
    encoded_data = tokenizer.batch_encode_plus(
    tempTest.Query.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    padding='max_length', 
    max_length=128, 
    truncation=True,
    return_tensors='pt'
    )

    input_ids = encoded_data['input_ids']
    attention_masks = encoded_data['attention_mask']
    labels = torch.tensor(tempTest.Stance.values)
    dataset = TensorDataset(input_ids, attention_masks, labels)
    dataloader = DataLoader(dataset, 
                              sampler=RandomSampler(dataset), 
                              batch_size=batch_size)
    test_loss, predictions, true_vals = evaluate(dataloader)
    test_f1 = f1_score_func(predictions, true_vals)
    pred_vals = [np.argmax(pred) for pred in predictions]
    test_acc = accuracy_score(pred_vals, true_vals)

    print(f'Test loss {target} : {test_loss}')
    print(f'F1 Score (Weighted) {target} : {test_f1}')
    print(f'Test accuracy {target} : {test_acc}')
    print('\n')

2
Test loss nuclear nonproliferation : 0.3042607605457306
F1 Score (Weighted) nuclear nonproliferation : 1.0
Test accuracy nuclear nonproliferation : 1.0


4
Test loss video gaming is a sport : 0.10542058944702148
F1 Score (Weighted) video gaming is a sport : 1.0
Test accuracy video gaming is a sport : 1.0


5
Test loss underage drinking : 1.0339657068252563
F1 Score (Weighted) underage drinking : 0.4533333333333333
Test accuracy underage drinking : 0.6


1
Test loss electric cars : 0.04050192981958389
F1 Score (Weighted) electric cars : 1.0
Test accuracy electric cars : 1.0


3
Test loss defund the police : 0.061946745961904526
F1 Score (Weighted) defund the police : 1.0
Test accuracy defund the police : 1.0


1
Test loss human cloning : 0.2390802502632141
F1 Score (Weighted) human cloning : 1.0
Test accuracy human cloning : 1.0


4
Test loss mandatory vaccinations : 0.016732564195990562
F1 Score (Weighted) mandatory vaccinations : 1.0
Test accuracy mandatory vaccinations : 1.0


4
Te