# Data

In [440]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import json

from transformers import BertTokenizer
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler,random_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer

import transformers
from transformers import BertForSequenceClassification, AdamW, BertConfig,BertTokenizer,get_linear_schedule_with_warmup
import torch.nn.functional as F
from torchmetrics.classification import BinaryAccuracy, BinaryF1Score, MulticlassF1Score

import time
import datetime
import gc
import random
from nltk.corpus import stopwords
import re

In [5]:
anno_subtask2a_train = pd.read_json('X:\\PhD\\SemEval Task4\\Data\\annotations\\data\\subtask2a\\train.json')
anno_subtask2a_train['subset'] = 'train'
anno_subtask2a_val = pd.read_json('X:\\PhD\\SemEval Task4\\Data\\annotations\\data\\subtask2a\\validation.json')
anno_subtask2a_val['subset'] = 'val'
anno_subtask2a_dev = pd.read_json('X:\\PhD\\SemEval Task4\\Data\\annotations\\data\\subtask2a\\dev_unlabeled.json')

anno_subtask2a_combined = pd.concat([anno_subtask2a_train, anno_subtask2a_val])

anno_subtask2b_train = pd.read_json('X:\\PhD\\SemEval Task4\\Data\\annotations\\data\\subtask2b\\train.json')
anno_subtask2b_train['subset'] = 'train'
anno_subtask2b_val = pd.read_json('X:\\PhD\\SemEval Task4\\Data\\annotations\\data\\subtask2b\\val.json')
anno_subtask2b_val['subset'] = 'val'
anno_subtask2b_dev = pd.read_json('X:\\PhD\\SemEval Task4\\Data\\annotations\\data\\subtask2b\\dev_unlabeled.json')

anno_subtask2b_combined = pd.concat([anno_subtask2b_train, anno_subtask2b_val])

In [13]:
path = r'X:\PhD\SemEval Task4\Data\subtask2a_images'
images = [os.path.join(dirpath,f) for (dirpath, dirnames, filenames) in os.walk(path) for f in filenames]
images_df = pd.DataFrame(images, columns=['filepath'])
images_df['image'] = images_df['filepath'].str.split('\\').str[-1]

subtask2a = pd.merge(anno_subtask2a_combined, images_df, on='image')

path = r'X:\PhD\SemEval Task4\Data\subtask2b_images'
images = [os.path.join(dirpath,f) for (dirpath, dirnames, filenames) in os.walk(path) for f in filenames]
images_df = pd.DataFrame(images, columns=['filepath'])
images_df['image'] = images_df['filepath'].str.split('\\').str[-1]

subtask2b = pd.merge(anno_subtask2b_combined, images_df, on='image')

# has a nan text field so replace it with an empty string
subtask2b['text'] = subtask2b['text'].fillna(' ')
subtask2b

Unnamed: 0,id,text,image,label,subset,filepath
0,35807,DONALD TRUMP: BARACK\nOBAMA AND JOE BIDEN\nWIL...,prop_meme_6570.png,propagandistic,train,X:\PhD\SemEval Task4\Data\subtask2b_images\tra...
1,30562,00\n10% FOR\nTHE BIG GUY\nNANCY'S\nCUT\n@ImMem...,prop_meme_8346.png,propagandistic,train,X:\PhD\SemEval Task4\Data\subtask2b_images\tra...
2,44163,"To much political posting online\nthese days, ...",prop_meme_24378.png,non_propagandistic,train,X:\PhD\SemEval Task4\Data\subtask2b_images\tra...
3,24224,I DON'T THINK\nYOU UNDERSTOOD\nWHAT I SAID.\nY...,prop_meme_2594.png,propagandistic,train,X:\PhD\SemEval Task4\Data\subtask2b_images\tra...
4,31611,ⒸSergey Mihailicenko/Anadolu Agency via Getty ...,prop_meme_7654.png,propagandistic,train,X:\PhD\SemEval Task4\Data\subtask2b_images\tra...
...,...,...,...,...,...,...
1345,44900,197\nNazi ain't got no humanity\nThey're the f...,prop_meme_19869.png,propagandistic,val,X:\PhD\SemEval Task4\Data\subtask2b_images\val...
1346,12635,HANG ONTHAVE\nA MEME\nFOR THIS\n,prop_meme_641.png,non_propagandistic,val,X:\PhD\SemEval Task4\Data\subtask2b_images\val...
1347,12740,"HE GAVE HIS BLOOD, SWEAT AND TEARS\nFOR THE AM...",prop_meme_746.png,propagandistic,val,X:\PhD\SemEval Task4\Data\subtask2b_images\val...
1348,46086,BUT I WANTED NORTH\nKOREA TO NUKE US\nTO MAKE ...,prop_meme_18775.png,propagandistic,val,X:\PhD\SemEval Task4\Data\subtask2b_images\val...


## Merge the Entities

In [14]:
path_f = r'X:\PhD\SemEval Task4\Code\GoogleVision\vision_face_detect.json'
path_w = r'X:\PhD\SemEval Task4\Code\GoogleVision\web_entities.json'

with open(path_f) as f:
    faces = json.load(f)

with open(path_w) as f:
    web_ents = json.load(f)

# function for sorting the JSON file a bit more sensibly to work with
# col is in the Response: e.g., webEntities, fullMatchingImages .etc, check the json

def explode_frame(json_file, col):
    df = pd.json_normalize(json_file)
    df.set_index('Image ID', inplace=True)
    return df['Response.' + col].explode().pipe(lambda x: pd.json_normalize(x).set_index(x.index))

In [15]:
web_ents = explode_frame(web_ents, 'webEntities')

In [16]:
# grab the labels, merge the frames
labels = pd.concat([subtask2a[['id', 'image', 'labels']], subtask2b[['id', 'image', 'label']]])
labels.rename(columns={'labels': '2a_label', 'label': '2b_label'}, inplace=True)
web_ents = web_ents.merge(labels, left_on=web_ents.index, right_on='image')

# Subtask2b

In [113]:
df = web_ents.dropna(subset='2b_label') # just changing it here 
df = df.drop(columns='2a_label')
df = df.groupby('id').agg(list).reset_index()
df['description'] = df['description'].apply(lambda x: list(set(x)))
df['2b_label'] = [x for x in df['2b_label'].apply(lambda x: list(set(x)))]
df['image'] = df['image'].apply(lambda x: list(set(x)))
df = df.astype(str)

In [114]:
df['labels'] = df['2b_label'].replace({"['non_propagandistic']": 0, "['propagandistic']": 1})
df

Unnamed: 0,id,entityId,score,description,image,2b_label,labels
0,10015,"['/m/06mg_j', '/m/01jddz', '/m/02jjt', '/m/052...","[0.65761805, 0.5491231, 0.5085392, 0.4730451, ...","['Convention', 'Musical ensemble', 'Concert', ...",['prop_meme_3.png'],['non_propagandistic'],0
1,10018,"['/m/013s93', '/m/01zlzf', '/m/01n4qj', '/m/01...","[0.63, 0.43245, 0.416, 0.37267068, 0.3432773, ...","['Chicha', 'Journalist', 'Product', 'T-shirt',...",['prop_meme_6.png'],['propagandistic'],1
2,10257,"['/m/07crc', '/m/051zk', '/t/280hgdwwmpmhj', '...","[1.08525, 0.97965, 0.738, 0.738, 0.7153, 0.704...","[nan, 'Choco Taco', 'Dessert tacos', 'Value me...",['prop_meme_245.png'],['propagandistic'],1
3,11736,"['/g/11fljpmxv', '/g/120xp5nm', '/g/11q4gdbgt3...","[0.4142, 0.3957, 0.3118, 0.2946, 0.2853, 0.279...","['Ovulation Test', 'personal', 'important', 'O...",['prop_meme_269.png'],['propagandistic'],1
4,12345,"['/g/11kkx451vq', '/g/11hgzk9flm', '/m/060d2',...","[0.71295, 0.597, 0.5965, 0.5184, 0.3457, 0.074...","[nan, 'The First Presidential Debate', 'First ...",['prop_meme_351.png'],['propagandistic'],1
...,...,...,...,...,...,...,...
1345,56209,"['/g/11q3tk_3xv', '/m/04wpw', '/m/0bf3_n', '/g...","[0.7059, 0.4874, 0.44, 0.4048, 0.3842, 0.3197,...","['funny', 'CHWV-FM', 'Imgur', 'Culture', 'Funn...",['prop_meme_25022.png'],['non_propagandistic'],0
1346,56212,"['/m/01b9xk', '/m/0bt9lr', '/m/01z5f', '/m/0bb...","[1.49805, 1.0897499, 0.7246549, 0.6803, 0.6009...","['Dog', 'Hot Dog', 'Canidae', 'Companion dog',...",['prop_meme_25025.png'],['non_propagandistic'],0
1347,56214,"['/g/11h88gys_y', '/m/06nh1', '/m/01yrx', '/g/...","[1.62492, 0.75485855, 0.62432873, 0.5757, 0.49...","[nan, 'Cat', 'PCGamesN', 'Mammal', 'Blizzard E...",['prop_meme_25027.png'],['non_propagandistic'],0
1348,56216,"['/m/02jz0l', '/m/01j2bj', '/m/0130jx', '/m/0d...","[0.7199, 0.5394, 0.5137, 0.4367, 0.4196, 0.407...","['Kitchen Tap', 'Shower', 'Funny meme', 'Bathr...",['prop_meme_25029.png'],['non_propagandistic'],0


In [84]:
descriptions = df['description'].tolist()

## DataLoader

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [119]:
train, test = train_test_split(df, test_size=0.2)

In [120]:
descriptions = train['description'].tolist()
descriptions_val = test['description'].tolist()

In [115]:
max_len = 0

for sent in descriptions:

    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)

    # Update the maximum sentence length.
    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)

Max sentence length:  128


In [123]:
input_ids = []
attention_masks = []

for text in descriptions:
    encoded_dict = tokenizer.encode_plus(
                        text,                      
                        add_special_tokens = True, 
                        max_length = max_len,          # default to 512
                        pad_to_max_length = True,
                        truncation = True,
                        return_attention_mask = True,   
                        return_tensors = 'pt',     
                   )
    

    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

val_input_ids = []
val_attention_masks = []

for text in descriptions_val:
    encoded_dict = tokenizer.encode_plus(
                        text,                      
                        add_special_tokens = True, 
                        max_length = max_len,          # default to 512
                        pad_to_max_length = True,
                        truncation = True,
                        return_attention_mask = True,   
                        return_tensors = 'pt',     
                   )
    

    val_input_ids.append(encoded_dict['input_ids'])
    val_attention_masks.append(encoded_dict['attention_mask'])

In [124]:
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(train['labels'].tolist())

val_input_ids = torch.cat(val_input_ids, dim=0)
val_attention_masks = torch.cat(val_attention_masks, dim=0)
val_labels = torch.tensor(test['labels'].values.tolist())

In [125]:
train_dataset = TensorDataset(input_ids, attention_masks, labels)
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)

## Finetune

In [129]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # The number of output labels--2 for binary classification.
                     
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

batch_size = 8

train_dataloader = DataLoader(
            train_dataset,  
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size)

validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size)

optimizer = AdamW(model.parameters(), lr = 2e-5, eps = 1e-8)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [174]:
epochs = 5
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def f1_scorer(preds, labels):
    f1 = BinaryF1Score()
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1(torch.tensor(pred_flat),  torch.tensor(labels_flat))

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

### Training Script

In [134]:
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    # Measure how long the training epoch takes.
    t0 = time.time()
    total_train_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader):
        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the device using the 
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        optimizer.zero_grad()
        output = model(b_input_ids, 
                             token_type_ids=None, 
                             attention_mask=b_input_mask, 
                             labels=b_labels)        
        loss = output.loss
        total_train_loss += loss.item()
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()
        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)            
    
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.
    print("")
    print("Running Validation...")
    t0 = time.time()
    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()
    # Tracking variables 
    total_eval_accuracy = 0
    best_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0
    # Evaluate data for one epoch
    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():        
            output= model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
        loss = output.loss
        total_eval_loss += loss.item()
        # Move logits and labels to CPU if we are using GPU
        logits = output.logits
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate the accuracy for this batch of test sentences, and
        # accumulate it over all batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)
    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)
    if avg_val_accuracy > best_eval_accuracy:
        torch.save(model, 'bert_model')
        best_eval_accuracy = avg_val_accuracy
    #print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    #print("  Validation took: {:}".format(validation_time))
    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )
print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...

  Average training loss: 0.61
  Training epcoh took: 0:00:45

Running Validation...
  Accuracy: 0.78

Training...

  Average training loss: 0.52
  Training epcoh took: 0:00:34

Running Validation...
  Accuracy: 0.78

Training...

  Average training loss: 0.47
  Training epcoh took: 0:00:34

Running Validation...
  Accuracy: 0.79

Training...

  Average training loss: 0.38
  Training epcoh took: 0:00:34

Running Validation...
  Accuracy: 0.76

Training...

  Average training loss: 0.31
  Training epcoh took: 0:00:34

Running Validation...
  Accuracy: 0.79

Training complete!
Total training took 0:03:54 (h:mm:ss)


In [135]:
model_params = f'Bert_entity_FT'
save_path = 'X:\\PhD\\SemEval Task4\\Code\\1. Final Code\\NLP Finetuning\\models'

torch.save(model, os.path.join(save_path, model_params+'.pth'))
torch.save(model.state_dict(), os.path.join(save_path, model_params+'_weights.pth'))

## Evaluation

In [176]:
# on the same validation set

model.eval()
total_eval_accuracy = 0
best_eval_accuracy = 0
total_eval_loss = 0
nb_eval_steps = 0
f1_score = 0
   
for batch in validation_dataloader:
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)
       
    with torch.no_grad():        
        output= model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
    loss = output.loss
    total_eval_loss += loss.item()
        
    logits = output.logits
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
        
    total_eval_accuracy += flat_accuracy(logits, label_ids)
    f1_score += f1_scorer(logits, label_ids)
   
avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
avg_f1 = f1_score / len(validation_dataloader)
print("Accuracy: {0:.2f}".format(avg_val_accuracy))
print("F1: {0:.2f}".format(avg_f1))

Accuracy: 0.79
F1: 0.83


In [177]:
# pending dev set - need to extract labels again!

# Subtask2a

In [None]:
df = df.explode('labels')
df['labels'].apply(lambda x: x if x else 'None')
df.replace(lvl1_dict, inplace=True, regex=True)
df = df.explode('labels')
df.fillna('None', inplace=True)
df = df.groupby('id').agg(list).reset_index()
df['labels'] = df['labels'].apply(lambda x: list(set(x)))
df['text'] = df['text'].str[0]
df.astype(str)

In [231]:
labels_df = anno_subtask2a_combined[['id', 'labels', 'image']]
labels_df = labels_df.explode('labels')
labels_df['labels'].apply(lambda x: x if x else 'None')
labels_df.fillna('None', inplace=True)
labels_df = labels_df.groupby('id').agg(list).reset_index()
labels_df['labels'] = labels_df['labels'].apply(lambda x: list(set(x)))
labels_df['image'] = labels_df['image'].str[0]
labels_df['id'] = labels_df['id'].astype('int64')

In [242]:
df = web_ents.dropna(subset='2a_label') # just changing it here 
df = df.groupby('id').agg(list).reset_index()
df['description'] = df['description'].apply(lambda x: list(set(x))).astype(str)
df = df[['id', 'entityId', 'score', 'description']]
df = pd.merge(df, labels_df, on='id')

In [348]:
lab = df['labels'].values
unique_labels = []

for i in lab:
    for x in i:
        unique_labels.append(x)
unique_labels = list(set(unique_labels))

num_classes = len(unique_labels)

ml = MultiLabelBinarizer()
df['encoded_labels'] = ml.fit_transform(df['labels'])[:,1:].tolist()

In [349]:
train, test = train_test_split(df, test_size=0.2)
descriptions = train['description'].tolist()
descriptions_val = test['description'].tolist()

In [365]:
input_ids = []
attention_masks = []

for text in descriptions:
    encoded_dict = tokenizer.encode_plus(
                        text,                      
                        add_special_tokens = True, 
                        max_length = max_len,          # default to 512
                        pad_to_max_length = True,
                        truncation = True,
                        return_attention_mask = True,   
                        return_tensors = 'pt',     
                   )
    

    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

val_input_ids = []
val_attention_masks = []

for text in descriptions_val:
    encoded_dict = tokenizer.encode_plus(
                        text,                      
                        add_special_tokens = True, 
                        max_length = max_len,          # default to 512
                        pad_to_max_length = True,
                        truncation = True,
                        return_attention_mask = True,   
                        return_tensors = 'pt',     
                   )
    

    val_input_ids.append(encoded_dict['input_ids'])
    val_attention_masks.append(encoded_dict['attention_mask'])

In [366]:
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(train['encoded_labels'].values.tolist())

val_input_ids = torch.cat(val_input_ids, dim=0)
val_attention_masks = torch.cat(val_attention_masks, dim=0)
val_labels = torch.tensor(test['encoded_labels'].values.tolist())

In [367]:
train_dataset = TensorDataset(input_ids, attention_masks, labels)
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)

In [368]:
# new bert model, since we have more classes

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 22, # The number of output labels--2 for binary classification.                     
    output_attentions = False, 
    output_hidden_states = False,
    problem_type = "multi_label_classification" 
)

batch_size = 8

train_dataloader = DataLoader(
            train_dataset,  
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size)

validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size)

optimizer = AdamW(model.parameters(), lr = 2e-5, eps = 1e-8)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [389]:
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    # Measure how long the training epoch takes.
    t0 = time.time()
    total_train_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader):
        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the device using the 
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].float().to(device)
        optimizer.zero_grad()
        output = model(b_input_ids, 
                             token_type_ids=None, 
                             attention_mask=b_input_mask, 
                             labels=b_labels) 
        loss = torch.nn.BCEWithLogitsLoss()(output.logits, b_labels)
        total_train_loss += loss.item()
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()
        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)            
    
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.
    print("")
    print("Running Validation...")
    t0 = time.time()
    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()
    # Tracking variables 
    total_eval_accuracy = 0
    best_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0
    # Evaluate data for one epoch
    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].float().to(device)
        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():        
            output= model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
        loss = torch.nn.BCEWithLogitsLoss()(output.logits, b_labels)
        total_eval_loss += loss.item()
        # Move logits and labels to CPU if we are using GPU
        logits = output.logits
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate the accuracy for this batch of test sentences, and
        # accumulate it over all batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)
    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)
    if avg_val_accuracy > best_eval_accuracy:
        torch.save(model, 'bert_model')
        best_eval_accuracy = avg_val_accuracy
    #print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    #print("  Validation took: {:}".format(validation_time))
    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )
print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...

  Average training loss: 0.21
  Training epcoh took: 0:38:18

Running Validation...


  return np.sum(pred_flat == labels_flat) / len(labels_flat)


  Accuracy: 0.00

Training...

  Average training loss: 0.19
  Training epcoh took: 0:37:39

Running Validation...
  Accuracy: 0.00

Training...

  Average training loss: 0.17
  Training epcoh took: 0:32:35

Running Validation...
  Accuracy: 0.00

Training...

  Average training loss: 0.15
  Training epcoh took: 0:24:55

Running Validation...
  Accuracy: 0.00

Training...

  Average training loss: 0.14
  Training epcoh took: 0:24:51

Running Validation...
  Accuracy: 0.00

Training complete!
Total training took 2:53:48 (h:mm:ss)


In [371]:
model_params = f'Bert_entity_FT_subtask2a'
save_path = 'X:\\PhD\\SemEval Task4\\Code\\1. Final Code\\NLP Finetuning\\models'

torch.save(model, os.path.join(save_path, model_params+'.pth'))
torch.save(model.state_dict(), os.path.join(save_path, model_params+'_weights.pth'))

In [None]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def binary_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def f1_scorer(preds, labels):
    f1 = BinaryF1Score()
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1(torch.tensor(pred_flat),  torch.tensor(labels_flat))

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [456]:
# on the same validation set

model.eval()
total_eval_accuracy = 0
best_eval_accuracy = 0
total_eval_loss = 0
nb_eval_steps = 0
f1_score = 0

fin_targets = []
fin_outputs = []
overall_acc = 0

binaryacc = BinaryAccuracy().to(device)
binaryf1 = BinaryF1Score().to(device)
   
for batch in validation_dataloader:
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device).float()    
       
    with torch.no_grad():        
        output = model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)

        loss = torch.nn.BCEWithLogitsLoss()(output.logits, b_labels)
        logits = output.logits
        label_ids = b_labels.to('cpu').numpy()

        targets = b_labels.to(device, dtype=torch.float)
        logits = torch.sigmoid(logits)
        overall_acc += binaryacc(logits, targets)
        f1_score += binaryf1(logits, targets)
        
        fin_targets.extend(targets)
        fin_outputs.extend(logits)


print(f"Accuracy Score = {overall_acc/len(validation_dataloader)}")
print(f"F1 Score = {f1_score/len(validation_dataloader)}")

probs = [t.to('cpu').numpy() for t in fin_outputs]
act = [t.to('cpu').numpy() for t in fin_targets]

macrof1 = MulticlassF1Score(multidim_average='global', num_classes=num_classes, average='macro')
print(f'Macro F1: {macrof1(torch.tensor(probs), torch.tensor(act))}')

weightedf1 = MulticlassF1Score(multidim_average='global', num_classes=num_classes, average='weighted')
print(f'Weighted F1: {weightedf1(torch.tensor(probs), torch.tensor(act))}')

smacrof1 = MulticlassF1Score(multidim_average='samplewise', num_classes=num_classes, average='macro')
print(f'Macro Sample F1: {macrof1(torch.tensor(probs), torch.tensor(act))}')

smicrof1 = MulticlassF1Score(multidim_average='global', num_classes=num_classes, average='micro')
print(f'Micro Sample F1: {smicrof1(torch.tensor(probs), torch.tensor(act))}')


binaryf1 = BinaryF1Score(multidim_average='global')
print(binaryf1(torch.tensor(probs), torch.tensor(act)))

binaryf1 = BinaryF1Score(multidim_average='samplewise')
print(binaryf1(torch.tensor(probs), torch.tensor(act)))


Accuracy Score = 0.8942517042160034
F1 Score = 0.26689666509628296
Macro F1: 0.4740113914012909
Weighted F1: 0.8543409109115601
Macro Sample F1: 0.4740113914012909
Micro Sample F1: 0.0
tensor(0.2694)
tensor([1.0000, 0.3333, 0.0000,  ..., 0.0000, 0.8000, 1.0000])


In [None]:
# use sedat's hierarchial f1 code instead