# Task 1: Working with a dataset with categorical features

### Step 1, Reading the data

In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
import krippendorff
import torch 

class TDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])
    
class DataSetContainer():
    def __init__(self, RawY=None, RawX=None, File=None, Consensus=None, SplitY=None, ConfidenceWeights=None):
        self.RawX = RawX
        self.RawY = RawY
        self.SplitY = SplitY
        self.File = File
        self.Consensus = Consensus
        self.ConfidenceWeights = ConfidenceWeights

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
trainContainer = DataSetContainer(File="assets/a3_train_final.tsv")
testContainer = DataSetContainer(File="assets/a3_test.tsv")
dataContainers = [trainContainer, testContainer]

for dcont in dataContainers:
    df = pd.read_table(dcont.File, names=['opinion', 'text'])
    df = df.sample(frac=1, random_state=1337)
    df["text"] = df["text"].apply(lambda a:a.lower())
    dcont.RawY = df["opinion"]
    dcont.RawX = df["text"]

In [3]:
# The trainset has annotator disagreements
# https://towardsdatascience.com/assessing-annotator-disagreements-in-python-to-build-a-robust-dataset-for-machine-learning-16c74b49f043

import numpy as np

import random
from collections import Counter
allNums = trainContainer.RawY.str.split('/').to_numpy()
allNums = np.concatenate(allNums).ravel().tolist()
occNum = dict(Counter(allNums))
del occNum['-1']

def randomByOcc():
    val,prob = zip(*(occNum.items()))
    #return int(random.choices([0,1], weights=[1,9]))
    return int(random.choices(val,weights=prob))

def toNumOrNan(n):
    try:
        if (n == "-1"):
            #return randomByOcc()
            #return 0
            return np.nan
        return int(n)
    except Exception:
        return np.nan

for dset in [trainContainer]:
    splitOpinion = dset.RawY.str.split('/', expand=True)
    splitOpinion = splitOpinion.applymap(toNumOrNan).transpose()
    
    # since we don't know who the annotators are who wrote what should be arbitrary
    # but (it does not actually matter for krippendorf)
    #splitOpinion = pd.DataFrame(data=[sk.utils.shuffle(list(splitOpinion.loc[:,c]), random_state=c) for c in splitOpinion.columns]).transpose()

    dset.SplitY = splitOpinion
    dset.Consensus = krippendorff.alpha(reliability_data=splitOpinion, value_domain=[0,1])
print(f"Krippendorff alpha for training data: {trainContainer.Consensus}")

Krippendorff alpha for training data: 0.8798749025443741


In [4]:
# Weiging annotations, https://arxiv.org/pdf/2208.06161.pdf
#   SPA makes one key assumption: The degree to
#   which labels are absent must be independent of the
#   true item-agreements ni⊥Pi.
from collections import Counter

def getMostLikelyAndItsWeight(col):
    answer2count = Counter([x for x in col if x in [0,1]])
    nAnnotators = float(len(answer2count))

    mostPopularAnswer = sorted(answer2count, reverse=True)[0]
    mostPopularCount = answer2count[mostPopularAnswer]

    # agreement = % is the most popular - % isn't the most popular
    del answer2count[mostPopularAnswer]
    agreement = float(mostPopularCount - sum(answer2count.values()))/nAnnotators

    #using weight = number of annotators
    weight = nAnnotators

    return (weight*agreement, mostPopularAnswer)

train_weights,train_mostPopClass = zip(*[
    getMostLikelyAndItsWeight(dset.SplitY.loc[:,c]) 
    for c in trainContainer.SplitY.columns
])
trainContainer.ConfidenceWeights = pd.Series(list(train_weights))

In [39]:
############
# Bert Setup
############
from transformers import BertTokenizer, BertForSequenceClassification

bertTokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased'
)
bertModel = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2
)

loading file vocab.txt from cache at C:\Users\wilux/.cache\huggingface\hub\models--bert-base-uncased\snapshots\0a6aa9128b6194f4f3c4db429b6cb4891cdb421b\vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at C:\Users\wilux/.cache\huggingface\hub\models--bert-base-uncased\snapshots\0a6aa9128b6194f4f3c4db429b6cb4891cdb421b\tokenizer_config.json
loading configuration file config.json from cache at C:\Users\wilux/.cache\huggingface\hub\models--bert-base-uncased\snapshots\0a6aa9128b6194f4f3c4db429b6cb4891cdb421b\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_

In [24]:
# max tweet = 240 characters -> 120 words
# must const sizes or torch gets angry
from sklearn.model_selection import train_test_split 
X_train, X_val, y_train, y_val = train_test_split(trainContainer.RawX, train_mostPopClass, test_size=0.2)

train_Xtoken = bertTokenizer(list(X_train), max_length=120, padding=True, truncation=True) 
eval_Xtoken  = bertTokenizer(list(X_val),  max_length=120, padding=True, truncation=True)

In [7]:
trainY = [int(x) for x in list(y_train)]
evalY = [int(x) for x in list(y_val)]

trainTset = TDataset(train_Xtoken, trainY)
evalTset = TDataset(eval_Xtoken, evalY)

#Making sure sizes are correct
print(len(train_Xtoken["input_ids"]))
print(len(train_Xtoken["token_type_ids"]))
print(len(train_Xtoken["attention_mask"]))
print(len(trainY))

30308
30308
30308
30308


In [8]:
def getScores(torchDataSet):
    pred, labels = torchDataSet
    pred = np.argmax(pred, axis=1)

    recall = sk.metrics.recall_score(y_true=labels, y_pred=pred)
    precision = sk.metrics.precision_score(y_true=labels, y_pred=pred)
    f1 = sk.metrics.f1_score(y_true=labels, y_pred=pred)

    return {
        "precision": precision, 
        "recall": recall, 
        "f1": f1
    }

In [9]:
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback

#https://towardsdatascience.com/fine-tuning-pretrained-nlp-models-with-huggingfaces-trainer-6326a4456e7b
args = TrainingArguments(
    output_dir="output",
    evaluation_strategy="steps",
    eval_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    seed=1337,
    load_best_model_at_end=True,
)
trainer = Trainer(
    model=bertModel,
    args=args,
    train_dataset=trainTset,
    eval_dataset=evalTset,
    compute_metrics=getScores,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [10]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [11]:
#RUN 
#trainer.train()

# Testing

In [38]:
trainedPath = "output/checkpoint-4500"
trainedModel = BertForSequenceClassification.from_pretrained(trainedPath, num_labels=2)
#trainedTokenizer = BertTokenizer.from_pretrained(trainedPath)

test_trainer = Trainer(
    model=trainedModel
)

loading configuration file output/checkpoint-4500\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file output/checkpoint-4500\pytorch_model.bin
All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceC

OSError: Can't load tokenizer for 'output/checkpoint-4500'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'output/checkpoint-4500' is the correct path to a directory containing all relevant files for a BertTokenizer tokenizer.

In [37]:
nTake = min(100, len(testContainer.RawX))

test_X = list(testContainer.RawX)[:nTake]
test_Xtoken = bertTokenizer(list(testContainer.RawX)[:nTake], max_length=120, padding=True, truncation=True) 
testYtrue = [int(x) for x in list(testContainer.RawY)[:nTake]] # no split consensus for test

testTset = TDataset(test_Xtoken)

In [27]:
#test_X = X_train
#testYtrue = y_train

In [36]:
raw_pred, _, _ = test_trainer.predict(testTset)
#raw_pred, _, _ = test_trainer.predict(trainTset)

***** Running Prediction *****
  Num examples = 100
  Batch size = 8
  0%|          | 8/3789 [00:16<1:59:08,  1.89s/it]

KeyboardInterrupt: 

In [30]:
y_pred = np.argmax(raw_pred, axis=1)

df = pd.DataFrame()
df["text"] = test_X
df["true_opinion"] = testYtrue
df["predicted_opinion"] = y_pred

In [31]:
recall    = sk.metrics.recall_score(    y_true=df["true_opinion"], y_pred=df["predicted_opinion"])
precision = sk.metrics.precision_score( y_true=df["true_opinion"], y_pred=df["predicted_opinion"])
f1        = sk.metrics.f1_score(        y_true=df["true_opinion"], y_pred=df["predicted_opinion"])
accuracy  = sk.metrics.accuracy_score(  y_true=df["true_opinion"], y_pred=df["predicted_opinion"])

falseNegatives = df.query("`true_opinion` == 1 and `predicted_opinion` == 0")
falsePositives = df.query("`true_opinion` == 0 and `predicted_opinion` == 1")
corrects = df.query("`true_opinion` == `predicted_opinion`")

testFile = trainContainer.File
#testFile = testContainer.File

In [33]:
pd.set_option('display.max_colwidth', 0)

testFrame = pd.DataFrame.from_dict({
    "test file": [testFile],
    "model path": [trainedPath],
    "recall": [recall],
    "precision": [precision],
    "f1": [f1],
    "accuracy": [accuracy],
    "#samples": [df.shape[0]],
    "  #correct samples": [corrects.shape[0]],
    "  #false negatives": [falseNegatives.shape[0]],
    "  #false positives": [falsePositives.shape[0]],
}, orient="index")
display(testFrame)

display("False Negatives", falseNegatives.head())
display("False Positives", falsePositives.head())
display("Correct", corrects.head())

import re
def alphnum(s):
    return re.sub("[^0-9a-zA-Z]+", "_", s)

try:
    os.mkdir("test_results")
except Exception:
    ""
falseNegatives.to_csv(f"test_results/{df.shape[0]}_{alphnum(testFile)}_{alphnum(trainedPath)}_False_Negatives.csv")
falsePositives.to_csv(f"test_results/{df.shape[0]}_{alphnum(testFile)}_{alphnum(trainedPath)}_False_Positives.csv")
corrects.to_csv(f"test_results/{df.shape[0]}_{alphnum(testFile)}_{alphnum(trainedPath)}_Correct.csv")
testFrame.to_csv(f"test_results/{df.shape[0]}_{alphnum(testFile)}_{alphnum(trainedPath)}_STAT.csv")

Unnamed: 0,0
test file,assets/a3_train_final.tsv
model path,output/checkpoint-4500
recall,0.531548
precision,0.526332
f1,0.528927
accuracy,0.501914
#samples,30308
#correct samples,15212
#false negatives,7469
#false positives,7627


'False Negatives'

Unnamed: 0,text,true_opinion,predicted_opinion
27786,"the tinfoil hat brigade should be made to watch this. (only kidding, you'll not change their dangerous opinions). still, i'm very impressed by how this explanation is both easy-to-understand, and without hyperbole. thank you.",1.0,0
25756,thank you @cdcgov !! i'm grateful fr every vax and booster i've ever had!!,1.0,0
24163,scientists and government safety experts. it was tested like other vaccines.,1.0,0
31399,"vaccines are safe, and save lives. love this shirt, thanks #vaccinescauseadult",1.0,0
34768,you don't know that the vaccine is safe. you're a human petri dish.,1.0,0


'False Positives'

Unnamed: 0,text,true_opinion,predicted_opinion
16239,in my opinion. this has set back trust in the medical industry by generations in some cases. they failed us.,0.0,1
36287,"load bs all to depopulate the human race,they are gene therapys if you have had the test up the nose you have already have the nanos in you",0.0,1
12699,i took the vaccine yesterday and i got the worst migraine of my life today. the night followig the vaccination i felt chills the whole night and was freezing even with two thick hoodies on. i regret taking it so fkn much as i was completely fine before and i never got sick with covid these last 2 years,0.0,1
34563,you also swallowed their covid vaccine bulls- but you never learned a thing from it about what the democrats say,0.0,1
11078,i have lost my job and ‘retired’ now because i am not vaccinated. i am a teacher and above 6o and i had been to all the busy places and i don’t have any symptoms or covid (2 years now),0.0,1


'Correct'

Unnamed: 0,text,true_opinion,predicted_opinion
33778,will it actually provide a high level of immunity though?,0.0,0
34492,yes. my experience getting vaccinated was unreal. the efficiency was impressive.,1.0,1
15582,"if vax work, then why are people worried about people who don't get them?",0.0,0
21208,no vaxx !,0.0,0
4107,do not take the vaccine!!! it is part of an agenda for new world order!,0.0,0


## Importance scores

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

imodel_path = "output/checkpoint-4500"

# load model
imodel = BertForSequenceClassification.from_pretrained(imodel_path)
imodel.to(device)
imodel.eval()
imodel.zero_grad()

# load tokenizer
itokenizer = BertTokenizer.from_pretrained(imodel_path)

#Weight 
def predict(inputs, token_type_ids=None, position_ids=None, attention_mask=None):
    output = imodel(inputs, token_type_ids=token_type_ids,
                 position_ids=position_ids, attention_mask=attention_mask, )
    return output.start_logits, output.end_logits

def squad_pos_forward_func(inputs, token_type_ids=None, position_ids=None, attention_mask=None, position=0):
    pred = predict(inputs,
                   token_type_ids=token_type_ids,
                   position_ids=position_ids,
                   attention_mask=attention_mask)
    pred = pred[position]
    return pred.max(1).values

In [None]:
ref_token_id = itokenizer.pad_token_id # A token used for generating token reference
sep_token_id = itokenizer.sep_token_id # A token used as a separator between question and text and it is also added to the end of the text.
cls_token_id = itokenizer.cls_token_id # A token used for prepending to the concatenated question-text word sequence

def construct_input_ref_pair(text, ref_token_id, sep_token_id, cls_token_id):
    text_ids = itokenizer.encode(text, add_special_tokens=False)

    # construct input token ids
    input_ids = [cls_token_id] + [sep_token_id] + text_ids + [sep_token_id]

    # construct reference token ids 
    ref_input_ids = [cls_token_id] + [sep_token_id] + [ref_token_id] * len(text_ids) + [sep_token_id]

    return torch.tensor([input_ids], device=device), torch.tensor([ref_input_ids], device=device)
def construct_input_ref_token_type_pair(input_ids, sep_ind=0):
    seq_len = input_ids.size(1)
    token_type_ids = torch.tensor([[0 if i <= sep_ind else 1 for i in range(seq_len)]], device=device)
    ref_token_type_ids = torch.zeros_like(token_type_ids, device=device)# * -1
    return token_type_ids, ref_token_type_ids

def construct_input_ref_pos_id_pair(input_ids):
    seq_length = input_ids.size(1)
    position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
    # we could potentially also use random permutation with `torch.randperm(seq_length, device=device)`
    ref_position_ids = torch.zeros(seq_length, dtype=torch.long, device=device)

    position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
    ref_position_ids = ref_position_ids.unsqueeze(0).expand_as(input_ids)
    return position_ids, ref_position_ids
    
def construct_attention_mask(input_ids):
    return torch.ones_like(input_ids)

def construct_whole_bert_embeddings(
    input_ids, ref_input_ids, 
    token_type_ids=None, 
    ref_token_type_ids=None,
    position_ids=None, 
    ref_position_ids=None
):
    input_embeddings = imodel.bert.embeddings(input_ids, token_type_ids=token_type_ids, position_ids=position_ids)
    ref_input_embeddings = imodel.bert.embeddings(ref_input_ids, token_type_ids=ref_token_type_ids, position_ids=ref_position_ids)
    
    return input_embeddings, ref_input_embeddings

In [None]:
exampleText = "unicef you are doing and distributing vaccines booster covid-19 make for more confident and safe for people.thank you so much."

input_ids, ref_input_ids, sep_id = construct_input_ref_pair(exampleText, ref_token_id, sep_token_id, cls_token_id)
token_type_ids, ref_token_type_ids = construct_input_ref_token_type_pair(input_ids, sep_id)
position_ids, ref_position_ids = construct_input_ref_pos_id_pair(input_ids)
attention_mask = construct_attention_mask(input_ids)

indices = input_ids[0].detach().tolist()
all_tokens = itokenizer.convert_ids_to_tokens(indices)

In [None]:
start_scores, end_scores = predict(
    input_ids,
    token_type_ids=token_type_ids,
    position_ids=position_ids,
    attention_mask=attention_mask
)

In [None]:
from captum.attr import LayerConductance, LayerIntegratedGradients

lig = LayerIntegratedGradients(squad_pos_forward_func, imodel.bert.embeddings)

attributions_start, delta_start = lig.attribute(inputs=input_ids,
                                  baselines=ref_input_ids,
                                  additional_forward_args=(token_type_ids, position_ids, attention_mask, 0),
                                  return_convergence_delta=True)
attributions_end, delta_end = lig.attribute(inputs=input_ids, baselines=ref_input_ids,
                                additional_forward_args=(token_type_ids, position_ids, attention_mask, 1),
                                return_convergence_delta=True)

def summarize_attributions(attributions):
    attributions = attributions.sum(dim=-1).squeeze(0)
    attributions = attributions / torch.norm(attributions)
    return attributions

In [None]:
attributions_start_sum = summarize_attributions(attributions_start)
attributions_end_sum = summarize_attributions(attributions_end)

In [None]:
from captum.attr import visualization as viz
from captum.attr import LayerConductance, LayerIntegratedGradients

start_position_vis = viz.VisualizationDataRecord(
    attributions_start_sum,
    torch.max(torch.softmax(start_scores[0], dim=0)),
    torch.argmax(start_scores),
    torch.argmax(start_scores),
    str(ground_truth_start_ind),
    attributions_start_sum.sum(),       
    all_tokens,
    delta_start
)