# Task 1: Working with a dataset with categorical features

### Step 1, Reading the data

In [19]:
import pandas as pd
import numpy as np
import sklearn as sk
import krippendorff
import torch 

class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])
    
class DataSetContainer():
    def __init__(self, RawY=None, RawX=None, File=None, Consensus=None, SplitY=None, ConfidenceWeights=None, TorchDataSet=None):
        self.RawX = RawX
        self.RawY = RawY
        self.SplitY = SplitY
        self.File = File
        self.Consensus = Consensus
        self.ConfidenceWeights = ConfidenceWeights
        self.TorchDataSet = TorchDataSet

trainSet = DataSetContainer(File="assets/a3_train_final.tsv")
testSet = DataSetContainer(File="assets/a3_test.tsv")
dataContainers = [trainSet, testSet]

for dcont in dataContainers:
    df = pd.read_table(dcont.File, names=['opinion', 'text'])
    df = df.sample(frac=1, random_state=0)
    df["text"] = df["text"].apply(lambda a:a.lower())
    dcont.RawY = df["opinion"]
    dcont.RawX = df["text"]

In [20]:
# The trainset has annotator disagreements
# https://towardsdatascience.com/assessing-annotator-disagreements-in-python-to-build-a-robust-dataset-for-machine-learning-16c74b49f043

import numpy as np

import random
from collections import Counter
allNums = trainSet.RawY.str.split('/').to_numpy()
allNums = np.concatenate(allNums).ravel().tolist()
occNum = dict(Counter(allNums))
del occNum['-1']

def randomByOcc():
    val,prob = zip(*(occNum.items()))
    #return int(random.choices([0,1], weights=[1,9]))
    return int(random.choices(val,weights=prob))

def toNumOrNan(n):
    try:
        if (n == "-1"):
            #return randomByOcc()
            #return 0
            return np.nan
        return int(n)
    except Exception:
        return np.nan

for dset in [trainSet]:
    splitOpinion = dset.RawY.str.split('/', expand=True)
    splitOpinion = splitOpinion.applymap(toNumOrNan).transpose()
    
    # since we don't know who the annotators are who wrote what should be arbitrary
    # but (it does not actually matter for krippendorf)
    splitOpinion = pd.DataFrame(data=[sk.utils.shuffle(list(splitOpinion.iloc[:,c]), random_state=c) for c in splitOpinion.columns]).transpose()

    dset.SplitY = splitOpinion
    dset.Consensus = krippendorff.alpha(reliability_data=splitOpinion, value_domain=[0,1])
print(f"Krippendorff alpha for training data: {trainSet.Consensus}")

Krippendorff alpha for training data: 0.8798749025443741


In [21]:
############
# Bert Setup
############
from transformers import BertTokenizer, BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased'
)
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2
)

loading file vocab.txt from cache at C:\Users\wilux/.cache\huggingface\hub\models--bert-base-uncased\snapshots\0a6aa9128b6194f4f3c4db429b6cb4891cdb421b\vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at C:\Users\wilux/.cache\huggingface\hub\models--bert-base-uncased\snapshots\0a6aa9128b6194f4f3c4db429b6cb4891cdb421b\tokenizer_config.json
loading configuration file config.json from cache at C:\Users\wilux/.cache\huggingface\hub\models--bert-base-uncased\snapshots\0a6aa9128b6194f4f3c4db429b6cb4891cdb421b\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_

In [22]:
# Weiging annotations, https://arxiv.org/pdf/2208.06161.pdf
#   SPA makes one key assumption: The degree to
#   which labels are absent must be independent of the
#   true item-agreements ni⊥Pi.
from collections import Counter

def getMostLikelyAndItsWeight(col):
    answer2count = Counter([x for x in col if x in [0,1]])
    nAnnotators = float(len(answer2count))

    mostPopularAnswer = sorted(answer2count, reverse=True)[0]
    mostPopularCount = answer2count[mostPopularAnswer]

    # agreement = % is the most popular - % isn't the most popular
    del answer2count[mostPopularAnswer]
    agreement = float(mostPopularCount - sum(answer2count.values()))/nAnnotators

    #using weight = number of annotators
    weight = nAnnotators

    return (weight*agreement, mostPopularAnswer)

train_weights,train_mostPopClass = zip(*[
    getMostLikelyAndItsWeight(dset.SplitY.iloc[:,c]) 
    for c in trainSet.SplitY.columns
])
trainSet.ConfidenceWeights = pd.Series(list(train_weights))

In [23]:
# max tweet = 240 characters
# must const sizes or torch gets angry
train_Xtoken = tokenizer(list(trainSet.RawX), max_length=240, padding=True, truncation=True) 
test_Xtoken  = tokenizer(list(testSet.RawX),  max_length=240, padding=True, truncation=True)

trainSet.TorchDataSet = Dataset(train_Xtoken, list(train_mostPopClass))
testSet.TorchDataSet = Dataset(test_Xtoken, list(testSet.RawY))

In [24]:
def getScores(torchDataSet):
    pred, labels = torchDataSet
    pred = np.argmax(pred, axis=1)

    recall = sk.metrics.recall_score(y_true=labels, y_pred=pred)
    precision = sk.metrics.precision_score(y_true=labels, y_pred=pred)
    f1 = sk.metrics.f1_score(y_true=labels, y_pred=pred)

    return {
        "precision": precision, 
        "recall": recall, 
        "f1": f1
    }

In [31]:
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback

display(trainSet.TorchDataSet)

#https://towardsdatascience.com/fine-tuning-pretrained-nlp-models-with-huggingfaces-trainer-6326a4456e7b
args = TrainingArguments(
    output_dir="output",
    evaluation_strategy="steps",
    eval_steps=500,
    per_device_train_batch_size=1,
    #per_device_eval_batch_size=8,
    num_train_epochs=3,
    seed=1337,
    load_best_model_at_end=True,
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=trainSet.TorchDataSet,
    #eval_dataset=trainSet.TorchDataSet,
    compute_metrics=getScores,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

<__main__.Dataset at 0x138d54018a0>

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [34]:
import os
os.environ["WANDB_DISABLED"] = "true"
trainer.train()

***** Running training *****
  Num examples = 37885
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 113655
  Number of trainable parameters = 109483778




  0%|          | 0/113655 [00:33<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB (GPU 0; 2.00 GiB total capacity; 1.26 GiB already allocated; 0 bytes free; 1.38 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

# Testing

In [None]:
model_path = "output/checkpoint-50000"
#model = BertForSequenceClassification.from_pretrained(model_path, num_labels=2)