In [9]:
import json
from pathlib import Path
from typing import Callable, Tuple, Dict, Union, Any

import torch
from datasets import load_dataset, load_from_disk
from torch.optim import Optimizer
from torch.optim.lr_scheduler import _LRScheduler
from transformers import AutoTokenizer


In [42]:
import logging
import random
from typing import Callable, Tuple, Any

import click
import numpy as np
import torch
import wandb
from datasets import load_metric, Metric
from torch.optim import Optimizer
from torch.optim.lr_scheduler import _LRScheduler, CyclicLR
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import AdamW
from transformers import RobertaForSequenceClassification, set_seed, default_data_collator, \
    DataCollatorWithPadding, get_scheduler

from hs_generalization.utils import get_dataset, load_config, save_model, load_model

In [2]:
dataset = load_dataset("csv", data_files="../data/davidson/labeled_data.csv")

Found cached dataset csv (/Users/urjakhurana/.cache/huggingface/datasets/csv/default-d0e14351e646c3c0/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/1 [00:00<?, ?it/s]

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither', 'class', 'tweet'],
        num_rows: 24783
    })
})

In [7]:
dataset["train"]

Dataset({
    features: ['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither', 'class', 'tweet'],
    num_rows: 24783
})

In [13]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
dataset = dataset.map(
    lambda x: tokenizer(
        x["tweet"],
        padding=True,
        truncation=True,
    ),
    batched=True,
)

Map:   0%|          | 0/24783 [00:00<?, ? examples/s]

In [14]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither', 'class', 'tweet', 'input_ids', 'attention_mask'],
        num_rows: 24783
    })
})

In [20]:
from datasets import DatasetDict

# 90% train, 10% test + validation
train_testvalid = dataset["train"].train_test_split(test_size=0.1)
# Split the 10% test + valid in half test, half valid
test_valid = train_testvalid['test'].train_test_split(test_size=0.5)
# gather everyone if you want to have a single DatasetDict
train_test_valid_dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

In [21]:
train_test_valid_dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither', 'class', 'tweet', 'input_ids', 'attention_mask'],
        num_rows: 22304
    })
    test: Dataset({
        features: ['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither', 'class', 'tweet', 'input_ids', 'attention_mask'],
        num_rows: 1240
    })
    valid: Dataset({
        features: ['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither', 'class', 'tweet', 'input_ids', 'attention_mask'],
        num_rows: 1239
    })
})

In [23]:
set_seed(0)
torch.backends.cudnn.deterministic = True

In [25]:
train_dataset = train_test_valid_dataset["train"]
validation_dataset = train_test_valid_dataset["valid"]

In [26]:
batch_size = 32

In [74]:
# train_dataset = train_dataset.rename_column("class", "labels")
train_datasets = train_dataset.remove_columns(["Unnamed: 0", "count", "hate_speech", "offensive_language", "neither", "tweet"])
# validation_dataset = validation_dataset.rename_column("class", "labels")

In [33]:
data_collator = DataCollatorWithPadding(tokenizer)

In [75]:
train_dataloader = DataLoader(train_datasets, shuffle=True, collate_fn=data_collator, batch_size=batch_size)
val_dataloader = DataLoader(validation_dataset, shuffle=False, collate_fn=data_collator, batch_size=batch_size)

In [36]:
num_update_steps_per_epoch = len(train_dataloader)
n_epochs = 2
max_train_steps = n_epochs * num_update_steps_per_epoch

In [45]:
def get_optimizer(model: Any, learning_rate: float, weight_decay: float) -> Optimizer:
    """Function that returns the optimizer for training.

    Given the model, learning rate, and weight decay, this function returns the optimizer that can be used while
    training. The model parameters are split into two groups: weight decay and non-weight decay groups, as done in the
    BERT paper.

    Args:
        model (torch.nn.module): Model used for training.
        learning_rate (float): Float that indicates the learning rate.
        weight_decay (float): Float that indicates the weight decay.

    Returns:
        optimizer (Optimizer): optimizer for the training.
    """
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)

    return optimizer

In [46]:
metric = load_metric("f1")
model = RobertaForSequenceClassification.from_pretrained("roberta-base")
optimizer = get_optimizer(model, 0.00005, 0.0)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

In [47]:
lr_scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=200,
        num_training_steps=max_train_steps,
)

In [48]:
start_epoch = 0

In [50]:
device = "cpu"
model = model.to(device)

In [54]:
train_batch_size = batch_size
validation_batch_size = batch_size

In [55]:
print(f" Amount training examples: {len(train_dataset)}")
print(f" Amount validation examples: {len(validation_dataset)}")
print(f" Amount of epochs: {n_epochs}")
print(f" Amount optimization steps: {max_train_steps}")
print(f" Batch size train: {train_batch_size}, validation: {validation_batch_size}")

 Amount training examples: 22304
 Amount validation examples: 1239
 Amount of epochs: 2
 Amount optimization steps: 1394
 Batch size train: 32, validation: 32


In [76]:
for index in random.sample(range(len(train_dataset)), 3):
    print(f" Sample {index} of the training set: {train_dataset[index]}.")


 Sample 8484 of the training set: {'Unnamed: 0': 24282, 'count': 3, 'hate_speech': 0, 'offensive_language': 3, 'neither': 0, 'labels': 1, 'tweet': 'eagles are fucking cunts http://t.co/nK2CO0snNv', 'input_ids': [0, 242, 26007, 32, 23523, 740, 32346, 2054, 640, 90, 4, 876, 73, 282, 530, 176, 6335, 288, 22617, 487, 705, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}.
 Sample 16753 of the training set: {'Unnamed: 0': 7181, 'count': 3, 'hate_speech': 0, 'offensive_language': 3, 'neither': 0, 'labels': 1, '

In [58]:
logging_freq = 100

In [60]:
def train(
        model: Any,
        epoch: int,
        dataloader: DataLoader,
        optimizer: Optimizer,
        lr_scheduler: _LRScheduler,
        metric: Metric,
        logging_freq: int,
        max_steps: int,
        device: str,
) -> None:
    """Function that performs all the steps during the training phase.

    In this function, the entire training phase of an epoch is run. Looping over the dataloader, each batch is fed
    to the model, the loss and metric are tracked/calculated, and the forward and backward pass are done.

    Args:
        model (Model): Model that is being trained.
        epoch (int): Current epoch of experiment.
        dataloader (DataLoader): Object that will load the training data.
        optimizer (Optimizer): Optimizer for training.
        lr_scheduler (_LRScheduler): Learning rate scheduler for the optimizer.
        metric (Metric): Metric that is being tracked.
        logging_freq (int): Frequency of logging the training metrics.
        max_steps (int): Maximum amount of steps to be taken during this epoch.
        device (str): Device on which training will be done.
    """
    model.train()
    logging.info(f" Start training epoch {epoch}")

    scores = []
    losses = []
    for step, batch in enumerate(tqdm(dataloader)):

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        if step < 5:
            print(batch["input_ids"][0])
            print(dataloader.collate_fn.tokenizer.batch_decode(batch["input_ids"])[0])

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

        lr_scheduler.step()
        current_lr = lr_scheduler.get_last_lr()[0]

        predictions = outputs.logits.argmax(dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

        accuracy = metric.compute()["accuracy"]
        metrics = {"train_accuracy": accuracy}
        scores.append(accuracy)

        current_step = (epoch * len(dataloader)) + step
        log_dict = {"epoch": epoch, "train_loss": loss, **metrics, "learning_rate": current_lr}
        wandb.log(log_dict, step=current_step)

        losses.append(loss.detach().cpu().numpy())

        if step % logging_freq == 0:
            logger.info(f" Epoch {epoch}, Step {step}: Loss: {loss}, Accuracy: {accuracy}")

        if current_step == max_steps - 1:
            break

    average_loss = np.mean(losses)
    average_score = np.mean(scores)

    metrics = {"average_train_accuracy": average_score}
    logger.info(f" Epoch {epoch} average training loss: {average_loss}, accuracy: {average_score}")

    wandb.log({"average_train_loss": average_loss, **metrics})


def validate(
        model: Any,
        epoch: int,
        dataloader: DataLoader,
        metric: Metric,
        max_steps: int,
        device: str,
) -> Tuple[np.float_, float]:
    """Function that performs all the steps during the validation/evaluation phase.

    In this function, the entire evaluation phase of an epoch is run. Looping over the dataloader, each batch is fed
    to the model and the loss and accuracy are tracked.

    Args:
        model (Model): Model that is being trained.:
        epoch (int): Current epoch of experiment.
        dataloader (DataLoader): Object that will load the training data.
        metric (Metric): Metric that is being tracked.
        max_steps (int): Maximum amount of steps to be taken during this epoch.
        device (str): Device on which training will be done.

    Returns:
        eval_loss (float): Average loss over the whole validation set.
        eval_accuracy (float): Average accuracy over the whole validation set.
    """
    model.eval()

    with torch.no_grad():
        logger.info(" Starting Evaluation")
        losses = []
        for step, batch in enumerate(tqdm(dataloader)):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            if step < 5:
                print(batch["input_ids"][0])
                print(dataloader.collate_fn.tokenizer.batch_decode(batch["input_ids"])[0])

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            predictions = outputs.logits.argmax(dim=-1)
            metric.add_batch(predictions=predictions, references=batch["labels"])

            losses.append(outputs.loss.detach().cpu().numpy())
            current_step = (epoch * len(dataloader)) + step

            if current_step == max_steps - 1:
                break

    eval_loss = np.mean(losses)

    eval_accuracy = metric.compute()["accuracy"]
    logger.info(f" Evaluation {epoch}: Average Loss: {eval_loss}, Average Accuracy: {eval_accuracy}")
    metrics = {"eval_accuracy": eval_accuracy}

    wandb.log({"epoch": epoch, "eval_loss": eval_loss, **metrics})

    return eval_loss, eval_accuracy

In [78]:
for epoch in range(start_epoch, n_epochs):
    train(
        model,
        epoch,
        train_dataloader,
        optimizer,
        lr_scheduler,
        metric,
        logging_freq,
        max_train_steps,
        device,
    )
    eval_loss, eval_accuracy = validate(
        model,
        epoch,
        validation_dataloader,
        metric,
        max_train_steps,
        device,
    )

    print("\n")

    save_model(
        model, optimizer, lr_scheduler, epoch, config["pipeline"]["output_directory"], model_name
    )
    tracker.update(eval_loss, eval_accuracy, epoch)

  0%|                                                                                                                                                                                                                          | 0/697 [00:00<?, ?it/s]

tensor([    0, 13963,   787, 10004,   102, 37419,  1438,  5499,   673,    35,
        19325,     7, 33044,   209,   828,  5559,     8,  4356,    45,   190,
          457,   169,     5,   605,   127, 27685, 50118,  8166,   640,    90,
            4,   876,    73,   257, 32428,  7180,   448,   406, 41238,  1178,
            2,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1, 

  0%|                                                                                                                                                                                                                          | 0/697 [00:23<?, ?it/s]


KeyboardInterrupt: 

TypeError: 'RandomSampler' object is not subscriptable