# 1) Getting Set Up

In [None]:
!pip install wandb
!pip install jupyter --upgrade
!pip install ipywidgets widgetsnbextension --upgrade
# !pip install nlpaug
# !pip install sacremoses

Collecting jupyter
  Downloading jupyter-1.0.0-py2.py3-none-any.whl (2.7 kB)


In [None]:
!pip install -q peft

In [None]:
from accelerate import Accelerator
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig

In [None]:
import wandb
import transformers
import torch
import glob
import pandas as pd
from tqdm import tqdm
import numpy as np
import os
import random
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from transformers import AutoModel, AutoTokenizer, AutoConfig
from torch import nn
import sys
import gc
from transformers import DataCollatorWithPadding
from transformers import AdamW
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from accelerate import notebook_launcher
from accelerate import DistributedDataParallelKwargs
# from torch.distributed.fsdp import (
#    FullyShardedDataParallel,
# )
# import nlpaug.augmenter.word as naw

In [None]:
# From this Gist: https://gist.github.com/ihoromi4/b681a9088f348942b01711f251e5f964
def seed_everything(seed: int):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, the ``PYTHONHASHSEED`` environment variable,
    NumPy, and PyTorch (CPU and all CUDA devices), and switches cuDNN to
    deterministic behaviour.

    Args:
        seed: The integer seed applied to every generator.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every CUDA device, not only the current one, so multi-GPU runs are covered.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune its algorithm choice per run, which
    # works against the deterministic flag set above.
    torch.backends.cudnn.benchmark = False

In [None]:
def dict_from_class(cls):
    """Return a dict of the attributes in ``cls.__dict__`` whose names contain no ``__``."""
    selected = {}
    for attr_name, attr_value in cls.__dict__.items():
        if "__" in attr_name:
            # Skips dunder entries (and any other name with a double underscore).
            continue
        selected[attr_name] = attr_value
    return selected

In [None]:
class config:
    """Central collection of the settings and hyperparameters used by this notebook."""
    # General Configuration
    seed = 42
    base_path = "/kaggle/input/contradictory-my-dear-watson"
    mode = "maximize"  # ModelTracker treats larger metric values as better when "maximize"
    device_type = "gpus"

    # WandB Configuration
    name = "Baseline"
    model_name = "google/byt5-large"
    metric_name = "accuracy"

    # Training Configuration
    lr = 2e-5
    epochs = 40
    patience = 8  # epochs without improvement tolerated before early stopping (defined once)
    grad_accum = 8
    optimizer = "AdamW"
    scheduler = "cosine"
    batch_size = 4
    warmup_pct = 0.1
    weight_decay = 0.0

    # Data Configuration
    # EDA shows no sentence longer than 196 words!
#     max_length = "longest_first"
    truncation = True
    padding = True
    test_size = 0.2
    back_translate = 0.5
    dropout = 0.3
    upsample = True

    # LoRA hyperparameters
    r = 8
    lora_alpha = 8
    lora_dropout = 0.1
    bias = "all"
        
config.tokenizer = AutoTokenizer.from_pretrained(config.model_name)
# Hub model names contain "/" (e.g. "google/byt5-large"). Flatten it so the checkpoint is
# written directly into /kaggle/working rather than into a sub-directory that does not exist.
config.checkpoint = f"/kaggle/working/{config.model_name.replace('/', '_')}.pt"

In [None]:
# Pick the per-device batch size for the configured hardware type.
if config.device_type == "gpu":
    config.batch_size = 4
elif config.device_type == "gpus":
    # Batch of 4 for each gpu
    config.batch_size = 4
elif config.device_type == "tpu":
    # Batch of 128 for each TPU core
    config.batch_size = 128

In [None]:
# Signing into WandB
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("wandb")

# Log in through the Python API so the API key is never interpolated into a shell command.
wandb.login(key=secret_value_0)

## 1.1) Trackers

In [None]:
class LossTracker:
    """Running, sample-weighted average of a scalar loss.

    Attributes are plain numbers: ``val`` is the most recent value, ``sum`` the
    weighted total, ``count`` the total weight and ``avg`` the running mean.
    """
    def __init__(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n):
        """Record loss ``val`` observed over ``n`` samples and refresh the running mean."""
        self.val = val
        self.sum += val * n
        self.count += n
        # Guard against dividing by zero when nothing has been counted yet (n == 0).
        self.avg = self.sum / self.count if self.count else 0

In [None]:
class AccuracyTracker():
    """Accumulates correct/total prediction counts and reports their ratio."""
    def __init__(self):
        self.correct_predictions = 0.0
        self.total_predictions = 0.0

    def update(self, y_hat, y):
        """Add one batch of predictions.

        Args:
            y_hat: Tensor of predicted class ids.
            y: Tensor of true class ids, compared element-wise with ``y_hat``.
        """
        preds = y_hat.detach().cpu().numpy()
        labels = y.detach().cpu().numpy()

        # Keep the counters as plain Python floats instead of numpy scalars.
        self.correct_predictions += float((preds == labels).sum())
        self.total_predictions += len(preds)

    def score(self):
        """Return correct / total, or 0.0 when no predictions have been recorded."""
        if self.total_predictions == 0:
            return 0.0
        return self.correct_predictions / self.total_predictions

In [None]:
class ModelTracker():
    """Tracks the best validation metric, checkpoints on improvement and counts stale epochs.

    Whether larger or smaller values count as better is decided by ``config.mode``.
    """
    def __init__(self, model, optimizer, scheduler, accelerator):
        self.missed = 0
        self.model = model
        self.optimizer = optimizer
        self.scheduler = scheduler
        # Start from the worst possible value so the first update always counts as an improvement.
        self.metric = float("-inf") if config.mode == "maximize" else float("inf")
        self.accelerator = accelerator

    def save_helper(self, epoch):
        """Write model, optimizer and scheduler state for ``epoch`` to ``config.checkpoint``."""
        self.accelerator.save({
                    "epoch": epoch,
                    "model_state_dict": self.accelerator.unwrap_model(self.model).state_dict(),
                    "optimizer_state_dict": self.optimizer.state_dict(),
                    "scheduler": self.scheduler.state_dict()
                }, config.checkpoint)

        self.accelerator.print(f"Saved model to {config.checkpoint}!")

    def save_model(self, epoch):
        """Checkpoint the current state (thin wrapper around ``save_helper``)."""
        self.save_helper(epoch)

    def update(self, value, epoch):
        """Compare ``value`` with the best metric so far; checkpoint on improvement.

        A tie with the current best counts as an improvement. When there is no
        improvement the ``missed`` counter is incremented instead.
        """
        maximize = config.mode == "maximize"
        improved = value >= self.metric if maximize else value <= self.metric
        # The metric "rose" when it improved while maximizing or worsened while minimizing.
        direction = "rose" if improved == maximize else "fell"
        self.accelerator.print(f"Validation {config.metric_name} {direction} from {self.metric:.4f} to {value:.4f} on epoch {epoch}")

        if improved:
            self.metric = value
            self.save_model(epoch)
            self.missed = 0
        else:
            self.accelerator.print(f"Model did not improve on epoch {epoch}")
            self.missed += 1

    def check_improvement(self):
        """Return True while the number of stale epochs is below ``config.patience``."""
        return self.missed < config.patience

# 2) Data Loading

## 2.1) Datasets and Collate Functions

In [None]:
class TrainData(Dataset):
    """Dataset over a DataFrame that yields (premise, hypothesis, label) triples.

    Args:
        df: DataFrame providing ``premise``, ``hypothesis`` and ``label`` columns.
    """
    def __init__(self, df):
        self.df = df
        # TODO: back-translation augmentation for English rows (see config.back_translate)
        # was only sketched in commented-out code and is not implemented.

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return the premise, hypothesis and label of the row at position ``index``."""
        selection = self.df.iloc[index]
        return selection["premise"], selection["hypothesis"], selection["label"]
    

In [None]:
class TestData(Dataset):
    """Dataset over a DataFrame that yields (premise, hypothesis, label, language) tuples."""

    _FIELDS = ("premise", "hypothesis", "label", "language")

    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return the four fields of the row at position ``index`` as a tuple."""
        row = self.df.iloc[index]
        return tuple(row[field] for field in self._FIELDS)
    

In [None]:
def train_collate_dynamic_padding(batch):
    """Collate dataset samples into tokenized inputs and a label tensor.

    Each sample supplies the two sentences in positions 0-1 and the label in
    position 2. Padding and truncation follow ``config.padding`` and
    ``config.truncation``.

    Returns:
        Tuple of (tokenizer output, integer label tensor).
    """
    samples = np.array(batch, dtype = "object")
    sentence_pairs = samples[:, 0:2].tolist()
    targets = samples[:, 2].astype(int)

    encoded = config.tokenizer(
        sentence_pairs,
        padding=config.padding,
        truncation = config.truncation,
        return_tensors="pt",
    )
    return encoded, torch.tensor(targets)

In [None]:
def test_collate_dynamic_padding(batch):
    """Collate dataset samples into tokenized inputs, a label tensor and the languages.

    Each sample supplies the two sentences in positions 0-1, the label in
    position 2 and the language in position 3. Padding and truncation follow
    ``config.padding`` and ``config.truncation``.

    Returns:
        Tuple of (tokenizer output, integer label tensor, object array of languages).
    """
    samples = np.array(batch, dtype = "object")
    sentence_pairs = samples[:, 0:2].tolist()
    targets = samples[:, 2].astype(int)
    sample_languages = samples[:, 3]

    encoded = config.tokenizer(
        sentence_pairs,
        padding=config.padding,
        truncation = config.truncation,
        return_tensors="pt",
    )
    return encoded, torch.tensor(targets), sample_languages

# 3) Model

In [None]:
class Model(nn.Module):
    """Three-way classifier built on the encoder of the pretrained ``config.model_name`` model.

    Args:
        train_len: Stored on the instance as ``train_len``; the notebook passes
            the number of batches in the training DataLoader.
    """
    def __init__(self, train_len):
        super().__init__()
        self.train_len = train_len
        # Only the encoder half of the pretrained model is kept.
        pretrained = AutoModel.from_pretrained(config.model_name)
        self.base_model = pretrained.encoder
        self.dropout = nn.Dropout(p = config.dropout)
        self.fc = nn.Linear(self.base_model.config.hidden_size, 3)

    def feature(self, inputs):
        """Return the encoder's hidden state at the first sequence position."""
        hidden_states = self.base_model(**inputs)["last_hidden_state"]
        first_position = hidden_states[:, 0, :]
        return first_position

    def forward(self, inputs):
        """Return the classification logits for the tokenized ``inputs``."""
        pooled = self.feature(inputs)
        regularized = self.dropout(pooled)
        return self.fc(regularized)

# 4) Training Loop

## 4.1) Data Preparation

In [None]:
# Read the competition training set, then hold out `config.test_size` of it.
train = pd.read_csv(f"{config.base_path}/train.csv")
# The split is stratified on the language code (`lang_abv`), not on the label.
# NOTE: `train` is rebound to the training portion; `test` is the held-out portion.
train, test = train_test_split(train, test_size = config.test_size, stratify = train["lang_abv"], random_state = config.seed)

## 4.2) Criteria + Optimizers + Schedulers

In [None]:
def generateConfusionMatrices(val_preds):
    """Log one W&B confusion matrix per language.

    Args:
        val_preds: DataFrame with columns ``y`` (true label), ``y_hat``
            (predicted label) and ``langs`` (language of each sample).
    """
    # The competition encodes labels as 0 = entailment, 1 = neutral, 2 = contradiction,
    # so the class names must be listed in that order.
    class_names = ["entailment", "neutral", "contradiction"]

    for lang in val_preds.langs.unique():
        filtered_df = val_preds.loc[val_preds.langs == lang]
        y = list(filtered_df.y.astype(int))
        y_hat = list(filtered_df.y_hat.astype(int))

        wandb.log({f"{lang} Confusion Matrix": wandb.plot.confusion_matrix(y_true=y, preds=y_hat, class_names=class_names, title = f"{lang} Confusion Matrix", )})

In [None]:
def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, accelerator):
    """Run one training epoch with gradient accumulation handled by ``accelerator``.

    Args:
        train_loader: DataLoader yielding ``(inputs, labels)`` batches.
        model: Model to train; called as ``model(inputs)`` and expected to return logits.
        criterion: Loss function called as ``criterion(logits, labels)``.
        optimizer: Optimizer stepped once per accumulation cycle.
        epoch: Epoch index, used for progress and log messages only.
        scheduler: Learning-rate scheduler, or ``None`` to train without one.
        accelerator: The ``Accelerator`` that prepared the objects above.
    """
    model.train()

    loss_tracker = LossTracker()
    accuracy_tracker = AccuracyTracker()

    progress_bar = tqdm(train_loader, desc = f"Training Loop Epoch: {epoch}")

    for batch in progress_bar:
        # accumulate() decides when gradients are synchronised; the prepared optimizer
        # and scheduler skip their step on the in-between micro-batches.
        with accelerator.accumulate(model):
            inputs, labels = batch

            for key, value in inputs.items():
                inputs[key] = value.to(accelerator.device)

            labels = labels.to(accelerator.device)
            batch_size = labels.size(0)

            logits = model(inputs)
            # PyTorch CrossEntropy uses the unnormalized logits
            train_loss = criterion(logits, labels)

            # Softmax is monotonic, so the arg-max of the logits is the predicted class.
            y_hat = logits.argmax(dim = 1)

            accuracy_tracker.update(y_hat, labels)
            loss_tracker.update(train_loss.item(), batch_size)

            # accelerator.backward() already divides the loss by the configured number of
            # accumulation steps; dividing here as well would shrink the gradients twice.
            accelerator.backward(train_loss)

            optimizer.step()
            if scheduler is not None:
                scheduler.step()
            optimizer.zero_grad()

            # Only log the learning rate on iterations where a real optimizer step happened.
            if scheduler is not None and accelerator.sync_gradients:
                for i, lr in enumerate(scheduler.get_last_lr()):
                    accelerator.log({f"Layer {i} Learning Rate": lr})

            avg_accuracy = accuracy_tracker.score()
            avg_loss = loss_tracker.avg
            step_loss = loss_tracker.val
            if scheduler is not None:
                learning_rate = scheduler.get_last_lr()[0]
            else:
                learning_rate = optimizer.param_groups[0]["lr"]

            # Scientific notation: a rate such as 2e-5 would print as 0.0000 with ".4f".
            text = f"Epoch: {epoch} | Average Training Accuracy: {avg_accuracy:.4f} | Average Training Loss: {avg_loss:.4f} | Step Training Loss: {step_loss:.4f} | Learning Rate: {learning_rate:.2e}"
            progress_bar.set_postfix_str(text)
            progress_bar.refresh()

            accelerator.log({f"Step Training Loss": step_loss})

    epoch_loss = loss_tracker.avg
    epoch_accuracy = accuracy_tracker.score()

    accelerator.log({f"Training Loss Epoch": epoch_loss})
    accelerator.log({f"Training Accuracy Epoch": epoch_accuracy})
    accelerator.print(f"Training Loss: {epoch_loss} | Training Accuracy: {epoch_accuracy}")


In [None]:
def valid_fn(val_loader, model, criterion, epoch, accelerator):
    """Evaluate ``model`` on ``val_loader`` and return the validation accuracy.

    Logits and labels are gathered from all processes before the metrics are
    computed, so every process sees the same accuracy and therefore makes the
    same checkpoint / early-stopping decision.

    Args:
        val_loader: DataLoader (prepared by ``accelerator``) yielding ``(inputs, labels)``.
        model: Model to evaluate; unwrapped via ``accelerator.unwrap_model`` before the call.
        criterion: Loss function called as ``criterion(logits, labels)``.
        epoch: Epoch index, used for progress and log messages only.
        accelerator: The ``Accelerator`` that prepared the loader and model.

    Returns:
        The accuracy over the validation data.
    """
    with torch.no_grad():
        model.eval()

        loss_tracker = LossTracker()
        accuracy_tracker = AccuracyTracker()

        progress_bar = tqdm(val_loader, desc = f"Validation Loop Epoch: {epoch}")

        for batch in progress_bar:

            inputs, labels = batch

            for key, value in inputs.items():
                inputs[key] = value.to(accelerator.device)

            labels = labels.to(accelerator.device)

            logits = accelerator.unwrap_model(model)(inputs)

            # Collect the results of every process so the metrics are global, not per-shard.
            logits, labels = accelerator.gather_for_metrics((logits, labels))
            batch_size = labels.size(0)

            # Softmax is monotonic, so the arg-max of the logits is the predicted class.
            y_hat = logits.argmax(dim = 1)

            val_loss = criterion(logits, labels)

            accuracy_tracker.update(y_hat, labels)
            loss_tracker.update(val_loss.item(), batch_size)

            avg_val_loss = loss_tracker.avg
            avg_val_acc = accuracy_tracker.score()

            progress_bar.set_postfix_str(f"Epoch: {epoch} | Average Validation Accuracy {avg_val_acc:.4f}| Average Validation Loss: {avg_val_loss:.4f}")
            progress_bar.refresh()


        epoch_loss = loss_tracker.avg
        epoch_accuracy = accuracy_tracker.score()

        accelerator.log({f"Validation Loss Epoch": epoch_loss})
        accelerator.log({f"Validation Accuracy Epoch": epoch_accuracy})
        accelerator.print(f"Validation Loss: {epoch_loss} | Validation Accuracy: {epoch_accuracy}")

    return epoch_accuracy

In [None]:
def test_fn(test_loader, model, checkpoint, accelerator):
    """Score ``model`` on the test data, save the predictions and log results to W&B.

    Args:
        test_loader: DataLoader yielding ``(inputs, labels, languages)`` batches.
        model: Model to evaluate; called as ``model(inputs)`` and expected to return logits.
        checkpoint: Unused; kept so existing callers keep working.
        accelerator: ``Accelerator`` providing the target device and rank-aware printing.

    Returns:
        The accuracy over the test data.
    """
    # Written and uploaded from the same location so the two can never disagree.
    predictions_path = "/kaggle/working/Test Predictions.csv"

    with torch.no_grad():
        model.to(accelerator.device)
        model.eval()

        accuracy_tracker = AccuracyTracker()

        preds = []
        actual = []
        langs = []

        progress_bar = tqdm(test_loader, desc = f"Test Loop")
        for batch in progress_bar:

            inputs, labels, languages = batch

            for key, value in inputs.items():
                inputs[key] = value.to(accelerator.device)

            labels = labels.to(accelerator.device)

            logits = model(inputs)
            # Softmax is monotonic, so the arg-max of the logits is the predicted class.
            y_hat = logits.argmax(dim = 1)
            accuracy_tracker.update(y_hat, labels)

            preds.extend(y_hat.detach().cpu().numpy())
            actual.extend(labels.detach().cpu().numpy())
            langs.extend(languages)

            avg_test_acc = accuracy_tracker.score()

            progress_bar.set_postfix_str(f"Average Test Accuracy {avg_test_acc}")
            progress_bar.refresh()

        test_accuracy = accuracy_tracker.score()

        wandb.log({f"Final Test Accuracy": test_accuracy})
        accelerator.print(f"Test Accuracy: {test_accuracy}")

        # Build the frame column by column so labels keep their integer dtype
        # (stacking everything into one array would turn every value into a string).
        test_preds = pd.DataFrame({"y_hat": preds, "y": actual, "langs": langs})
        test_preds.to_csv(predictions_path)

        # I need to run this because accelerator's API doesn't expose anything to directly save a csv.
        wandb.save(predictions_path)
        accelerator.print(f"Saved Test Predictions to {predictions_path}")

        generateConfusionMatrices(test_preds)

        return test_accuracy

In [None]:
# Validation Data and Test Data are both the same in this case!
# They are distinguished for logging purposes!
train_data, val_data, test_data = TrainData(train), TrainData(test), TestData(test)

# Getting sample weights for balancing data: each row is weighted by the inverse
# frequency of its language. The weights are computed without adding a column to `train`.
weights = (1 / train.language.value_counts()).to_dict()
sample_weights = train["language"].map(weights).tolist()

# Honour config.upsample: fall back to plain shuffling when up-sampling is disabled.
train_sampler = WeightedRandomSampler(sample_weights, len(train_data)) if config.upsample else None

train_data_loader = DataLoader(train_data, collate_fn = train_collate_dynamic_padding, batch_size = config.batch_size, pin_memory = True, num_workers = os.cpu_count(), sampler = train_sampler, shuffle = train_sampler is None)


# The validation loader uses the validation dataset (3-tuples) that was built for it above.
val_data_loader = DataLoader(val_data, collate_fn = train_collate_dynamic_padding, batch_size = config.batch_size, pin_memory = True, num_workers = os.cpu_count())
test_data_loader = DataLoader(test_data, collate_fn = test_collate_dynamic_padding, batch_size = config.batch_size, pin_memory = True, num_workers = os.cpu_count())

In [None]:
# model = torch.compile(Model(len(train_data_loader)))
# model = FullyShardedDataParallel(Model(len(train_data_loader)))
# Build the classifier; the number of training batches is stored on it as `train_len`.
model = Model(len(train_data_loader))

In [None]:
# lora_alpha is taken from the config instead of being left at the library default.
# modules_to_save keeps the freshly initialised classification head (`fc`) trainable and
# checkpointed; otherwise PEFT freezes it together with the rest of the base model.
lora_config = LoraConfig(r = config.r, lora_alpha = config.lora_alpha, bias = config.bias, lora_dropout = config.lora_dropout, target_modules=["q", "k", "v", "o"], modules_to_save=["fc"])

In [None]:
# Wrap the classifier with the LoRA adapters described by `lora_config`.
# NOTE: this rebinds `model`, so re-running the cell wraps the already-wrapped model again;
# re-run the model construction cell first.
model = get_peft_model(model, lora_config)

In [None]:
def train_loop(model):
    """Train ``model`` with early stopping; intended to run once per process.

    Builds the ``Accelerator``, W&B tracker, loss, optimizer and cosine schedule,
    then alternates training and validation epochs until ``config.epochs`` is
    reached or the validation metric stops improving.

    Args:
        model: Model exposing ``train_len`` (number of training batches per epoch).

    Raises:
        ValueError: If ``config.device_type`` is not one of the supported values.
    """
    seed_everything(config.seed)
    ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
    # "gpu" is accepted as well: the batch-size cell uses that spelling for a single GPU.
    if config.device_type in ("gpu", "cuda", "gpus"):
        accelerator = Accelerator(mixed_precision = "fp16", gradient_accumulation_steps = config.grad_accum, log_with = "wandb", kwargs_handlers=[ddp_kwargs])
    elif config.device_type == "tpu":
        accelerator = Accelerator(mixed_precision = "bf16", gradient_accumulation_steps = config.grad_accum, downcast_bf16=True, log_with = "wandb", kwargs_handlers=[ddp_kwargs])
    else:
        raise ValueError(f"Unsupported config.device_type: {config.device_type!r}")
    config.device = accelerator.device

    accelerator.init_trackers(
        "My Dear Watson",
        config=dict_from_class(config),
        init_kwargs={
            "wandb": {
                "group": config.name,
                "reinit": False,
                "job_type": config.model_name,
                "name": f"Seed {config.seed}",
                "entity": "uw-kaggle",
            }
        },
    )
    ######################################################################
    criterion = nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), weight_decay = config.weight_decay, lr = config.lr, correct_bias = True)

    # The scheduler counts optimizer steps, so both quantities are expressed in that unit;
    # warm-up is config.warmup_pct of the whole schedule, as a whole number of steps.
    num_training_steps = model.train_len * config.epochs // config.grad_accum
    warmup_steps = int(num_training_steps * config.warmup_pct)

    scheduler = transformers.get_cosine_schedule_with_warmup(optimizer, num_warmup_steps = warmup_steps, num_training_steps = num_training_steps)
    model, optimizer, train_loader, val_loader, scheduler = accelerator.prepare(model, optimizer, train_data_loader, val_data_loader, scheduler)
    ######################################################################
    tracker = ModelTracker(model, optimizer, scheduler, accelerator)

    for epoch in range(config.epochs):

        train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, accelerator)

        val_accuracy = valid_fn(val_loader, model, criterion, epoch, accelerator)

        accelerator.wait_for_everyone()
        tracker.update(val_accuracy, epoch)

        if not tracker.check_improvement():
            # accelerator.print emits the message once instead of once per process.
            accelerator.print(f"Stopping the model at epoch {epoch} since the model did not improve!")
            break

    accelerator.wait_for_everyone()

    gc.collect()

    torch.cuda.empty_cache()

In [None]:
# Launch training: one process per device for TPU / multi-GPU, in-process otherwise.
if config.device_type == "tpu":
    # Ignore print message that says it's running on 8 GPUs
    notebook_launcher(train_loop, (model,), num_processes = 8)
    
elif config.device_type == "gpus":
    # One process per visible CUDA device.
    notebook_launcher(train_loop, (model,), num_processes = torch.cuda.device_count())
    
else:
    train_loop(model)

# 5) Getting Test Predictions

In [None]:
# Fresh accelerator for the test phase; it resumes the W&B run created during training.
accelerator = Accelerator(log_with = "wandb")
accelerator.init_trackers(
        "My Dear Watson",
        config=dict_from_class(config),
        init_kwargs={
            "wandb": {
                "group": config.name,
                "reinit": False,
                "resume": True,
                "job_type": config.model_name,
                "name": f"Seed {config.seed}",
                "entity": "uw-kaggle",
            }
        },
    )

In [None]:
# Restore the weights stored under "model_state_dict" in the checkpoint file.
saved = torch.load(config.checkpoint)
# Unwrap first so the state-dict keys are matched against the bare model.
model = accelerator.unwrap_model(model)
model.load_state_dict(saved["model_state_dict"])
model = torch.compile(model)

In [None]:
# Let the accelerator prepare the model and the test DataLoader.
model, test_loader = accelerator.prepare(model, test_data_loader)

In [None]:
# Score the restored model on the held-out split; the returned accuracy is the cell output.
test_fn(test_loader, model, config.checkpoint, accelerator)

In [None]:
# Tell the accelerator that the run is finished.
accelerator.end_training()