# 1) Getting Setup

In [1]:
!pip install wandb
!pip install jupyter --upgrade
!pip install ipywidgets widgetsnbextension --upgrade
!pip install -q peft

Collecting jupyter
  Downloading jupyter-1.0.0-py2.py3-none-any.whl (2.7 kB)
Installing collected packages: jupyter
Successfully installed jupyter-1.0.0
Collecting ipywidgets
  Obtaining dependency information for ipywidgets from https://files.pythonhosted.org/packages/4a/0e/57ed498fafbc60419a9332d872e929879ceba2d73cb11d284d7112472b3e/ipywidgets-8.1.1-py3-none-any.whl.metadata
  Downloading ipywidgets-8.1.1-py3-none-any.whl.metadata (2.4 kB)
Collecting widgetsnbextension
  Obtaining dependency information for widgetsnbextension from https://files.pythonhosted.org/packages/29/03/107d96077c4befed191f7ad1a12c7b52a8f9d2778a5836d59f9855c105f6/widgetsnbextension-4.0.9-py3-none-any.whl.metadata
  Downloading widgetsnbextension-4.0.9-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab-widgets~=3.0.9 (from ipywidgets)
  Obtaining dependency information for jupyterlab-widgets~=3.0.9 from https://files.pythonhosted.org/packages/e8/05/0ebab152288693b5ec7b339aab857362947031143b282853b4c2dd4b5b

In [2]:
from accelerate import Accelerator
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig
import wandb
import transformers
import torch
import glob
import pandas as pd
from tqdm import tqdm
import numpy as np
import os
import random
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from transformers import AutoModel, AutoTokenizer, AutoConfig
from torch import nn
import sys
import gc
from transformers import DataCollatorWithPadding
from transformers import AdamW
from accelerate import notebook_launcher
from sklearn.model_selection import train_test_split
from accelerate import DistributedDataParallelKwargs
import time
import re
from transformers import get_cosine_schedule_with_warmup



In [3]:
# Uncomment to enable Fully Sharded Data Parallel
# os.environ["ACCELERATE_USE_FSDP"] = "true"

In [4]:
# From this Gist: https://gist.github.com/ihoromi4/b681a9088f348942b01711f251e5f964
def seed_everything(seed: int):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

In [5]:
def dict_from_class(cls):
    return dict((key, value) for (key, value) in cls.__dict__.items() if not "__" in key )

In [6]:
class config:
    # General Configuration
    seed = 42
    base_path = "/kaggle/input/my-dear-watson-backtranslated-dataset"
    mode = "maximize"
    device_type = "gpus"
    
    # WandB Configuration
    name = "Multilingual Models"
#     model_name = "facebook/xlm-roberta-xl"
#     model_name = "bert-base-multilingual-cased"
    model_name = "xlm-roberta-large"
    metric_name = "accuracy"

    # Training Hyperparameters
    lr = 1e-4
    epochs = 40
    patience = 8
    grad_accum = 4
    grad_norm = 1.0           # Gradient Clipping
    optimizer = "AdamW"
    scheduler = "Cosine"
    weight_decay = 0.3
    pearson_weight = 0.0      # Percent of weight to put onto Pearson Correlation (Do not use due to NAN losses)
    warmup = 0.1
    mean_max_sampling = False # Whether or not to use mean-max sampling on the final BERT layer (Don't use due to instability)
    
    # Data Configuration
    truncation = True
    padding = True
    test_size = 0.2
    back_translate = 0.5      # Percent of time to back translate
    upsample = False          # Whether or not to upsample (Don't turn this on as the train distribution will vary from test.)
    
    # LoRA hyperparameters
    r = 8
    lora_alpha = 16
    lora_dropout = 0.0        # Dropout on LoRA Layers (Do not use as transformer already has dropout on by default)
    bias = "all"
        
config.tokenizer = AutoTokenizer.from_pretrained(config.model_name)
config.checkpoint = f"/kaggle/working/{config.model_name}.pt"

Downloading config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [7]:
if config.device_type == "gpu":
    config.batch_size = 8
elif config.device_type == "gpus":
    config.batch_size = 8
elif config.device_type == "tpu":
    # Batch of 128 for each TPU core
    config.batch_size = 128

In [8]:
# Signing into WandB
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("wandb")

!wandb login $secret_value_0

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


## 1.1) Trackers

In [9]:
class LossTracker:
    def __init__(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [10]:
class AccuracyTracker():
    def __init__(self):
        self.correct_predictions = 0.0
        self.total_predictions = 0.0
        
    def update(self, y_hat, y):
        preds = y_hat.detach().cpu().numpy()
        labels = y.detach().cpu().numpy()
        
        n = len(preds)
        self.correct_predictions += (preds == labels).sum()        
        self.total_predictions += n
    
    def score(self):
        return self.correct_predictions / self.total_predictions

In [11]:
class ModelTracker():
    def __init__(self, model, optimizer, scheduler, accelerator):
        self.missed = 0
        self.model = model
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.metric = float("-inf") if config.mode == "maximize" else float("inf")
        self.accelerator = accelerator
        
    def save_helper(self, epoch):
        self.accelerator.save({
                    "epoch": epoch, 
                    "model_state_dict": self.accelerator.unwrap_model(self.model).state_dict(), 
                    "optimizer_state_dict": self.optimizer.state_dict(),
                    "scheduler": self.scheduler.state_dict()
                }, config.checkpoint)

        self.accelerator.print(f"Saved to model to {config.checkpoint}!")
        
    def save_model(self, epoch):
        self.save_helper(epoch)
        

    def update(self, value, epoch):
        if config.mode == "maximize":
            if value >= self.metric:
                self.accelerator.print(f"Validation {config.metric_name} rose from {self.metric:.4f} to {value:.4f} on epoch {epoch}")
                self.metric = value
                self.save_model(epoch)    
                self.missed = 0

            else:
                self.accelerator.print(f"Validation {config.metric_name} fell from {self.metric:.4f} to {value:.4f} on epoch {epoch}")
                self.accelerator.print(f"Model did not improve on epoch {epoch}")
                self.missed += 1
        else:
            if value <= self.metric:
                self.accelerator.print(f"Validation {config.metric_name} fell from {self.metric:.4f} to {value:.4f} on epoch {epoch}")
                self.metric = value
                self.save_model(epoch) 
                self.missed = 0

            else:
                self.accelerator.print(f"Validation {config.metric_name} rose from {self.metric:.4f} to {value:.4f} on epoch {epoch}")
                self.accelerator.print(f"Model did not improve on epoch {epoch}")
                self.missed += 1
        
    def check_improvement(self):
        return self.missed < config.patience

# 2) Data Loading

# 2.1) Data Loading

In [12]:
class TrainData(Dataset):
    def __init__(self, df):
        self.df = df
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        selection = self.df.iloc[index]
    
        # Back Translation Code
        premise = selection["bt_premise"] if np.random.uniform() <= config.back_translate else selection["premise"]
        hypothesis = selection["bt_hypothesis"] if np.random.uniform() <= config.back_translate else selection["hypothesis"]
        
        return premise, hypothesis, selection["label"]

In [13]:
class TestData(Dataset):
    def __init__(self, df):
        self.df = df
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        selection = self.df.iloc[index]
        return selection["premise"], selection["hypothesis"], selection["label"], selection["language"]
    

In [14]:
# Turns each batch of data into tensors
def train_collate_dynamic_padding(batch):
    batch = np.array(batch, dtype = "object")
    text_input = batch[:, 0:2].tolist()
    labels = batch[:, 2].astype(int)
    
    tokens = config.tokenizer(text_input, padding=config.padding, truncation = config.truncation, return_tensors="pt")
    return tokens, torch.tensor(labels)

In [15]:
# Turns each batch of data into tensors
def test_collate_dynamic_padding(batch):
    batch = np.array(batch, dtype = "object")
    text_input = batch[:, 0:2].tolist()
    labels = batch[:, 2].astype(int)
    languages = batch[:, 3]
    
    tokens = config.tokenizer(text_input, padding=config.padding, truncation = config.truncation, return_tensors="pt")
    return tokens, torch.tensor(labels), languages

# 3) Model

In [16]:
class Model(nn.Module):
    def __init__(self, train_len):
        super(Model, self).__init__()
        self.train_len = train_len
        self.base_model = AutoModel.from_pretrained(config.model_name)
        if config.mean_max_sampling:
            self.fc = nn.Linear(2 * self.base_model.config.hidden_size, 3)
        else:
            self.fc = nn.Linear(self.base_model.config.hidden_size, 3)
    
    def feature(self, inputs):
        features = self.base_model(**inputs)["last_hidden_state"]
        # Taking the mean and max of all last hidden state tokens
        if config.mean_max_sampling:
            mean_pooling_embeddings = torch.mean(features, 1)
            _, max_pooling_embeddings = torch.max(features, 1)
            mean_max_embeddings = torch.cat((mean_pooling_embeddings, max_pooling_embeddings), 1)

            return mean_max_embeddings
        else:
            return features[:, 0, :]
    
    def forward(self, inputs):
        features = self.feature(inputs)
        
        return self.fc(features)

# 4) Training Loop

## 4.1) Data Preparation

In [17]:
train = pd.read_csv(f"{config.base_path}/train.csv")
train, test = train_test_split(train, test_size = config.test_size, stratify = train["lang_abv"], random_state = config.seed)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [18]:
# Validation Data and Test Data are both the same in this case!
# They are distinguished for logging purposes!
train_data, val_data, test_data = TrainData(train), TrainData(test), TestData(test)

if config.upsample:
# Getting sample weights for balancing data
    weights = (1 / train.language.value_counts()).to_dict()
    train["weight"] = train.apply(lambda row: weights[row.language], axis = 1)
    sample_weights = list(train["weight"])

    train_sampler = WeightedRandomSampler(sample_weights, len(train_data))

    train_data_loader = DataLoader(train_data, collate_fn = train_collate_dynamic_padding, batch_size = config.batch_size, pin_memory = True, num_workers = os.cpu_count(), sampler = train_sampler)
    
else:
    train_data_loader = DataLoader(train_data, collate_fn = train_collate_dynamic_padding, batch_size = config.batch_size, pin_memory = True, num_workers = os.cpu_count(), shuffle = True)


val_data_loader = DataLoader(test_data, collate_fn = train_collate_dynamic_padding, batch_size = config.batch_size, pin_memory = True, num_workers = os.cpu_count())
test_data_loader = DataLoader(test_data, collate_fn = test_collate_dynamic_padding, batch_size = config.batch_size, pin_memory = True, num_workers = os.cpu_count())

## 4.2) Training Functions

In [19]:
def generateConfusionMatrices(val_preds):
    for lang in val_preds.langs.unique():
        filtered_df = val_preds.loc[val_preds.langs == lang]
        y = list(filtered_df.y.astype(int))
        y_hat = list(filtered_df.y_hat.astype(int))
        
        wandb.log({f"{lang} Confusion Matrix": wandb.plot.confusion_matrix(y_true=y, preds=y_hat, class_names=["entailment", "contradiction", "neutral"], title = f"{lang} Confusion Matrix", )})

In [20]:
def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, accelerator):
    model.train()
    model.to(config.device)
    
    loss_tracker = LossTracker()
    accuracy_tracker = AccuracyTracker()

    progress_bar = tqdm(train_loader, desc = f"Training Loop Epoch: {epoch}")
    
    average_acc = None
    
    for batch_idx, batch in enumerate(progress_bar):
        with accelerator.accumulate(model):
            inputs, labels = batch

            for key, value in inputs.items():
                inputs[key] = value.to(config.device)

            labels = labels.to(config.device)
            batch_size = labels.size(0)

            logits = model(inputs)
            # PyTorch CrossEntropy uses the unnormalized logits
            train_loss = criterion(logits, labels)
            scaled_loss = train_loss / config.grad_accum

            y_hat = torch.nn.functional.softmax(logits, dim = 1)
            y_hat = y_hat.argmax(dim = 1)

            accuracy_tracker.update(y_hat, labels)

            loss_tracker.update(train_loss.item(), batch_size)

            accelerator.backward(scaled_loss)

            if ((batch_idx + 1) % config.grad_accum == 0) or (batch_idx + 1 == len(train_loader)):

                # Clip gradients once all of them are synced to main process
                if accelerator.sync_gradients:
                    accelerator.clip_grad_norm_(model.parameters(), config.grad_norm)
                    
                optimizer.step()

                optimizer.zero_grad()

                if not scheduler is None:
                    scheduler.step()
                    for i, lr in enumerate(scheduler.get_last_lr()):
                        accelerator.log({f"Layer {i} Learning Rate": lr})
                        
                avg_accuracy = accuracy_tracker.score()
                avg_loss = loss_tracker.avg
                step_loss = loss_tracker.val
                learning_rate = scheduler.get_last_lr()[0]

                text = f"Epoch: {epoch} | Average Training Accuracy: {avg_accuracy:.4f} | Average Training Loss: {avg_loss:.4f} | Step Training Loss: {step_loss:.4f} | Learning Rate: {learning_rate:.4f}"
                progress_bar.set_postfix_str(text)
                progress_bar.refresh()

                accelerator.log({f"Step Training Loss": step_loss})
                

    epoch_loss = loss_tracker.avg
    epoch_accuracy = accuracy_tracker.score()

    accelerator.log({f"Training Loss Epoch": epoch_loss})
    accelerator.log({f"Training Accuracy Epoch": epoch_accuracy})
    accelerator.print(f"Training Loss: {epoch_loss} | Training Accuracy: {epoch_accuracy}")


In [21]:
def valid_fn(val_loader, model, criterion, epoch, accelerator):
    with torch.no_grad():
        model.eval()
        model.to(config.device)

        loss_tracker = LossTracker()
        accuracy_tracker = AccuracyTracker()

        progress_bar = tqdm(val_loader, desc = f"Validation Loop Epoch: {epoch}")

        for batch_idx, batch in enumerate(progress_bar):

            inputs, labels = batch

            for key, value in inputs.items():
                inputs[key] = value.to(config.device)

            labels = labels.to(config.device)
            batch_size = labels.size(0)

            logits = accelerator.unwrap_model(model)(inputs)
            y_hat = torch.nn.functional.softmax(logits, dim = 1)
            y_hat = y_hat.argmax(dim = 1)

            val_loss = criterion(logits, labels)

            accuracy_tracker.update(y_hat, labels)
            loss_tracker.update(val_loss.item(), batch_size)

            avg_val_loss = loss_tracker.avg
            avg_val_acc = accuracy_tracker.score()

            progress_bar.set_postfix_str(f"Epoch: {epoch} | Average Validation Accuracy {avg_val_acc:.4f}| Average Validation Loss: {avg_val_loss:.4f}")
            progress_bar.refresh()


        epoch_loss = loss_tracker.avg
        epoch_accuracy = accuracy_tracker.score()

        accelerator.log({f"Validation Loss Epoch": epoch_loss})
        accelerator.log({f"Validation Accuracy Epoch": epoch_accuracy})
        accelerator.print(f"Validation Loss: {epoch_loss} | Validation Accuracy: {epoch_accuracy}")
    
    return epoch_accuracy

In [22]:
def test_fn(test_loader, model, checkpoint, accelerator):
    with torch.no_grad():
        model.to(accelerator.device)
        model.eval()

        accuracy_tracker = AccuracyTracker()

        preds = []
        actual = []
        langs = []

        progress_bar = tqdm(test_loader, desc = f"Test Loop")
        for batch_idx, batch in enumerate(progress_bar):

            inputs, labels, languages = batch

            for key, value in inputs.items():
                inputs[key] = value.to(accelerator.device)

            labels = labels.to(acclerator.device)
            batch_size = labels.size(0)

            logits = model(inputs)
            y_hat = torch.nn.functional.softmax(logits, dim = 1)
            y_hat = y_hat.argmax(dim = 1)
            accuracy_tracker.update(y_hat, labels)

            preds.extend(y_hat.detach().cpu().numpy())
            actual.extend(labels.detach().cpu().numpy())
            langs.extend(languages)


            avg_val_acc = accuracy_tracker.score()

            progress_bar.set_postfix_str(f"Average Test Accuracy {avg_val_acc}")
            progress_bar.refresh()

        test_accuracy = accuracy_tracker.score()

        wandb.log({f"Final Test Accuracy": test_accuracy})
        accelerator.print(f"Test Accuracy: {test_accuracy}")

        test_preds = pd.DataFrame(np.array([preds, actual, langs]).T, columns = ["y_hat", "y", "langs"])
        test_preds.to_csv("Test Predictions.csv")
        
        # I need to run this because accelerator's API doesn't expose anything to directly save a csv.
        wandb.save(f"/kaggle/working/Test Predictions.csv")
        accelerator.print("Saved Test Predictions to /kaggle/working/Test Predictions.csv")

        generateConfusionMatrices(test_preds)

        return test_accuracy

In [23]:
def getCustomLoss():
    def customLoss(output, target):
        ce = nn.CrossEntropyLoss()
        x = output.argmax(dim = 1).to(dtype = float)
        y = target.to(dtype = float)

        vx = x - torch.mean(x)
        vy = y - torch.mean(y)

        pearsonCost = 1.0 - torch.sum(vx * vy) / (torch.norm(vx) * torch.norm(vy) + 1e-14)
        ceCost = ce(output, target)

        return ceCost + pearsonCost * config.pearson_weight
    return customLoss

## 4.3) Model Loading

In [24]:
# model = torch.compile(Model(len(train_data_loader)))
unwrapped_model = Model(len(train_data_loader))
# model = Model(len(train_data_loader))

Downloading model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

In [25]:
model_modules = str(unwrapped_model.modules)
pattern = r'\((\w+)\): Linear'
linear_layer_names = re.findall(pattern, model_modules)

names = []
# Print the names of the Linear layers
for name in linear_layer_names:
    names.append(name)
target_modules = list(set(names))

In [26]:
print(target_modules)

['query', 'key', 'value', 'dense', 'fc']


In [27]:
lora_config = LoraConfig(r = config.r, lora_alpha=config.lora_alpha, bias = config.bias, lora_dropout = config.lora_dropout, target_modules=target_modules)
model = get_peft_model(unwrapped_model, lora_config)

## 4.4) Train Loop

In [28]:
def train_loop(model):
    seed_everything(config.seed)
    ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
    if config.device_type == "gpu" or config.device_type == "gpus":
        accelerator = Accelerator(mixed_precision = "fp16", gradient_accumulation_steps = config.grad_accum, log_with = "wandb", kwargs_handlers=[ddp_kwargs])
        config.device = accelerator.device
    elif config.device_type == "tpu":
        accelerator = Accelerator(mixed_precision = "bf16", gradient_accumulation_steps = config.grad_accum, log_with = "wandb", kwargs_handlers=[ddp_kwargs])
        config.device = accelerator.device

    accelerator.init_trackers(
        "My Dear Watson",
        config=dict_from_class(config),
        init_kwargs={
            "wandb": {
                "group": config.name,
                "reinit": False,
                "job_type": config.model_name,
                "name": f"Seed {config.seed}",
                "entity": "uw-kaggle",
            }
        },
    )
    ######################################################################
    criterion = getCustomLoss()
    optimizer = AdamW(model.parameters(), weight_decay = config.weight_decay, lr = config.lr, correct_bias = True)

    num_training_steps = model.train_len * config.epochs // config.grad_accum

    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps = num_training_steps * config.warmup, num_training_steps = num_training_steps)
    model, optimizer, train_loader, val_loader, scheduler = accelerator.prepare(model, optimizer, train_data_loader, val_data_loader, scheduler)
    ######################################################################
    tracker = ModelTracker(model, optimizer, scheduler, accelerator)
    
    for epoch in range(config.epochs):

        train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, accelerator)

        val_accuracy = valid_fn(val_loader, model, criterion, epoch, accelerator)

        accelerator.wait_for_everyone()
        tracker.update(val_accuracy, epoch)

        if not tracker.check_improvement():
            print(f"Stopping the model at epoch {epoch} since the model did not improve!")
            break
    
    accelerator.wait_for_everyone()

    gc.collect()

    torch.cuda.empty_cache()

In [None]:
if config.device_type == "tpu":
    # Ignore print message that says it's running on 8 GPUs
    notebook_launcher(train_loop, (model,), num_processes = 8)
    
elif config.device_type == "gpus":
    notebook_launcher(train_loop, (model,), num_processes = torch.cuda.device_count())
    
else:
    train_loop(model)

Launching training on one GPU.


[34m[1mwandb[0m: Currently logged in as: [33mgoggins[0m ([33muw-kaggle[0m). Use [1m`wandb login --relogin`[0m to force relogin


Training Loop Epoch: 0:  18%|█▊        | 219/1212 [00:49<03:39,  4.53it/s, Epoch: 0 | Average Training Accuracy: 0.3559 | Average Training Loss: 1.1611 | Step Training Loss: 1.2702 | Learning Rate: 0.0000]

In [None]:
%debug

# 5) Getting Test Predictions

In [None]:
accelerator = Accelerator(log_with = "wandb")
accelerator.init_trackers(
        "My Dear Watson",
        config=dict_from_class(config),
        init_kwargs={
            "wandb": {
                "group": config.name,
                "reinit": False,
                "resume": True,
                "job_type": config.model_name,
                "name": f"Seed {config.seed}",
                "entity": "uw-kaggle",
            }
        },
    )

In [None]:
saved = torch.load(config.checkpoint)
# model = accelerator.unwrap_model(model)
model.load_state_dict(saved["model_state_dict"])

In [None]:
model, test_loader = accelerator.prepare(model, test_data_loader)

In [None]:
test_fn(test_loader, model, config.checkpoint, accelerator)

In [None]:
accelerator.end_training()