# 1) Getting Setup

In [1]:
import wandb
import transformers
import torch
import glob
import pandas as pd
import numpy as np
import os
import random
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer, AutoConfig
from torch import nn
import sys
from tqdm.notebook import tqdm
import gc
from transformers import DataCollatorWithPadding
from transformers import AdamW
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
import plotly.express as px
import plotly.graph_objects as go



In [2]:
# From this Gist: https://gist.github.com/ihoromi4/b681a9088f348942b01711f251e5f964
def seed_everything(seed: int):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

In [3]:
def dict_from_class(cls):
    return dict((key, value) for (key, value) in cls.__dict__.items() if not "__" in key )

In [4]:
class config:
    # General Configuration
    seed = 42
    device = "cuda"
    base_path = "/kaggle/input/contradictory-my-dear-watson"
    mode = "maximize"
    patience = 8
    
    # WandB Configuration
    name = "Baseline"
    model_name = "bert-base-multilingual-cased"
    metric_name = "accuracy"

    # Training Configuration
        # EDA shows no sentence longer than 196 words!
        
    lr = 1e-4
    epochs = 40
    patience = 8
    grad_accum = 1
    optimizer = "AdamW"
    scheduler = "cosine"
    batch_size = 32
    warmup_pct = 0.1
    weight_decay = 0.0
    
    # Data Configuration
    max_length = 256
    truncation = True
    padding = True
    test_size = 0.2
        
config.tokenizer = AutoTokenizer.from_pretrained(config.model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [5]:
# Signing into WandB
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("wandb")

!wandb login $secret_value_0

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


## 1.1) Trackers

In [6]:
class LossTracker:
    def __init__(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [7]:
class AccuracyTracker():
    def __init__(self):
        self.correct_predictions = 0.0
        self.total_predictions = 0.0
        
    def update(self, y_hat, y):
        preds = y_hat.detach().cpu().numpy()
        labels = y.detach().cpu().numpy()
        
        n = len(preds)
        self.correct_predictions += (preds == labels).sum()        
        self.total_predictions += n
    
    def score(self):
        return self.correct_predictions / self.total_predictions

In [8]:
class ModelTracker():
    def __init__(self, model, path, optimizer, scheduler, base_path = "/kaggle/working"):
        self.missed = 0
        self.path = path
        self.model = model
        self.base_path = base_path
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.metric = float("-inf") if config.mode == "maximize" else float("inf")
        
    def save_helper(self, epoch):
        torch.save({
                    "epoch": epoch, 
                    "model_state_dict": self.model.state_dict(), 
                    "optimizer_state_dict": self.optimizer.state_dict(),
                    "scheduler": self.scheduler.state_dict()
                }, f"{self.base_path}/{self.path}")

        print(f"Saved to model to {config.base_path}/{self.path}!")
        
    def save_model(self, epoch):
        self.save_helper(epoch)
        

    def update(self, value, epoch):
        if config.mode == "maximize":
            if value >= self.metric:
                print(f"Validation {config.metric_name} rose from {self.metric:.4f} to {value:.4f} on epoch {epoch}")
                self.metric = value
                self.save_model(epoch)    
                self.missed = 0

            else:
                print(f"Validation {config.metric_name} fell from {self.metric:.4f} to {value:.4f} on epoch {epoch}")
                print(f"Model did not improve on epoch {epoch}")
                self.missed += 1
        else:
            if value <= self.metric:
                print(f"Validation {config.metric_name} fell from {self.metric:.4f} to {value:.4f} on epoch {epoch}")
                self.metric = value
                self.save_model(epoch) 
                self.missed = 0

            else:
                print(f"Validation {config.metric_name} rose from {self.metric:.4f} to {value:.4f} on epoch {epoch}")
                print(f"Model did not improve on epoch {epoch}")
                self.missed += 1

    def get_full_path(self):
        return f"{self.base_path}/{self.path}"
        
    def check_improvement(self):
        return self.missed < config.patience

# 1.2) Data Visualization

In [9]:
def compare_relative_distributions(x1, x2):
    fig = go.Figure()

    # Create the first histogram
    fig.add_trace(go.Histogram(
        x = x1, histnorm = "percent"
    ))

    # Create the second histogram
    fig.add_trace(go.Histogram(
        x = x2, histnorm = "percent"
    ))

    fig.show()

# 2) Data Loading

In [10]:
class TrainData(Dataset):
    def __init__(self, df):
        self.df = df
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        selection = self.df.iloc[index]
        return selection["premise"], selection["hypothesis"], selection["label"]
    

In [11]:
class TestData(Dataset):
    def __init__(self, df):
        self.df = df
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        selection = self.df.iloc[index]
        return selection["premise"], selection["hypothesis"], selection["label"], selection["language"]
    

In [12]:
def train_collate_dynamic_padding(batch):
    batch = np.array(batch, dtype = "object")
    text_input = batch[:, 0:2].tolist()
    labels = batch[:, 2].astype(int)
    
    tokens = config.tokenizer(text_input, padding=config.padding, max_length = config.max_length, truncation = config.truncation, return_tensors="pt")
    return tokens, torch.tensor(labels)

In [13]:
def test_collate_dynamic_padding(batch):
    batch = np.array(batch, dtype = "object")
    text_input = batch[:, 0:2].tolist()
    labels = batch[:, 2].astype(int)
    languages = batch[:, 3]
    
    tokens = config.tokenizer(text_input, padding=config.padding, max_length = config.max_length, truncation = config.truncation, return_tensors="pt")
    return tokens, torch.tensor(labels), languages

# 3) Model

In [14]:
class Model(nn.Module):
    def __init__(self, train_len):
        super(Model, self).__init__()
        self.train_len = train_len
        self.base_model = AutoModel.from_pretrained(config.model_name)
        self.fc = nn.Linear(self.base_model.config.hidden_size, 3)
        
    def feature(self, inputs):
        x = self.base_model(**inputs)["last_hidden_state"]
        return x[:, 0, :]
    
    def forward(self, inputs):
        features = self.feature(inputs)
        
        return self.fc(features)

# 4) Training Loop

## 4.1) Data Preparation

In [15]:
train = pd.read_csv(f"{config.base_path}/train.csv")
train, test = train_test_split(train, test_size = config.test_size, stratify = train["lang_abv"], random_state = config.seed)
compare_relative_distributions(train.lang_abv, test.lang_abv)

In [16]:
# Validation Data and Test Data are both the same in this case!
# They are distinguished for logging purposes!
train_data, val_data, test_data = TrainData(train), TrainData(test), TestData(test)
train_loader = DataLoader(train_data, collate_fn = train_collate_dynamic_padding, batch_size = config.batch_size, pin_memory = True, num_workers = os.cpu_count(), shuffle = True)
val_loader = DataLoader(test_data, collate_fn = train_collate_dynamic_padding, batch_size = config.batch_size, pin_memory = True, num_workers = os.cpu_count())
test_loader = DataLoader(test_data, collate_fn = test_collate_dynamic_padding, batch_size = config.batch_size, pin_memory = True, num_workers = os.cpu_count())

## 4.2) Criterions + Optimizers + Schedulers

In [17]:
def generateConfusionMatrices(val_preds):
    for lang in val_preds.langs.unique():
        filtered_df = val_preds.loc[val_preds.langs == lang]
        y = list(filtered_df.y.astype(int))
        y_hat = list(filtered_df.y_hat.astype(int))

        wandb.log({f"{lang} Confusion Matrix": wandb.plot.confusion_matrix(y_true=y, preds=y_hat, class_names=["entailment", "contradiction", "neutral"])})

In [18]:
def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler):
    model.train()
    model = model.to(config.device)
    
    loss_tracker = LossTracker()
    accuracy_tracker = AccuracyTracker()
    
    progress_bar = tqdm(train_loader, desc = f"Training Loop Epoch: {epoch}")
    scaler = torch.cuda.amp.GradScaler()
    
    average_acc = None
    
    for batch_idx, batch in enumerate(progress_bar):
        inputs, labels = batch
        
        for key, value in inputs.items():
            inputs[key] = value.to(config.device)
            
        labels = labels.to(config.device)
        batch_size = labels.size(0)
        
        with torch.cuda.amp.autocast():
            logits = model(inputs)
            # PyTorch CrossEntropy uses the unnormalized logits
            train_loss = criterion(logits, labels)
            scaled_loss = train_loss / config.grad_accum
        
        y_hat = torch.nn.functional.softmax(logits, dim = 1)
        y_hat = y_hat.argmax(dim = 1)
        
        accuracy_tracker.update(y_hat, labels)
        
        loss_tracker.update(train_loss.item(), batch_size)
        
        scaler.scale(scaled_loss).backward()
        
        if ((batch_idx + 1) % config.grad_accum == 0) or (batch_idx + 1 == model.train_len):
            scaler.unscale_(optimizer)

            scaler.step(optimizer)
            scaler.update()

            optimizer.zero_grad()

            if not scheduler is None:
                scheduler.step()
                for i, lr in enumerate(scheduler.get_last_lr()):
                    wandb.log({f"Layer {i} Learning Rate": lr})

        avg_accuracy = accuracy_tracker.score()
        avg_loss = loss_tracker.avg
        step_loss = loss_tracker.val
        learning_rate = scheduler.get_last_lr()[0]
        
        text = f"Epoch: {epoch} | Average Training Accuracy: {avg_accuracy:.4f} | Average Training Loss: {avg_loss:.4f} | Step Training Loss: {step_loss:.4f} | Learning Rate: {learning_rate:.4f}"
        progress_bar.set_postfix_str(text)
        progress_bar.refresh()

        wandb.log({f"Step Training Loss": step_loss})

    epoch_loss = loss_tracker.avg
    epoch_accuracy = accuracy_tracker.score()
    
    wandb.log({f"Training Loss Epoch": epoch_loss})
    wandb.log({f"Training Accuracy Epoch": epoch_accuracy})
    print(f"Training Loss: {epoch_loss} | Training Accuracy: {epoch_accuracy}")

    return epoch_loss, epoch_accuracy


In [19]:
def valid_fn(val_loader, model, criterion, epoch):
    model.eval()
    model = model.to(config.device)
    
    loss_tracker = LossTracker()
    accuracy_tracker = AccuracyTracker()

    progress_bar = tqdm(val_loader, desc = f"Validation Loop Epoch: {epoch}")
    for batch_idx, batch in enumerate(progress_bar):

        inputs, labels = batch

        for key, value in inputs.items():
            inputs[key] = value.to(config.device)

        labels = labels.to(config.device)
        batch_size = labels.size(0)
        
        with torch.no_grad():
            logits = model(inputs)
            y_hat = torch.nn.functional.softmax(logits, dim = 1)
            y_hat = y_hat.argmax(dim = 1)
            accuracy_tracker.update(y_hat, labels)

        val_loss = criterion(logits, labels)
        loss_tracker.update(val_loss.item(), batch_size)
        
        avg_val_loss = loss_tracker.avg
        avg_val_acc = accuracy_tracker.score()
        
        progress_bar.set_postfix_str(f"Epoch: {epoch} | Average Validation Accuracy {avg_val_acc:.4f}| Average Validation Loss: {avg_val_loss:.4f}")
        progress_bar.refresh()

        
    epoch_loss = loss_tracker.avg
    epoch_accuracy = accuracy_tracker.score()
    
    wandb.log({f"Validation Loss Epoch": epoch_loss})
    wandb.log({f"Validation Accuracy Epoch": epoch_accuracy})
    print(f"Validation Loss: {epoch_loss} | Validation Accuracy: {epoch_accuracy}")
    
    return epoch_loss, epoch_accuracy

In [20]:
def test_fn(test_loader, model, checkpoint):
    saved = torch.load(checkpoint)
    model.load_state_dict(saved["model_state_dict"])
    model = model.to(config.device)
    model.eval()

    accuracy_tracker = AccuracyTracker()
    
    preds = []
    actual = []
    langs = []

    progress_bar = tqdm(test_loader, desc = f"Test Loop")
    for batch_idx, batch in enumerate(progress_bar):

        inputs, labels, languages = batch

        for key, value in inputs.items():
            inputs[key] = value.to(config.device)

        labels = labels.to(config.device)
        batch_size = labels.size(0)
        
        with torch.no_grad():
            logits = model(inputs)
            y_hat = torch.nn.functional.softmax(logits, dim = 1)
            y_hat = y_hat.argmax(dim = 1)
            accuracy_tracker.update(y_hat, labels)
            
            preds.extend(y_hat.detach().cpu().numpy())
            actual.extend(labels.detach().cpu().numpy())
            langs.extend(languages)
            
    
        avg_val_acc = accuracy_tracker.score()
        
        progress_bar.set_postfix_str(f"Average Test Accuracy {avg_val_acc}")
        progress_bar.refresh()

    test_accuracy = accuracy_tracker.score()
    
    wandb.log({f"Final Test Accuracy": test_accuracy})
    print(f"Test Accuracy: {test_accuracy}")
    
    test_preds = pd.DataFrame(np.array([preds, actual, langs]).T, columns = ["y_hat", "y", "langs"])
    test_preds.to_csv("Test Predictions.csv")

    wandb.save(f"/kaggle/working/Test Predictions.csv")
    print("Saved Test Predictions to /kaggle/working/Test Predictions.csv")
    
    generateConfusionMatrices(test_preds)
    
    return test_accuracy

In [21]:
def train_loop():
    seed_everything(config.seed)
    ######################################################################
    model = Model(len(train_loader))
    criterion = nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), weight_decay = config.weight_decay, lr = config.lr, correct_bias = True)

    warmup_steps = model.train_len * config.warmup_pct
    num_training_steps = model.train_len * config.epochs // config.grad_accum

    scheduler = transformers.get_cosine_schedule_with_warmup(optimizer, num_warmup_steps = warmup_steps, num_training_steps = num_training_steps)
    ######################################################################
    tracker = ModelTracker(model, f"Baseline-Multilingual-BERT.pt", optimizer, scheduler)
    wandb.init(project="My Dear Watson", entity = "goggins", group = config.name, config = dict_from_class(config), reinit = True, job_type = config.model_name, name = f"Seed {config.seed}")
    
    for epoch in range(config.epochs):

        train_loss, train_accuracy = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler)

        val_loss, val_accuracy = valid_fn(val_loader, model, criterion, epoch)

        tracker.update(val_accuracy, epoch)

        if not tracker.check_improvement():
            print(f"Stopping the model at epoch {epoch} since the model did not improve!")
            break

    checkpoint = tracker.get_full_path()

    test_fn(test_loader, model, checkpoint)

    wandb.save(checkpoint)

    del model

    gc.collect()

    torch.cuda.empty_cache()


In [None]:
train_loop()

Downloading model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]



[34m[1mwandb[0m: Currently logged in as: [33mgoggins[0m. Use [1m`wandb login --relogin`[0m to force relogin


Training Loop Epoch: 0:   0%|          | 0/303 [00:00<?, ?it/s]

Training Loss: 1.0211795954027585 | Training Accuracy: 0.4938118811881188


Validation Loop Epoch: 0:   0%|          | 0/76 [00:00<?, ?it/s]

Validation Loss: 0.9300751322173443 | Validation Accuracy: 0.5779702970297029
Validation accuracy rose from -inf to 0.5780 on epoch 0
Saved to model to /kaggle/input/contradictory-my-dear-watson/Baseline-Multilingual-BERT.pt!


Training Loop Epoch: 1:   0%|          | 0/303 [00:00<?, ?it/s]

Training Loss: 0.9033685659024582 | Training Accuracy: 0.5996287128712872


Validation Loop Epoch: 1:   0%|          | 0/76 [00:00<?, ?it/s]

Validation Loss: 0.9422530653846539 | Validation Accuracy: 0.5598184818481848
Validation accuracy fell from 0.5780 to 0.5598 on epoch 1
Model did not improve on epoch 1


Training Loop Epoch: 2:   0%|          | 0/303 [00:00<?, ?it/s]

Training Loss: 0.7672409029683658 | Training Accuracy: 0.6804867986798679


Validation Loop Epoch: 2:   0%|          | 0/76 [00:00<?, ?it/s]

Validation Loss: 0.9868857199602789 | Validation Accuracy: 0.5581683168316832
Validation accuracy fell from 0.5780 to 0.5582 on epoch 2
Model did not improve on epoch 2


Training Loop Epoch: 3:   0%|          | 0/303 [00:00<?, ?it/s]

Training Loss: 0.6072807845109367 | Training Accuracy: 0.7642326732673267


Validation Loop Epoch: 3:   0%|          | 0/76 [00:00<?, ?it/s]

Validation Loss: 1.163663876725502 | Validation Accuracy: 0.5598184818481848
Validation accuracy fell from 0.5780 to 0.5598 on epoch 3
Model did not improve on epoch 3


Training Loop Epoch: 4:   0%|          | 0/303 [00:00<?, ?it/s]

Training Loss: 0.46071364216678606 | Training Accuracy: 0.8348803630363036


Validation Loop Epoch: 4:   0%|          | 0/76 [00:00<?, ?it/s]

Validation Loss: 1.2020301877862156 | Validation Accuracy: 0.5792079207920792
Validation accuracy rose from 0.5780 to 0.5792 on epoch 4
Saved to model to /kaggle/input/contradictory-my-dear-watson/Baseline-Multilingual-BERT.pt!


Training Loop Epoch: 5:   0%|          | 0/303 [00:00<?, ?it/s]

Training Loss: 0.32782934424113913 | Training Accuracy: 0.8827351485148515


Validation Loop Epoch: 5:   0%|          | 0/76 [00:00<?, ?it/s]

Validation Loss: 1.3831618024964538 | Validation Accuracy: 0.5565181518151815
Validation accuracy fell from 0.5792 to 0.5565 on epoch 5
Model did not improve on epoch 5


Training Loop Epoch: 6:   0%|          | 0/303 [00:00<?, ?it/s]

Training Loss: 0.2581050233675702 | Training Accuracy: 0.9127475247524752


Validation Loop Epoch: 6:   0%|          | 0/76 [00:00<?, ?it/s]

Validation Loss: 1.6449681968972234 | Validation Accuracy: 0.5396039603960396
Validation accuracy fell from 0.5792 to 0.5396 on epoch 6
Model did not improve on epoch 6


Training Loop Epoch: 7:   0%|          | 0/303 [00:00<?, ?it/s]

Training Loss: 0.21893238819668395 | Training Accuracy: 0.9263613861386139


Validation Loop Epoch: 7:   0%|          | 0/76 [00:00<?, ?it/s]

Validation Loss: 1.7431869794039836 | Validation Accuracy: 0.5507425742574258
Validation accuracy fell from 0.5792 to 0.5507 on epoch 7
Model did not improve on epoch 7


Training Loop Epoch: 8:   0%|          | 0/303 [00:00<?, ?it/s]

Training Loss: 0.15446233567428275 | Training Accuracy: 0.9488448844884488


Validation Loop Epoch: 8:   0%|          | 0/76 [00:00<?, ?it/s]

Validation Loss: 1.919054365000709 | Validation Accuracy: 0.5697194719471947
Validation accuracy fell from 0.5792 to 0.5697 on epoch 8
Model did not improve on epoch 8


Training Loop Epoch: 9:   0%|          | 0/303 [00:00<?, ?it/s]

Training Loss: 0.12053690883930367 | Training Accuracy: 0.9601897689768977


Validation Loop Epoch: 9:   0%|          | 0/76 [00:00<?, ?it/s]

Validation Loss: 2.0926939410344993 | Validation Accuracy: 0.5606435643564357
Validation accuracy fell from 0.5792 to 0.5606 on epoch 9
Model did not improve on epoch 9


Training Loop Epoch: 10:   0%|          | 0/303 [00:00<?, ?it/s]