# Preparation

In [2]:
use_colab = True
if use_colab:
    !pip install transformers[sentencepiece]
    from google.colab import drive
    drive.mount('/content/drive')
    path = "/content/drive/MyDrive/CAPP30255_Project/twitter_disaster_detection"


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers[sentencepiece]
  Using cached transformers-4.29.2-py3-none-any.whl (7.1 MB)
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers[sentencepiece])
  Using cached huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers[sentencepiece])
  Using cached tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
Collecting sentencepiece!=0.1.92,>=0.1.91 (from transformers[sentencepiece])
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m71.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting protobuf<=3.20.2 (from transformers[sentencepiece])
  Downloading protobuf-3.20.2-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[2K     [90m━

Mounted at /content/drive


In [3]:
import os
import copy
import pandas as pd
import numpy as np
# from tqdm.autonotebook import tqdm
from tqdm import tqdm
import random
import json

import torch
import torch.nn as nn

from sklearn.model_selection import train_test_split, KFold

# importing HuggingFace transformers library
import transformers
from transformers import pipeline, get_linear_schedule_with_warmup

# Report the installed transformers version and whether CUDA is available.
print("Transformer version:", transformers.__version__)
print("torch.device:", torch.device("cuda" if torch.cuda.is_available() else "cpu"))

Transformer version: 4.29.2
torch.device: cuda


In [4]:
# Load the cleaned training tweets (pipe-separated file) and preview five random rows.
data = pd.read_csv(path+'/data/cleaned-train-tweets.csv', sep="|")
display(data.sample(5))


Unnamed: 0,id,keyword,location,text,target,clean_text
17588,17588,tsunami,in the Word of God,@author_mike Amen today is the Day of Salvatio...,1,amen today day salvation thx brother mike grea...
18064,18064,wild%20fires,New Jersey,These wild fires out west are crazy.,1,wild fire west crazy
688,688,blazing,,@BaseballQuotes1 I have a 32 inch dynasty,0,inch dynasty
8743,8743,blight,Memphis,THDA Kicks Off Anti-Blight Loan Effort in Memp...,0,thda kick loan effort memphis http
14186,14186,inundated,"England & Wales Border, UK",@Lenn_Len Probably. We are inundated with them...,0,probably inundated year


# BART zero-shot classification (without fine-tuning)

In [5]:
# Zero-shot classification pipeline backed by the facebook/bart-large-mnli checkpoint.
simple_model = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [6]:
# Candidate labels for the zero-shot classifier; index 1 is used when a row's
# target is truthy, index 0 otherwise.
labels = ["not disaster", "disaster"]

def model_simple_test(i):
    """Classify one tweet with the zero-shot pipeline and print the outcome.

    Looks up the entry labelled ``i`` in the global ``data`` frame, then prints
    the raw pipeline response, the ground-truth label and the label with the
    highest score.
    """
    tweet = data["text"][i]
    response = simple_model(tweet, candidate_labels=labels)
    print(response)

    if data["target"][i]:
        expected = labels[1]
    else:
        expected = labels[0]
    print("True Label:", expected)

    best = np.argmax(response["scores"])
    print("Prediction:", response["labels"][best])

def calculate_accuracy_zscppl(data, target_ids):
    """Return the zero-shot pipeline's accuracy on the selected rows.

    Args:
        data: Frame-like object with ``"text"`` and ``"target"`` columns that
            can be indexed as ``data[column][i]``.
        target_ids: Index labels of the rows to evaluate.

    Returns:
        Fraction of the selected rows whose top-scoring label matches the true
        label, or ``0.0`` when ``target_ids`` is empty.
    """
    # Guard against a ZeroDivisionError when nothing was selected.
    if len(target_ids) == 0:
        return 0.0

    accurate_num = 0
    for i in target_ids:
        response = simple_model(data["text"][i], candidate_labels=labels)
        true_label = labels[1] if data["target"][i] else labels[0]
        predicted_label = response["labels"][np.argmax(response["scores"])]
        if true_label == predicted_label:
            accurate_num += 1
    return accurate_num / len(target_ids)


In [7]:
data_sample = data.sample(5)
# model_simple_test looks rows up by index label (data["text"][i]), so pass the
# sampled rows' index labels rather than the values of the "id" column, which
# only work when they happen to coincide with the index.
data_sample_ids = data_sample.index.tolist()
display(data_sample)

# `row_label` avoids shadowing the built-in `id`.
for row_label in data_sample_ids:
    model_simple_test(row_label)

Unnamed: 0,id,keyword,location,text,target,clean_text
3782,3782,fire%20truck,District 12 - Orange County,SIGALERT UPDATE #3***N-133 CLOSED AT 5 FWY UFN...,1,sigalert update closed fwy ufn trash truck fire
9491,9491,burning,,@aubilenon @MarkKriegsman if you think you'd l...,0,aubilenon markkriegsman think like burning man...
8573,8573,blaze,Delhi,#socialmedia news - New Facebook Page Features...,0,socialmedia news new facebook page feature see...
9514,9514,burning,"TÌÁchira, Venezuela",the Burning Legion has returned,0,burning legion returned
14290,14290,landslide,,5 need to-dos seeing as how technical writing ...,0,need seeing technical writing administer apps ...


{'sequence': 'SIGALERT UPDATE #3***N-133 CLOSED AT 5 FWY UFN***- TRASH TRUCK FIRE', 'labels': ['disaster', 'not disaster'], 'scores': [0.9129602909088135, 0.08703970164060593]}
True Label: disaster
Prediction: disaster
{'sequence': "@aubilenon @MarkKriegsman if you think you'd like burning man you should try it because it's the only way to know!", 'labels': ['disaster', 'not disaster'], 'scores': [0.7511258721351624, 0.24887412786483765]}
True Label: not disaster
Prediction: disaster
{'sequence': '#socialmedia news - New Facebook Page Features Seek to Help Personalize the Customer Experience http://t.co/nbizaTlsmV', 'labels': ['not disaster', 'disaster'], 'scores': [0.9289625883102417, 0.07103737443685532]}
True Label: not disaster
Prediction: not disaster
{'sequence': 'the Burning Legion has returned', 'labels': ['disaster', 'not disaster'], 'scores': [0.9670848250389099, 0.0329151451587677]}
True Label: not disaster
Prediction: disaster
{'sequence': '5 need to-dos seeing as how techn

In [8]:
num_target = 50
# random.randint includes its upper bound, so randint(0, len(data)) could return
# len(data), which is not a valid row label; randrange excludes the upper bound.
target_ids = [random.randrange(len(data)) for _ in range(num_target)]
accuracy_rate = calculate_accuracy_zscppl(data, target_ids)
print(f"Accuracy Rate using Pretrained BERT Zero-Shot-Classification: {accuracy_rate}")


Accuracy Rate using Pretrained BERT Zero-Shot-Classification: 0.6


# DistilBERT model with fine-tuning

## Preparation

### Building A PyTorch Dataset

The following code uses the idea from this tutorial [Fine-tuning with custom datasets](https://huggingface.co/transformers/v3.2.0/custom_datasets.html) on building a custom dataset:


In [9]:
class TweetDataset(torch.utils.data.Dataset):
    """Map-style dataset serving tokenized tweets, plus labels unless testing.

    Every entry of the ``text`` column is tokenized once at construction time
    (padded and truncated, optionally to ``max_length``). In any mode other
    than ``"test"`` the ``target`` column is kept and returned as ``labels``.
    """

    def __init__(self, dataframe, tokenizer, mode="train", max_length=None):
        self.mode = mode
        self.dataframe = dataframe
        # Targets are only available (and only needed) outside test mode.
        if mode != "test":
            self.targets = dataframe['target'].values
        # Tokenize the whole corpus in a single call.
        self.encodings = tokenizer(
            list(dataframe['text'].values),
            padding=True,
            truncation=True,
            max_length=max_length,
        )

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        # One tensor per tokenizer output key for the requested row.
        item = {}
        for key, values in self.encodings.items():
            item[key] = torch.tensor(values[idx])
        # Test data carries no targets, so labels are attached only otherwise.
        if self.mode != "test":
            item['labels'] = torch.tensor(self.targets[idx])
        return item


A small wrapper that makes it easier to build the Dataset and the DataLoader.

In [10]:
def make_loaders(dataframe, tokenizer, mode="train", max_length=None,
                 batch_size=None, num_workers=None):
    """Build a DataLoader over a TweetDataset for the given frame.

    Args:
        dataframe: Frame passed to ``TweetDataset``.
        tokenizer: Tokenizer passed to ``TweetDataset``.
        mode: ``"train"`` enables shuffling; every other mode keeps the order.
        max_length: Maximum token length forwarded to the tokenizer.
        batch_size: Batch size; defaults to the global ``options.batch_size``.
        num_workers: Worker count; defaults to the global ``options.num_workers``.

    Returns:
        A ``torch.utils.data.DataLoader`` over the dataset.
    """
    # Fall back to the notebook-level `options` only when no explicit value is
    # given, so the function can also be used without that global.
    if batch_size is None:
        batch_size = options.batch_size
    if num_workers is None:
        num_workers = options.num_workers

    dataset = TweetDataset(dataframe, tokenizer, mode, max_length=max_length)
    return torch.utils.data.DataLoader(dataset,
                                       batch_size=batch_size,
                                       shuffle=(mode == "train"),
                                       num_workers=num_workers)

### Custom Classification Model based on DistilBERT

* DistilBERT is a language model that needs to be fine-tuned on the final task of interest, so we need to build a custom head for it here. The [BERT paper](https://arxiv.org/abs/1810.04805) introduces two special tokens, [CLS] and [SEP], which are added to the sequence that is fed to the model. [CLS] is placed at the beginning of the sequence, and [SEP] tokens mark the end of each part of the sequence (a sequence fed to a BERT model can be made up of two parts, e.g., a question and its corresponding text). 
 
* In the paper, the authors explain that they use the hidden-state representation of the [CLS] token for sequence classification tasks, so we are going to do the same. The DistilBERT model produces a vector of size 768 as the hidden representation of this [CLS] token, and we pass it through a few nn.Linear layers to perform our own specific task. 

In [11]:
class CustomModel(nn.Module):
    """Transformer encoder followed by a small MLP classification head.

    The head maps the hidden state at sequence position 0 through
    Linear -> ReLU -> (optional) Dropout -> Linear to ``num_labels`` logits.
    """

    def __init__(self,
                 bert_model,
                 num_labels,
                 bert_hidden_dim=768,
                 classifier_hidden_dim=768,
                 dropout=None):
        super().__init__()
        self.bert_model = bert_model

        hidden_layer = nn.Linear(bert_hidden_dim, classifier_hidden_dim)
        # With dropout=None the regularisation slot is a pass-through Identity,
        # so the head always has the same four positions.
        if dropout is None:
            regulariser = nn.Identity()
        else:
            regulariser = nn.Dropout(dropout)
        output_layer = nn.Linear(classifier_hidden_dim, num_labels)
        self.head = nn.Sequential(hidden_layer, nn.ReLU(), regulariser, output_layer)

    def forward(self, batch):
        # The tokenizer supplies both the token ids and the attention mask.
        encoder_out = self.bert_model(input_ids=batch['input_ids'],
                                      attention_mask=batch['attention_mask'])
        # last_hidden_state: (batch_size, seq_length, bert_hidden_dim);
        # the [CLS] token sits at the start of the sequence, i.e. position 0.
        cls_state = encoder_out.last_hidden_state[:, 0, :]
        # The custom head turns that representation into class logits.
        return self.head(cls_state)


## Training and Evaluation

In [12]:
class AvgMeter:
    """Running weighted average of a metric such as loss or accuracy."""

    def __init__(self, name="Metric"):
        self.name = name
        self.reset()

    def reset(self):
        """Zero the accumulated average, sum and count."""
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, count=1):
        """Fold in ``val`` with weight ``count`` and refresh the average."""
        self.count = self.count + count
        self.sum = self.sum + val * count
        self.avg = self.sum / self.count

    def __repr__(self):
        return f"{self.name}: {self.avg:.4f}"

def one_epoch(model, criterion, loader, device, optimizer=None, lr_scheduler=None, mode="train", step="batch"):
    """Run one pass over ``loader`` and return the running loss and accuracy.

    Args:
        model: Callable that takes the batch dict and returns logits.
        criterion: Loss function called as ``criterion(preds, batch['labels'])``.
        loader: Iterable of batch dicts containing ``input_ids`` and ``labels``.
        device: Device every tensor of the batch is moved to.
        optimizer: Required when ``mode == "train"``.
        lr_scheduler: Optional scheduler, stepped after every batch when
            ``step == "batch"``.
        mode: ``"train"`` performs backward/optimizer steps; any other value
            only evaluates.
        step: ``"batch"`` steps ``lr_scheduler`` after each training batch.

    Returns:
        ``(loss_meter, acc_meter)``: two ``AvgMeter`` objects weighted by batch size.

    Raises:
        ValueError: If ``mode == "train"`` and no optimizer is given.
    """
    is_training = mode == "train"
    if is_training and optimizer is None:
        raise ValueError("one_epoch(mode='train') requires an optimizer")

    loss_meter = AvgMeter()
    acc_meter = AvgMeter()

    tqdm_object = tqdm(loader, total=len(loader))
    for batch in tqdm_object:
        batch = {k: v.to(device) for k, v in batch.items()}
        preds = model(batch)
        loss = criterion(preds, batch['labels'])
        if is_training:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Per-batch schedulers are stepped here; skip when none was given so
            # the default arguments do not crash on `None.step()`.
            if step == "batch" and lr_scheduler is not None:
                lr_scheduler.step()

        # Weight the running averages by the actual batch size (the last batch
        # may be smaller).
        count = batch['input_ids'].size(0)
        loss_meter.update(loss.item(), count)

        accuracy = get_accuracy(preds.detach(), batch['labels'])
        acc_meter.update(accuracy.item(), count)
        if is_training:
            tqdm_object.set_postfix(loss=loss_meter.avg, accuracy=acc_meter.avg, lr=get_lr(optimizer))
        else:
            tqdm_object.set_postfix(loss=loss_meter.avg, accuracy=acc_meter.avg)

    return loss_meter, acc_meter

def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group.

    Returns ``None`` when the optimizer has no parameter groups.
    """
    return next((group["lr"] for group in optimizer.param_groups), None)

def get_accuracy(preds, targets):
    """Fraction of rows whose arg-max class equals the target.

    preds shape: (batch_size, num_labels)
    targets shape: (batch_size)
    """
    predicted_classes = preds.argmax(dim=1)
    hits = predicted_classes == targets
    return hits.float().mean()


In [13]:
def train_eval(epochs, model, train_loader, valid_loader, 
               criterion, optimizer, device, options, loss_accuracy, lr_scheduler=None):
    """Train for ``epochs`` epochs, validating and checkpointing after each one.

    After every epoch the model is evaluated on ``valid_loader``; whenever the
    validation loss improves, the weights are saved to
    ``{options.model_path}/{options.model_save_name}``.

    Args:
        epochs: Number of epochs to run.
        model: Model to train; put into train/eval mode as needed.
        train_loader: Loader for the training batches.
        valid_loader: Loader for the validation batches.
        criterion: Loss function.
        optimizer: Optimizer used for the training pass.
        device: Device the batches are moved to.
        options: Settings object; ``model_path``, ``model_save_name`` and
            ``step`` are read from it.
        loss_accuracy: Dict with list values under ``"Train Loss"``,
            ``"Train Accuracy"``, ``"Valid Loss"`` and ``"Valid Accuracy"``;
            one value per epoch is appended in place.
        lr_scheduler: Optional scheduler. A ``ReduceLROnPlateau`` instance is
            stepped once per epoch on the validation loss.
    """
    # Create the checkpoint directory up front so torch.save cannot fail on a
    # missing folder after a full epoch of training.
    os.makedirs(options.model_path, exist_ok=True)
    checkpoint_path = f'{options.model_path}/{options.model_save_name}'

    # The best weights live only in the checkpoint file; no in-memory copy is
    # kept because nothing ever read it.
    best_loss = float('inf')

    for epoch in range(epochs):
        print("*" * 30)
        print(f"Epoch {epoch + 1}")
        # Remember the LR before this epoch to detect a reduction afterwards.
        current_lr = get_lr(optimizer)

        model.train()
        train_loss, train_acc = one_epoch(model, 
                                          criterion, 
                                          train_loader, 
                                          device,
                                          optimizer=optimizer,
                                          lr_scheduler=lr_scheduler,
                                          mode="train",
                                          step=options.step)
        model.eval()
        with torch.no_grad():
            valid_loss, valid_acc = one_epoch(model, 
                                              criterion, 
                                              valid_loader, 
                                              device,
                                              optimizer=None,
                                              lr_scheduler=None,
                                              mode="valid")

        if valid_loss.avg < best_loss:
            best_loss = valid_loss.avg
            torch.save(model.state_dict(), checkpoint_path)
            print("Saved best model!")

        # or you could do: if step == "epoch":
        if isinstance(lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
            lr_scheduler.step(valid_loss.avg)
            # if the learning rate changes by ReduceLROnPlateau, we are going to
            # reload our previous best model weights and start from there with a lower LR
            if current_lr != get_lr(optimizer):
                print("Loading best model weights!")
                model.load_state_dict(torch.load(checkpoint_path, map_location=device))

        print(f"Train Loss: {train_loss.avg:.5f}")
        print(f"Train Accuracy: {train_acc.avg:.5f}")

        print(f"Valid Loss: {valid_loss.avg:.5f}")
        print(f"Valid Accuracy: {valid_acc.avg:.5f}")
        print("*" * 30)

        loss_accuracy["Train Loss"] += [train_loss.avg]
        loss_accuracy["Train Accuracy"] += [train_acc.avg]
        loss_accuracy["Valid Loss"] += [valid_loss.avg]
        loss_accuracy["Valid Accuracy"] += [valid_acc.avg]


### K-Fold Cross Validation

In [14]:
def make_folds(dataframe, n_splits=5):
    """Assign every row of ``dataframe`` to one of ``n_splits`` validation folds.

    Adds (or overwrites) an integer ``fold`` column on ``dataframe`` in place
    and returns the same frame. The split is shuffled with a fixed seed (42),
    so repeated calls on the same data give the same assignment.

    Args:
        dataframe: Frame with an ``id`` column; only its length is used for
            splitting.
        n_splits: Number of folds.

    Returns:
        ``dataframe`` itself, with the ``fold`` column filled in.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    # KFold yields positional indices, so collect the assignment in a positional
    # array instead of writing through label-based `.loc`, which is only correct
    # when the frame has a default RangeIndex.
    fold_of_row = np.empty(len(dataframe), dtype=int)
    for i, (_, valid_idx) in enumerate(kf.split(X=dataframe['id'])):
        fold_of_row[valid_idx] = i
    dataframe['fold'] = fold_of_row
    return dataframe

In [15]:
def one_fold(fold, options):
    """Train and validate a fresh model on one cross-validation fold.

    Loads the pre-trained encoder and tokenizer named by ``options.model_name``,
    re-reads the training CSV, holds out the rows assigned to ``fold`` for
    validation and trains on the rest. The best weights are saved as
    ``model_fold_{fold}.pt`` under ``options.model_path``.

    Args:
        fold: Index of the fold to hold out for validation.
        options: Settings object. ``options.step`` and
            ``options.model_save_name`` are overwritten by this function.

    Raises:
        ValueError: If ``options.scheduler`` is not ``"ReduceLROnPlateau"`` or
            ``"LinearWarmup"``.
    """
    print(f"Training Fold: {fold}")
    loss_accuracy = {"Train Loss": [], "Train Accuracy": [], "Valid Loss": [], "Valid Accuracy": []}

    # Here, we load the pre-trained DistilBERT model from transformers library
    bert_model = transformers.DistilBertModel.from_pretrained(options.model_name)
    # Loading the corresponding tokenizer from HuggingFace by using AutoTokenizer class.
    tokenizer = transformers.AutoTokenizer.from_pretrained(options.model_name, use_fast=True)

    dataframe = pd.read_csv(path+'/data/cleaned-train-tweets.csv', sep="|")
    dataframe = make_folds(dataframe, n_splits=options.n_folds)
    train_dataframe = dataframe[dataframe['fold'] != fold].reset_index(drop=True)
    valid_dataframe = dataframe[dataframe['fold'] == fold].reset_index(drop=True)

    train_loader = make_loaders(train_dataframe, tokenizer, "train", options.max_length)
    valid_loader = make_loaders(valid_dataframe, tokenizer, "valid", options.max_length)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = CustomModel(bert_model, options.num_labels, dropout=options.dropout).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=options.learning_rate)
    if options.scheduler == "ReduceLROnPlateau":
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 
                                                                  mode="min", 
                                                                  factor=0.5, 
                                                                  patience=options.patience)

        # when to step the scheduler: after an epoch or after a batch
        options.step = "epoch"

    elif options.scheduler == "LinearWarmup":
        num_train_steps = len(train_loader) * options.epochs
        lr_scheduler = get_linear_schedule_with_warmup(optimizer, 
                                                       num_warmup_steps=0, 
                                                       num_training_steps=num_train_steps)

        # when to step the scheduler: after an epoch or after a batch
        options.step = "batch"

    else:
        # Fail with a clear message instead of an UnboundLocalError on
        # `lr_scheduler` further down.
        raise ValueError(
            f"Unsupported scheduler {options.scheduler!r}; "
            "expected 'ReduceLROnPlateau' or 'LinearWarmup'"
        )

    criterion = nn.CrossEntropyLoss()
    options.model_save_name = f"model_fold_{fold}.pt"
    train_eval(options.epochs, model, train_loader, valid_loader,
               criterion, optimizer, device, options, loss_accuracy, lr_scheduler=lr_scheduler)

    # tf = open(f"{path}/models/loss_accuracy_{fold}.json", "x")
    # json.dump(loss_accuracy, tf)
    # tf.close()


In [16]:
def train_folds(options):
    """Train one model per fold, for folds ``0`` to ``options.n_folds - 1``."""
    for fold_index in range(options.n_folds):
        one_fold(fold=fold_index, options=options)

## Tuning

In [None]:
# Tuning run: batch_size = 32, one epoch per fold, lr = 3e-5, patience = 2,
# dropout = 0.5, 5 folds.
class Options:
    model_name = 'distilbert-base-uncased'
    batch_size = 32
    num_labels = 2
    epochs = 1
    num_workers = 2
    learning_rate = 3e-5
    scheduler = "ReduceLROnPlateau"
    patience = 2
    dropout = 0.5
    model_path = path + "/models/"
    max_length = 140
    model_save_name = "model.pt"
    n_folds = 5
# Instantiate the settings and train one model per fold.
options = Options()
train_folds(options)


Training Fold: 0


Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

******************************
Epoch 1


100%|██████████| 462/462 [01:45<00:00,  4.37it/s, accuracy=0.836, loss=0.389, lr=3e-5]
100%|██████████| 116/116 [00:09<00:00, 12.24it/s, accuracy=0.861, loss=0.331]


Saved best model!
Train Loss: 0.38890
Train Accuracy: 0.83624
Valid Loss: 0.33106
Valid Accuracy: 0.86089
******************************
Training Fold: 1


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


******************************
Epoch 1


100%|██████████| 462/462 [01:44<00:00,  4.42it/s, accuracy=0.836, loss=0.396, lr=3e-5]
100%|██████████| 116/116 [00:09<00:00, 12.31it/s, accuracy=0.869, loss=0.324]


Saved best model!
Train Loss: 0.39591
Train Accuracy: 0.83638
Valid Loss: 0.32422
Valid Accuracy: 0.86928
******************************
Training Fold: 2


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


******************************
Epoch 1


100%|██████████| 462/462 [01:44<00:00,  4.41it/s, accuracy=0.836, loss=0.39, lr=3e-5]
  1%|          | 1/116 [00:00<00:30,  3.71it/s, accuracy=1, loss=0.112]

In [None]:
# Tuning run: batch_size = 64, one epoch per fold, lr = 3e-5, patience = 2,
# dropout = 0.5, 5 folds.
class Options:
    model_name = 'distilbert-base-uncased'
    batch_size = 64
    num_labels = 2
    epochs = 1
    num_workers = 2
    learning_rate = 3e-5
    scheduler = "ReduceLROnPlateau"
    patience = 2
    dropout = 0.5
    model_path = path + "/models/"
    max_length = 140
    model_save_name = "model.pt"
    n_folds = 5
# Instantiate the settings and train one model per fold.
options = Options()
train_folds(options)


In [None]:
# Tuning run: batch_size = 128, one epoch per fold, lr = 3e-5, patience = 2,
# dropout = 0.5, 5 folds.
class Options:
    model_name = 'distilbert-base-uncased'
    batch_size = 128
    num_labels = 2
    epochs = 1
    num_workers = 2
    learning_rate = 3e-5
    scheduler = "ReduceLROnPlateau"
    patience = 2
    dropout = 0.5
    model_path = path + "/models/"
    max_length = 140
    model_save_name = "model.pt"
    n_folds = 5
# Instantiate the settings and train one model per fold.
options = Options()
train_folds(options)


In [None]:
# Tuning run: scheduler patience = 2, batch_size = 64, one epoch per fold,
# lr = 3e-5, dropout = 0.5, 5 folds.
class Options:
    model_name = 'distilbert-base-uncased'
    batch_size = 64
    num_labels = 2
    epochs = 1
    num_workers = 2
    learning_rate = 3e-5
    scheduler = "ReduceLROnPlateau"
    patience = 2
    dropout = 0.5
    model_path = path + "/models/"
    max_length = 140
    model_save_name = "model.pt"
    n_folds = 5

# Instantiate the settings and train one model per fold.
options = Options()
train_folds(options)


In [None]:
# Tuning run: scheduler patience = 4, batch_size = 64, one epoch per fold,
# lr = 3e-5, dropout = 0.5, 5 folds.
class Options:
    model_name = 'distilbert-base-uncased'
    batch_size = 64
    num_labels = 2
    epochs = 1
    num_workers = 2
    learning_rate = 3e-5
    scheduler = "ReduceLROnPlateau"
    patience = 4
    dropout = 0.5
    model_path = path + "/models/"
    max_length = 140
    model_save_name = "model.pt"
    n_folds = 5

# Instantiate the settings and train one model per fold.
options = Options()
train_folds(options)


In [None]:
# Tuning run: scheduler patience = 8, batch_size = 164, one epoch per fold,
# lr = 3e-5, dropout = 0.5, 5 folds.
# NOTE(review): batch_size = 164 differs from the 32/64/128 values used in the
# other tuning cells — confirm it is intended.
class Options:
    model_name = 'distilbert-base-uncased'
    batch_size = 164
    num_labels = 2
    epochs = 1
    num_workers = 2
    learning_rate = 3e-5
    scheduler = "ReduceLROnPlateau"
    patience = 8
    dropout = 0.5
    model_path = path + "/models/"
    max_length = 140
    model_save_name = "model.pt"
    n_folds = 5

# Instantiate the settings and train one model per fold.
options = Options()
train_folds(options)


In [None]:
# Tuning run: learning_rate = 3e-4, three epochs per fold, batch_size = 64,
# patience = 2, dropout = 0.5, 5 folds.
class Options:
    model_name = 'distilbert-base-uncased'
    batch_size = 64
    num_labels = 2
    epochs = 3
    num_workers = 2
    learning_rate = 3e-4
    scheduler = "ReduceLROnPlateau"
    patience = 2
    dropout = 0.5
    model_path = path + "/models/"
    max_length = 140
    model_save_name = "model.pt"
    n_folds = 5

# Instantiate the settings and train one model per fold.
options = Options()
train_folds(options)


In [None]:
# Tuning run: learning_rate = 3e-5, three epochs per fold, batch_size = 64,
# patience = 2, dropout = 0.5, 5 folds.
class Options:
    model_name = 'distilbert-base-uncased'
    batch_size = 64
    num_labels = 2
    epochs = 3
    num_workers = 2
    learning_rate = 3e-5
    scheduler = "ReduceLROnPlateau"
    patience = 2
    dropout = 0.5
    model_path = path + "/models/"
    max_length = 140
    model_save_name = "model.pt"
    n_folds = 5

# Instantiate the settings and train one model per fold.
options = Options()
train_folds(options)


In [None]:
# Tuning run: learning_rate = 3e-6, three epochs per fold, batch_size = 64,
# patience = 2, dropout = 0.5, 5 folds.
class Options:
    model_name = 'distilbert-base-uncased'
    batch_size = 64
    num_labels = 2
    epochs = 3
    num_workers = 2
    learning_rate = 3e-6
    scheduler = "ReduceLROnPlateau"
    patience = 2
    dropout = 0.5
    model_path = path + "/models/"
    max_length = 140
    model_save_name = "model.pt"
    n_folds = 5

# Instantiate the settings and train one model per fold.
options = Options()
train_folds(options)


In [None]:
# Tuning run: dropout = 0.3, three epochs per fold, batch_size = 64,
# lr = 3e-5, patience = 2, 5 folds.
class Options:
    model_name = 'distilbert-base-uncased'
    batch_size = 64
    num_labels = 2
    epochs = 3
    num_workers = 2
    learning_rate = 3e-5
    scheduler = "ReduceLROnPlateau"
    patience = 2
    dropout = 0.3
    model_path = path + "/models/"
    max_length = 140
    model_save_name = "model.pt"
    n_folds = 5

# Instantiate the settings and train one model per fold.
options = Options()
train_folds(options)


In [None]:
# Tuning run: dropout = 0.5, three epochs per fold, batch_size = 64,
# lr = 3e-5, patience = 2, 5 folds.
class Options:
    model_name = 'distilbert-base-uncased'
    batch_size = 64
    num_labels = 2
    epochs = 3
    num_workers = 2
    learning_rate = 3e-5
    scheduler = "ReduceLROnPlateau"
    patience = 2
    dropout = 0.5
    model_path = path + "/models/"
    max_length = 140
    model_save_name = "model.pt"
    n_folds = 5

# Instantiate the settings and train one model per fold.
options = Options()
train_folds(options)


In [None]:
# Tuning run: dropout = 0.7, three epochs per fold, batch_size = 64,
# lr = 3e-5, patience = 2, 5 folds.
class Options:
    model_name = 'distilbert-base-uncased'
    batch_size = 64
    num_labels = 2
    epochs = 3
    num_workers = 2
    learning_rate = 3e-5
    scheduler = "ReduceLROnPlateau"
    patience = 2
    dropout = 0.7
    model_path = path + "/models/"
    max_length = 140
    model_save_name = "model.pt"
    n_folds = 5

# Instantiate the settings and train one model per fold.
options = Options()
train_folds(options)


In [None]:
# Tuning run: n_folds = 3, three epochs per fold, batch_size = 64,
# lr = 3e-5, patience = 2, dropout = 0.5.
class Options:
    model_name = 'distilbert-base-uncased'
    batch_size = 64
    num_labels = 2
    epochs = 3
    num_workers = 2
    learning_rate = 3e-5
    scheduler = "ReduceLROnPlateau"
    patience = 2
    dropout = 0.5
    model_path = path + "/models/"
    max_length = 140
    model_save_name = "model.pt"
    n_folds = 3

# Instantiate the settings and train one model per fold.
options = Options()
train_folds(options)


In [None]:
# Tuning run: n_folds = 5, three epochs per fold, batch_size = 64,
# lr = 3e-5, patience = 2, dropout = 0.5.
class Options:
    model_name = 'distilbert-base-uncased'
    batch_size = 64
    num_labels = 2
    epochs = 3
    num_workers = 2
    learning_rate = 3e-5
    scheduler = "ReduceLROnPlateau"
    patience = 2
    dropout = 0.5
    model_path = path + "/models/"
    max_length = 140
    model_save_name = "model.pt"
    n_folds = 5

# Instantiate the settings and train one model per fold.
options = Options()
train_folds(options)


In [None]:
# Tuning run: n_folds = 7, three epochs per fold, batch_size = 64,
# lr = 3e-5, patience = 2, dropout = 0.5.
class Options:
    model_name = 'distilbert-base-uncased'
    batch_size = 64
    num_labels = 2
    epochs = 3
    num_workers = 2
    learning_rate = 3e-5
    scheduler = "ReduceLROnPlateau"
    patience = 2
    dropout = 0.5
    model_path = path + "/models/"
    max_length = 140
    model_save_name = "model.pt"
    n_folds = 7

# Instantiate the settings and train one model per fold.
options = Options()
train_folds(options)


## Run with the best setting

In [None]:
# Final configuration: five epochs per fold, batch_size = 64, lr = 3e-5,
# patience = 2, dropout = 0.5, 5 folds.
class Options:
    model_name = 'distilbert-base-uncased'
    batch_size = 64
    num_labels = 2
    epochs = 5
    num_workers = 2
    learning_rate = 3e-5
    scheduler = "ReduceLROnPlateau"
    patience = 2
    dropout = 0.5
    model_path = path + "/models/"
    max_length = 140
    model_save_name = "model.pt"
    n_folds = 5

# Instantiate the settings and train one model per fold.
options = Options()
train_folds(options)

In [None]:
# Accuracy values recorded per epoch, pasted in by hand.
accuracies = [0.8657645459910691, 0.8960757774493691, 0.9096075771464708, 0.9098782130765334, 0.9234100128058974, 0.9142083890221923, 0.9174560208927149, 0.9139377529953426, 0.9209742888546119, 0.9179972929141518, 0.9220568328006987, 0.9198917448762628, 0.9169147489358027, 0.9182679289087391, 0.9161028409520409, 0.9155615689628663, 0.9179972929141518, 0.9188092008979136, 0.9147496609791045, 0.9150202969736917]
import matplotlib.pyplot as plt
# Derive the x-axis length from the data instead of a separate hard-coded count,
# so the two cannot drift apart (plt.plot raises when the lengths differ).
epochs = len(accuracies)
plt.plot(range(1, epochs + 1), accuracies)
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.show()

In [None]:
# Shell out to nvidia-smi to inspect the GPU attached to this runtime.
!nvidia-smi

In [None]:
def test_one_model(options):
    """Score the 100-tweet sample file with one saved checkpoint.

    Rebuilds the model architecture, loads the weights stored at
    ``{options.model_path}/{options.model_save_name}`` and runs the model over
    ``tweet_samples_100.csv`` in evaluation mode.

    Args:
        options: Settings object; ``model_name``, ``num_labels``, ``dropout``,
            ``model_path`` and ``model_save_name`` are read, plus
            ``max_length`` when present.

    Returns:
        Tensor of logits with one row per tweet, or ``None`` when the loader
        yields no batches.
    """
    test_dataframe = pd.read_csv(path+'/data/tweet_samples_100.csv', sep=",")

    bert_model = transformers.DistilBertModel.from_pretrained(options.model_name)
    tokenizer = transformers.AutoTokenizer.from_pretrained(options.model_name, use_fast=True)

    # Tokenize with the same max_length the training loaders use, so inputs are
    # truncated consistently at train and test time.
    test_loader = make_loaders(test_dataframe, tokenizer, mode="test",
                               max_length=getattr(options, "max_length", None))
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = CustomModel(bert_model,
                        options.num_labels,
                        dropout=options.dropout).to(device)
    model.load_state_dict(torch.load(f"{options.model_path}/{options.model_save_name}", 
                                     map_location=device))
    model.eval()

    # Collect per-batch outputs and concatenate once at the end instead of
    # re-allocating the growing tensor on every batch.
    batch_preds = []
    with torch.no_grad():
        for batch in tqdm(test_loader):
            batch = {k: v.to(device) for k, v in batch.items()}
            batch_preds.append(model(batch))

    if not batch_preds:
        return None
    return torch.cat(batch_preds, dim=0)

In [None]:
def test_all_models(options):
    """Average the predictions of every fold's saved model.

    For each fold index below ``options.n_folds``, points
    ``options.model_save_name`` at that fold's checkpoint and scores the test
    data with it. The per-fold outputs are stacked, their shape is printed and
    their mean over the fold axis is returned (voting between the models would
    be an alternative to averaging).
    """
    per_fold_preds = []
    for fold in range(options.n_folds):
        options.model_save_name = f"model_fold_{fold}.pt"
        per_fold_preds.append(test_one_model(options))

    stacked = torch.stack(per_fold_preds, dim=0)
    print(stacked.shape)
    return stacked.mean(0)

In [None]:
# Re-read the labelled sample so the predictions can be compared with its targets.
test_dataframe = pd.read_csv(path+'/data/tweet_samples_100.csv', sep=",")
all_preds = test_all_models(options)
# Predicted class per row: arg-max over the fold-averaged model outputs.
predictions = all_preds.argmax(dim=1).cpu().numpy()
true_labels = np.array(test_dataframe["target"].tolist())

In [None]:
# Number of rows whose predicted class matches the true label.
np.sum(predictions == true_labels)

In [None]:
# Positions at which the predicted class differs from the true label.
mismatched_indices = [
    position
    for position in range(len(predictions))
    if predictions[position] != true_labels[position]
]
mismatched_indices