# Preparation

In [16]:
# Toggle for running on Google Colab; when True the branch below installs the
# dependencies and mounts Google Drive.
use_colab = True
if use_colab:
    # Install HuggingFace transformers together with its sentencepiece extra.
    !pip install transformers[sentencepiece]
    from google.colab import drive
    # Mount Google Drive under /content/drive so the project folder is reachable.
    drive.mount('/content/drive')
    # Project root; later cells build the data and model locations from it.
    # NOTE(review): `path` is only bound when use_colab is True, so cells that
    # read it raise NameError otherwise — confirm whether a local fallback is needed.
    path = "/content/drive/MyDrive/CAPP30255_Project/twitter_disaster_detection"


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece!=0.1.92,>=0.1.91 (from transformers[sentencepiece])
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting protobuf<=3.20.2 (from transformers[sentencepiece])
  Downloading protobuf-3.20.2-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m71.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentencepiece, protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 3.20.3
    Uninstalling protobuf-3.20.3:
      Successfully uninstalled protobuf-3.20.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that a

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
import os
import copy
import pandas as pd
import numpy as np
from tqdm.autonotebook import tqdm

import torch
import torch.nn as nn


from sklearn.model_selection import train_test_split, KFold

# importing HuggingFace transformers library
import transformers
from transformers import pipeline, get_linear_schedule_with_warmup

print(transformers.__version__)


4.29.1


In [18]:
# Load the pipe-separated training file from the project folder.
data = pd.read_csv(path+'/data/cleaned-train-tweets.csv', sep="|")
# Show five random rows as a quick sanity check of the columns.
display(data.sample(5))


Unnamed: 0,id,keyword,location,text,target,clean_text
15504,15504,radiation%20emergency,Warszawa,Radioactive Box Quarantined - Israel‰Ûªs Ashdo...,1,radioactive box quarantined ashdod port evacua...
9547,9547,burning%20buildings,,@foxnewsvideo @AIIAmericanGirI @ANHQDC So ... ...,1,foxnewsvideo aiiamericangiri anhqdc rioter loo...
55,55,ablaze,USA,#Kurds trampling on Turkmen flag later set it ...,1,kurd trampling turkmen flag later set ablaze o...
9375,9375,buildings%20burning,US of Eh,.@denisleary Not sure how these folks rush int...,1,denisleary sure folk rush burning building gra...
2736,2736,devastated,,abcnews - Obama Declares Disaster for Typhoon-...,1,abcnews obama declares disaster saipan obama s...


# BART without fine-tuning

Reference: https://huggingface.co/facebook/bart-large-mnli

In [56]:
# Zero-shot classification pipeline backed by the facebook/bart-large-mnli checkpoint.
simple_model = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

In [63]:
candidate_labels=["not disaster", "disaster"]
def model_simple_test(i):
    """Classify one tweet with the zero-shot pipeline and print the outcome.

    Looks up row label ``i`` in the global ``data`` frame, sends its
    ``clean_text`` to the global ``simple_model`` together with
    ``candidate_labels``, then prints the raw response, the ground-truth
    label derived from ``target`` and the label with the highest score.
    """
    tweet = data["clean_text"][i]
    response = simple_model(tweet, candidate_labels=candidate_labels)
    print(response)

    # A truthy target marks the tweet as describing a disaster.
    if data["target"][i]:
        true_label = "disaster"
    else:
        true_label = "not disaster"
    print("True Label:", true_label)

    best_position = np.argmax(response["scores"])
    predicted_label = response["labels"][best_position]
    print("Prediction:", predicted_label)


In [70]:
display(data.sample(5))

Unnamed: 0,id,keyword,location,text,target,clean_text
7413,7413,wounded,USA,One man fatally shot another wounded on Vermon...,1,one man fatally shot another wounded vermont s...
7736,7736,accident,,@RobynJilllian @WlSDOMTEETHS I feel like I'm g...,0,robynjilllian wlsdomteeths feel like going acc...
8780,8780,blight,"Cleveland, OH",Look for my Policy Matters Ohio report on #CLE...,0,look policy matter ohio report cle cuyahoga co...
1422,1422,casualties,"Absecon, NJ",#Civilian casualties in Afghanistan hit highes...,1,civilian casualty afghanistan hit highest numb...
4180,4180,hazard,Australia,#Lifestyle Û÷It makes me sickÛª: Baby clothe...,0,lifestyle make baby clothes deemed http http


In [71]:
model_simple_test(7413)

{'sequence': 'one man fatally shot another wounded vermont street buffalo http', 'labels': ['disaster', 'not disaster'], 'scores': [0.9108261466026306, 0.08917386829853058]}
True Label: disaster
Prediction: disaster


In [72]:
model_simple_test(7736)

{'sequence': 'robynjilllian wlsdomteeths feel like going accident teesha gon na come', 'labels': ['disaster', 'not disaster'], 'scores': [0.9694749116897583, 0.030525073409080505]}
True Label: not disaster
Prediction: disaster


In [73]:
model_simple_test(8780)

{'sequence': 'look policy matter ohio report cle cuyahoga county blight greening vacant land soon http', 'labels': ['disaster', 'not disaster'], 'scores': [0.8929414749145508, 0.10705850273370743]}
True Label: not disaster
Prediction: disaster


# DistilBERT model
Reference: https://www.kaggle.com/code/moeinshariatnia/simple-distilbert-fine-tuning-0-84-lb

## Preparation

## Building A PyTorch Dataset

The following code uses the idea from this tutorial [Fine-tuning with custom datasets](https://huggingface.co/transformers/v3.2.0/custom_datasets.html) on building a custom dataset:


In [5]:
class TweetDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes the ``text`` column of a dataframe up front.

    Args:
        dataframe: frame with a ``text`` column and, unless ``mode`` is
            ``"test"``, a ``target`` column.
        tokenizer: callable invoked once as ``tokenizer(texts, padding=True,
            truncation=True, max_length=max_length)``; the result must offer
            ``.items()`` whose values can be indexed by row position.
        mode: ``"test"`` leaves the labels out; any other value includes them.
        max_length: forwarded to the tokenizer unchanged.
    """

    def __init__(self, dataframe, tokenizer, mode="train", max_length=None):
        self.mode = mode
        self.dataframe = dataframe
        # Targets only exist for labelled splits.
        if mode != "test":
            self.targets = dataframe['target'].values
        # Every text is tokenized in a single call, before any item is requested.
        self.encodings = tokenizer(
            list(dataframe['text'].values),
            padding=True,
            truncation=True,
            max_length=max_length,
        )

    def __getitem__(self, idx):
        """Return a dict of tensors for row ``idx`` (plus ``labels`` when available)."""
        item = {}
        # One tensor per field produced by the tokenizer, under the same key.
        for key, values in self.encodings.items():
            item[key] = torch.tensor(values[idx])
        if self.mode == "test":
            # Unlabelled split: nothing more to attach.
            return item
        item['labels'] = torch.tensor(self.targets[idx])
        return item

    def __len__(self):
        """Number of rows in the wrapped dataframe."""
        return len(self.dataframe)

A thin wrapper that makes it easier to build the Dataset and the DataLoader:

In [6]:
def make_loaders(dataframe, tokenizer, mode="train", max_length=None,
                 batch_size=None, num_workers=None):
    """Build a ``DataLoader`` over a ``TweetDataset`` for the given dataframe.

    Args:
        dataframe: frame handed to ``TweetDataset``.
        tokenizer: tokenizer handed to ``TweetDataset``.
        mode: split name; batches are shuffled only when it equals ``"train"``.
        max_length: forwarded to ``TweetDataset``.
        batch_size: batch size; when ``None`` the value is read from the
            notebook-level ``options`` object (the original behaviour).
        num_workers: loader worker count; when ``None`` it is read from the
            notebook-level ``options`` object (the original behaviour).

    Returns:
        A ``torch.utils.data.DataLoader`` over the tokenized dataset.
    """
    # Fall back to the global configuration only when the caller gave no
    # explicit value, so the function can also be used before `options` exists.
    if batch_size is None:
        batch_size = options.batch_size
    if num_workers is None:
        num_workers = options.num_workers

    dataset = TweetDataset(dataframe, tokenizer, mode, max_length=max_length)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             shuffle=(mode == "train"),
                                             num_workers=num_workers)
    return dataloader

## Custom Classification Model based on DistilBERT

* DistilBERT is a language model that needs to be fine-tuned on the final task of interest, so we need to build a custom classification head here. The [BERT paper](https://arxiv.org/abs/1810.04805) introduces special tokens named [CLS] and [SEP] that are added to the sequence fed to the model. [CLS] is placed at the beginning of the sequence, and [SEP] tokens mark the end of each part of the sequence (a sequence fed to a BERT model can be made up of two parts, e.g. a question and the corresponding text). 
 
* In the paper they explain that the hidden-state representation of [CLS] is used for sequence classification tasks, so we are going to do the same. The DistilBERT model produces a vector of size 768 as the hidden representation of this [CLS] token, and we pass it through a couple of nn.Linear layers to perform our own specific task. 

In [7]:
class CustomModel(nn.Module):
    """Encoder followed by a two-layer classification head on the first token.

    Args:
        bert_model: module called with ``input_ids`` and ``attention_mask``
            keyword arguments whose output exposes ``last_hidden_state``.
        num_labels: size of the logits produced by the head.
        bert_hidden_dim: width of the encoder's hidden states.
        classifier_hidden_dim: width of the head's intermediate layer.
        dropout: dropout probability inside the head; ``None`` disables it.
    """

    def __init__(self,
                 bert_model,
                 num_labels,
                 bert_hidden_dim=768,
                 classifier_hidden_dim=768,
                 dropout=None):
        super().__init__()
        self.bert_model = bert_model

        layers = [nn.Linear(bert_hidden_dim, classifier_hidden_dim), nn.ReLU()]
        # Keep a placeholder module when dropout is off so the head always has
        # four entries and the final layer keeps the same position.
        if dropout is None:
            layers.append(nn.Identity())
        else:
            layers.append(nn.Dropout(dropout))
        layers.append(nn.Linear(classifier_hidden_dim, num_labels))
        self.head = nn.Sequential(*layers)

    def forward(self, batch):
        """Return logits for a batch dict holding ``input_ids`` and ``attention_mask``."""
        encoded = self.bert_model(input_ids=batch['input_ids'],
                                  attention_mask=batch['attention_mask'])
        # Position 0 along the sequence axis is the first token of every
        # sequence; its hidden state is what the head classifies.
        first_token_state = encoded.last_hidden_state[:, 0, :]
        return self.head(first_token_state)

## Training and Evaluation functions

In [8]:
class AvgMeter:
    """Running weighted average of a scalar metric.

    Attributes:
        name: label used by ``repr``.
        sum: accumulated ``val * count`` over all updates.
        count: accumulated weight.
        avg: ``sum / count`` after the latest update (0 right after a reset).
    """

    def __init__(self, name="Metric"):
        self.name = name
        self.reset()

    def reset(self):
        """Zero the accumulated statistics."""
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, count=1):
        """Fold in ``val`` observed ``count`` times and refresh the average."""
        self.count += count
        self.sum += val * count
        self.avg = self.sum / self.count

    def __repr__(self):
        return f"{self.name}: {self.avg:.4f}"

def one_epoch(model, criterion, loader, device, optimizer=None, lr_scheduler=None, mode="train", step="batch"):
    """Run one pass over ``loader`` and track the average loss and accuracy.

    Args:
        model: callable taking the batch dict and returning logits.
        criterion: loss called as ``criterion(preds, batch['labels'])``.
        loader: iterable of dict batches with at least ``input_ids`` and
            ``labels``; every value must support ``.to(device)``.
        device: device each batch tensor is moved to.
        optimizer: required when ``mode == "train"``; unused otherwise.
        lr_scheduler: stepped after every optimizer step when ``mode`` is
            ``"train"`` and ``step == "batch"``; may be ``None``.
        mode: ``"train"`` enables the backward pass and optimizer step.
        step: ``"batch"`` to step the scheduler per batch; any other value
            leaves scheduler stepping to the caller.

    Returns:
        ``(loss_meter, acc_meter)``: two ``AvgMeter`` objects weighted by
        batch size.
    """
    is_training = mode == "train"
    loss_meter = AvgMeter()
    acc_meter = AvgMeter()

    tqdm_object = tqdm(loader, total=len(loader))
    for batch in tqdm_object:
        batch = {k: v.to(device) for k, v in batch.items()}
        preds = model(batch)
        loss = criterion(preds, batch['labels'])
        if is_training:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Both defaults (step="batch", lr_scheduler=None) can be in effect
            # at once, so only step a scheduler that was actually supplied.
            if step == "batch" and lr_scheduler is not None:
                lr_scheduler.step()

        # Weight the running averages by the number of samples in the batch.
        count = batch['input_ids'].size(0)
        loss_meter.update(loss.item(), count)

        accuracy = get_accuracy(preds.detach(), batch['labels'])
        acc_meter.update(accuracy.item(), count)
        if is_training:
            tqdm_object.set_postfix(loss=loss_meter.avg, accuracy=acc_meter.avg, lr=get_lr(optimizer))
        else:
            tqdm_object.set_postfix(loss=loss_meter.avg, accuracy=acc_meter.avg)

    return loss_meter, acc_meter

def get_lr(optimizer):
    """Return the ``"lr"`` of the optimizer's first parameter group.

    Returns ``None`` when the optimizer has no parameter groups.
    """
    first_group = next(iter(optimizer.param_groups), None)
    if first_group is None:
        return None
    return first_group["lr"]

def get_accuracy(preds, targets):
    """Fraction of rows whose highest-scoring class equals the target.

    preds shape: (batch_size, num_labels)
    targets shape: (batch_size)

    Returns a scalar float tensor.
    """
    predicted_classes = torch.argmax(preds, dim=1)
    is_correct = predicted_classes == targets
    return is_correct.float().mean()

In [9]:
def train_eval(epochs, model, train_loader, valid_loader, 
               criterion, optimizer, device, options, lr_scheduler=None):
    """Train for ``epochs`` epochs, validating and checkpointing after each one.

    After every epoch the model is evaluated on ``valid_loader``. Whenever the
    validation loss improves, the model's ``state_dict`` is written to
    ``{options.model_path}/{options.model_save_name}``. If ``lr_scheduler`` is
    a ``ReduceLROnPlateau`` it is stepped with the validation loss, and when
    that step changes the learning rate the best checkpoint is loaded back
    into ``model`` before training continues.

    Args:
        epochs: number of epochs to run.
        model: module to train; modified in place.
        train_loader: loader of training batches.
        valid_loader: loader of validation batches.
        criterion: loss function forwarded to ``one_epoch``.
        optimizer: optimizer forwarded to ``one_epoch`` for training.
        device: device used for batches and for ``map_location`` on reload.
        options: object providing ``model_path``, ``model_save_name`` and ``step``.
        lr_scheduler: optional learning-rate scheduler.

    Returns:
        None. Progress is printed and the best weights are saved to disk.
    """
    # The first torch.save would fail if the target folder did not exist yet.
    if options.model_path:
        os.makedirs(options.model_path, exist_ok=True)
    checkpoint_path = f'{options.model_path}/{options.model_save_name}'

    best_loss = float('inf')

    for epoch in range(epochs):
        print("*" * 30)
        print(f"Epoch {epoch + 1}")
        # Remembered so a learning-rate change made by the scheduler below can be detected.
        current_lr = get_lr(optimizer)

        model.train()
        train_loss, train_acc = one_epoch(model,
                                          criterion,
                                          train_loader,
                                          device,
                                          optimizer=optimizer,
                                          lr_scheduler=lr_scheduler,
                                          mode="train",
                                          step=options.step)
        model.eval()
        with torch.no_grad():
            valid_loss, valid_acc = one_epoch(model,
                                              criterion,
                                              valid_loader,
                                              device,
                                              optimizer=None,
                                              lr_scheduler=None,
                                              mode="valid")

        if valid_loss.avg < best_loss:
            best_loss = valid_loss.avg
            torch.save(model.state_dict(), checkpoint_path)
            print("Saved best model!")

        # Epoch-level scheduler: stepped here rather than inside one_epoch.
        if isinstance(lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
            lr_scheduler.step(valid_loss.avg)
            # If the scheduler just lowered the learning rate, restart from the
            # best checkpoint so training continues from there with the new rate.
            if current_lr != get_lr(optimizer):
                print("Loading best model weights!")
                model.load_state_dict(torch.load(checkpoint_path,
                                                 map_location=device))

        print(f"Train Loss: {train_loss.avg:.5f}")
        print(f"Train Accuracy: {train_acc.avg:.5f}")

        print(f"Valid Loss: {valid_loss.avg:.5f}")
        print(f"Valid Accuracy: {valid_acc.avg:.5f}")
        print("*" * 30)

In [10]:
path

'/content/drive/MyDrive/CAPP30255_Project/twitter_disaster_detection'

In [11]:
class Options:
    """Hyper-parameters and file locations shared by the training and inference cells."""

    # --- model ---
    model_name = 'distilbert-base-uncased'  # checkpoint name given to from_pretrained
    num_labels = 2                          # size of the classification head's output
    dropout = 0.5                           # dropout probability inside the head
    max_length = 140                        # max_length handed to the tokenizer

    # --- optimisation ---
    epochs = 10
    batch_size = 64
    num_workers = 2                         # DataLoader worker processes
    learning_rate = 3e-5
    scheduler = "ReduceLROnPlateau"         # one_fold also recognises "LinearWarmup"
    patience = 2                            # patience of ReduceLROnPlateau

    # --- cross-validation ---
    n_folds = 5

    # --- checkpoints ---
    model_path = path + "/models/"          # folder the checkpoints are written to
    model_save_name = "model.pt"            # replaced by a per-fold file name in one_fold

## Taking care of Cross Validation

In [12]:
def make_folds(dataframe, n_splits=5):
    """Add an integer ``fold`` column assigning each row to a validation fold.

    Rows are split with a shuffled ``KFold`` (``random_state=42``), so the
    assignment is the same on every call for a frame of the same length.

    Args:
        dataframe: frame with an ``id`` column; modified in place.
        n_splits: number of folds.

    Returns:
        The same ``dataframe`` object, now carrying the ``fold`` column.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    # KFold yields positions, not index labels. Filling a positional array
    # and assigning it as a whole column works for any dataframe index and
    # keeps the column as integers.
    fold_ids = np.full(len(dataframe), -1, dtype=int)
    for fold, (_, valid_idx) in enumerate(kf.split(X=dataframe['id'])):
        fold_ids[valid_idx] = fold
    dataframe['fold'] = fold_ids
    return dataframe

In [13]:
def one_fold(fold, options):  
    """Fine-tune a fresh DistilBERT classifier with ``fold`` held out for validation.

    Loads the pretrained encoder and tokenizer named by ``options.model_name``,
    reads the training file, splits it with ``make_folds``, builds the loaders,
    model, optimizer and scheduler, and runs ``train_eval``.

    Side effects on ``options``: sets ``options.step`` (``"epoch"`` or
    ``"batch"``, depending on the scheduler) and ``options.model_save_name``
    (``model_fold_{fold}.pt``).

    Args:
        fold: index of the fold used as the validation split.
        options: configuration object (see ``Options``).

    Raises:
        ValueError: if ``options.scheduler`` is not ``"ReduceLROnPlateau"``
            or ``"LinearWarmup"``.
    """
    print(f"Training Fold: {fold}")
    
    # Here, we load the pre-trained DistilBERT model from transformers library
    bert_model = transformers.DistilBertModel.from_pretrained(options.model_name)
    # Loading the corresponding tokenizer from HuggingFace by using AutoTokenizer class.
    tokenizer = transformers.AutoTokenizer.from_pretrained(options.model_name, use_fast=True)
    
    dataframe = pd.read_csv(path+'/data/cleaned-train-tweets.csv', sep="|")
    dataframe = make_folds(dataframe, n_splits=options.n_folds)
    train_dataframe = dataframe[dataframe['fold'] != fold].reset_index(drop=True)
    valid_dataframe = dataframe[dataframe['fold'] == fold].reset_index(drop=True)

    train_loader = make_loaders(train_dataframe, tokenizer, "train", options.max_length)
    valid_loader = make_loaders(valid_dataframe, tokenizer, "valid", options.max_length)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = CustomModel(bert_model, options.num_labels, dropout=options.dropout).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=options.learning_rate)
    if options.scheduler == "ReduceLROnPlateau":
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 
                                                                  mode="min", 
                                                                  factor=0.5, 
                                                                  patience=options.patience)
        
        # when to step the scheduler: after an epoch or after a batch
        options.step = "epoch"
        
    elif options.scheduler == "LinearWarmup":
        num_train_steps = len(train_loader) * options.epochs
        lr_scheduler = get_linear_schedule_with_warmup(optimizer, 
                                                       num_warmup_steps=0, 
                                                       num_training_steps=num_train_steps)
        
        # when to step the scheduler: after an epoch or after a batch
        options.step = "batch"

    else:
        # Without this branch an unknown name would leave `lr_scheduler`
        # unbound and fail later with a much less helpful error.
        raise ValueError(
            f"Unsupported scheduler {options.scheduler!r}; "
            "expected 'ReduceLROnPlateau' or 'LinearWarmup'"
        )
    
    criterion = nn.CrossEntropyLoss()
    # Each fold writes its own checkpoint file.
    options.model_save_name = f"model_fold_{fold}.pt"
    train_eval(options.epochs, model, train_loader, valid_loader,
               criterion, optimizer, device, options, lr_scheduler=lr_scheduler)

In [14]:
def train_folds(options):
    """Train one model per cross-validation fold, for folds 0 to ``options.n_folds - 1``."""
    for fold_index in range(options.n_folds):
        one_fold(fold=fold_index, options=options)

In [15]:
# Instantiate the configuration and train every cross-validation fold.
options = Options()
train_folds(options)

Training Fold: 0


Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

******************************
Epoch 1


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Saved best model!
Train Loss: 0.40467
Train Accuracy: 0.82981
Valid Loss: 0.33592
Valid Accuracy: 0.86008
******************************
******************************
Epoch 2


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Saved best model!
Train Loss: 0.27001
Train Accuracy: 0.89775
Valid Loss: 0.29936
Valid Accuracy: 0.88714
******************************
******************************
Epoch 3


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Saved best model!
Train Loss: 0.17828
Train Accuracy: 0.93511
Valid Loss: 0.28189
Valid Accuracy: 0.90392
******************************
******************************
Epoch 4


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Train Loss: 0.11827
Train Accuracy: 0.95683
Valid Loss: 0.29952
Valid Accuracy: 0.90717
******************************
******************************
Epoch 5


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Train Loss: 0.07836
Train Accuracy: 0.97199
Valid Loss: 0.31881
Valid Accuracy: 0.91394
******************************
******************************
Epoch 6


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Loading best model weights!
Train Loss: 0.06202
Train Accuracy: 0.97496
Valid Loss: 0.32987
Valid Accuracy: 0.91827
******************************
******************************
Epoch 7


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Train Loss: 0.09710
Train Accuracy: 0.96542
Valid Loss: 0.28857
Valid Accuracy: 0.91340
******************************
******************************
Epoch 8


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Train Loss: 0.07424
Train Accuracy: 0.97158
Valid Loss: 0.30627
Valid Accuracy: 0.91502
******************************
******************************
Epoch 9


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Loading best model weights!
Train Loss: 0.06061
Train Accuracy: 0.97720
Valid Loss: 0.31126
Valid Accuracy: 0.91989
******************************
******************************
Epoch 10


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Train Loss: 0.09795
Train Accuracy: 0.96508
Valid Loss: 0.28736
Valid Accuracy: 0.91529
******************************
Training Fold: 1


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


******************************
Epoch 1


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Saved best model!
Train Loss: 0.40743
Train Accuracy: 0.82420
Valid Loss: 0.34070
Valid Accuracy: 0.86062
******************************
******************************
Epoch 2


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Saved best model!
Train Loss: 0.26869
Train Accuracy: 0.89769
Valid Loss: 0.29078
Valid Accuracy: 0.89256
******************************
******************************
Epoch 3


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Saved best model!
Train Loss: 0.17002
Train Accuracy: 0.93727
Valid Loss: 0.28136
Valid Accuracy: 0.90068
******************************
******************************
Epoch 4


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Train Loss: 0.10634
Train Accuracy: 0.96170
Valid Loss: 0.28935
Valid Accuracy: 0.91394
******************************
******************************
Epoch 5


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Train Loss: 0.07505
Train Accuracy: 0.97097
Valid Loss: 0.34897
Valid Accuracy: 0.90582
******************************
******************************
Epoch 6


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Loading best model weights!
Train Loss: 0.06349
Train Accuracy: 0.97469
Valid Loss: 0.32635
Valid Accuracy: 0.91421
******************************
******************************
Epoch 7


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Train Loss: 0.09306
Train Accuracy: 0.96583
Valid Loss: 0.29293
Valid Accuracy: 0.91800
******************************
******************************
Epoch 8


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Train Loss: 0.06807
Train Accuracy: 0.97422
Valid Loss: 0.29599
Valid Accuracy: 0.92124
******************************
******************************
Epoch 9


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Loading best model weights!
Train Loss: 0.05456
Train Accuracy: 0.97855
Valid Loss: 0.32851
Valid Accuracy: 0.92314
******************************
******************************
Epoch 10


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Train Loss: 0.09011
Train Accuracy: 0.96671
Valid Loss: 0.28876
Valid Accuracy: 0.91340
******************************
Training Fold: 2


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


******************************
Epoch 1


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Saved best model!
Train Loss: 0.40176
Train Accuracy: 0.82860
Valid Loss: 0.33466
Valid Accuracy: 0.86360
******************************
******************************
Epoch 2


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Saved best model!
Train Loss: 0.26913
Train Accuracy: 0.89938
Valid Loss: 0.29992
Valid Accuracy: 0.89066
******************************
******************************
Epoch 3


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Saved best model!
Train Loss: 0.16813
Train Accuracy: 0.94099
Valid Loss: 0.27959
Valid Accuracy: 0.91258
******************************
******************************
Epoch 4


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Saved best model!
Train Loss: 0.10483
Train Accuracy: 0.96123
Valid Loss: 0.26572
Valid Accuracy: 0.90934
******************************
******************************
Epoch 5


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Train Loss: 0.07766
Train Accuracy: 0.97158
Valid Loss: 0.27283
Valid Accuracy: 0.91935
******************************
******************************
Epoch 6


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Train Loss: 0.06159
Train Accuracy: 0.97530
Valid Loss: 0.28737
Valid Accuracy: 0.92936
******************************
******************************
Epoch 7


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Loading best model weights!
Train Loss: 0.04693
Train Accuracy: 0.98051
Valid Loss: 0.34778
Valid Accuracy: 0.92449
******************************
******************************
Epoch 8


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Train Loss: 0.06031
Train Accuracy: 0.97618
Valid Loss: 0.29460
Valid Accuracy: 0.91610
******************************
******************************
Epoch 9


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Train Loss: 0.05087
Train Accuracy: 0.98024
Valid Loss: 0.33414
Valid Accuracy: 0.91394
******************************
******************************
Epoch 10


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Loading best model weights!
Train Loss: 0.04283
Train Accuracy: 0.98200
Valid Loss: 0.31877
Valid Accuracy: 0.92097
******************************
Training Fold: 3


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


******************************
Epoch 1


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Saved best model!
Train Loss: 0.40565
Train Accuracy: 0.82475
Valid Loss: 0.33352
Valid Accuracy: 0.86681
******************************
******************************
Epoch 2


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Saved best model!
Train Loss: 0.26642
Train Accuracy: 0.90216
Valid Loss: 0.27814
Valid Accuracy: 0.90065
******************************
******************************
Epoch 3


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Saved best model!
Train Loss: 0.16667
Train Accuracy: 0.94309
Valid Loss: 0.25376
Valid Accuracy: 0.91121
******************************
******************************
Epoch 4


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Train Loss: 0.10356
Train Accuracy: 0.96245
Valid Loss: 0.25526
Valid Accuracy: 0.91770
******************************
******************************
Epoch 5


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Train Loss: 0.07426
Train Accuracy: 0.97070
Valid Loss: 0.25716
Valid Accuracy: 0.91987
******************************
******************************
Epoch 6


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Loading best model weights!
Train Loss: 0.06236
Train Accuracy: 0.97483
Valid Loss: 0.26427
Valid Accuracy: 0.92583
******************************
******************************
Epoch 7


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Train Loss: 0.09275
Train Accuracy: 0.96637
Valid Loss: 0.25500
Valid Accuracy: 0.92068
******************************
******************************
Epoch 8


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Train Loss: 0.07122
Train Accuracy: 0.97408
Valid Loss: 0.26713
Valid Accuracy: 0.92474
******************************
******************************
Epoch 9


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Loading best model weights!
Train Loss: 0.05715
Train Accuracy: 0.97720
Valid Loss: 0.28099
Valid Accuracy: 0.92610
******************************
******************************
Epoch 10


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Saved best model!
Train Loss: 0.09210
Train Accuracy: 0.96624
Valid Loss: 0.24878
Valid Accuracy: 0.91825
******************************
Training Fold: 4


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


******************************
Epoch 1


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Saved best model!
Train Loss: 0.40136
Train Accuracy: 0.83179
Valid Loss: 0.33946
Valid Accuracy: 0.86167
******************************
******************************
Epoch 2


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Saved best model!
Train Loss: 0.27050
Train Accuracy: 0.89715
Valid Loss: 0.27359
Valid Accuracy: 0.89903
******************************
******************************
Epoch 3


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Saved best model!
Train Loss: 0.17204
Train Accuracy: 0.93755
Valid Loss: 0.24969
Valid Accuracy: 0.91689
******************************
******************************
Epoch 4


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Train Loss: 0.11069
Train Accuracy: 0.95940
Valid Loss: 0.25204
Valid Accuracy: 0.91391
******************************
******************************
Epoch 5


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Train Loss: 0.07725
Train Accuracy: 0.96901
Valid Loss: 0.26644
Valid Accuracy: 0.92934
******************************
******************************
Epoch 6


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Loading best model weights!
Train Loss: 0.05945
Train Accuracy: 0.97442
Valid Loss: 0.28218
Valid Accuracy: 0.92501
******************************
******************************
Epoch 7


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Train Loss: 0.09622
Train Accuracy: 0.96542
Valid Loss: 0.24990
Valid Accuracy: 0.92637
******************************
******************************
Epoch 8


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Train Loss: 0.07017
Train Accuracy: 0.97314
Valid Loss: 0.28037
Valid Accuracy: 0.92393
******************************
******************************
Epoch 9


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Loading best model weights!
Train Loss: 0.05633
Train Accuracy: 0.97781
Valid Loss: 0.29385
Valid Accuracy: 0.92610
******************************
******************************
Epoch 10


  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Train Loss: 0.09421
Train Accuracy: 0.96556
Valid Loss: 0.26456
Valid Accuracy: 0.92149
******************************


In [None]:
def test_one_model(options):  
    """Predict logits for the test set with the checkpoint named in ``options``.

    Builds a fresh DistilBERT-based ``CustomModel``, loads the weights stored
    at ``{options.model_path}/{options.model_save_name}`` and runs it over the
    test file in evaluation mode.

    Args:
        options: configuration object (see ``Options``).

    Returns:
        A tensor of logits with one row per test example, on the device used
        for inference, or ``None`` if the loader produced no batches.
    """
    # NOTE(review): this relative location differs from the Drive-based `path`
    # used by the training cells — confirm it resolves in the current runtime.
    test_dataframe = pd.read_csv("../input/nlp-getting-started/test.csv")

    bert_model = transformers.DistilBertModel.from_pretrained(options.model_name)
    tokenizer = transformers.AutoTokenizer.from_pretrained(options.model_name, use_fast=True)
    
    # Use the same max_length as the training loaders so inputs are truncated
    # identically at inference time.
    test_loader = make_loaders(test_dataframe, tokenizer, mode="test",
                               max_length=getattr(options, "max_length", None))
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = CustomModel(bert_model, options.num_labels, dropout=options.dropout).to(device)
    model.load_state_dict(torch.load(f"{options.model_path}/{options.model_save_name}", 
                                     map_location=device))
    model.eval()
    
    # Collect per-batch outputs and concatenate once at the end, rather than
    # re-copying the growing tensor on every iteration.
    batch_preds = []
    with torch.no_grad():
        for batch in tqdm(test_loader):
            batch = {k: v.to(device) for k, v in batch.items()}
            batch_preds.append(model(batch))
    
    if not batch_preds:
        return None
    return torch.cat(batch_preds, dim=0)

In [None]:
def test_all_models(options):
    """Average the test-set predictions of all fold checkpoints.

    For each fold index, ``options.model_save_name`` is pointed at that fold's
    checkpoint and ``test_one_model`` is run. The per-fold outputs are stacked
    along a new leading axis, their shape is printed, and the mean over that
    axis is returned. Voting between the models would be an alternative to
    averaging.
    """
    per_fold_preds = []
    for fold in range(options.n_folds):
        # Select this fold's checkpoint before running inference.
        options.model_save_name = f"model_fold_{fold}.pt"
        per_fold_preds.append(test_one_model(options))

    stacked = torch.stack(per_fold_preds, dim=0)
    print(stacked.shape)
    return stacked.mean(0)

In [None]:
# Ensemble the fold models and take the highest-scoring class for each test row.
all_preds = test_all_models(options)
predictions = all_preds.argmax(dim=1).cpu().numpy()
# NOTE(review): this relative location differs from the Drive-based `path` used
# by the training cells — confirm it resolves in the current runtime.
sample_submission = pd.read_csv("../input/nlp-getting-started/sample_submission.csv")
# assumes sample_submission has one row per test example, in the same order — TODO confirm
sample_submission['target'] = predictions
sample_submission.to_csv("sample_submission.csv", index=False)
# Read the file back so the written submission is shown as the cell output.
pd.read_csv("sample_submission.csv")