# Preparation

In [1]:
use_colab = True
if use_colab:
    !pip install transformers[sentencepiece]
    from google.colab import drive
    drive.mount('/content/drive')
    path = "/content/drive/MyDrive/CAPP30255_Project/twitter_disaster_detection"


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers[sentencepiece]
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m38.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers[sentencepiece])
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers[sentencepiece])
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m62.1 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece!=0.1.92,>=0.1.91 (from transformers[sentencepiece])
  Downloading sentencepiece-0.1.99-cp310-cp310-m

Mounted at /content/drive


In [2]:
import os
import copy
import pandas as pd
import numpy as np
# from tqdm.autonotebook import tqdm
from tqdm import tqdm
import random
import json

import torch
import torch.nn as nn

from sklearn.model_selection import train_test_split, KFold

# importing HuggingFace transformers library
import transformers
from transformers import pipeline, get_linear_schedule_with_warmup

# Report the installed transformers version and the device PyTorch will pick
# ("cuda" when a CUDA device is available, otherwise "cpu").
print("Transformer version:", transformers.__version__)
print("torch.device:", torch.device("cuda" if torch.cuda.is_available() else "cpu"))

Transformer version: 4.29.2
torch.device: cuda


In [3]:
# Read the cleaned training tweets (pipe-separated file) and preview five random rows.
data = pd.read_csv(f"{path}/data/cleaned-train-tweets.csv", sep="|")
display(data.sample(5))


Unnamed: 0,id,keyword,location,text,target,clean_text
13246,13246,flooding,Jakarta/Kuala Lumpur/S'pore,Heavy Rainfall and Flooding in Northern #VietN...,1,heavy rainfall flooding northern vietnam situa...
16688,16688,structural%20failure,,Investigators say a fatal Virgin Galactic spac...,1,investigator say fatal virgin galactic spacesh...
3745,3745,fire,,My asshole is on fire https://t.co/Y3FO0gHg8t,0,asshole fire http
14069,14069,injured,Nigeria,Ogun smugglers engage Customs in shootoutåÊ: S...,1,ogun smuggler engage custom shootoutåê several...
3301,3301,evacuate,,Condemnation clearly replacing the latest resp...,1,condemnation clearly replacing latest response...


# BART (zero-shot) without fine-tuning

In [None]:
# Zero-shot classifier built from the facebook/bart-large-mnli checkpoint.
simple_model = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Candidate labels handed to the zero-shot classifier.
# Index 0 is used for a falsy target, index 1 for a truthy target.
labels = ["not disaster", "disaster"]


def model_simple_test(i):
    """Classify one tweet with the zero-shot pipeline and print the outcome.

    Reads row ``i`` of the notebook-level ``data`` frame (``data["text"][i]``
    and ``data["target"][i]``), prints the raw pipeline response, the true
    label and the highest-scoring predicted label.
    """
    tweet = data["text"][i]
    response = simple_model(tweet, candidate_labels=labels)
    print(response)

    if data["target"][i]:
        true_label = labels[1]
    else:
        true_label = labels[0]
    print("True Label:", true_label)

    # The prediction is the label whose score is the largest.
    best = np.argmax(response["scores"])
    print("Prediction:", response["labels"][best])

def calculate_accuracy_zscppl(data, target_ids):
    """Accuracy of the zero-shot pipeline on the selected rows.

    Args:
        data: mapping/DataFrame exposing ``data["text"][i]`` and
            ``data["target"][i]`` for every ``i`` in ``target_ids``.
        target_ids: row keys to evaluate.

    Returns:
        Fraction of rows whose highest-scoring predicted label equals the
        true label, or ``0.0`` when ``target_ids`` is empty (previously this
        raised ``ZeroDivisionError``).
    """
    total = len(target_ids)
    if total == 0:
        # Nothing to evaluate: avoid dividing by zero.
        return 0.0

    accurate_num = 0
    for i in target_ids:
        response = simple_model(
            data["text"][i],
            candidate_labels=labels
        )
        true_label = labels[1] if data["target"][i] else labels[0]
        predicted_label = response["labels"][np.argmax(response["scores"])]
        if true_label == predicted_label:
            accurate_num += 1
    return accurate_num / total


In [None]:
# Draw five random tweets and run the zero-shot classifier on each of them.
data_sample = data.sample(5)
# model_simple_test looks rows up by DataFrame index label (data["text"][i]),
# so iterate over the sample's index labels rather than the "id" column,
# which only works while ids happen to equal the index.
data_sample_ids = data_sample.index.tolist()
display(data_sample)

# `row_label` instead of `id` to avoid shadowing the builtin.
for row_label in data_sample_ids:
    model_simple_test(row_label)

Unnamed: 0,id,keyword,location,text,target,clean_text
3782,3782,fire%20truck,District 12 - Orange County,SIGALERT UPDATE #3***N-133 CLOSED AT 5 FWY UFN...,1,sigalert update closed fwy ufn trash truck fire
9491,9491,burning,,@aubilenon @MarkKriegsman if you think you'd l...,0,aubilenon markkriegsman think like burning man...
8573,8573,blaze,Delhi,#socialmedia news - New Facebook Page Features...,0,socialmedia news new facebook page feature see...
9514,9514,burning,"TÌÁchira, Venezuela",the Burning Legion has returned,0,burning legion returned
14290,14290,landslide,,5 need to-dos seeing as how technical writing ...,0,need seeing technical writing administer apps ...


{'sequence': 'SIGALERT UPDATE #3***N-133 CLOSED AT 5 FWY UFN***- TRASH TRUCK FIRE', 'labels': ['disaster', 'not disaster'], 'scores': [0.9129602909088135, 0.08703970164060593]}
True Label: disaster
Prediction: disaster
{'sequence': "@aubilenon @MarkKriegsman if you think you'd like burning man you should try it because it's the only way to know!", 'labels': ['disaster', 'not disaster'], 'scores': [0.7511258721351624, 0.24887412786483765]}
True Label: not disaster
Prediction: disaster
{'sequence': '#socialmedia news - New Facebook Page Features Seek to Help Personalize the Customer Experience http://t.co/nbizaTlsmV', 'labels': ['not disaster', 'disaster'], 'scores': [0.9289625883102417, 0.07103737443685532]}
True Label: not disaster
Prediction: not disaster
{'sequence': 'the Burning Legion has returned', 'labels': ['disaster', 'not disaster'], 'scores': [0.9670848250389099, 0.0329151451587677]}
True Label: not disaster
Prediction: disaster
{'sequence': '5 need to-dos seeing as how techn

In [None]:
# Estimate zero-shot accuracy on a random subset of rows (drawn with replacement).
num_target = 50
# random.randint includes its upper bound, so randint(0, len(data)) could
# return len(data), which is not a valid row label; randrange excludes it.
target_ids = [random.randrange(len(data)) for _ in range(num_target)]
accuracy_rate = calculate_accuracy_zscppl(data, target_ids)
print(f"Accuracy Rate using Pretrained BERT Zero-Shot-Classification: {accuracy_rate}")


Accuracy Rate using Pretrained BERT Zero-Shot-Classification: 0.6


# DistilBERT model with fine-tuning

## Preparation

### Building A PyTorch Dataset

The following code uses the idea from this tutorial [Fine-tuning with custom datasets](https://huggingface.co/transformers/v3.2.0/custom_datasets.html) on building a custom dataset:


In [4]:
class TweetDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes the ``text`` column of a dataframe once.

    Args:
        dataframe: frame with a ``text`` column and, unless ``mode`` is
            ``"test"``, a ``target`` column.
        tokenizer: callable invoked as
            ``tokenizer(texts, padding=True, truncation=True, max_length=...)``
            whose result exposes ``.items()`` yielding per-field sequences
            indexable by sample position.
        mode: any value other than ``"test"`` makes items carry a ``labels`` entry.
        max_length: forwarded to the tokenizer unchanged.
    """

    def __init__(self, dataframe, tokenizer, mode="train", max_length=None):
        self.dataframe = dataframe
        # Targets only exist for labelled splits.
        if mode != "test":
            self.targets = dataframe['target'].values
        tweet_texts = list(dataframe['text'].values)
        # All texts are encoded in one call at construction time.
        self.encodings = tokenizer(
            tweet_texts,
            padding=True,
            truncation=True,
            max_length=max_length,
        )
        self.mode = mode

    def __getitem__(self, idx):
        """Return a dict with one tensor per tokenizer field (plus ``labels`` when labelled)."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        if self.mode == "test":
            # Unlabelled split: nothing more to attach.
            return sample
        sample['labels'] = torch.tensor(self.targets[idx])
        return sample

    def __len__(self):
        """Number of rows in the wrapped dataframe."""
        return len(self.dataframe)


A thin wrapper that makes it easier to build the Dataset and DataLoader.

In [5]:
def make_loaders(dataframe, tokenizer, mode="train", max_length=None,
                 batch_size=None, num_workers=None):
    """Build a ``TweetDataset`` for ``dataframe`` and wrap it in a DataLoader.

    Args:
        dataframe: frame passed to ``TweetDataset``.
        tokenizer: tokenizer passed to ``TweetDataset``.
        mode: dataset mode; the loader shuffles only when it is ``"train"``.
        max_length: forwarded to ``TweetDataset``.
        batch_size: loader batch size; when ``None`` the notebook-level
            ``options.batch_size`` is used (the original behaviour).
        num_workers: loader worker count; when ``None`` the notebook-level
            ``options.num_workers`` is used (the original behaviour).

    Returns:
        A ``torch.utils.data.DataLoader`` over the dataset.
    """
    # Only reach for the global `options` object when the caller did not
    # supply explicit values, so the helper can be used before it exists.
    if batch_size is None:
        batch_size = options.batch_size
    if num_workers is None:
        num_workers = options.num_workers

    dataset = TweetDataset(dataframe, tokenizer, mode, max_length=max_length)
    return torch.utils.data.DataLoader(dataset,
                                       batch_size=batch_size,
                                       shuffle=(mode == "train"),
                                       num_workers=num_workers)

### Custom Classification Model based on DistilBERT

* DistilBERT is a language model that needs to be fine-tuned on the final task of interest, so we need to build a custom head here. The [BERT paper](https://arxiv.org/abs/1810.04805) introduces special tokens named [CLS] and [SEP], which are added to the sequence fed to the model. [CLS] is placed at the beginning of the sequence, and [SEP] tokens mark the end of each part of a sequence (a sequence fed to a BERT model can be made up of two parts, e.g., a question and its corresponding text).
 
* The paper explains that the hidden-state representation of [CLS] is used for sequence classification tasks, so we are going to do the same. The DistilBERT model produces a vector of size 768 as the hidden representation of this [CLS] token, and we pass it through some nn.Linear layers to do our own specific task.

In [6]:
class CustomModel(nn.Module):
    """Encoder followed by a two-layer classification head on the first token.

    Args:
        bert_model: encoder called as
            ``bert_model(input_ids=..., attention_mask=...)``; its output must
            expose ``last_hidden_state``.
        num_labels: size of the output logits.
        bert_hidden_dim: input width of the head's first linear layer.
        classifier_hidden_dim: hidden width of the head.
        dropout: dropout probability inside the head; ``None`` disables it.
    """

    def __init__(self,
                 bert_model,
                 num_labels,
                 bert_hidden_dim=768,
                 classifier_hidden_dim=768,
                 dropout=None):
        super().__init__()
        self.bert_model = bert_model

        layers = [nn.Linear(bert_hidden_dim, classifier_hidden_dim), nn.ReLU()]
        # With dropout=None an Identity placeholder is inserted so the head
        # always has the same four positions.
        layers.append(nn.Identity() if dropout is None else nn.Dropout(dropout))
        layers.append(nn.Linear(classifier_hidden_dim, num_labels))
        self.head = nn.Sequential(*layers)

    def forward(self, batch):
        """Return logits for a batch dict holding ``input_ids`` and ``attention_mask``."""
        # The tokenizer supplies both the ids and the attention mask.
        encoder_out = self.bert_model(input_ids=batch['input_ids'],
                                      attention_mask=batch['attention_mask'])
        # last_hidden_state: (batch_size, seq_length, bert_hidden_dim);
        # position 0 along the sequence axis is the [CLS] token.
        cls_repr = encoder_out.last_hidden_state[:, 0, :]
        return self.head(cls_repr)


## Training and Evaluation

In [7]:
class AvgMeter:
    """Running, count-weighted average of a scalar metric."""

    def __init__(self, name="Metric"):
        self.name = name
        self.reset()

    def reset(self):
        """Zero the accumulated sum, count and average."""
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, count=1):
        """Add ``val`` weighted by ``count`` samples and refresh the average."""
        self.count += count
        self.sum += val * count
        self.avg = self.sum / self.count

    def __repr__(self):
        return f"{self.name}: {self.avg:.4f}"

def one_epoch(model, criterion, loader, device, optimizer=None, lr_scheduler=None, mode="train", step="batch"):
    """Run one pass over ``loader`` and return ``(loss_meter, acc_meter)``.

    When ``mode`` is ``"train"`` every batch performs a backward pass and an
    optimizer step, and ``lr_scheduler.step()`` is additionally called after
    each batch if ``step`` is ``"batch"``. Any other ``mode`` only measures
    loss and accuracy. The caller decides on ``model.train()`` / ``model.eval()``
    and gradient tracking.
    """
    is_training = mode == "train"
    loss_meter, acc_meter = AvgMeter(), AvgMeter()

    progress = tqdm(loader, total=len(loader))
    for batch in progress:
        # Move every tensor of the batch onto the target device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        targets = batch['labels']

        preds = model(batch)
        loss = criterion(preds, targets)

        if is_training:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if step == "batch":
                lr_scheduler.step()

        # Weight the running averages by the number of samples in the batch.
        n_samples = batch['input_ids'].size(0)
        loss_meter.update(loss.item(), n_samples)
        batch_accuracy = get_accuracy(preds.detach(), targets)
        acc_meter.update(batch_accuracy.item(), n_samples)

        postfix = {"loss": loss_meter.avg, "accuracy": acc_meter.avg}
        if is_training:
            postfix["lr"] = get_lr(optimizer)
        progress.set_postfix(**postfix)

    return loss_meter, acc_meter

def get_lr(optimizer):
    """Return the ``"lr"`` of the optimizer's first parameter group (``None`` if there is none)."""
    first_group = next(iter(optimizer.param_groups), None)
    if first_group is None:
        return None
    return first_group["lr"]

def get_accuracy(preds, targets):
    """
    Fraction of rows whose highest-scoring class matches the target.

    preds shape: (batch_size, num_labels)
    targets shape: (batch_size)
    """
    predicted_classes = preds.argmax(dim=1)
    hits = predicted_classes == targets
    return hits.float().mean()


In [8]:
def train_eval(epochs, model, train_loader, valid_loader, 
               criterion, optimizer, device, options, loss_accuracy, lr_scheduler=None):
    """Train for ``epochs`` epochs, validating and checkpointing after each one.

    Args:
        epochs: number of train/validate rounds.
        model: module being trained; switched between train and eval mode here.
        train_loader, valid_loader: loaders passed to ``one_epoch``.
        criterion: loss function passed to ``one_epoch``.
        optimizer: optimizer used for training and for reading the current LR.
        device: device passed to ``one_epoch`` and used as ``map_location``.
        options: object providing ``model_path``, ``model_save_name`` and ``step``.
        loss_accuracy: dict with the keys "Train Loss", "Train Accuracy",
            "Valid Loss" and "Valid Accuracy"; one value per epoch is
            appended to each list in place.
        lr_scheduler: optional scheduler. A ``ReduceLROnPlateau`` instance is
            stepped here once per epoch with the validation loss.

    The weights with the lowest validation loss so far are written to
    ``{options.model_path}/{options.model_save_name}``. The earlier in-memory
    deep copies of the state dict were never read, so they are no longer made.
    """
    best_loss = float('inf')
    # Same location for saving and reloading; built once.
    checkpoint_path = f'{options.model_path}/{options.model_save_name}'
    
    for epoch in range(epochs):
        print("~" * 30)
        print(f"Epoch {epoch + 1}")
        # Remembered to detect an LR change made by the scheduler below.
        current_lr = get_lr(optimizer)
        
        model.train()
        train_loss, train_acc = one_epoch(model, 
                                          criterion, 
                                          train_loader, 
                                          device,
                                          optimizer=optimizer,
                                          lr_scheduler=lr_scheduler,
                                          mode="train",
                                          step=options.step)                     
        model.eval()
        with torch.no_grad():
            valid_loss, valid_acc = one_epoch(model, 
                                              criterion, 
                                              valid_loader, 
                                              device,
                                              optimizer=None,
                                              lr_scheduler=None,
                                              mode="valid")
        
        if valid_loss.avg < best_loss:
            best_loss = valid_loss.avg
            torch.save(model.state_dict(), checkpoint_path)
            print("Saved best model!")
        
        # or you could do: if step == "epoch":
        if isinstance(lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
            lr_scheduler.step(valid_loss.avg)
            # if the learning rate changes by ReduceLROnPlateau, we are going to
            # reload our previous best model weights and start from there with a lower LR
            if current_lr != get_lr(optimizer):
                print("Loading best model weights!")
                model.load_state_dict(torch.load(checkpoint_path, map_location=device))
        

        print(f"Train Loss: {train_loss.avg:.5f}")
        print(f"Train Accuracy: {train_acc.avg:.5f}")
        
        print(f"Valid Loss: {valid_loss.avg:.5f}")
        print(f"Valid Accuracy: {valid_acc.avg:.5f}")
        print("*" * 30)

        loss_accuracy["Train Loss"] += [train_loss.avg]
        loss_accuracy["Train Accuracy"] += [train_acc.avg]
        loss_accuracy["Valid Loss"] += [valid_loss.avg]
        loss_accuracy["Valid Accuracy"] += [valid_acc.avg]


### K-Fold Cross Validation

In [9]:
def make_folds(dataframe, n_splits=5):
    """Add an integer ``fold`` column assigning every row to one validation fold.

    Rows are split with a shuffled ``KFold`` (``random_state=42``); row ``r``
    gets fold ``i`` when it belongs to the validation part of split ``i``.

    Args:
        dataframe: frame to annotate; modified in place.
        n_splits: number of folds.

    Returns:
        The same ``dataframe`` object, now carrying the ``fold`` column.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    # KFold yields positional row numbers, so collect them in a positional
    # array instead of using label-based .loc, which mis-assigns rows whenever
    # the index is not the default 0..n-1 range.
    fold_of_row = np.empty(len(dataframe), dtype=int)
    for i, (_, valid_idx) in enumerate(kf.split(X=dataframe)):
        fold_of_row[valid_idx] = i
    dataframe['fold'] = fold_of_row
    return dataframe

In [10]:
def one_fold(fold, options):  
    """Train and validate one cross-validation fold.

    Loads the pre-trained DistilBERT encoder and tokenizer named by
    ``options.model_name``, reads the cleaned training CSV under ``path``,
    splits it with ``make_folds`` and trains a ``CustomModel`` with Adam and
    cross-entropy loss, validating on the rows whose fold equals ``fold``.

    Side effects on ``options``: ``options.step`` is set according to the
    scheduler ("epoch" for ReduceLROnPlateau, "batch" for LinearWarmup) and
    ``options.model_save_name`` is set to ``model_fold_{fold}.pt``.

    Args:
        fold: index of the fold used for validation.
        options: configuration object (model_name, n_folds, max_length,
            num_labels, dropout, learning_rate, scheduler, patience, epochs, ...).

    Returns:
        Dict with per-epoch lists under "Train Loss", "Train Accuracy",
        "Valid Loss" and "Valid Accuracy", so the caller can inspect or
        persist the training history.

    Raises:
        ValueError: if ``options.scheduler`` is not a supported scheduler name.
    """
    print(f"Training Fold: {fold}")

    # Fail fast: an unknown scheduler used to surface only after the model and
    # data were loaded, as an UnboundLocalError on `lr_scheduler`.
    supported_schedulers = ("ReduceLROnPlateau", "LinearWarmup")
    if options.scheduler not in supported_schedulers:
        raise ValueError(
            f"Unsupported scheduler {options.scheduler!r}; expected one of {supported_schedulers}"
        )

    loss_accuracy = {"Train Loss": [], "Train Accuracy": [], "Valid Loss": [], "Valid Accuracy": []}
    
    # Here, we load the pre-trained DistilBERT model from transformers library
    bert_model = transformers.DistilBertModel.from_pretrained(options.model_name)
    # Loading the corresponding tokenizer from HuggingFace by using AutoTokenizer class.
    tokenizer = transformers.AutoTokenizer.from_pretrained(options.model_name, use_fast=True)
    
    dataframe = pd.read_csv(path+'/data/cleaned-train-tweets.csv', sep="|")
    dataframe = make_folds(dataframe, n_splits=options.n_folds)
    train_dataframe = dataframe[dataframe['fold'] != fold].reset_index(drop=True)
    valid_dataframe = dataframe[dataframe['fold'] == fold].reset_index(drop=True)

    train_loader = make_loaders(train_dataframe, tokenizer, "train", options.max_length)
    valid_loader = make_loaders(valid_dataframe, tokenizer, "valid", options.max_length)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = CustomModel(bert_model, options.num_labels, dropout=options.dropout).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=options.learning_rate)
    if options.scheduler == "ReduceLROnPlateau":
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 
                                                                  mode="min", 
                                                                  factor=0.5, 
                                                                  patience=options.patience)

        # when to step the scheduler: after an epoch or after a batch
        options.step = "epoch"
        
    else:  # "LinearWarmup" (validated above)
        num_train_steps = len(train_loader) * options.epochs
        lr_scheduler = get_linear_schedule_with_warmup(optimizer, 
                                                       num_warmup_steps=0, 
                                                       num_training_steps=num_train_steps)
        
        # when to step the scheduler: after an epoch or after a batch
        options.step = "batch"
    
    criterion = nn.CrossEntropyLoss()
    # One checkpoint file per fold.
    options.model_save_name = f"model_fold_{fold}.pt"
    train_eval(options.epochs, model, train_loader, valid_loader,
               criterion, optimizer, device, options, loss_accuracy, lr_scheduler=lr_scheduler)

    # Hand the history back instead of discarding it.
    return loss_accuracy


In [11]:
def train_folds(options):
    """Run ``one_fold`` once for every fold index in ``range(options.n_folds)``."""
    for fold_index in range(options.n_folds):
        one_fold(fold=fold_index, options=options)

## Tuning

In [None]:
class Options:
    # Hyper-parameters for this run (batch size 32).

    # --- model ---
    model_name = 'distilbert-base-uncased'
    num_labels = 2
    dropout = 0.5
    max_length = 140

    # --- optimisation ---
    batch_size = 32
    epochs = 3
    learning_rate = 3e-5
    scheduler = "ReduceLROnPlateau"
    patience = 2

    # --- data loading / cross-validation ---
    num_workers = 2
    n_folds = 3

    # --- checkpointing ---
    model_path = f"{path}/models/"
    # one_fold replaces this with a per-fold file name before training.
    model_save_name = "model.pt"


options = Options()
train_folds(options)


Training Fold: 0


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 385/385 [01:27<00:00,  4.39it/s, accuracy=0.835, loss=0.396, lr=3e-5]
100%|██████████| 193/193 [00:15<00:00, 12.20it/s, accuracy=0.861, loss=0.369]


Saved best model!
Train Loss: 0.39624
Train Accuracy: 0.83484
Valid Loss: 0.36893
Valid Accuracy: 0.86148
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


100%|██████████| 385/385 [01:27<00:00,  4.40it/s, accuracy=0.901, loss=0.266, lr=3e-5]
100%|██████████| 193/193 [00:15<00:00, 12.46it/s, accuracy=0.884, loss=0.299]


Saved best model!
Train Loss: 0.26632
Train Accuracy: 0.90085
Valid Loss: 0.29943
Valid Accuracy: 0.88373
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 3


100%|██████████| 385/385 [01:27<00:00,  4.41it/s, accuracy=0.941, loss=0.163, lr=3e-5]
100%|██████████| 193/193 [00:15<00:00, 12.46it/s, accuracy=0.9, loss=0.299]


Saved best model!
Train Loss: 0.16299
Train Accuracy: 0.94113
Valid Loss: 0.29887
Valid Accuracy: 0.89964
******************************
Training Fold: 1


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 385/385 [01:27<00:00,  4.42it/s, accuracy=0.82, loss=0.414, lr=3e-5]
100%|██████████| 193/193 [00:15<00:00, 12.45it/s, accuracy=0.87, loss=0.335]


Saved best model!
Train Loss: 0.41373
Train Accuracy: 0.82038
Valid Loss: 0.33518
Valid Accuracy: 0.86976
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


100%|██████████| 385/385 [01:27<00:00,  4.42it/s, accuracy=0.896, loss=0.273, lr=3e-5]
100%|██████████| 193/193 [00:15<00:00, 12.42it/s, accuracy=0.878, loss=0.313]


Saved best model!
Train Loss: 0.27293
Train Accuracy: 0.89557
Valid Loss: 0.31261
Valid Accuracy: 0.87804
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 3


100%|██████████| 385/385 [01:27<00:00,  4.41it/s, accuracy=0.944, loss=0.165, lr=3e-5]
100%|██████████| 193/193 [00:15<00:00, 12.23it/s, accuracy=0.893, loss=0.311]


Saved best model!
Train Loss: 0.16510
Train Accuracy: 0.94365
Valid Loss: 0.31143
Valid Accuracy: 0.89282
******************************
Training Fold: 2


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 385/385 [01:27<00:00,  4.41it/s, accuracy=0.828, loss=0.4, lr=3e-5]
100%|██████████| 193/193 [00:15<00:00, 12.09it/s, accuracy=0.86, loss=0.339]


Saved best model!
Train Loss: 0.39955
Train Accuracy: 0.82770
Valid Loss: 0.33870
Valid Accuracy: 0.86016
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


100%|██████████| 385/385 [01:27<00:00,  4.42it/s, accuracy=0.902, loss=0.261, lr=3e-5]
100%|██████████| 193/193 [00:15<00:00, 12.18it/s, accuracy=0.897, loss=0.278]


Saved best model!
Train Loss: 0.26119
Train Accuracy: 0.90159
Valid Loss: 0.27786
Valid Accuracy: 0.89654
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 3


100%|██████████| 385/385 [01:27<00:00,  4.42it/s, accuracy=0.943, loss=0.154, lr=3e-5]
100%|██████████| 193/193 [00:15<00:00, 12.18it/s, accuracy=0.888, loss=0.311]


Train Loss: 0.15444
Train Accuracy: 0.94251
Valid Loss: 0.31078
Valid Accuracy: 0.88793
******************************


In [None]:
class Options:
    # Hyper-parameters for this run (batch size 64).

    # --- model ---
    model_name = 'distilbert-base-uncased'
    num_labels = 2
    dropout = 0.5
    max_length = 140

    # --- optimisation ---
    batch_size = 64
    epochs = 3
    learning_rate = 3e-5
    scheduler = "ReduceLROnPlateau"
    patience = 2

    # --- data loading / cross-validation ---
    num_workers = 2
    n_folds = 3

    # --- checkpointing ---
    model_path = f"{path}/models/"
    # one_fold replaces this with a per-fold file name before training.
    model_save_name = "model.pt"


options = Options()
train_folds(options)


Training Fold: 0


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.825, loss=0.408, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.80it/s, accuracy=0.859, loss=0.347]


Saved best model!
Train Loss: 0.40761
Train Accuracy: 0.82485
Valid Loss: 0.34736
Valid Accuracy: 0.85905
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.893, loss=0.282, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.76it/s, accuracy=0.872, loss=0.318]


Saved best model!
Train Loss: 0.28239
Train Accuracy: 0.89346
Valid Loss: 0.31829
Valid Accuracy: 0.87187
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 3


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.93, loss=0.191, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.78it/s, accuracy=0.888, loss=0.313]


Saved best model!
Train Loss: 0.19137
Train Accuracy: 0.93017
Valid Loss: 0.31318
Valid Accuracy: 0.88828
******************************
Training Fold: 1


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.817, loss=0.418, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.75it/s, accuracy=0.86, loss=0.346]


Saved best model!
Train Loss: 0.41810
Train Accuracy: 0.81697
Valid Loss: 0.34594
Valid Accuracy: 0.85986
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.893, loss=0.283, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.79it/s, accuracy=0.885, loss=0.305]


Saved best model!
Train Loss: 0.28280
Train Accuracy: 0.89306
Valid Loss: 0.30509
Valid Accuracy: 0.88503
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 3


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.935, loss=0.188, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.78it/s, accuracy=0.892, loss=0.315]


Train Loss: 0.18823
Train Accuracy: 0.93479
Valid Loss: 0.31475
Valid Accuracy: 0.89185
******************************
Training Fold: 2


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.821, loss=0.414, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.66it/s, accuracy=0.859, loss=0.35]


Saved best model!
Train Loss: 0.41384
Train Accuracy: 0.82113
Valid Loss: 0.35003
Valid Accuracy: 0.85870
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.891, loss=0.287, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.61it/s, accuracy=0.885, loss=0.309]


Saved best model!
Train Loss: 0.28655
Train Accuracy: 0.89112
Valid Loss: 0.30893
Valid Accuracy: 0.88485
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 3


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.932, loss=0.193, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.69it/s, accuracy=0.899, loss=0.279]


Saved best model!
Train Loss: 0.19258
Train Accuracy: 0.93245
Valid Loss: 0.27942
Valid Accuracy: 0.89881
******************************


In [None]:
class Options:
    """Run configuration handed to ``train_folds`` (plain class attributes).

    Variant for this cell: ``batch_size = 128``. How each field is consumed is
    decided by ``train_folds`` and the model/dataset code defined elsewhere in
    the notebook.
    """

    # Pretrained checkpoint; the load warning in this cell's output names it.
    model_name = 'distilbert-base-uncased'
    # Model / input knobs, consumed by code outside this cell.
    num_labels = 2
    dropout = 0.5
    max_length = 140

    # Batching. The recorded progress bars show 97 training and 49 validation
    # batches per epoch with this batch size.
    batch_size = 128
    num_workers = 2

    # Optimisation. The output prints "Epoch 1".."Epoch 3" for every fold, and
    # the progress bars report lr=3e-5 unchanged in every recorded epoch.
    epochs = 3
    learning_rate = 3e-5
    scheduler = "ReduceLROnPlateau"
    # NOTE(review): presumably the scheduler's patience -- confirm in train_folds.
    patience = 2

    # Cross-validation: the output has sections "Training Fold: 0" to "Training Fold: 2".
    n_folds = 3

    # Save location, built from the notebook-level `path`.
    # NOTE(review): presumably where "Saved best model!" writes -- confirm.
    model_path = path + "/models/"
    model_save_name = "model.pt"


# Keep the instance bound to the global name `options`, as in the original cell.
options = Options()
train_folds(options)


Training Fold: 0


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 97/97 [01:18<00:00,  1.23it/s, accuracy=0.807, loss=0.439, lr=3e-5]
100%|██████████| 49/49 [00:14<00:00,  3.31it/s, accuracy=0.857, loss=0.359]


Saved best model!
Train Loss: 0.43871
Train Accuracy: 0.80666
Valid Loss: 0.35905
Valid Accuracy: 0.85661
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


100%|██████████| 97/97 [01:18<00:00,  1.23it/s, accuracy=0.883, loss=0.307, lr=3e-5]
100%|██████████| 49/49 [00:14<00:00,  3.27it/s, accuracy=0.87, loss=0.329]


Saved best model!
Train Loss: 0.30734
Train Accuracy: 0.88331
Valid Loss: 0.32878
Valid Accuracy: 0.87009
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 3


100%|██████████| 97/97 [01:18<00:00,  1.23it/s, accuracy=0.921, loss=0.223, lr=3e-5]
100%|██████████| 49/49 [00:14<00:00,  3.30it/s, accuracy=0.891, loss=0.307]


Saved best model!
Train Loss: 0.22275
Train Accuracy: 0.92140
Valid Loss: 0.30688
Valid Accuracy: 0.89071
******************************
Training Fold: 1


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 97/97 [01:18<00:00,  1.23it/s, accuracy=0.812, loss=0.432, lr=3e-5]
100%|██████████| 49/49 [00:14<00:00,  3.28it/s, accuracy=0.854, loss=0.363]


Saved best model!
Train Loss: 0.43164
Train Accuracy: 0.81202
Valid Loss: 0.36299
Valid Accuracy: 0.85450
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


100%|██████████| 97/97 [01:19<00:00,  1.23it/s, accuracy=0.878, loss=0.307, lr=3e-5]
100%|██████████| 49/49 [00:15<00:00,  3.26it/s, accuracy=0.872, loss=0.339]


Saved best model!
Train Loss: 0.30695
Train Accuracy: 0.87795
Valid Loss: 0.33933
Valid Accuracy: 0.87204
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 3


100%|██████████| 97/97 [01:18<00:00,  1.23it/s, accuracy=0.92, loss=0.224, lr=3e-5]
100%|██████████| 49/49 [00:14<00:00,  3.28it/s, accuracy=0.883, loss=0.333]


Saved best model!
Train Loss: 0.22383
Train Accuracy: 0.92018
Valid Loss: 0.33326
Valid Accuracy: 0.88308
******************************
Training Fold: 2


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 97/97 [01:18<00:00,  1.23it/s, accuracy=0.806, loss=0.441, lr=3e-5]
100%|██████████| 49/49 [00:14<00:00,  3.35it/s, accuracy=0.857, loss=0.351]


Saved best model!
Train Loss: 0.44064
Train Accuracy: 0.80627
Valid Loss: 0.35106
Valid Accuracy: 0.85724
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


100%|██████████| 97/97 [01:18<00:00,  1.23it/s, accuracy=0.882, loss=0.311, lr=3e-5]
100%|██████████| 49/49 [00:14<00:00,  3.38it/s, accuracy=0.876, loss=0.314]


Saved best model!
Train Loss: 0.31060
Train Accuracy: 0.88154
Valid Loss: 0.31368
Valid Accuracy: 0.87559
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 3


100%|██████████| 97/97 [01:18<00:00,  1.23it/s, accuracy=0.917, loss=0.234, lr=3e-5]
100%|██████████| 49/49 [00:14<00:00,  3.38it/s, accuracy=0.888, loss=0.297]


Saved best model!
Train Loss: 0.23450
Train Accuracy: 0.91702
Valid Loss: 0.29747
Valid Accuracy: 0.88761
******************************


In [None]:
class Options:
    """Run configuration handed to ``train_folds`` (plain class attributes).

    Variant for this cell: ``batch_size = 64`` (the preceding cell used 128);
    all other values match that cell. How each field is consumed is decided by
    ``train_folds`` and the model/dataset code defined elsewhere.
    """

    # Pretrained checkpoint; the load warning in this cell's output names it.
    model_name = 'distilbert-base-uncased'
    # Model / input knobs, consumed by code outside this cell.
    num_labels = 2
    dropout = 0.5
    max_length = 140

    # Batching. The recorded progress bars show 193 training and 97 validation
    # batches per epoch with this batch size.
    batch_size = 64
    num_workers = 2

    # Optimisation. The output prints "Epoch 1".."Epoch 3" for every fold, and
    # the progress bars report lr=3e-5 unchanged in every recorded epoch.
    epochs = 3
    learning_rate = 3e-5
    scheduler = "ReduceLROnPlateau"
    # NOTE(review): presumably the scheduler's patience -- confirm in train_folds.
    patience = 2

    # Cross-validation: the output has sections "Training Fold: 0" to "Training Fold: 2".
    n_folds = 3

    # Save location, built from the notebook-level `path`.
    # NOTE(review): presumably where "Saved best model!" writes -- confirm.
    model_path = path + "/models/"
    model_save_name = "model.pt"


# Keep the instance bound to the global name `options`, as in the original cell.
options = Options()
train_folds(options)


Training Fold: 0


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 193/193 [01:20<00:00,  2.40it/s, accuracy=0.826, loss=0.409, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.73it/s, accuracy=0.851, loss=0.357]


Saved best model!
Train Loss: 0.40899
Train Accuracy: 0.82639
Valid Loss: 0.35676
Valid Accuracy: 0.85060
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.891, loss=0.285, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.79it/s, accuracy=0.867, loss=0.34]


Saved best model!
Train Loss: 0.28516
Train Accuracy: 0.89143
Valid Loss: 0.33968
Valid Accuracy: 0.86716
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 3


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.932, loss=0.189, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.79it/s, accuracy=0.887, loss=0.309]


Saved best model!
Train Loss: 0.18873
Train Accuracy: 0.93179
Valid Loss: 0.30920
Valid Accuracy: 0.88714
******************************
Training Fold: 1


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.819, loss=0.416, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.70it/s, accuracy=0.868, loss=0.34]


Saved best model!
Train Loss: 0.41588
Train Accuracy: 0.81900
Valid Loss: 0.34047
Valid Accuracy: 0.86765
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


100%|██████████| 193/193 [01:20<00:00,  2.40it/s, accuracy=0.895, loss=0.278, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.76it/s, accuracy=0.887, loss=0.309]


Saved best model!
Train Loss: 0.27804
Train Accuracy: 0.89476
Valid Loss: 0.30867
Valid Accuracy: 0.88665
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 3


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.934, loss=0.186, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.79it/s, accuracy=0.904, loss=0.285]


Saved best model!
Train Loss: 0.18642
Train Accuracy: 0.93366
Valid Loss: 0.28461
Valid Accuracy: 0.90419
******************************
Training Fold: 2


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.819, loss=0.413, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.67it/s, accuracy=0.863, loss=0.339]


Saved best model!
Train Loss: 0.41302
Train Accuracy: 0.81926
Valid Loss: 0.33852
Valid Accuracy: 0.86292
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.893, loss=0.281, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.63it/s, accuracy=0.887, loss=0.307]


Saved best model!
Train Loss: 0.28063
Train Accuracy: 0.89290
Valid Loss: 0.30682
Valid Accuracy: 0.88696
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 3


100%|██████████| 193/193 [01:20<00:00,  2.40it/s, accuracy=0.935, loss=0.183, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.67it/s, accuracy=0.901, loss=0.279]


Saved best model!
Train Loss: 0.18279
Train Accuracy: 0.93513
Valid Loss: 0.27861
Valid Accuracy: 0.90141
******************************


In [None]:
class Options:
    """Run configuration handed to ``train_folds`` (plain class attributes).

    Variant for this cell: ``patience = 4`` (the preceding cell used 2); all
    other values match that cell. How each field is consumed is decided by
    ``train_folds`` and the model/dataset code defined elsewhere.
    """

    # Pretrained checkpoint; the load warning in this cell's output names it.
    model_name = 'distilbert-base-uncased'
    # Model / input knobs, consumed by code outside this cell.
    num_labels = 2
    dropout = 0.5
    max_length = 140

    # Batching. The recorded progress bars show 193 training and 97 validation
    # batches per epoch with this batch size.
    batch_size = 64
    num_workers = 2

    # Optimisation. The output prints "Epoch 1".."Epoch 3" for every fold, and
    # the progress bars report lr=3e-5 unchanged in every recorded epoch.
    epochs = 3
    learning_rate = 3e-5
    scheduler = "ReduceLROnPlateau"
    # NOTE(review): presumably the scheduler's patience; with only 3 epochs a
    # larger value may have no visible effect -- confirm in train_folds.
    patience = 4

    # Cross-validation: the output has sections "Training Fold: 0" to "Training Fold: 2".
    n_folds = 3

    # Save location, built from the notebook-level `path`.
    # NOTE(review): presumably where "Saved best model!" writes -- confirm.
    model_path = path + "/models/"
    model_save_name = "model.pt"


# Keep the instance bound to the global name `options`, as in the original cell.
options = Options()
train_folds(options)


Training Fold: 0


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.817, loss=0.413, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.78it/s, accuracy=0.848, loss=0.373]


Saved best model!
Train Loss: 0.41296
Train Accuracy: 0.81746
Valid Loss: 0.37315
Valid Accuracy: 0.84784
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.896, loss=0.28, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.73it/s, accuracy=0.873, loss=0.312]


Saved best model!
Train Loss: 0.27979
Train Accuracy: 0.89557
Valid Loss: 0.31200
Valid Accuracy: 0.87301
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 3


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.933, loss=0.186, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.77it/s, accuracy=0.897, loss=0.296]


Saved best model!
Train Loss: 0.18627
Train Accuracy: 0.93350
Valid Loss: 0.29639
Valid Accuracy: 0.89704
******************************
Training Fold: 1


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 193/193 [01:20<00:00,  2.40it/s, accuracy=0.823, loss=0.407, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.69it/s, accuracy=0.867, loss=0.339]


Saved best model!
Train Loss: 0.40715
Train Accuracy: 0.82306
Valid Loss: 0.33856
Valid Accuracy: 0.86749
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.893, loss=0.282, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.76it/s, accuracy=0.89, loss=0.293]


Saved best model!
Train Loss: 0.28229
Train Accuracy: 0.89306
Valid Loss: 0.29268
Valid Accuracy: 0.88990
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 3


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.935, loss=0.189, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.74it/s, accuracy=0.898, loss=0.276]


Saved best model!
Train Loss: 0.18892
Train Accuracy: 0.93471
Valid Loss: 0.27601
Valid Accuracy: 0.89818
******************************
Training Fold: 2


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.822, loss=0.413, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.62it/s, accuracy=0.861, loss=0.349]


Saved best model!
Train Loss: 0.41310
Train Accuracy: 0.82243
Valid Loss: 0.34871
Valid Accuracy: 0.86097
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


100%|██████████| 193/193 [01:20<00:00,  2.40it/s, accuracy=0.892, loss=0.286, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.68it/s, accuracy=0.879, loss=0.312]


Saved best model!
Train Loss: 0.28591
Train Accuracy: 0.89217
Valid Loss: 0.31213
Valid Accuracy: 0.87900
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 3


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.932, loss=0.193, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.65it/s, accuracy=0.902, loss=0.268]


Saved best model!
Train Loss: 0.19294
Train Accuracy: 0.93204
Valid Loss: 0.26834
Valid Accuracy: 0.90239
******************************


In [None]:
class Options:
    """Run configuration handed to ``train_folds`` (plain class attributes).

    Variant for this cell: ``batch_size = 164`` and ``patience = 8`` (the
    preceding cell used 64 and 4). How each field is consumed is decided by
    ``train_folds`` and the model/dataset code defined elsewhere.
    """

    # Pretrained checkpoint; the load warning in this cell's output names it.
    model_name = 'distilbert-base-uncased'
    # Model / input knobs, consumed by code outside this cell.
    num_labels = 2
    dropout = 0.5
    max_length = 140

    # Batching. The recorded progress bars show 76 training and 38 validation
    # batches per epoch with this batch size.
    # NOTE(review): 164 is an unusual size next to the 64/128 used in the other
    # cells -- confirm it is intentional and not a typo.
    batch_size = 164
    num_workers = 2

    # Optimisation. The output prints "Epoch 1".."Epoch 3" for every fold, and
    # the progress bars report lr=3e-5 unchanged in every recorded epoch.
    epochs = 3
    learning_rate = 3e-5
    scheduler = "ReduceLROnPlateau"
    # NOTE(review): presumably the scheduler's patience; with only 3 epochs a
    # larger value may have no visible effect -- confirm in train_folds.
    patience = 8

    # Cross-validation: the output has sections "Training Fold: 0" to "Training Fold: 2".
    n_folds = 3

    # Save location, built from the notebook-level `path`.
    # NOTE(review): presumably where "Saved best model!" writes -- confirm.
    model_path = path + "/models/"
    model_save_name = "model.pt"


# Keep the instance bound to the global name `options`, as in the original cell.
options = Options()
train_folds(options)


Training Fold: 0


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 76/76 [01:16<00:00,  1.00s/it, accuracy=0.813, loss=0.436, lr=3e-5]
100%|██████████| 38/38 [00:14<00:00,  2.67it/s, accuracy=0.851, loss=0.366]


Saved best model!
Train Loss: 0.43580
Train Accuracy: 0.81267
Valid Loss: 0.36562
Valid Accuracy: 0.85125
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


100%|██████████| 76/76 [01:16<00:00,  1.00s/it, accuracy=0.881, loss=0.314, lr=3e-5]
100%|██████████| 38/38 [00:14<00:00,  2.65it/s, accuracy=0.865, loss=0.34]


Saved best model!
Train Loss: 0.31400
Train Accuracy: 0.88112
Valid Loss: 0.33951
Valid Accuracy: 0.86505
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 3


100%|██████████| 76/76 [01:16<00:00,  1.00s/it, accuracy=0.915, loss=0.238, lr=3e-5]
100%|██████████| 38/38 [00:14<00:00,  2.66it/s, accuracy=0.882, loss=0.319]


Saved best model!
Train Loss: 0.23828
Train Accuracy: 0.91539
Valid Loss: 0.31860
Valid Accuracy: 0.88178
******************************
Training Fold: 1


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 76/76 [01:16<00:00,  1.00s/it, accuracy=0.806, loss=0.443, lr=3e-5]
100%|██████████| 38/38 [00:14<00:00,  2.64it/s, accuracy=0.861, loss=0.351]


Saved best model!
Train Loss: 0.44344
Train Accuracy: 0.80641
Valid Loss: 0.35062
Valid Accuracy: 0.86148
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


100%|██████████| 76/76 [01:16<00:00,  1.00s/it, accuracy=0.876, loss=0.319, lr=3e-5]
100%|██████████| 38/38 [00:14<00:00,  2.64it/s, accuracy=0.879, loss=0.324]


Saved best model!
Train Loss: 0.31884
Train Accuracy: 0.87568
Valid Loss: 0.32432
Valid Accuracy: 0.87934
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 3


100%|██████████| 76/76 [01:16<00:00,  1.00s/it, accuracy=0.909, loss=0.244, lr=3e-5]
100%|██████████| 38/38 [00:14<00:00,  2.66it/s, accuracy=0.877, loss=0.328]


Train Loss: 0.24363
Train Accuracy: 0.90905
Valid Loss: 0.32769
Valid Accuracy: 0.87675
******************************
Training Fold: 2


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 76/76 [01:16<00:00,  1.00s/it, accuracy=0.811, loss=0.438, lr=3e-5]
100%|██████████| 38/38 [00:13<00:00,  2.83it/s, accuracy=0.859, loss=0.352]


Saved best model!
Train Loss: 0.43805
Train Accuracy: 0.81098
Valid Loss: 0.35230
Valid Accuracy: 0.85886
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


100%|██████████| 76/76 [01:16<00:00,  1.00s/it, accuracy=0.879, loss=0.316, lr=3e-5]
100%|██████████| 38/38 [00:13<00:00,  2.82it/s, accuracy=0.879, loss=0.314]


Saved best model!
Train Loss: 0.31595
Train Accuracy: 0.87886
Valid Loss: 0.31393
Valid Accuracy: 0.87884
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 3


100%|██████████| 76/76 [01:16<00:00,  1.00s/it, accuracy=0.913, loss=0.242, lr=3e-5]
100%|██████████| 38/38 [00:13<00:00,  2.79it/s, accuracy=0.89, loss=0.3]


Saved best model!
Train Loss: 0.24200
Train Accuracy: 0.91288
Valid Loss: 0.30004
Valid Accuracy: 0.89037
******************************


In [None]:
class Options:
    """Run configuration handed to ``train_folds`` (plain class attributes).

    Variant for this cell: ``learning_rate = 3e-4`` with ``batch_size = 64``
    and ``patience = 2``. In the recorded output this rate is unstable:
    validation accuracy drops to 0.568 (fold 0) and 0.797 (fold 2) at epoch 3,
    and "Saved best model!" is printed only after epoch 1 in those two folds.
    How each field is consumed is decided by ``train_folds`` and the
    model/dataset code defined elsewhere.
    """

    # Pretrained checkpoint; the load warning in this cell's output names it.
    model_name = 'distilbert-base-uncased'
    # Model / input knobs, consumed by code outside this cell.
    num_labels = 2
    dropout = 0.5
    max_length = 140

    # Batching. The recorded progress bars show 193 training and 97 validation
    # batches per epoch with this batch size.
    batch_size = 64
    num_workers = 2

    # Optimisation. The output prints "Epoch 1".."Epoch 3" for every fold, and
    # the progress bars report lr=0.0003 unchanged in every recorded epoch.
    epochs = 3
    learning_rate = 3e-4
    scheduler = "ReduceLROnPlateau"
    # NOTE(review): presumably the scheduler's patience -- confirm in train_folds.
    patience = 2

    # Cross-validation: the output has sections "Training Fold: 0" to "Training Fold: 2".
    n_folds = 3

    # Save location, built from the notebook-level `path`.
    # NOTE(review): presumably where "Saved best model!" writes; this run uses
    # the same file name as the other experiment cells -- confirm whether it
    # overwrites their checkpoints.
    model_path = path + "/models/"
    model_save_name = "model.pt"


# Keep the instance bound to the global name `options`, as in the original cell.
options = Options()
train_folds(options)


Training Fold: 0


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.821, loss=0.428, lr=0.0003]
100%|██████████| 97/97 [00:14<00:00,  6.79it/s, accuracy=0.841, loss=0.377]


Saved best model!
Train Loss: 0.42777
Train Accuracy: 0.82063
Valid Loss: 0.37685
Valid Accuracy: 0.84053
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.899, loss=0.278, lr=0.0003]
100%|██████████| 97/97 [00:14<00:00,  6.76it/s, accuracy=0.866, loss=0.384]


Train Loss: 0.27810
Train Accuracy: 0.89866
Valid Loss: 0.38411
Valid Accuracy: 0.86635
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 3


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.857, loss=0.311, lr=0.0003]
100%|██████████| 97/97 [00:14<00:00,  6.89it/s, accuracy=0.568, loss=0.685]


Train Loss: 0.31086
Train Accuracy: 0.85708
Valid Loss: 0.68475
Valid Accuracy: 0.56804
******************************
Training Fold: 1


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.808, loss=0.448, lr=0.0003]
100%|██████████| 97/97 [00:14<00:00,  6.75it/s, accuracy=0.842, loss=0.387]


Saved best model!
Train Loss: 0.44846
Train Accuracy: 0.80828
Valid Loss: 0.38718
Valid Accuracy: 0.84167
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.892, loss=0.304, lr=0.0003]
100%|██████████| 97/97 [00:14<00:00,  6.79it/s, accuracy=0.868, loss=0.369]


Saved best model!
Train Loss: 0.30358
Train Accuracy: 0.89151
Valid Loss: 0.36917
Valid Accuracy: 0.86798
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 3


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.899, loss=0.289, lr=0.0003]
100%|██████████| 97/97 [00:14<00:00,  6.85it/s, accuracy=0.86, loss=0.371]


Train Loss: 0.28907
Train Accuracy: 0.89923
Valid Loss: 0.37125
Valid Accuracy: 0.85969
******************************
Training Fold: 2


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.81, loss=0.443, lr=0.0003]
100%|██████████| 97/97 [00:14<00:00,  6.75it/s, accuracy=0.86, loss=0.35]


Saved best model!
Train Loss: 0.44255
Train Accuracy: 0.81049
Valid Loss: 0.35021
Valid Accuracy: 0.85983
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


100%|██████████| 193/193 [01:19<00:00,  2.42it/s, accuracy=0.904, loss=0.265, lr=0.0003]
100%|██████████| 97/97 [00:14<00:00,  6.77it/s, accuracy=0.883, loss=0.351]


Train Loss: 0.26450
Train Accuracy: 0.90435
Valid Loss: 0.35081
Valid Accuracy: 0.88306
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 3


100%|██████████| 193/193 [01:19<00:00,  2.41it/s, accuracy=0.942, loss=0.183, lr=0.0003]
100%|██████████| 97/97 [00:14<00:00,  6.86it/s, accuracy=0.797, loss=0.539]


Train Loss: 0.18266
Train Accuracy: 0.94186
Valid Loss: 0.53891
Valid Accuracy: 0.79682
******************************


In [None]:
class Options:
    """Run configuration handed to ``train_folds`` (plain class attributes).

    Variant for this cell: back to ``learning_rate = 3e-5`` with
    ``batch_size = 64`` and ``patience = 2``, the same values as an earlier
    cell in this notebook. How each field is consumed is decided by
    ``train_folds`` and the model/dataset code defined elsewhere.

    NOTE(review): the recorded metrics differ slightly from that earlier
    identical run (e.g. fold 0 final validation accuracy 0.89136 here vs
    0.88714 there), so runs are not exactly reproducible -- confirm whether
    random seeds are fixed inside ``train_folds``.
    """

    # Pretrained checkpoint; the load warning in this cell's output names it.
    model_name = 'distilbert-base-uncased'
    # Model / input knobs, consumed by code outside this cell.
    num_labels = 2
    dropout = 0.5
    max_length = 140

    # Batching. The recorded progress bars show 193 training and 97 validation
    # batches per epoch with this batch size.
    batch_size = 64
    num_workers = 2

    # Optimisation. The output prints "Epoch 1".."Epoch 3" for every fold, and
    # the progress bars report lr=3e-5 unchanged in every recorded epoch.
    epochs = 3
    learning_rate = 3e-5
    scheduler = "ReduceLROnPlateau"
    # NOTE(review): presumably the scheduler's patience -- confirm in train_folds.
    patience = 2

    # Cross-validation: the output has sections "Training Fold: 0" to "Training Fold: 2".
    n_folds = 3

    # Save location, built from the notebook-level `path`.
    # NOTE(review): presumably where "Saved best model!" writes -- confirm.
    model_path = path + "/models/"
    model_save_name = "model.pt"


# Keep the instance bound to the global name `options`, as in the original cell.
options = Options()
train_folds(options)


Training Fold: 0


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 193/193 [01:20<00:00,  2.40it/s, accuracy=0.828, loss=0.407, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.74it/s, accuracy=0.855, loss=0.354]


Saved best model!
Train Loss: 0.40745
Train Accuracy: 0.82818
Valid Loss: 0.35389
Valid Accuracy: 0.85515
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.894, loss=0.281, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.75it/s, accuracy=0.87, loss=0.33]


Saved best model!
Train Loss: 0.28069
Train Accuracy: 0.89411
Valid Loss: 0.32966
Valid Accuracy: 0.87025
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 3


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.934, loss=0.184, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.79it/s, accuracy=0.891, loss=0.311]


Saved best model!
Train Loss: 0.18392
Train Accuracy: 0.93374
Valid Loss: 0.31124
Valid Accuracy: 0.89136
******************************
Training Fold: 1


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 193/193 [01:20<00:00,  2.40it/s, accuracy=0.814, loss=0.418, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.72it/s, accuracy=0.869, loss=0.336]


Saved best model!
Train Loss: 0.41840
Train Accuracy: 0.81445
Valid Loss: 0.33602
Valid Accuracy: 0.86928
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


100%|██████████| 193/193 [01:20<00:00,  2.40it/s, accuracy=0.893, loss=0.283, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.76it/s, accuracy=0.89, loss=0.302]


Saved best model!
Train Loss: 0.28258
Train Accuracy: 0.89265
Valid Loss: 0.30174
Valid Accuracy: 0.89006
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 3


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.937, loss=0.184, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.77it/s, accuracy=0.9, loss=0.274]


Saved best model!
Train Loss: 0.18427
Train Accuracy: 0.93723
Valid Loss: 0.27359
Valid Accuracy: 0.90045
******************************
Training Fold: 2


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.825, loss=0.411, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.61it/s, accuracy=0.862, loss=0.342]


Saved best model!
Train Loss: 0.41074
Train Accuracy: 0.82462
Valid Loss: 0.34211
Valid Accuracy: 0.86195
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


100%|██████████| 193/193 [01:20<00:00,  2.40it/s, accuracy=0.897, loss=0.279, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.68it/s, accuracy=0.887, loss=0.296]


Saved best model!
Train Loss: 0.27871
Train Accuracy: 0.89713
Valid Loss: 0.29615
Valid Accuracy: 0.88728
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 3


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.932, loss=0.19, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.69it/s, accuracy=0.897, loss=0.293]


Saved best model!
Train Loss: 0.18972
Train Accuracy: 0.93220
Valid Loss: 0.29288
Valid Accuracy: 0.89703
******************************


In [None]:
# Experiment run: learning_rate=3e-6 with dropout=0.5, 3 folds, 3 epochs per fold.
# The neighbouring runs in this notebook use learning_rate=3e-5, so this cell
# looks at the effect of a 10x smaller learning rate.
class Options:
    # Attribute bundle handed to train_folds() (and read by the test helpers).
    # Checkpoint name given to transformers' from_pretrained() for model and tokenizer.
    model_name = 'distilbert-base-uncased'
    batch_size = 64
    # Forwarded to CustomModel together with `dropout` below.
    num_labels = 2
    epochs = 3
    num_workers = 2
    learning_rate = 3e-6
    # NOTE(review): the progress bars below report lr=3e-6 in every epoch, i.e. the
    # scheduler never changes the rate within this 3-epoch run — confirm that is intended.
    scheduler = "ReduceLROnPlateau"
    patience = 2
    dropout = 0.5
    # Checkpoint directory; `path` is defined in the Colab setup cell at the top.
    model_path = path + "/models/"
    # presumably the tokenizer truncation/padding length — TODO confirm in make_loaders
    max_length = 140
    # test_all_models() replaces this with "model_fold_{fold}.pt" when evaluating.
    model_save_name = "model.pt"
    n_folds = 3

# Rebinds the notebook-global `options` and starts the cross-validation training run.
options = Options()
train_folds(options)


Training Fold: 0


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.709, loss=0.561, lr=3e-6]
100%|██████████| 97/97 [00:14<00:00,  6.71it/s, accuracy=0.816, loss=0.429]


Saved best model!
Train Loss: 0.56141
Train Accuracy: 0.70905
Valid Loss: 0.42886
Valid Accuracy: 0.81634
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.836, loss=0.393, lr=3e-6]
100%|██████████| 97/97 [00:14<00:00,  6.76it/s, accuracy=0.839, loss=0.392]


Saved best model!
Train Loss: 0.39309
Train Accuracy: 0.83573
Valid Loss: 0.39205
Valid Accuracy: 0.83875
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 3


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.856, loss=0.359, lr=3e-6]
100%|██████████| 97/97 [00:14<00:00,  6.77it/s, accuracy=0.845, loss=0.378]


Saved best model!
Train Loss: 0.35943
Train Accuracy: 0.85595
Valid Loss: 0.37782
Valid Accuracy: 0.84492
******************************
Training Fold: 1


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 193/193 [01:20<00:00,  2.40it/s, accuracy=0.714, loss=0.571, lr=3e-6]
100%|██████████| 97/97 [00:14<00:00,  6.74it/s, accuracy=0.815, loss=0.425]


Saved best model!
Train Loss: 0.57116
Train Accuracy: 0.71368
Valid Loss: 0.42452
Valid Accuracy: 0.81487
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.828, loss=0.406, lr=3e-6]
100%|██████████| 97/97 [00:14<00:00,  6.79it/s, accuracy=0.837, loss=0.384]


Saved best model!
Train Loss: 0.40560
Train Accuracy: 0.82810
Valid Loss: 0.38365
Valid Accuracy: 0.83696
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 3


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.851, loss=0.365, lr=3e-6]
100%|██████████| 97/97 [00:14<00:00,  6.78it/s, accuracy=0.856, loss=0.358]


Saved best model!
Train Loss: 0.36466
Train Accuracy: 0.85132
Valid Loss: 0.35792
Valid Accuracy: 0.85612
******************************
Training Fold: 2


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.759, loss=0.535, lr=3e-6]
100%|██████████| 97/97 [00:14<00:00,  6.62it/s, accuracy=0.829, loss=0.407]


Saved best model!
Train Loss: 0.53493
Train Accuracy: 0.75893
Valid Loss: 0.40660
Valid Accuracy: 0.82930
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


100%|██████████| 193/193 [01:20<00:00,  2.40it/s, accuracy=0.836, loss=0.397, lr=3e-6]
100%|██████████| 97/97 [00:14<00:00,  6.69it/s, accuracy=0.847, loss=0.374]


Saved best model!
Train Loss: 0.39690
Train Accuracy: 0.83647
Valid Loss: 0.37408
Valid Accuracy: 0.84733
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 3


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.854, loss=0.361, lr=3e-6]
100%|██████████| 97/97 [00:14<00:00,  6.69it/s, accuracy=0.854, loss=0.359]


Saved best model!
Train Loss: 0.36112
Train Accuracy: 0.85442
Valid Loss: 0.35892
Valid Accuracy: 0.85399
******************************


In [None]:
# Experiment run: dropout=0.3 with learning_rate=3e-5, 3 folds, 3 epochs per fold.
# The saved output for this cell stops partway through fold 2, epoch 3, so the
# recorded run is incomplete.
class Options:
    # Attribute bundle handed to train_folds() (and read by the test helpers).
    # Checkpoint name given to transformers' from_pretrained() for model and tokenizer.
    model_name = 'distilbert-base-uncased'
    batch_size = 64
    # Forwarded to CustomModel together with `dropout` below.
    num_labels = 2
    epochs = 3
    num_workers = 2
    learning_rate = 3e-5
    # NOTE(review): the progress bars below report lr=3e-5 in every epoch, i.e. the
    # scheduler never changes the rate within this 3-epoch run — confirm that is intended.
    scheduler = "ReduceLROnPlateau"
    patience = 2
    dropout = 0.3
    # Checkpoint directory; `path` is defined in the Colab setup cell at the top.
    model_path = path + "/models/"
    # presumably the tokenizer truncation/padding length — TODO confirm in make_loaders
    max_length = 140
    # test_all_models() replaces this with "model_fold_{fold}.pt" when evaluating.
    model_save_name = "model.pt"
    n_folds = 3

# Rebinds the notebook-global `options` and starts the cross-validation training run.
options = Options()
train_folds(options)


Training Fold: 0


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 193/193 [01:20<00:00,  2.40it/s, accuracy=0.829, loss=0.407, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.73it/s, accuracy=0.859, loss=0.35]


Saved best model!
Train Loss: 0.40665
Train Accuracy: 0.82875
Valid Loss: 0.35020
Valid Accuracy: 0.85856
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.896, loss=0.28, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.80it/s, accuracy=0.878, loss=0.322]


Saved best model!
Train Loss: 0.27963
Train Accuracy: 0.89631
Valid Loss: 0.32183
Valid Accuracy: 0.87788
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 3


100%|██████████| 193/193 [01:20<00:00,  2.40it/s, accuracy=0.932, loss=0.192, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.76it/s, accuracy=0.888, loss=0.307]


Saved best model!
Train Loss: 0.19230
Train Accuracy: 0.93220
Valid Loss: 0.30682
Valid Accuracy: 0.88795
******************************
Training Fold: 1


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.82, loss=0.414, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.70it/s, accuracy=0.869, loss=0.335]


Saved best model!
Train Loss: 0.41370
Train Accuracy: 0.82006
Valid Loss: 0.33459
Valid Accuracy: 0.86928
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.895, loss=0.277, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.66it/s, accuracy=0.883, loss=0.317]


Saved best model!
Train Loss: 0.27708
Train Accuracy: 0.89484
Valid Loss: 0.31736
Valid Accuracy: 0.88259
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 3


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.936, loss=0.179, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.77it/s, accuracy=0.886, loss=0.34]


Train Loss: 0.17940
Train Accuracy: 0.93593
Valid Loss: 0.34012
Valid Accuracy: 0.88600
******************************
Training Fold: 2


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 193/193 [01:20<00:00,  2.41it/s, accuracy=0.824, loss=0.406, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.63it/s, accuracy=0.867, loss=0.336]


Saved best model!
Train Loss: 0.40642
Train Accuracy: 0.82421
Valid Loss: 0.33584
Valid Accuracy: 0.86731
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


100%|██████████| 193/193 [01:20<00:00,  2.40it/s, accuracy=0.895, loss=0.279, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.70it/s, accuracy=0.889, loss=0.299]


Saved best model!
Train Loss: 0.27859
Train Accuracy: 0.89510
Valid Loss: 0.29923
Valid Accuracy: 0.88939
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 3


 58%|█████▊    | 111/193 [00:46<00:35,  2.34it/s, accuracy=0.932, loss=0.191, lr=3e-5]

In [12]:
# Experiment run: dropout=0.5 with learning_rate=3e-5, 3 folds, 3 epochs per fold.
# This is the first training cell of the session: its output includes the
# one-time download of the DistilBERT weights and tokenizer files.
class Options:
    # Attribute bundle handed to train_folds() (and read by the test helpers).
    # Checkpoint name given to transformers' from_pretrained() for model and tokenizer.
    model_name = 'distilbert-base-uncased'
    batch_size = 64
    # Forwarded to CustomModel together with `dropout` below.
    num_labels = 2
    epochs = 3
    num_workers = 2
    learning_rate = 3e-5
    # NOTE(review): the progress bars below report lr=3e-5 in every epoch, i.e. the
    # scheduler never changes the rate within this 3-epoch run — confirm that is intended.
    scheduler = "ReduceLROnPlateau"
    patience = 2
    dropout = 0.5
    # Checkpoint directory; `path` is defined in the Colab setup cell at the top.
    model_path = path + "/models/"
    # presumably the tokenizer truncation/padding length — TODO confirm in make_loaders
    max_length = 140
    # test_all_models() replaces this with "model_fold_{fold}.pt" when evaluating.
    model_save_name = "model.pt"
    n_folds = 3

# Rebinds the notebook-global `options` and starts the cross-validation training run.
options = Options()
train_folds(options)


Training Fold: 0


Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 193/193 [01:22<00:00,  2.34it/s, accuracy=0.829, loss=0.404, lr=3e-5]
100%|██████████| 97/97 [00:15<00:00,  6.28it/s, accuracy=0.859, loss=0.349]


Saved best model!
Train Loss: 0.40448
Train Accuracy: 0.82907
Valid Loss: 0.34852
Valid Accuracy: 0.85872
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


100%|██████████| 193/193 [01:22<00:00,  2.35it/s, accuracy=0.897, loss=0.275, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.56it/s, accuracy=0.882, loss=0.32]


Saved best model!
Train Loss: 0.27475
Train Accuracy: 0.89712
Valid Loss: 0.32010
Valid Accuracy: 0.88178
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 3


100%|██████████| 193/193 [01:22<00:00,  2.35it/s, accuracy=0.932, loss=0.185, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.53it/s, accuracy=0.893, loss=0.304]


Saved best model!
Train Loss: 0.18475
Train Accuracy: 0.93228
Valid Loss: 0.30369
Valid Accuracy: 0.89282
******************************
Training Fold: 1


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 193/193 [01:22<00:00,  2.35it/s, accuracy=0.824, loss=0.416, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.57it/s, accuracy=0.864, loss=0.341]


Saved best model!
Train Loss: 0.41584
Train Accuracy: 0.82420
Valid Loss: 0.34128
Valid Accuracy: 0.86408
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


100%|██████████| 193/193 [01:22<00:00,  2.34it/s, accuracy=0.889, loss=0.285, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.58it/s, accuracy=0.89, loss=0.299]


Saved best model!
Train Loss: 0.28511
Train Accuracy: 0.88948
Valid Loss: 0.29872
Valid Accuracy: 0.89022
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 3


100%|██████████| 193/193 [01:22<00:00,  2.35it/s, accuracy=0.934, loss=0.186, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.52it/s, accuracy=0.897, loss=0.298]


Saved best model!
Train Loss: 0.18641
Train Accuracy: 0.93447
Valid Loss: 0.29832
Valid Accuracy: 0.89721
******************************
Training Fold: 2


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 193/193 [01:22<00:00,  2.35it/s, accuracy=0.822, loss=0.415, lr=3e-5]
100%|██████████| 97/97 [00:15<00:00,  6.42it/s, accuracy=0.862, loss=0.342]


Saved best model!
Train Loss: 0.41490
Train Accuracy: 0.82194
Valid Loss: 0.34152
Valid Accuracy: 0.86211
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


100%|██████████| 193/193 [01:22<00:00,  2.34it/s, accuracy=0.89, loss=0.289, lr=3e-5]
100%|██████████| 97/97 [00:15<00:00,  6.44it/s, accuracy=0.884, loss=0.304]


Saved best model!
Train Loss: 0.28883
Train Accuracy: 0.88982
Valid Loss: 0.30353
Valid Accuracy: 0.88420
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 3


100%|██████████| 193/193 [01:22<00:00,  2.35it/s, accuracy=0.931, loss=0.192, lr=3e-5]
100%|██████████| 97/97 [00:15<00:00,  6.43it/s, accuracy=0.897, loss=0.287]


Saved best model!
Train Loss: 0.19210
Train Accuracy: 0.93090
Valid Loss: 0.28713
Valid Accuracy: 0.89687
******************************


In [13]:
# Experiment run: dropout=0.7 with learning_rate=3e-5, 3 folds, 3 epochs per fold.
# Only `dropout` differs from the dropout=0.5 run in the preceding cell.
class Options:
    # Attribute bundle handed to train_folds() (and read by the test helpers).
    # Checkpoint name given to transformers' from_pretrained() for model and tokenizer.
    model_name = 'distilbert-base-uncased'
    batch_size = 64
    # Forwarded to CustomModel together with `dropout` below.
    num_labels = 2
    epochs = 3
    num_workers = 2
    learning_rate = 3e-5
    # NOTE(review): the progress bars below report lr=3e-5 in every epoch, i.e. the
    # scheduler never changes the rate within this 3-epoch run — confirm that is intended.
    scheduler = "ReduceLROnPlateau"
    patience = 2
    dropout = 0.7
    # Checkpoint directory; `path` is defined in the Colab setup cell at the top.
    model_path = path + "/models/"
    # presumably the tokenizer truncation/padding length — TODO confirm in make_loaders
    max_length = 140
    # test_all_models() replaces this with "model_fold_{fold}.pt" when evaluating.
    model_save_name = "model.pt"
    n_folds = 3

# Rebinds the notebook-global `options` and starts the cross-validation training run.
options = Options()
train_folds(options)


Training Fold: 0


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 193/193 [01:22<00:00,  2.35it/s, accuracy=0.82, loss=0.424, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.49it/s, accuracy=0.855, loss=0.36]


Saved best model!
Train Loss: 0.42373
Train Accuracy: 0.81989
Valid Loss: 0.35962
Valid Accuracy: 0.85466
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


100%|██████████| 193/193 [01:22<00:00,  2.35it/s, accuracy=0.893, loss=0.289, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.57it/s, accuracy=0.874, loss=0.326]


Saved best model!
Train Loss: 0.28902
Train Accuracy: 0.89265
Valid Loss: 0.32593
Valid Accuracy: 0.87399
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 3


100%|██████████| 193/193 [01:22<00:00,  2.35it/s, accuracy=0.925, loss=0.204, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.52it/s, accuracy=0.886, loss=0.309]


Saved best model!
Train Loss: 0.20383
Train Accuracy: 0.92481
Valid Loss: 0.30859
Valid Accuracy: 0.88600
******************************
Training Fold: 1


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 193/193 [01:22<00:00,  2.35it/s, accuracy=0.816, loss=0.42, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.51it/s, accuracy=0.863, loss=0.338]


Saved best model!
Train Loss: 0.42016
Train Accuracy: 0.81640
Valid Loss: 0.33790
Valid Accuracy: 0.86310
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


100%|██████████| 193/193 [01:22<00:00,  2.35it/s, accuracy=0.889, loss=0.289, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.56it/s, accuracy=0.886, loss=0.309]


Saved best model!
Train Loss: 0.28857
Train Accuracy: 0.88867
Valid Loss: 0.30922
Valid Accuracy: 0.88600
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 3


100%|██████████| 193/193 [01:22<00:00,  2.35it/s, accuracy=0.929, loss=0.198, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.48it/s, accuracy=0.893, loss=0.314]


Train Loss: 0.19808
Train Accuracy: 0.92895
Valid Loss: 0.31361
Valid Accuracy: 0.89282
******************************
Training Fold: 2


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 193/193 [01:22<00:00,  2.35it/s, accuracy=0.822, loss=0.42, lr=3e-5]
100%|██████████| 97/97 [00:15<00:00,  6.42it/s, accuracy=0.867, loss=0.342]


Saved best model!
Train Loss: 0.42011
Train Accuracy: 0.82243
Valid Loss: 0.34216
Valid Accuracy: 0.86682
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


100%|██████████| 193/193 [01:22<00:00,  2.34it/s, accuracy=0.891, loss=0.287, lr=3e-5]
100%|██████████| 97/97 [00:15<00:00,  6.36it/s, accuracy=0.882, loss=0.304]


Saved best model!
Train Loss: 0.28724
Train Accuracy: 0.89063
Valid Loss: 0.30376
Valid Accuracy: 0.88176
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 3


100%|██████████| 193/193 [01:22<00:00,  2.35it/s, accuracy=0.932, loss=0.195, lr=3e-5]
100%|██████████| 97/97 [00:15<00:00,  6.42it/s, accuracy=0.895, loss=0.287]


Saved best model!
Train Loss: 0.19470
Train Accuracy: 0.93196
Valid Loss: 0.28740
Valid Accuracy: 0.89524
******************************


In [14]:
# Experiment run: dropout=0.5 with learning_rate=3e-5, 3 folds, 3 epochs per fold.
# NOTE(review): these values are identical to the In [12] cell, yet the logged
# metrics differ slightly between the two runs, so training is presumably not
# seeded — confirm before comparing runs by small margins.
class Options:
    # Attribute bundle handed to train_folds() (and read by the test helpers).
    # Checkpoint name given to transformers' from_pretrained() for model and tokenizer.
    model_name = 'distilbert-base-uncased'
    batch_size = 64
    # Forwarded to CustomModel together with `dropout` below.
    num_labels = 2
    epochs = 3
    num_workers = 2
    learning_rate = 3e-5
    # NOTE(review): the progress bars below report lr=3e-5 in every epoch, i.e. the
    # scheduler never changes the rate within this 3-epoch run — confirm that is intended.
    scheduler = "ReduceLROnPlateau"
    patience = 2
    dropout = 0.5
    # Checkpoint directory; `path` is defined in the Colab setup cell at the top.
    model_path = path + "/models/"
    # presumably the tokenizer truncation/padding length — TODO confirm in make_loaders
    max_length = 140
    # test_all_models() replaces this with "model_fold_{fold}.pt" when evaluating.
    model_save_name = "model.pt"
    n_folds = 3

# Rebinds the notebook-global `options` and starts the cross-validation training run.
options = Options()
train_folds(options)


Training Fold: 0


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 193/193 [01:22<00:00,  2.35it/s, accuracy=0.819, loss=0.416, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.59it/s, accuracy=0.855, loss=0.349]


Saved best model!
Train Loss: 0.41570
Train Accuracy: 0.81876
Valid Loss: 0.34924
Valid Accuracy: 0.85466
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


100%|██████████| 193/193 [01:22<00:00,  2.35it/s, accuracy=0.892, loss=0.286, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.55it/s, accuracy=0.876, loss=0.316]


Saved best model!
Train Loss: 0.28595
Train Accuracy: 0.89249
Valid Loss: 0.31641
Valid Accuracy: 0.87577
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 3


100%|██████████| 193/193 [01:22<00:00,  2.35it/s, accuracy=0.932, loss=0.192, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.56it/s, accuracy=0.897, loss=0.306]


Saved best model!
Train Loss: 0.19208
Train Accuracy: 0.93220
Valid Loss: 0.30561
Valid Accuracy: 0.89737
******************************
Training Fold: 1


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 193/193 [01:22<00:00,  2.35it/s, accuracy=0.819, loss=0.418, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.60it/s, accuracy=0.863, loss=0.347]


Saved best model!
Train Loss: 0.41766
Train Accuracy: 0.81908
Valid Loss: 0.34712
Valid Accuracy: 0.86262
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


100%|██████████| 193/193 [01:22<00:00,  2.35it/s, accuracy=0.889, loss=0.289, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.50it/s, accuracy=0.887, loss=0.308]


Saved best model!
Train Loss: 0.28851
Train Accuracy: 0.88883
Valid Loss: 0.30789
Valid Accuracy: 0.88698
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 3


100%|██████████| 193/193 [01:22<00:00,  2.35it/s, accuracy=0.93, loss=0.192, lr=3e-5]
100%|██████████| 97/97 [00:14<00:00,  6.58it/s, accuracy=0.885, loss=0.338]


Train Loss: 0.19165
Train Accuracy: 0.93025
Valid Loss: 0.33782
Valid Accuracy: 0.88454
******************************
Training Fold: 2


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 193/193 [01:22<00:00,  2.35it/s, accuracy=0.821, loss=0.413, lr=3e-5]
100%|██████████| 97/97 [00:15<00:00,  6.45it/s, accuracy=0.868, loss=0.344]


Saved best model!
Train Loss: 0.41259
Train Accuracy: 0.82137
Valid Loss: 0.34394
Valid Accuracy: 0.86844
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


100%|██████████| 193/193 [01:22<00:00,  2.35it/s, accuracy=0.892, loss=0.282, lr=3e-5]
100%|██████████| 97/97 [00:15<00:00,  6.39it/s, accuracy=0.889, loss=0.296]


Saved best model!
Train Loss: 0.28152
Train Accuracy: 0.89169
Valid Loss: 0.29590
Valid Accuracy: 0.88939
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 3


100%|██████████| 193/193 [01:22<00:00,  2.34it/s, accuracy=0.935, loss=0.186, lr=3e-5]
100%|██████████| 97/97 [00:15<00:00,  6.44it/s, accuracy=0.901, loss=0.303]


Train Loss: 0.18581
Train Accuracy: 0.93456
Valid Loss: 0.30261
Valid Accuracy: 0.90109
******************************


In [None]:
# Experiment run: 5 folds instead of 3 (dropout=0.5, learning_rate=3e-5, 3 epochs).
# The saved output for this cell stops partway through fold 0, epoch 2, so the
# recorded run is incomplete.
class Options:
    # Attribute bundle handed to train_folds() (and read by the test helpers).
    # Checkpoint name given to transformers' from_pretrained() for model and tokenizer.
    model_name = 'distilbert-base-uncased'
    batch_size = 64
    # Forwarded to CustomModel together with `dropout` below.
    num_labels = 2
    epochs = 3
    num_workers = 2
    learning_rate = 3e-5
    scheduler = "ReduceLROnPlateau"
    patience = 2
    dropout = 0.5
    # Checkpoint directory; `path` is defined in the Colab setup cell at the top.
    model_path = path + "/models/"
    # presumably the tokenizer truncation/padding length — TODO confirm in make_loaders
    max_length = 140
    # test_all_models() replaces this with "model_fold_{fold}.pt" when evaluating.
    model_save_name = "model.pt"
    # With 5 folds the log shows 231 training / 58 validation batches per epoch
    # (versus 193 / 97 in the 3-fold runs).
    n_folds = 5

# Rebinds the notebook-global `options` and starts the cross-validation training run.
options = Options()
train_folds(options)


Training Fold: 0


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1


100%|██████████| 231/231 [01:38<00:00,  2.35it/s, accuracy=0.826, loss=0.403, lr=3e-5]
100%|██████████| 58/58 [00:08<00:00,  6.48it/s, accuracy=0.868, loss=0.339]


Saved best model!
Train Loss: 0.40256
Train Accuracy: 0.82589
Valid Loss: 0.33863
Valid Accuracy: 0.86766
******************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2


 66%|██████▌   | 153/231 [01:05<00:33,  2.36it/s, accuracy=0.897, loss=0.275, lr=3e-5]

In [None]:
# Experiment run: 7 folds (dropout=0.5, learning_rate=3e-5, 3 epochs per fold).
# No output is saved for this cell, so there is no record of it having been run.
class Options:
    # Attribute bundle handed to train_folds() (and read by the test helpers).
    # Checkpoint name given to transformers' from_pretrained() for model and tokenizer.
    model_name = 'distilbert-base-uncased'
    batch_size = 64
    # Forwarded to CustomModel together with `dropout` below.
    num_labels = 2
    epochs = 3
    num_workers = 2
    learning_rate = 3e-5
    scheduler = "ReduceLROnPlateau"
    patience = 2
    dropout = 0.5
    # Checkpoint directory; `path` is defined in the Colab setup cell at the top.
    model_path = path + "/models/"
    # presumably the tokenizer truncation/padding length — TODO confirm in make_loaders
    max_length = 140
    # test_all_models() replaces this with "model_fold_{fold}.pt" when evaluating.
    model_save_name = "model.pt"
    n_folds = 7

# Rebinds the notebook-global `options` and starts the cross-validation training run.
options = Options()
train_folds(options)


## Run with the best setting

In [None]:
# Final configuration chosen from the experiments above: dropout=0.5,
# learning_rate=3e-5, 5 folds, and 5 epochs per fold (the exploratory runs used 3).
# The `options` object created here is the one the evaluation cells below reuse.
class Options:
    # Attribute bundle handed to train_folds() (and read by the test helpers).
    # Checkpoint name given to transformers' from_pretrained() for model and tokenizer.
    model_name = 'distilbert-base-uncased'
    batch_size = 64
    # Forwarded to CustomModel together with `dropout` below.
    num_labels = 2
    epochs = 5
    num_workers = 2
    learning_rate = 3e-5
    scheduler = "ReduceLROnPlateau"
    patience = 2
    dropout = 0.5
    # Checkpoint directory; `path` is defined in the Colab setup cell at the top.
    model_path = path + "/models/"
    # presumably the tokenizer truncation/padding length — TODO confirm in make_loaders
    max_length = 140
    # test_all_models() replaces this with "model_fold_{fold}.pt" when evaluating.
    model_save_name = "model.pt"
    # test_all_models() loads this many per-fold checkpoints.
    n_folds = 5

# Rebinds the notebook-global `options` and starts the cross-validation training run.
options = Options()
train_folds(options)

In [None]:
import matplotlib.pyplot as plt

# Per-epoch accuracies pasted in by hand from an earlier run.
# NOTE(review): presumably validation accuracy from a 20-epoch training run —
# the run that produced these numbers is not part of this notebook; confirm the source.
accuracies = [0.8657645459910691, 0.8960757774493691, 0.9096075771464708, 0.9098782130765334, 0.9234100128058974, 0.9142083890221923, 0.9174560208927149, 0.9139377529953426, 0.9209742888546119, 0.9179972929141518, 0.9220568328006987, 0.9198917448762628, 0.9169147489358027, 0.9182679289087391, 0.9161028409520409, 0.9155615689628663, 0.9179972929141518, 0.9188092008979136, 0.9147496609791045, 0.9150202969736917]

# Derive the epoch count from the data so the x and y series can never get out of
# sync (a hand-typed count that disagrees with the list makes plot() raise).
epochs = len(accuracies)

fig, ax = plt.subplots()
ax.plot(range(1, epochs + 1), accuracies)
ax.set_xlabel("Epoch")
ax.set_ylabel("Accuracy")
ax.set_title("Accuracy per epoch")
plt.show()

In [None]:
!nvidia-smi

In [None]:
def test_one_model(options):
    """Run one saved checkpoint over the sample-tweet CSV and return its outputs.

    Reads ``tweet_samples_100.csv`` from the project data directory, rebuilds
    ``CustomModel`` on top of a pretrained DistilBERT backbone, loads the
    weights stored at ``{options.model_path}/{options.model_save_name}`` and
    runs the model in eval mode with gradients disabled.

    Args:
        options: settings object; this function reads ``model_name``,
            ``num_labels``, ``dropout``, ``model_path`` and ``model_save_name``.

    Returns:
        The model outputs of all batches concatenated along dim 0, in loader
        order, or ``None`` if the loader yields no batches.
    """
    test_dataframe = pd.read_csv(path+'/data/tweet_samples_100.csv', sep=",")

    bert_model = transformers.DistilBertModel.from_pretrained(options.model_name)
    tokenizer = transformers.AutoTokenizer.from_pretrained(options.model_name, use_fast=True)

    test_loader = make_loaders(test_dataframe, tokenizer, mode="test")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = CustomModel(bert_model,
                        options.num_labels,
                        dropout=options.dropout).to(device)
    # map_location lets a checkpoint saved on GPU be loaded on a CPU-only runtime.
    model.load_state_dict(torch.load(f"{options.model_path}/{options.model_save_name}",
                                     map_location=device))
    model.eval()

    # Collect per-batch outputs and concatenate once at the end; calling
    # torch.cat on the running result inside the loop re-copies every earlier
    # prediction on each iteration.
    batch_preds = []
    with torch.no_grad():
        for batch in tqdm(test_loader):
            batch = {k: v.to(device) for k, v in batch.items()}
            batch_preds.append(model(batch))

    # Keep the historical contract: an empty loader yields None.
    if not batch_preds:
        return None
    return torch.cat(batch_preds, dim=0)

In [None]:
def test_all_models(options):
    n_folds = options.n_folds
    all_model_preds = []
    for fold in range(n_folds):
        options.model_save_name = f"model_fold_{fold}.pt"
        all_preds = test_one_model(options)
        all_model_preds.append(all_preds)
    
    all_model_preds = torch.stack(all_model_preds, dim=0)
    print(all_model_preds.shape)
    # I will return the mean of the final predictions of all the models
    # You could do other things like 'voting' between the five models
    return all_model_preds.mean(0)

In [None]:
# Load the sample tweets and pull out their labels as a NumPy array.
test_dataframe = pd.read_csv(path+'/data/tweet_samples_100.csv', sep=",")
true_labels = np.array(test_dataframe["target"].to_list())

# Fold-averaged model outputs -> predicted class index per tweet.
all_preds = test_all_models(options)
predictions = torch.argmax(all_preds, dim=1).cpu().numpy()

In [None]:
# Number of sample tweets whose predicted class equals the label.
np.sum(true_labels == predictions)

In [None]:
# Row positions where the predicted class disagrees with the label.
mismatched_indices = [
    row
    for row in range(len(predictions))
    if predictions[row] != true_labels[row]
]
mismatched_indices