# Hyperparameter Fine-Tuning

In [1]:
!pip install transformers==4.17 # this will also install tokenizers 
!pip install datasets
!pip install ray==1.10.0

Collecting transformers==4.17
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 7.1 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 41.6 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 44.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 42.8 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.3 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml


In [2]:
import torch
import torch.nn as nn 
import torch.optim as optim 
import numpy as np 
import matplotlib.pyplot as plt
import nltk
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score, roc_curve, auc
from nltk.tokenize import sent_tokenize
import pandas as pd
import transformers
from transformers import AutoConfig, AutoModel, AutoTokenizer, RobertaForSequenceClassification,RobertaModel, Trainer, TrainingArguments
from transformers import AdamW, get_linear_schedule_with_warmup
import datasets
from ray.tune.schedulers import PopulationBasedTraining,ASHAScheduler
from ray import tune
from ray.tune import uniform, CLIReporter
from random import randint
from sklearn.metrics import accuracy_score
# from ray.tune.stopper import TrialPlateauStopper
import os
import ray
print(ray.__version__)
if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

1.10.0
There are 1 GPU(s) available.
Device name: Tesla P100-PCIE-16GB


Load Data

In [3]:
data = datasets.load_dataset("SetFit/sst2")

Using custom data configuration SetFit--sst2-c66a8eb4897c1f8f


Downloading and preparing dataset json/SetFit--sst2 to /root/.cache/huggingface/datasets/json/SetFit--sst2-c66a8eb4897c1f8f/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/281k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/136k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/SetFit--sst2-c66a8eb4897c1f8f/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
### number of labels
num_labels = len(np.unique(data['train']['label']))
print (num_labels)

2


Hyperparameters for model

In [6]:
# Hyper-parameters
model_name = "roberta-base" 
max_length = 50
batch_size = 32
n_epochs = 10
lr = 3e-5
warmup = 0.06
beta_1 = 0.9
beta_2 = 0.98
adam_ep = 1e-6
weight_decay = 0.1
optimizer = AdamW


Tokenizer

In [7]:
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir = "."
)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Dataset class

In [13]:
class SST2Dataset():
## Dataset in pandas dataframe format##
    def __init__(self, tokenizer, df, max_length):
        super(SST2Dataset, self).__init__()

        self.tokenizer = tokenizer
        self.max_seq_len = max_length
        self.input_ids, self.attention_mask,self.token_type_ids,self.label = self.get_input(df)

    def __len__(self):
        return len(self.label)
    
    def trunate_and_pad(self, tokens_seq):
        
        # Concat '[CLS]' at the beginning
        tokens_seq = ['[CLS]'] + tokens_seq     
        # Truncate sequences of which the lengths exceed the max_seq_len
        if len(tokens_seq) > self.max_seq_len:
            tokens_seq = tokens_seq[0 : self.max_seq_len]           
        # Generate padding
        padding = [0] * (self.max_seq_len - len(tokens_seq))       
        # Convert tokens_seq to token_ids
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens_seq)
        input_ids += padding   
        # Create attention_mask
        attention_mask = [1] * len(tokens_seq) + padding     
        # Create token_type_ids
        token_type_ids = [0] * (self.max_seq_len)
        
        assert len(input_ids) == self.max_seq_len
        assert len(attention_mask) == self.max_seq_len
        assert len(token_type_ids) == self.max_seq_len
        
        return input_ids, attention_mask, token_type_ids

    def get_input(self,df):
        tweet = df['text'].values
        label = df['label'].values
        tokens_seq = list(map(self.tokenizer.tokenize,tweet))
        result = list(map(self.trunate_and_pad, tokens_seq))
        input_ids = [i[0] for i in result]
        attention_mask = [i[1] for i in result]
        token_type_ids = [i[2] for i in result]

        return (
               torch.Tensor(input_ids).type(torch.long), 
               torch.Tensor(attention_mask).type(torch.long),
               torch.Tensor(token_type_ids).type(torch.long), 
               torch.Tensor(label).type(torch.long)
               )

    def __getitem__(self, item):
        return self.input_ids[item], self.attention_mask[item],self.token_type_ids[item],self.label[item]


Function to load data for individual hyper-parameter tuning

In [8]:
def load_data(max_length,batch_size):
    if load_data  == None:
        datasets.load_dataset("SetFit/sst2")
    data_train = datasets.Dataset.to_pandas(data["train"])
    data_val = datasets.Dataset.to_pandas(data["validation"])
    data_test = datasets.Dataset.to_pandas(data["test"])
    ## Emotion dataset
    train_dataset = SST2Dataset(tokenizer, data_train, max_length)
    val_dataset = SST2Dataset(tokenizer, data_val, max_length)
    test_dataset = SST2Dataset(tokenizer, data_test, max_length)
    ## dataloader
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size = batch_size, shuffle = True) # shuffle the training set
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size = batch_size, shuffle = False)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size = batch_size, shuffle = False)
    return train_loader,val_loader,test_loader,len(data["train"]["text"])

Train and Eval function

In [9]:
def model_init(model_name,optimizer,data_len):
  model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels = 2)
  param_optimizer = list(model.named_parameters())
  no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
  optimizer_grouped_parameters = [
          {
                  'params':[p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                  'weight_decay':weight_decay
          },
          {
                  'params':[p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                  'weight_decay':0.0
          }
  ]
  optimizer = optimizer(optimizer_grouped_parameters, lr=lr, betas = (beta_1,beta_2),eps = adam_ep)
  total_steps = int(n_epochs * (data_len/ batch_size))
  warmup_steps = int(warmup * total_steps)
  scheduler = get_linear_schedule_with_warmup(
      optimizer,
      num_warmup_steps = warmup_steps, 
      num_training_steps = total_steps)
  
  return model,optimizer,scheduler


In [10]:
def set_seed(seed_value = 0):
    """Set seed for reproducibility.
    """
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    
def train(train_loader, model, optimizer, scheduler):
    model.train()
    train_losses = []
    correct_preds = 0
    tqdm_loader = tqdm(train_loader)
    for batch_index, (batch_seqs, batch_seq_masks, batch_seq_segments, batch_labels) in enumerate (tqdm_loader):
        model.zero_grad()
        

        input_ids = batch_seqs
        attention_mask = batch_seq_masks
        segments = batch_seq_segments
        labels = batch_labels

        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        segments = segments.to(device)
        labels = labels.to(device)

        loss,logits = model(input_ids, attention_mask,segments,labels = labels)[:2] # forward pass (batch_size, 1)
        probs = nn.functional.softmax(logits,dim = -1)
        _,out_classes = probs.max(dim =1)
        correct = (out_classes == labels).sum()
        correct_preds += correct.item()
        train_losses.append(loss.item())

        loss.backward() # backward pass
        ## clip grad
        optimizer.step() # weights update
        if scheduler is not None:
          scheduler.step()
        optimizer.zero_grad()
        
    train_loss = np.mean(train_losses)
    epoch_accuracy = correct_preds /len(train_loader.dataset)

    return train_loss,epoch_accuracy

In [11]:
def eval(loader, model):
    model.eval()
    val_losses = []
    val_probs = []
    acc_correct = 0
    iter = 0
    log_interval = 100
    sigmoid = nn.Sigmoid()

    val_labels = []
    # threshold = torch.tensor([0.5]).to(device)
    # loader = tqdm(loader)
    for (batch_seqs, batch_seq_masks, batch_seq_segments, batch_labels) in loader:
        
        input_ids = batch_seqs
        attention_mask = batch_seq_masks
        segments = batch_seq_segments
        labels = batch_labels

        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        segments = segments.to(device)
        # print (attention_mask.shape,labels.shape)

        with torch.no_grad():
            valid_loss,valid_logits = model(input_ids, attention_mask,segments,labels = labels)[:2]
            valid_probs = nn.functional.softmax(valid_logits,dim = -1)
            _,out_classes = valid_probs.max(dim =1)
            correct = (out_classes == labels).sum()
            acc_correct += correct.item()
            val_probs.extend(valid_probs[:,1].cpu().numpy())
            val_labels.extend(labels.cpu().numpy())
            val_losses.append(valid_loss.item())
    val_loss = np.mean(val_losses)
    val_probs = np.array(val_probs)
    val_labels = np.array(val_labels)
    valid_accuracy = acc_correct/(len(loader.dataset))

    
    fpr, tpr, threshold = roc_curve(val_labels, val_probs)
    val_auc = 100 * auc(fpr, tpr)

    return val_loss, val_auc, val_probs, val_labels,valid_accuracy

**Fine-Tune individual Hyperparameters**

Batch size and Learning rate

Using larger batch sizes allow higher learning rate to be used, similarly, smaller batch size with lower learning rate

In [None]:
batch_size_set = [8,16,64]
lr_set = [1e-5,2e-5,5e-5]

In [None]:
### Seed all test to seed 0
seed_val = 0
set_seed(seed_value = seed_val)
for i in range(len(batch_size_set)):
  batch_size = batch_size_set[i]
  lr = lr_set[i]
  train_loader,val_loader,test_loader,data_len = load_data(max_length,batch_size)
  model,optimizer,scheduler = model_init(model_name,optimizer,data_len)
  model.to(device)
  val_aucs = []
  val_accs = []
  best_score = 0.
  patience_counter = 0
  val_loss, val_auc, _, _, val_acc= eval(val_loader, model)
  val_aucs.append(val_auc)
  val_accs.append(val_acc)
  print("\nEpoch 0 (before training) val loss: {:.4f}, AUC: {:.2f}, ACC: {:.4f}".format(val_loss, val_auc,val_acc*100))
  print ('--'*89)
  print ('Fine-Tuning of batch size : {}, lr : {}'.format(batch_size,lr))
  print ('--'*89)
  for epoch in range(1, n_epochs + 1):
      
      print("\nNew epoch, epoch {} / {}".format(epoch, n_epochs))

      # Training round
      train_loss,train_accuracy = train(train_loader, model, optimizer, scheduler)

      # Validation round
      val_loss, val_auc, _, _,val_acc= eval(val_loader, model)
      val_aucs.append(val_auc)
      val_accs.append(val_acc)

      print("Training: loss: {:.4f}, accuracy {:.4f}".format(train_loss,train_accuracy*100))
    
      print ("Validation: loss: {:.4f}, AUC: {:.2f}, ACC : {:.4f}, (best ACC: {:.4f} at epoch {})"
      .format(val_loss, val_auc,val_acc*100,np.max(np.array(val_accs)),np.argmax(np.array(val_accs*100))))
      # scheduler.step(val_acc)


      print ('lr : {}'.format(scheduler.optimizer.param_groups[0]['lr']))
      if val_auc == np.max(np.array(val_aucs)):
          print("!! New best val AUC !!")
      if val_acc == np.max(np.array(val_acc)):
          print("!! New best val ACC !!")
  _, test_auc, test_probs, test_labels,test_acc = eval(test_loader, model) # val_loss, val_auc, val_probs, val_labels,val_acc
  print("Test AUC of batch size {}, lr {}: {:.2f}".format(batch_size,lr,test_auc))
  print ('Test ACC of batch size {}, lr {}: {:.2f}'.format(batch_size,lr,test_acc*100))



Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi


Epoch 0 (before training) val loss: 0.6931, AUC: 55.61, ACC: 50.9174
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Fine-Tuning of batch size : 8, lr : 1e-05
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

New epoch, epoch 1 / 10


  0%|          | 0/865 [00:00<?, ?it/s]

Training: loss: 0.4113, accuracy 77.8468
Validation: loss: 0.2406, AUC: 97.16, ACC : 91.9725, (best ACC: 0.9197 at epoch 1)
lr : 9.574468085106385e-06
!! New best val AUC !!
!! New best val ACC !!

New epoch, epoch 2 / 10


  0%|          | 0/865 [00:00<?, ?it/s]

Training: loss: 0.1889, accuracy 93.0202
Validation: loss: 0.2034, AUC: 97.75, ACC : 92.7752, (best ACC: 0.9278 at epoch 2)
lr : 8.510638297872341e-06
!! New best val AUC !!
!! New best val ACC !!

New epoch, epoch 3 / 10


  0%|          | 0/865 [00:00<?, ?it/s]

Training: loss: 0.1161, accuracy 95.7659
Validation: loss: 0.2884, AUC: 97.62, ACC : 92.2018, (best ACC: 0.9278 at epoch 2)
lr : 7.446808510638298e-06
!! New best val ACC !!

New epoch, epoch 4 / 10


  0%|          | 0/865 [00:00<?, ?it/s]

Training: loss: 0.0655, accuracy 97.8902
Validation: loss: 0.3030, AUC: 97.74, ACC : 92.0872, (best ACC: 0.9278 at epoch 2)
lr : 6.382978723404256e-06
!! New best val ACC !!

New epoch, epoch 5 / 10


  0%|          | 0/865 [00:00<?, ?it/s]

Training: loss: 0.0453, accuracy 98.4971
Validation: loss: 0.3113, AUC: 97.54, ACC : 92.7752, (best ACC: 0.9278 at epoch 2)
lr : 5.319148936170213e-06
!! New best val ACC !!

New epoch, epoch 6 / 10


  0%|          | 0/865 [00:00<?, ?it/s]

Training: loss: 0.0251, accuracy 99.1329
Validation: loss: 0.4104, AUC: 97.60, ACC : 92.4312, (best ACC: 0.9278 at epoch 2)
lr : 4.255319148936171e-06
!! New best val ACC !!

New epoch, epoch 7 / 10


  0%|          | 0/865 [00:00<?, ?it/s]

Training: loss: 0.0190, accuracy 99.4220
Validation: loss: 0.4238, AUC: 97.46, ACC : 92.2018, (best ACC: 0.9278 at epoch 2)
lr : 3.191489361702128e-06
!! New best val ACC !!

New epoch, epoch 8 / 10


  0%|          | 0/865 [00:00<?, ?it/s]

Training: loss: 0.0155, accuracy 99.6098
Validation: loss: 0.4299, AUC: 97.36, ACC : 91.8578, (best ACC: 0.9278 at epoch 2)
lr : 2.1276595744680853e-06
!! New best val ACC !!

New epoch, epoch 9 / 10


  0%|          | 0/865 [00:00<?, ?it/s]

Training: loss: 0.0088, accuracy 99.7254
Validation: loss: 0.4652, AUC: 97.52, ACC : 91.9725, (best ACC: 0.9278 at epoch 2)
lr : 1.0638297872340427e-06
!! New best val ACC !!

New epoch, epoch 10 / 10


  0%|          | 0/865 [00:00<?, ?it/s]

Training: loss: 0.0075, accuracy 99.7977
Validation: loss: 0.4521, AUC: 97.60, ACC : 92.4312, (best ACC: 0.9278 at epoch 2)
lr : 0.0
!! New best val ACC !!
Test AUC of batch size 8, lr 1e-05: 98.81
Test ACC of batch size 8, lr 1e-05: 94.95


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi


Epoch 0 (before training) val loss: 0.6953, AUC: 47.54, ACC: 50.9174
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Fine-Tuning of batch size : 16, lr : 2e-05
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

New epoch, epoch 1 / 10


  0%|          | 0/433 [00:00<?, ?it/s]

Training: loss: 0.4246, accuracy 77.8324
Validation: loss: 0.2018, AUC: 97.62, ACC : 92.7752, (best ACC: 0.9278 at epoch 1)
lr : 1.914412198721102e-05
!! New best val AUC !!
!! New best val ACC !!

New epoch, epoch 2 / 10


  0%|          | 0/433 [00:00<?, ?it/s]

Training: loss: 0.2041, accuracy 92.5289
Validation: loss: 0.2264, AUC: 97.36, ACC : 92.0872, (best ACC: 0.9278 at epoch 1)
lr : 1.7014264633546484e-05
!! New best val ACC !!

New epoch, epoch 3 / 10


  0%|          | 0/433 [00:00<?, ?it/s]

Training: loss: 0.1133, accuracy 96.1705
Validation: loss: 0.3094, AUC: 97.30, ACC : 92.5459, (best ACC: 0.9278 at epoch 1)
lr : 1.488440727988195e-05
!! New best val ACC !!

New epoch, epoch 4 / 10


  0%|          | 0/433 [00:00<?, ?it/s]

Training: loss: 0.0593, accuracy 97.8324
Validation: loss: 0.3202, AUC: 97.09, ACC : 91.8578, (best ACC: 0.9278 at epoch 1)
lr : 1.2754549926217414e-05
!! New best val ACC !!

New epoch, epoch 5 / 10


  0%|          | 0/433 [00:00<?, ?it/s]

Training: loss: 0.0389, accuracy 98.7428
Validation: loss: 0.3955, AUC: 97.50, ACC : 90.9404, (best ACC: 0.9278 at epoch 1)
lr : 1.062469257255288e-05
!! New best val ACC !!

New epoch, epoch 6 / 10


  0%|          | 0/433 [00:00<?, ?it/s]

Training: loss: 0.0242, accuracy 99.2052
Validation: loss: 0.4855, AUC: 97.34, ACC : 92.3165, (best ACC: 0.9278 at epoch 1)
lr : 8.494835218888343e-06
!! New best val ACC !!

New epoch, epoch 7 / 10


  0%|          | 0/433 [00:00<?, ?it/s]

Training: loss: 0.0195, accuracy 99.4220
Validation: loss: 0.4418, AUC: 97.17, ACC : 92.7752, (best ACC: 0.9278 at epoch 1)
lr : 6.364977865223808e-06
!! New best val ACC !!

New epoch, epoch 8 / 10


  0%|          | 0/433 [00:00<?, ?it/s]

Training: loss: 0.0119, accuracy 99.6243
Validation: loss: 0.4242, AUC: 97.32, ACC : 92.5459, (best ACC: 0.9278 at epoch 1)
lr : 4.235120511559273e-06
!! New best val ACC !!

New epoch, epoch 9 / 10


  0%|          | 0/433 [00:00<?, ?it/s]

Training: loss: 0.0078, accuracy 99.8121
Validation: loss: 0.4487, AUC: 97.38, ACC : 92.8899, (best ACC: 0.9289 at epoch 9)
lr : 2.105263157894737e-06
!! New best val ACC !!

New epoch, epoch 10 / 10


  0%|          | 0/433 [00:00<?, ?it/s]

Training: loss: 0.0032, accuracy 99.8844
Validation: loss: 0.4677, AUC: 97.38, ACC : 92.7752, (best ACC: 0.9289 at epoch 9)
lr : 0.0
!! New best val ACC !!
Test AUC of batch size 16, lr 2e-05: 99.00
Test ACC of batch size 16, lr 2e-05: 94.62


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi


Epoch 0 (before training) val loss: 0.6952, AUC: 47.85, ACC: 50.9174
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Fine-Tuning of batch size : 64, lr : 5e-05
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

New epoch, epoch 1 / 10


  0%|          | 0/109 [00:00<?, ?it/s]

Training: loss: 0.4341, accuracy 77.2543
Validation: loss: 0.2626, AUC: 96.99, ACC : 90.8257, (best ACC: 0.9083 at epoch 1)
lr : 4.778761061946903e-05
!! New best val AUC !!
!! New best val ACC !!

New epoch, epoch 2 / 10


  0%|          | 0/109 [00:00<?, ?it/s]

Training: loss: 0.1964, accuracy 93.0202
Validation: loss: 0.2497, AUC: 96.85, ACC : 92.4312, (best ACC: 0.9243 at epoch 2)
lr : 4.2428711897738446e-05
!! New best val ACC !!

New epoch, epoch 3 / 10


  0%|          | 0/109 [00:00<?, ?it/s]

Training: loss: 0.1085, accuracy 96.1416
Validation: loss: 0.2676, AUC: 97.60, ACC : 92.7752, (best ACC: 0.9278 at epoch 3)
lr : 3.706981317600787e-05
!! New best val AUC !!
!! New best val ACC !!

New epoch, epoch 4 / 10


  0%|          | 0/109 [00:00<?, ?it/s]

Training: loss: 0.0655, accuracy 97.8035
Validation: loss: 0.3344, AUC: 97.20, ACC : 91.8578, (best ACC: 0.9278 at epoch 3)
lr : 3.171091445427729e-05
!! New best val ACC !!

New epoch, epoch 5 / 10


  0%|          | 0/109 [00:00<?, ?it/s]

Training: loss: 0.0358, accuracy 98.8150
Validation: loss: 0.3939, AUC: 97.13, ACC : 91.7431, (best ACC: 0.9278 at epoch 3)
lr : 2.635201573254671e-05
!! New best val ACC !!

New epoch, epoch 6 / 10


  0%|          | 0/109 [00:00<?, ?it/s]

Training: loss: 0.0272, accuracy 99.1474
Validation: loss: 0.3115, AUC: 97.33, ACC : 91.8578, (best ACC: 0.9278 at epoch 3)
lr : 2.0993117010816127e-05
!! New best val ACC !!

New epoch, epoch 7 / 10


  0%|          | 0/109 [00:00<?, ?it/s]

Training: loss: 0.0169, accuracy 99.3642
Validation: loss: 0.4121, AUC: 97.41, ACC : 92.8899, (best ACC: 0.9289 at epoch 7)
lr : 1.5634218289085548e-05
!! New best val ACC !!

New epoch, epoch 8 / 10


  0%|          | 0/109 [00:00<?, ?it/s]

Training: loss: 0.0131, accuracy 99.5665
Validation: loss: 0.3870, AUC: 97.50, ACC : 92.5459, (best ACC: 0.9289 at epoch 7)
lr : 1.0275319567354965e-05
!! New best val ACC !!

New epoch, epoch 9 / 10


  0%|          | 0/109 [00:00<?, ?it/s]

Training: loss: 0.0098, accuracy 99.7399
Validation: loss: 0.3869, AUC: 97.57, ACC : 93.0046, (best ACC: 0.9300 at epoch 9)
lr : 4.9164208456243854e-06
!! New best val ACC !!

New epoch, epoch 10 / 10


  0%|          | 0/109 [00:00<?, ?it/s]

Training: loss: 0.0019, accuracy 99.9566
Validation: loss: 0.4137, AUC: 97.58, ACC : 93.0046, (best ACC: 0.9300 at epoch 9)
lr : 0.0
!! New best val ACC !!
Test AUC of batch size 64, lr 5e-05: 98.74
Test ACC of batch size 64, lr 5e-05: 94.45


Weight Decay

In [None]:
## larger and smaller weight decay
weight_decay_set = [0.01,0.05,0.15,0.2]

In [None]:
# Default hyperparameters
model_name = "roberta-base" 
max_length = 50
batch_size = 32
n_epochs = 10
lr = 3e-5
warmup = 0.06
beta_1 = 0.9
beta_2 = 0.98
adam_ep = 1e-6
weight_decay = 0.1

### Seed all test to seed 0
seed_val = 0
set_seed(seed_value = seed_val)
for i in range(len(weight_decay_set)):
  weight_decay = weight_decay_set[i]
  train_loader,val_loader,test_loader,data_len = load_data(max_length,batch_size)
  model,optimizer,scheduler = model_init(model_name,optimizer,data_len)
  model.to(device)
  val_aucs = []
  val_accs = []
  best_score = 0.
  patience_counter = 0
  val_loss, val_auc, _, _, val_acc= eval(val_loader, model)
  val_aucs.append(val_auc)
  val_accs.append(val_acc)
  print("\nEpoch 0 (before training) val loss: {:.4f}, AUC: {:.2f}, ACC: {:.4f}".format(val_loss, val_auc,val_acc*100))
  print ('--'*89)
  print ('Fine-Tuning of weight decay : {}'.format(weight_decay))
  print ('--'*89)
  for epoch in range(1, n_epochs + 1):
      
      print("\nNew epoch, epoch {} / {}".format(epoch, n_epochs))

      # Training round
      train_loss,train_accuracy = train(train_loader, model, optimizer, scheduler)

      # Validation round
      val_loss, val_auc, _, _,val_acc= eval(val_loader, model)
      val_aucs.append(val_auc)
      val_accs.append(val_acc)

      print("Training: loss: {:.4f}, accuracy {:.4f}".format(train_loss,train_accuracy*100))
    
      print ("Validation: loss: {:.4f}, AUC: {:.2f}, ACC : {:.4f}, (best ACC: {:.4f} at epoch {})"
      .format(val_loss, val_auc,val_acc*100,np.max(np.array(val_accs)),np.argmax(np.array(val_accs*100))))
      # scheduler.step(val_acc)


      print ('lr : {}'.format(scheduler.optimizer.param_groups[0]['lr']))
      if val_auc == np.max(np.array(val_aucs)):
          print("!! New best val AUC !!")
      if val_acc == np.max(np.array(val_acc)):
          print("!! New best val ACC !!")
  _, test_auc, test_probs, test_labels,test_acc = eval(test_loader, model) # val_loss, val_auc, val_probs, val_labels,val_acc
  print("Test AUC of weight decay {}: {:.2f}".format(weight_decay,test_auc))
  print ('Test ACC of of weight decay {}: {:.2f}'.format(weight_decay,test_acc*100))



Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi


Epoch 0 (before training) val loss: 0.6931, AUC: 55.61, ACC: 50.9174
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Fine-Tuning of weight decay : 0.01
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

New epoch, epoch 1 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.4229, accuracy 78.2948
Validation: loss: 0.2186, AUC: 97.09, ACC : 92.0872, (best ACC: 0.9209 at epoch 1)
lr : 2.8701426463354646e-05
!! New best val AUC !!
!! New best val ACC !!

New epoch, epoch 2 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.1983, accuracy 92.7457
Validation: loss: 0.2371, AUC: 97.38, ACC : 92.5459, (best ACC: 0.9255 at epoch 2)
lr : 2.549926217412691e-05
!! New best val AUC !!
!! New best val ACC !!

New epoch, epoch 3 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.1107, accuracy 96.0983
Validation: loss: 0.2417, AUC: 97.83, ACC : 93.0046, (best ACC: 0.9300 at epoch 3)
lr : 2.2297097884899164e-05
!! New best val AUC !!
!! New best val ACC !!

New epoch, epoch 4 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.0644, accuracy 97.7890
Validation: loss: 0.3334, AUC: 97.75, ACC : 92.5459, (best ACC: 0.9300 at epoch 3)
lr : 1.9094933595671423e-05
!! New best val ACC !!

New epoch, epoch 5 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.0418, accuracy 98.5260
Validation: loss: 0.3350, AUC: 97.40, ACC : 92.6606, (best ACC: 0.9300 at epoch 3)
lr : 1.589276930644368e-05
!! New best val ACC !!

New epoch, epoch 6 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.0216, accuracy 99.3353
Validation: loss: 0.4109, AUC: 97.31, ACC : 92.5459, (best ACC: 0.9300 at epoch 3)
lr : 1.2690605017215937e-05
!! New best val ACC !!

New epoch, epoch 7 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.0187, accuracy 99.4798
Validation: loss: 0.3996, AUC: 97.41, ACC : 92.8899, (best ACC: 0.9300 at epoch 3)
lr : 9.488440727988196e-06
!! New best val ACC !!

New epoch, epoch 8 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.0097, accuracy 99.7688
Validation: loss: 0.4573, AUC: 97.60, ACC : 92.7752, (best ACC: 0.9300 at epoch 3)
lr : 6.286276438760452e-06
!! New best val ACC !!

New epoch, epoch 9 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.0081, accuracy 99.7688
Validation: loss: 0.4375, AUC: 97.67, ACC : 92.6606, (best ACC: 0.9300 at epoch 3)
lr : 3.08411214953271e-06
!! New best val ACC !!

New epoch, epoch 10 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.0030, accuracy 99.8844
Validation: loss: 0.4388, AUC: 97.75, ACC : 93.0046, (best ACC: 0.9300 at epoch 3)
lr : 0.0
!! New best val ACC !!
Test AUC of weight decay 0.01: 98.97
Test ACC of of weight decay 0.01: 94.78


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi


Epoch 0 (before training) val loss: 0.6953, AUC: 47.54, ACC: 50.9174
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Fine-Tuning of weight decay : 0.05
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

New epoch, epoch 1 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.4218, accuracy 77.0520
Validation: loss: 0.2492, AUC: 96.86, ACC : 89.9083, (best ACC: 0.8991 at epoch 1)
lr : 2.8701426463354646e-05
!! New best val AUC !!
!! New best val ACC !!

New epoch, epoch 2 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.2000, accuracy 92.6012
Validation: loss: 0.2283, AUC: 97.36, ACC : 91.7431, (best ACC: 0.9174 at epoch 2)
lr : 2.549926217412691e-05
!! New best val AUC !!
!! New best val ACC !!

New epoch, epoch 3 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.1197, accuracy 95.9682
Validation: loss: 0.2704, AUC: 97.42, ACC : 92.6606, (best ACC: 0.9266 at epoch 3)
lr : 2.2297097884899164e-05
!! New best val AUC !!
!! New best val ACC !!

New epoch, epoch 4 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.0675, accuracy 97.8035
Validation: loss: 0.3148, AUC: 97.35, ACC : 91.9725, (best ACC: 0.9266 at epoch 3)
lr : 1.9094933595671423e-05
!! New best val ACC !!

New epoch, epoch 5 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.0378, accuracy 98.7861
Validation: loss: 0.2947, AUC: 97.59, ACC : 92.4312, (best ACC: 0.9266 at epoch 3)
lr : 1.589276930644368e-05
!! New best val AUC !!
!! New best val ACC !!

New epoch, epoch 6 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.0243, accuracy 99.2486
Validation: loss: 0.4502, AUC: 97.35, ACC : 92.6606, (best ACC: 0.9266 at epoch 3)
lr : 1.2690605017215937e-05
!! New best val ACC !!

New epoch, epoch 7 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.0212, accuracy 99.3497
Validation: loss: 0.4345, AUC: 97.57, ACC : 91.9725, (best ACC: 0.9266 at epoch 3)
lr : 9.488440727988196e-06
!! New best val ACC !!

New epoch, epoch 8 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.0102, accuracy 99.6676
Validation: loss: 0.4739, AUC: 97.38, ACC : 91.3991, (best ACC: 0.9266 at epoch 3)
lr : 6.286276438760452e-06
!! New best val ACC !!

New epoch, epoch 9 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.0071, accuracy 99.7832
Validation: loss: 0.4770, AUC: 97.16, ACC : 91.9725, (best ACC: 0.9266 at epoch 3)
lr : 3.08411214953271e-06
!! New best val ACC !!

New epoch, epoch 10 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.0029, accuracy 99.9422
Validation: loss: 0.4977, AUC: 97.07, ACC : 92.2018, (best ACC: 0.9266 at epoch 3)
lr : 0.0
!! New best val ACC !!
Test AUC of weight decay 0.05: 98.66
Test ACC of of weight decay 0.05: 94.29


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi


Epoch 0 (before training) val loss: 0.6953, AUC: 47.84, ACC: 50.9174
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Fine-Tuning of weight decay : 0.15
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

New epoch, epoch 1 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.4485, accuracy 76.1127
Validation: loss: 0.2299, AUC: 97.01, ACC : 91.8578, (best ACC: 0.9186 at epoch 1)
lr : 2.8701426463354646e-05
!! New best val AUC !!
!! New best val ACC !!

New epoch, epoch 2 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.1888, accuracy 93.0058
Validation: loss: 0.2185, AUC: 97.80, ACC : 92.6606, (best ACC: 0.9266 at epoch 2)
lr : 2.549926217412691e-05
!! New best val AUC !!
!! New best val ACC !!

New epoch, epoch 3 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.1122, accuracy 96.0549
Validation: loss: 0.3020, AUC: 97.44, ACC : 92.0872, (best ACC: 0.9266 at epoch 2)
lr : 2.2297097884899164e-05
!! New best val ACC !!

New epoch, epoch 4 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.0591, accuracy 97.9046
Validation: loss: 0.3790, AUC: 97.54, ACC : 91.8578, (best ACC: 0.9266 at epoch 2)
lr : 1.9094933595671423e-05
!! New best val ACC !!

New epoch, epoch 5 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.0314, accuracy 99.0173
Validation: loss: 0.4255, AUC: 97.74, ACC : 92.5459, (best ACC: 0.9266 at epoch 2)
lr : 1.589276930644368e-05
!! New best val ACC !!

New epoch, epoch 6 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.0214, accuracy 99.2919
Validation: loss: 0.4064, AUC: 97.68, ACC : 93.1193, (best ACC: 0.9312 at epoch 6)
lr : 1.2690605017215937e-05
!! New best val ACC !!

New epoch, epoch 7 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.0195, accuracy 99.3353
Validation: loss: 0.4140, AUC: 97.48, ACC : 93.0046, (best ACC: 0.9312 at epoch 6)
lr : 9.488440727988196e-06
!! New best val ACC !!

New epoch, epoch 8 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.0108, accuracy 99.6965
Validation: loss: 0.5239, AUC: 97.64, ACC : 91.5138, (best ACC: 0.9312 at epoch 6)
lr : 6.286276438760452e-06
!! New best val ACC !!

New epoch, epoch 9 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.0043, accuracy 99.8844
Validation: loss: 0.5021, AUC: 97.52, ACC : 92.5459, (best ACC: 0.9312 at epoch 6)
lr : 3.08411214953271e-06
!! New best val ACC !!

New epoch, epoch 10 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.0040, accuracy 99.8266
Validation: loss: 0.4943, AUC: 97.55, ACC : 92.6606, (best ACC: 0.9312 at epoch 6)
lr : 0.0
!! New best val ACC !!
Test AUC of weight decay 0.15: 98.95
Test ACC of of weight decay 0.15: 94.45


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi


Epoch 0 (before training) val loss: 0.7022, AUC: 46.32, ACC: 50.9174
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Fine-Tuning of weight decay : 0.2
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

New epoch, epoch 1 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.4425, accuracy 76.6474
Validation: loss: 0.2439, AUC: 96.92, ACC : 91.7431, (best ACC: 0.9174 at epoch 1)
lr : 2.8701426463354646e-05
!! New best val AUC !!
!! New best val ACC !!

New epoch, epoch 2 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.1928, accuracy 92.6156
Validation: loss: 0.2394, AUC: 97.58, ACC : 92.4312, (best ACC: 0.9243 at epoch 2)
lr : 2.549926217412691e-05
!! New best val AUC !!
!! New best val ACC !!

New epoch, epoch 3 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.1138, accuracy 96.0983
Validation: loss: 0.2896, AUC: 97.27, ACC : 91.2844, (best ACC: 0.9243 at epoch 2)
lr : 2.2297097884899164e-05
!! New best val ACC !!

New epoch, epoch 4 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.0635, accuracy 97.7746
Validation: loss: 0.3130, AUC: 97.47, ACC : 91.7431, (best ACC: 0.9243 at epoch 2)
lr : 1.9094933595671423e-05
!! New best val ACC !!

New epoch, epoch 5 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.0401, accuracy 98.6561
Validation: loss: 0.3493, AUC: 97.70, ACC : 92.8899, (best ACC: 0.9289 at epoch 5)
lr : 1.589276930644368e-05
!! New best val AUC !!
!! New best val ACC !!

New epoch, epoch 6 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.0250, accuracy 99.1763
Validation: loss: 0.4119, AUC: 97.37, ACC : 92.7752, (best ACC: 0.9289 at epoch 5)
lr : 1.2690605017215937e-05
!! New best val ACC !!

New epoch, epoch 7 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.0180, accuracy 99.4075
Validation: loss: 0.3845, AUC: 97.40, ACC : 92.8899, (best ACC: 0.9289 at epoch 5)
lr : 9.488440727988196e-06
!! New best val ACC !!

New epoch, epoch 8 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.0128, accuracy 99.5954
Validation: loss: 0.4629, AUC: 97.26, ACC : 92.3165, (best ACC: 0.9289 at epoch 5)
lr : 6.286276438760452e-06
!! New best val ACC !!

New epoch, epoch 9 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.0073, accuracy 99.8266
Validation: loss: 0.4543, AUC: 97.55, ACC : 93.0046, (best ACC: 0.9300 at epoch 9)
lr : 3.08411214953271e-06
!! New best val ACC !!

New epoch, epoch 10 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.0040, accuracy 99.9133
Validation: loss: 0.4849, AUC: 97.54, ACC : 93.1193, (best ACC: 0.9312 at epoch 10)
lr : 0.0
!! New best val ACC !!
Test AUC of weight decay 0.2: 98.81
Test ACC of of weight decay 0.2: 94.34


SGD optimizer

In [None]:
lr_set = [1e-3,2e-3,3e-3,1e-2]

In [None]:
# Default hyperparameters
model_name = "roberta-base" 
max_length = 50
batch_size = 32
n_epochs = 10
warmup = 0.06
momentum = 0.9 # seems to work well with this value
weight_decay = 0.1

### Test using SGD optimizer
### Seed all test to seed 0
seed_val = 0
set_seed(seed_value = seed_val)
for i in range(len(lr_set)):
  lr = lr_set[i]
  train_loader,val_loader,test_loader,data_len = load_data(max_length,batch_size)
  # model, optimizer and scheduler
  model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels = 2)
  optimizer = optim.SGD(model.parameters(),lr = lr,momentum = momentum,weight_decay = weight_decay)
  total_steps = int(n_epochs * (data_len/ batch_size))
  warmup_steps = int(warmup * total_steps)
  model.to(device)
  val_aucs = []
  val_accs = []
  best_score = 0.
  patience_counter = 0
  val_loss, val_auc, _, _, val_acc= eval(val_loader, model)
  val_aucs.append(val_auc)
  val_accs.append(val_acc)
  print("\nEpoch 0 (before training) val loss: {:.4f}, AUC: {:.2f}, ACC: {:.4f}".format(val_loss, val_auc,val_acc*100))
  print ('--'*89)
  print ('Fine-Tuning of lr : {}'.format(lr))
  print ('--'*89)
  for epoch in range(1, n_epochs + 1):
      
      print("\nNew epoch, epoch {} / {}".format(epoch, n_epochs))

      # Training round
      train_loss,train_accuracy = train(train_loader, model, optimizer, scheduler)

      # Validation round
      val_loss, val_auc, _, _,val_acc= eval(val_loader, model)
      val_aucs.append(val_auc)
      val_accs.append(val_acc)

      print("Training: loss: {:.4f}, accuracy {:.4f}".format(train_loss,train_accuracy*100))
    
      print ("Validation: loss: {:.4f}, AUC: {:.2f}, ACC : {:.4f}, (best ACC: {:.4f} at epoch {})"
      .format(val_loss, val_auc,val_acc*100,np.max(np.array(val_accs)),np.argmax(np.array(val_accs*100))))
      # scheduler.step(val_acc)


      print ('lr : {}'.format(scheduler.optimizer.param_groups[0]['lr']))
      if val_auc == np.max(np.array(val_aucs)):
          print("!! New best val AUC !!")
      if val_acc == np.max(np.array(val_acc)):
          print("!! New best val ACC !!")
  _, test_auc, test_probs, test_labels,test_acc = eval(test_loader, model) # val_loss, val_auc, val_probs, val_labels,val_acc
  print("Test AUC of lr {}: {:.2f}".format(lr,test_auc))
  print ('Test ACC of of lr {}: {:.2f}'.format(lr,test_acc*100))



Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie


Epoch 0 (before training) val loss: 0.6931, AUC: 55.61, ACC: 50.9174
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Fine-Tuning of lr : 0.001
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

New epoch, epoch 1 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.6085, accuracy 64.0462
Validation: loss: 0.3760, AUC: 93.37, ACC : 83.8303, (best ACC: 0.8383 at epoch 1)
lr : 0.0079390063944909
!! New best val AUC !!
!! New best val ACC !!

New epoch, epoch 2 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.4382, accuracy 80.3757
Validation: loss: 0.5117, AUC: 89.12, ACC : 73.2798, (best ACC: 0.8383 at epoch 1)
lr : 0.0068716182980816535
!! New best val ACC !!

New epoch, epoch 3 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.5351, accuracy 72.5289
Validation: loss: 0.6989, AUC: 88.16, ACC : 62.2706, (best ACC: 0.8383 at epoch 1)
lr : 0.005804230201672406
!! New best val ACC !!

New epoch, epoch 4 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.5888, accuracy 69.7399
Validation: loss: 0.5499, AUC: 86.91, ACC : 70.1835, (best ACC: 0.8383 at epoch 1)
lr : 0.004736842105263157
!! New best val ACC !!

New epoch, epoch 5 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.6025, accuracy 68.7572
Validation: loss: 0.5709, AUC: 86.34, ACC : 69.0367, (best ACC: 0.8383 at epoch 1)
lr : 0.0036694540088539106
!! New best val ACC !!

New epoch, epoch 6 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.6122, accuracy 67.6012
Validation: loss: 0.6005, AUC: 85.53, ACC : 69.4954, (best ACC: 0.8383 at epoch 1)
lr : 0.0026020659124446634
!! New best val ACC !!

New epoch, epoch 7 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.6115, accuracy 66.0260
Validation: loss: 0.6401, AUC: 86.10, ACC : 61.4679, (best ACC: 0.8383 at epoch 1)
lr : 0.0015346778160354158
!! New best val ACC !!

New epoch, epoch 8 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.6251, accuracy 65.6069
Validation: loss: 0.5581, AUC: 85.20, ACC : 75.2294, (best ACC: 0.8383 at epoch 1)
lr : 0.0004672897196261682
!! New best val ACC !!

New epoch, epoch 9 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.6315, accuracy 64.4364
Validation: loss: 0.6101, AUC: 84.92, ACC : 64.4495, (best ACC: 0.8383 at epoch 1)
lr : 0.0
!! New best val ACC !!

New epoch, epoch 10 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.6467, accuracy 63.1358
Validation: loss: 0.7484, AUC: 83.76, ACC : 50.9174, (best ACC: 0.8383 at epoch 1)
lr : 0.0
!! New best val ACC !!
Test AUC of lr 0.001: 83.31
Test ACC of of lr 0.001: 49.92


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie


Epoch 0 (before training) val loss: 0.6953, AUC: 47.54, ACC: 50.9174
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Fine-Tuning of lr : 0.002
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

New epoch, epoch 1 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.6894, accuracy 53.9884
Validation: loss: 0.6550, AUC: 78.38, ACC : 68.2339, (best ACC: 0.6823 at epoch 1)
lr : 0.0
!! New best val AUC !!
!! New best val ACC !!

New epoch, epoch 2 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.6501, accuracy 62.5578
Validation: loss: 0.6431, AUC: 82.38, ACC : 66.1697, (best ACC: 0.6823 at epoch 1)
lr : 0.0
!! New best val AUC !!
!! New best val ACC !!

New epoch, epoch 3 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.6814, accuracy 56.3150
Validation: loss: 0.6868, AUC: 81.56, ACC : 54.7018, (best ACC: 0.6823 at epoch 1)
lr : 0.0
!! New best val ACC !!

New epoch, epoch 4 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.6914, accuracy 52.6301
Validation: loss: 0.6930, AUC: 80.86, ACC : 50.9174, (best ACC: 0.6823 at epoch 1)
lr : 0.0
!! New best val ACC !!

New epoch, epoch 5 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.6922, accuracy 52.1676
Validation: loss: 0.6932, AUC: 80.43, ACC : 50.9174, (best ACC: 0.6823 at epoch 1)
lr : 0.0
!! New best val ACC !!

New epoch, epoch 6 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.6924, accuracy 52.1676
Validation: loss: 0.6932, AUC: 80.20, ACC : 50.9174, (best ACC: 0.6823 at epoch 1)
lr : 0.0
!! New best val ACC !!

New epoch, epoch 7 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.6923, accuracy 52.1676
Validation: loss: 0.6931, AUC: 79.93, ACC : 50.9174, (best ACC: 0.6823 at epoch 1)
lr : 0.0
!! New best val ACC !!

New epoch, epoch 8 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.6924, accuracy 52.1676
Validation: loss: 0.6931, AUC: 72.09, ACC : 50.9174, (best ACC: 0.6823 at epoch 1)
lr : 0.0
!! New best val ACC !!

New epoch, epoch 9 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.6924, accuracy 52.1676
Validation: loss: 0.6933, AUC: 50.00, ACC : 50.9174, (best ACC: 0.6823 at epoch 1)
lr : 0.0
!! New best val ACC !!

New epoch, epoch 10 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.6924, accuracy 52.1676
Validation: loss: 0.6932, AUC: 50.00, ACC : 50.9174, (best ACC: 0.6823 at epoch 1)
lr : 0.0
!! New best val ACC !!
Test AUC of lr 0.002: 50.00
Test ACC of of lr 0.002: 49.92


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie


Epoch 0 (before training) val loss: 0.6953, AUC: 47.84, ACC: 50.9174
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Fine-Tuning of lr : 0.003
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

New epoch, epoch 1 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.6934, accuracy 52.6879
Validation: loss: 0.6920, AUC: 68.87, ACC : 50.9174, (best ACC: 0.5092 at epoch 0)
lr : 0.0
!! New best val AUC !!
!! New best val ACC !!

New epoch, epoch 2 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.6925, accuracy 51.5029
Validation: loss: 0.6941, AUC: 67.57, ACC : 50.9174, (best ACC: 0.5092 at epoch 0)
lr : 0.0
!! New best val ACC !!

New epoch, epoch 3 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.6925, accuracy 52.1965
Validation: loss: 0.6933, AUC: 60.44, ACC : 50.9174, (best ACC: 0.5092 at epoch 0)
lr : 0.0
!! New best val ACC !!

New epoch, epoch 4 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.6922, accuracy 52.1676
Validation: loss: 0.6931, AUC: 56.83, ACC : 50.9174, (best ACC: 0.5092 at epoch 0)
lr : 0.0
!! New best val ACC !!

New epoch, epoch 5 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.6923, accuracy 52.1676
Validation: loss: 0.6930, AUC: 50.00, ACC : 50.9174, (best ACC: 0.5092 at epoch 0)
lr : 0.0
!! New best val ACC !!

New epoch, epoch 6 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.6924, accuracy 52.1676
Validation: loss: 0.6933, AUC: 50.00, ACC : 50.9174, (best ACC: 0.5092 at epoch 0)
lr : 0.0
!! New best val ACC !!

New epoch, epoch 7 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.6924, accuracy 52.1676
Validation: loss: 0.6931, AUC: 50.00, ACC : 50.9174, (best ACC: 0.5092 at epoch 0)
lr : 0.0
!! New best val ACC !!

New epoch, epoch 8 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.6924, accuracy 51.9364
Validation: loss: 0.6939, AUC: 50.00, ACC : 50.9174, (best ACC: 0.5092 at epoch 0)
lr : 0.0
!! New best val ACC !!

New epoch, epoch 9 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.6923, accuracy 52.1676
Validation: loss: 0.6930, AUC: 50.00, ACC : 50.9174, (best ACC: 0.5092 at epoch 0)
lr : 0.0
!! New best val ACC !!

New epoch, epoch 10 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.6923, accuracy 52.1676
Validation: loss: 0.6930, AUC: 50.00, ACC : 50.9174, (best ACC: 0.5092 at epoch 0)
lr : 0.0
!! New best val ACC !!
Test AUC of lr 0.003: 50.00
Test ACC of of lr 0.003: 49.92


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie


Epoch 0 (before training) val loss: 0.7022, AUC: 46.32, ACC: 50.9174
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Fine-Tuning of lr : 0.01
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

New epoch, epoch 1 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.6954, accuracy 51.0405
Validation: loss: 0.6932, AUC: 46.52, ACC : 49.0826, (best ACC: 0.5092 at epoch 0)
lr : 0.0
!! New best val AUC !!
!! New best val ACC !!

New epoch, epoch 2 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.6927, accuracy 52.0520
Validation: loss: 0.6930, AUC: 50.00, ACC : 50.9174, (best ACC: 0.5092 at epoch 0)
lr : 0.0
!! New best val AUC !!
!! New best val ACC !!

New epoch, epoch 3 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.6928, accuracy 51.4740
Validation: loss: 0.6930, AUC: 50.00, ACC : 50.9174, (best ACC: 0.5092 at epoch 0)
lr : 0.0
!! New best val AUC !!
!! New best val ACC !!

New epoch, epoch 4 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.6927, accuracy 52.1676
Validation: loss: 0.6930, AUC: 50.00, ACC : 50.9174, (best ACC: 0.5092 at epoch 0)
lr : 0.0
!! New best val AUC !!
!! New best val ACC !!

New epoch, epoch 5 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.6927, accuracy 51.7919
Validation: loss: 0.6930, AUC: 50.00, ACC : 50.9174, (best ACC: 0.5092 at epoch 0)
lr : 0.0
!! New best val AUC !!
!! New best val ACC !!

New epoch, epoch 6 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.6926, accuracy 52.1676
Validation: loss: 0.6931, AUC: 50.00, ACC : 50.9174, (best ACC: 0.5092 at epoch 0)
lr : 0.0
!! New best val AUC !!
!! New best val ACC !!

New epoch, epoch 7 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.6925, accuracy 52.1676
Validation: loss: 0.6930, AUC: 50.00, ACC : 50.9174, (best ACC: 0.5092 at epoch 0)
lr : 0.0
!! New best val AUC !!
!! New best val ACC !!

New epoch, epoch 8 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.6921, accuracy 51.9653
Validation: loss: 0.6931, AUC: 50.00, ACC : 50.9174, (best ACC: 0.5092 at epoch 0)
lr : 0.0
!! New best val AUC !!
!! New best val ACC !!

New epoch, epoch 9 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.6929, accuracy 51.4740
Validation: loss: 0.6930, AUC: 50.00, ACC : 50.9174, (best ACC: 0.5092 at epoch 0)
lr : 0.0
!! New best val AUC !!
!! New best val ACC !!

New epoch, epoch 10 / 10


  0%|          | 0/217 [00:00<?, ?it/s]

Training: loss: 0.6925, accuracy 52.1676
Validation: loss: 0.6930, AUC: 50.00, ACC : 50.9174, (best ACC: 0.5092 at epoch 0)
lr : 0.0
!! New best val AUC !!
!! New best val ACC !!
Test AUC of lr 0.01: 50.00
Test ACC of of lr 0.01: 49.92


Load data function for Automatic fine-tuning

In [8]:
def load_data(config,data_dir = None):
    if load_data  == None:
        datasets.load_dataset("SetFit/sst2")
    data_train = datasets.Dataset.to_pandas(data["train"])
    data_val = datasets.Dataset.to_pandas(data["validation"])
    data_test = datasets.Dataset.to_pandas(data["test"])
    ## Emotion dataset
    train_dataset = SST2Dataset(tokenizer, data_train, config['max_length'])
    val_dataset = SST2Dataset(tokenizer, data_val, config['max_length'])
    test_dataset = SST2Dataset(tokenizer, data_test, config['max_length'])
    ## dataloader
    # train_loader = torch.utils.data.DataLoader(train_dataset, batch_size = batch_size, shuffle = True) # shuffle the training set
    # val_loader = torch.utils.data.DataLoader(val_dataset, batch_size = batch_size, shuffle = False)
    # test_loader = torch.utils.data.DataLoader(test_dataset, batch_size = batch_size, shuffle = False)
    print (len(data["train"]["text"]))
    return train_dataset,val_dataset,test_dataset,len(data["train"]["text"])

Train function

In [9]:
def train_function(config, checkpoint_dir='fine_tuned_model', data_dir=None):

    # if checkpoint_dir == None:
    #   checkpoint_dir = 'fine-tuned_model'
    ## define model
    seed_value = 0
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels = 2)
    # model = FinetuneModel(model,d_in,config['d_h'],d_out,config['dropout'],freeze_pretrained) #pretrained_model, d_in, d_h, d_out, freeze_pretrained=False

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
    model.to(device)
    print (f'is GPU available? {torch.cuda.is_available()}')
    print (f'device used: {device}')
### Optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
            {
                    'params':[p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                    'weight_decay':config['weight_decay']
            },
            {
                    'params':[p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                    'weight_decay':0.0
            }
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=config['lr'],betas = (config['beta1'],config['beta2']),eps = config['eps'])
    criterion = nn.CrossEntropyLoss()

    # if checkpoint_dir:
    #     model_state, optimizer_state = torch.load(
    #         os.path.join(checkpoint_dir, "checkpoint"))
    #     model.load_state_dict(model_state)
    #     optimizer.load_state_dict(optimizer_state)

    trainset, valset,_,len_data = load_data(config,data_dir)
    ## scheduler
    total_steps = int(config['num_epochs'] * (len_data / config['batch_size'])) # get total_steps for scheduler
    warmup_steps = int(config['warmup'] * total_steps)
    scheduler = get_linear_schedule_with_warmup(
      optimizer,
      num_warmup_steps = warmup_steps, 
      num_training_steps = total_steps)

    trainloader = torch.utils.data.DataLoader(
        trainset,
        batch_size=int(config["batch_size"]),
        shuffle=True)
    valloader = torch.utils.data.DataLoader(
        valset,
        batch_size=int(config["batch_size"]),
        shuffle=False)
    ## Training ####
    for epoch in range(config['num_epochs']):  # loop over the dataset multiple times
        running_loss = 0.0
        epoch_steps = 0
        total = 0
        correct = 0
        for batch_index, (batch_seqs, batch_seq_masks, batch_seq_segments,batch_labels) in enumerate (trainloader):
            # get the inputs; data is a list of [inputs, labels]
            model.zero_grad()

            input_ids = batch_seqs
            attention_mask = batch_seq_masks
            labels = batch_labels

            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            loss,outputs = model(input_ids,attention_mask,labels = labels)[:2]
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            # loss = criterion(outputs, labels)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), max_norm = config['max_gradient_norm'])
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            

            # print statistics
            running_loss += loss.item()
            epoch_steps += 1
            if batch_index % 10 == 0:  # print every 2000 mini-batches
                print("Epoch: {}, batch no: {} loss: {:.2f}, accuracy: {:.2f}%".format(epoch + 1, batch_index + 1,
                                                running_loss / epoch_steps,(correct/total)*100))
                running_loss = 0.0

        # Validation loss
        val_loss = 0.0
        val_steps = 0
        total = 0
        acc_correct = 0
        correct = 0
        running_loss = 0
        print ('------'*89)
        print ('Start Validating')
        print ('------'*89)
        for batch_index, (batch_seqs, batch_seq_masks, batch_seq_segments,batch_labels) in enumerate (valloader):
            # get the inputs; data is a list of [inputs, labels]
            with torch.no_grad():
                input_ids = batch_seqs
                attention_mask = batch_seq_masks
                labels = batch_labels

                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                labels = labels.to(device)

                loss,outputs = model(input_ids,attention_mask,labels = labels)[:2]
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                acc_correct += (predicted == labels).sum().item()
                correct = acc_correct
                # loss = criterion(outputs, labels)

                val_loss += loss.cpu().numpy()
                val_steps += 1

            ### printing item
            running_loss += loss.item()
            if batch_index % 10 == 0:  # print every 2000 mini-batches
                print("Epoch: {}, batch no: {} loss: {:.2f}, accuracy: {:.2f}%".format(epoch + 1, batch_index + 1,
                                                running_loss / val_steps,(correct/total)*100))
                running_loss = 0.0
                
        with tune.checkpoint_dir(epoch) as checkpoint_dir:
            path = os.path.join(checkpoint_dir, "checkpoint")
            print (f'checkpoint path : {path}')
            torch.save((model.state_dict(), optimizer.state_dict(),scheduler.state_dict()), path)
            print ('Model Saved!')
        accuracy = correct/(len(valloader.dataset))
        tune.report(loss=(val_loss / val_steps), accuracy=accuracy)
        ## scheduler
    # test_acc,test_loss = test_accuracy(model, device) # evaluate on test set
    # tune.report(loss = test_loss,accuracy = test_acc)
    print("Finished Training")

Test accuracy function for test set

In [10]:
def test_accuracy(config,model, device="cpu"):
    _,_,testset,_ = load_data(config)

    testloader = torch.utils.data.DataLoader(
        testset, batch_size=config['batch_size'], shuffle=False)
    criterion = nn.CrossEntropyLoss()
    correct = 0
    total = 0
    test_loss = 0.
    test_steps = 0
    with torch.no_grad():
        for batch_index, (batch_seqs, batch_seq_masks,batch_seq_segments, batch_labels) in enumerate (testloader):
      
            batch_seqs, batch_seq_masks,batch_labels = batch_seqs.to(device), batch_seq_masks.to(device), batch_labels.to(device)
            loss,outputs = model(batch_seqs,batch_seq_masks,labels = batch_labels)[:2]
            _, predicted = torch.max(outputs.data, 1)
            # loss = criterion(outputs,batch_labels)
            total += batch_labels.size(0)
            correct += (predicted == batch_labels).sum().item()
            test_loss += loss.cpu().numpy()
            test_steps += 1

    return correct / total, test_loss/test_steps

In [14]:
config = {
    "num_epochs": tune.randint(3,10),
    "lr": tune.loguniform(1e-5, 1e-4),
    "batch_size": tune.choice([8,16,32,64]),
    "weight_decay": tune.uniform(0.0, 0.15),
    "beta1" : tune.uniform(0.8,0.95),
    "beta2" : tune.uniform(0.995,0.999),
    "eps": tune.loguniform(1e-8,1e-6),
    "max_gradient_norm": tune.randint(7,12),
    "dropout": tune.choice([0.1,0.2,0.3]),
    "warmup": tune.choice([0.04,0.06,0.08,0.1,0.12]),
    "max_length": tune.choice([50,100])
}

In [15]:
def main(num_samples=10,max_num_epochs =10, gpus_per_trial=1):
    # load_data()
    # ray.init()
    reporter = CLIReporter(
        # parameter_columns=["l1", "l2", "lr", "batch_size"],
        metric_columns=["loss", "accuracy", "training_iteration"])
    
    # stopper = TrialPlateauStopper(metric = 'accuracy',metric_threshold=0.95, mode = 'max')
    #PBT scheduler
    # scheduler = PopulationBasedTraining(
    #     metric="accuracy",
    #     mode="max",
    #     perturbation_interval=2,
    #     hyperparam_mutations={
    #       "num_epochs": tune.randint(3,10),
    #       "weight_decay": tune.uniform(0.0, 0.1),
    #       "lr": tune.loguniform(1e-5, 1e-4),
    #       "batch_size": tune.choice([ 8, 16,32]),
    #       "beta1" : tune.uniform(0.85,0.95),
    #       "beta2" : tune.uniform(0.995,0.999),
    #       "eps": tune.loguniform(1e-8,1e-6),
    #       "max_gradient_norm": tune.randint(7,12)
    # })
    ## ASHA scheduler
    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2)
    
    result = tune.run(
        train_function,
        resources_per_trial={"cpu": 2, "gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter = reporter,
        )

    best_trial = result.get_best_trial("accuracy", "max", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))
    print("Best trial final validation accuracy: {}".format(
        best_trial.last_result["accuracy"]))

    best_trained_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels = 2)
    if torch.cuda.is_available():
        device = "cuda:0"
        if gpus_per_trial > 1:
            best_trained_model = nn.DataParallel(best_trained_model)
    best_trained_model.to(device)

    best_checkpoint_dir = best_trial.checkpoint.value
    model_state, optimizer_state,scheduler_state = torch.load(os.path.join(
        best_checkpoint_dir, "checkpoint"))
    best_trained_model.load_state_dict(model_state)

    test_acc,test_loss = test_accuracy(best_trial.config,best_trained_model, device)
    print("Best trial test set accuracy: {} , test loss : {}".format(test_acc,test_loss))
    print (f'scheduler : {scheduler_state.keys()}')


if __name__ == "__main__":
    # You can change the number of GPUs per trial here:
    main(num_samples=20,max_num_epochs=10, gpus_per_trial=1)

Output hidden; open in https://colab.research.google.com to view.