In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import sys

# TODO: change this to the path to your homework folder
GOOGLE_DRIVE_PATH_AFTER_MYDRIVE = '487-final-project'
GOOGLE_DRIVE_PATH = os.path.join('drive', 'My Drive', GOOGLE_DRIVE_PATH_AFTER_MYDRIVE)
print(os.listdir(GOOGLE_DRIVE_PATH))
sys.path.append(GOOGLE_DRIVE_PATH)

['checkpoint_7.pth', 'checkpoint_no_na.pth', 'checkpoint_tuned.pth', 'checkpoint_tuned_1.pth', 'checkpoint_tuned_2.pth', 'credentials.pyc', 'credentials.py', '__pycache__', 'requirements.txt', 'preprocess.py', 'nb.ipynb', 'test.py', 'README.md', 'eval.py', 'scrape.py', 'naivebayes.py', 'checkpoint.pth', 'data', 'twitter.py', 'checkpoint_twitter.pth', 'checkpoint_twitter_2.pth', 'model.py', 'checkpoint_tuned_3.pth', 'train_tweet_model.ipynb', 'train_model.ipynb']


In [3]:
!pip install transformers
from transformers import BertTokenizer
import torch
import numpy as np
import pandas as pd
from torch import nn
from transformers import BertModel
from torch.optim import Adam
from google.colab import files

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 4.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 58.6 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 64.0 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 8.9 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 52.2 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml


In [4]:
from functools import partial
from tqdm import tqdm
tqdm = partial(tqdm, position=0, leave=True)

In [5]:
from model import train
from model import BertClassifier

# Dataset

In [17]:
np.random.seed(112)

df = pd.read_csv(os.path.join(GOOGLE_DRIVE_PATH, 'data/MBIC/labeled_dataset.csv'))

df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42), [int(.8*len(df)), int(.9*len(df))])

# Hyperparameter Tuning

In [None]:
# Hyperparameter Tuning Helper Function
from model import Dataset
def hyperparameter_tuning_helper(model_val, batch_size):
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")

  # model_val = BertClassifier()
  # check what checkpoint you need to load from
  # state_dict = torch.load(os.path.join(GOOGLE_DRIVE_PATH, checkpoint_path))
  # model_val.load_state_dict(state_dict)
  # model_val.to(device)

  validation_data = Dataset(df_val)
  val_dataloader = torch.utils.data.DataLoader(validation_data, batch_size=batch_size, shuffle=True)

  total_correct_val = 0
  with torch.no_grad():
    for val_input, val_label in tqdm(val_dataloader):

        val_label = val_label.to(device)
        mask = val_input['attention_mask'].to(device)
        input_id = val_input['input_ids'].squeeze(1).to(device)

        output = model_val(input_id, mask)
        num_correct = (output.argmax(dim=1) == val_label).sum().item()
        total_correct_val += num_correct
        
  print()
  print("Validation Accuracy of", total_correct_val/len(df_val))
  return total_correct_val

Hyperparameter Tuning on Validation Set

In [None]:
from transformers.utils.dummy_pt_objects import BeitForMaskedImageModeling
# epoch_list = [3, 5, 10, 15, 20]
learning_rate_list = [1e-6, 1e-5, 1e-4]
hidden_sizes = [128, 256, 512]
batch_sizes = [16, 8, 4, 2]

model_iteration = 0
best_validation_accuracy = 0
best_hyperparameters = {
                            "learning_rate": None,
                            "hidden_size": None,
                            "batch_size": None
                            }

epoch_value = 5
# for epoch_value in epoch_list:
for hidden_size in hidden_sizes:
  for learning_rate in learning_rate_list:
    for batch_size in batch_sizes:
      print(hidden_size, learning_rate, batch_size)
      bert_clf = BertClassifier(hidden_size=hidden_size)
      # checkpoint_string = "checkpoint_hyperparameter_tuning_" + str(model_iteration) + ".pth"
      train(bert_clf, df_train, df_val, learning_rate, epoch_value, batch_size=batch_size)
      # torch.save(bert_clf.state_dict(), os.path.join(GOOGLE_DRIVE_PATH, checkpoint_string))
      # files.download(os.path.join(GOOGLE_DRIVE_PATH, checkpoint_string))
      # test_model_helper(checkpoint_string)
      # state_dict = bert_clf.state_dict()

      current_val_accuracy = hyperparameter_tuning_helper(bert_clf, batch_size)

      if current_val_accuracy > best_validation_accuracy:
          best_validation_accuracy = current_val_accuracy
          best_hyperparameters["hidden_size"] = hidden_size
          best_hyperparameters["learning_rate"] = learning_rate
          best_hyperparameters["batch_size"] = batch_size

      model_iteration += 1

print("The highest accuracy achieved by the model was:", best_validation_accuracy)
print("Best Hyperparameters:", best_hyperparameters)

128 1e-06 16


Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Using Cuda: True


100%|██████████| 85/85 [01:53<00:00,  1.33s/it]


Epochs: 1 | Train Loss:  0.069             | Train Accuracy:  0.329             | Val Loss:  0.071             | Val Accuracy:  0.382


100%|██████████| 85/85 [01:57<00:00,  1.39s/it]


Epochs: 2 | Train Loss:  0.067             | Train Accuracy:  0.417             | Val Loss:  0.069             | Val Accuracy:  0.476


100%|██████████| 85/85 [02:00<00:00,  1.41s/it]


Epochs: 3 | Train Loss:  0.066             | Train Accuracy:  0.465             | Val Loss:  0.068             | Val Accuracy:  0.476


100%|██████████| 85/85 [02:01<00:00,  1.42s/it]


Epochs: 4 | Train Loss:  0.064             | Train Accuracy:  0.500             | Val Loss:  0.067             | Val Accuracy:  0.506


100%|██████████| 85/85 [02:01<00:00,  1.43s/it]


Epochs: 5 | Train Loss:  0.062             | Train Accuracy:  0.532             | Val Loss:  0.065             | Val Accuracy:  0.559


100%|██████████| 11/11 [00:05<00:00,  1.97it/s]



Validation Accuracy of 0.5176470588235295
128 1e-06 8


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using Cuda: True


100%|██████████| 170/170 [02:03<00:00,  1.38it/s]


Epochs: 1 | Train Loss:  0.131             | Train Accuracy:  0.441             | Val Loss:  0.137             | Val Accuracy:  0.429


100%|██████████| 170/170 [02:05<00:00,  1.36it/s]


Epochs: 2 | Train Loss:  0.126             | Train Accuracy:  0.499             | Val Loss:  0.131             | Val Accuracy:  0.506


100%|██████████| 170/170 [02:06<00:00,  1.35it/s]


Epochs: 3 | Train Loss:  0.119             | Train Accuracy:  0.593             | Val Loss:  0.126             | Val Accuracy:  0.559


100%|██████████| 170/170 [02:05<00:00,  1.35it/s]


Epochs: 4 | Train Loss:  0.110             | Train Accuracy:  0.701             | Val Loss:  0.116             | Val Accuracy:  0.700


100%|██████████| 170/170 [02:06<00:00,  1.35it/s]


Epochs: 5 | Train Loss:  0.100             | Train Accuracy:  0.768             | Val Loss:  0.110             | Val Accuracy:  0.712


100%|██████████| 22/22 [00:05<00:00,  3.91it/s]



Validation Accuracy of 0.711764705882353
128 1e-06 4


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using Cuda: True


100%|██████████| 340/340 [02:10<00:00,  2.60it/s]


Epochs: 1 | Train Loss:  0.270             | Train Accuracy:  0.421             | Val Loss:  0.269             | Val Accuracy:  0.465


100%|██████████| 340/340 [02:12<00:00,  2.57it/s]


Epochs: 2 | Train Loss:  0.260             | Train Accuracy:  0.482             | Val Loss:  0.260             | Val Accuracy:  0.494


100%|██████████| 340/340 [02:12<00:00,  2.57it/s]


Epochs: 3 | Train Loss:  0.247             | Train Accuracy:  0.539             | Val Loss:  0.246             | Val Accuracy:  0.582


100%|██████████| 340/340 [02:12<00:00,  2.57it/s]


Epochs: 4 | Train Loss:  0.225             | Train Accuracy:  0.666             | Val Loss:  0.226             | Val Accuracy:  0.682


100%|██████████| 340/340 [02:12<00:00,  2.57it/s]


Epochs: 5 | Train Loss:  0.195             | Train Accuracy:  0.796             | Val Loss:  0.202             | Val Accuracy:  0.735


100%|██████████| 43/43 [00:05<00:00,  7.62it/s]



Validation Accuracy of 0.7705882352941177
128 1e-06 2


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using Cuda: True


100%|██████████| 680/680 [02:29<00:00,  4.55it/s]


Epochs: 1 | Train Loss:  0.521             | Train Accuracy:  0.460             | Val Loss:  0.489             | Val Accuracy:  0.606


100%|██████████| 680/680 [02:31<00:00,  4.50it/s]


Epochs: 2 | Train Loss:  0.439             | Train Accuracy:  0.693             | Val Loss:  0.417             | Val Accuracy:  0.700


100%|██████████| 680/680 [02:31<00:00,  4.49it/s]


Epochs: 3 | Train Loss:  0.350             | Train Accuracy:  0.792             | Val Loss:  0.374             | Val Accuracy:  0.724


100%|██████████| 680/680 [02:31<00:00,  4.48it/s]


Epochs: 4 | Train Loss:  0.286             | Train Accuracy:  0.835             | Val Loss:  0.362             | Val Accuracy:  0.729


100%|██████████| 680/680 [02:31<00:00,  4.48it/s]


Epochs: 5 | Train Loss:  0.240             | Train Accuracy:  0.853             | Val Loss:  0.313             | Val Accuracy:  0.741


100%|██████████| 85/85 [00:06<00:00, 14.00it/s]



Validation Accuracy of 0.7352941176470589
128 1e-05 16


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using Cuda: True


100%|██████████| 85/85 [01:57<00:00,  1.38s/it]


Epochs: 1 | Train Loss:  0.067             | Train Accuracy:  0.466             | Val Loss:  0.067             | Val Accuracy:  0.465


100%|██████████| 85/85 [02:00<00:00,  1.42s/it]


Epochs: 2 | Train Loss:  0.056             | Train Accuracy:  0.621             | Val Loss:  0.047             | Val Accuracy:  0.771


100%|██████████| 85/85 [02:01<00:00,  1.42s/it]


Epochs: 3 | Train Loss:  0.031             | Train Accuracy:  0.885             | Val Loss:  0.031             | Val Accuracy:  0.865


100%|██████████| 85/85 [02:01<00:00,  1.43s/it]


Epochs: 4 | Train Loss:  0.016             | Train Accuracy:  0.960             | Val Loss:  0.027             | Val Accuracy:  0.876


100%|██████████| 85/85 [02:01<00:00,  1.43s/it]


Epochs: 5 | Train Loss:  0.010             | Train Accuracy:  0.975             | Val Loss:  0.021             | Val Accuracy:  0.912


100%|██████████| 11/11 [00:05<00:00,  1.97it/s]



Validation Accuracy of 0.9117647058823529
128 1e-05 8


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using Cuda: True


100%|██████████| 170/170 [02:03<00:00,  1.38it/s]


Epochs: 1 | Train Loss:  0.121             | Train Accuracy:  0.539             | Val Loss:  0.109             | Val Accuracy:  0.688


100%|██████████| 170/170 [02:05<00:00,  1.36it/s]


Epochs: 2 | Train Loss:  0.078             | Train Accuracy:  0.783             | Val Loss:  0.076             | Val Accuracy:  0.788


100%|██████████| 170/170 [02:05<00:00,  1.35it/s]


Epochs: 3 | Train Loss:  0.048             | Train Accuracy:  0.862             | Val Loss:  0.067             | Val Accuracy:  0.829


100%|██████████| 170/170 [02:05<00:00,  1.35it/s]


Epochs: 4 | Train Loss:  0.029             | Train Accuracy:  0.962             | Val Loss:  0.055             | Val Accuracy:  0.876


100%|██████████| 170/170 [02:05<00:00,  1.35it/s]


Epochs: 5 | Train Loss:  0.018             | Train Accuracy:  0.980             | Val Loss:  0.049             | Val Accuracy:  0.888


100%|██████████| 22/22 [00:05<00:00,  3.91it/s]



Validation Accuracy of 0.8941176470588236
128 1e-05 4


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using Cuda: True


100%|██████████| 340/340 [02:09<00:00,  2.62it/s]


Epochs: 1 | Train Loss:  0.251             | Train Accuracy:  0.512             | Val Loss:  0.219             | Val Accuracy:  0.653


100%|██████████| 340/340 [02:11<00:00,  2.58it/s]


Epochs: 2 | Train Loss:  0.181             | Train Accuracy:  0.704             | Val Loss:  0.160             | Val Accuracy:  0.771


100%|██████████| 340/340 [02:12<00:00,  2.57it/s]


Epochs: 3 | Train Loss:  0.098             | Train Accuracy:  0.904             | Val Loss:  0.121             | Val Accuracy:  0.841


100%|██████████| 340/340 [02:12<00:00,  2.57it/s]


Epochs: 4 | Train Loss:  0.044             | Train Accuracy:  0.968             | Val Loss:  0.080             | Val Accuracy:  0.882


100%|██████████| 340/340 [02:12<00:00,  2.57it/s]


Epochs: 5 | Train Loss:  0.032             | Train Accuracy:  0.971             | Val Loss:  0.092             | Val Accuracy:  0.888


100%|██████████| 43/43 [00:05<00:00,  7.61it/s]



Validation Accuracy of 0.8764705882352941
128 1e-05 2


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using Cuda: True


100%|██████████| 680/680 [02:29<00:00,  4.55it/s]


Epochs: 1 | Train Loss:  0.423             | Train Accuracy:  0.646             | Val Loss:  0.319             | Val Accuracy:  0.782


100%|██████████| 680/680 [02:31<00:00,  4.50it/s]


Epochs: 2 | Train Loss:  0.186             | Train Accuracy:  0.893             | Val Loss:  0.244             | Val Accuracy:  0.841


100%|██████████| 680/680 [02:31<00:00,  4.49it/s]


Epochs: 3 | Train Loss:  0.094             | Train Accuracy:  0.953             | Val Loss:  0.199             | Val Accuracy:  0.882


100%|██████████| 680/680 [02:31<00:00,  4.49it/s]


Epochs: 4 | Train Loss:  0.060             | Train Accuracy:  0.975             | Val Loss:  0.154             | Val Accuracy:  0.924


100%|██████████| 680/680 [02:31<00:00,  4.48it/s]


Epochs: 5 | Train Loss:  0.041             | Train Accuracy:  0.983             | Val Loss:  0.183             | Val Accuracy:  0.894


100%|██████████| 85/85 [00:06<00:00, 14.00it/s]



Validation Accuracy of 0.9
128 0.0001 16


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using Cuda: True


100%|██████████| 85/85 [01:50<00:00,  1.30s/it]


Epochs: 1 | Train Loss:  0.069             | Train Accuracy:  0.388             | Val Loss:  0.071             | Val Accuracy:  0.388


100%|██████████| 85/85 [01:51<00:00,  1.31s/it]


Epochs: 2 | Train Loss:  0.069             | Train Accuracy:  0.404             | Val Loss:  0.071             | Val Accuracy:  0.388


100%|██████████| 85/85 [01:51<00:00,  1.31s/it]


Epochs: 3 | Train Loss:  0.069             | Train Accuracy:  0.403             | Val Loss:  0.071             | Val Accuracy:  0.388


100%|██████████| 85/85 [01:51<00:00,  1.31s/it]


Epochs: 4 | Train Loss:  0.069             | Train Accuracy:  0.404             | Val Loss:  0.071             | Val Accuracy:  0.388


100%|██████████| 85/85 [01:51<00:00,  1.31s/it]


Epochs: 5 | Train Loss:  0.069             | Train Accuracy:  0.404             | Val Loss:  0.071             | Val Accuracy:  0.388


100%|██████████| 11/11 [00:05<00:00,  1.99it/s]



Validation Accuracy of 0.38823529411764707
128 0.0001 8


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using Cuda: True


100%|██████████| 170/170 [02:02<00:00,  1.39it/s]


Epochs: 1 | Train Loss:  0.131             | Train Accuracy:  0.439             | Val Loss:  0.129             | Val Accuracy:  0.447


100%|██████████| 170/170 [02:03<00:00,  1.37it/s]


Epochs: 2 | Train Loss:  0.121             | Train Accuracy:  0.450             | Val Loss:  0.130             | Val Accuracy:  0.453


100%|██████████| 170/170 [02:04<00:00,  1.36it/s]


Epochs: 3 | Train Loss:  0.121             | Train Accuracy:  0.457             | Val Loss:  0.129             | Val Accuracy:  0.441


100%|██████████| 170/170 [02:01<00:00,  1.39it/s]


Epochs: 4 | Train Loss:  0.131             | Train Accuracy:  0.417             | Val Loss:  0.140             | Val Accuracy:  0.371


100%|██████████| 170/170 [02:01<00:00,  1.40it/s]


Epochs: 5 | Train Loss:  0.132             | Train Accuracy:  0.402             | Val Loss:  0.139             | Val Accuracy:  0.412


100%|██████████| 22/22 [00:05<00:00,  4.11it/s]



Validation Accuracy of 0.37058823529411766
128 0.0001 4


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using Cuda: True


100%|██████████| 340/340 [02:05<00:00,  2.70it/s]


Epochs: 1 | Train Loss:  0.264             | Train Accuracy:  0.452             | Val Loss:  0.265             | Val Accuracy:  0.447


100%|██████████| 340/340 [02:05<00:00,  2.71it/s]


Epochs: 2 | Train Loss:  0.264             | Train Accuracy:  0.447             | Val Loss:  0.272             | Val Accuracy:  0.453


100%|██████████| 340/340 [02:03<00:00,  2.76it/s]


Epochs: 3 | Train Loss:  0.263             | Train Accuracy:  0.457             | Val Loss:  0.274             | Val Accuracy:  0.447


100%|██████████| 340/340 [02:02<00:00,  2.76it/s]


Epochs: 4 | Train Loss:  0.263             | Train Accuracy:  0.457             | Val Loss:  0.272             | Val Accuracy:  0.447


100%|██████████| 340/340 [02:03<00:00,  2.76it/s]


Epochs: 5 | Train Loss:  0.263             | Train Accuracy:  0.457             | Val Loss:  0.272             | Val Accuracy:  0.447


100%|██████████| 43/43 [00:05<00:00,  7.71it/s]



Validation Accuracy of 0.4470588235294118
128 0.0001 2


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using Cuda: True


100%|██████████| 680/680 [02:23<00:00,  4.74it/s]


Epochs: 1 | Train Loss:  0.547             | Train Accuracy:  0.404             | Val Loss:  0.549             | Val Accuracy:  0.388


100%|██████████| 680/680 [02:23<00:00,  4.75it/s]


Epochs: 2 | Train Loss:  0.549             | Train Accuracy:  0.404             | Val Loss:  0.549             | Val Accuracy:  0.388


100%|██████████| 680/680 [02:23<00:00,  4.75it/s]


Epochs: 3 | Train Loss:  0.549             | Train Accuracy:  0.404             | Val Loss:  0.549             | Val Accuracy:  0.388


100%|██████████| 680/680 [02:23<00:00,  4.75it/s]


Epochs: 4 | Train Loss:  0.549             | Train Accuracy:  0.404             | Val Loss:  0.549             | Val Accuracy:  0.388


100%|██████████| 680/680 [02:23<00:00,  4.75it/s]


Epochs: 5 | Train Loss:  0.549             | Train Accuracy:  0.404             | Val Loss:  0.549             | Val Accuracy:  0.388


100%|██████████| 85/85 [00:05<00:00, 14.21it/s]



Validation Accuracy of 0.38823529411764707
256 1e-06 16


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using Cuda: True


100%|██████████| 85/85 [01:55<00:00,  1.35s/it]


Epochs: 1 | Train Loss:  0.069             | Train Accuracy:  0.394             | Val Loss:  0.071             | Val Accuracy:  0.382


100%|██████████| 85/85 [01:57<00:00,  1.38s/it]


Epochs: 2 | Train Loss:  0.068             | Train Accuracy:  0.398             | Val Loss:  0.070             | Val Accuracy:  0.441


100%|██████████| 85/85 [01:59<00:00,  1.40s/it]


Epochs: 3 | Train Loss:  0.067             | Train Accuracy:  0.471             | Val Loss:  0.070             | Val Accuracy:  0.447


100%|██████████| 85/85 [02:00<00:00,  1.42s/it]


Epochs: 4 | Train Loss:  0.066             | Train Accuracy:  0.515             | Val Loss:  0.069             | Val Accuracy:  0.465


100%|██████████| 85/85 [02:01<00:00,  1.42s/it]


Epochs: 5 | Train Loss:  0.062             | Train Accuracy:  0.618             | Val Loss:  0.064             | Val Accuracy:  0.612


100%|██████████| 11/11 [00:05<00:00,  1.97it/s]



Validation Accuracy of 0.6294117647058823
256 1e-06 8


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using Cuda: True


100%|██████████| 170/170 [02:03<00:00,  1.38it/s]


Epochs: 1 | Train Loss:  0.136             | Train Accuracy:  0.388             | Val Loss:  0.140             | Val Accuracy:  0.376


100%|██████████| 170/170 [02:05<00:00,  1.35it/s]


Epochs: 2 | Train Loss:  0.130             | Train Accuracy:  0.499             | Val Loss:  0.135             | Val Accuracy:  0.524


100%|██████████| 170/170 [02:05<00:00,  1.35it/s]


Epochs: 3 | Train Loss:  0.124             | Train Accuracy:  0.603             | Val Loss:  0.127             | Val Accuracy:  0.594


100%|██████████| 170/170 [02:05<00:00,  1.35it/s]


Epochs: 4 | Train Loss:  0.113             | Train Accuracy:  0.729             | Val Loss:  0.120             | Val Accuracy:  0.700


100%|██████████| 170/170 [02:05<00:00,  1.35it/s]


Epochs: 5 | Train Loss:  0.101             | Train Accuracy:  0.786             | Val Loss:  0.113             | Val Accuracy:  0.688


100%|██████████| 22/22 [00:05<00:00,  3.91it/s]



Validation Accuracy of 0.6941176470588235
256 1e-06 4


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using Cuda: True


100%|██████████| 340/340 [02:10<00:00,  2.61it/s]


Epochs: 1 | Train Loss:  0.272             | Train Accuracy:  0.379             | Val Loss:  0.271             | Val Accuracy:  0.412


100%|██████████| 340/340 [02:11<00:00,  2.58it/s]


Epochs: 2 | Train Loss:  0.254             | Train Accuracy:  0.527             | Val Loss:  0.253             | Val Accuracy:  0.559


100%|██████████| 340/340 [02:12<00:00,  2.57it/s]


Epochs: 3 | Train Loss:  0.224             | Train Accuracy:  0.701             | Val Loss:  0.222             | Val Accuracy:  0.700


100%|██████████| 340/340 [02:12<00:00,  2.57it/s]


Epochs: 4 | Train Loss:  0.182             | Train Accuracy:  0.854             | Val Loss:  0.198             | Val Accuracy:  0.753


100%|██████████| 340/340 [02:12<00:00,  2.57it/s]


Epochs: 5 | Train Loss:  0.147             | Train Accuracy:  0.896             | Val Loss:  0.172             | Val Accuracy:  0.812


100%|██████████| 43/43 [00:05<00:00,  7.60it/s]



Validation Accuracy of 0.8117647058823529
256 1e-06 2


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using Cuda: True


100%|██████████| 680/680 [02:31<00:00,  4.49it/s]


Epochs: 1 | Train Loss:  0.535             | Train Accuracy:  0.432             | Val Loss:  0.529             | Val Accuracy:  0.447


100%|██████████| 680/680 [02:31<00:00,  4.49it/s]


Epochs: 2 | Train Loss:  0.480             | Train Accuracy:  0.615             | Val Loss:  0.454             | Val Accuracy:  0.700


100%|██████████| 680/680 [02:31<00:00,  4.49it/s]


Epochs: 3 | Train Loss:  0.394             | Train Accuracy:  0.774             | Val Loss:  0.398             | Val Accuracy:  0.747


100%|██████████| 680/680 [02:31<00:00,  4.49it/s]


Epochs: 4 | Train Loss:  0.320             | Train Accuracy:  0.836             | Val Loss:  0.344             | Val Accuracy:  0.765


100%|██████████| 680/680 [02:31<00:00,  4.49it/s]


Epochs: 5 | Train Loss:  0.265             | Train Accuracy:  0.859             | Val Loss:  0.316             | Val Accuracy:  0.782


100%|██████████| 85/85 [00:06<00:00, 13.97it/s]



Validation Accuracy of 0.7823529411764706
256 1e-05 16


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using Cuda: True


100%|██████████| 85/85 [02:00<00:00,  1.42s/it]


Epochs: 1 | Train Loss:  0.064             | Train Accuracy:  0.481             | Val Loss:  0.062             | Val Accuracy:  0.582


100%|██████████| 85/85 [02:00<00:00,  1.42s/it]


Epochs: 2 | Train Loss:  0.047             | Train Accuracy:  0.735             | Val Loss:  0.042             | Val Accuracy:  0.747


100%|██████████| 85/85 [02:01<00:00,  1.42s/it]


Epochs: 3 | Train Loss:  0.029             | Train Accuracy:  0.846             | Val Loss:  0.034             | Val Accuracy:  0.794


100%|██████████| 85/85 [02:00<00:00,  1.42s/it]


Epochs: 4 | Train Loss:  0.018             | Train Accuracy:  0.905             | Val Loss:  0.025             | Val Accuracy:  0.876


100%|██████████| 85/85 [02:01<00:00,  1.42s/it]


Epochs: 5 | Train Loss:  0.010             | Train Accuracy:  0.969             | Val Loss:  0.020             | Val Accuracy:  0.912


100%|██████████| 11/11 [00:05<00:00,  1.96it/s]



Validation Accuracy of 0.9058823529411765
256 1e-05 8


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using Cuda: True


100%|██████████| 170/170 [02:05<00:00,  1.35it/s]


Epochs: 1 | Train Loss:  0.121             | Train Accuracy:  0.537             | Val Loss:  0.111             | Val Accuracy:  0.659


100%|██████████| 170/170 [02:05<00:00,  1.35it/s]


Epochs: 2 | Train Loss:  0.069             | Train Accuracy:  0.842             | Val Loss:  0.057             | Val Accuracy:  0.871


100%|██████████| 170/170 [02:05<00:00,  1.35it/s]


Epochs: 3 | Train Loss:  0.029             | Train Accuracy:  0.954             | Val Loss:  0.053             | Val Accuracy:  0.865


100%|██████████| 170/170 [02:05<00:00,  1.35it/s]


Epochs: 4 | Train Loss:  0.015             | Train Accuracy:  0.976             | Val Loss:  0.055             | Val Accuracy:  0.882


100%|██████████| 170/170 [02:05<00:00,  1.35it/s]


Epochs: 5 | Train Loss:  0.009             | Train Accuracy:  0.990             | Val Loss:  0.043             | Val Accuracy:  0.888


100%|██████████| 22/22 [00:05<00:00,  3.92it/s]



Validation Accuracy of 0.8823529411764706
256 1e-05 4


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using Cuda: True


100%|██████████| 340/340 [02:12<00:00,  2.57it/s]


Epochs: 1 | Train Loss:  0.259             | Train Accuracy:  0.435             | Val Loss:  0.246             | Val Accuracy:  0.500


100%|██████████| 340/340 [02:11<00:00,  2.58it/s]


Epochs: 2 | Train Loss:  0.190             | Train Accuracy:  0.635             | Val Loss:  0.168             | Val Accuracy:  0.706


100%|██████████| 340/340 [02:11<00:00,  2.58it/s]


Epochs: 3 | Train Loss:  0.101             | Train Accuracy:  0.871             | Val Loss:  0.114             | Val Accuracy:  0.865


100%|██████████| 340/340 [02:12<00:00,  2.57it/s]


Epochs: 4 | Train Loss:  0.039             | Train Accuracy:  0.970             | Val Loss:  0.066             | Val Accuracy:  0.935


100%|██████████| 340/340 [02:12<00:00,  2.57it/s]


Epochs: 5 | Train Loss:  0.021             | Train Accuracy:  0.982             | Val Loss:  0.058             | Val Accuracy:  0.935


100%|██████████| 43/43 [00:05<00:00,  7.63it/s]



Validation Accuracy of 0.9352941176470588
256 1e-05 2


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using Cuda: True


100%|██████████| 680/680 [02:31<00:00,  4.50it/s]


Epochs: 1 | Train Loss:  0.449             | Train Accuracy:  0.576             | Val Loss:  0.366             | Val Accuracy:  0.706


100%|██████████| 680/680 [02:31<00:00,  4.49it/s]


Epochs: 2 | Train Loss:  0.193             | Train Accuracy:  0.879             | Val Loss:  0.175             | Val Accuracy:  0.894


100%|██████████| 680/680 [02:31<00:00,  4.49it/s]


Epochs: 3 | Train Loss:  0.093             | Train Accuracy:  0.956             | Val Loss:  0.161             | Val Accuracy:  0.912


100%|██████████| 680/680 [02:31<00:00,  4.49it/s]


Epochs: 4 | Train Loss:  0.062             | Train Accuracy:  0.974             | Val Loss:  0.199             | Val Accuracy:  0.900


100%|██████████| 680/680 [02:31<00:00,  4.49it/s]


Epochs: 5 | Train Loss:  0.043             | Train Accuracy:  0.981             | Val Loss:  0.188             | Val Accuracy:  0.906


100%|██████████| 85/85 [00:06<00:00, 13.96it/s]



Validation Accuracy of 0.9117647058823529
256 0.0001 16


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using Cuda: True


100%|██████████| 85/85 [02:00<00:00,  1.42s/it]


Epochs: 1 | Train Loss:  0.062             | Train Accuracy:  0.451             | Val Loss:  0.065             | Val Accuracy:  0.424


100%|██████████| 85/85 [01:59<00:00,  1.41s/it]


Epochs: 2 | Train Loss:  0.057             | Train Accuracy:  0.515             | Val Loss:  0.063             | Val Accuracy:  0.494


100%|██████████| 85/85 [01:59<00:00,  1.41s/it]


Epochs: 3 | Train Loss:  0.057             | Train Accuracy:  0.521             | Val Loss:  0.064             | Val Accuracy:  0.465


100%|██████████| 85/85 [01:59<00:00,  1.41s/it]


Epochs: 4 | Train Loss:  0.057             | Train Accuracy:  0.491             | Val Loss:  0.064             | Val Accuracy:  0.465


100%|██████████| 85/85 [01:59<00:00,  1.41s/it]


Epochs: 5 | Train Loss:  0.057             | Train Accuracy:  0.493             | Val Loss:  0.063             | Val Accuracy:  0.453


100%|██████████| 11/11 [00:05<00:00,  1.99it/s]



Validation Accuracy of 0.4764705882352941
256 0.0001 8


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using Cuda: True


100%|██████████| 170/170 [02:03<00:00,  1.38it/s]


Epochs: 1 | Train Loss:  0.132             | Train Accuracy:  0.460             | Val Loss:  0.136             | Val Accuracy:  0.465


100%|██████████| 170/170 [02:03<00:00,  1.37it/s]


Epochs: 2 | Train Loss:  0.122             | Train Accuracy:  0.469             | Val Loss:  0.135             | Val Accuracy:  0.447


100%|██████████| 170/170 [02:03<00:00,  1.37it/s]


Epochs: 3 | Train Loss:  0.121             | Train Accuracy:  0.471             | Val Loss:  0.132             | Val Accuracy:  0.429


100%|██████████| 170/170 [02:04<00:00,  1.37it/s]


Epochs: 4 | Train Loss:  0.121             | Train Accuracy:  0.445             | Val Loss:  0.130             | Val Accuracy:  0.524


100%|██████████| 170/170 [02:04<00:00,  1.37it/s]


Epochs: 5 | Train Loss:  0.121             | Train Accuracy:  0.450             | Val Loss:  0.131             | Val Accuracy:  0.447


100%|██████████| 22/22 [00:05<00:00,  3.99it/s]



Validation Accuracy of 0.4588235294117647
256 0.0001 4


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using Cuda: True


100%|██████████| 340/340 [02:09<00:00,  2.62it/s]


Epochs: 1 | Train Loss:  0.260             | Train Accuracy:  0.438             | Val Loss:  0.275             | Val Accuracy:  0.447


100%|██████████| 340/340 [02:06<00:00,  2.68it/s]


Epochs: 2 | Train Loss:  0.260             | Train Accuracy:  0.455             | Val Loss:  0.257             | Val Accuracy:  0.447


100%|██████████| 340/340 [02:09<00:00,  2.63it/s]


Epochs: 3 | Train Loss:  0.242             | Train Accuracy:  0.471             | Val Loss:  0.258             | Val Accuracy:  0.447


100%|██████████| 340/340 [02:09<00:00,  2.63it/s]


Epochs: 4 | Train Loss:  0.242             | Train Accuracy:  0.465             | Val Loss:  0.262             | Val Accuracy:  0.435


100%|██████████| 340/340 [02:09<00:00,  2.62it/s]


Epochs: 5 | Train Loss:  0.241             | Train Accuracy:  0.463             | Val Loss:  0.259             | Val Accuracy:  0.476


100%|██████████| 43/43 [00:05<00:00,  7.81it/s]



Validation Accuracy of 0.43529411764705883
256 0.0001 2


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using Cuda: True


100%|██████████| 680/680 [02:28<00:00,  4.59it/s]


Epochs: 1 | Train Loss:  0.522             | Train Accuracy:  0.440             | Val Loss:  0.662             | Val Accuracy:  0.435


100%|██████████| 680/680 [02:30<00:00,  4.52it/s]


Epochs: 2 | Train Loss:  0.493             | Train Accuracy:  0.465             | Val Loss:  0.530             | Val Accuracy:  0.453


100%|██████████| 680/680 [02:30<00:00,  4.52it/s]


Epochs: 3 | Train Loss:  0.487             | Train Accuracy:  0.476             | Val Loss:  0.500             | Val Accuracy:  0.476


100%|██████████| 680/680 [02:30<00:00,  4.53it/s]


Epochs: 4 | Train Loss:  0.483             | Train Accuracy:  0.465             | Val Loss:  0.517             | Val Accuracy:  0.447


100%|██████████| 680/680 [02:30<00:00,  4.52it/s]


Epochs: 5 | Train Loss:  0.482             | Train Accuracy:  0.460             | Val Loss:  0.508             | Val Accuracy:  0.465


100%|██████████| 85/85 [00:05<00:00, 14.25it/s]



Validation Accuracy of 0.4647058823529412
512 1e-06 16


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using Cuda: True


100%|██████████| 85/85 [02:01<00:00,  1.42s/it]


Epochs: 1 | Train Loss:  0.069             | Train Accuracy:  0.389             | Val Loss:  0.071             | Val Accuracy:  0.376


100%|██████████| 85/85 [02:00<00:00,  1.42s/it]


Epochs: 2 | Train Loss:  0.068             | Train Accuracy:  0.426             | Val Loss:  0.070             | Val Accuracy:  0.424


100%|██████████| 85/85 [02:00<00:00,  1.42s/it]


Epochs: 3 | Train Loss:  0.066             | Train Accuracy:  0.448             | Val Loss:  0.068             | Val Accuracy:  0.453


100%|██████████| 85/85 [02:00<00:00,  1.42s/it]


Epochs: 4 | Train Loss:  0.066             | Train Accuracy:  0.456             | Val Loss:  0.069             | Val Accuracy:  0.459


100%|██████████| 85/85 [02:00<00:00,  1.42s/it]


Epochs: 5 | Train Loss:  0.065             | Train Accuracy:  0.462             | Val Loss:  0.067             | Val Accuracy:  0.453


100%|██████████| 11/11 [00:05<00:00,  1.98it/s]



Validation Accuracy of 0.45294117647058824
512 1e-06 8


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using Cuda: True


100%|██████████| 170/170 [02:01<00:00,  1.39it/s]


Epochs: 1 | Train Loss:  0.136             | Train Accuracy:  0.412             | Val Loss:  0.140             | Val Accuracy:  0.500


100%|██████████| 170/170 [02:03<00:00,  1.38it/s]


Epochs: 2 | Train Loss:  0.131             | Train Accuracy:  0.514             | Val Loss:  0.133             | Val Accuracy:  0.576


100%|██████████| 170/170 [02:04<00:00,  1.37it/s]


Epochs: 3 | Train Loss:  0.120             | Train Accuracy:  0.617             | Val Loss:  0.121             | Val Accuracy:  0.647


100%|██████████| 170/170 [02:04<00:00,  1.36it/s]


Epochs: 4 | Train Loss:  0.100             | Train Accuracy:  0.785             | Val Loss:  0.104             | Val Accuracy:  0.794


100%|██████████| 170/170 [02:04<00:00,  1.36it/s]


Epochs: 5 | Train Loss:  0.083             | Train Accuracy:  0.874             | Val Loss:  0.092             | Val Accuracy:  0.812


100%|██████████| 22/22 [00:05<00:00,  3.92it/s]



Validation Accuracy of 0.788235294117647
512 1e-06 4


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using Cuda: True


100%|██████████| 340/340 [02:09<00:00,  2.63it/s]


Epochs: 1 | Train Loss:  0.266             | Train Accuracy:  0.432             | Val Loss:  0.271             | Val Accuracy:  0.406


100%|██████████| 340/340 [02:10<00:00,  2.61it/s]


Epochs: 2 | Train Loss:  0.251             | Train Accuracy:  0.517             | Val Loss:  0.248             | Val Accuracy:  0.571


100%|██████████| 340/340 [02:10<00:00,  2.60it/s]


Epochs: 3 | Train Loss:  0.216             | Train Accuracy:  0.710             | Val Loss:  0.216             | Val Accuracy:  0.729


100%|██████████| 340/340 [02:12<00:00,  2.57it/s]


Epochs: 4 | Train Loss:  0.169             | Train Accuracy:  0.865             | Val Loss:  0.185             | Val Accuracy:  0.806


100%|██████████| 340/340 [02:12<00:00,  2.57it/s]


Epochs: 5 | Train Loss:  0.126             | Train Accuracy:  0.935             | Val Loss:  0.156             | Val Accuracy:  0.824


100%|██████████| 43/43 [00:05<00:00,  7.60it/s]



Validation Accuracy of 0.8294117647058824
512 1e-06 2


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using Cuda: True


100%|██████████| 680/680 [02:31<00:00,  4.49it/s]


Epochs: 1 | Train Loss:  0.523             | Train Accuracy:  0.455             | Val Loss:  0.508             | Val Accuracy:  0.488


100%|██████████| 680/680 [02:31<00:00,  4.49it/s]


Epochs: 2 | Train Loss:  0.447             | Train Accuracy:  0.657             | Val Loss:  0.431             | Val Accuracy:  0.712


100%|██████████| 680/680 [02:31<00:00,  4.49it/s]


Epochs: 3 | Train Loss:  0.326             | Train Accuracy:  0.812             | Val Loss:  0.334             | Val Accuracy:  0.759


100%|██████████| 680/680 [02:31<00:00,  4.49it/s]


Epochs: 4 | Train Loss:  0.238             | Train Accuracy:  0.868             | Val Loss:  0.297             | Val Accuracy:  0.782


100%|██████████| 680/680 [02:31<00:00,  4.49it/s]


Epochs: 5 | Train Loss:  0.171             | Train Accuracy:  0.935             | Val Loss:  0.266             | Val Accuracy:  0.806


100%|██████████| 85/85 [00:06<00:00, 14.01it/s]



Validation Accuracy of 0.8235294117647058
512 1e-05 16


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using Cuda: True


100%|██████████| 85/85 [01:54<00:00,  1.34s/it]


Epochs: 1 | Train Loss:  0.069             | Train Accuracy:  0.399             | Val Loss:  0.070             | Val Accuracy:  0.429


100%|██████████| 85/85 [02:00<00:00,  1.41s/it]


Epochs: 2 | Train Loss:  0.062             | Train Accuracy:  0.527             | Val Loss:  0.062             | Val Accuracy:  0.559


100%|██████████| 85/85 [02:01<00:00,  1.42s/it]


Epochs: 3 | Train Loss:  0.055             | Train Accuracy:  0.590             | Val Loss:  0.047             | Val Accuracy:  0.753


100%|██████████| 85/85 [02:01<00:00,  1.42s/it]


Epochs: 4 | Train Loss:  0.030             | Train Accuracy:  0.869             | Val Loss:  0.033             | Val Accuracy:  0.829


100%|██████████| 85/85 [02:01<00:00,  1.43s/it]


Epochs: 5 | Train Loss:  0.013             | Train Accuracy:  0.949             | Val Loss:  0.021             | Val Accuracy:  0.894


100%|██████████| 11/11 [00:05<00:00,  1.97it/s]



Validation Accuracy of 0.9058823529411765
512 1e-05 8


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using Cuda: True


100%|██████████| 170/170 [02:04<00:00,  1.37it/s]


Epochs: 1 | Train Loss:  0.132             | Train Accuracy:  0.455             | Val Loss:  0.126             | Val Accuracy:  0.559


100%|██████████| 170/170 [02:05<00:00,  1.35it/s]


Epochs: 2 | Train Loss:  0.100             | Train Accuracy:  0.705             | Val Loss:  0.111             | Val Accuracy:  0.624


100%|██████████| 170/170 [02:05<00:00,  1.35it/s]


Epochs: 3 | Train Loss:  0.067             | Train Accuracy:  0.813             | Val Loss:  0.080             | Val Accuracy:  0.735


100%|██████████| 170/170 [02:05<00:00,  1.36it/s]


Epochs: 4 | Train Loss:  0.040             | Train Accuracy:  0.842             | Val Loss:  0.060             | Val Accuracy:  0.765


100%|██████████| 170/170 [02:04<00:00,  1.36it/s]


Epochs: 5 | Train Loss:  0.029             | Train Accuracy:  0.857             | Val Loss:  0.055             | Val Accuracy:  0.782


100%|██████████| 22/22 [00:05<00:00,  3.88it/s]



Validation Accuracy of 0.7823529411764706
512 1e-05 4


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using Cuda: True


100%|██████████| 340/340 [02:11<00:00,  2.58it/s]


Epochs: 1 | Train Loss:  0.218             | Train Accuracy:  0.588             | Val Loss:  0.191             | Val Accuracy:  0.706


100%|██████████| 340/340 [02:12<00:00,  2.57it/s]


Epochs: 2 | Train Loss:  0.122             | Train Accuracy:  0.815             | Val Loss:  0.108             | Val Accuracy:  0.841


100%|██████████| 340/340 [02:12<00:00,  2.57it/s]


Epochs: 3 | Train Loss:  0.055             | Train Accuracy:  0.935             | Val Loss:  0.093             | Val Accuracy:  0.900


100%|██████████| 340/340 [02:12<00:00,  2.57it/s]


Epochs: 4 | Train Loss:  0.027             | Train Accuracy:  0.974             | Val Loss:  0.091             | Val Accuracy:  0.871


100%|██████████| 340/340 [02:12<00:00,  2.57it/s]


Epochs: 5 | Train Loss:  0.023             | Train Accuracy:  0.974             | Val Loss:  0.074             | Val Accuracy:  0.924


100%|██████████| 43/43 [00:05<00:00,  7.59it/s]



Validation Accuracy of 0.9176470588235294
512 1e-05 2


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using Cuda: True


100%|██████████| 680/680 [02:30<00:00,  4.51it/s]


Epochs: 1 | Train Loss:  0.469             | Train Accuracy:  0.537             | Val Loss:  0.416             | Val Accuracy:  0.653


100%|██████████| 680/680 [02:31<00:00,  4.49it/s]


Epochs: 2 | Train Loss:  0.190             | Train Accuracy:  0.875             | Val Loss:  0.142             | Val Accuracy:  0.924


100%|██████████| 680/680 [02:31<00:00,  4.49it/s]


Epochs: 3 | Train Loss:  0.067             | Train Accuracy:  0.965             | Val Loss:  0.108             | Val Accuracy:  0.941


100%|██████████| 680/680 [02:31<00:00,  4.49it/s]


Epochs: 4 | Train Loss:  0.038             | Train Accuracy:  0.978             | Val Loss:  0.151             | Val Accuracy:  0.918


100%|██████████| 680/680 [02:31<00:00,  4.49it/s]


Epochs: 5 | Train Loss:  0.023             | Train Accuracy:  0.988             | Val Loss:  0.136             | Val Accuracy:  0.929


100%|██████████| 85/85 [00:06<00:00, 14.07it/s]



Validation Accuracy of 0.9294117647058824
512 0.0001 16


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using Cuda: True


100%|██████████| 85/85 [01:59<00:00,  1.41s/it]


Epochs: 1 | Train Loss:  0.063             | Train Accuracy:  0.457             | Val Loss:  0.069             | Val Accuracy:  0.465


100%|██████████| 85/85 [02:00<00:00,  1.42s/it]


Epochs: 2 | Train Loss:  0.059             | Train Accuracy:  0.490             | Val Loss:  0.067             | Val Accuracy:  0.465


100%|██████████| 85/85 [01:59<00:00,  1.40s/it]


Epochs: 3 | Train Loss:  0.060             | Train Accuracy:  0.459             | Val Loss:  0.066             | Val Accuracy:  0.394


100%|██████████| 85/85 [01:59<00:00,  1.40s/it]


Epochs: 4 | Train Loss:  0.061             | Train Accuracy:  0.451             | Val Loss:  0.065             | Val Accuracy:  0.500


100%|██████████| 85/85 [01:59<00:00,  1.40s/it]


Epochs: 5 | Train Loss:  0.060             | Train Accuracy:  0.477             | Val Loss:  0.067             | Val Accuracy:  0.388


100%|██████████| 11/11 [00:05<00:00,  2.02it/s]



Validation Accuracy of 0.45294117647058824
512 0.0001 8


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using Cuda: True


100%|██████████| 170/170 [01:58<00:00,  1.44it/s]


Epochs: 1 | Train Loss:  0.132             | Train Accuracy:  0.449             | Val Loss:  0.140             | Val Accuracy:  0.447


100%|██████████| 170/170 [01:54<00:00,  1.48it/s]


Epochs: 2 | Train Loss:  0.131             | Train Accuracy:  0.457             | Val Loss:  0.141             | Val Accuracy:  0.447


100%|██████████| 170/170 [01:54<00:00,  1.48it/s]


Epochs: 3 | Train Loss:  0.131             | Train Accuracy:  0.457             | Val Loss:  0.141             | Val Accuracy:  0.447


100%|██████████| 170/170 [01:57<00:00,  1.44it/s]


Epochs: 4 | Train Loss:  0.128             | Train Accuracy:  0.465             | Val Loss:  0.138             | Val Accuracy:  0.441


100%|██████████| 170/170 [02:02<00:00,  1.39it/s]


Epochs: 5 | Train Loss:  0.124             | Train Accuracy:  0.439             | Val Loss:  0.133             | Val Accuracy:  0.441


100%|██████████| 22/22 [00:05<00:00,  4.04it/s]



Validation Accuracy of 0.43529411764705883
512 0.0001 4


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using Cuda: True


100%|██████████| 340/340 [02:05<00:00,  2.71it/s]


Epochs: 1 | Train Loss:  0.272             | Train Accuracy:  0.427             | Val Loss:  0.278             | Val Accuracy:  0.388


100%|██████████| 340/340 [02:01<00:00,  2.79it/s]


Epochs: 2 | Train Loss:  0.275             | Train Accuracy:  0.401             | Val Loss:  0.278             | Val Accuracy:  0.359


100%|██████████| 340/340 [02:01<00:00,  2.80it/s]


Epochs: 3 | Train Loss:  0.275             | Train Accuracy:  0.404             | Val Loss:  0.278             | Val Accuracy:  0.388


100%|██████████| 340/340 [02:01<00:00,  2.80it/s]


Epochs: 4 | Train Loss:  0.275             | Train Accuracy:  0.403             | Val Loss:  0.278             | Val Accuracy:  0.388


100%|██████████| 340/340 [02:01<00:00,  2.80it/s]


Epochs: 5 | Train Loss:  0.275             | Train Accuracy:  0.404             | Val Loss:  0.278             | Val Accuracy:  0.388


100%|██████████| 43/43 [00:05<00:00,  7.81it/s]



Validation Accuracy of 0.38823529411764707
512 0.0001 2


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using Cuda: True


100%|██████████| 680/680 [02:29<00:00,  4.55it/s]


Epochs: 1 | Train Loss:  0.517             | Train Accuracy:  0.450             | Val Loss:  0.525             | Val Accuracy:  0.429


100%|██████████| 680/680 [02:29<00:00,  4.54it/s]


Epochs: 2 | Train Loss:  0.492             | Train Accuracy:  0.443             | Val Loss:  0.524             | Val Accuracy:  0.441


100%|██████████| 680/680 [02:29<00:00,  4.53it/s]


Epochs: 3 | Train Loss:  0.485             | Train Accuracy:  0.484             | Val Loss:  0.522             | Val Accuracy:  0.429


100%|██████████| 680/680 [02:30<00:00,  4.53it/s]


Epochs: 4 | Train Loss:  0.484             | Train Accuracy:  0.471             | Val Loss:  0.503             | Val Accuracy:  0.441


100%|██████████| 680/680 [02:30<00:00,  4.53it/s]


Epochs: 5 | Train Loss:  0.489             | Train Accuracy:  0.461             | Val Loss:  0.534             | Val Accuracy:  0.400


100%|██████████| 85/85 [00:05<00:00, 14.33it/s]


Validation Accuracy of 0.4
The highest accuracy achieved by the model was: 159
Best Hyperparameters: {'learning_rate': 1e-05, 'hidden_size': 256, 'batch_size': 4}





In [None]:
# best_validation_accuracy = 0.9411764705882353

# 256 1e-05 2
# Epoch 3
# 0.953
# {'Epochs': 3, 'hidden_layer_size': 128, 'learning_rate': 1e-05, 'batch_size': 4}


# Train Model

In [19]:
num_epochs = 5
best_hyperparameters = {'learning_rate': 1e-05, 'hidden_size': 256, 'batch_size': 4}

clf = BertClassifier(hidden_size=best_hyperparameters['hidden_size'])

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [20]:
train(clf, df_train, df_val, best_hyperparameters['learning_rate'], num_epochs, best_hyperparameters['batch_size'])
torch.save(clf.state_dict(), os.path.join(GOOGLE_DRIVE_PATH, 'checkpoint_tuned_8.pth'))

files.download(os.path.join(GOOGLE_DRIVE_PATH, 'checkpoint_tuned_8.pth'))

Using Cuda


100%|██████████| 340/340 [01:19<00:00,  4.26it/s]


Epochs: 1 | Train Loss: 0.23 | Train Accuracy: 0.557 | Val Loss: 0.181| Val Accuracy: 0.747


100%|██████████| 340/340 [01:19<00:00,  4.26it/s]


Epochs: 2 | Train Loss: 0.112 | Train Accuracy: 0.854 | Val Loss: 0.099| Val Accuracy: 0.882


100%|██████████| 340/340 [01:19<00:00,  4.26it/s]


Epochs: 3 | Train Loss: 0.043 | Train Accuracy: 0.964 | Val Loss: 0.094| Val Accuracy: 0.882


100%|██████████| 340/340 [01:19<00:00,  4.26it/s]


Epochs: 4 | Train Loss: 0.024 | Train Accuracy: 0.977 | Val Loss: 0.091| Val Accuracy: 0.882


100%|██████████| 340/340 [01:19<00:00,  4.26it/s]


Epochs: 5 | Train Loss: 0.019 | Train Accuracy: 0.98 | Val Loss: 0.064| Val Accuracy: 0.935


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Test Model

In [22]:
from model import Dataset
from eval import Evaluation


cuda = torch.cuda.is_available()
device = torch.device("cuda" if cuda else "cpu")

model_test = BertClassifier(hidden_size=best_hyperparameters['hidden_size'])
state_dict = torch.load(os.path.join(GOOGLE_DRIVE_PATH, 'checkpoint_tuned_8.pth'))
model_test.load_state_dict(state_dict)
model_test.to(device)

test_data = Dataset(df_test)
test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=best_hyperparameters['batch_size'], shuffle=True)

total_correct = 0
with torch.no_grad():
    preds = []
    labels = []
    for test_input, test_label in tqdm(test_dataloader):
        test_label = test_label.to(device)
        attention_mask = test_input['attention_mask'].to(device)
        input_id = test_input['input_ids'].squeeze(1)
        input_id = input_id.to(device)

        output = model_test(input_id, attention_mask)
        num_correct = (output.argmax(dim=1) == test_label).sum().item()
        preds.extend(output.argmax(dim=1).tolist())
        labels.extend(test_label.tolist())
        total_correct += num_correct

eval = Evaluation(preds, labels)
eval.all_metrics()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 43/43 [00:03<00:00, 12.92it/s]


Accuracy: 0.9235294117647059
-----------------------
Macro
F1 score macro: 0.9309742587161942
Precision macro: 0.9256115698240936
Recall macro: 0.9370401810979061
-----------------------
Micro
F1 score micro: 0.9235294117647059
Precision micro: 0.9235294117647059
Recall micro: 0.9235294117647059
