# Import requirements

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!pip3 install wandb
!wandb login

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[34m[1mwandb[0m: Currently logged in as: [33msuhyeon-k-official[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
import os
import pdb
import wandb
import argparse
from dataclasses import dataclass, field
from typing import Optional
from collections import defaultdict

import torch
from torch.nn.utils.rnn import pad_sequence

import numpy as np
from tqdm import tqdm, trange

from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    AutoConfig,
    AdamW,
    AutoModelForSequenceClassification,
    AutoTokenizer
)

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
cd "/content/drive/MyDrive/GOORM/copy-of-6th-goorm-project-1-text-classification"

/content/drive/MyDrive/GOORM/copy-of-6th-goorm-project-1-text-classification


# 1. Preprocess

In [6]:
def make_id_file(task, tokenizer):
    def make_data_strings(file_name):
        data_strings = []
        with open(os.path.join(file_name), 'r', encoding='utf-8') as f:
            id_file_data = [tokenizer.encode(line.lower()) for line in f.readlines()]
        for item in id_file_data:
            data_strings.append(' '.join([str(k) for k in item]))
        return data_strings
    
    print('it will take some times...')
    train_pos = make_data_strings('sentiment.train.1')
    train_neg = make_data_strings('sentiment.train.0')
    dev_pos = make_data_strings('sentiment.dev.1')
    dev_neg = make_data_strings('sentiment.dev.0')

    print('make id file finished!')
    return train_pos, train_neg, dev_pos, dev_neg

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [8]:
"""
from google.colab import files
uploaded = files.upload()
"""

'\nfrom google.colab import files\nuploaded = files.upload()\n'

In [9]:
!ls

goorm-proj1_김수현.ipynb  sentiment.dev.1    submission.csv
pytorch_model.bin	  sentiment.train.0  test_no_label.csv
sentiment.dev.0		  sentiment.train.1  wandb


In [10]:
train_pos, train_neg, dev_pos, dev_neg = make_id_file('yelp', tokenizer)

it will take some times...
make id file finished!


In [11]:
train_pos[:10]

['101 6581 2833 1012 102',
 '101 21688 8013 2326 1012 102',
 '101 2027 2036 2031 3679 19247 1998 3256 6949 2029 2003 2428 2204 1012 102',
 '101 2009 1005 1055 1037 2204 15174 2098 7570 22974 2063 1012 102',
 '101 1996 3095 2003 5379 1012 102',
 '101 2204 3347 2833 1012 102',
 '101 2204 2326 1012 102',
 '101 11350 1997 2154 2003 25628 1998 7167 1997 19247 1012 102',
 '101 2307 2173 2005 6265 2030 3347 27962 1998 5404 1012 102',
 '101 1996 2047 2846 3504 6429 1012 102']

In [12]:
class SentimentDataset(object):
    def __init__(self, tokenizer, pos, neg):
        self.tokenizer = tokenizer
        self.data = []
        self.label = []

        for pos_sent in pos:
            self.data += [self._cast_to_int(pos_sent.strip().split())]
            self.label += [[1]]
        for neg_sent in neg:
            self.data += [self._cast_to_int(neg_sent.strip().split())]
            self.label += [[0]]

    def _cast_to_int(self, sample):
        return [int(word_id) for word_id in sample]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        sample = self.data[index]
        return np.array(sample), np.array(self.label[index])

In [13]:
train_dataset = SentimentDataset(tokenizer, train_pos, train_neg)
dev_dataset = SentimentDataset(tokenizer, dev_pos, dev_neg)

In [14]:
for i, item in enumerate(train_dataset):
    print(item)
    if i == 10:
        break

(array([ 101, 6581, 2833, 1012,  102]), array([1]))
(array([  101, 21688,  8013,  2326,  1012,   102]), array([1]))
(array([  101,  2027,  2036,  2031,  3679, 19247,  1998,  3256,  6949,
        2029,  2003,  2428,  2204,  1012,   102]), array([1]))
(array([  101,  2009,  1005,  1055,  1037,  2204, 15174,  2098,  7570,
       22974,  2063,  1012,   102]), array([1]))
(array([ 101, 1996, 3095, 2003, 5379, 1012,  102]), array([1]))
(array([ 101, 2204, 3347, 2833, 1012,  102]), array([1]))
(array([ 101, 2204, 2326, 1012,  102]), array([1]))
(array([  101, 11350,  1997,  2154,  2003, 25628,  1998,  7167,  1997,
       19247,  1012,   102]), array([1]))
(array([  101,  2307,  2173,  2005,  6265,  2030,  3347, 27962,  1998,
        5404,  1012,   102]), array([1]))
(array([ 101, 1996, 2047, 2846, 3504, 6429, 1012,  102]), array([1]))
(array([ 101, 2023, 2173, 2001, 2200, 2204, 1012,  102]), array([1]))


In [15]:
def collate_fn_style(samples):
    input_ids, labels = zip(*samples)
    max_len = max(len(input_id) for input_id in input_ids)
    sorted_indices = np.argsort([len(input_id) for input_id in input_ids])[::-1]

    input_ids = pad_sequence([torch.tensor(input_ids[index]) for index in sorted_indices],
                             batch_first=True)
    attention_mask = torch.tensor(
        [[1] * len(input_ids[index]) + [0] * (max_len - len(input_ids[index])) for index in
         sorted_indices])
    token_type_ids = torch.tensor([[0] * len(input_ids[index]) for index in sorted_indices])
    position_ids = torch.tensor([list(range(len(input_ids[index]))) for index in sorted_indices])
    labels = torch.tensor(np.stack(labels, axis=0)[sorted_indices])

    return input_ids, attention_mask, token_type_ids, position_ids, labels

#Model - bert

*배치 사이즈 변경* <br>
- train_batch_size = 32 에서 train_batch_size =64 로 변경

In [16]:
train_batch_size=64
eval_batch_size=64

train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=train_batch_size,
                                           shuffle=True, collate_fn=collate_fn_style,
                                           pin_memory=True, num_workers=2)
dev_loader = torch.utils.data.DataLoader(dev_dataset, batch_size=eval_batch_size,
                                         shuffle=False, collate_fn=collate_fn_style,
                                         num_workers=2)

In [17]:
# random seed
random_seed=42
np.random.seed(random_seed)
torch.manual_seed(random_seed)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model1 = BertForSequenceClassification.from_pretrained('bert-base-uncased')
model1.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

*러닝 레이트 변경* <br>
- 학습률은 손실 함수의 최소값을 향해 이동하면서 각 반복에서 단계 크기를 결정하는 최적화 알고리즘의 조정 매개변수이다.

*러닝레이트 스케줄러 추가* <br>
- 학습과정에서 learning rate를 조정하는 것, 처음엔 큰 learning rate(보폭)으로 빠르게 optimize를 하고 최적값에 가까워질수록 learning rate(보폭)를 줄여 미세조정을 하는 것이다.

In [18]:
model1.train()
learning_rate = 3e-6
train_epoch = 3
optimizer = AdamW(model1.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer=optimizer,
                                        lr_lambda=lambda epoch: 0.95 ** epoch,
                                        last_epoch=-1,
                                        verbose=False)



*wandb*
- wandb를 이용해서 필요한 metric의 log를 기록하고, 그래프 형태로 확인할 수 있다.

In [19]:
wandb.init(project="goorm-first-pj-suhyeon-bert_base", entity="goorm-3")

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33msuhyeon-k-official[0m ([33mgoorm-3[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [20]:
wandb.config = {
  "learning_rate": learning_rate,
  "epochs": train_epoch,
  "batch_size": train_batch_size,
  "model_name": 'bert-base-uncased',
}

In [21]:
def compute_acc(predictions, target_labels):
    return (np.array(predictions) == np.array(target_labels)).mean()

In [22]:
train_epoch = 3
lowest_valid_loss = 9999.
for epoch in range(train_epoch):
    with tqdm(train_loader, unit="batch") as tepoch:
        for iteration, (input_ids, attention_mask, token_type_ids, position_ids, labels) in enumerate(tepoch):
            tepoch.set_description(f"Epoch {epoch}")
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            token_type_ids = token_type_ids.to(device)
            position_ids = position_ids.to(device)
            labels = labels.to(device, dtype=torch.long)

            optimizer.zero_grad()

            output = model1(input_ids=input_ids,
                           attention_mask=attention_mask,
                           token_type_ids=token_type_ids,
                           position_ids=position_ids,
                           labels=labels)

            loss = output.loss
            loss.backward()

            optimizer.step()

            tepoch.set_postfix(loss=loss.item())
            if iteration != 0 and iteration % int(len(train_loader) / 5) == 0:
                # Evaluate the model five times per epoch
                with torch.no_grad():
                    model1.eval()
                    valid_losses = []
                    predictions = []
                    target_labels = []
                    for input_ids, attention_mask, token_type_ids, position_ids, labels in tqdm(dev_loader,
                                                                                                desc='Eval',
                                                                                                position=1,
                                                                                                leave=None):
                        input_ids = input_ids.to(device)
                        attention_mask = attention_mask.to(device)
                        token_type_ids = token_type_ids.to(device)
                        position_ids = position_ids.to(device)
                        labels = labels.to(device, dtype=torch.long)

                        output = model1(input_ids=input_ids,
                                       attention_mask=attention_mask,
                                       token_type_ids=token_type_ids,
                                       position_ids=position_ids,
                                       labels=labels)

                        logits = output.logits
                        loss = output.loss
                        valid_losses.append(loss.item())

                        batch_predictions = [0 if example[0] > example[1] else 1 for example in logits]
                        batch_labels = [int(example) for example in labels]

                        predictions += batch_predictions
                        target_labels += batch_labels

                acc = compute_acc(predictions, target_labels)
                valid_loss = sum(valid_losses) / len(valid_losses)

                wandb.log({"loss": valid_loss, "acc": acc})
                # Optional
                wandb.watch(model1)


                if lowest_valid_loss > valid_loss:
                    scheduler.step()
                    print('Acc for model which have lower valid loss: ', acc)
                    print("lr: ", optimizer.param_groups[0]['lr'])
                    torch.save(model1.state_dict(), "./pytorch_model.bin")

Epoch 0:  20%|█▉        | 1385/6926 [01:45<06:53, 13.40batch/s, loss=0.127]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:07,  7.90it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 23.64it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 29.43it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 32.52it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 34.12it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 35.16it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 35.58it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 36.52it/s][A
Eval:  52%|█████▏    | 33/63 [00:00<00:00, 36.24it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 35.79it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 35.00it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 34.72it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 34.53it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 34.78it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 34.74it/s][A
Eval:  97%|█████████▋| 6

Acc for model which have lower valid loss:  0.96875
lr:  2.85e-06


Epoch 0:  40%|███▉      | 2769/6926 [03:31<05:09, 13.45batch/s, loss=0.0582]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:08,  7.56it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 22.72it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 28.79it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 32.15it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 33.95it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 35.09it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 35.64it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 36.45it/s][A
Eval:  52%|█████▏    | 33/63 [00:01<00:00, 36.13it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 35.53it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 35.08it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 34.85it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 34.50it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 34.65it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 34.59it/s][A
Eval:  97%|█████████▋| 

Acc for model which have lower valid loss:  0.97025
lr:  2.7075e-06


Epoch 0:  60%|█████▉    | 4155/6926 [05:18<03:25, 13.45batch/s, loss=0.0922]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:08,  7.41it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 22.69it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 28.77it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 31.90it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 33.49it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 34.76it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 35.50it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 36.40it/s][A
Eval:  52%|█████▏    | 33/63 [00:01<00:00, 36.08it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 35.51it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 34.98it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 34.52it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 34.20it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 34.63it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 34.30it/s][A
Eval:  97%|█████████▋| 

Acc for model which have lower valid loss:  0.97475
lr:  2.5721249999999995e-06


Epoch 0:  80%|███████▉  | 5539/6926 [07:05<01:43, 13.35batch/s, loss=0.0471]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:08,  7.59it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 23.57it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 29.40it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 32.44it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 33.99it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 34.65it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 35.37it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 36.26it/s][A
Eval:  52%|█████▏    | 33/63 [00:01<00:00, 34.79it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 34.63it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 34.51it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 34.24it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 34.18it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 34.36it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 34.29it/s][A
Eval:  97%|█████████▋| 

Acc for model which have lower valid loss:  0.97525
lr:  2.4435187499999996e-06


Epoch 0: 100%|█████████▉| 6925/6926 [08:53<00:00, 12.79batch/s, loss=0.0216]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:08,  7.59it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 22.66it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 28.73it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 31.73it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 33.36it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 34.21it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 35.05it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 36.08it/s][A
Eval:  52%|█████▏    | 33/63 [00:01<00:00, 35.90it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 34.30it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 34.22it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 34.16it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 34.18it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 34.36it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 34.42it/s][A
Eval:  97%|█████████▋| 

Acc for model which have lower valid loss:  0.9765
lr:  2.3213428124999993e-06


Epoch 0: 100%|██████████| 6926/6926 [08:56<00:00, 12.91batch/s, loss=0.0216]
Epoch 1:  20%|█▉        | 1385/6926 [01:44<06:51, 13.46batch/s, loss=0.0286]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:08,  7.50it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 23.29it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 29.05it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 32.16it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 33.81it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 35.12it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 35.14it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 36.01it/s][A
Eval:  52%|█████▏    | 33/63 [00:01<00:00, 35.88it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 35.51it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 35.11it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 34.58it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 34.44it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 34.39it/s][A
Eva

Acc for model which have lower valid loss:  0.976
lr:  2.2052756718749993e-06


Epoch 1:  40%|███▉      | 2769/6926 [03:31<05:02, 13.75batch/s, loss=0.0501]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:08,  7.29it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 22.35it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 28.68it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 32.16it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 34.12it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 35.43it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 36.06it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 36.83it/s][A
Eval:  52%|█████▏    | 33/63 [00:00<00:00, 36.56it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 35.76it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 35.19it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 34.75it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 34.66it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 34.96it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 34.92it/s][A
Eval:  97%|█████████▋| 

Acc for model which have lower valid loss:  0.97625
lr:  2.0950118882812494e-06


Epoch 1:  60%|█████▉    | 4155/6926 [05:19<03:26, 13.45batch/s, loss=0.133] 
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:07,  7.76it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 23.58it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 29.42it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 32.50it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 34.32it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 35.45it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 36.02it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 36.60it/s][A
Eval:  52%|█████▏    | 33/63 [00:00<00:00, 36.18it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 35.57it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 35.14it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 34.93it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 34.53it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 34.60it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 34.46it/s][A
Eval:  97%|█████████▋| 

Acc for model which have lower valid loss:  0.97775
lr:  1.9902612938671867e-06


Epoch 1:  80%|███████▉  | 5539/6926 [07:07<01:43, 13.39batch/s, loss=0.241] 
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:08,  7.13it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 22.64it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 28.74it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 31.17it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 33.37it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 34.82it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 35.54it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 36.41it/s][A
Eval:  52%|█████▏    | 33/63 [00:01<00:00, 36.22it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 35.80it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 35.30it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 35.02it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 34.71it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 35.01it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 34.88it/s][A
Eval:  97%|█████████▋| 

Acc for model which have lower valid loss:  0.97675
lr:  1.8907482291738274e-06


Epoch 1: 100%|█████████▉| 6925/6926 [08:55<00:00, 12.67batch/s, loss=0.136] 
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:07,  7.84it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 23.65it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 29.48it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 32.66it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 34.14it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 35.11it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 35.22it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 35.92it/s][A
Eval:  52%|█████▏    | 33/63 [00:01<00:00, 35.73it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 35.23it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 34.77it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 34.36it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 33.99it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 34.19it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 34.10it/s][A
Eval:  97%|█████████▋| 

Acc for model which have lower valid loss:  0.97925
lr:  1.7962108177151362e-06


Epoch 1: 100%|██████████| 6926/6926 [08:59<00:00, 12.85batch/s, loss=0.136]
Epoch 2:  20%|█▉        | 1385/6926 [01:45<06:51, 13.45batch/s, loss=0.0852]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:09,  6.80it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 22.41it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 28.54it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 31.80it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 33.84it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 35.03it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 35.34it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 36.09it/s][A
Eval:  52%|█████▏    | 33/63 [00:01<00:00, 35.86it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 35.38it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 34.89it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 34.59it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 33.76it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 34.24it/s][A
Eval

Acc for model which have lower valid loss:  0.977
lr:  1.7064002768293791e-06


Epoch 2:  40%|███▉      | 2769/6926 [03:34<05:06, 13.56batch/s, loss=0.0112]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:08,  7.65it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 23.23it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 28.98it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 31.99it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 33.73it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 34.88it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 35.40it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 36.20it/s][A
Eval:  52%|█████▏    | 33/63 [00:01<00:00, 35.71it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 35.38it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 35.06it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 33.45it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 33.64it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 34.20it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 34.32it/s][A
Eval:  97%|█████████▋| 

Acc for model which have lower valid loss:  0.978
lr:  1.62108026298791e-06


Epoch 2:  60%|█████▉    | 4155/6926 [05:22<03:24, 13.53batch/s, loss=0.108] 
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:07,  7.97it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 23.89it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 29.43it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 32.33it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 34.12it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 35.28it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 35.86it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 36.73it/s][A
Eval:  52%|█████▏    | 33/63 [00:00<00:00, 36.41it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 35.98it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 35.43it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 35.13it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 34.90it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 34.98it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 35.02it/s][A
Eval:  97%|█████████▋| 

Acc for model which have lower valid loss:  0.9785
lr:  1.5400262498385144e-06


Epoch 2:  80%|███████▉  | 5539/6926 [07:11<02:01, 11.41batch/s, loss=0.00753]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:07,  7.84it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 23.82it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 29.54it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 32.72it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 34.48it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 35.72it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 36.17it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 36.90it/s][A
Eval:  52%|█████▏    | 33/63 [00:00<00:00, 36.37it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 35.52it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 34.96it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 34.64it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 34.52it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 34.76it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 34.67it/s][A
Eval:  97%|█████████▋|

Acc for model which have lower valid loss:  0.98025
lr:  1.4630249373465887e-06


Epoch 2: 100%|█████████▉| 6925/6926 [09:01<00:00, 11.42batch/s, loss=0.0127]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:08,  7.70it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 23.01it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 28.85it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 32.25it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 34.05it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 34.85it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 35.15it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 35.86it/s][A
Eval:  52%|█████▏    | 33/63 [00:01<00:00, 35.63it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 35.19it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 34.71it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 34.42it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 34.28it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 34.55it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 34.37it/s][A
Eval:  97%|█████████▋| 

Acc for model which have lower valid loss:  0.98025
lr:  1.3898736904792592e-06


Epoch 2: 100%|██████████| 6926/6926 [09:04<00:00, 12.72batch/s, loss=0.0127]


##Test

In [23]:
import pandas as pd
test_df = pd.read_csv('test_no_label.csv')

In [24]:
test_dataset = test_df['Id']

In [25]:
def make_id_file_test(tokenizer, test_dataset):
    data_strings = []
    id_file_data = [tokenizer.encode(sent.lower()) for sent in test_dataset]
    for item in id_file_data:
        data_strings.append(' '.join([str(k) for k in item]))
    return data_strings

In [26]:
test = make_id_file_test(tokenizer, test_dataset)

In [27]:
test[:10]

['101 2009 1005 1055 1037 2878 2047 3325 1998 2047 26389 2169 2051 2017 2175 1012 102',
 '101 2061 15640 2013 2019 2214 5440 1012 102',
 '101 2009 2003 1996 2087 14469 7273 1999 1996 3028 1012 102',
 '101 2079 2025 3696 1037 10084 2007 2122 2111 1012 102',
 '101 1045 2001 6091 1998 2016 2081 2033 2514 2061 6625 1998 6160 1012 102',
 '101 1996 2069 2518 2057 2363 2008 2001 2980 2001 1996 4157 1012 102',
 '101 2053 1010 2025 1996 3924 2012 2004 2226 1010 1996 3924 1999 3502 2152 1012 102',
 '101 2027 3288 2009 2041 2392 2005 2017 1998 2024 2200 14044 1012 102',
 '101 4606 1996 12043 2106 1050 1005 1056 2130 2113 2129 2000 2147 1996 3274 1012 102',
 '101 2027 2031 2019 6581 4989 1997 25025 2015 2000 5454 2013 1012 102']

In [28]:
class SentimentTestDataset(object):
    def __init__(self, tokenizer, test):
        self.tokenizer = tokenizer
        self.data = []

        for sent in test:
            self.data += [self._cast_to_int(sent.strip().split())]

    def _cast_to_int(self, sample):
        return [int(word_id) for word_id in sample]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        sample = self.data[index]
        return np.array(sample)

In [29]:
test_dataset = SentimentTestDataset(tokenizer, test)

*버그 수정*

In [30]:
def collate_fn_style_test(samples):
    input_ids = samples
    max_len = max(len(input_id) for input_id in input_ids)
    #sorted_indices = np.argsort([len(input_id) for input_id in input_ids])[::-1]
    sorted_indices = range(len(samples))

    
    input_ids = pad_sequence([torch.tensor(input_ids[index]) for index in sorted_indices],
                             batch_first=True)
    attention_mask = torch.tensor(
        [[1] * len(input_ids[index]) + [0] * (max_len - len(input_ids[index])) for index in
         sorted_indices])
    token_type_ids = torch.tensor([[0] * len(input_ids[index]) for index in sorted_indices])
    position_ids = torch.tensor([list(range(len(input_ids[index]))) for index in sorted_indices])

    return input_ids, attention_mask, token_type_ids, position_ids

In [31]:
test_batch_size = 32
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=test_batch_size,
                                          shuffle=False, collate_fn=collate_fn_style_test,
                                          num_workers=2)

In [33]:
with torch.no_grad():
    model1.eval()
    predictions1 = []
    for input_ids, attention_mask, token_type_ids, position_ids in tqdm(test_loader,
                                                                        desc='Test',
                                                                        position=1,
                                                                        leave=None):

        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        token_type_ids = token_type_ids.to(device)
        position_ids = position_ids.to(device)

        output = model1(input_ids=input_ids,
                       attention_mask=attention_mask,
                       token_type_ids=token_type_ids,
                       position_ids=position_ids)

        logits = output.logits
        batch_predictions = [0 if example[0] > example[1] else 1 for example in logits]
        predictions1 += batch_predictions


Test:   0%|          | 0/32 [00:00<?, ?it/s][A
Test:   3%|▎         | 1/32 [00:00<00:03,  8.78it/s][A
Test:  19%|█▉        | 6/32 [00:00<00:00, 31.52it/s][A
Test:  38%|███▊      | 12/32 [00:00<00:00, 42.04it/s][A
Test:  59%|█████▉    | 19/32 [00:00<00:00, 49.96it/s][A
Test:  78%|███████▊  | 25/32 [00:00<00:00, 52.68it/s][A
Test: 100%|██████████| 32/32 [00:00<00:00, 55.77it/s][A
                                                     [A

In [73]:
test_df['Category'] = predictions1

In [74]:
test_df.to_csv('submission-bert.csv', index=False)

#Model - RoBERTa

In [35]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification

In [36]:
train_batch_size=64
eval_batch_size=64

train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=train_batch_size,
                                           shuffle=True, collate_fn=collate_fn_style,
                                           pin_memory=True, num_workers=2)
dev_loader = torch.utils.data.DataLoader(dev_dataset, batch_size=eval_batch_size,
                                         shuffle=False, collate_fn=collate_fn_style,
                                         num_workers=2)

In [37]:
# random seed
random_seed=42
np.random.seed(random_seed)
torch.manual_seed(random_seed)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model2 = RobertaForSequenceClassification.from_pretrained('roberta-base')
model2.to(device)

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [38]:
model2.train()
learning_rate = 5e-5
train_epoch = 3
optimizer = AdamW(model2.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer=optimizer,
                                        lr_lambda=lambda epoch: 0.95 ** epoch,
                                        last_epoch=-1,
                                        verbose=False)



In [39]:
wandb.init(project="goorm-first-pj-suhyeon-roberta_base", entity="goorm-3")

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
acc,▁▂▅▅▆▅▆▆▆▇▆▇▇██
loss,█▅▄▃▂▃▃▂▂▁▂▂▂▁▁

0,1
acc,0.98025
loss,0.05703


In [40]:
wandb.config = {
  "learning_rate": learning_rate,
  "epochs": train_epoch,
  "batch_size": train_batch_size,
  "model_name": 'roberta-base',
}

In [41]:
train_epoch = 3
lowest_valid_loss = 9999.
for epoch in range(train_epoch):
    with tqdm(train_loader, unit="batch") as tepoch:
        for iteration, (input_ids, attention_mask, token_type_ids, position_ids, labels) in enumerate(tepoch):
            tepoch.set_description(f"Epoch {epoch}")
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            token_type_ids = token_type_ids.to(device)
            position_ids = position_ids.to(device)
            labels = labels.to(device, dtype=torch.long)

            optimizer.zero_grad()

            output = model2(input_ids=input_ids,
                           attention_mask=attention_mask,
                           token_type_ids=token_type_ids,
                           position_ids=position_ids,
                           labels=labels)

            loss = output.loss
            loss.backward()

            optimizer.step()

            tepoch.set_postfix(loss=loss.item())
            if iteration != 0 and iteration % int(len(train_loader) / 5) == 0:
                # Evaluate the model five times per epoch
                with torch.no_grad():
                    model2.eval()
                    valid_losses = []
                    predictions = []
                    target_labels = []
                    for input_ids, attention_mask, token_type_ids, position_ids, labels in tqdm(dev_loader,
                                                                                                desc='Eval',
                                                                                                position=1,
                                                                                                leave=None):
                        input_ids = input_ids.to(device)
                        attention_mask = attention_mask.to(device)
                        token_type_ids = token_type_ids.to(device)
                        position_ids = position_ids.to(device)
                        labels = labels.to(device, dtype=torch.long)

                        output = model2(input_ids=input_ids,
                                       attention_mask=attention_mask,
                                       token_type_ids=token_type_ids,
                                       position_ids=position_ids,
                                       labels=labels)

                        logits = output.logits
                        loss = output.loss
                        valid_losses.append(loss.item())

                        batch_predictions = [0 if example[0] > example[1] else 1 for example in logits]
                        batch_labels = [int(example) for example in labels]

                        predictions += batch_predictions
                        target_labels += batch_labels

                acc = compute_acc(predictions, target_labels)
                valid_loss = sum(valid_losses) / len(valid_losses)

                wandb.log({"loss": valid_loss, "acc": acc})
                # Optional
                wandb.watch(model2)


                if lowest_valid_loss > valid_loss:
                    scheduler.step()
                    print('Acc for model which have lower valid loss: ', acc)
                    print("lr: ", optimizer.param_groups[0]['lr'])
                    torch.save(model2.state_dict(), "./pytorch_model.bin")

Epoch 0:  20%|█▉        | 1385/6926 [01:43<07:05, 13.03batch/s, loss=0.12] 
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:08,  7.47it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 23.47it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 29.54it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 32.96it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 35.07it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 36.53it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 37.23it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 37.96it/s][A
Eval:  52%|█████▏    | 33/63 [00:00<00:00, 37.73it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 37.05it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 34.79it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 34.90it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 34.92it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 35.58it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 35.80it/s][A
Eval:  97%|█████████▋| 6

Acc for model which have lower valid loss:  0.94175
lr:  4.75e-05


Epoch 0:  40%|███▉      | 2769/6926 [03:29<05:06, 13.58batch/s, loss=0.0372]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:08,  7.52it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 23.25it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 30.00it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 33.60it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 35.49it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 36.63it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 37.35it/s][A
Eval:  48%|████▊     | 30/63 [00:00<00:00, 38.47it/s][A
Eval:  54%|█████▍    | 34/63 [00:00<00:00, 38.06it/s][A
Eval:  60%|██████    | 38/63 [00:01<00:00, 37.38it/s][A
Eval:  67%|██████▋   | 42/63 [00:01<00:00, 37.11it/s][A
Eval:  73%|███████▎  | 46/63 [00:01<00:00, 36.64it/s][A
Eval:  79%|███████▉  | 50/63 [00:01<00:00, 36.67it/s][A
Eval:  86%|████████▌ | 54/63 [00:01<00:00, 36.80it/s][A
Eval:  92%|█████████▏| 58/63 [00:01<00:00, 36.65it/s][A
Eval:  98%|█████████▊| 

Acc for model which have lower valid loss:  0.94875
lr:  4.5125e-05


Epoch 0:  60%|█████▉    | 4155/6926 [05:16<03:22, 13.71batch/s, loss=0.0578]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:08,  7.10it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 23.21it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 29.65it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 33.13it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 35.17it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 36.45it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 37.17it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 37.59it/s][A
Eval:  52%|█████▏    | 33/63 [00:00<00:00, 37.62it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 37.26it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 36.90it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 36.74it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 36.55it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 36.71it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 36.73it/s][A
Eval:  97%|█████████▋| 

Acc for model which have lower valid loss:  0.95875
lr:  4.2868749999999995e-05


Epoch 0:  80%|███████▉  | 5539/6926 [07:02<01:43, 13.46batch/s, loss=0.0951]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:08,  7.14it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 23.08it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 29.61it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 33.21it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 35.32it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 36.75it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 37.33it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 36.97it/s][A
Eval:  52%|█████▏    | 33/63 [00:00<00:00, 37.05it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 36.83it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 36.48it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 36.18it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 35.87it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 36.32it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 36.25it/s][A
Eval:  97%|█████████▋| 

Acc for model which have lower valid loss:  0.96675
lr:  4.07253125e-05


Epoch 0: 100%|█████████▉| 6925/6926 [08:49<00:00, 12.74batch/s, loss=0.0325]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:08,  7.19it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 23.19it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 29.77it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 33.01it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 35.07it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 36.34it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 37.19it/s][A
Eval:  48%|████▊     | 30/63 [00:00<00:00, 38.42it/s][A
Eval:  54%|█████▍    | 34/63 [00:00<00:00, 37.77it/s][A
Eval:  60%|██████    | 38/63 [00:01<00:00, 37.10it/s][A
Eval:  67%|██████▋   | 42/63 [00:01<00:00, 36.86it/s][A
Eval:  73%|███████▎  | 46/63 [00:01<00:00, 36.49it/s][A
Eval:  79%|███████▉  | 50/63 [00:01<00:00, 36.52it/s][A
Eval:  86%|████████▌ | 54/63 [00:01<00:00, 36.67it/s][A
Eval:  92%|█████████▏| 58/63 [00:01<00:00, 36.32it/s][A
Eval:  98%|█████████▊| 

Acc for model which have lower valid loss:  0.96775
lr:  3.868904687499999e-05


Epoch 0: 100%|██████████| 6926/6926 [08:52<00:00, 13.00batch/s, loss=0.0325]
Epoch 1:  20%|█▉        | 1385/6926 [01:43<06:57, 13.29batch/s, loss=0.0349]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:08,  7.28it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 23.45it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 29.77it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 33.23it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 34.95it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 36.49it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 37.32it/s][A
Eval:  48%|████▊     | 30/63 [00:00<00:00, 37.64it/s][A
Eval:  54%|█████▍    | 34/63 [00:00<00:00, 37.43it/s][A
Eval:  60%|██████    | 38/63 [00:01<00:00, 36.77it/s][A
Eval:  67%|██████▋   | 42/63 [00:01<00:00, 36.05it/s][A
Eval:  73%|███████▎  | 46/63 [00:01<00:00, 35.51it/s][A
Eval:  79%|███████▉  | 50/63 [00:01<00:00, 35.55it/s][A
Eval:  86%|████████▌ | 54/63 [00:01<00:00, 35.67it/s][A
Eva

Acc for model which have lower valid loss:  0.9655
lr:  3.675459453124999e-05


Epoch 1:  40%|███▉      | 2769/6926 [03:31<05:03, 13.70batch/s, loss=0.0621]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:08,  7.14it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 23.09it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 29.56it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 33.18it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 35.32it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 36.58it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 37.11it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 37.73it/s][A
Eval:  52%|█████▏    | 33/63 [00:00<00:00, 37.55it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 36.80it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 36.47it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 36.22it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 36.12it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 36.50it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 36.45it/s][A
Eval:  97%|█████████▋| 

Acc for model which have lower valid loss:  0.97425
lr:  3.4916864804687486e-05


Epoch 1:  60%|█████▉    | 4155/6926 [05:18<03:23, 13.63batch/s, loss=0.181] 
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:08,  7.26it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 23.14it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 29.56it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 33.22it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 35.30it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 36.57it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 37.28it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 38.04it/s][A
Eval:  52%|█████▏    | 33/63 [00:00<00:00, 37.74it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 37.36it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 36.87it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 36.44it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 36.21it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 36.50it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 36.21it/s][A
Eval:  97%|█████████▋| 

Acc for model which have lower valid loss:  0.9715
lr:  3.3171021564453116e-05


Epoch 1:  80%|███████▉  | 5539/6926 [07:05<01:42, 13.52batch/s, loss=0.00857]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:08,  7.11it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 23.07it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 29.58it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 33.31it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 35.38it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 36.75it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 37.44it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 38.05it/s][A
Eval:  52%|█████▏    | 33/63 [00:00<00:00, 37.59it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 37.00it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 36.31it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 36.13it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 36.00it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 36.38it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 36.33it/s][A
Eval:  97%|█████████▋|

Acc for model which have lower valid loss:  0.9745
lr:  3.151247048623045e-05


Epoch 1: 100%|█████████▉| 6925/6926 [08:53<00:00, 12.47batch/s, loss=0.024] 
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:08,  7.44it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 23.53it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 29.95it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 32.79it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 35.02it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 36.23it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 36.68it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 37.63it/s][A
Eval:  52%|█████▏    | 33/63 [00:00<00:00, 37.54it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 37.17it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 36.45it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 35.86it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 35.81it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 36.17it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 36.22it/s][A
Eval:  97%|█████████▋| 

Acc for model which have lower valid loss:  0.97025
lr:  2.9936846961918937e-05


Epoch 1: 100%|██████████| 6926/6926 [08:57<00:00, 12.89batch/s, loss=0.024]
Epoch 2:  20%|█▉        | 1385/6926 [01:44<06:51, 13.48batch/s, loss=0.0285]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:08,  6.97it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 22.50it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 29.25it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 32.77it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 34.98it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 36.00it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 36.97it/s][A
Eval:  48%|████▊     | 30/63 [00:00<00:00, 38.26it/s][A
Eval:  54%|█████▍    | 34/63 [00:00<00:00, 38.08it/s][A
Eval:  60%|██████    | 38/63 [00:01<00:00, 37.36it/s][A
Eval:  67%|██████▋   | 42/63 [00:01<00:00, 37.19it/s][A
Eval:  73%|███████▎  | 46/63 [00:01<00:00, 36.76it/s][A
Eval:  79%|███████▉  | 50/63 [00:01<00:00, 36.71it/s][A
Eval:  86%|████████▌ | 54/63 [00:01<00:00, 36.77it/s][A
Eval

Acc for model which have lower valid loss:  0.97375
lr:  2.8440004613822984e-05


Epoch 2:  40%|███▉      | 2769/6926 [03:33<05:05, 13.59batch/s, loss=0.0199] 
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:08,  6.91it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 22.53it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 28.59it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 32.60it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 34.87it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 36.42it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 37.07it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 37.86it/s][A
Eval:  52%|█████▏    | 33/63 [00:00<00:00, 37.63it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 36.88it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 36.20it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 35.79it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 35.62it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 36.16it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 36.10it/s][A
Eval:  97%|█████████▋|

Acc for model which have lower valid loss:  0.97225
lr:  2.7018004383131835e-05


Epoch 2:  60%|█████▉    | 4155/6926 [05:22<03:23, 13.62batch/s, loss=0.00882]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:08,  7.10it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 23.30it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 29.75it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 33.25it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 35.42it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 36.43it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 37.37it/s][A
Eval:  48%|████▊     | 30/63 [00:00<00:00, 38.54it/s][A
Eval:  54%|█████▍    | 34/63 [00:00<00:00, 38.09it/s][A
Eval:  60%|██████    | 38/63 [00:01<00:00, 37.36it/s][A
Eval:  67%|██████▋   | 42/63 [00:01<00:00, 37.13it/s][A
Eval:  73%|███████▎  | 46/63 [00:01<00:00, 36.66it/s][A
Eval:  79%|███████▉  | 50/63 [00:01<00:00, 36.43it/s][A
Eval:  86%|████████▌ | 54/63 [00:01<00:00, 36.70it/s][A
Eval:  92%|█████████▏| 58/63 [00:01<00:00, 36.21it/s][A
Eval:  98%|█████████▊|

Acc for model which have lower valid loss:  0.97325
lr:  2.5667104163975243e-05


Epoch 2:  80%|███████▉  | 5539/6926 [07:11<02:02, 11.31batch/s, loss=0.0298]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:08,  7.28it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 22.98it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 29.68it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 32.82it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 35.02it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 36.28it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 36.99it/s][A
Eval:  48%|████▊     | 30/63 [00:00<00:00, 38.15it/s][A
Eval:  54%|█████▍    | 34/63 [00:00<00:00, 37.66it/s][A
Eval:  60%|██████    | 38/63 [00:01<00:00, 36.87it/s][A
Eval:  67%|██████▋   | 42/63 [00:01<00:00, 35.89it/s][A
Eval:  73%|███████▎  | 46/63 [00:01<00:00, 35.55it/s][A
Eval:  79%|███████▉  | 50/63 [00:01<00:00, 35.68it/s][A
Eval:  86%|████████▌ | 54/63 [00:01<00:00, 36.13it/s][A
Eval:  92%|█████████▏| 58/63 [00:01<00:00, 35.95it/s][A
Eval:  98%|█████████▊| 

Acc for model which have lower valid loss:  0.97675
lr:  2.4383748955776477e-05


Epoch 2: 100%|█████████▉| 6925/6926 [09:00<00:00, 11.32batch/s, loss=0.0157]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:09,  6.85it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 22.42it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 29.19it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 32.80it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 34.87it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 35.62it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 36.52it/s][A
Eval:  48%|████▊     | 30/63 [00:00<00:00, 37.84it/s][A
Eval:  54%|█████▍    | 34/63 [00:01<00:00, 37.51it/s][A
Eval:  60%|██████    | 38/63 [00:01<00:00, 36.81it/s][A
Eval:  67%|██████▋   | 42/63 [00:01<00:00, 36.74it/s][A
Eval:  73%|███████▎  | 46/63 [00:01<00:00, 36.23it/s][A
Eval:  79%|███████▉  | 50/63 [00:01<00:00, 36.27it/s][A
Eval:  86%|████████▌ | 54/63 [00:01<00:00, 36.42it/s][A
Eval:  92%|█████████▏| 58/63 [00:01<00:00, 36.08it/s][A
Eval:  98%|█████████▊| 

Acc for model which have lower valid loss:  0.97475
lr:  2.3164561507987653e-05


Epoch 2: 100%|██████████| 6926/6926 [09:04<00:00, 12.73batch/s, loss=0.0157]


In [42]:
test_batch_size = 32
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=test_batch_size,
                                          shuffle=False, collate_fn=collate_fn_style_test,
                                          num_workers=2)

In [54]:
with torch.no_grad():
    model2.eval()
    predictions2 = []
    final_logit = 0
    for input_ids, attention_mask, token_type_ids, position_ids in tqdm(test_loader,
                                                                        desc='Test',
                                                                        position=1,
                                                                        leave=None):

        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        token_type_ids = token_type_ids.to(device)
        position_ids = position_ids.to(device)

        output = model2(input_ids=input_ids,
                       attention_mask=attention_mask,
                       token_type_ids=token_type_ids,
                       position_ids=position_ids)

        logits = output.logits
        batch_predictions = [0 if example[0] > example[1] else 1 for example in logits]
        predictions2 += batch_predictions
    final_logit = sum(predictions2)/len(predictions2)
    print(final_logit)


Test:   0%|          | 0/32 [00:00<?, ?it/s][A
Test:   3%|▎         | 1/32 [00:00<00:04,  7.40it/s][A
Test:  19%|█▉        | 6/32 [00:00<00:00, 28.56it/s][A
Test:  34%|███▍      | 11/32 [00:00<00:00, 35.28it/s][A
Test:  53%|█████▎    | 17/32 [00:00<00:00, 42.37it/s][A
Test:  72%|███████▏  | 23/32 [00:00<00:00, 47.68it/s][A
Test:  91%|█████████ | 29/32 [00:00<00:00, 50.51it/s][A
                                                     [A

0.503


In [75]:
test_df['Category'] = predictions2

In [76]:
test_df.to_csv('submission-roberta.csv', index=False)

#Model - Albert

In [44]:
from transformers import AlbertTokenizer, AlbertForSequenceClassification

In [45]:
train_batch_size=64
eval_batch_size=64

train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=train_batch_size,
                                           shuffle=True, collate_fn=collate_fn_style,
                                           pin_memory=True, num_workers=2)
dev_loader = torch.utils.data.DataLoader(dev_dataset, batch_size=eval_batch_size,
                                         shuffle=False, collate_fn=collate_fn_style,
                                         num_workers=2)

In [46]:
# random seed
random_seed=42
np.random.seed(random_seed)
torch.manual_seed(random_seed)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
#model3 = AlbertForSequenceClassification.from_pretrained('albert-base-v2')
tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
model3 = AutoModelForSequenceClassification.from_pretrained("albert-base-v2")
model3.to(device)

Downloading:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.decoder.bias', 'predictions.bias', 'predictions.decoder.weight', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.bias', 'predictions.dense.weight']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You sho

AlbertForSequenceClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768,

In [47]:
model3.train()
learning_rate = 3e-5
train_epoch = 3
optimizer = AdamW(model3.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer=optimizer,
                                        lr_lambda=lambda epoch: 0.95 ** epoch,
                                        last_epoch=-1,
                                        verbose=False)

In [48]:
wandb.init(project="goorm-first-pj-suhyeon-albert-base", entity="goorm-3")

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
acc,▁▂▄▆▆▆█▇█▇▇▇▇██
loss,█▇▄▂▂▃▁▂▁▂▁▁▂▁▁

0,1
acc,0.97475
loss,0.07488


In [49]:
wandb.config = {
  "learning_rate": learning_rate,
  "epochs": train_epoch,
  "batch_size": train_batch_size,
  "model_name": "albert-base-v2",
}

In [65]:
train_epoch = 3
lowest_valid_loss = 9999.
for epoch in range(train_epoch):
    with tqdm(train_loader, unit="batch") as tepoch:
        for iteration, (input_ids, attention_mask, token_type_ids, position_ids, labels) in enumerate(tepoch):
            tepoch.set_description(f"Epoch {epoch}")
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            token_type_ids = token_type_ids.to(device)
            position_ids = position_ids.to(device)
            labels = labels.to(device, dtype=torch.long)

            optimizer.zero_grad()

            output = model3(input_ids=input_ids,
                           attention_mask=attention_mask,
                           token_type_ids=token_type_ids,
                           position_ids=position_ids,
                           labels=labels)

            loss = output.loss
            loss.backward()

            optimizer.step()

            tepoch.set_postfix(loss=loss.item())
            if iteration != 0 and iteration % int(len(train_loader) / 5) == 0:
                # Evaluate the model five times per epoch
                with torch.no_grad():
                    model3.eval()
                    valid_losses = []
                    predictions = []
                    target_labels = []
                    for input_ids, attention_mask, token_type_ids, position_ids, labels in tqdm(dev_loader,
                                                                                                desc='Eval',
                                                                                                position=1,
                                                                                                leave=None):
                        input_ids = input_ids.to(device)
                        attention_mask = attention_mask.to(device)
                        token_type_ids = token_type_ids.to(device)
                        position_ids = position_ids.to(device)
                        labels = labels.to(device, dtype=torch.long)

                        output = model3(input_ids=input_ids,
                                       attention_mask=attention_mask,
                                       token_type_ids=token_type_ids,
                                       position_ids=position_ids,
                                       labels=labels)

                        logits = output.logits
                        loss = output.loss
                        valid_losses.append(loss.item())

                        batch_predictions = [0 if example[0] > example[1] else 1 for example in logits]
                        batch_labels = [int(example) for example in labels]

                        predictions += batch_predictions
                        target_labels += batch_labels

                acc = compute_acc(predictions, target_labels)
                valid_loss = sum(valid_losses) / len(valid_losses)

                wandb.log({"loss": valid_loss, "acc": acc})
                # Optional
                wandb.watch(model3)

                if lowest_valid_loss > valid_loss:
                    scheduler.step()
                    print('Acc for model which have lower valid loss: ', acc)
                    print("lr: ", optimizer.param_groups[0]['lr'])
                    torch.save(model3.state_dict(), "./pytorch_model.bin")

Epoch 0:  20%|█▉        | 1385/6926 [01:35<06:26, 14.32batch/s, loss=0.0833]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:10,  6.09it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 20.78it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 27.09it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 30.61it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 32.49it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 33.93it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 34.56it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 35.24it/s][A
Eval:  52%|█████▏    | 33/63 [00:01<00:00, 35.05it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 34.57it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 34.03it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 33.85it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 33.67it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 33.92it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 33.85it/s][A
Eval:  97%|█████████▋| 

Acc for model which have lower valid loss:  0.95975
lr:  2.8499999999999998e-05


Epoch 0:  40%|███▉      | 2769/6926 [03:14<04:51, 14.25batch/s, loss=0.0705]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:10,  5.74it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 20.01it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:02, 26.31it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 29.95it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 32.10it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 33.67it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 34.50it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 35.08it/s][A
Eval:  52%|█████▏    | 33/63 [00:01<00:00, 35.09it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 34.73it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 34.05it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 33.16it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 33.03it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 33.46it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 33.52it/s][A
Eval:  97%|█████████▋| 

Acc for model which have lower valid loss:  0.96325
lr:  2.7075e-05


Epoch 0:  60%|█████▉    | 4155/6926 [04:51<03:11, 14.47batch/s, loss=0.0331]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:10,  5.96it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 20.50it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:02, 26.92it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 30.35it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 32.56it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 34.06it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 34.50it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 35.42it/s][A
Eval:  52%|█████▏    | 33/63 [00:01<00:00, 35.33it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 34.79it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 34.15it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 33.67it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 33.27it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 33.56it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 33.29it/s][A
Eval:  97%|█████████▋| 

Acc for model which have lower valid loss:  0.97125
lr:  2.5721249999999997e-05


Epoch 0:  80%|███████▉  | 5539/6926 [06:29<01:36, 14.44batch/s, loss=0.0678]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:10,  5.94it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 20.58it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:02, 26.65it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 30.28it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 32.50it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 33.91it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 34.54it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 35.44it/s][A
Eval:  52%|█████▏    | 33/63 [00:01<00:00, 35.07it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 34.18it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 33.75it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 33.45it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 33.22it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 33.45it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 33.38it/s][A
Eval:  97%|█████████▋| 

Acc for model which have lower valid loss:  0.97025
lr:  2.44351875e-05


Epoch 0: 100%|█████████▉| 6925/6926 [08:08<00:00, 13.64batch/s, loss=0.0555]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:10,  6.01it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 20.62it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:02, 26.88it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 30.42it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 32.45it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 33.97it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 34.53it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 35.28it/s][A
Eval:  52%|█████▏    | 33/63 [00:01<00:00, 35.04it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 34.52it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 34.11it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 33.87it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 33.65it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 33.89it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 33.85it/s][A
Eval:  97%|█████████▋| 

Acc for model which have lower valid loss:  0.976
lr:  2.3213428124999993e-05


Epoch 0: 100%|██████████| 6926/6926 [08:10<00:00, 14.12batch/s, loss=0.0555]
Epoch 1:  20%|█▉        | 1385/6926 [01:36<06:19, 14.60batch/s, loss=0.116] 
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:10,  5.71it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 20.11it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:02, 26.44it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 30.34it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 32.41it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 33.79it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 34.49it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 35.44it/s][A
Eval:  52%|█████▏    | 33/63 [00:01<00:00, 34.97it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 34.44it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 33.75it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 33.46it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 33.37it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 33.56it/s][A
Eva

Acc for model which have lower valid loss:  0.97
lr:  2.2052756718749993e-05


Epoch 1:  40%|███▉      | 2769/6926 [03:14<05:00, 13.83batch/s, loss=0.0365]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:10,  6.15it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 20.32it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:02, 26.54it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 30.28it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 32.51it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 33.76it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 34.53it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 35.40it/s][A
Eval:  52%|█████▏    | 33/63 [00:01<00:00, 35.33it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 34.66it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 34.10it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 33.52it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 33.28it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 33.67it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 33.56it/s][A
Eval:  97%|█████████▋| 

Acc for model which have lower valid loss:  0.974
lr:  2.0950118882812494e-05


Epoch 1:  60%|█████▉    | 4155/6926 [04:52<03:11, 14.45batch/s, loss=0.0855]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:10,  5.69it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 20.00it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:02, 26.54it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 30.29it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 32.30it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 33.84it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 34.65it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 35.60it/s][A
Eval:  52%|█████▏    | 33/63 [00:01<00:00, 35.23it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 34.83it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 34.30it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 34.03it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 33.77it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 33.96it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 33.93it/s][A
Eval:  97%|█████████▋| 

Acc for model which have lower valid loss:  0.971
lr:  1.9902612938671867e-05


Epoch 1:  80%|███████▉  | 5539/6926 [06:30<01:35, 14.56batch/s, loss=0.0113]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:10,  5.86it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 20.03it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:02, 26.61it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 30.39it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 32.66it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 34.13it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 34.63it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 35.43it/s][A
Eval:  52%|█████▏    | 33/63 [00:01<00:00, 35.20it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 34.79it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 34.20it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 33.67it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 33.41it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 33.67it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 33.56it/s][A
Eval:  97%|█████████▋| 

Acc for model which have lower valid loss:  0.97575
lr:  1.890748229173827e-05


Epoch 1: 100%|█████████▉| 6925/6926 [08:08<00:00, 13.64batch/s, loss=0.0525]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:10,  5.78it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 19.92it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:02, 26.21it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 29.66it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 31.64it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 33.15it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 33.93it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 34.71it/s][A
Eval:  52%|█████▏    | 33/63 [00:01<00:00, 34.48it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 34.00it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 33.64it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 33.50it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 33.34it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 33.71it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 33.40it/s][A
Eval:  97%|█████████▋| 

Acc for model which have lower valid loss:  0.97525
lr:  1.796210817715136e-05


Epoch 1: 100%|██████████| 6926/6926 [08:10<00:00, 14.11batch/s, loss=0.0525]
Epoch 2:  20%|█▉        | 1385/6926 [01:36<06:24, 14.40batch/s, loss=0.0238]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:10,  5.97it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 20.06it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:02, 26.66it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 30.43it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 32.57it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 33.76it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 34.33it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 35.20it/s][A
Eval:  52%|█████▏    | 33/63 [00:01<00:00, 35.03it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 34.49it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 33.96it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 33.67it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 33.53it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 33.72it/s][A
Eva

Acc for model which have lower valid loss:  0.97525
lr:  1.706400276829379e-05


Epoch 2:  40%|███▉      | 2769/6926 [03:14<04:58, 13.93batch/s, loss=0.0382] 
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:10,  5.99it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 20.40it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:02, 26.68it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 30.14it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 32.29it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 33.34it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 34.31it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 35.33it/s][A
Eval:  52%|█████▏    | 33/63 [00:01<00:00, 35.02it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 34.47it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 32.98it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 32.90it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 32.95it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 33.08it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 33.09it/s][A
Eval:  97%|█████████▋|

Acc for model which have lower valid loss:  0.97325
lr:  1.62108026298791e-05


Epoch 2:  60%|█████▉    | 4155/6926 [04:53<03:11, 14.47batch/s, loss=0.0268]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:10,  6.12it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 20.36it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:02, 26.67it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 30.08it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 32.29it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 33.77it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 34.24it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 35.22it/s][A
Eval:  52%|█████▏    | 33/63 [00:01<00:00, 35.17it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 34.45it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 33.86it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 33.58it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 33.38it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 33.72it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 33.67it/s][A
Eval:  97%|█████████▋| 

Acc for model which have lower valid loss:  0.97375
lr:  1.5400262498385145e-05


Epoch 2:  80%|███████▉  | 5539/6926 [06:31<01:41, 13.63batch/s, loss=0.185] 
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:10,  6.17it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 20.65it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:02, 26.92it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 30.44it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 32.49it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 33.92it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 34.61it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 35.46it/s][A
Eval:  52%|█████▏    | 33/63 [00:01<00:00, 35.25it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 34.70it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 32.88it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 32.95it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 33.08it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 33.47it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 33.48it/s][A
Eval:  97%|█████████▋| 

Acc for model which have lower valid loss:  0.97575
lr:  1.4630249373465886e-05


Epoch 2: 100%|█████████▉| 6925/6926 [08:10<00:00, 13.44batch/s, loss=0.00461]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:10,  5.81it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 20.12it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:02, 26.42it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 30.03it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 32.21it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 33.45it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 34.11it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 34.99it/s][A
Eval:  52%|█████▏    | 33/63 [00:01<00:00, 34.88it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 34.51it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 34.08it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 33.83it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 33.43it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 33.50it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 33.32it/s][A
Eval:  97%|█████████▋|

Acc for model which have lower valid loss:  0.976
lr:  1.3898736904792591e-05


Epoch 2: 100%|██████████| 6926/6926 [08:12<00:00, 14.06batch/s, loss=0.00461]


In [66]:
test_batch_size = 32
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=test_batch_size,
                                          shuffle=False, collate_fn=collate_fn_style_test,
                                          num_workers=2)

In [67]:
with torch.no_grad():
    model3.eval()
    predictions3 = []
    for input_ids, attention_mask, token_type_ids, position_ids in tqdm(test_loader,
                                                                        desc='Test',
                                                                        position=1,
                                                                        leave=None):

        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        token_type_ids = token_type_ids.to(device)
        position_ids = position_ids.to(device)

        output = model3(input_ids=input_ids,
                       attention_mask=attention_mask,
                       token_type_ids=token_type_ids,
                       position_ids=position_ids)

        logits = output.logits
        batch_predictions = [0 if example[0] > example[1] else 1 for example in logits]
        predictions3 += batch_predictions


Test:   0%|          | 0/32 [00:00<?, ?it/s][A
Test:   3%|▎         | 1/32 [00:00<00:05,  5.96it/s][A
Test:  19%|█▉        | 6/32 [00:00<00:01, 25.09it/s][A
Test:  34%|███▍      | 11/32 [00:00<00:00, 34.67it/s][A
Test:  53%|█████▎    | 17/32 [00:00<00:00, 41.81it/s][A
Test:  72%|███████▏  | 23/32 [00:00<00:00, 47.17it/s][A
Test:  91%|█████████ | 29/32 [00:00<00:00, 50.88it/s][A
                                                     [A

In [77]:
test_df['Category'] = predictions3

In [78]:
test_df.to_csv('submission-albert.csv', index=False)

#Model - ELECTRA

In [80]:
from transformers import ElectraTokenizer, ElectraForSequenceClassification

In [81]:
train_batch_size=64
eval_batch_size=64

train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=train_batch_size,
                                           shuffle=True, collate_fn=collate_fn_style,
                                           pin_memory=True, num_workers=2)
dev_loader = torch.utils.data.DataLoader(dev_dataset, batch_size=eval_batch_size,
                                         shuffle=False, collate_fn=collate_fn_style,
                                         num_workers=2)

In [82]:
# random seed
random_seed=42
np.random.seed(random_seed)
torch.manual_seed(random_seed)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')
model4 = ElectraForSequenceClassification.from_pretrained('google/electra-base-discriminator')
model4.to(device)

Some weights of the model checkpoint at google/electra-base-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.o

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

In [83]:
model4.train()
learning_rate = 3e-5
train_epoch = 3
optimizer = AdamW(model4.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer=optimizer,
                                        lr_lambda=lambda epoch: 0.95 ** epoch,
                                        last_epoch=-1,
                                        verbose=False)



In [84]:
wandb.init(project="goorm-first-pj-suhyeon-electra-base", entity="goorm-3")

0,1
acc,▁▃▆▆█▅▇▆███▇▇██
loss,█▆▃▃▂▄▃▃▂▁▃▃▂▁▂

0,1
acc,0.976
loss,0.07755


In [85]:
wandb.config = {
  "learning_rate": learning_rate,
  "epochs": train_epoch,
  "batch_size": train_batch_size,
  "model_name": 'google/electra-base-discriminator',
}

In [86]:
train_epoch = 3
lowest_valid_loss = 9999.
for epoch in range(train_epoch):
    with tqdm(train_loader, unit="batch") as tepoch:
        for iteration, (input_ids, attention_mask, token_type_ids, position_ids, labels) in enumerate(tepoch):
            tepoch.set_description(f"Epoch {epoch}")
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            token_type_ids = token_type_ids.to(device)
            position_ids = position_ids.to(device)
            labels = labels.to(device, dtype=torch.long)

            optimizer.zero_grad()

            output = model4(input_ids=input_ids,
                           attention_mask=attention_mask,
                           token_type_ids=token_type_ids,
                           position_ids=position_ids,
                           labels=labels)

            loss = output.loss
            loss.backward()

            optimizer.step()

            tepoch.set_postfix(loss=loss.item())
            if iteration != 0 and iteration % int(len(train_loader) / 5) == 0:
                # Evaluate the model five times per epoch
                with torch.no_grad():
                    model4.eval()
                    valid_losses = []
                    predictions = []
                    target_labels = []
                    for input_ids, attention_mask, token_type_ids, position_ids, labels in tqdm(dev_loader,
                                                                                                desc='Eval',
                                                                                                position=1,
                                                                                                leave=None):
                        input_ids = input_ids.to(device)
                        attention_mask = attention_mask.to(device)
                        token_type_ids = token_type_ids.to(device)
                        position_ids = position_ids.to(device)
                        labels = labels.to(device, dtype=torch.long)

                        output = model4(input_ids=input_ids,
                                       attention_mask=attention_mask,
                                       token_type_ids=token_type_ids,
                                       position_ids=position_ids,
                                       labels=labels)

                        logits = output.logits
                        loss = output.loss
                        valid_losses.append(loss.item())

                        batch_predictions = [0 if example[0] > example[1] else 1 for example in logits]
                        batch_labels = [int(example) for example in labels]

                        predictions += batch_predictions
                        target_labels += batch_labels

                acc = compute_acc(predictions, target_labels)
                valid_loss = sum(valid_losses) / len(valid_losses)

                wandb.log({"loss": valid_loss, "acc": acc})
                # Optional
                wandb.watch(model4)
                
                if lowest_valid_loss > valid_loss:
                    scheduler.step()
                    print('Acc for model which have lower valid loss: ', acc)
                    print("lr: ", optimizer.param_groups[0]['lr'])
                    torch.save(model4.state_dict(), "./pytorch_model.bin")

Epoch 0:  20%|█▉        | 1385/6926 [01:42<06:51, 13.46batch/s, loss=0.0476]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:11,  5.58it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 20.31it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 27.17it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 31.47it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 34.04it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 35.70it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 36.62it/s][A
Eval:  48%|████▊     | 30/63 [00:00<00:00, 38.14it/s][A
Eval:  54%|█████▍    | 34/63 [00:01<00:00, 37.84it/s][A
Eval:  60%|██████    | 38/63 [00:01<00:00, 37.07it/s][A
Eval:  67%|██████▋   | 42/63 [00:01<00:00, 36.69it/s][A
Eval:  73%|███████▎  | 46/63 [00:01<00:00, 36.05it/s][A
Eval:  79%|███████▉  | 50/63 [00:01<00:00, 36.18it/s][A
Eval:  86%|████████▌ | 54/63 [00:01<00:00, 36.55it/s][A
Eval:  92%|█████████▏| 58/63 [00:01<00:00, 36.43it/s][A
Eval:  98%|█████████▊| 

Acc for model which have lower valid loss:  0.97475
lr:  2.8499999999999998e-05


Epoch 0:  40%|███▉      | 2769/6926 [03:27<05:04, 13.63batch/s, loss=0.128] 
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:11,  5.34it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 19.54it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:02, 26.50it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 30.56it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 33.48it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 35.41it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 36.49it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 37.43it/s][A
Eval:  52%|█████▏    | 33/63 [00:01<00:00, 37.21it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 36.80it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 36.41it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 36.10it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 36.01it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 36.28it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 36.38it/s][A
Eval:  97%|█████████▋| 

Acc for model which have lower valid loss:  0.98025
lr:  2.7075e-05


Epoch 0:  60%|█████▉    | 4155/6926 [05:12<03:21, 13.76batch/s, loss=0.0742]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:11,  5.54it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 20.28it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 27.17it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 31.46it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 33.73it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 35.51it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 36.59it/s][A
Eval:  48%|████▊     | 30/63 [00:00<00:00, 37.93it/s][A
Eval:  54%|█████▍    | 34/63 [00:01<00:00, 37.53it/s][A
Eval:  60%|██████    | 38/63 [00:01<00:00, 36.81it/s][A
Eval:  67%|██████▋   | 42/63 [00:01<00:00, 36.51it/s][A
Eval:  73%|███████▎  | 46/63 [00:01<00:00, 35.91it/s][A
Eval:  79%|███████▉  | 50/63 [00:01<00:00, 35.88it/s][A
Eval:  86%|████████▌ | 54/63 [00:01<00:00, 36.33it/s][A
Eval:  92%|█████████▏| 58/63 [00:01<00:00, 36.29it/s][A
Eval:  98%|█████████▊| 

Acc for model which have lower valid loss:  0.9775
lr:  2.5721249999999997e-05


Epoch 0:  80%|███████▉  | 5539/6926 [06:57<01:42, 13.54batch/s, loss=0.106]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:11,  5.63it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 20.41it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 27.35it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 31.51it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 33.90it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 35.55it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 36.44it/s][A
Eval:  48%|████▊     | 30/63 [00:00<00:00, 37.74it/s][A
Eval:  54%|█████▍    | 34/63 [00:01<00:00, 37.47it/s][A
Eval:  60%|██████    | 38/63 [00:01<00:00, 36.89it/s][A
Eval:  67%|██████▋   | 42/63 [00:01<00:00, 36.69it/s][A
Eval:  73%|███████▎  | 46/63 [00:01<00:00, 36.07it/s][A
Eval:  79%|███████▉  | 50/63 [00:01<00:00, 35.96it/s][A
Eval:  86%|████████▌ | 54/63 [00:01<00:00, 36.37it/s][A
Eval:  92%|█████████▏| 58/63 [00:01<00:00, 36.25it/s][A
Eval:  98%|█████████▊| 6

Acc for model which have lower valid loss:  0.981
lr:  2.44351875e-05


Epoch 0: 100%|█████████▉| 6925/6926 [08:42<00:00, 13.30batch/s, loss=0.0405]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:10,  5.68it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 20.00it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 27.00it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 31.17it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 33.71it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 35.42it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 36.38it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 37.35it/s][A
Eval:  52%|█████▏    | 33/63 [00:01<00:00, 37.22it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 36.55it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 36.08it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 35.91it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 35.76it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 36.12it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 35.95it/s][A
Eval:  97%|█████████▋| 

Acc for model which have lower valid loss:  0.97125
lr:  2.3213428124999993e-05


Epoch 0: 100%|██████████| 6926/6926 [08:45<00:00, 13.17batch/s, loss=0.0405]
Epoch 1:  20%|█▉        | 1385/6926 [01:42<06:44, 13.68batch/s, loss=0.0152]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:11,  5.32it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 19.88it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 27.08it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 31.40it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 34.06it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 35.74it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 36.55it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 37.17it/s][A
Eval:  52%|█████▏    | 33/63 [00:01<00:00, 37.05it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 36.80it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 36.44it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 36.25it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 35.88it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 36.26it/s][A
Eva

Acc for model which have lower valid loss:  0.98125
lr:  2.2052756718749993e-05


Epoch 1:  40%|███▉      | 2769/6926 [03:28<05:04, 13.63batch/s, loss=0.0312]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:10,  5.74it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 19.98it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:02, 26.80it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 31.06it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 33.52it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 35.28it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 36.08it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 36.69it/s][A
Eval:  52%|█████▏    | 33/63 [00:01<00:00, 36.66it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 36.27it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 35.82it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 35.47it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 34.79it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 35.36it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 35.69it/s][A
Eval:  97%|█████████▋| 

Acc for model which have lower valid loss:  0.982
lr:  2.0950118882812494e-05


Epoch 1:  60%|█████▉    | 4155/6926 [05:14<03:21, 13.78batch/s, loss=0.0142]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:10,  5.72it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 20.27it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 27.21it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 31.12it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 33.62it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 35.34it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 36.15it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 37.29it/s][A
Eval:  52%|█████▏    | 33/63 [00:01<00:00, 37.29it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 36.90it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 36.39it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 36.17it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 36.10it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 36.51it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 36.53it/s][A
Eval:  97%|█████████▋| 

Acc for model which have lower valid loss:  0.98025
lr:  1.9902612938671867e-05


Epoch 1:  80%|███████▉  | 5539/6926 [07:01<01:47, 12.93batch/s, loss=0.0205] 
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:11,  5.24it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:03, 19.24it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:02, 25.22it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 29.17it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 30.61it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 32.31it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 33.77it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 35.26it/s][A
Eval:  52%|█████▏    | 33/63 [00:01<00:00, 35.48it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 35.07it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 34.69it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 34.50it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 34.69it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 35.10it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 34.84it/s][A
Eval:  97%|█████████▋|

Acc for model which have lower valid loss:  0.98425
lr:  1.890748229173827e-05


Epoch 1: 100%|█████████▉| 6925/6926 [08:48<00:00, 12.84batch/s, loss=0.114] 
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:10,  5.70it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 20.48it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 27.41it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 31.45it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 33.92it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 35.46it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 36.30it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 37.31it/s][A
Eval:  52%|█████▏    | 33/63 [00:01<00:00, 37.09it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 36.67it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 35.92it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 35.78it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 35.80it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 35.96it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 35.93it/s][A
Eval:  97%|█████████▋| 

Acc for model which have lower valid loss:  0.98075
lr:  1.796210817715136e-05


Epoch 1: 100%|██████████| 6926/6926 [08:51<00:00, 13.02batch/s, loss=0.114]
Epoch 2:  20%|█▉        | 1385/6926 [01:43<06:53, 13.39batch/s, loss=0.00736]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:11,  5.31it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 19.60it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:02, 26.37it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 30.54it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 32.98it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 34.77it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 35.76it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 36.85it/s][A
Eval:  52%|█████▏    | 33/63 [00:01<00:00, 36.94it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 36.54it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 35.93it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 35.51it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 35.42it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 35.65it/s][A
Eva

Acc for model which have lower valid loss:  0.98325
lr:  1.706400276829379e-05


Epoch 2:  40%|███▉      | 2769/6926 [03:31<05:09, 13.45batch/s, loss=0.0508]  
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:11,  5.54it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 20.30it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 27.33it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 31.57it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 33.89it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 35.58it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 36.45it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 37.48it/s][A
Eval:  52%|█████▏    | 33/63 [00:01<00:00, 36.96it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 36.72it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 36.33it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 36.06it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 35.79it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 36.17it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 36.11it/s][A
Eval:  97%|█████████▋

Acc for model which have lower valid loss:  0.98275
lr:  1.62108026298791e-05


Epoch 2:  60%|█████▉    | 4155/6926 [05:19<03:25, 13.47batch/s, loss=0.00025]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:11,  5.55it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 20.19it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 27.22it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 31.37it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 33.74it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 35.39it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 36.41it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 37.24it/s][A
Eval:  52%|█████▏    | 33/63 [00:01<00:00, 37.14it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 36.74it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 36.45it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 36.31it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 36.11it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 36.16it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 36.06it/s][A
Eval:  97%|█████████▋|

Acc for model which have lower valid loss:  0.98175
lr:  1.5400262498385145e-05


Epoch 2:  80%|███████▉  | 5539/6926 [07:06<01:57, 11.79batch/s, loss=0.0231]  
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:11,  5.59it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 20.31it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:01, 27.33it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 31.45it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 33.94it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 35.61it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 36.18it/s][A
Eval:  46%|████▌     | 29/63 [00:00<00:00, 37.10it/s][A
Eval:  52%|█████▏    | 33/63 [00:01<00:00, 37.06it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 36.85it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 36.42it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 36.15it/s][A
Eval:  78%|███████▊  | 49/63 [00:01<00:00, 35.88it/s][A
Eval:  84%|████████▍ | 53/63 [00:01<00:00, 36.19it/s][A
Eval:  90%|█████████ | 57/63 [00:01<00:00, 36.29it/s][A
Eval:  97%|█████████▋

Acc for model which have lower valid loss:  0.9775
lr:  1.4630249373465886e-05


Epoch 2: 100%|█████████▉| 6925/6926 [08:55<00:00, 11.80batch/s, loss=0.0213]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:11,  5.39it/s][A
Eval:   8%|▊         | 5/63 [00:00<00:02, 19.86it/s][A
Eval:  14%|█▍        | 9/63 [00:00<00:02, 26.98it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 31.25it/s][A
Eval:  27%|██▋       | 17/63 [00:00<00:01, 33.66it/s][A
Eval:  33%|███▎      | 21/63 [00:00<00:01, 35.39it/s][A
Eval:  40%|███▉      | 25/63 [00:00<00:01, 35.93it/s][A
Eval:  48%|████▊     | 30/63 [00:00<00:00, 37.46it/s][A
Eval:  54%|█████▍    | 34/63 [00:01<00:00, 37.09it/s][A
Eval:  60%|██████    | 38/63 [00:01<00:00, 36.56it/s][A
Eval:  67%|██████▋   | 42/63 [00:01<00:00, 36.42it/s][A
Eval:  73%|███████▎  | 46/63 [00:01<00:00, 35.99it/s][A
Eval:  79%|███████▉  | 50/63 [00:01<00:00, 36.09it/s][A
Eval:  86%|████████▌ | 54/63 [00:01<00:00, 36.31it/s][A
Eval:  92%|█████████▏| 58/63 [00:01<00:00, 36.08it/s][A
Eval:  98%|█████████▊| 

Acc for model which have lower valid loss:  0.9825
lr:  1.3898736904792591e-05


Epoch 2: 100%|██████████| 6926/6926 [08:58<00:00, 12.86batch/s, loss=0.0213]


In [87]:
test_batch_size = 32
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=test_batch_size,
                                          shuffle=False, collate_fn=collate_fn_style_test,
                                          num_workers=2)

In [88]:
with torch.no_grad():
    model4.eval()
    predictions4 = []
    for input_ids, attention_mask, token_type_ids, position_ids in tqdm(test_loader,
                                                                        desc='Test',
                                                                        position=1,
                                                                        leave=None):

        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        token_type_ids = token_type_ids.to(device)
        position_ids = position_ids.to(device)

        output = model4(input_ids=input_ids,
                       attention_mask=attention_mask,
                       token_type_ids=token_type_ids,
                       position_ids=position_ids)

        logits = output.logits
        batch_predictions = [0 if example[0] > example[1] else 1 for example in logits]
        predictions4 += batch_predictions


Test:   0%|          | 0/32 [00:00<?, ?it/s][A
Test:   3%|▎         | 1/32 [00:00<00:05,  5.81it/s][A
Test:  22%|██▏       | 7/32 [00:00<00:00, 29.55it/s][A
Test:  44%|████▍     | 14/32 [00:00<00:00, 42.62it/s][A
Test:  62%|██████▎   | 20/32 [00:00<00:00, 47.54it/s][A
Test:  84%|████████▍ | 27/32 [00:00<00:00, 52.41it/s][A
                                                     [A

In [89]:
test_df['Category'] = predictions4

In [90]:
test_df.to_csv('submission-electra.csv', index=False)

#Model - XLNet

In [91]:
train_batch_size=64
eval_batch_size=64

train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=train_batch_size,
                                           shuffle=True, collate_fn=collate_fn_style,
                                           pin_memory=True, num_workers=2)
dev_loader = torch.utils.data.DataLoader(dev_dataset, batch_size=eval_batch_size,
                                         shuffle=False, collate_fn=collate_fn_style,
                                         num_workers=2)

In [92]:
# random seed
random_seed=42
np.random.seed(random_seed)
torch.manual_seed(random_seed)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = AutoTokenizer.from_pretrained('xlnet-base-cased')
model5 = AutoModelForSequenceClassification.from_pretrained('xlnet-base-cased')
model5.to(device)

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.bias', 'logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation_function): GELUActivation()
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (1): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward

In [93]:
model5.train()
learning_rate = 5e-6
train_epoch = 3
optimizer = AdamW(model5.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer=optimizer,
                                        lr_lambda=lambda epoch: 0.95 ** epoch,
                                        last_epoch=-1,
                                        verbose=False)

In [94]:
wandb.init(project="goorm-first-pj-suhyeon-xlnet_base", entity="goorm-3")

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
acc,▃▆▄▆▁▆▇▆█▆▇▇▇▄▇
loss,▅▃▄▃▅▂▂▃▁▁▄▃▆█▃

0,1
acc,0.9825
loss,0.05907


In [95]:
wandb.config = {
  "learning_rate": learning_rate,
  "epochs": train_epoch,
  "batch_size": train_batch_size,
  "model_name": "xlnet-base-cased",
}

In [96]:
train_epoch = 3
lowest_valid_loss = 9999.
for epoch in range(train_epoch):
    with tqdm(train_loader, unit="batch") as tepoch:
        for iteration, (input_ids, attention_mask, token_type_ids, position_ids, labels) in enumerate(tepoch):
            tepoch.set_description(f"Epoch {epoch}")
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            token_type_ids = token_type_ids.to(device)
            position_ids = position_ids.to(device)
            labels = labels.to(device, dtype=torch.long)

            optimizer.zero_grad()

            output = model5(input_ids=input_ids,
                           attention_mask=attention_mask,
                           token_type_ids=token_type_ids,
                           position_ids=position_ids,
                           labels=labels)

            loss = output.loss
            loss.backward()

            optimizer.step()

            tepoch.set_postfix(loss=loss.item())
            if iteration != 0 and iteration % int(len(train_loader) / 5) == 0:
                # Evaluate the model five times per epoch
                with torch.no_grad():
                    model5.eval()
                    valid_losses = []
                    predictions = []
                    target_labels = []
                    for input_ids, attention_mask, token_type_ids, position_ids, labels in tqdm(dev_loader,
                                                                                                desc='Eval',
                                                                                                position=1,
                                                                                                leave=None):
                        input_ids = input_ids.to(device)
                        attention_mask = attention_mask.to(device)
                        token_type_ids = token_type_ids.to(device)
                        position_ids = position_ids.to(device)
                        labels = labels.to(device, dtype=torch.long)

                        output = model5(input_ids=input_ids,
                                       attention_mask=attention_mask,
                                       token_type_ids=token_type_ids,
                                       position_ids=position_ids,
                                       labels=labels)

                        logits = output.logits
                        loss = output.loss
                        valid_losses.append(loss.item())

                        batch_predictions = [0 if example[0] > example[1] else 1 for example in logits]
                        batch_labels = [int(example) for example in labels]

                        predictions += batch_predictions
                        target_labels += batch_labels

                acc = compute_acc(predictions, target_labels)
                valid_loss = sum(valid_losses) / len(valid_losses)

                wandb.log({"loss": valid_loss, "acc": acc})
                # Optional
                wandb.watch(model5)

                if lowest_valid_loss > valid_loss:
                    scheduler.step()
                    print('Acc for model which have lower valid loss: ', acc)
                    print("lr: ", optimizer.param_groups[0]['lr'])
                    torch.save(model5.state_dict(), "./pytorch_model.bin")

Epoch 0:  20%|█▉        | 1384/6926 [02:11<08:48, 10.48batch/s, loss=0.221]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:12,  4.99it/s][A
Eval:   6%|▋         | 4/63 [00:00<00:03, 15.02it/s][A
Eval:  11%|█         | 7/63 [00:00<00:02, 20.17it/s][A
Eval:  16%|█▌        | 10/63 [00:00<00:02, 22.95it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 25.08it/s][A
Eval:  25%|██▌       | 16/63 [00:00<00:01, 26.12it/s][A
Eval:  30%|███       | 19/63 [00:00<00:01, 26.80it/s][A
Eval:  35%|███▍      | 22/63 [00:00<00:01, 27.19it/s][A
Eval:  40%|███▉      | 25/63 [00:01<00:01, 27.35it/s][A
Eval:  44%|████▍     | 28/63 [00:01<00:01, 27.99it/s][A
Eval:  49%|████▉     | 31/63 [00:01<00:01, 28.40it/s][A
Eval:  54%|█████▍    | 34/63 [00:01<00:01, 27.95it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 27.88it/s][A
Eval:  63%|██████▎   | 40/63 [00:01<00:00, 26.77it/s][A
Eval:  68%|██████▊   | 43/63 [00:01<00:00, 26.81it/s][A
Eval:  73%|███████▎  | 4

Acc for model which have lower valid loss:  0.90975
lr:  4.75e-06


Epoch 0:  40%|███▉      | 2770/6926 [04:24<06:24, 10.80batch/s, loss=0.13] 
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:12,  5.10it/s][A
Eval:   6%|▋         | 4/63 [00:00<00:03, 15.17it/s][A
Eval:  11%|█         | 7/63 [00:00<00:02, 20.20it/s][A
Eval:  16%|█▌        | 10/63 [00:00<00:02, 23.00it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 25.17it/s][A
Eval:  25%|██▌       | 16/63 [00:00<00:01, 26.57it/s][A
Eval:  32%|███▏      | 20/63 [00:00<00:01, 27.96it/s][A
Eval:  38%|███▊      | 24/63 [00:00<00:01, 28.47it/s][A
Eval:  43%|████▎     | 27/63 [00:01<00:01, 28.52it/s][A
Eval:  49%|████▉     | 31/63 [00:01<00:01, 29.33it/s][A
Eval:  54%|█████▍    | 34/63 [00:01<00:01, 28.69it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 28.26it/s][A
Eval:  63%|██████▎   | 40/63 [00:01<00:00, 27.29it/s][A
Eval:  68%|██████▊   | 43/63 [00:01<00:00, 26.88it/s][A
Eval:  73%|███████▎  | 46/63 [00:01<00:00, 26.59it/s][A
Eval:  78%|███████▊  | 4

Acc for model which have lower valid loss:  0.94125
lr:  4.5125e-06


Epoch 0:  60%|█████▉    | 4155/6926 [06:37<04:20, 10.64batch/s, loss=0.151] 
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:12,  5.06it/s][A
Eval:   6%|▋         | 4/63 [00:00<00:03, 15.04it/s][A
Eval:  11%|█         | 7/63 [00:00<00:02, 20.12it/s][A
Eval:  16%|█▌        | 10/63 [00:00<00:02, 22.68it/s][A
Eval:  21%|██        | 13/63 [00:00<00:02, 24.90it/s][A
Eval:  25%|██▌       | 16/63 [00:00<00:01, 26.33it/s][A
Eval:  30%|███       | 19/63 [00:00<00:01, 27.38it/s][A
Eval:  35%|███▍      | 22/63 [00:00<00:01, 27.69it/s][A
Eval:  40%|███▉      | 25/63 [00:01<00:01, 27.88it/s][A
Eval:  44%|████▍     | 28/63 [00:01<00:01, 28.35it/s][A
Eval:  49%|████▉     | 31/63 [00:01<00:01, 28.52it/s][A
Eval:  54%|█████▍    | 34/63 [00:01<00:01, 28.06it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 27.71it/s][A
Eval:  63%|██████▎   | 40/63 [00:01<00:00, 26.87it/s][A
Eval:  68%|██████▊   | 43/63 [00:01<00:00, 27.00it/s][A
Eval:  73%|███████▎  | 

Acc for model which have lower valid loss:  0.95375
lr:  4.2868749999999995e-06


Epoch 0:  80%|███████▉  | 5540/6926 [08:50<02:08, 10.80batch/s, loss=0.078]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:12,  5.07it/s][A
Eval:   6%|▋         | 4/63 [00:00<00:03, 15.08it/s][A
Eval:  11%|█         | 7/63 [00:00<00:02, 20.15it/s][A
Eval:  16%|█▌        | 10/63 [00:00<00:02, 23.03it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 25.15it/s][A
Eval:  25%|██▌       | 16/63 [00:00<00:01, 26.38it/s][A
Eval:  30%|███       | 19/63 [00:00<00:01, 27.32it/s][A
Eval:  35%|███▍      | 22/63 [00:00<00:01, 27.85it/s][A
Eval:  40%|███▉      | 25/63 [00:01<00:01, 27.95it/s][A
Eval:  44%|████▍     | 28/63 [00:01<00:01, 28.32it/s][A
Eval:  49%|████▉     | 31/63 [00:01<00:01, 28.70it/s][A
Eval:  54%|█████▍    | 34/63 [00:01<00:01, 28.14it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 27.73it/s][A
Eval:  63%|██████▎   | 40/63 [00:01<00:00, 26.94it/s][A
Eval:  68%|██████▊   | 43/63 [00:01<00:00, 26.98it/s][A
Eval:  73%|███████▎  | 4

Acc for model which have lower valid loss:  0.9635
lr:  4.07253125e-06


Epoch 0: 100%|█████████▉| 6924/6926 [11:04<00:00, 10.54batch/s, loss=0.0613]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:12,  5.13it/s][A
Eval:   6%|▋         | 4/63 [00:00<00:03, 15.33it/s][A
Eval:  11%|█         | 7/63 [00:00<00:02, 20.33it/s][A
Eval:  16%|█▌        | 10/63 [00:00<00:02, 22.93it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 25.12it/s][A
Eval:  25%|██▌       | 16/63 [00:00<00:01, 26.60it/s][A
Eval:  30%|███       | 19/63 [00:00<00:01, 27.62it/s][A
Eval:  35%|███▍      | 22/63 [00:00<00:01, 27.71it/s][A
Eval:  40%|███▉      | 25/63 [00:01<00:01, 28.00it/s][A
Eval:  44%|████▍     | 28/63 [00:01<00:01, 28.49it/s][A
Eval:  49%|████▉     | 31/63 [00:01<00:01, 28.74it/s][A
Eval:  54%|█████▍    | 34/63 [00:01<00:01, 28.08it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 27.18it/s][A
Eval:  63%|██████▎   | 40/63 [00:01<00:00, 26.24it/s][A
Eval:  68%|██████▊   | 43/63 [00:01<00:00, 26.46it/s][A
Eval:  73%|███████▎  | 

Acc for model which have lower valid loss:  0.96675
lr:  3.868904687499999e-06


Epoch 0: 100%|██████████| 6926/6926 [11:08<00:00, 10.36batch/s, loss=0.0613]
Epoch 1:  20%|█▉        | 1385/6926 [02:10<08:28, 10.89batch/s, loss=0.0248]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:12,  4.91it/s][A
Eval:   6%|▋         | 4/63 [00:00<00:03, 14.94it/s][A
Eval:  11%|█         | 7/63 [00:00<00:02, 20.15it/s][A
Eval:  16%|█▌        | 10/63 [00:00<00:02, 22.94it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 25.14it/s][A
Eval:  25%|██▌       | 16/63 [00:00<00:01, 26.57it/s][A
Eval:  30%|███       | 19/63 [00:00<00:01, 27.25it/s][A
Eval:  35%|███▍      | 22/63 [00:00<00:01, 27.64it/s][A
Eval:  40%|███▉      | 25/63 [00:01<00:01, 27.76it/s][A
Eval:  44%|████▍     | 28/63 [00:01<00:01, 28.36it/s][A
Eval:  49%|████▉     | 31/63 [00:01<00:01, 28.63it/s][A
Eval:  54%|█████▍    | 34/63 [00:01<00:01, 28.08it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 27.85it/s][A
Eval:  63%|██████▎   | 40/63 [00:01<00:00, 26.63it/s][A
Eva

Acc for model which have lower valid loss:  0.96875
lr:  3.675459453124999e-06


Epoch 1:  40%|███▉      | 2769/6926 [04:23<06:29, 10.66batch/s, loss=0.0594]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:13,  4.58it/s][A
Eval:   6%|▋         | 4/63 [00:00<00:04, 14.22it/s][A
Eval:  11%|█         | 7/63 [00:00<00:02, 19.12it/s][A
Eval:  16%|█▌        | 10/63 [00:00<00:02, 22.09it/s][A
Eval:  21%|██        | 13/63 [00:00<00:02, 24.39it/s][A
Eval:  25%|██▌       | 16/63 [00:00<00:01, 25.73it/s][A
Eval:  30%|███       | 19/63 [00:00<00:01, 26.76it/s][A
Eval:  35%|███▍      | 22/63 [00:00<00:01, 27.05it/s][A
Eval:  40%|███▉      | 25/63 [00:01<00:01, 27.19it/s][A
Eval:  44%|████▍     | 28/63 [00:01<00:01, 27.68it/s][A
Eval:  49%|████▉     | 31/63 [00:01<00:01, 28.19it/s][A
Eval:  54%|█████▍    | 34/63 [00:01<00:01, 27.55it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 27.13it/s][A
Eval:  63%|██████▎   | 40/63 [00:01<00:00, 26.23it/s][A
Eval:  68%|██████▊   | 43/63 [00:01<00:00, 26.44it/s][A
Eval:  73%|███████▎  | 

Acc for model which have lower valid loss:  0.96925
lr:  3.491686480468749e-06


Epoch 1:  60%|█████▉    | 4154/6926 [06:38<04:16, 10.83batch/s, loss=0.108] 
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:12,  5.13it/s][A
Eval:   6%|▋         | 4/63 [00:00<00:03, 15.19it/s][A
Eval:  11%|█         | 7/63 [00:00<00:02, 20.26it/s][A
Eval:  16%|█▌        | 10/63 [00:00<00:02, 23.09it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 25.29it/s][A
Eval:  25%|██▌       | 16/63 [00:00<00:01, 26.70it/s][A
Eval:  32%|███▏      | 20/63 [00:00<00:01, 28.05it/s][A
Eval:  37%|███▋      | 23/63 [00:00<00:01, 28.50it/s][A
Eval:  41%|████▏     | 26/63 [00:01<00:01, 28.15it/s][A
Eval:  48%|████▊     | 30/63 [00:01<00:01, 29.08it/s][A
Eval:  52%|█████▏    | 33/63 [00:01<00:01, 28.54it/s][A
Eval:  57%|█████▋    | 36/63 [00:01<00:00, 28.16it/s][A
Eval:  62%|██████▏   | 39/63 [00:01<00:00, 27.00it/s][A
Eval:  67%|██████▋   | 42/63 [00:01<00:00, 27.11it/s][A
Eval:  71%|███████▏  | 45/63 [00:01<00:00, 26.66it/s][A
Eval:  76%|███████▌  | 

Acc for model which have lower valid loss:  0.96725
lr:  3.3171021564453116e-06


Epoch 1:  80%|███████▉  | 5540/6926 [08:52<02:09, 10.66batch/s, loss=0.0941]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:12,  4.78it/s][A
Eval:   6%|▋         | 4/63 [00:00<00:04, 14.56it/s][A
Eval:  11%|█         | 7/63 [00:00<00:02, 19.87it/s][A
Eval:  16%|█▌        | 10/63 [00:00<00:02, 22.85it/s][A
Eval:  21%|██        | 13/63 [00:00<00:02, 24.96it/s][A
Eval:  25%|██▌       | 16/63 [00:00<00:01, 26.30it/s][A
Eval:  30%|███       | 19/63 [00:00<00:01, 27.25it/s][A
Eval:  35%|███▍      | 22/63 [00:00<00:01, 27.71it/s][A
Eval:  40%|███▉      | 25/63 [00:01<00:01, 27.73it/s][A
Eval:  44%|████▍     | 28/63 [00:01<00:01, 28.11it/s][A
Eval:  49%|████▉     | 31/63 [00:01<00:01, 28.54it/s][A
Eval:  54%|█████▍    | 34/63 [00:01<00:01, 27.98it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 27.64it/s][A
Eval:  63%|██████▎   | 40/63 [00:01<00:00, 26.87it/s][A
Eval:  68%|██████▊   | 43/63 [00:01<00:00, 26.92it/s][A
Eval:  73%|███████▎  | 

Acc for model which have lower valid loss:  0.96975
lr:  3.1512470486230457e-06


Epoch 1: 100%|█████████▉| 6925/6926 [11:08<00:00, 10.47batch/s, loss=0.0631]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:12,  5.12it/s][A
Eval:   6%|▋         | 4/63 [00:00<00:03, 15.20it/s][A
Eval:  11%|█         | 7/63 [00:00<00:02, 20.07it/s][A
Eval:  16%|█▌        | 10/63 [00:00<00:02, 22.73it/s][A
Eval:  21%|██        | 13/63 [00:00<00:02, 24.88it/s][A
Eval:  25%|██▌       | 16/63 [00:00<00:01, 26.00it/s][A
Eval:  30%|███       | 19/63 [00:00<00:01, 27.04it/s][A
Eval:  35%|███▍      | 22/63 [00:00<00:01, 27.52it/s][A
Eval:  40%|███▉      | 25/63 [00:01<00:01, 27.83it/s][A
Eval:  46%|████▌     | 29/63 [00:01<00:01, 28.94it/s][A
Eval:  51%|█████     | 32/63 [00:01<00:01, 28.81it/s][A
Eval:  56%|█████▌    | 35/63 [00:01<00:00, 28.03it/s][A
Eval:  60%|██████    | 38/63 [00:01<00:00, 27.11it/s][A
Eval:  65%|██████▌   | 41/63 [00:01<00:00, 27.13it/s][A
Eval:  70%|██████▉   | 44/63 [00:01<00:00, 27.24it/s][A
Eval:  75%|███████▍  | 

Acc for model which have lower valid loss:  0.969
lr:  2.993684696191894e-06


Epoch 1: 100%|██████████| 6926/6926 [11:12<00:00, 10.30batch/s, loss=0.0631]
Epoch 2:  20%|█▉        | 1385/6926 [02:11<08:33, 10.78batch/s, loss=0.0204] 
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:12,  5.10it/s][A
Eval:   6%|▋         | 4/63 [00:00<00:03, 15.23it/s][A
Eval:  11%|█         | 7/63 [00:00<00:02, 20.40it/s][A
Eval:  16%|█▌        | 10/63 [00:00<00:02, 23.19it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 25.35it/s][A
Eval:  25%|██▌       | 16/63 [00:00<00:01, 26.35it/s][A
Eval:  30%|███       | 19/63 [00:00<00:01, 27.29it/s][A
Eval:  35%|███▍      | 22/63 [00:00<00:01, 27.64it/s][A
Eval:  40%|███▉      | 25/63 [00:01<00:01, 27.72it/s][A
Eval:  44%|████▍     | 28/63 [00:01<00:01, 28.16it/s][A
Eval:  49%|████▉     | 31/63 [00:01<00:01, 28.39it/s][A
Eval:  54%|█████▍    | 34/63 [00:01<00:01, 27.85it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 27.50it/s][A
Eval:  63%|██████▎   | 40/63 [00:01<00:00, 26.61it/s][A
Ev

Acc for model which have lower valid loss:  0.9705
lr:  2.8440004613822987e-06


Epoch 2:  40%|███▉      | 2770/6926 [04:27<06:34, 10.53batch/s, loss=0.119]  
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:12,  4.97it/s][A
Eval:   6%|▋         | 4/63 [00:00<00:03, 15.04it/s][A
Eval:  11%|█         | 7/63 [00:00<00:02, 19.96it/s][A
Eval:  16%|█▌        | 10/63 [00:00<00:02, 22.81it/s][A
Eval:  21%|██        | 13/63 [00:00<00:01, 25.11it/s][A
Eval:  25%|██▌       | 16/63 [00:00<00:01, 26.57it/s][A
Eval:  30%|███       | 19/63 [00:00<00:01, 27.60it/s][A
Eval:  35%|███▍      | 22/63 [00:00<00:01, 27.85it/s][A
Eval:  40%|███▉      | 25/63 [00:01<00:01, 28.20it/s][A
Eval:  44%|████▍     | 28/63 [00:01<00:01, 28.54it/s][A
Eval:  49%|████▉     | 31/63 [00:01<00:01, 28.79it/s][A
Eval:  54%|█████▍    | 34/63 [00:01<00:01, 28.24it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 27.80it/s][A
Eval:  63%|██████▎   | 40/63 [00:01<00:00, 26.86it/s][A
Eval:  68%|██████▊   | 43/63 [00:01<00:00, 26.92it/s][A
Eval:  73%|███████▎  |

Acc for model which have lower valid loss:  0.97225
lr:  2.7018004383131835e-06


Epoch 2:  60%|█████▉    | 4155/6926 [06:43<04:10, 11.06batch/s, loss=0.0337]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:12,  4.96it/s][A
Eval:   6%|▋         | 4/63 [00:00<00:04, 14.22it/s][A
Eval:  11%|█         | 7/63 [00:00<00:02, 19.57it/s][A
Eval:  16%|█▌        | 10/63 [00:00<00:02, 22.29it/s][A
Eval:  21%|██        | 13/63 [00:00<00:02, 24.59it/s][A
Eval:  25%|██▌       | 16/63 [00:00<00:01, 26.19it/s][A
Eval:  30%|███       | 19/63 [00:00<00:01, 27.30it/s][A
Eval:  35%|███▍      | 22/63 [00:00<00:01, 27.83it/s][A
Eval:  40%|███▉      | 25/63 [00:01<00:01, 28.09it/s][A
Eval:  44%|████▍     | 28/63 [00:01<00:01, 28.58it/s][A
Eval:  49%|████▉     | 31/63 [00:01<00:01, 28.75it/s][A
Eval:  54%|█████▍    | 34/63 [00:01<00:01, 28.05it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 27.43it/s][A
Eval:  63%|██████▎   | 40/63 [00:01<00:00, 26.66it/s][A
Eval:  68%|██████▊   | 43/63 [00:01<00:00, 26.72it/s][A
Eval:  73%|███████▎  | 

Acc for model which have lower valid loss:  0.97
lr:  2.5667104163975244e-06


Epoch 2:  80%|███████▉  | 5540/6926 [09:00<02:20,  9.87batch/s, loss=0.0179]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:12,  4.86it/s][A
Eval:   6%|▋         | 4/63 [00:00<00:04, 14.68it/s][A
Eval:  11%|█         | 7/63 [00:00<00:02, 19.62it/s][A
Eval:  16%|█▌        | 10/63 [00:00<00:02, 22.25it/s][A
Eval:  21%|██        | 13/63 [00:00<00:02, 24.44it/s][A
Eval:  25%|██▌       | 16/63 [00:00<00:01, 25.84it/s][A
Eval:  30%|███       | 19/63 [00:00<00:01, 26.97it/s][A
Eval:  35%|███▍      | 22/63 [00:00<00:01, 27.35it/s][A
Eval:  40%|███▉      | 25/63 [00:01<00:01, 27.52it/s][A
Eval:  44%|████▍     | 28/63 [00:01<00:01, 28.06it/s][A
Eval:  49%|████▉     | 31/63 [00:01<00:01, 28.37it/s][A
Eval:  54%|█████▍    | 34/63 [00:01<00:01, 27.78it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 27.53it/s][A
Eval:  63%|██████▎   | 40/63 [00:01<00:00, 26.67it/s][A
Eval:  68%|██████▊   | 43/63 [00:01<00:00, 26.82it/s][A
Eval:  73%|███████▎  | 

Acc for model which have lower valid loss:  0.96925
lr:  2.438374895577648e-06


Epoch 2: 100%|█████████▉| 6924/6926 [11:17<00:00,  9.15batch/s, loss=0.0285]
Eval:   0%|          | 0/63 [00:00<?, ?it/s][A
Eval:   2%|▏         | 1/63 [00:00<00:12,  5.01it/s][A
Eval:   6%|▋         | 4/63 [00:00<00:03, 15.02it/s][A
Eval:  11%|█         | 7/63 [00:00<00:02, 20.12it/s][A
Eval:  16%|█▌        | 10/63 [00:00<00:02, 22.52it/s][A
Eval:  21%|██        | 13/63 [00:00<00:02, 24.60it/s][A
Eval:  25%|██▌       | 16/63 [00:00<00:01, 25.96it/s][A
Eval:  30%|███       | 19/63 [00:00<00:01, 26.54it/s][A
Eval:  35%|███▍      | 22/63 [00:00<00:01, 27.04it/s][A
Eval:  40%|███▉      | 25/63 [00:01<00:01, 26.96it/s][A
Eval:  44%|████▍     | 28/63 [00:01<00:01, 27.59it/s][A
Eval:  49%|████▉     | 31/63 [00:01<00:01, 28.16it/s][A
Eval:  54%|█████▍    | 34/63 [00:01<00:01, 27.71it/s][A
Eval:  59%|█████▊    | 37/63 [00:01<00:00, 27.47it/s][A
Eval:  63%|██████▎   | 40/63 [00:01<00:00, 26.61it/s][A
Eval:  68%|██████▊   | 43/63 [00:01<00:00, 26.41it/s][A
Eval:  73%|███████▎  | 

Acc for model which have lower valid loss:  0.9665
lr:  2.3164561507987653e-06


Epoch 2: 100%|██████████| 6926/6926 [11:21<00:00, 10.17batch/s, loss=0.0285]


In [97]:
test_batch_size = 32
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=test_batch_size,
                                          shuffle=False, collate_fn=collate_fn_style_test,
                                          num_workers=2)

In [98]:
with torch.no_grad():
    model5.eval()
    predictions5 = []
    for input_ids, attention_mask, token_type_ids, position_ids in tqdm(test_loader,
                                                                        desc='Test',
                                                                        position=1,
                                                                        leave=None):

        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        token_type_ids = token_type_ids.to(device)
        position_ids = position_ids.to(device)

        output = model5(input_ids=input_ids,
                       attention_mask=attention_mask,
                       token_type_ids=token_type_ids,
                       position_ids=position_ids)

        logits = output.logits
        batch_predictions = [0 if example[0] > example[1] else 1 for example in logits]
        predictions5 += batch_predictions


Test:   0%|          | 0/32 [00:00<?, ?it/s][A
Test:   3%|▎         | 1/32 [00:00<00:06,  4.99it/s][A
Test:  16%|█▌        | 5/32 [00:00<00:01, 18.50it/s][A
Test:  28%|██▊       | 9/32 [00:00<00:00, 24.81it/s][A
Test:  41%|████      | 13/32 [00:00<00:00, 29.29it/s][A
Test:  53%|█████▎    | 17/32 [00:00<00:00, 32.46it/s][A
Test:  66%|██████▌   | 21/32 [00:00<00:00, 33.91it/s][A
Test:  78%|███████▊  | 25/32 [00:00<00:00, 35.27it/s][A
Test:  91%|█████████ | 29/32 [00:00<00:00, 36.10it/s][A
                                                     [A

In [103]:
test_df['Category'] = predictions5

In [104]:
test_df.to_csv('submission-xlnet.csv', index=False)

##ensemble

In [51]:
predictions = []
for i in range(len(predictions1)):
    # Assign weights to the predictions made by each model
    weight1 = 0.5
    weight2 = 0.3
    weight3 = 0.2
    weighted_sum = weight1 * predictions4[i] + weight2 * predictions5[i] + weight3 * predictions1[i]
    # Classify as 1 if the weighted sum is greater than or equal to 0.5, else classify as 0
    if weighted_sum >= 0.5:
        predictions.append(1)
    else:
        predictions.append(0)

In [69]:
test_df['Category'] = predictions

In [70]:
test_df.to_csv('submission-ensemble.csv', index=False)