In [1]:
import torch
import datetime
import time
import torch.nn as nn
import numpy as np

from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset, RandomSampler
from sklearn.metrics import f1_score

In [2]:
# Hyperparameters shared by tokenization, data loading and fine-tuning.
MAX_LENGTH = 128  # max_length passed to the tokenizer in tokenize() (padding/truncation target)
BATCH_SIZE = 32  # batch size used for both the train and the validation DataLoader
EPOCHS = 100  # number of passes over train_dataloader in train_model()
LEARNING_RATE = 2e-5  # lr passed to AdamW in train_model()
EPS = 1e-8  # eps passed to AdamW in train_model()
WARMUP = 100  # num_warmup_steps of the linear learning-rate schedule in train_model()

def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` string.

    Args:
        elapsed: Duration in seconds; rounded to the nearest whole second first.

    Returns:
        The ``str()`` form of the corresponding ``datetime.timedelta``.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

# Load the 'cola' configuration of the 'glue' dataset and the tokenizer that
# matches the 'bert-base-uncased' checkpoint used for the model below.
dataset = load_dataset('glue', 'cola')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize(data):
    """Tokenize the ``'sentence'`` field of ``data`` with the module-level tokenizer.

    Every sequence is truncated and padded to exactly ``MAX_LENGTH`` tokens.

    Args:
        data: Mapping with a ``'sentence'`` entry (a batch of sentences when used
            with ``dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer call returns for those sentences.
    """
    sentences = data['sentence']
    return tokenizer(
        sentences,
        padding='max_length',
        max_length=MAX_LENGTH,
        truncation=True,
    )


# Apply tokenize() over the dataset in batches of up to 5000 examples per call.
tokenized_dataset = dataset.map(tokenize, batched=True, batch_size=5000)

def create_dataloader(dataset, batch_size, shuffle=True):
    """Wrap a tokenized split in a ``DataLoader`` of ``(input_ids, attention_mask, label)`` batches.

    Args:
        dataset: Mapping-like object whose ``'input_ids'``, ``'attention_mask'`` and
            ``'label'`` entries can each be converted with ``torch.tensor`` (so the
            sequences must already share one length).
        batch_size: Number of examples per batch.
        shuffle: When ``True`` (the default, matching the previous behaviour) the
            examples are drawn in random order. Pass ``False`` for a deterministic,
            in-order loader, e.g. for evaluation.

    Returns:
        A ``DataLoader`` over a ``TensorDataset`` of the three tensors.
    """
    input_ids = torch.tensor(dataset['input_ids'])
    attention_masks = torch.tensor(dataset['attention_mask'])
    labels = torch.tensor(dataset['label'])
    tensor_dataset = TensorDataset(input_ids, attention_masks, labels)
    # DataLoader creates a RandomSampler when shuffle=True and a sequential one otherwise.
    return DataLoader(tensor_dataset, shuffle=shuffle, batch_size=batch_size)

# Build the loaders used by train_model() and evaluate_model().
# Note: test_dataloader is built from the 'validation' split, not a 'test' split.
train_dataloader = create_dataloader(tokenized_dataset['train'], BATCH_SIZE)
test_dataloader = create_dataloader(tokenized_dataset['validation'], BATCH_SIZE)

Found cached dataset glue (C:/Users/yeti/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at C:\Users\yeti\.cache\huggingface\datasets\glue\cola\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-a11e411136772b5a.arrow
Loading cached processed dataset at C:\Users\yeti\.cache\huggingface\datasets\glue\cola\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-2d6bb69b6bcc0406.arrow
Loading cached processed dataset at C:\Users\yeti\.cache\huggingface\datasets\glue\cola\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-199577ef2822b192.arrow


In [3]:
# Pretrained 'bert-base-uncased' weights with a 2-label sequence-classification head.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [4]:
def train_model(model, device):
    """Fine-tune ``model`` on the module-level ``train_dataloader``.

    Runs ``EPOCHS`` passes with AdamW (``LEARNING_RATE``, ``EPS``) and a linear
    learning-rate schedule with ``WARMUP`` warmup steps, clipping the gradient norm
    to 1.0 on every step. A checkpoint is written to ``cola_checkpoint.pt`` after
    every 500th completed step and at the end of every epoch.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and ``labels``
            keyword arguments; the first element of its output is used as the loss.
        device: Device the model and every batch are moved to.
    """
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=EPS)
    total_steps = len(train_dataloader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP, num_training_steps=total_steps)

    model.to(device)
    start_time = time.time()

    def checkpoint(epoch, total_loss):
        """Save model, optimizer and scheduler state plus the running epoch loss."""
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            # Stored so that a resumed run can continue the warmup/decay schedule
            # instead of restarting it.
            'scheduler_state_dict': scheduler.state_dict(),
            'total_loss': total_loss
        }, 'cola_checkpoint.pt')

    for epoch in range(EPOCHS):
        total_loss = 0
        model.train()
        print(f'--- training {epoch + 1} / {EPOCHS}')

        for step, batch in enumerate(train_dataloader):
            batch_inputs = tuple(t.to(device) for t in batch)
            inputs = {
                'input_ids': batch_inputs[0],
                'attention_mask': batch_inputs[1],
                'labels': batch_inputs[2]
            }

            optimizer.zero_grad()
            outputs = model(**inputs)
            loss = outputs[0]
            total_loss += loss.item()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            # Checkpoint and report only after the step has run, so the saved
            # weights and loss really cover the (step+1) batches that are reported.
            if (step+1) % 500 == 0:
                checkpoint(epoch, total_loss)
                print(f'--------- {(step+1) * BATCH_SIZE} / {len(train_dataloader) * BATCH_SIZE} trained.', format_time(time.time() - start_time))

        avg_train_loss = total_loss / len(train_dataloader)
        print(f'Epoch {epoch + 1} - Average training loss: {avg_train_loss}')
        checkpoint(epoch, total_loss)
    print(f'--- train finished. {format_time(time.time() - start_time)}')

In [5]:
def evaluate_model(model, device):
    """Evaluate ``model`` on the module-level ``test_dataloader`` and print the metrics.

    Prints the F1 score, the mean per-batch loss, the elapsed time, and the number
    of evaluated and correctly classified examples. Nothing is returned.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and ``labels``
            keyword arguments; its output's first element is used as the loss and
            its second as the logits. It is moved to ``device`` and put in eval mode.
        device: Device the model and every batch are moved to.
    """
    model.to(device)
    # Switch to inference mode here rather than relying on every caller to do it.
    model.eval()
    total_eval_loss = 0
    start_time = time.time()

    # Per-batch arrays are collected and joined once after the loop instead of
    # re-allocating the full arrays on every batch.
    label_batches = []
    prediction_batches = []

    print('--- evaluating')
    for step, batch in enumerate(test_dataloader):
        if (step+1) % 100 == 0:
            print(f'------ {(step+1)*BATCH_SIZE} / {len(test_dataloader) * BATCH_SIZE}', format_time(time.time() - start_time))

        batch_inputs = tuple(t.to(device) for t in batch)
        inputs = {
            'input_ids': batch_inputs[0],
            'attention_mask': batch_inputs[1],
            'labels': batch_inputs[2]
        }

        with torch.no_grad():
            outputs = model(**inputs)
            loss = outputs[0]
            logits = outputs[1]

        total_eval_loss += loss.item()
        label_batches.append(batch_inputs[2].to('cpu').numpy())
        prediction_batches.append(logits.detach().cpu().numpy().argmax(axis=1))

    if not label_batches:
        # Nothing was evaluated; avoid dividing by zero below.
        print('--- evaluating: test_dataloader is empty, nothing to evaluate')
        return

    labels = np.concatenate(label_batches)
    predictions = np.concatenate(prediction_batches)

    avg_eval_loss = total_eval_loss / len(test_dataloader)
    print(f'--- evaluating f1: {f1_score(labels, predictions)}, evaluating loss: {avg_eval_loss:.4f} {format_time(time.time() - start_time)}')
    print(f'num total : {predictions.shape[0]},  num rights : {np.sum(predictions == labels)}')

In [6]:
# model.train()
# train_model(model, torch.device('cuda'))
# torch.save(model, "cola_trained100.pt")

In [7]:
# Restore fine-tuned weights from a saved checkpoint (presumably one written by
# train_model(); it is read through the same 'model_state_dict' key).
# map_location='cpu' lets a checkpoint that was saved from a CUDA model load on a
# machine without a GPU; the evaluation below runs on the CPU anyway.
checkpoint_state = torch.load('../task/cola_checkpoint.pt', map_location='cpu')
model.load_state_dict(checkpoint_state['model_state_dict'])
model.eval()
print("original BERT model score")
evaluate_model(model, torch.device('cpu'))

original BERT model score
--- evaluating
--- evaluating f1: 0.880842659644503, evaluating loss: 0.8401 0:01:41
num total : 1043,  num rights : 862


In [8]:
import copy

# Work on a deep copy so that `model` itself is left untouched by the steps below.
qe_model = copy.deepcopy(model)
qe_model.eval()
qe_model.to('cpu')
# q_model: result of dynamic int8 quantization of the nn.Linear layers of the copy.
# NOTE(review): quantize_dynamic is called without `inplace`, so q_model is presumably
# a separate converted copy and qe_model keeps its float weights — confirm.
q_model = torch.quantization.quantize_dynamic(qe_model, {torch.nn.Linear}, dtype=torch.qint8)
# qe_model: takes only the encoder from the quantized model; the rest stays as copied.
# NOTE(review): this assigns the same encoder object that q_model uses (no copy), so
# anything that later modifies qe_model.bert.encoder also affects q_model.
qe_model.bert.encoder = q_model.bert.encoder

print("quantized BERT model score")
evaluate_model(q_model, torch.device('cpu'))

print("quantized encoder BERT model score")
evaluate_model(qe_model, torch.device('cpu'))

quantized BERT model score
--- evaluating
--- evaluating f1: 0.8748370273794002, evaluating loss: 0.4951 0:01:23
num total : 1043,  num rights : 851
quantized encoder BERT model score
--- evaluating
--- evaluating f1: 0.8743523316062176, evaluating loss: 0.4872 0:01:23
num total : 1043,  num rights : 849


In [9]:
import sys
import os
sys.path.append(os.path.relpath("."))
from correction_encoder import create_corrected_encoder, train_correction_model, create_train_dataset
# from efficientnet_for_encoder import EfficientNetLiteForEncoder
from MobileNetV3_for_encoder import MobileNetV3
# from resnet50_for_encoder import ResNetForEncoder, BasicBlock


# create_train_dataset(model, q_model, train_dataloader)



# correction_model = ResNetForEncoder(BasicBlock, [3,4,6,3], 128, 768)
# Build the MobileNetV3-based correction model and train it for 2 epochs on the
# data under ./train_dataset, saving to ./cola_encoder_mobilenet.pt.
# NOTE(review): 128 and 768 presumably correspond to MAX_LENGTH and the BERT-base
# hidden size — confirm against MobileNetV3_for_encoder (not shown here).
correction_model = MobileNetV3(128, 768)
# NOTE(review): the device is hard-coded to 'cuda', so this cell needs a GPU.
correction_model.to('cuda')
train_correction_model(correction_model, "./train_dataset", save_path="./cola_encoder_mobilenet.pt", epochs=2)

# correction_model.load_state_dict(torch.load('./cola_encoder_mobilenet.pt')['model_state_dict'])
# correction_model.load_state_dict(torch.load('./cola_encoder_resnet.pt')['model_state_dict'])

# Report the memory taken by the correction model's parameters.
total_params = sum(p.numel() for p in correction_model.parameters())
# Use each parameter's real element size instead of assuming 4 bytes (float32)
# per value, so the figure stays right for half-precision or other dtypes.
total_bytes = sum(p.numel() * p.element_size() for p in correction_model.parameters())
total_size = total_bytes / (1024 ** 2)  # model size in MB
print("correction model size ", total_size)



in 16 out 16 stride 1
in 16 out 24 stride 2
in 24 out 24 stride 1
in 24 out 24 stride 1
in 24 out 24 stride 1
in 24 out 24 stride 1
in 24 out 40 stride 2
in 40 out 40 stride 1
in 40 out 40 stride 1
in 40 out 40 stride 1
in 40 out 40 stride 1
in 40 out 40 stride 1
in 40 out 80 stride 2
in 80 out 80 stride 1
in 80 out 80 stride 1
in 80 out 80 stride 1
in 80 out 80 stride 1
in 80 out 80 stride 1
in 80 out 80 stride 1
in 80 out 80 stride 1
in 80 out 112 stride 1
in 112 out 112 stride 1
in 112 out 112 stride 1
in 112 out 112 stride 1
in 112 out 112 stride 1
in 112 out 112 stride 1
in 112 out 160 stride 2
in 160 out 160 stride 1
in 160 out 160 stride 1
in 160 out 160 stride 1
in 160 out 160 stride 1
in 160 out 320 stride 1




--- training 1 / 2


  return F.mse_loss(input, target, reduction=self.reduction)


tensor(0.2693, device='cuda:0', grad_fn=<MseLossBackward0>)
tensor(3.2698, device='cuda:0', grad_fn=<MseLossBackward0>)
tensor(0.5129, device='cuda:0', grad_fn=<MseLossBackward0>)
tensor(1.4823, device='cuda:0', grad_fn=<MseLossBackward0>)
tensor(1.0330, device='cuda:0', grad_fn=<MseLossBackward0>)
tensor(0.6872, device='cuda:0', grad_fn=<MseLossBackward0>)
tensor(0.4282, device='cuda:0', grad_fn=<MseLossBackward0>)
tensor(0.3587, device='cuda:0', grad_fn=<MseLossBackward0>)
tensor(0.6028, device='cuda:0', grad_fn=<MseLossBackward0>)
tensor(0.7334, device='cuda:0', grad_fn=<MseLossBackward0>)
tensor(0.4134, device='cuda:0', grad_fn=<MseLossBackward0>)
tensor(0.2061, device='cuda:0', grad_fn=<MseLossBackward0>)
tensor(0.2303, device='cuda:0', grad_fn=<MseLossBackward0>)
tensor(0.2750, device='cuda:0', grad_fn=<MseLossBackward0>)
tensor(0.2467, device='cuda:0', grad_fn=<MseLossBackward0>)
tensor(0.2513, device='cuda:0', grad_fn=<MseLossBackward0>)
tensor(0.2825, device='cuda:0', grad_fn=

In [10]:

# Build a model from the encoder-quantized BERT and the trained correction model,
# then score it on the CPU with the same evaluation routine as the other variants.
# NOTE(review): create_corrected_encoder comes from correction_encoder.py (not shown);
# whether it copies qe_model or modifies it in place cannot be told from here.
c_model = create_corrected_encoder(qe_model, correction_model)
c_model.to('cpu')
c_model.eval()
evaluate_model(c_model, torch.device('cpu'))

--- evaluating
--- evaluating f1: 0.87565445026178, evaluating loss: 0.4556 0:02:29
num total : 1043,  num rights : 853
