In [None]:
!pip install transformers datasets

#library

In [5]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tokenizers import Tokenizer

from datasets import load_dataset

from sklearn.metrics import multilabel_confusion_matrix
from transformers import BertForSequenceClassification, BertConfig

from tqdm.auto import tqdm

from IPython.display import display
from typing import List, Dict, Any, Tuple
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# dataset

In [6]:
class ApeachDataset(Dataset):
    def __init__(self,
                 split: str,
                 tokenizer: Tokenizer, 
                 max_length: int = 256,
                 padding: str = "max_length") -> None:
        super().__init__()
        dataset = load_dataset("jason9693/APEACH")
        texts = dataset[split]['text']
        inputs = tokenizer(texts, padding=padding, max_length=max_length, truncation=True, return_tensors="pt")
        
        self.input_ids = inputs["input_ids"]
        self.attention_masks = inputs["attention_mask"]
        
        labels = dataset[split]['class']
        self.labels = torch.tensor(labels, dtype=torch.float32)
        
    def __len__(self):
        return self.input_ids.shape[0]
        
    def __getitem__(self, index: Any) -> Dict:
        return self.input_ids[index], self.attention_masks[index], self.labels[index]

    def dataloader(self, **kwargs) -> DataLoader:
        return DataLoader(self, **kwargs)

# student model dataset to get soft_label

In [7]:
class ApeachStudentDataset(ApeachDataset):
    def __init__(self,
                 teacher_model: torch.nn.Module,
                 split: str,
                 teacher_tokenizer: Tokenizer, 
                 student_tokenizer: Tokenizer, 
                 max_length: int = 256,
                 padding: str = "max_length",
                 device: str="cuda") -> None:
        super().__init__(split, student_tokenizer, max_length, padding)
        
        teacher_ds = ApeachDataset(split, teacher_tokenizer, max_length, padding)

        teacher_model = teacher_model.to(device)
        with torch.no_grad():
            soft_labels = [self._get_soft_label(teacher_model, teacher_ds, i, device) 
                        for i in range(len(self))]
            self.soft_labels = torch.stack(soft_labels)

    def __getitem__(self, index: Any) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        return *super().__getitem__(index), self.soft_labels[index]

    def _get_soft_label(self, model, teacher_ds, index, device):
        ids, mask, _ = teacher_ds[index]
        ids = ids.unsqueeze(0).to(device)
        mask = mask.unsqueeze(0).to(device)
        return model(ids, mask).logits.sigmoid().cpu().squeeze(0)

# load huggingface pretrained model

In [16]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
teacher_huggingface_model_name = 'beomi/KcELECTRA-base'
student_hgf_model_name = 'monologg/koelectra-small-v3-discriminator'

teacher_tokenizer = AutoTokenizer.from_pretrained(teacher_huggingface_model_name)
student_tokenizer = AutoTokenizer.from_pretrained(student_hgf_model_name)

#curse_teacher = 'ckpt/hate_78.ckpt'
#bias_teacher = 'ckpt/bias_val_epoch_loss=0.6996.ckpt'
#hate_teacher = 'ckpt/hate_94_val_acc=0.7973.ckpt'
#apeach_teacher = 'ckpt/hate_115_val_acc=0.8610.ckpt'
temp_teacher = '/content/drive/MyDrive/curse_model/model.pth'
teacher_name = temp_teacher

labels = ['hate']

teacher_module = AutoModelForSequenceClassification.from_pretrained(teacher_huggingface_model_name, num_labels=len(labels))

Some weights of the model checkpoint at beomi/KcELECTRA-base were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base and are newly initialized: ['classifier.out_proj.bias', 'classifier

# dataset & dataloader

In [17]:
max_length = 32

train_ds = ApeachStudentDataset(
    teacher_module, 
    "train",
    teacher_tokenizer,
    student_tokenizer,
    max_length=max_length, 
    device=device)



  0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/2 [00:00<?, ?it/s]

In [26]:
val_ds = ApeachStudentDataset(
    teacher_module,
    "test",
    teacher_tokenizer,
    student_tokenizer,
    max_length=max_length, 
    device=device)



  0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/2 [00:00<?, ?it/s]

In [18]:
batch_size = 32

train_dl = train_ds.dataloader(batch_size = batch_size)

In [27]:
val_dl = val_ds.dataloader(batch_size=batch_size)

# student model

In [19]:
student_module = AutoModelForSequenceClassification.from_pretrained(student_hgf_model_name, num_labels=len(labels))

Some weights of the model checkpoint at monologg/koelectra-small-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-small-v3-discriminator and are newly initialized

# hyperparameter

In [20]:
criterion = nn.CrossEntropyLoss()
soft_label_criterion = nn.BCELoss()
alpha = 0.3
lr = 1e-4
optimizer = optim.Adam(student_module.parameters(), lr=lr)
epochs = 2

In [None]:
student_module.to(device)

In [None]:
epochs = 30

# training

In [None]:
student_module.train()

for epoch in tqdm(range(1,epochs+1)):
    
    train_loss = 0.0
    train_acc = 0.0
    count = 0

    for batch in tqdm(train_dl):
        
        input_ids, attention_masks, hard_labels, soft_labels = batch

        ##이거 안하니까 cpu가 터지더라
        input_ids = input_ids.to(device)
        attention_masks = attention_masks.to(device)
        hard_labels = hard_labels.to(device)
        soft_labels = soft_labels.to(device)
        ##
        
        logits = student_module(input_ids,attention_masks)

        outputs = logits.logits.sigmoid()

        ce_loss = criterion(outputs.squeeze(1),hard_labels)
        kd_loss = soft_label_criterion(outputs.squeeze(0),soft_labels)

        loss = alpha*ce_loss + (1-alpha)*kd_loss
        train_loss += loss

        loss.backward()
        optimizer.step()

        acc = ((outputs.squeeze(1) >= 0.5) == hard_labels).sum()/(outputs.squeeze(1).size()[0])
        train_acc += acc

        count += 1

        if count % 20 == 0:
            
            print(f'average accuracy: {train_acc/(count)}')
            print(f'average loss: {train_loss/(count)}')

In [None]:
torch.save(student_module.state_dict(), '/content/drive/MyDrive/curse_model/student.pth')

In [None]:
hard_labels.shape

torch.Size([16])

In [None]:
outputs.squeeze(1).shape

torch.Size([16])

# validation

In [28]:
student_module.eval()
    
val_loss = 0.0
val_acc = 0.0
count = 0

with torch.no_grad():

    for batch in tqdm(val_dl):
        
        input_ids, attention_masks, hard_labels, soft_labels = batch

        ##이거 안하니까 cpu가 터지더라
        input_ids = input_ids.to(device)
        attention_masks = attention_masks.to(device)
        hard_labels = hard_labels.to(device)
        soft_labels = soft_labels.to(device)
        ##
        
        logits = student_module(input_ids,attention_masks)

        outputs = logits.logits.sigmoid()

        ce_loss = criterion(outputs.squeeze(1),hard_labels)
        kd_loss = soft_label_criterion(outputs.squeeze(0),soft_labels)

        loss = alpha*ce_loss + (1-alpha)*kd_loss
        val_loss += loss

        acc = ((outputs.squeeze(1) >= 0.5) == hard_labels).sum()/(outputs.squeeze(1).size()[0])
        val_acc += acc

        count += 1

        if count % 20 == 0:
            
            print(f'average accuracy: {val_acc/(count)}')
            print(f'average loss: {val_loss/(count)}')

  0%|          | 0/118 [00:00<?, ?it/s]

average accuracy: 1.0
average loss: 0.48316651582717896
average accuracy: 1.0
average loss: 0.4831407070159912
average accuracy: 0.7333333492279053
average loss: 9.355469703674316
average accuracy: 0.559765636920929
average loss: 15.130263328552246
average accuracy: 0.5484374761581421
average loss: 15.507172584533691
