In [None]:
!pip install transformers datasets

# Libraries

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tokenizers import Tokenizer

from datasets import load_dataset

from sklearn.metrics import multilabel_confusion_matrix
from transformers import BertForSequenceClassification, BertConfig

from tqdm.auto import tqdm

from IPython.display import display
from typing import List, Dict, Any, Tuple
import os
# Ask CUDA to launch kernels synchronously so device-side errors are reported
# at the Python line that caused them.
# NOTE(review): this is a debugging aid and presumably slows training —
# consider removing it for real runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
# Restrict this process to the first GPU.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# dataset

In [None]:
class ApeachDataset(Dataset):
    """One tokenised split of the ``jason9693/APEACH`` dataset, held in memory.

    The whole split is tokenised once in ``__init__``; indexing then only
    slices the pre-computed tensors.

    Args:
        split: Key of the split to read from the loaded dataset
            (this notebook uses ``"train"`` and ``"test"``).
        tokenizer: Callable tokenizer (e.g. the object returned by
            ``AutoTokenizer.from_pretrained``). It is called with the list of
            texts and must return a mapping holding ``"input_ids"`` and
            ``"attention_mask"`` tensors.
        max_length: Maximum sequence length passed to the tokenizer; longer
            texts are truncated.
        padding: Padding strategy passed to the tokenizer.
    """

    def __init__(self,
                 split: str,
                 tokenizer: Any,
                 max_length: int = 256,
                 padding: str = "max_length") -> None:
        super().__init__()
        # Look the split up once instead of indexing the dataset dict twice.
        split_data = load_dataset("jason9693/APEACH")[split]

        encoded = tokenizer(
            split_data['text'],
            padding=padding,
            max_length=max_length,
            truncation=True,
            return_tensors="pt",
        )
        self.input_ids = encoded["input_ids"]
        self.attention_masks = encoded["attention_mask"]

        # Labels are stored as float32 so they can be compared against / fed
        # to a loss together with the model's floating-point outputs.
        self.labels = torch.tensor(split_data['class'], dtype=torch.float32)

    def __len__(self) -> int:
        """Return the number of examples (rows of ``input_ids``)."""
        return self.input_ids.shape[0]

    def __getitem__(self, index: Any) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Return the ``(input_ids, attention_mask, label)`` triple at ``index``."""
        return self.input_ids[index], self.attention_masks[index], self.labels[index]

    def dataloader(self, **kwargs) -> DataLoader:
        """Wrap this dataset in a ``DataLoader``; ``kwargs`` are forwarded as-is."""
        return DataLoader(self, **kwargs)

# Load the Hugging Face pretrained model

In [None]:
# Candidate checkpoints that can be assigned to `huggingface_model_name`;
# only the one assigned below is actually loaded.
"""
monologg/koelectra-small-v3-discriminator
beomi/KcELECTRA-base
beomi/kcbert-base
beomi/kcbert-large
"""

huggingface_model_name = "beomi/KcELECTRA-base"
# A single label name, so the classification head is built with num_labels=1.
# NOTE(review): the training and validation loops later rebind the name
# `labels` to a batch tensor, so this list is only valid until they run.
labels = ['hate']
tokenizer = AutoTokenizer.from_pretrained(huggingface_model_name)
model = AutoModelForSequenceClassification.from_pretrained(huggingface_model_name, num_labels=len(labels))

Downloading (…)okenizer_config.json:   0%|          | 0.00/288 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/514 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at beomi/KcELECTRA-base were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base and are newly initialized: ['classifier.out_proj.weight', 'classifi

# dataloader

In [None]:
# Tokenised sequence length and mini-batch size shared by both loaders.
max_length = 64
batch_size = 128
# Shuffle the training data so every epoch sees the examples in a different
# order; the evaluation loader keeps the dataset order so results are stable.
train_dl = ApeachDataset("train", tokenizer, max_length=max_length).dataloader(batch_size=batch_size, shuffle=True)
val_dl = ApeachDataset("test", tokenizer, max_length=max_length).dataloader(batch_size=batch_size)



  0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/2 [00:00<?, ?it/s]

# Hyperparameters

In [None]:
# Learning rate for Adam.
lr = 1e-4
# The model has a single output logit and the training/validation loops apply
# `.sigmoid()` before calling the criterion with float 0/1 targets, i.e. the
# criterion receives probabilities of shape [batch]. That is binary
# cross-entropy; nn.CrossEntropyLoss would instead interpret the whole batch
# vector as one multi-class distribution and produce a meaningless loss.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr = lr)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Number of mini-batches the training DataLoader yields per epoch.
len(train_dl)

62

In [None]:
# Display which device string was selected ('cuda' or 'cpu').
device

'cuda'

In [None]:
# Move the model to the selected device before training/evaluation.
model.to(device)

In [None]:
# Number of full passes over the training DataLoader.
epochs = 10

# training

In [None]:
# Fine-tune the classifier for `epochs` passes over `train_dl`, printing the
# running average accuracy and loss every 30 batches.
model.train()

for epoch in tqdm(range(1,epochs+1)):

    # Running sums for the current epoch (plain Python floats).
    train_loss = 0.0
    train_acc = 0.0

    # `ind` is the 1-based number of batches processed so far in this epoch.
    for ind,batch in enumerate(tqdm(train_dl),start=1):

        input_ids,attention_masks,labels = batch

        input_ids = input_ids.to(device)
        attention_masks = attention_masks.to(device)
        labels = labels.to(device)

        # Clear the gradients left over from the previous step; without this
        # they accumulate across batches and every update uses a stale sum.
        optimizer.zero_grad()

        logits = model(input_ids=input_ids, attention_mask=attention_masks)
        # Single-logit head: squash to a probability and drop the label axis
        # so the shape matches `labels`.
        output = logits.logits.sigmoid().squeeze(1)

        loss = criterion(output,labels)
        loss.backward()
        optimizer.step()

        # `.item()` detaches the scalar so the autograd graph of each batch
        # can be freed instead of being kept alive by the running sum.
        train_loss += loss.item()

        # Fraction of examples whose thresholded probability matches the label.
        acc = ((output >= 0.5) == labels).sum().item()/(output.size()[0])
        train_acc += acc

        if ind % 30 == 0:

            print(f'average accuracy: {train_acc/(ind)}')
            print(f'average loss: {train_loss/(ind)}')

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

average accuracy: 0.4314453125
average loss: 4.609428882598877


  0%|          | 0/62 [00:00<?, ?it/s]

average accuracy: 0.4314453125
average loss: 4.609428882598877


  0%|          | 0/62 [00:00<?, ?it/s]

average accuracy: 0.4314453125
average loss: 4.609428882598877


  0%|          | 0/62 [00:00<?, ?it/s]

average accuracy: 0.4314453125
average loss: 4.609428882598877


  0%|          | 0/62 [00:00<?, ?it/s]

average accuracy: 0.4314453125
average loss: 4.609428882598877


  0%|          | 0/62 [00:00<?, ?it/s]

average accuracy: 0.4314453125
average loss: 4.609428882598877


  0%|          | 0/62 [00:00<?, ?it/s]

average accuracy: 0.4314453125
average loss: 4.609428882598877


  0%|          | 0/62 [00:00<?, ?it/s]

average accuracy: 0.4314453125
average loss: 4.609428882598877


  0%|          | 0/62 [00:00<?, ?it/s]

average accuracy: 0.4314453125
average loss: 4.609428882598877


  0%|          | 0/62 [00:00<?, ?it/s]

average accuracy: 0.4314453125
average loss: 4.609428882598877


In [None]:
# Persist only the model's state_dict (weights), not the whole model object.
# NOTE(review): hard-coded Google Drive path — presumably requires Drive to be
# mounted and the `curse_model` folder to exist already; confirm before running.
torch.save(model.state_dict(),'/content/drive/MyDrive/curse_model/model.pth')

# validation

In [None]:
# Evaluate the model on `val_dl` without tracking gradients and report
# sample-weighted accuracy and loss over the whole split.
model.eval()

# Totals are accumulated per example rather than per batch: averaging
# per-batch metrics would give the (smaller) final batch the same weight as a
# full one and bias the reported numbers.
val_loss_sum = 0.0
val_correct = 0
val_seen = 0

with torch.no_grad():

    for batch in tqdm(val_dl):

        input_ids,attention_masks,labels = batch

        input_ids = input_ids.to(device)
        attention_masks = attention_masks.to(device)
        labels = labels.to(device)

        logits = model(input_ids=input_ids, attention_mask=attention_masks)
        # Single-logit head: probability per example, shape matches `labels`.
        output = logits.logits.sigmoid().squeeze(1)

        batch_n = labels.size(0)
        loss = criterion(output,labels)
        # Assumes the criterion returns a per-batch mean (the default
        # reduction), so multiplying by the batch size recovers the batch sum.
        val_loss_sum += loss.item() * batch_n
        val_correct += ((output >= 0.5) == labels).sum().item()
        val_seen += batch_n

# Guard against an empty loader so the cell does not divide by zero.
val_acc = val_correct / max(val_seen, 1)
val_loss = val_loss_sum / max(val_seen, 1)

print(f'average accuracy: {val_acc}')
print(f'average loss: {val_loss}')

  0%|          | 0/30 [00:00<?, ?it/s]

average accuracy: 0.48125
average loss: 309.3230285644531


In [None]:
# Scratch check: shape of the `labels` tensor left over from the last batch
# processed by the preceding loop.
labels.long().size()

torch.Size([128])

In [None]:
# Scratch check: shape of the per-example probabilities for the leftover
# `logits` of the last processed batch (should match the labels' shape).
logits.logits.sigmoid().squeeze(1).size()

torch.Size([128])

In [None]:
# Scratch check on the last processed batch: recompute the probabilities and
# count how many thresholded predictions match the labels.
output = logits.logits.sigmoid().squeeze(1)

# Move to host memory first: `.numpy()` raises for tensors that live on CUDA.
out = output.detach().cpu().numpy()

# The comparison already yields a tensor; wrapping it in `torch.tensor(...)`
# only triggers PyTorch's copy-construct UserWarning.
print(((output >= 0.5) == labels).sum().item())

print(output.size()[0])

55
128


  print(torch.tensor((output >= 0.5) == labels).sum().item())


In [None]:
# Scratch check: number of predictions in the current `output` that match the
# current `labels` after thresholding at 0.5.
((output >= 0.5) == labels).sum().item()

63