In [1]:
!pip install transformers --q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m70.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m69.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [1]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np
from tqdm import tqdm
import shutil
import sys

# Do not truncate long text cells when DataFrames are displayed.
pd.options.display.max_colwidth = None

In [2]:
# Read the labelled train / test CSV files into DataFrames.
train, test = (
    pd.read_csv('./data/ko_train_label.csv'),
    pd.read_csv('./data/ko_test_label.csv'),
)

In [3]:
# Remove the 'Unnamed: 7' column and every row that still has a missing value.
# Re-binding `test` (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution no longer raises KeyError.
# The index is reset because dropna() leaves gaps, and downstream code that
# addresses rows by position 0..n-1 would otherwise hit missing labels.
test = (
    test
    .drop(columns=['Unnamed: 7'], errors='ignore')
    .dropna()
    .reset_index(drop=True)
)

In [4]:
# Display the column names of the training frame.
train.columns

Index(['id', 'document', 'toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [5]:
# drop useless columns
# errors='ignore' + re-assignment makes the cell idempotent: running it again
# after 'id' has already been removed no longer raises KeyError.
train = train.drop(columns=['id'], errors='ignore')
test = test.drop(columns=['id'], errors='ignore')

In [6]:
# Label columns used as the training targets (one binary label per column).
target_list = [
    'toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [7]:
# hyperparameters
MAX_LEN = 256                             # length passed to the dataset (tokenizer max_length)
TRAIN_BATCH_SIZE = VALID_BATCH_SIZE = 32  # same batch size for both data loaders
EPOCHS = 2
LEARNING_RATE = 1e-5                      # learning rate handed to the optimizer

In [8]:
# Load the tokenizer for the 'bert-base-multilingual-cased' checkpoint
# (the same checkpoint name the model class loads its encoder from).
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

In [9]:
class CustomDataSet(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one document per item.

    Args:
        df: DataFrame holding a 'document' text column plus the label
            columns named in the module-level ``target_list``.
        tokenizer: object exposing ``encode_plus`` (a ``BertTokenizer`` in
            this notebook).
        max_len: length every encoded sequence is padded / truncated to.

    Each item is a dict with 1-D tensors ``input_ids``, ``attention_mask``,
    ``token_type_ids`` and a float tensor ``targets`` (one value per entry
    of ``target_list``).
    """

    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.df = df
        self.document = df['document']
        # Cast once up front so every item yields float32 labels regardless
        # of how pandas inferred the column dtypes.
        self.targets = self.df[target_list].to_numpy(dtype='float32')
        self.max_len = max_len

    def __len__(self):
        return len(self.document)

    def __getitem__(self, index):
        # Positional access: the DataLoader hands us positions 0..len-1, and a
        # label-based lookup (self.document[index]) raises KeyError whenever
        # the frame's index is not a clean 0..n-1 range (e.g. after dropna()).
        document = str(self.document.iloc[index])
        # Collapse every run of whitespace (newlines, tabs, ...) to one space.
        document = " ".join(document.split())

        inputs = self.tokenizer.encode_plus(
            document,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # `pad_to_max_length=True` is deprecated; this is its replacement.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        # return_tensors='pt' yields a leading batch dimension of 1; flatten
        # so the DataLoader's collation produces (batch, max_len) tensors.
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [10]:
train_size = 0.8
# Sample the training rows while KEEPING their original index labels, so the
# validation set can be taken as the exact complement. Resetting the index
# before the drop (as was done previously) turns the labels into 0..k-1, which
# makes `train.drop(...)` simply cut off the first k rows of `train` and lets
# validation rows overlap with the sampled training rows (data leakage).
train_df = train.sample(frac=train_size, random_state=200)
val_df = train.drop(train_df.index).reset_index(drop=True)
# Only now renumber the training rows to 0..k-1.
train_df = train_df.reset_index(drop=True)

In [11]:
# Wrap each split in the tokenizing dataset; both share tokenizer and length.
train_dataset = CustomDataSet(df=train_df, tokenizer=tokenizer, max_len=MAX_LEN)
val_dataset = CustomDataSet(df=val_df, tokenizer=tokenizer, max_len=MAX_LEN)

In [12]:
# Training batches are reshuffled on every pass over the loader.
train_data_loader = DataLoader(
    train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0,
    pin_memory=True,
)
# Validation keeps the dataset order (no shuffling).
val_data_loader = DataLoader(
    val_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0,
    pin_memory=True,
)

In [13]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [14]:
# Display which device was selected.
device

device(type='cuda')

In [15]:
def load_ckp(checkpoint_fpath, model, optimizer, map_location=None):
    """Restore model and optimizer state from a checkpoint file.

    Args:
        checkpoint_fpath: path of a file written with ``torch.save`` that
            holds a dict with the keys 'state_dict', 'optimizer', 'epoch'
            and, optionally, 'val_loss_min'.
        model: module whose parameters are overwritten in place.
        optimizer: optimizer whose state is overwritten in place.
        map_location: forwarded to ``torch.load``; pass e.g. ``'cpu'`` to
            open a checkpoint saved on a GPU on a machine without one.
            ``None`` keeps ``torch.load``'s default behaviour.

    Returns:
        ``(model, optimizer, epoch, val_loss_min)``. ``val_loss_min`` is
        ``float('inf')`` when the checkpoint does not record it, so a
        resumed run treats the next validation loss as the best so far.
    """
    # NOTE(review): torch.load unpickles the file; only load checkpoints
    # from a trusted source.
    checkpoint = torch.load(checkpoint_fpath, map_location=map_location)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    # Checkpoints written without this key used to raise KeyError here.
    val_loss_min = checkpoint.get('val_loss_min', float('inf'))

    return model, optimizer, checkpoint['epoch'], val_loss_min

def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """Write ``state`` to ``checkpoint_path``; copy it to ``best_model_path`` if ``is_best``.

    Args:
        state: object serialized with ``torch.save``.
        is_best: when truthy, the freshly written checkpoint file is also
            copied to ``best_model_path``.
        checkpoint_path: destination of the checkpoint file.
        best_model_path: destination of the copy made for the best model.
    """
    torch.save(state, checkpoint_path)
    if not is_best:
        return
    # File-level copy of what was just written.
    shutil.copyfile(checkpoint_path, best_model_path)

In [16]:
class BERTClass(torch.nn.Module):
    """Multilingual BERT encoder with a dropout + linear multi-label head.

    Args:
        num_labels: number of logits produced per example. Defaults to
            ``len(target_list)`` so the head always matches the label
            columns the dataset yields (the previous hard-coded 6 did not
            match the 5 labels and made BCEWithLogitsLoss reject the batch).
        dropout: dropout probability applied to the pooled output.
    """

    def __init__(self, num_labels=None, dropout=0.3):
        super(BERTClass, self).__init__()
        if num_labels is None:
            num_labels = len(target_list)
        self.bert_model = BertModel.from_pretrained('bert-base-multilingual-cased', return_dict=True)
        self.dropout = nn.Dropout(dropout)
        # Read the encoder width from its config instead of hard-coding 768.
        self.linear = nn.Linear(self.bert_model.config.hidden_size, num_labels)

    def forward(self, input_ids, attn_mask, token_type_ids):
        """Return raw (pre-sigmoid) logits, one per label, for each example."""
        output = self.bert_model(
            input_ids,
            attention_mask=attn_mask,
            token_type_ids=token_type_ids
        )
        output_dropout = self.dropout(output.pooler_output)
        output = self.linear(output_dropout)

        return output

In [17]:
model = BERTClass()
model.to(device)

Downloading model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

BERTClass(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_

In [None]:
def loss_fn(outputs, targets):
    """Mean binary cross-entropy between raw logits and 0/1 float targets.

    ``outputs`` are un-normalized logits (the sigmoid is applied inside the
    loss); ``targets`` must have the same shape as ``outputs``.
    """
    return nn.functional.binary_cross_entropy_with_logits(outputs, targets)
# Adam over every parameter of the model (encoder and classification head).
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
# Module-level buffers that the training function fills during validation.
val_targets, val_outputs = [], []

In [None]:
def train_model(n_epochs, training_loader, validation_loader, 
                model, optimizer, checkpoint_path, best_model_path):
    """Train ``model`` for ``n_epochs``, validating and checkpointing every epoch.

    Relies on the module-level ``device``, ``loss_fn``, ``save_ckp`` and the
    ``val_targets`` / ``val_outputs`` lists.

    Args:
        n_epochs: number of passes over ``training_loader``.
        training_loader: iterable of batches (dicts with the keys
            'input_ids', 'attention_mask', 'token_type_ids', 'targets').
        validation_loader: iterable of batches with the same keys.
        model: module called as ``model(ids, mask, token_type_ids)``.
        optimizer: optimizer stepping ``model``'s parameters.
        checkpoint_path: file the latest checkpoint is written to each epoch.
        best_model_path: file that receives a copy of the checkpoint whenever
            the validation loss improves.

    Returns:
        The trained ``model`` (same object that was passed in).

    Side effects:
        After each epoch ``val_targets`` / ``val_outputs`` hold the targets
        and sigmoid probabilities of that epoch's validation pass only.
    """
    valid_loss_min = np.inf

    for epoch in range(1, n_epochs + 1):
        train_loss = 0.0
        valid_loss = 0.0

        ##########
        # training
        ##########
        model.train()
        print("################# Epoch {}: Training Start #################".format(epoch))
        for batch_idx, data in enumerate(training_loader):
            ids = data['input_ids'].to(device, dtype=torch.long)
            mask = data['attention_mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)

            optimizer.zero_grad()
            outputs = model(ids, mask, token_type_ids)
            loss = loss_fn(outputs, targets)
            loss.backward()
            optimizer.step()
            # Incremental mean of the batch losses; .item() keeps a plain
            # float so no tensor is carried across iterations.
            train_loss += (loss.item() - train_loss) / (batch_idx + 1)

        print("################# Epoch {}: Training End #################".format(epoch))
        print("################# Epoch {}: Validation Start #################".format(epoch))

        ##########
        # validation
        ##########
        model.eval()
        # Start from empty buffers so predictions of earlier epochs do not
        # pile up and get mixed into metrics computed afterwards.
        val_targets.clear()
        val_outputs.clear()

        with torch.no_grad():
            for batch_idx, data in enumerate(validation_loader):
                ids = data['input_ids'].to(device, dtype=torch.long)
                mask = data['attention_mask'].to(device, dtype=torch.long)
                token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
                targets = data['targets'].to(device, dtype=torch.float)

                outputs = model(ids, mask, token_type_ids)
                loss = loss_fn(outputs, targets)
                valid_loss += (loss.item() - valid_loss) / (batch_idx + 1)

                val_targets.extend(targets.cpu().numpy().tolist())
                val_outputs.extend(torch.sigmoid(outputs).cpu().numpy().tolist())

        print("################# Epoch {}: Validation End #################".format(epoch))
        # train_loss / valid_loss are already per-batch means (running
        # average above), so they must not be divided by the loader length.
        print("Epoch {}, Training Loss: {}, Validation Loss: {}".format(epoch, train_loss, valid_loss))

        is_best = valid_loss < valid_loss_min
        if is_best:
            valid_loss_min = valid_loss

        # save model
        checkpoint = {
            'epoch': epoch + 1,  # epoch to resume from
            'val_loss_min': valid_loss_min,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }

        # One call writes the checkpoint and, when improved, the best copy.
        save_ckp(checkpoint, is_best, checkpoint_path, best_model_path)

    return model