In [65]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np
from tqdm import tqdm

pd.set_option('display.max_colwidth', None)

In [38]:
# load data
train = pd.read_csv('./data/ko_train_label.csv')
test = pd.read_csv('./data/ko_test_label.csv')

In [39]:
test.drop(columns=['Unnamed: 7'], inplace=True)
test.dropna(inplace=True)

In [40]:
train.columns

Index(['id', 'document', 'toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [41]:
train_melted = pd.melt(train, id_vars=['id', 'document' ], value_vars=['toxic', 'obscene', 'threat', 'insult', 'identity_hate'])

In [42]:
train_melted = train_melted[train_melted['value'] == 1]
train_melted.drop(columns=['value'], inplace=True)

In [32]:
train_melted.groupby(['variable'])['document'].count()

variable
identity_hate     261
insult            225
obscene            95
threat             33
toxic            9969
Name: document, dtype: int64

In [44]:
# count duplicate rows
train_melted[train_melted.duplicated(subset=['id', 'document'], keep=False)].groupby(['id', 'document']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,variable
id,document,Unnamed: 2_level_1
89430,차라리 김현수의 카섹씬이나 길게 해주지(끝까지) 왜 보여주다가 그만둬?,2
131249,이글을 보자 마자 끄면 이명박이가 죽습니다.. 이글을 싸이트 5곳에 안올,2
150731,일본영화 특유의 지루함... 그러나 ...,2
156881,사람들 옷벗기고 흐느적거리게 하면 다 예술인가.,2
157858,많이 실망..,3
...,...,...
10254965,"""연기자들의 명연기와 조수미와 김경호의 """"나 가거든""""의 MV 때문에 역사에 남을 국썅이 마치 나라를 사랑하고 청렴했으며 순수했던 한 여인으로 비춰진 게 씁쓸하다. 민비는 대한민국 역사에서 손꼽는 쓰레기이건만...""",3
10261749,개봉당시 안땡기는데 여친이 보자고해서 봤는데 이민기만 불쌍한 영화.. 김고은 연기는 최악,3
10270444,"감독아,안녕하시냐?",2
10274929,시키가 패기도 못쓰는 루피한테 진것도 씹밸붕이고 양복입고 총쏘는씬 진짜 뭔생각으로만든거지;;원피스랑 어울린다고 생각하나 초딩들이랑 여자들용 극장판 끝,4


In [45]:
train_melted.loc[train_melted['document'] == '차라리 김현수의 카섹씬이나 길게 해주지(끝까지) 왜 보여주다가 그만둬?']

Unnamed: 0,id,document,variable
8070,89430,차라리 김현수의 카섹씬이나 길게 해주지(끝까지) 왜 보여주다가 그만둬?,toxic
18069,89430,차라리 김현수의 카섹씬이나 길게 해주지(끝까지) 왜 보여주다가 그만둬?,obscene


In [46]:
# label map dict
label_map = {
    'toxic': 0,
    'obscene': 1,
    'threat': 2,
    'insult': 3,
    'identity_hate': 4
}

# mapping
train_melted['label'] = train_melted['variable'].map(label_map)

In [49]:
train_melted.loc[train_melted['document'] == '차라리 김현수의 카섹씬이나 길게 해주지(끝까지) 왜 보여주다가 그만둬?']

Unnamed: 0,id,document,label
107,89430,차라리 김현수의 카섹씬이나 길게 해주지(끝까지) 왜 보여주다가 그만둬?,"[0, 1]"


In [48]:
# make same document to one row
train_melted = train_melted.groupby(['id', 'document'])['label'].apply(list).reset_index()

In [50]:
len(train_melted)

9970

In [73]:
# Dataset Class

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
labels = {'toxic': 0, 'obscene': 1, 'threat': 2, 'insult': 3, 'identity_hate': 4}

class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.labels = [labels[label] for label in df['label']]
        self.texts = [tokenizer(text, 
                            padding='max_length', max_length = 512, truncation=True,
                            return_tensors="pt") for text in df['document']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y


In [85]:
np.random.seed(112)
df_train, df_val, df_test = np.split(train_melted.sample(frac=1, random_state=112), [int(.8*len(train_melted)), int(.9*len(train_melted))])

print(len(df_train), len(df_val), len(df_test))

7976 997 997


In [70]:
# model building
class BertClassifier(nn.Module):
    def __init__(self, dropout=0.5):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-multilingual-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 5)
        self.relu = nn.ReLU()
        
    def forward(self, input_id, mask):
        _, pooled_output = self.bert(input_id, attention_mask=mask)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        
        return self.relu(linear_output)

In [74]:
# Training Loop
def train(model, train_data, val_data, learning_rate, epochs):
    train, val = CustomDataset(train_data), CustomDataset(val_data)
    train_dataloader = DataLoader(train, batch_size=8, shuffle=True)
    val_dataloader = DataLoader(val, batch_size=8)
    
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    
    criterion = nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    
    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()
        
    for epoch_num in range(epochs):
        
        total_acc_train = 0
        total_loss_train = 0
        
        for train_input, train_label in tqdm(train_dataloader):
            train_label = train_label.to(device)
            mask = train_input['attention_mask'].to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)
            
            output = model(input_id, mask)
            
            batch_loss = criterion(output, train_label)
            train_loss += batch_loss.item()
            
            acc = (output.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc
            
            model.zero_grad()
            batch_loss.backward()
            optimizer.step()
            
        total_acc_val = 0
        total_loss_val = 0
        
        with torch.no_grad():
            for val_input, val_label in val_dataloader:
                val_label = val_label.to(device)
                mask = val_input['attention_mask'].to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)
                
                output = model(input_id, mask)
                
                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item()
                
                acc = (output.argmax(dim=1) == val_label).sum().item()
                total_acc_val += acc
                
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')

In [86]:
EPOCHS = 5
model = BertClassifier()
LR = 1e-6

train(model, df_train, df_val, LR, EPOCHS)

KeyError: '0'

In [82]:
# label convert list to str
train_melted['label'] = train_melted['label'].apply(lambda x: ''.join(map(str, x)))

In [87]:
[labels[label] for label in df_train['label']]

KeyError: '0'

In [89]:
for label in df_train['label']:
    print(label)

0
0
0
0
0
0
0
0
0
0
0
0
0
0
03
0
0
0
04
0
0
0
0
0
0
0
0
0
0
024
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
013
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
034
0
0
0
0
0
0
0
02
0
0
01
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
034
0
0
0
0
0
0
0
0
0
0
0
0
0
0
03
0
0
0
0
0
0
0
0
0
0
024
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
04
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
04
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
01
0
04
0
0
0
0
0
0
0
0
01
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
04
0
0
0
0
0
0
0
04
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
03
0
04
0
0
034
0
0
0
04
03
0
0
0
0
0
0
0
0
0
0
0
0
04
0
034
0
0
0
0
0
0
0
0
0
0
0
0
0
04
0
04
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
034
0
0
0
0
0
0
034
0
0
0
0
0
0
0
0
0
0
03
0
0
0
0
0
0
0
0
0
0
0
0
0
013
0
0
0
0
0
0
0
0
0
0
0
0
0



0
034
0
0
0
0
0
0
0
0
0
034
0
0
0
0
0
0
0
0
0
0
0
0
0
034
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
04
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
024
0
0
0
034
0
0
0
034
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
034
0
0
0
0
0
0
0
0
04
0
0
0
0
0
0
03
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
03
0
0
0
0
0
0
0
03
0
0
0
0
0
0
0
0
0
04
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
034
0
0
0134
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
034
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
04
0
0
0
0
0
0
0
0
03
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
03
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
04
0
0
0
0
0
0
0
01
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
03
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
034
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


In [83]:
train_melted['label']

0       0
1       0
2       0
3       0
4       0
       ..
9965    0
9966    0
9967    0
9968    0
9969    0
Name: label, Length: 9970, dtype: object