In [1]:
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import jamotools
import re
import gluonnlp as nlp
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from gluonnlp.data import SentencepieceTokenizer
from kobert.utils import get_tokenizer
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
from transformers import AdamW
import gluonnlp as nlp
import numpy as np
import random

In [2]:
class CharDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, mode):
        self.sentences = [line[sent_idx] for line in dataset]
        self.labels = [int(line[label_idx]) for line in dataset]
        self.korean = re.compile('[^1!ㄱ-ㅣ가-힣]+')
        self.mode = mode
        tok_path = '/home/yeonsik/kobert/kobert_news_wiki_ko_cased-1087f8699e.spiece'
        self.sp = SentencepieceTokenizer(tok_path)
        self.vocab = self.make_vocab()
        self.vocab_size = len(self.vocab)
        self.q3 = self.get_q3()
        self.char2idx = {u:i for i, u in enumerate(self.vocab)}
        self.idx2char = {i:u for i, u in enumerate(self.vocab)}
        self.max_len = self.find_max_len()
        
    def __getitem__(self, i):
        return (self.preprocess_sentence(self.sentences[i]), torch.tensor(self.labels[i]).to(torch.float32))
    
    def __len__(self):
        return len(self.labels)
    
    def make_vocab(self):
        vocab = ''
        for sentence in self.sentences:
            vocab+=' '+sentence
        vocab = self.make_token(vocab)
        vocab = set(vocab)
        vocab = sorted(vocab)
        vocab.append('<UNK>') #######
        vocab.append('<PAD>')
        return vocab
    
    def make_token(self, sentence):
        if self.mode == 'jamo':
            chars = self.korean.sub('', jamotools.split_syllables(sentence))
            return list(chars)
        elif self.mode == 'char':
            chars = self.korean.sub('', sentence)
            return list(chars)
        elif self.mode == 'sentencepiece':
            return self.sp(sentence)
        
    def preprocess_sentence(self, sentence):
        chars = self.make_token(sentence)
        if len(chars) < self.q3:
            need_pad = self.q3 - len(chars)
            chars.extend(['<PAD>']*need_pad)
        else:
            chars = chars[:self.q3]
        chars = torch.tensor([self.char2idx[x] for x in chars]).to(torch.int64)
        return chars
    
    def find_max_len(self):
        return max(len(self.make_token(item)) for item in self.sentences)
    
    def find_max_idx(self):
        return self.sentences[np.argmax([len(self.make_token(item)) for item in self.sentences])]

    
    def get_q3(self):
        values = np.array([len(self.make_token(x)) for x in self.sentences])
        return int(np.quantile(values, 0.75))
    
    
    def plot_len(self):
        values = np.array([len(self.make_token(x)) for x in self.sentences])
        plt.hist(values, density=True, bins=80)
        plt.ylabel('count')
        plt.xlabel('length of sequence')
        plt.show()
        print('문장 최대 길이 :',self.max_len)
        results = stats.describe(values)
        print('min={}, max={}, mean={}, Q2={} Q3={}'.format(results[1][0], results[1][1], results[2],
                                                          np.median(values), np.quantile(values, 0.75)))

In [3]:
class Conv1d(nn.Module):
    def __init__(self, vocab_size, seq_len, filters, embedding_dim, num_of_kernel):
        super().__init__()
        self.filters=filters
        self.dropout_prob = 0.5
        self.embedding_dim = embedding_dim
        self.num_of_kernel = num_of_kernel
        self.embedding = nn.Embedding(vocab_size, self.embedding_dim)
        self.bn1 = nn.BatchNorm1d(self.num_of_kernel)
        for i in range(len(self.filters)):
            conv = nn.Conv1d(in_channels=self.embedding_dim, out_channels=self.num_of_kernel, kernel_size=self.filters[i])
            setattr(self, f'conv_{i}', conv)
        
        self.dropout = nn.Dropout(p=self.dropout_prob)
        self.bn2 = nn.BatchNorm1d(self.num_of_kernel*len(self.filters))
        self.classifier = nn.Sequential(
            nn.Linear(self.num_of_kernel*len(self.filters), 1),
            nn.Sigmoid()
        )
        
    def get_conv(self, i):
        return getattr(self, f'conv_{i}')
    
    def forward(self, inp):
        x = self.embedding(inp)
        x = x.permute(0, 2, 1) ### embedding 을 transpose해줘야함.안하면 1d-conv이 seq 방향이 아닌, 임베딩 방향으로 진행됨.
        conv_results = [
            F.relu(self.bn1(self.get_conv(i)(x))).permute(0,2,1).max(1)[0]
        for i in range(len(self.filters))]
        x = torch.cat(conv_results, 1)
        x = self.classifier(x)
        x = x.squeeze()
        
        return x

In [4]:
class GRU(nn.Module):
    def __init__(self, vocab_size, seq_len, embedding_dim, hidden_size, n_layers):
        super().__init__()
        self.dropout_prob = 0.5
        self.embedding_dim = embedding_dim
        self.bidirectional = 0
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, self.embedding_dim)
        self.rnn = nn.GRU(self.embedding_dim, self.hidden_size, num_layers=self.n_layers, dropout=self.dropout_prob, batch_first=True)
        self.classifier = nn.Sequential(
            nn.Linear(self.hidden_size, 1),
            #nn.Linear(hidden_size*(self.bidirectional+1)*self.n_layers, 1),
            nn.Sigmoid()
        )
        
    
    def forward(self, inp):
        x = self.embedding(inp)
        h_0 = self._init_state(batch_size=x.size(0))
        x, _ = self.rnn(x, h_0)
        h_t = x[:,-1,:]
        output = self.classifier(h_t)
        return output
    
    def _init_state(self, batch_size=1):
        weight = next(self.parameters()).data
        return weight.new(self.n_layers, batch_size, self.hidden_size).zero_()

In [5]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))
    
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=1,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Sequential(
          nn.Linear(hidden_size , num_classes),
          nn.Sigmoid()
        )
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device), return_dict=False)
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

In [30]:
device = torch.device('cuda')
mode = ['char', 'jamo', 'sentencepiece']

dataset_train = nlp.data.TSVDataset('./data/train_hate_dataset_v2.txt')
data_train_char = CharDataset(dataset_train, 0, 1, mode=mode[0])
data_train_jamo = CharDataset(dataset_train, 0, 1, mode=mode[1])
data_train_sentencepiece = CharDataset(dataset_train, 0, 1, mode=mode[2])

#Conv1d
filters = [4,5,6]
embedding_dim = 1000
num_of_kernel = 100

#GRU
embedding_dim_GRU = 750
hidden_size = 100
n_layers = 2

#Conv1d
conv_char = Conv1d(data_train_char.vocab_size, data_train_char.get_q3(), filters, embedding_dim, num_of_kernel).to(device)
conv_char.load_state_dict(torch.load('./check_point/1dcnn_char.pt'))
conv_char.eval()
conv_jamo = Conv1d(data_train_jamo.vocab_size, data_train_jamo.get_q3(), filters, embedding_dim, num_of_kernel).to(device)
conv_jamo.load_state_dict(torch.load('./check_point/1dcnn_jamo.pt'))
conv_jamo.eval()
conv_sentencepiece = Conv1d(data_train_sentencepiece.vocab_size, data_train_sentencepiece.get_q3(), filters, embedding_dim, num_of_kernel).to(device)
conv_sentencepiece.load_state_dict(torch.load('./check_point/1dcnn_sentencepiece.pt'))
conv_sentencepiece.eval()

#GRU
GRU_char = GRU(data_train_char.vocab_size, data_train_char.get_q3(), embedding_dim_GRU, hidden_size, n_layers).to(device)
GRU_char.load_state_dict(torch.load('./check_point/gru_char.pt'))
GRU_char.eval()
GRU_jamo = GRU(data_train_jamo.vocab_size, data_train_jamo.get_q3(), embedding_dim_GRU, hidden_size, n_layers).to(device)
GRU_jamo.load_state_dict(torch.load('./check_point/gru_jamo.pt'))
GRU_jamo.eval()
GRU_sentencepiece = GRU(data_train_sentencepiece.vocab_size, data_train_sentencepiece.get_q3(), embedding_dim_GRU, hidden_size, n_layers).to(device)
GRU_sentencepiece.load_state_dict(torch.load('./check_point/gru_sentencepiece.pt'))
GRU_sentencepiece.eval()

#KoBERT
max_len = 32
bertmodel, vocab = get_pytorch_kobert_model()
dataset_train = nlp.data.TSVDataset('./data/train_hate_dataset_v2.txt')
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)
data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)
kobert_tokenizer = nlp.data.BERTSentenceTransform(
            tok, max_seq_length=max_len, pad=True, pair=False)
koBERT = BERTClassifier(bertmodel,  dr_rate=0.5)
model_dict = koBERT.state_dict()
checkpoint = torch.load("./check_point/koBERT_state.pt")
convert_keys = {}
for k, v in checkpoint.items():
    new_key_name = k.replace("module.", '')
    if new_key_name not in model_dict:
        print("{} is not int model_dict".format(new_key_name))
        continue
    convert_keys[new_key_name] = v
koBERT.load_state_dict(convert_keys)
koBERT.to(device)

using cached model
using cached model
using cached model


BERTClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True

In [70]:
models = {'GRU_jamo':GRU_jamo,
          'GRU_char':GRU_char,
          'GRU_sentencepiece':GRU_sentencepiece,
          'conv_jamo':conv_jamo,
          'conv_char':conv_char,
          'conv_sentencepiece':conv_sentencepiece}

def inference_by_all_model(sentence, models):
    for model_name, model in models.items():
        mode = model_name.split('_')[-1]
        inference(sentence, model_name, model, mode)

def inference(sentence, model_name, model, mode):
    with torch.no_grad():
        korean = re.compile('[^1!ㄱ-ㅣ가-힣]+')
        if mode == 'char':
            chars = data_train_char.make_token(sentence)
            q3 = data_train_char.get_q3()
            char2idx = data_train_char.char2idx
            idx2char = data_train_char.idx2char
        elif mode == 'jamo':
            chars = data_train_jamo.make_token(sentence)
            q3 = data_train_jamo.get_q3()
            char2idx = data_train_jamo.char2idx
            idx2char = data_train_jamo.idx2char
        elif mode == 'sentencepiece':
            chars = data_train_sentencepiece.make_token(sentence)
            q3 = data_train_sentencepiece.get_q3()
            char2idx = data_train_sentencepiece.char2idx
            idx2char = data_train_sentencepiece.idx2char
        if len(chars) < q3:
            need_pad = q3 - len(chars)
            chars.extend(['<PAD>']*need_pad)
        else:
            chars = chars[:q3]
        chars = torch.tensor([char2idx[x] for x in chars]).to(torch.int64)
        #print(chars)
        chars = chars.to(device)
        chars = torch.unsqueeze(chars, 0)
        prt = [idx2char[x.item()] for x in chars.squeeze()]
        #print(prt)
        #print(chars)
        outputs = model(chars)
        pred = int(outputs>0.5)
        if pred:
            pred = '욕설'
        else:
            pred = '정상'
            

        print('model : {} {} {:.3f}'.format(model_name, pred, outputs.item()))

# Inference

In [99]:
import warnings
warnings.filterwarnings(action='ignore')

sentences = ['시1발새끼야']

for sentence in sentences:
    print('sentence :', sentence)
    model_name = 'KoBERT'
    tokenized = tok(sentence)
    tokenized.insert(0, '[CLS]')
    tokenized.append('[SEP]')
    token_ids = [vocab.token_to_idx[x] for x in tokenized]
    len_token_ids = len(token_ids)
    if len_token_ids < max_len:
        need_pad = max_len - len_token_ids
        token_ids.extend([3]*need_pad)
    else:
        token_ids = token_ids[:max_len]
        token_ids[-1] = 3
    token_ids = torch.tensor(token_ids)

    _, valid_length, segment_ids = kobert_tokenizer(sentence)
    token_ids = torch.tensor(token_ids).long().unsqueeze(0).to(device)
    segment_ids = torch.tensor(segment_ids).long().unsqueeze(0).to(device)
    valid_length = torch.tensor(valid_length).unsqueeze(0).to(device)
    koBERT.eval()
    out = koBERT(token_ids, valid_length, segment_ids)
    pred = int(out>0.5)
    if pred:
        pred = '욕설'
    else:
        pred = '정상'
    print('model : {} {} {:.3f}'.format(model_name, pred, out.item()))

    inference_by_all_model(sentence, models)
    print()

sentence : 시1발새끼야
model : KoBERT 정상 0.026
model : GRU_jamo 욕설 0.987
model : GRU_char 욕설 0.999
model : GRU_sentencepiece 욕설 0.999
model : conv_jamo 욕설 0.560
model : conv_char 욕설 0.729
model : conv_sentencepiece 욕설 0.967



# 토크나이저 비교

In [27]:
print(data_train_jamo.make_token('ㄱ부터 ㄴ까지'))
print(data_train_char.make_token('ㄱ부터 ㄴ까지'))
print(data_train_sentencepiece.make_token('ㄱ부터 ㄴ까지'))

['ㄱ', 'ㅂ', 'ㅜ', 'ㅌ', 'ㅓ', 'ㄴ', 'ㄲ', 'ㅏ', 'ㅈ', 'ㅣ']
['ㄱ', '부', '터', 'ㄴ', '까', '지']
['▁', 'ᄀ', '부터', '▁', 'ᄂ', '까지']
