In [219]:
import collections
import numpy as np
import pandas as pd
import re
from argparse import Namespace
from collections import Counter
import json
import string
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import tqdm
import torch.nn.utils.rnn as rnn_utils

In [220]:
args = Namespace(
    comment_csv="data/preprocessed_train_data.csv",
    train_proportion=0.7,
    val_proportion=0.15,
    test_proportion=0.15,
    # 날짜와 경로 정보
    vectorizer_file="vectorizer.json",
    model_state_file="model.pth",
    save_dir="model_storage/model3_LSTM",
    # 모델 하이퍼파라미터
    embedding_size=32,
    rnn_hidden_size=32,
    max_sequence_length=1309,
    # 훈련 하이퍼파라미터
    seed=1337,
    learning_rate=0.001,
    batch_size=128,
    num_epochs=100,
    early_stopping_criteria=10,
    # 실행 옵션
    cuda=True,
    expand_filepaths_to_save_dir=True
)

np.random.seed(args.seed)

In [221]:
class CommentDataset(Dataset):
    def __init__(self, comment_df, vectorizer):
        self.comment_df = comment_df
        self._vectorizer = vectorizer

        self._max_seq_length = args.max_sequence_length

        self.train_df = self.comment_df[self.comment_df.split=='train']
        self.train_size = len(self.train_df)

        self.val_df = self.comment_df[self.comment_df.split=='val']
        self.validation_size = len(self.val_df)

        self.test_df = self.comment_df[self.comment_df.split=='test']
        self.test_size = len(self.test_df)

        self._lookup_dict = {'train': (self.train_df, self.train_size), 
                             'val': (self.val_df, self.validation_size), 
                             'test': (self.test_df, self.test_size)}

        self.set_split('train')
        
    @classmethod
    def load_dataset_and_make_vectorizer(cls, comment_csv):  
        comment_df = pd.read_csv(comment_csv)
        return cls(comment_df, CommentVectorizer.from_dataframe(comment_df))
        
    @classmethod
    def load_dataset_and_load_vectorizer(cls, comment_csv, vectorizer_filepath):
        comment_df = pd.read_csv(comment_csv)
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(comment_df, vectorizer)

    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        with open(vectorizer_filepath) as fp:
            return CommentVectorizer.from_serializable(json.load(fp))

    def save_vectorizer(self, vectorizer_filepath):
        with open(vectorizer_filepath, "w") as fp:
            json.dump(self._vectorizer.to_serializable(), fp)

    def get_vectorizer(self):
        return self._vectorizer

    def set_split(self, split="train"):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        row = self._target_df.iloc[index]
        comment_vector, vector_length = self._vectorizer.vectorize(row.comment, self._max_seq_length)
        toxicity_label = row.toxicity

        return {'x_data': comment_vector, 'y_target': toxicity_label, 'x_length': vector_length}

    def get_num_batches(self, batch_size):
        return len(self) // batch_size

In [222]:
class Vocabulary(object):

    def __init__(self, token_to_idx=None, add_unk=True, unk_token="<UNK>", mask_token="<MASK>"):
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx
        self._idx_to_token = {idx: token for token, idx in self._token_to_idx.items()}

        self._unk_token = unk_token
        if add_unk:
            self.unk_index = self.add_token(unk_token)
        self._mask_token = mask_token
        self.mask_index = self.add_token(self._mask_token)
        
    def to_serializable(self):
        return {'token_to_idx': self._token_to_idx}

    @classmethod
    def from_serializable(cls, contents):
        return cls(**contents)

    def add_token(self, token):
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index
            
    def add_many(self, tokens):
        return [self.add_token(token) for token in tokens]

    def lookup_token(self, token):
        return self._token_to_idx.get(token, self.unk_index)

    def lookup_index(self, index):
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]

    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

    def __len__(self):
        return len(self._token_to_idx)

In [223]:
class SequenceVocabulary(Vocabulary):
    def __init__(self, token_to_idx=None, unk_token="<UNK>",
                 mask_token="<MASK>", begin_seq_token="<BEGIN>",
                 end_seq_token="<END>"):

        super(SequenceVocabulary, self).__init__(token_to_idx)

        self._mask_token = mask_token
        self._unk_token = unk_token
        self._begin_seq_token = begin_seq_token
        self._end_seq_token = end_seq_token

        self.mask_index = self.add_token(self._mask_token)
        self.unk_index = self.add_token(self._unk_token)
        self.begin_seq_index = self.add_token(self._begin_seq_token)
        self.end_seq_index = self.add_token(self._end_seq_token)

    def to_serializable(self):
        contents = super(SequenceVocabulary, self).to_serializable()
        contents.update({'unk_token': self._unk_token,
                         'mask_token': self._mask_token,
                         'begin_seq_token': self._begin_seq_token,
                         'end_seq_token': self._end_seq_token})
        return contents

    def lookup_token(self, token):
        """ 토큰에 대응하는 인덱스를 추출합니다.
        토큰이 없으면 UNK 인덱스를 반환합니다.
        
        매개변수:
            token (str): 찾을 토큰 
        반환값:
            index (int): 토큰에 해당하는 인덱스
        노트:
            UNK 토큰을 사용하려면 (Vocabulary에 추가하기 위해)
            `unk_index`가 0보다 커야 합니다.
        """
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

### `Vectorizer`

In [224]:
class CommentVectorizer(object):
    def __init__(self, token_vocab, label_vocab):
        self.token_vocab = token_vocab
        self.label_vocab = label_vocab

    def vectorize(self, comment, max_seq_length):
        indices = [self.token_vocab.lookup_token(token) for token in comment.split(" ")]
        
        if len(indices) > max_seq_length:
            indices = indices[:max_seq_length] 
        elif len(indices) < max_seq_length:
            indices += [self.token_vocab.mask_index] * (max_seq_length - len(indices)) 

        return np.array(indices), len(indices)

    @classmethod
    def from_dataframe(cls, comment_df):
        token_vocab = Vocabulary(add_unk=True)
        label_vocab = Vocabulary(add_unk=False)

        for index, row in comment_df.iterrows():
            for token in row.comment.split(" "):
                token_vocab.add_token(token)
            label_vocab.add_token(row.toxicity)

        return cls(token_vocab, label_vocab)

    @classmethod
    def from_serializable(cls, contents):
        token_vocab = Vocabulary.from_serializable(contents['token_vocab'])
        label_vocab = Vocabulary.from_serializable(contents['label_vocab'])

        return cls(token_vocab=token_vocab, label_vocab=label_vocab)

    def to_serializable(self):
        return {'token_vocab': self.token_vocab.to_serializable(), 
                'label_vocab': self.label_vocab.to_serializable()}

### `DataLoader`

In [225]:
def generate_batches(dataset, batch_size, shuffle=True, drop_last=True, device="cpu"):
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            if name == 'x_data':
                tensor = tensor[:, :args.max_sequence_length]
                if tensor.size(1) < args.max_sequence_length:
                    tensor = F.pad(tensor, pad=(0, args.max_sequence_length - tensor.size(1)), 
                                   mode='constant', value=dataset._vectorizer.token_vocab.mask_index)
                out_data_dict[name] = tensor.to(device)
            else:
                out_data_dict[name] = tensor.to(device)
        x_lengths = torch.sum(out_data_dict['x_data'] != dataset._vectorizer.token_vocab.mask_index, dim=1)
        yield out_data_dict, x_lengths

## 모델

In [226]:
class CommentClassificationModel(nn.Module):
    def __init__(self, embedding_size, vocab_size, rnn_hidden_size, 
                 batch_first=True, padding_idx=0, dropout_p=0.5):

        super(CommentClassificationModel, self).__init__()
        
        self.word_emb = nn.Embedding(num_embeddings=vocab_size,
                                     embedding_dim=embedding_size,
                                     padding_idx=padding_idx)
        self.rnn = nn.LSTM(input_size=embedding_size,
                           hidden_size=rnn_hidden_size,
                           batch_first=batch_first)
        self.fc = nn.Linear(in_features=rnn_hidden_size,
                            out_features=1)
        self._dropout_p = dropout_p

    def forward(self, x_in, x_lengths, apply_sigmoid=False):
        x_embedded = self.word_emb(x_in)
        x_packed = rnn_utils.pack_padded_sequence(x_embedded, x_lengths.cpu(), 
                                                  batch_first=True, enforce_sorted=False)

        packed_out, (hidden, cell) = self.rnn(x_packed)
        y_out, _ = rnn_utils.pad_packed_sequence(packed_out, batch_first=True)

        y_out = y_out[range(len(x_lengths)), x_lengths - 1, :]

        y_out = self.fc(F.dropout(y_out, p=self._dropout_p))

        if apply_sigmoid:
            y_out = torch.sigmoid(y_out)

        return y_out.squeeze()


In [227]:
def set_seed_everywhere(seed, cuda):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)

def handle_dirs(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

In [228]:
if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir, 'vectorizer.json')

    args.model_state_file = os.path.join(args.save_dir,'model.pth')
    
# 재현성을 위해 시드 설정
set_seed_everywhere(args.seed, args.cuda)

# 디렉토리 처리
handle_dirs(args.save_dir)

In [229]:
def make_train_state(args):
    return {'stop_early': False,
            'early_stopping_step': 0,
            'early_stopping_best_val': 1e8,
            'learning_rate': args.learning_rate,
            'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': [],
            'test_loss': -1,
            'test_acc': -1,
            'model_filename': args.model_state_file}

def update_train_state(args, model, train_state):

    # 적어도 한 번 모델을 저장합니다
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(), train_state['model_filename'])
        train_state['stop_early'] = False

    # 성능이 향상되면 모델을 저장합니다
    elif train_state['epoch_index'] >= 1:
        loss_tm1, loss_t = train_state['val_loss'][-2:]
         
        # 손실이 나빠지면
        if loss_t >= loss_tm1:
            # 조기 종료 단계 업데이트
            train_state['early_stopping_step'] += 1
        # 손실이 감소하면
        else:
            # 최상의 모델 저장
            if loss_t < train_state['early_stopping_best_val']:
                torch.save(model.state_dict(), train_state['model_filename'])
                train_state['early_stopping_best_val'] = loss_t

            # 조기 종료 단계 재설정
            train_state['early_stopping_step'] = 0

        # 조기 종료 여부 확인
        train_state['stop_early'] = \
            train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state

def normalize_sizes(y_pred, y_true):
    
    if len(y_pred.size()) == 3:
        y_pred = y_pred.contiguous().view(-1, y_pred.size(2))
    if len(y_true.size()) == 2:
        y_true = y_true.contiguous().view(-1)
    return y_pred, y_true

def compute_accuracy(y_pred, y_true):
    y_pred = torch.sigmoid(y_pred)
    predictions = y_pred > 0.5
    correct = (predictions == y_true).float()
    accuracy = correct.sum() / len(correct)
    return accuracy * 100

def binary_classification_loss(y_pred, y_true):
    return F.binary_cross_entropy_with_logits(y_pred, y_true.float())

In [230]:
# CUDA 체크
if not torch.cuda.is_available():
    args.cuda = False

args.device = torch.device("cuda" if args.cuda else "cpu")

# 데이터셋과 Vectorizer
dataset = CommentDataset.load_dataset_and_make_vectorizer(args.comment_csv)
dataset.save_vectorizer(args.vectorizer_file)
vectorizer = dataset.get_vectorizer()

# 모델
model = CommentClassificationModel(embedding_size=args.embedding_size,
                                   vocab_size=len(vectorizer.token_vocab),
                                   rnn_hidden_size=args.rnn_hidden_size,
                                   padding_idx=vectorizer.token_vocab.mask_index)
model = model.to(args.device)

# 손실 함수와 옵티마이저
optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                 mode='min', factor=0.5,
                                                 patience=1)
train_state = make_train_state(args)

## 훈련 반복

In [231]:
epoch_bar = tqdm.notebook.tqdm(desc='training routine', 
                               total=args.num_epochs,
                               position=0)

dataset.set_split('train')
train_bar = tqdm.notebook.tqdm(desc='split=train',
                               total=dataset.get_num_batches(args.batch_size), 
                               position=1, 
                               leave=True)
dataset.set_split('val')
val_bar = tqdm.notebook.tqdm(desc='split=val',
                             total=dataset.get_num_batches(args.batch_size), 
                             position=1, 
                             leave=True)

for epoch_index in range(args.num_epochs):
    train_state['epoch_index'] = epoch_index

    # 훈련 세트에 대한 순회
    dataset.set_split('train')
    batch_generator = generate_batches(dataset, 
                                       batch_size=args.batch_size, 
                                       device=args.device)
    running_loss = 0.0
    running_acc = 0.0
    model.train()

    for batch_index, (batch_dict, x_lengths) in enumerate(batch_generator):
        # 훈련 과정은 5단계로 이루어집니다

        # --------------------------------------
        # 단계 1. 그레이디언트를 0으로 초기화합니다
        optimizer.zero_grad()

        # 단계 2. 출력을 계산합니다
        y_pred = model(x_in=batch_dict['x_data'], x_lengths=x_lengths)

        # 단계 3. 손실을 계산합니다
        loss = binary_classification_loss(y_pred, batch_dict['y_target'])
        acc_t = compute_accuracy(y_pred, batch_dict['y_target'])

        # 단계 4. 손실을 사용해 그레이디언트를 계산합니다
        loss.backward()

        # 단계 5. 옵티마이저로 가중치를 업데이트합니다
        optimizer.step()
        # -----------------------------------------

        # 이동 손실과 이동 정확도를 계산합니다
        running_loss += (loss.item() - running_loss) / (batch_index + 1)
        running_acc += (acc_t - running_acc) / (batch_index + 1)

        # 진행 상태 막대 업데이트
        train_bar.set_postfix(loss=running_loss,
                              acc=running_acc,
                              epoch=epoch_index)
        train_bar.update()

    train_state['train_loss'].append(running_loss)
    train_state['train_acc'].append(running_acc)

    # 검증 세트에 대한 순회

    # 검증 세트와 배치 제너레이터 준비, 손실과 정확도를 0으로 설정
    dataset.set_split('val')
    batch_generator = generate_batches(dataset, 
                                       batch_size=args.batch_size, 
                                       device=args.device)
    running_loss = 0.
    running_acc = 0.
    model.eval()

    for batch_index, (batch_dict, x_lengths) in enumerate(batch_generator):
        # 단계 1. 출력을 계산합니다
        y_pred = model(x_in=batch_dict['x_data'], x_lengths=x_lengths)

        # 단계 2. 손실을 계산합니다
        loss = binary_classification_loss(y_pred, batch_dict['y_target'])
        acc_t = compute_accuracy(y_pred, batch_dict['y_target'])

        # 단계 3. 이동 손실과 이동 정확도를 계산합니다
        running_loss += (loss.item() - running_loss) / (batch_index + 1)
        running_acc += (acc_t - running_acc) / (batch_index + 1)

        # 진행 상태 막대 업데이트
        val_bar.set_postfix(loss=running_loss, acc=running_acc, 
                        epoch=epoch_index)
        val_bar.update()

    train_state['val_loss'].append(running_loss)
    train_state['val_acc'].append(running_acc)

    train_state = update_train_state(args=args, model=model, 
                                     train_state=train_state)

    scheduler.step(train_state['val_loss'][-1])

    if train_state['stop_early']:
        break

    train_bar.n = 0
    val_bar.n = 0
    epoch_bar.update()

training routine:   0%|          | 0/100 [00:00<?, ?it/s]

split=train:   0%|          | 0/436 [00:00<?, ?it/s]

split=val:   0%|          | 0/93 [00:00<?, ?it/s]

## 테스트 세트 평가

In [232]:
model.load_state_dict(torch.load(train_state['model_filename']))
model = model.to(args.device)

dataset.set_split('test')
batch_generator = generate_batches(dataset, batch_size=args.batch_size, device=args.device)
running_loss = 0.0
running_acc = 0.0
model.eval()

for batch_index, (batch_dict, x_lengths) in enumerate(batch_generator):
    x_data = batch_dict['x_data']
    y_target = batch_dict['y_target']
    
    # 모델 실행
    y_pred = model(x_in=x_data, x_lengths=x_lengths)

    # 손실 및 정확도 계산
    loss = binary_classification_loss(y_pred, y_target)
    acc_t = compute_accuracy(y_pred, y_target)

    # 이동 평균 손실 및 정확도 업데이트
    running_loss += (loss.item() - running_loss) / (batch_index + 1)
    running_acc += (acc_t - running_acc) / (batch_index + 1)

# 최종 손실 및 정확도 저장
train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

In [235]:
print("테스트 손실: {}".format(train_state['test_loss']))
print("테스트 정확도: {}".format(train_state['test_acc']))

테스트 손실: 0.1810012078413399
테스트 정확도: 94.2540283203125


## 추론

In [234]:
test_data_df = pd.read_csv('data/test_for_inference_preprocessed.csv')
test_data_df['comment'] = test_data_df['comment'].astype(str)

vectorizer = dataset.get_vectorizer()
model = CommentClassificationModel(embedding_size=args.embedding_size, 
                                   vocab_size=len(vectorizer.token_vocab), 
                                   rnn_hidden_size=args.rnn_hidden_size)
model.load_state_dict(torch.load(train_state['model_filename']))
model.eval()
model = model.to(args.device)

def predict_proba(model, vectorizer, comment, device, max_sequence_length):
    comment_vector, vector_length = vectorizer.vectorize(comment, max_sequence_length)
    vectorized_comment = torch.tensor(comment_vector, dtype=torch.long).unsqueeze(0).to(device)
    length_tensor = torch.tensor([vector_length], dtype=torch.long).to(device)

    result = model(vectorized_comment, length_tensor)
    probability = torch.sigmoid(result).detach().cpu().item()
    return probability

results = []
for comment in test_data_df['comment']:
    probability = predict_proba(model, vectorizer, comment, args.device, args.max_sequence_length)
    prediction = 1 if probability > 0.5 else 0
    results.append({'probability': probability, 'pred': prediction})

results_df = pd.DataFrame(results)
results_df.to_csv('result/inferenced_by_LSTM.csv', index=False)