In [203]:
from collections import defaultdict, Counter
import numpy as np
import string
import pandas as pd
import re
import os
import torch
from argparse import Namespace
import tqdm

In [204]:
args = Namespace(
    # 데이터 정보
    comment_csv="data/preprocessed_train_data.csv",
    proportion_subset_of_train=1.0,
    train_proportion=0.7,
    val_proportion=0.15,
    test_proportion=0.15,
    # 날짜와 경로 정보
    frequency_cutoff=25,
    model_state_file='model.pth',
    save_dir='model_storage/model1_BOW',
    vectorizer_file='vectorizer.json',
    # 훈련 하이퍼파라미터
    batch_size=128,
    early_stopping_criteria=5,
    learning_rate=0.001,
    num_epochs=100,
    seed=1337,
    # 실행 옵션
    cuda=True,
    expand_filepaths_to_save_dir=True,
)

### `Dataset`

In [205]:
from torch.utils.data import Dataset

class CommentDataset(Dataset):
    def __init__(self, comment_df, vectorizer):
        self.comment_df = comment_df
        self._vectorizer = vectorizer
        
        self.train_df = self.comment_df[self.comment_df.split=='train']
        self.train_size = len(self.train_df)
        
        self.val_df = self.comment_df[self.comment_df.split=='val']
        self.validation_size = len(self.val_df)
        
        self.test_df = self.comment_df[self.comment_df.split=='test']
        self.test_size = len(self.test_df)
        
        self._lookup_dict = {'train': (self.train_df, self.train_size),
                             'val': (self.val_df, self.validation_size),
                             'test': (self.test_df, self.test_size)}
        
        self.set_split('train')
        
    @classmethod
    def load_dataset_and_make_vectorizer(cls, comment_csv):
        comment_df = pd.read_csv(comment_csv)
        train_comment_df = comment_df[comment_df.split=='train']
        return cls(comment_df, CommentVectorizer.from_dataframe(train_comment_df))
    
    @classmethod
    def load_dataset_and_load_vectorizer(cls, comment_csv, vectorizer_filepath):
        comment_df = pd.read_csv(comment_csv)
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(comment_df, vectorizer)

    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        with open(vectorizer_filepath) as fp:
            return CommentVectorizer.from_serializable(json.load(fp))

    def save_vectorizer(self, vectorizer_filepath):
        with open(vectorizer_filepath, "w") as fp:
            json.dump(self._vectorizer.to_serializable(), fp)

    def get_vectorizer(self):
        return self._vectorizer

    def set_split(self, split="train"):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        row = self._target_df.iloc[index]
        assert 'toxicity' in row, f"'toxicity' column not found in row. Available columns: {row.index.tolist()}"
        comment_vector = self._vectorizer.vectorize(row.comment)
        toxicity_index = self._vectorizer.toxicity_vocab.lookup_token(str(row.toxicity))
        
        return {'x_data': comment_vector,
                'y_target': toxicity_index}

    def get_num_batches(self, batch_size):
        return len(self) // batch_size


### `Vocabulary`

In [206]:
class Vocabulary(object):

    def __init__(self, token_to_idx=None, add_unk=True, unk_token="<UNK>"):

        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx

        self._idx_to_token = {idx: toekn for token, idx in self._token_to_idx.items()}
        
        self._add_unk = add_unk
        self._unk_token = unk_token
        
        self.unk_index = -1
        if add_unk:
            self.unk_index = self.add_token(unk_token) 
        
        
    def to_serializable(self):
        return {'token_to_idx': self._token_to_idx, 
                'add_unk': self._add_unk, 
                'unk_token': self._unk_token}

    @classmethod
    def from_serializable(cls, contents):
        return cls(**contents)

    def add_token(self, token):
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    def lookup_token(self, token):
        if self.unk_index >= 0:
            return self._token_to_idx.get(token,self.unk_index)
        else:
            return self._token_to_idx[token]

    def lookup_index(self, index):
        if index not in self._idx_to_token:
            raise KeyError("Vocabulary에 인덱스(%d)가 없습니다." % index)
        return self._idx_to_token[index]

    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

    def __len__(self):
        return len(self._token_to_idx)

### `Vectorizer`

In [207]:
class CommentVectorizer(object):
    def __init__(self, comment_vocab, toxicity_vocab):

        self.comment_vocab = comment_vocab
        self.toxicity_vocab = toxicity_vocab

    def vectorize(self, comment):

        one_hot = np.zeros(len(self.comment_vocab), dtype = np.float32)
        
        for token in comment.split(" "):
            if token not in string.punctuation:
                one_hot[self.comment_vocab.lookup_token(token)] = 1

        return one_hot

    @classmethod
    def from_dataframe(cls, comment_df, cutoff=25):
        comment_vocab = Vocabulary(add_unk=True)
        toxicity_vocab = Vocabulary(add_unk=False)
        
        # 점수를 추가합니다
        for toxicity in sorted(set(comment_df.toxicity)):
            toxicity_vocab.add_token(str(toxicity))

        # count > cutoff인 단어를 추가합니다
        word_counts = Counter()
        for comment in comment_df.comment:
            for word in comment.split(" "):
                if word not in string.punctuation:
                    word_counts[word] += 1
               
        for word, count in word_counts.items():
            if count > cutoff:
                comment_vocab.add_token(word)

        return cls(comment_vocab, toxicity_vocab)

    @classmethod
    def from_serializable(cls, contents):
        comment_vocab = Vocabulary.from_serializable(contents['comment_vocab'])
        toxicity_vocab =  Vocabulary.from_serializable(contents['toxicity_vocab'])

        return cls(comment_vocab=comment_vocab, toxicity_vocab=toxicity_vocab)

    def to_serializable(self):
        return {'comment_vocab': self.comment_vocab.to_serializable(),
                'toxicity_vocab': self.toxicity_vocab.to_serializable()}

### `DataLoader`

In [208]:
from torch.utils.data import DataLoader

def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"):
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

## 로지스틱 회귀 모델 기반 감성 분류기

In [209]:
import torch.nn as nn

class CommentClassifier(nn.Module):
    def __init__(self, num_features):
        super(CommentClassifier, self).__init__()
        self.fc1 = nn.Linear(in_features=num_features, out_features=1)

    def forward(self, x_in, apply_sigmoid=False):
        y_out = self.fc1(x_in).squeeze()
        if apply_sigmoid:
            y_out = torch.sigmoid(y_out)
        return y_out

## 설정

In [210]:
def set_seed_everywhere(seed, cuda):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)

def handle_dirs(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

In [211]:
if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir, 'vectorizer.json')

    args.model_state_file = os.path.join(args.save_dir,'model.pth')
    
# 재현성을 위해 시드 설정
set_seed_everywhere(args.seed, args.cuda)

# 디렉토리 처리
handle_dirs(args.save_dir)

### 헬퍼 함수

In [212]:
import torch.optim as optim

def make_train_state(args):
    return {'stop_early': False,
            'early_stopping_step': 0,
            'early_stopping_best_val': 1e8,
            'learning_rate': args.learning_rate,
            'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': [],
            'test_loss': -1,
            'test_acc': -1,
            'model_filename': args.model_state_file}

def update_train_state(args, model, train_state):
    # 적어도 한 번 모델을 저장합니다
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(), train_state['model_filename'])
        train_state['stop_early'] = False

    # 성능이 향상되면 모델을 저장합니다
    elif train_state['epoch_index'] >= 1:
        loss_tm1, loss_t = train_state['val_loss'][-2:]

        # 손실이 나빠지면
        if loss_t >= train_state['early_stopping_best_val']:
            # 조기 종료 단계 업데이트
            train_state['early_stopping_step'] += 1
        # 손실이 감소하면
        else:
            # 최상의 모델 저장
            if loss_t < train_state['early_stopping_best_val']:
                torch.save(model.state_dict(), train_state['model_filename'])

            # 조기 종료 단계 재설정
            train_state['early_stopping_step'] = 0

        # 조기 종료 여부 확인
        train_state['stop_early'] = \
            train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state

def compute_accuracy(y_pred, y_target):
    y_target = y_target.cpu()
    y_pred_indices = (torch.sigmoid(y_pred)>0.5).cpu().long()#.max(dim=1)[1]
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100

## 데이터셋, 모델, 손실, 옵티마이저, 훈련 상태 딕셔너리 만들기

In [213]:
# CUDA 체크
if not torch.cuda.is_available():
    args.cuda = False

args.device = torch.device("cuda" if args.cuda else "cpu")

# 데이터셋과 Vectorizer
dataset = CommentDataset.load_dataset_and_make_vectorizer(args.comment_csv)
vectorizer = dataset.get_vectorizer()

# 모델
classifier = CommentClassifier(num_features=len(vectorizer.comment_vocab))
classifier = classifier.to(args.device)

# 손실 함수와 옵티마이저
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,mode='min',factor=0.5,patience=1)

train_state = make_train_state(args)

In [214]:
epoch_bar = tqdm.notebook.tqdm(desc='training routine', 
                          total=args.num_epochs,
                          position=0)

dataset.set_split('train')
train_bar = tqdm.notebook.tqdm(desc='split=train',
                          total=dataset.get_num_batches(args.batch_size), 
                          position=1, 
                          leave=True)
dataset.set_split('val')
val_bar = tqdm.notebook.tqdm(desc='split=val',
                        total=dataset.get_num_batches(args.batch_size), 
                        position=1, 
                        leave=True)


for epoch_index in range(args.num_epochs):
    train_state['epoch_index'] = epoch_index
    dataset.set_split('train')
    batch_generator = generate_batches(dataset,batch_size=args.batch_size,device=args.device)
    running_loss = 0.0
    running_acc = 0.0
    classifier.train()

    for batch_index, batch_dict in enumerate(batch_generator):
        # 단계 1. 그레이디언트를 0으로 초기화합니다
        optimizer.zero_grad()

        # 단계 2. 출력을 계산합니다
        y_pred = classifier(x_in=batch_dict['x_data'].float())

        # 단계 3. 손실을 계산합니다
        loss = loss_func(y_pred, batch_dict['y_target'].float())
        loss_t = loss.item()
        running_loss += (loss_t - running_loss) / (batch_index + 1)

        # 단계 4. 손실을 사용해 그레이디언트를 계산합니다
        loss.backward()

        # 단계 5. 옵티마이저로 가중치를 업데이트합니다
        optimizer.step()
        # -----------------------------------------

        # 정확도를 계산합니다
        acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
        running_acc += (acc_t - running_acc) / (batch_index + 1)

        # 진행 바 업데이트
        train_bar.set_postfix(loss=running_loss, 
                              acc=running_acc, 
                              epoch=epoch_index)
        train_bar.update()

    train_state['train_loss'].append(running_loss)
    train_state['train_acc'].append(running_acc)

    # 검증 세트에 대한 순회

    # 검증 세트와 배치 제너레이터 준비, 손실과 정확도를 0으로 설정
    dataset.set_split('val')
    batch_generator = generate_batches(dataset, 
                                       batch_size=args.batch_size, 
                                       device=args.device)
    running_loss = 0.
    running_acc = 0.
    classifier.eval()

    for batch_index, batch_dict in enumerate(batch_generator):

        # 단계 1. 출력을 계산합니다
        y_pred = classifier(x_in=batch_dict['x_data'].float())

        # 단계 2. 손실을 계산합니다
        loss = loss_func(y_pred, batch_dict['y_target'].float())
        loss_t = loss.item()
        running_loss += (loss_t - running_loss) / (batch_index + 1)

        # 단계 3. 정확도를 계산합니다
        acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
        running_acc += (acc_t - running_acc) / (batch_index + 1)

        val_bar.set_postfix(loss=running_loss, 
                            acc=running_acc, 
                            epoch=epoch_index)
        val_bar.update()

    train_state['val_loss'].append(running_loss)
    train_state['val_acc'].append(running_acc)

    train_state = update_train_state(args=args, model=classifier,
                                     train_state=train_state)

    scheduler.step(train_state['val_loss'][-1])

    train_bar.n = 0
    val_bar.n = 0
    epoch_bar.update()

    if train_state['stop_early']:
        break

    train_bar.n = 0
    val_bar.n = 0
    epoch_bar.update()

training routine:   0%|          | 0/100 [00:00<?, ?it/s]

split=train:   0%|          | 0/436 [00:00<?, ?it/s]

split=val:   0%|          | 0/93 [00:00<?, ?it/s]

## 테스트 세트 평가

In [215]:
# 가장 좋은 모델을 사용해 테스트 세트의 손실과 정확도를 계산합니다
classifier.load_state_dict(torch.load(train_state['model_filename']))
classifier = classifier.to(args.device)

dataset.set_split('test')
batch_generator = generate_batches(dataset, 
                                   batch_size=args.batch_size, 
                                   device=args.device)
running_loss = 0.
running_acc = 0.
classifier.eval()

for batch_index, batch_dict in enumerate(batch_generator):
    # 출력을 계산합니다
    y_pred = classifier(x_in = batch_dict['x_data'].float())

    # 손실을 계산합니다
    loss = loss_func(y_pred,batch_dict['y_target'].float())
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_index + 1)

    # 정확도를 계산합니다
    acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
    running_acc += (acc_t - running_acc) / (batch_index + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

In [216]:
print("테스트 손실: {}".format(train_state['test_loss']))
print("테스트 정확도: {}".format(train_state['test_acc']))

테스트 손실: 0.1590842474852839
테스트 정확도: 94.74966397849462


## 추론

In [217]:
test_data_df = pd.read_csv('data/test_for_inference_preprocessed.csv')
test_data_df['comment'] = test_data_df['comment'].astype(str)

vectorizer = dataset.get_vectorizer()

model = CommentClassifier(num_features=len(vectorizer.comment_vocab))
model.load_state_dict(torch.load('model_storage/model1_BOW/model.pth'))
model.eval()
model = model.to(args.device)

def predict_proba(model, vectorizer, comment, device):
    comment_vector = vectorizer.vectorize(comment)
    vectorized_comment = torch.tensor(comment_vector).unsqueeze(0).to(device)
    result = model(vectorized_comment.float(), apply_sigmoid=True)
    probability = result.detach().cpu().item()
    return probability

results = []
for comment in test_data_df['comment']:
    probability = predict_proba(model, vectorizer, comment, args.device)
    prediction = 1 if probability > 0.5 else 0
    results.append({'probability': probability, 'pred': prediction})

results_df = pd.DataFrame(results)

results_df.to_csv('result/inferenced_by_BOW.csv', index=False)