Code is adapted from https://github.com/simonjisu/nsmc_study

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import pandas as pd
import torchtext
import nltk
# from konlpy.tag import Mecab
from torchtext.data import Field, BucketIterator, TabularDataset
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [2]:
# Detect once whether a CUDA device is available; later cells branch on this
# flag to decide whether the model and the batches are moved to the GPU.
USE_CUDA = torch.cuda.is_available()
# Bare expression so the notebook displays the flag as the cell output.
USE_CUDA

True

In [3]:
def pad_under_five(toknized, min_len=5, pad_token="<pad>"):
    """
    Pad a token list up to a minimum length.

    The model uses filters as wide as 5-grams, so a sentence with fewer than
    5 tokens is filled up with "<pad>" tokens.

    Args:
        toknized: list of tokens; it is extended IN PLACE when too short.
        min_len: minimum number of tokens required (default 5, the widest
            n-gram filter used by the model).
        pad_token: token appended to reach ``min_len`` (default "<pad>").

    Returns:
        The same list object that was passed in, now holding at least
        ``min_len`` tokens. Lists that are already long enough are returned
        unchanged.
    """
    shortfall = min_len - len(toknized)
    if shortfall > 0:
        toknized.extend([pad_token] * shortfall)
    return toknized

In [4]:
# Mini-batch size handed to BucketIterator.splits when the loaders are built.
# NOTE(review): a separate BATCH_SIZE = 128 constant is defined with the other
# hyperparameters further down, but no visible cell reads it; 64 is the value
# the iterators actually use.
batch_size = 64

In [5]:
# Tokenizer used by the REVIEW field and by the inference demo at the end.
# The Mecab morphological analyzer (konlpy) is disabled; plain whitespace
# splitting is used instead.
#tagger = Mecab()
#tagger = tagger.morphs
tagger = str.split

In [6]:
# Text field: tokenized with `tagger`, lowercased, batch-first tensors, and
# every token list run through pad_under_five before numericalization.
# init_token="<s>" / eos_token="</s>" are not set.
REVIEW = Field(
    tokenize=tagger,
    use_vocab=True,
    lower=True,
    include_lengths=False,
    batch_first=True,
    preprocessing=pad_under_five,
)

# Label field: a single non-sequential value per example, converted to int
# directly instead of going through a vocabulary.
LABEL = Field(
    sequential=False,
    use_vocab=False,
    preprocessing=int,
)

In [7]:
# Load both NSMC TSV files. The test file is passed through the `validation`
# slot; splits() returns the datasets in order, so it is unpacked as test_data.
# The 'id' column is dropped (field None).
# NOTE(review): REVIEW's preprocessing pads every document to at least 5
# tokens, so if filter_pred is evaluated on preprocessed examples this
# predicate can never reject anything — confirm whether short reviews were
# meant to be removed.
train_data, test_data = TabularDataset.splits(
    path="nsmc/",
    train='ratings_train.txt',
    validation="ratings_test.txt",
    format='tsv',
    fields=[('id', None), ('document', REVIEW), ('label', LABEL)],
    filter_pred=lambda example: len(example.document) > 1,
    skip_header=True,
)

In [8]:
# Sanity check: number of examples in the train and test splits.
print(len(train_data), len(test_data))

150000 50000


In [9]:
# Build the vocabulary from the training split only (the test split is not
# passed in); min_freq=2 sets the minimum token frequency for inclusion.
REVIEW.build_vocab(train_data, min_freq=2)

In [10]:
# Batch iterators for both splits. Examples are bucketed by token count and
# each batch is sorted by length; repeat=False makes one pass equal one epoch.
train_loader, test_loader = BucketIterator.splits(
    (train_data, test_data),
    batch_size=batch_size,
    sort_key=lambda example: len(example.document),
    sort_within_batch=True,
    repeat=False,
)

In [11]:
class CNN_TEXT(nn.Module):
    """
    Convolutional text classifier with one n-gram filter bank per width.

    The embedded sequence (B, T, D) is flattened to a single-channel signal of
    length T*D. Each Conv1d has kernel_size = D*k and stride = D, so one
    kernel position covers exactly k consecutive token embeddings. Every
    feature map is max-pooled over all positions, the pooled vectors are
    concatenated, and a linear layer produces the output.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_size: embedding dimension D.
        out_channel_size: number of filters per n-gram width.
        output_size: size of the final linear output.
        ngrams: iterable of n-gram widths k; the input must contain at least
            max(ngrams) tokens for every convolution to have a valid position.
        dropout_rate: dropout probability applied to the embeddings.
    """

    def __init__(self, vocab_size, embed_size, out_channel_size, output_size, ngrams, dropout_rate=0.5):
        super().__init__()

        self.V = vocab_size
        self.D = embed_size
        self.out_channel_size = out_channel_size
        self.ngrams = ngrams

        self.embed = nn.Embedding(self.V, self.D)

        # One filter bank per n-gram width.
        conv_layers = []
        for k in self.ngrams:
            conv_layers.append(
                nn.Conv1d(
                    in_channels=1,
                    out_channels=self.out_channel_size,  # number of channels
                    kernel_size=self.D * k,              # span swept by one filter: k tokens
                    stride=self.D,                       # advance one token at a time
                )
            )
        self.convs = nn.ModuleList(conv_layers)

        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(self.out_channel_size * len(self.ngrams), output_size)

    def forward(self, inputs):
        """Map token ids of shape (B, T) to outputs of shape (B, output_size)."""
        embedded = self.dropout(self.embed(inputs))       # (B, T, D)
        flat = embedded.view(embedded.size(0), 1, -1)     # (B, 1, T*D)

        pooled = []
        for conv in self.convs:
            feature_map = conv(flat)                      # (B, K, T-k+1)
            # Max over every position, then drop the length-1 axis -> (B, K)
            pooled.append(F.max_pool1d(feature_map, feature_map.size(2)).squeeze(2))

        combined = torch.cat(pooled, dim=1)               # (B, K * len(ngrams))
        return self.fc(combined)

In [12]:
# Number of passes over train_loader (epochs) in the training loop.
STEP = 5
# NOTE(review): not referenced by any visible cell — the iterators are built
# with the lowercase `batch_size` (64) defined earlier.
BATCH_SIZE = 128
# Embedding dimension passed to CNN_TEXT.
EMBED = 300
# n-gram widths of the convolution filters; the widest (5) is why short
# sentences are padded to 5 tokens.
KERNEL_SIZES = [3, 4, 5]
# Number of filters (output channels) per n-gram width.
KERNEL_DIM = 100
# Initial learning rate for Adam.
LR = 0.001

In [13]:
# Binary sentiment classifier: one output logit per review.
model = CNN_TEXT(
    vocab_size=len(REVIEW.vocab),
    embed_size=EMBED,
    out_channel_size=KERNEL_DIM,
    output_size=1,
    ngrams=KERNEL_SIZES,
    dropout_rate=0.5,
)

if USE_CUDA:
    model = model.to('cuda')

# BCEWithLogitsLoss applies the sigmoid internally, so the model emits raw logits.
loss_function = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LR)
# Multiply the learning rate by 0.1 once the scheduler reaches epoch index 3.
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[3], gamma=0.1)

In [14]:
# Train for STEP epochs, printing the running mean loss every 100 batches.
#
# Changes from the earlier version of this cell:
#   * scheduler.step() runs AFTER the epoch's optimizer steps. Since PyTorch
#     1.1 the scheduler must be stepped after the optimizer; stepping it first
#     skips the first value of the schedule, so the decay would fire one epoch
#     early. (Assumes PyTorch >= 1.1.)
#   * The logged learning rate is read from the optimizer itself rather than
#     scheduler.get_lr(), which is not meant to be called outside step() and
#     can misreport the value at a milestone.
model.train()
for step in range(STEP):
    losses = []
    for i, batch in enumerate(train_loader):
        inputs, targets = batch.document, batch.label.float()
        if USE_CUDA:
            inputs = inputs.cuda()
            targets = targets.cuda()
        model.zero_grad()
        preds = model(inputs)
        # preds is (B, 1); drop the trailing axis to match targets.
        loss = loss_function(preds.squeeze(1), targets)
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
        if i % 100 == 0:
            print("[{}/{}]: mean_loss : {:.4f} , lr : {:.4f}".format(
                step+1, STEP, np.mean(losses), optimizer.param_groups[0]['lr']))
            # Start a fresh window so each report covers only the latest batches.
            losses = []
    # Advance the LR schedule once per epoch, after all optimizer updates.
    scheduler.step()

[1/5]: mean_loss : 0.7501 , lr : 0.0010
[1/5]: mean_loss : 0.7481 , lr : 0.0010
[1/5]: mean_loss : 0.6956 , lr : 0.0010
[1/5]: mean_loss : 0.6821 , lr : 0.0010
[1/5]: mean_loss : 0.6676 , lr : 0.0010
[1/5]: mean_loss : 0.6652 , lr : 0.0010
[1/5]: mean_loss : 0.6550 , lr : 0.0010
[1/5]: mean_loss : 0.6596 , lr : 0.0010
[1/5]: mean_loss : 0.6459 , lr : 0.0010
[1/5]: mean_loss : 0.6236 , lr : 0.0010
[1/5]: mean_loss : 0.6185 , lr : 0.0010
[1/5]: mean_loss : 0.6238 , lr : 0.0010
[1/5]: mean_loss : 0.6127 , lr : 0.0010
[1/5]: mean_loss : 0.5992 , lr : 0.0010
[1/5]: mean_loss : 0.5958 , lr : 0.0010
[1/5]: mean_loss : 0.5798 , lr : 0.0010
[1/5]: mean_loss : 0.5788 , lr : 0.0010
[1/5]: mean_loss : 0.5810 , lr : 0.0010
[1/5]: mean_loss : 0.5738 , lr : 0.0010
[1/5]: mean_loss : 0.5791 , lr : 0.0010
[1/5]: mean_loss : 0.5703 , lr : 0.0010
[1/5]: mean_loss : 0.5646 , lr : 0.0010
[1/5]: mean_loss : 0.5581 , lr : 0.0010
[1/5]: mean_loss : 0.5444 , lr : 0.0010
[2/5]: mean_loss : 0.5897 , lr : 0.0010


Test

In [None]:
# Accuracy (%) on the test split.
# Runs under torch.no_grad() so no autograd graph is built during evaluation,
# and uses torch.sigmoid instead of the deprecated F.sigmoid.
model.eval()
num_hit = 0
with torch.no_grad():
    for i, batch in enumerate(test_loader):
        inputs, targets = batch.document, batch.label.float()
        if USE_CUDA:
            inputs = inputs.to('cuda')
            targets = targets.to('cuda')

        logits = model(inputs)
        # Threshold the predicted probability at 0.5 to get a 0/1 label.
        preds = torch.ge(torch.sigmoid(logits), 0.5).float()
        # preds is (B, 1); squeeze the trailing axis so it lines up with
        # targets, exactly as the training loop does for the loss.
        num_hit += torch.eq(preds.squeeze(1), targets).sum().item()

print(num_hit/len(test_data)*100)

In [None]:
# Scratch cell: unpack one batch for ad-hoc inspection.
# NOTE(review): relies on `batch` left over from the evaluation loop above
# (hidden state — it fails on a fresh kernel if that loop has not run).
# Unlike that loop, the tensors are not moved to CUDA here.
inputs, targets = batch.document, batch.label.float()

In [None]:
# Scratch cell: forward pass on the batch unpacked in the previous cell.
# NOTE(review): runs without torch.no_grad(), and `inputs` was not explicitly
# moved to the GPU in the previous cell — this may raise a device mismatch
# when USE_CUDA is true, depending on where the iterator places its tensors;
# confirm.
preds = model(inputs)

In [None]:
# Classify a few hand-written reviews and print each with a coloured label.
test_inputs = ["헐 진짜 개별로다..", "진짜 너무 재밌는 영화다 오랜만에","오..이건 진짜 봐야함", "진짜 쓰레기 같은 영화","노잼","존잼","꾸울잼","핵노잼"]

# Make sure dropout is disabled even if this cell is run right after training.
model.eval()
with torch.no_grad():
    for test_input in test_inputs:
        # Use the field's own preprocessing so inference sees the same
        # tokenize -> lowercase -> pad_under_five pipeline as the training data.
        tokenized = REVIEW.preprocess(test_input)
        input_ = REVIEW.numericalize([tokenized])
        if USE_CUDA: input_ = input_.cuda()

        logit = model(input_)
        is_positive = torch.ge(torch.sigmoid(logit), 0.5).item()
        prediction = "긍정" if is_positive else "부정"
        # ANSI colours: cyan for positive, red for negative.
        color = "\033[1;01;36m" if is_positive else "\033[1;01;31m"
        print(test_input, color + prediction + "\033[0m")