## 데이터 전처리

In [1]:
import pandas as pd
import numpy as np

# Load the sampled petition dataset from a UTF-8 encoded CSV file.
df = pd.read_csv(
    filepath_or_buffer='C:/dd/petition_sampled.csv',
    encoding='utf-8',
)

In [2]:
import re

def remove_sp_wh(text):
    """Return ``str(text)`` with unwanted characters replaced by spaces.

    Tab, form-feed, carriage-return, vertical-tab and newline characters, as
    well as any run of characters other than a space, Hangul (``ㄱ-ㅎ``,
    ``가-힣``) or an ASCII digit, are each replaced by a single space.
    Non-string input is converted with ``str()`` first.
    """
    unwanted = re.compile(r'([\t\f\r\v\n])|([^ ㄱ-ㅎ가-힣0-9]+)')
    return unwanted.sub(' ', str(text))

# Normalise both free-text columns with the same cleaner.
df['title'] = df['title'].apply(remove_sp_wh)
df['content'] = df['content'].apply(remove_sp_wh)

In [3]:
from konlpy.tag import Okt
okt = Okt()

df['title_token'] = df.title.apply(okt.morphs)
df['content_token'] = df.content.apply(okt.nouns)
df['token_final'] = df.title_token + df.content_token
df['count'] = df['votes'].replace({',':''}, regex=True).apply(lambda x: int(x))
df['lable'] = df['count'].apply(lambda x: 'yes' if x>=1000 else 'no')

In [4]:
df = df[['token_final', 'lable']].copy()
df.rename(columns={'lable':'label'}, inplace=True)

## 임베딩 벡터

In [5]:
from gensim.models import Word2Vec, KeyedVectors

# Train skip-gram (sg=1) word vectors on the per-petition token lists.
embedding_vector = Word2Vec(
    sentences=df['token_final'],
    vector_size=100,
    window=2,
    sg=1,
)



In [6]:
# Sanity check: list the nearest neighbours of one query word.
model_result = embedding_vector.wv.most_similar('올림픽', topn=10)
print(model_result)

[('평창', 0.8491191267967224), ('평창올림픽', 0.8447761535644531), ('동계올림픽', 0.8439947962760925), ('평창동계올림픽', 0.8237838745117188), ('월드컵', 0.8197010159492493), ('출전', 0.8047665953636169), ('참가', 0.7970430254936218), ('패럴림픽', 0.7910754084587097), ('동계', 0.7835447192192078), ('선수단', 0.773549497127533)]


In [7]:
# Export the trained vectors in word2vec format, then read them back.
embedding_vector.wv.save_word2vec_format(fname='petition_token_w2v')
loaded_model = KeyedVectors.load_word2vec_format(fname='petition_token_w2v')

In [8]:
from numpy.random import RandomState
import torchtext
from torchtext.legacy.data import Field

# Unseeded generator: the 80/20 split differs on every run.
rng = RandomState()
train_data = df.sample(frac=0.8, random_state=rng)
# Every row whose index label was not sampled above becomes held-out data.
test_data = df.drop(index=train_data.index)

In [9]:
def tokenizer(text):
    """Split the string form of a token list back into its tokens.

    Square brackets and single quotes are removed from ``str(text)`` and the
    remainder is split on ``', '``, so ``"['a', 'b']"`` becomes
    ``['a', 'b']``.
    """
    stripped = re.sub(r'[\[\]\']', '', str(text))
    return stripped.split(', ')

# Without batch_first=True the train function would have to torch.transpose
# each text batch itself.
TEXT = Field(batch_first=True, tokenize=tokenizer)
# Labels are single values, not token sequences.
LABEL = Field(sequential=False)

In [46]:
# Persist both splits (without the index column) for TabularDataset to read.
train_data.to_csv(path_or_buf='C:/dd/train_petition.csv', index=False)
test_data.to_csv(path_or_buf='C:/dd/valid_petition.csv', index=False)

In [52]:
from torchtext.legacy.data import TabularDataset

# Read the two CSV files written earlier; columns are mapped by position to
# the TEXT and LABEL fields and the header row is skipped.
# NOTE: the name `train` is later rebound by `def train(...)` in this file.
train, valid = TabularDataset.splits(
    path='C:/dd/',
    format='csv',
    train='train_petition.csv',
    validation='valid_petition.csv',
    skip_header=True,
    fields=[('text', TEXT), ('label', LABEL)],
)

In [55]:
import torch
from torchtext.vocab import Vectors
from torchtext.legacy.data import BucketIterator

# Load the exported word2vec file as torchtext vectors.
vectors = Vectors(name='petition_token_w2v')

# Vocabularies are built from the training split only; the text vocabulary
# is aligned with the pre-trained vectors.
TEXT.build_vocab(train, min_freq=1, max_size=None, vectors=vectors)
LABEL.build_vocab(train)

vocab = TEXT.vocab

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

train_iter, valid_iter = BucketIterator.splits(
    datasets=(train, valid),
    batch_size=8,
    sort=False,
    device=device,
)

## Text CNN

In [67]:
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class TextCNN(nn.Module):
    """Convolutional text classifier on top of pre-trained word embeddings.

    Token ids are embedded, convolved with one filter bank per window size in
    ``kernel_wins`` (every filter spans the full embedding width), max-pooled
    over the sequence axis, concatenated, passed through dropout and projected
    to ``num_class`` logits.

    Args:
        vocab: Vocabulary object; ``len(vocab)`` sets the size of the
            embedding table and ``vocab.vectors`` provides its initial weights.
        emb_dim: Embedding dimension (must match ``vocab.vectors``).
        c_out: Number of filters (output channels) per window size.
        kernel_wins: Sequence of window heights, in tokens.
        num_class: Number of output classes.
    """

    def __init__(self, vocab, emb_dim, c_out, kernel_wins, num_class):
        super().__init__()
        kernel_wins = list(kernel_wins)
        # Widest convolution window; shorter inputs are padded up to this
        # length in forward() so that every convolution has a valid output.
        self.min_seq_len = max(kernel_wins, default=0)

        self.embed = nn.Embedding(len(vocab), emb_dim)
        # Initialise the embedding table from the pre-trained vectors.
        self.embed.weight.data.copy_(vocab.vectors)

        # One Conv2d(in_channels=1, out_channels=c_out,
        # kernel_size=(window, emb_dim)) per window size.
        self.convs = nn.ModuleList(
            [nn.Conv2d(1, c_out, (w, emb_dim)) for w in kernel_wins]
        )
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.4)

        # The classifier sees c_out pooled features from each window size.
        self.fc = nn.Linear(len(kernel_wins) * c_out, num_class)

    def forward(self, x):
        """Return class logits for a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids, expected as (batch, seq_len).

        Returns:
            Tensor of shape (batch, num_class) with unnormalised scores.
        """
        # (batch, seq_len) -> (batch, seq_len, emb_dim)
        emb = self.embed(x)

        # A sequence shorter than the widest window would make Conv2d raise;
        # append zero rows along the sequence axis to reach that length.
        shortfall = self.min_seq_len - emb.size(1)
        if shortfall > 0:
            emb = F.pad(emb, (0, 0, 0, shortfall))

        # Conv2d wants a channel axis: (batch, 1, seq_len, emb_dim).
        emb = emb.unsqueeze(1)

        # Each filter covers the whole embedding width, so the conv output has
        # width 1: (batch, c_out, seq_len - window + 1, 1). Drop that axis.
        feature_maps = [self.relu(conv(emb)).squeeze(-1) for conv in self.convs]

        # Max over the whole remaining sequence axis -> (batch, c_out, 1).
        pooled = [F.max_pool1d(fm, fm.size(2)) for fm in feature_maps]

        # Concatenate across window sizes and drop the trailing singleton axis:
        # (batch, len(kernel_wins) * c_out).
        features = torch.cat(pooled, dim=1).squeeze(-1)
        features = self.dropout(features)

        return self.fc(features)

In [74]:
def train(model, device, train_iter, optimizer):
    """Run one training epoch over ``train_iter``.

    Args:
        model: Module mapping a text batch to per-class logits.
        device: Device the inputs and targets are moved to.
        train_iter: Iterable of batches exposing ``.text`` and ``.label``,
            with a sized ``.dataset`` attribute.
        optimizer: Optimizer updating ``model``'s parameters.

    Returns:
        ``(train_loss, accuracy)`` as Python floats; accuracy is a percentage
        of ``len(train_iter.dataset)``.
    """
    model.train()
    corrects, train_loss = 0.0, 0.0

    for batch in train_iter:
        # batch_first is already set on the text field, so no transpose here.
        text = batch.text.to(device)
        # Shift labels down by one to get 0-based class ids (presumably index
        # 0 of the label vocab is reserved -- confirm). Done out of place so
        # the batch's own tensor is not modified if the batch is reused.
        target = (batch.label - 1).to(device)

        optimizer.zero_grad()
        logit = model(text)

        loss = F.cross_entropy(logit, target)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        # Index of the largest logit per row is the predicted class.
        predicted = torch.max(logit, 1)[1]
        corrects += (predicted.view(target.size()) == target).sum().item()

    # NOTE(review): the per-batch mean losses are summed and divided by the
    # number of examples, so the reported loss is scaled down by roughly the
    # batch size; kept as-is so train/eval numbers stay comparable.
    n_examples = len(train_iter.dataset)
    train_loss /= n_examples
    accuracy = 100.0 * corrects / n_examples

    return train_loss, accuracy

In [79]:
def evaluate(model, device, eval_iter):
    """Score ``model`` on ``eval_iter`` without updating it.

    Args:
        model: Module mapping a text batch to per-class logits.
        device: Device the inputs and targets are moved to.
        eval_iter: Iterable of batches exposing ``.text`` and ``.label``,
            with a sized ``.dataset`` attribute.

    Returns:
        ``(test_loss, accuracy)`` as Python floats; accuracy is a percentage
        of ``len(eval_iter.dataset)``.
    """
    model.eval()
    corrects, test_loss = 0.0, 0.0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in eval_iter:
            text = batch.text.to(device)
            # Shift labels down by one to get 0-based class ids; done out of
            # place so the batch's own tensor is left untouched.
            target = (batch.label - 1).to(device)

            logit = model(text)

            test_loss += F.cross_entropy(logit, target).item()
            # Index of the largest logit per row is the predicted class.
            predicted = torch.max(logit, 1)[1]
            corrects += (predicted.view(target.size()) == target).sum().item()

    # NOTE(review): summed per-batch mean losses divided by the number of
    # examples, i.e. scaled down by roughly the batch size.
    n_examples = len(eval_iter.dataset)
    test_loss /= n_examples
    accuracy = 100.0 * corrects / n_examples
    return test_loss, accuracy

In [80]:
# Resolve the device first so the model is placed on the same device that is
# handed to train() / evaluate() below.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# emb_dim=100, 10 filters for each of the window sizes 3/4/5, 2 classes.
model = TextCNN(vocab, 100, 10, [3, 4, 5], 2).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
best_test_acc = -1

for epoch in range(1, 3+1):
    tr_loss, tr_acc = train(model, device, train_iter, optimizer)
    print('train epoch: {} \t Loss: {} \t accuracy: {:.3f}%'.format(epoch, tr_loss, tr_acc))

    val_loss, val_acc = evaluate(model, device, valid_iter)
    print('valid epoch: {} \t Loss: {} \t accuracy: {:.3f}%'.format(epoch, val_loss, val_acc))

    # Track the best validation accuracy seen so far.
    if val_acc > best_test_acc:
        best_test_acc = val_acc

        # NOTE(review): the message says "saved" but nothing is written to
        # disk here; add a torch.save(...) call if a checkpoint is wanted.
        print('model saved at {} accuracy'.format(best_test_acc))

train epoch: 1 	 Loss: 0.009061651507467162 	 accuracy: 98.818%
valid epoch: 1 	 Loss: 0.005662299163206732 	 accuracy: 99.281%
model saved at 99.28077697753906 accuracy
train epoch: 2 	 Loss: 0.007278435231444717 	 accuracy: 99.011%
valid epoch: 2 	 Loss: 0.0053660755365813875 	 accuracy: 99.281%
train epoch: 3 	 Loss: 0.00608348290886248 	 accuracy: 98.984%
valid epoch: 3 	 Loss: 0.005500715491229681 	 accuracy: 99.281%
