## 데이터 출처

[Naver sentiment movie corpus]: https://github.com/e9t/nsmc/

- RNN 모델의 학습을 위해 [Naver sentiment movie corpus] 데이터셋 중 일부를 추출하여 사용하였습니다.

In [None]:
# torchtext.legacy를 사용할 수 있는 torchtext 버전 설치
!pip install -U torchtext==0.10.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchtext==0.10.0
  Downloading torchtext-0.10.0-cp37-cp37m-manylinux1_x86_64.whl (7.6 MB)
     |████████████████████████████████| 7.6 MB 4.0 MB/s 
Collecting torch==1.9.0
  Downloading torch-1.9.0-cp37-cp37m-manylinux1_x86_64.whl (831.4 MB)
     |████████████████████████████████| 831.4 MB 2.6 kB/s 
Installing collected packages: torch, torchtext
  Attempting uninstall: torch
    Found existing installation: torch 1.12.1+cu113
    Uninstalling torch-1.12.1+cu113:
      Successfully uninstalled torch-1.12.1+cu113
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.13.1
    Uninstalling torchtext-0.13.1:
      Successfully uninstalled torchtext-0.13.1
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.13.1+

In [None]:
# Only needed when running on Google Colab: expose Google Drive under
# /content/gdrive so the data and model paths used below resolve.
from google.colab import drive

drive.mount(mountpoint='/content/gdrive')

Mounted at /content/gdrive


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

#import torchtext.data as data
#import torchtext.datasets as datasets
#legacy 버전으로 변경

# torchtext : text의 preprocessing 파이프라인 정의, 
# 토크나이징, Vocab 생성, dataset splits, 데이터 로더 등 지원
from torchtext.legacy import data
import torchtext.datasets as datasets

import pickle
print (torch.__version__)

1.9.0+cu102


In [None]:
class RNN_Text(nn.Module):
    """Bidirectional LSTM text classifier.

    Token ids are embedded into 100-dimensional vectors, passed through a
    bidirectional LSTM with 256 hidden units per direction, and the LSTM
    output at the last time step is projected to class logits.

    The LSTM is created without ``batch_first``, so inputs are
    sequence-first: ``(seq_len, batch)``.
    """

    def __init__(self, embed_num, class_num):
        """Build the embedding, LSTM and output layers.

        Args:
            embed_num: Vocabulary size (rows of the embedding table).
            class_num: Number of target classes.
        """
        super(RNN_Text, self).__init__()
        # Vocabulary size
        V = embed_num
        # Number of classes to predict
        C = class_num
        # Hidden size per direction
        H = 256

        # Word vectors of dimension 100
        self.embed = nn.Embedding(V, 100)

        # nn.LSTM(input_size, hidden_size, bidirectional=...)
        # batch_first is left at its default (False), so tensors are
        # (seq, batch, feature). The per-step output has size H * 2.
        self.rnn = nn.LSTM(100, H, bidirectional=True)

        # Bidirectional, so the classifier input has size H * 2.
        self.out = nn.Linear(H * 2, C)

        # Initial LSTM states; filled by inithidden() (or lazily by forward()).
        self.h = None
        self.c = None

    def forward(self, x):
        """Compute class logits for a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids, shape ``(seq_len, batch)``.

        Returns:
            Tensor of shape ``(batch, class_num)`` with unnormalised logits.
        """
        batch_size = x.size(1)
        # getattr() keeps instances that were pickled before `h`/`c` were
        # set in __init__ working.
        h = getattr(self, 'h', None)
        c = getattr(self, 'c', None)
        target_device = self.embed.weight.device
        if (h is None or c is None
                or h.size(1) != batch_size or c.size(1) != batch_size
                or h.device != target_device):
            # States are missing or do not fit this batch: create fresh ones
            # instead of failing inside the LSTM.
            self.inithidden(batch_size)

        x = self.embed(x)  # (seq_len, batch, 100)

        # Run the LSTM; output is (seq_len, batch, 256 * 2).
        x, _ = self.rnn(x, (self.h, self.c))

        # Feature vector for prediction: (batch, 256 * 2).
        # NOTE(review): at the last time step the backward direction has only
        # seen the final token (often padding for fixed-length inputs).
        # Concatenating the final hidden states of both directions is the
        # usual alternative; left unchanged so already-trained weights keep
        # producing the same predictions.
        logit = self.out(x[-1])

        # Final prediction: (batch, C), C = number of classes.
        return logit

    def inithidden(self, b):
        """Create random initial hidden/cell states for a batch of size ``b``.

        The shape is derived from the LSTM itself, and the tensors are placed
        on the same device and dtype as the model parameters.

        NOTE(review): random (rather than zero) initial states make the
        output non-deterministic even in eval mode; kept as in the original.

        Args:
            b: Batch size.
        """
        directions = 2 if self.rnn.bidirectional else 1
        shape = (self.rnn.num_layers * directions, b, self.rnn.hidden_size)
        ref = self.embed.weight
        self.h = torch.randn(shape, device=ref.device, dtype=ref.dtype)
        self.c = torch.randn(shape, device=ref.device, dtype=ref.dtype)
        

In [None]:
class mydataset(data.Dataset):
    """Tab-separated sentiment corpus exposed as a torchtext dataset.

    The first line of the file is treated as a header and skipped. Every
    other non-blank line must have at least three tab-separated columns:
    column 1 holds space-separated tokens and column 2 holds the label.
    """

    @staticmethod
    def sort_key(ex):
        """Return the number of tokens in the example's text."""
        return len(ex.text)

    def __init__(self, text_field, label_field, path=None, examples=None, **kwargs):
        """Load examples from ``path`` unless ``examples`` is supplied.

        Args:
            text_field: Field used for the ``text`` attribute.
            label_field: Field used for the ``label`` attribute.
            path: Path of the tab-separated corpus file.
            examples: Pre-built examples; when given, no file is read.
            **kwargs: Forwarded to ``data.Dataset.__init__``.

        Raises:
            ValueError: If neither ``path`` nor ``examples`` is available, or
                a data line has fewer than three columns.
        """
        fields = [('text', text_field), ('label', label_field)]
        if examples is None:
            if path is None:
                # Fall back to a class-level `dirname` if a subclass defines
                # one; looked up on the class so that instance attribute
                # hooks of the base class are not involved.
                path = getattr(type(self), 'dirname', None)
            if path is None:
                raise ValueError('either `path` or `examples` must be given')

            examples = []
            # Context manager so the file handle is closed even on errors.
            with open(path, 'r', encoding='utf-8') as corpus:
                for i, line in enumerate(corpus):
                    if i == 0:
                        # Header row
                        continue
                    if not line.strip():
                        # Tolerate blank lines (e.g. a trailing newline).
                        continue
                    columns = line.strip().split('\t')
                    if len(columns) < 3:
                        raise ValueError(
                            '{}: line {}: expected at least 3 tab-separated '
                            'columns, got {}'.format(path, i + 1, len(columns)))
                    txt = columns[1].split(' ')

                    # One example = (token list, raw label text)
                    examples.append(
                        data.Example.fromlist([txt, columns[2]], fields))
        super(mydataset, self).__init__(examples, fields, **kwargs)

In [None]:
# Text field with a fixed sequence length of 20 tokens per example.
text_field = data.Field(fix_length=20)
# Label field: one non-sequential value per example, no unknown-token entry.
label_field = data.Field(sequential=False, batch_first=True, unk_token=None)

# Load the training split first, then the test split.
train_data, test_data = [
    mydataset(text_field, label_field, path=corpus_path)
    for corpus_path in (
        '/content/gdrive/My Drive/Colab Notebooks/aivle/data/nsm/small_ratings_train_tok.txt',
        '/content/gdrive/My Drive/Colab Notebooks/aivle/data/nsm/small_ratings_test_tok.txt',
    )
]

# Both vocabularies are built from the training split only.
text_field.build_vocab(train_data)
label_field.build_vocab(train_data)

# Batches of 100 examples for training, single examples for testing.
train_iter, test_iter = data.Iterator.splits(
    (train_data, test_data),
    batch_sizes=(100, 1),
    repeat=False,
)
# Last expression of the cell: shows the vocabulary size.
len(text_field.vocab)

21893

In [None]:
# Two-class classifier sized to the text vocabulary, trained with Adam
# using the optimizer's default settings.
rnn = RNN_Text(embed_num=len(text_field.vocab), class_num=2)
optimizer = torch.optim.Adam(params=rnn.parameters())
# Switch to training mode; as the cell's last expression this also
# displays the module summary.
rnn.train()


RNN_Text(
  (embed): Embedding(21893, 100)
  (rnn): LSTM(100, 256, bidirectional=True)
  (out): Linear(in_features=512, out_features=2, bias=True)
)

In [None]:
%%time
for epoch in range(10):
    
    totalloss = 0
    for batch in train_iter:
        optimizer.zero_grad()
        
        txt = batch.text
        label = batch.label
        #print (txt.size())
        rnn.inithidden(txt.size(1))
        
        pred = rnn(txt)
        #print(pred.size(), label.size())
        #print(label)
        loss = F.cross_entropy(pred, label)
        totalloss += loss.data
        
        loss.backward()
        optimizer.step()
        #print(data,label)
        
    print(epoch,'epoch')  
    print('loss : {:.3f}'.format(totalloss.numpy()))
       
torch.save(rnn,'/content/gdrive/My Drive/Colab Notebooks/aivle/model/rnn_model.pt')

0 epoch
loss : 69.827
1 epoch
loss : 67.446
2 epoch
loss : 55.511
3 epoch
loss : 43.541
4 epoch
loss : 33.843
5 epoch
loss : 25.897
6 epoch
loss : 19.522
7 epoch
loss : 14.586
8 epoch
loss : 10.595
9 epoch
loss : 7.711
CPU times: user 4min 25s, sys: 10.8 s, total: 4min 36s
Wall time: 4min 35s


In [None]:
%%time
from sklearn.metrics import classification_report
correct = 0
incorrect = 0
rnn.eval()
y_test = []
prediction = []

for batch in test_iter:
    txt = batch.text
    label = batch.label
    y_test.append(label.data[0])
    
    rnn.inithidden(txt.size(1))
    
    pred = rnn(txt)
    _,ans = torch.max(pred,dim=1)
    prediction.append(ans.data[0])
    
    if ans.data[0] == label.data[0]:
        correct += 1    
    else:
        incorrect += 1
    
print ('correct : ', correct)
print ('incorrect : ', incorrect)
print(classification_report(torch.tensor(y_test), 
                            torch.tensor(prediction), 
                            digits=4, 
                            target_names=['negative', 'positive']))


correct :  79
incorrect :  21
              precision    recall  f1-score   support

    negative     0.7458    0.8800    0.8073        50
    positive     0.8537    0.7000    0.7692        50

    accuracy                         0.7900       100
   macro avg     0.7997    0.7900    0.7883       100
weighted avg     0.7997    0.7900    0.7883       100

CPU times: user 1.04 s, sys: 58.2 ms, total: 1.1 s
Wall time: 1.57 s
