In [1]:
import os
import time
import torch
import urllib

import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [2]:
if not os.path.exists('data'):
    os.makedirs('data')

In [3]:
# data 저장 directory에 데이터 다운로드 받기
data_dir = 'data'
file_path = f'{data_dir}/AG_news_train.csv'
if not os.path.exists(file_path):
    url = 'https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/train.csv'
    urllib.request.urlretrieve(url, file_path)

In [4]:
data_dir = 'data'
file_path = f'{data_dir}/AG_news_test.csv'
if not os.path.exists(file_path):
    url = 'https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/test.csv'
    urllib.request.urlretrieve(url, file_path)

## 데이터를 열어봅시다

In [5]:
df = pd.read_csv('data/AG_news_train.csv', names=['class', 'title', 'description'])

In [6]:
df.head()

Unnamed: 0,class,title,description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


- <span style = 'font-size:1.2em;line-height:1.5em'>Class ID는 1~4까지로 총 4개의 클래스가 존재</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'>1: World</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'>2: Sports</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'>3: Business</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'>4: Sci/Tech</span>

In [7]:
import preprocess

In [8]:
trn_file_path = 'data/AG_news_train.csv'
tst_file_path = 'data/AG_news_test.csv'
trn_pipe, tst_pipe = preprocess.get_pipe(trn_file_path=trn_file_path, 
                                         tst_file_path=tst_file_path)

In [9]:
# Split into training and val datapipes early on. Will build vocabulary from training datapipe only.
trn_dp, val_dp = trn_pipe.random_split(total_length=len(list(trn_pipe)),
                                       weights={"train": 0.8, "valid": 0.2},
                                       seed=0)

tst_dp = tst_pipe

In [10]:
list(trn_dp)[0]

['3',
 'Carlyle Looks Toward Commercial Aerospace (Reuters)',
 'Reuters - Private investment firm Carlyle Group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.']

In [11]:
from torchtext.data.utils import get_tokenizer
from torch.nn.utils.rnn import pad_sequence

In [12]:
tokenizer = get_tokenizer(preprocess.preprocess_english) # 별도로 정의한 함수로 tokenizing하기

In [13]:
text_vocab = preprocess.get_vocab(trn_dp, tokenizer, data_type = 'description')
label_vocab = preprocess.get_vocab(trn_dp, tokenizer, data_type = 'class', specials = ["<UNK>"])

In [14]:
UNK_VALUE = text_vocab['<UNK>']
PADDING_VALUE = text_vocab['<PAD>']
VOCAB_SIZE = len(text_vocab.get_stoi())
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [15]:
VOCAB_SIZE

28062

In [16]:
text_transform = lambda x: [text_vocab[token] for token in tokenizer(x)]
label_transform = lambda x: int(x)-1

In [17]:
def collate_batch(batch):
    desc_list, label_list = [], []
    for (_label, _title, _desc) in batch:
        processed_desc = torch.tensor(text_transform(_desc))
        desc_list.append(processed_desc)
        label_list.append(label_transform(_label))     
        
    labels = torch.tensor(label_list).to(DEVICE)
    descs = pad_sequence(desc_list, padding_value = PADDING_VALUE, batch_first=True).to(DEVICE)
    
    return labels, descs

In [18]:
len(text_vocab.get_itos())

28062

## nn.Embedding() 알아보기
- <span style = 'font-size:1.2em;line-height:1.5em'>정수 index가 들어오면, 그 index에 맞는 벡터를 추출해 주는 lookup table(같은 것)</span>

In [20]:
embed = nn.Embedding(num_embeddings=6, 
                     embedding_dim=3)
embed.weight

Parameter containing:
tensor([[ 1.4587,  1.0858,  0.2739],
        [-0.8754, -2.6280,  0.3633],
        [ 0.7138, -1.2440, -0.1990],
        [ 1.8255, -2.3799, -0.0456],
        [-0.8720, -0.7075, -0.4527],
        [-1.4513, -0.7704,  0.2369]], requires_grad=True)

- <span style = 'font-size:1.2em;line-height:1.5em'>[Optional] 알아보기 쉽도록 weight값을 바꿔봅시다.</span>

In [21]:
w = np.array([[1,0,0],[0,1,0],[0,0,1],[2,0,0],[0,2,0],[0,0,2]])
w = torch.tensor(w, dtype=torch.float32)
embed.weight.data = w
embed.weight

Parameter containing:
tensor([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.],
        [2., 0., 0.],
        [0., 2., 0.],
        [0., 0., 2.]], requires_grad=True)

- <span style = 'font-size:1.2em;line-height:1.5em'>Integer Index가 들어오면 해당 index의 row vector를 반환해줍니다. </span>

In [21]:
idx1 = torch.LongTensor([0])
idx2 = torch.LongTensor([1])
print(embed(idx1))
print(embed(idx2))

tensor([[1., 0., 0.]], grad_fn=<EmbeddingBackward0>)
tensor([[0., 1., 0.]], grad_fn=<EmbeddingBackward0>)


- <span style = 'font-size:1.2em;line-height:1.5em'>Integer Sequence가 들어오면, sequence의 각 integer index에 해당하는 row vector들을 반환합니다</span>
- <span style = 'font-size:1.2em;line-height:1.5em'>근데, 우리는 문장을 어떻게 받나요? 각 단어가 integer encoding된 상태인 integer sequence로 받습니다</span>
- <span style = 'font-size:1.2em;line-height:1.5em'>즉, `nn.Embedding`을 사용하면 문장내의 각 단어들을 vector로 표현한 tensor로 만들 수 있어요</span>

In [22]:
sentence = torch.LongTensor([1,0,2])
print(embed(sentence))

tensor([[0., 1., 0.],
        [1., 0., 0.],
        [0., 0., 1.]], grad_fn=<EmbeddingBackward0>)


- <span style = 'font-size:1.2em;line-height:1.5em'>심지어 여러개의 문장에 대해서도 tensor형태로 변환할 수 있습니다.</span>
    - <span style = 'font-size:1.0em;line-height:1.5em'>첫번째 문장(integer encoding된): [1,0,2]</span>
    - <span style = 'font-size:1.0em;line-height:1.5em'>첫번째 문장(integer encoding된): [3,5,4]</span>

In [23]:
sentences = torch.LongTensor([[1,0,2],[3,5,4]])
print(embed(sentences))

tensor([[[0., 1., 0.],
         [1., 0., 0.],
         [0., 0., 1.]],

        [[2., 0., 0.],
         [0., 0., 2.],
         [0., 2., 0.]]], grad_fn=<EmbeddingBackward0>)


# 모델 정의하기

- <span style = 'font-size:1.2em;line-height:1.5em'>간단한 RNN기반의 문서 분류 모델을 정의합시다. 크게 다음과 같은 component가 존재합니다.</span>
    - <span style = 'font-size:1.2em;line-height:1.5em'>Embedding layer</span>
        - <span style = 'font-size:1.1em;line-height:1.5em'>단어가 Integer Encoding된 문장 sequence가 들어오면, integer index에 맞는 vector로 변환</span>
        - <span style = 'font-size:1.1em;line-height:1.5em'>vocabulary 수(`n_vocab`)는 미정, 단어 벡터 차원(`embed_dim`) = 200</span>
    - <span style = 'font-size:1.2em;line-height:1.5em'>RNN layer</span>
        - <span style = 'font-size:1.1em;line-height:1.5em'>RNN layer 수(`n_layer`) = 1, 은닉층 차원(`hidden_dim`) = 512</span>
    - <span style = 'font-size:1.2em;line-height:1.5em'>FFNN layer</span>
        - <span style = 'font-size:1.1em;line-height:1.5em'>마지막 time에서의 hidden을 산출하면, 이를 간단한 FFNN의 입력값으로 사용</span>
        - <span style = 'font-size:1.1em;line-height:1.5em'>FFNN의 input_dim = RNN의 hidden_dim=512, FFNN의 은닉층 = 128, FFNN의 output_dim = n_class = 4</span>

In [46]:
class MyNet(nn.Module):
    def __init__(self, n_vocab, embed_dim=200, n_layer=1, 
                 hidden_dim=512, output_dim=4, dropout_ratio=0.5):
        
        super(MyNet, self).__init__()
        self.embedding = nn.Embedding(n_vocab, embed_dim)
        self.rnn = nn.LSTM(input_size=embed_dim, 
                           hidden_size=hidden_dim, 
                           num_layers=n_layer, 
                           batch_first=True)
        self.fc1 = nn.Linear(in_features=hidden_dim, 
                             out_features=128)
        self.fc2 = nn.Linear(in_features=128, 
                            out_features=output_dim)
        self.dropout = nn.Dropout(dropout_ratio)
        
    def forward(self, x): # x: integer encoded sentence
        embed = self.embedding(x)
        outputs, (h_n, c_n) = self.rnn(embed)
        last_output = outputs[:,-1,:]
        out = self.fc1(last_output)
        out = F.relu(out)
        out = self.fc2(out)
        return out

# train() 함수
### (Week03의 '4_코드_정리.ipynb' 참조)


- <span style = 'font-size:1.2em;line-height:1.5em'>`train()`함수는 각 iteration마다 다음과 같이 진행됩니다.</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'><b>Step 1.</b> batch_loader로부터 mini-batch x, y 데이터를 획득, tensor로 변환한 뒤, 원하는 device에 위치시키기</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'><b>Step 2.</b> 지난 batch로부터 계산했던 gradient를 초기화(`zero_grad()`)</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'><b>Step 3.</b> 모델에 batch x를 입력하여 forward propagation</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'><b>Step 4.</b> loss function에 모델이 예측한 각 클래스에 속할 확률(`y_pred_prob`)과 실제 레이블 (`y`)을 넣어서 loss 계산</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'><b>Step 5.</b> Backpropagation으로 각 parameter의 gradient를 계산</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'><b>Step 6.</b> Gradient Descent로 parameter값 update</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'><b>Step 7.</b> `trn_loss` 변수에 mini-batch loss를 누적해서 합산</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'><b>Step 8.</b> 데이터 한 개당 평균 train loss 산출</span>

In [47]:
def train(model, data_loader, optimizer, criterion, device):
    model.train() # 모델을 학습모드로!
    trn_loss = 0
    for i, (label, text) in enumerate(data_loader):
        # Step 1. mini-batch에서 x,y 데이터를 얻고, 원하는 device에 위치시키기
        x = torch.LongTensor(text).to(device)
        y = torch.LongTensor(label).to(device)
        
        # Step 2. gradient 초기화
        optimizer.zero_grad()
        
        # Step 3. Forward Propagation
        y_pred_prob = model(x)
        
        # Step 4. Loss Calculation
        loss = criterion(y_pred_prob, y)
        
        # Step 5. Gradient Calculation (Backpropagation)
        loss.backward()
        
        # Step 6. Update Parameter (by Gradient Descent)
        optimizer.step()
        
        # Step 7. trn_loss 변수에 mini-batch loss를 누적해서 합산
        trn_loss += loss.item()
        
    # Step 8. 데이터 한 개당 평균 train loss
    avg_trn_loss = trn_loss / len(data_loader.dataset)
    return avg_trn_loss

# evaluate()함수

### (Week03의 '4_코드_정리.ipynb' 참조)

- <span style = 'font-size:1.2em;line-height:1.5em'>`evaluate()`함수는 각 iteration마다 다음과 같이 진행됩니다.</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'><b>Step 1.</b> batch_loader로부터 mini-batch x, y 데이터를 획득, tensor로 변환한 뒤, 원하는 device에 위치시키기</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'><b>Step 2.</b> 모델에 batch x를 입력하여 forward propagation</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'><b>Step 3.</b> loss function에 모델이 예측한 각 클래스에 속할 확률(`y_pred_prob`)과 실제 레이블 (`y`)을 넣어서 loss 계산</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'><b>Step 4.</b> 모델이 예측하는 레이블을 산출 (with `torch.argmax()`)</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'><b>Step 5.</b> Minibatch의 실제 레이블(`y`)과 예측 레이블(`y_pred_label`)을 누적하여 저장</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'><b>Step 6.</b> `eval_loss` 변수에 mini-batch loss를 누적해서 합산</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'><b>Step 7.</b> 데이터 한 개당 평균 evaluation loss와 accuracy 산출</span>

In [48]:
def evaluate(model, data_loader, optimizer, criterion, device):
    model.eval() # 모델을 평가모드로!
    eval_loss = 0
    
    results_pred = []
    results_real = []
    with torch.no_grad(): # evaluate()함수에는 단순 forward propagation만 할 뿐, gradient 계산 필요 X.
        for i, (label, text) in enumerate(data_loader):
            # Step 1. mini-batch에서 x,y 데이터를 얻고, 원하는 device에 위치시키기
            x = torch.LongTensor(text).to(device)
            y = torch.LongTensor(label).to(device)

            # Step 2. Forward Propagation
            y_pred_prob = model(x)

            # Step 3. Loss Calculation
            loss = criterion(y_pred_prob, y)
            
            # Step 4. Predict label
            y_pred_label = torch.argmax(y_pred_prob, dim=1)
            
            # Step 5. Save real and predicte label
            results_pred.extend(y_pred_label.detach().cpu().numpy())
            results_real.extend(y.detach().cpu().numpy())
            
            # Step 6. eval_loss변수에 mini-batch loss를 누적해서 합산
            eval_loss += loss.item()

    # Step 7. 데이터 한 개당 평균 eval_loss와 accuracy구하기
    avg_eval_loss = eval_loss / len(data_loader.dataset)
    results_pred = np.array(results_pred)
    results_real = np.array(results_real)
    accuracy = np.sum(results_pred == results_real) / len(results_real)
    
    return avg_eval_loss, accuracy

# 실제 모델 학습 및 평가하기

In [49]:
model = MyNet(n_vocab=VOCAB_SIZE)
model = model.to(DEVICE)

- <span style = 'font-size:1.2em;line-height:1.5em'>학습한 모델을 저장할 directory 생성하기</span>

In [50]:
save_dir = 'models'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

- <span style = 'font-size:1.2em;line-height:1.5em'>필요한 hyperparameter값 설정하기</span>

In [51]:
N_EPOCHS = 10
LR = 0.001
BATCH_SIZE = 2**6

- <span style = 'font-size:1.2em;line-height:1.5em'>Mini-batch를 자동으로 생성할 DataLoader준비하기</span>

In [52]:
trn_dp_list = list(trn_dp)
val_dp_list = list(val_dp)
tst_dp_list = list(tst_dp)
trn_loader = DataLoader(trn_dp_list,
                        batch_sampler=preprocess.BatchSamplerSimilarLength(dataset=trn_dp_list,
                                                                           tokenizer=tokenizer,
                                                                           batch_size=BATCH_SIZE),
                        collate_fn=collate_batch)
val_loader = DataLoader(val_dp_list, 
                        batch_size=BATCH_SIZE, 
                        shuffle=False,
                        collate_fn=collate_batch)
tst_loader = DataLoader(tst_dp_list, 
                        batch_size=BATCH_SIZE, 
                        shuffle=False, 
                        collate_fn=collate_batch)

- <span style = 'font-size:1.2em;line-height:1.5em'>loss function정의하기</span>

In [53]:
loss_func = nn.CrossEntropyLoss(reduction='sum')

- <span style = 'font-size:1.2em;line-height:1.5em'>optimizer 생성하기</span>

In [54]:
my_opt = optim.Adam(model.parameters(), lr = LR)

- <span style = 'font-size:1.2em;line-height:1.5em'>매 epoch에 드는 시간을 측정하는 함수</span>

In [55]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

- <span style = 'font-size:1.2em;line-height:1.5em'>trn_data에 대해서 train()함수를, tst_data에 대해서 evaluate()함수를 반복적으로 호출하면서 모델을 학습</span>
    - <span style = 'font-size:1.2em;line-height:1.5em'>매 epoch마다 학습이 마무리되면, 모델 평가를 진행한다</span>

In [56]:
best_val_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    trn_loss = train(model=model, 
                     data_loader=trn_loader, 
                     criterion=loss_func,
                     optimizer=my_opt, 
                     device=DEVICE)
    val_loss, accuracy = evaluate(model=model, 
                                  data_loader=val_loader, 
                                  criterion=loss_func,
                                  optimizer=my_opt, 
                                  device=DEVICE)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), f'{save_dir}/my_model3.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {trn_loss:.3f} | Val Loss: {val_loss:.3f} | Val Acc: {100*accuracy:.3f}% ')

Epoch: 01 | Time: 6m 44s
	Train Loss: 1.274 | Val Loss: 0.757 | Val Acc: 66.171% 
Epoch: 02 | Time: 6m 51s
	Train Loss: 0.454 | Val Loss: 0.357 | Val Acc: 87.983% 
Epoch: 03 | Time: 6m 41s
	Train Loss: 0.265 | Val Loss: 0.325 | Val Acc: 89.258% 
Epoch: 04 | Time: 6m 41s
	Train Loss: 0.202 | Val Loss: 0.321 | Val Acc: 89.562% 
Epoch: 05 | Time: 6m 38s
	Train Loss: 0.150 | Val Loss: 0.342 | Val Acc: 89.517% 
Epoch: 06 | Time: 6m 49s
	Train Loss: 0.107 | Val Loss: 0.389 | Val Acc: 89.212% 
Epoch: 07 | Time: 6m 40s
	Train Loss: 0.074 | Val Loss: 0.425 | Val Acc: 89.129% 
Epoch: 08 | Time: 6m 39s
	Train Loss: 0.055 | Val Loss: 0.453 | Val Acc: 89.412% 
Epoch: 09 | Time: 6m 41s
	Train Loss: 0.040 | Val Loss: 0.521 | Val Acc: 88.746% 
Epoch: 10 | Time: 6m 39s
	Train Loss: 0.032 | Val Loss: 0.544 | Val Acc: 89.296% 


In [57]:
model_loaded = torch.load(f'{save_dir}/my_model3.pt')

In [59]:
model_loaded

OrderedDict([('embedding.weight',
              tensor([[ 1.3037e+00, -1.2928e+00, -6.4955e-02,  ..., -1.2056e+00,
                        1.6196e-02,  2.7363e-01],
                      [-1.5426e-01,  1.0207e+00,  2.2950e+00,  ..., -2.9864e-01,
                        7.1270e-01,  1.5385e-01],
                      [-1.1895e-01,  2.6510e-01,  5.4087e-01,  ...,  1.4373e+00,
                        2.4773e-02,  7.0790e-01],
                      ...,
                      [ 1.7491e-03,  9.0890e-01,  1.3982e+00,  ...,  3.4248e-01,
                        4.2014e-01,  7.2376e-01],
                      [ 5.5438e-01,  6.6477e-01,  1.2892e+00,  ..., -1.7038e-01,
                        8.2063e-01,  1.2597e+00],
                      [ 7.4464e-01, -1.4678e-02, -1.7131e-01,  ..., -7.4931e-01,
                        1.4259e+00, -7.1166e-01]])),
             ('rnn.weight_ih_l0',
              tensor([[ 0.0542,  0.0412, -0.0433,  ..., -0.0485,  0.1257, -0.1004],
                      [-0.0380, 

In [61]:
model_state_dict = torch.load(f'{save_dir}/my_model3.pt', map_location=DEVICE)
model2 = MyNet(n_vocab=VOCAB_SIZE)
model2 = model2.to(DEVICE)
model2.load_state_dict(model_state_dict)

<All keys matched successfully>

In [62]:
tst_loss, tst_acc = evaluate(model=model2, 
                             data_loader=tst_loader, 
                             criterion=loss_func,
                             optimizer=my_opt, 
                             device=DEVICE)

In [63]:
print(f'Tst Loss: {tst_loss:.3f} | Tst Acc: {100*tst_acc:.3f}% ')

Tst Loss: 0.324 | Tst Acc: 89.645% 
