In [1]:
import os
import urllib

In [2]:
import copy
import re
import nltk
from nltk.tokenize import TreebankWordTokenizer


In [3]:
if not os.path.exists('data'):
    os.makedirs('data')

In [4]:
# data 저장 directory에 데이터 다운로드 받기
data_dir = 'data'
file_path = f'{data_dir}/AG_news_train.csv'
if not os.path.exists(file_path):
    url = 'https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/train.csv'
    urllib.request.urlretrieve(url, file_path)

In [5]:
data_dir = 'data'
file_path = f'{data_dir}/AG_news_test.csv'
if not os.path.exists(file_path):
    url = 'https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/test.csv'
    urllib.request.urlretrieve(url, file_path)

## 데이터를 열어봅시다

In [6]:
import numpy as np
import pandas as pd

In [7]:
df = pd.read_csv('data/AG_news_train.csv', names=['class', 'title', 'description'])

In [8]:
df.head()

Unnamed: 0,class,title,description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


- <span style = 'font-size:1.2em;line-height:1.5em'>Class ID는 1~4까지로 총 4개의 클래스가 존재</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'>1: World</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'>2: Sports</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'>3: Business</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'>4: Sci/Tech</span>

- <span style = 'font-size:1.2em;line-height:1.5em'>나중에 학습, 평가시 편의를 위해 class를 1\~4에서 0\~3으로 바꿔줍시다</span>

In [9]:
df['preprocessed_class'] = df['class']-1

- <span style = 'font-size:1.2em;line-height:1.5em'>데이터를 좀 볼까요?</span>

In [10]:
df['title'][0]

'Wall St. Bears Claw Back Into the Black (Reuters)'

In [11]:
df['description'][0]

"Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again."

- <span style = 'font-size:1.2em;line-height:1.5em'>간단한 텍스트 전처리 코드입니다. (특수문자 제거, 소문자 변환, 불용어 제거 등)</span>

In [12]:
def preprocess_english(text):
    my_text = copy.copy(text)
    my_text = my_text.replace('\n', '')
    sents = nltk.sent_tokenize(my_text)
    tokenizer = TreebankWordTokenizer()
    stopwords = nltk.corpus.stopwords.words('english')
    
    p = re.compile('[^A-Za-z]')
    result = []
    for sent in sents:
        sent = sent.lower() # 소문자로 변환
        sent = p.sub(' ', sent) # 각 문장에서 특수문자 제거
        word_tokens = tokenizer.tokenize(sent) # word tokenization
        for token in word_tokens:
            if token not in stopwords:
                result.append(token) # stopwords removal
    result = ' '.join(result)
    return result

In [13]:
preprocess_english(df['description'][0])

'reuters short sellers wall street dwindling band ultra cynics seeing green'

In [14]:
%%time
df['preprocessed_desc'] = df['description'].apply(lambda x: preprocess_english(x))

CPU times: total: 30.1 s
Wall time: 30.1 s


In [14]:
print(df['description'][12])
print()
print(df['preprocessed_desc'][12])

 JAKARTA (Reuters) - Non-OPEC oil exporters should consider  increasing output to cool record crude prices, OPEC President  Purnomo Yusgiantoro said on Sunday.

jakarta reuters non opec oil exporters consider increasing output cool record crude prices opec president purnomo yusgiantoro said sunday


In [15]:
df

Unnamed: 0,class,title,description,preprocessed_class,preprocessed_desc
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli...",2,reuters short sellers wall street dwindling ba...
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...,2,reuters private investment firm carlyle group ...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...,2,reuters soaring crude prices plus worries econ...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...,2,reuters authorities halted oil export flows ma...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco...",2,afp tearaway world oil prices toppling records...
...,...,...,...,...,...
119995,1,Pakistan's Musharraf Says Won't Quit as Army C...,KARACHI (Reuters) - Pakistani President Perve...,0,karachi reuters pakistani president pervez mus...
119996,2,Renteria signing a top-shelf deal,Red Sox general manager Theo Epstein acknowled...,1,red sox general manager theo epstein acknowled...
119997,2,Saban not going to Dolphins yet,The Miami Dolphins will put their courtship of...,1,miami dolphins put courtship lsu coach nick sa...
119998,2,Today's NFL games,PITTSBURGH at NY GIANTS Time: 1:30 p.m. Line: ...,1,pittsburgh ny giants time p line steelers reco...


- <span style = 'font-size:1.2em;line-height:1.5em'>전처리된 본문을 TF-IDF로 벡터 형태로 나타내봅시다.</span>

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
# 전체 데이터를 train, test data로 나누고
df_trn, df_tst = train_test_split(df, test_size=0.2, stratify=df['preprocessed_class'])

In [18]:
df_trn.head()

Unnamed: 0,class,title,description,preprocessed_class,preprocessed_desc
111914,1,Rescuers search for more survivors as Philippi...,"REAL, Philippines : Philippine rescuers are di...",0,real philippines philippine rescuers digging r...
9036,3,UPDATE 1-Citigroup to acquire First American B...,"Citigroup Inc. (CN: Quote, Profile, Research) ...",2,citigroup inc cn quote profile research world ...
49094,4,"Neb. Professor Connects Football, Physics (AP)",AP - A physics professor at the University of ...,3,ap physics professor university nebraska linco...
96487,1,"After Iraq, Spain Turns Again Toward Latin Ame...","SAN JOSE, Costa Rica (Reuters) - After spendi...",0,san jose costa rica reuters spending several y...
72829,3,CME Profit Up on Trading Volume Growth,CHICAGO (Reuters) - Chicago Mercantile Exchan...,2,chicago reuters chicago mercantile exchange ho...


In [21]:
EMBED_DIM = 5000 # 5000차원의 벡터로 나타내봅시다.
vectorizer = TfidfVectorizer(max_features=EMBED_DIM)

#train 데이터의 preprocessed_desc column의 데이터로부터 TF-IDF를 학습하고
vectorizer.fit(df_trn['preprocessed_desc'])

# train, test data의 preprocessed_desc를 TF-IDF로 변환해줍시다.
x_trn = vectorizer.transform(df_trn['preprocessed_desc']).toarray()
x_tst = vectorizer.transform(df_tst['preprocessed_desc']).toarray()

In [25]:
y_trn = df_trn['preprocessed_class'].values
y_tst = df_tst['preprocessed_class'].values

# Make Custom Dataset 
## (week05의 4.use custom dataset.ipynb 참고)
<span style = 'font-size:1.2em;line-height:1.5em'>Pytorch에서 제공하는 torch.utils.data.Dataset을 활용하면 직접 custom dataset을 만들수 있습니다.</span>

```python
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self):
        # 데이터의 전처리를 해주는 부분
        pass
    
    def __len__(self):
        # 데이터셋의 길이. 즉, 총 샘플 갯수를 적어주는 부분
        pass
    
    def __getitem__(self, idx):
        # 데이터 셋에서 index를 받으면 해당 index에 해당하는 샘플을 가져오는 함수
```

In [19]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [20]:
class MyDataset(Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        data = self.x[idx]
        label = self.y[idx]
        return data, label
    
trn_dset = MyDataset(x_trn, y_trn)
tst_dset = MyDataset(x_tst, y_tst)

In [21]:
trn_dset = MyDataset(x_trn, y_trn)
tst_dset = MyDataset(x_tst, y_tst)

In [22]:
trn_dset.__getitem__(0)

(array([0., 0., 0., ..., 0., 0., 0.]), 0)

# 모델 정의하기

- <span style = 'font-size:1.2em;line-height:1.5em'>간단한 3-layer FFNN을 정의합시다.</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'>input의 차원은 EMBED_DIM (여기선 5000)</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'>output의 차원은 class 수(여기선 4)</span>

In [23]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [24]:
class MyNet(nn.Module):
    def __init__(self, in_dim, h1_dim, h2_dim, out_dim):
        super(MyNet, self).__init__()
        self.fc1 = nn.Linear(in_dim, h1_dim)
        self.fc2 = nn.Linear(h1_dim, h2_dim)
        self.fc3 = nn.Linear(h2_dim, out_dim)
        
    def forward(self, x):
        out = self.fc1(x)
        out = F.relu(out)
        out = self.fc2(out)
        out = F.relu(out)
        out = self.fc3(out)
        return out

# train() 함수
### (Week03의 '4_코드_정리.ipynb' 참조)


- <span style = 'font-size:1.2em;line-height:1.5em'>`train()`함수는 각 iteration마다 다음과 같이 진행됩니다.</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'><b>Step 1.</b> batch_loader로부터 mini-batch x, y 데이터를 획득, tensor로 변환한 뒤, 원하는 device에 위치시키기</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'><b>Step 2.</b> 지난 batch로부터 계산했던 gradient를 초기화(`zero_grad()`)</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'><b>Step 3.</b> 모델에 batch x를 입력하여 forward propagation</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'><b>Step 4.</b> loss function에 모델이 예측한 각 클래스에 속할 확률(`y_pred_prob`)과 실제 레이블 (`y`)을 넣어서 loss 계산</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'><b>Step 5.</b> Backpropagation으로 각 parameter의 gradient를 계산</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'><b>Step 6.</b> Gradient Descent로 parameter값 update</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'><b>Step 7.</b> `trn_loss` 변수에 mini-batch loss를 누적해서 합산</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'><b>Step 8.</b> 데이터 한 개당 평균 train loss 산출</span>

In [25]:
def train(model, data_loader, optimizer, criterion, device):
    model.train() # 모델을 학습모드로!
    trn_loss = 0
    for i, (x, y) in enumerate(data_loader):
        # Step 1. mini-batch에서 x,y 데이터를 얻고, 원하는 device에 위치시키기
        x = torch.tensor(x, dtype=torch.float32).to(device)
        y = torch.LongTensor(y).to(device)
        
        # Step 2. gradient 초기화
        optimizer.zero_grad()
        
        # Step 3. Forward Propagation
        y_pred_prob = model(x)
        
        # Step 4. Loss Calculation
        loss = criterion(y_pred_prob, y)
        
        # Step 5. Gradient Calculation (Backpropagation)
        loss.backward()
        
        # Step 6. Update Parameter (by Gradient Descent)
        optimizer.step()
        
        # Step 7. trn_loss 변수에 mini-batch loss를 누적해서 합산
        trn_loss += loss.item()
        
    # Step 8. 데이터 한 개당 평균 train loss
    avg_trn_loss = trn_loss / len(data_loader.dataset)
    return avg_trn_loss

# evaluate()함수

### (Week03의 '4_코드_정리.ipynb' 참조)

- <span style = 'font-size:1.2em;line-height:1.5em'>`evaluate()`함수는 각 iteration마다 다음과 같이 진행됩니다.</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'><b>Step 1.</b> batch_loader로부터 mini-batch x, y 데이터를 획득, tensor로 변환한 뒤, 원하는 device에 위치시키기</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'><b>Step 2.</b> 모델에 batch x를 입력하여 forward propagation</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'><b>Step 3.</b> loss function에 모델이 예측한 각 클래스에 속할 확률(`y_pred_prob`)과 실제 레이블 (`y`)을 넣어서 loss 계산</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'><b>Step 4.</b> 모델이 예측하는 레이블을 산출 (with `torch.argmax()`)</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'><b>Step 5.</b> Minibatch의 실제 레이블(`y`)과 예측 레이블(`y_pred_label`)을 누적하여 저장</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'><b>Step 6.</b> `eval_loss` 변수에 mini-batch loss를 누적해서 합산</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'><b>Step 7.</b> 데이터 한 개당 평균 evaluation loss와 accuracy 산출</span>

In [26]:
def evaluate(model, data_loader, optimizer, criterion, device):
    model.eval() # 모델을 평가모드로!
    eval_loss = 0
    
    results_pred = []
    results_real = []
    with torch.no_grad(): # evaluate()함수에는 단순 forward propagation만 할 뿐, gradient 계산 필요 X.
        for i, (x, y) in enumerate(data_loader):
            # Step 1. mini-batch에서 x,y 데이터를 얻고, 원하는 device에 위치시키기
            x = torch.tensor(x, dtype=torch.float32).to(device)
            y = torch.LongTensor(y).to(device)

            # Step 2. Forward Propagation
            y_pred_prob = model(x)

            # Step 3. Loss Calculation
            loss = criterion(y_pred_prob, y)
            
            # Step 4. Predict label
            y_pred_label = torch.argmax(y_pred_prob, dim=1)
            
            # Step 5. Save real and predicte label
            results_pred.extend(y_pred_label.detach().cpu().numpy())
            results_real.extend(y.detach().cpu().numpy())
            
            # Step 6. eval_loss변수에 mini-batch loss를 누적해서 합산
            eval_loss += loss.item()

    # Step 7. 데이터 한 개당 평균 eval_loss와 accuracy구하기
    avg_eval_loss = eval_loss / len(data_loader.dataset)
    results_pred = np.array(results_pred)
    results_real = np.array(results_real)
    accuracy = np.sum(results_pred == results_real) / len(results_real)
    
    return avg_eval_loss, accuracy

# 실제 모델 학습 및 평가하기

In [27]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [28]:
model = MyNet(in_dim=EMBED_DIM, h1_dim=512, h2_dim=64, out_dim=4)
model = model.to(device)

- <span style = 'font-size:1.2em;line-height:1.5em'>학습한 모델을 저장할 directory 생성하기</span>

In [29]:
save_dir = 'models'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

- <span style = 'font-size:1.2em;line-height:1.5em'>필요한 hyperparameter값 설정하기</span>

In [30]:
N_EPOCHS = 10
LR = 2e-4
BATCH_SIZE = 2**8

- <span style = 'font-size:1.2em;line-height:1.5em'>Mini-batch를 자동으로 생성할 DataLoader준비하기</span>

In [31]:
trn_loader = DataLoader(trn_dset, 
                        batch_size=BATCH_SIZE, 
                        shuffle=True, 
                        drop_last=True)
tst_loader = DataLoader(tst_dset, 
                        batch_size=BATCH_SIZE, 
                        shuffle=False, 
                        drop_last=False)

- <span style = 'font-size:1.2em;line-height:1.5em'>loss function정의하기</span>

In [32]:
loss_func = nn.CrossEntropyLoss(reduction='sum')

- <span style = 'font-size:1.2em;line-height:1.5em'>optimizer 생성하기</span>

In [33]:
my_opt = optim.Adam(model.parameters(), lr = LR)

- <span style = 'font-size:1.2em;line-height:1.5em'>매 epoch에 드는 시간을 측정하는 함수</span>

In [34]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

- <span style = 'font-size:1.2em;line-height:1.5em'>trn_data에 대해서 train()함수를, tst_data에 대해서 evaluate()함수를 반복적으로 호출하면서 모델을 학습</span>
    - <span style = 'font-size:1.2em;line-height:1.5em'>매 epoch마다 학습이 마무리되면, 모델 평가를 진행한다</span>

In [35]:
best_val_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    trn_loss = train(model=model, 
                     data_loader=trn_loader, 
                     criterion=loss_func,
                     optimizer=my_opt, 
                     device=device)
    val_loss, accuracy = evaluate(model=model, 
                                  data_loader=tst_loader, 
                                  criterion=loss_func,
                                  optimizer=my_opt, 
                                  device=device)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), f'{save_dir}/my_model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {trn_loss:.3f} | Test Loss: {val_loss:.3f} | Test Acc: {100*accuracy:.3f}% ')

  x = torch.tensor(x, dtype=torch.float32).to(device)
  x = torch.tensor(x, dtype=torch.float32).to(device)


Epoch: 01 | Time: 0m 17s
	Train Loss: 0.703 | Test Loss: 0.331 | Test Acc: 89.450% 
Epoch: 02 | Time: 0m 17s
	Train Loss: 0.290 | Test Loss: 0.302 | Test Acc: 89.917% 
Epoch: 03 | Time: 0m 17s
	Train Loss: 0.257 | Test Loss: 0.297 | Test Acc: 89.912% 
Epoch: 04 | Time: 0m 17s
	Train Loss: 0.240 | Test Loss: 0.296 | Test Acc: 89.842% 
Epoch: 05 | Time: 0m 17s
	Train Loss: 0.228 | Test Loss: 0.300 | Test Acc: 89.867% 
Epoch: 06 | Time: 0m 17s
	Train Loss: 0.219 | Test Loss: 0.303 | Test Acc: 89.692% 
Epoch: 07 | Time: 0m 17s
	Train Loss: 0.211 | Test Loss: 0.307 | Test Acc: 89.496% 
Epoch: 08 | Time: 0m 17s
	Train Loss: 0.203 | Test Loss: 0.312 | Test Acc: 89.404% 
Epoch: 09 | Time: 0m 17s
	Train Loss: 0.196 | Test Loss: 0.315 | Test Acc: 89.346% 
Epoch: 10 | Time: 0m 17s
	Train Loss: 0.189 | Test Loss: 0.320 | Test Acc: 89.221% 
