In [1]:
!pip install torchtext==0.4.0

[33mDEPRECATION: Python 2.7 will reach the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 won't be maintained after that date. A future version of pip will drop support for Python 2.7. More details about Python 2 support in pip, can be found at https://pip.pypa.io/en/latest/development/release-process/#python-2-support[0m
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
max_length = 256 # sms(가사) 최대 길이

# 1. 데이터 불러오기


In [3]:
import pandas as pd

In [4]:
df = pd.read_csv('sms.tsv', sep='\t')
print(df.columns)
print(df.shape)

Index([u'label', u'sms'], dtype='object')
(5572, 2)


In [5]:
df.head()

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# 클래스 파악
classes = sorted(set(set(df['label'])))
class_to_idx = {}

for i, c in enumerate(classes): # 모든 클래스에 대해
    class_to_idx.update({c:i})
    
nclass = len(classes)

print("# of classes: %d" %nclass)
print(classes)
print(class_to_idx)

# of classes: 2
['ham', 'spam']
{'ham': 0, 'spam': 1}


# 2. 새로운 DataFrame

## 1) 'labels, sms'만 남기기

## 2) 최대 텍스트 길이만큼 자르기 # pandas.Series.str.slice

- '성별, 가사'만 남기려면?

In [7]:
new_df = pd.DataFrame({'label':df['label'], 'sms':df['sms'].str.slice(start = 0, stop=max_length)})

## 3) 중복 제거

In [8]:
len(new_df)

5572

In [9]:
new_df = pd.DataFrame(new_df.drop_duplicates())

In [10]:
len(new_df)

5169

 ## 4) 셔플

In [11]:
df_shuffled = new_df.sample(frac=1).reset_index(drop=True)
df_shuffled.head()

Unnamed: 0,label,sms
0,ham,Fuck cedar key and fuck her (come over anyway ...
1,ham,The world's most happiest frnds never have the...
2,ham,Got ur mail Dileep.thank you so muchand look f...
3,spam,Reply to win £100 weekly! What professional sp...
4,spam,"New Tones This week include: 1)McFly-All Ab..,..."


## 5) train, test 나누기

In [12]:
# train: test = 9:1
# train: test = 540:60 -> train:valid:test = 432:108:60
train_ratio = 0.9

# train dataset
s, e =0, int(df_shuffled.shape[0] * train_ratio) # of rows
df_train = pd.DataFrame({'label':df_shuffled['label'][s:e],
                        'sms':df_shuffled['sms'][s:e]})
print("index for test: %d~%d" %(s,e))

# test dataset
s, e = e, e+int(df_shuffled.shape[0] * (1.0 - train_ratio))
print("index for test: %d~%d" %(s,e))
df_test = pd.DataFrame({'label':df_shuffled['label'][s:e], 'sms': df_shuffled['sms'][s:e]})


index for test: 0~4652
index for test: 4652~5168


In [13]:
# column 수 확인
print(df_train.shape)
print(df_test.shape)

(4652, 2)
(516, 2)


## 4) 저장

In [14]:
df_train.to_csv('./sms.maxlen.uniq.shuf.train.tsv', header = False, index = False, sep = '\t')
df_test.to_csv('./sms.maxlen.uniq.shuf.test.tsv', header = False, index = False, sep = '\t')


In [15]:
import torch
print(torch.__version__)

1.4.0


# 데이터 로드하기

In [16]:
import torchtext
import numpy as np

In [17]:
from data_loader import DataLoader

# RNN+SMS 구현

## 0.1 라이브러리 임포트

In [18]:
import torch
import torch.nn as nn
import torchvision.datasets as dset
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
from torch.autograd import Variable
import numpy as np

## 0.2 하이퍼파라미터 셋팅

In [19]:
# Hyper-parameters
batch_size = 128
num_epochs = 10

word_vec_size = 256
dropout_p = 0.3

hidden_size = 512
num_layers = 4

# yhk 추가
learning_rate = 0.001 # 디폴트 0,001

In [20]:
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## 1. SMS train, test dataset 가져오기

In [21]:
from data_loader import DataLoader

In [22]:
loaders = DataLoader(
    train_fn='./sms.maxlen.uniq.shuf.train.tsv',
    batch_size=batch_size,
    valid_ratio=.2,
    device=-1,
    max_vocab=999999,
    min_freq=5,
)

In [23]:
test_loaders = DataLoader(
    train_fn='./sms.maxlen.uniq.shuf.train.tsv',
    batch_size=batch_size,
    valid_ratio=.01,  # 모두 train
    device=-1,
    max_vocab=999999,
    min_freq=5,
)

## 2. 대략적인 데이터 형태

In [24]:
print("|train| =", len(loaders.train_loader.dataset),
     '|valid| = ', len(loaders.valid_loader.dataset))

vocab_size = len(loaders.text.vocab)
num_classes = len(loaders.label.vocab)
print("|vocab| =", vocab_size, "|classes| =", num_classes)

('|train| =', 3722, '|valid| = ', 930)
('|vocab| =', 1551, '|classes| =', 2)


## 3. 데이터 로드함수
학습시킬 때 batch_size 단위로 끊어서 로드하기 위함

### 데이터 로드함수 이해하기

In [25]:
n = 3 # 샘플로 출력할 데이터 개수
for i, data in enumerate(loaders.train_loader):
    labels = data.label
    texts = data.text
    
    if i>n:
        break
    print("[%d]" %i)
    print("한 번에 로드되는 데이터 크기:", len(labels))
    
    # 출력
    for j in range(n):
        label = labels[j].numpy()
        text = texts[j].numpy()
        print("label: ", label)
        print("text: ", text.shape)

[0]
('\xed\x95\x9c \xeb\xb2\x88\xec\x97\x90 \xeb\xa1\x9c\xeb\x93\x9c\xeb\x90\x98\xeb\x8a\x94 \xeb\x8d\xb0\xec\x9d\xb4\xed\x84\xb0 \xed\x81\xac\xea\xb8\xb0:', 128)
('label: ', array(0))
('text: ', (55,))
('label: ', array(0))
('text: ', (55,))
('label: ', array(0))
('text: ', (55,))
[1]
('\xed\x95\x9c \xeb\xb2\x88\xec\x97\x90 \xeb\xa1\x9c\xeb\x93\x9c\xeb\x90\x98\xeb\x8a\x94 \xeb\x8d\xb0\xec\x9d\xb4\xed\x84\xb0 \xed\x81\xac\xea\xb8\xb0:', 128)
('label: ', array(0))
('text: ', (8,))
('label: ', array(0))
('text: ', (8,))
('label: ', array(0))
('text: ', (8,))
[2]
('\xed\x95\x9c \xeb\xb2\x88\xec\x97\x90 \xeb\xa1\x9c\xeb\x93\x9c\xeb\x90\x98\xeb\x8a\x94 \xeb\x8d\xb0\xec\x9d\xb4\xed\x84\xb0 \xed\x81\xac\xea\xb8\xb0:', 128)
('label: ', array(1))
('text: ', (11,))
('label: ', array(0))
('text: ', (11,))
('label: ', array(0))
('text: ', (11,))
[3]
('\xed\x95\x9c \xeb\xb2\x88\xec\x97\x90 \xeb\xa1\x9c\xeb\x93\x9c\xeb\x90\x98\xeb\x8a\x94 \xeb\x8d\xb0\xec\x9d\xb4\xed\x84\xb0 \xed\x81\xac\xea\xb8\xb0

## 4. 모델 선언

In [26]:
# Recurrent neural network(many-to-one)

class RNN(nn.Module):
    def __init__(self,
                input_size,
                word_vec_size,
                hidden_size,
                n_classes,
                num_layers=4,
                dropout_p=0.3
                ):
        super(RNN, self).__init__()
        
        self.input_size = input_size
        self.word_vec_size = word_vec_size
        self.hidden_size = hidden_size
        self.n_classes = n_classes
        self.num_layers = num_layers
        self.dropout_p = dropout_p
        
        # 입력 차원(vocab_size), 출력 차원(word_vec_size)
        self.emb = nn.Embedding(input_size, word_vec_size) # 부터
        
        self.lstm = nn.LSTM(input_size = word_vec_size,
                           hidden_size = hidden_size,
                           num_layers = num_layers,
                           dropout = dropout_p,
                           batch_first = True,
                           bidirectional=True)
        self.fc = nn.Linear(hidden_size*2, num_classes)
        self.activation = nn.LogSoftmax(dim=-1)
        
    def forward(self, x):
        # x: (batch_size, length)
        x = self.emb(x)

        # x: (batch_size, length, word_vec_size
        x, _ = self.lstm(x)

        # x: (Batch_size, length, hidden_size, 2)
        # x[:,-1]: (batch_size, 1, hidden_size*2)
        out = self.activation(self.fc(x[:,-1]))
        # self.fc(x[:,-1])

        return out
            

In [27]:
model = RNN( input_size = vocab_size,
                word_vec_size=word_vec_size,
                hidden_size=hidden_size,
                n_classes=num_classes,
                num_layers=num_layers,
                dropout_p=dropout_p)

In [28]:
def ComputeAccr(dloader, imodel):
    correct = 0
    total = 0

    model.eval()  # test mode
    for i, data in enumerate(dloader):   # batch_size 만큼
        texts = data.text.to(device)  # (batch_size, length)
        labels = data.label.to(device)  # (batch_size, num_classes)

        # Forward prop.
        output = model(texts)  # (batch_size, num_classes)
        _, output_index = torch.max(output, 1)  # (batch_size, 1)

        total += labels.size(0)
        correct += (output_index == labels).sum().float()
    #print("Accuracy of Test Data: {}".format(100*correct/total))

    model.train()
    return (100*correct/total).numpy()  # tensor -> numpy

In [29]:
print("Accracy of Test Data: %.2f" %ComputeAccr(loaders.valid_loader, model))

Accracy of Test Data: 13.66


## 5. loss, optimizer

In [30]:
# Loss and optimizer
# loss_func = nn.CrossEntropyLoss()
loss_func = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)

## 6. 학습

In [31]:
# Train the model
total_step = len(loaders.train_loader)
for epoch in range(num_epochs):
    for i, data in enumerate(loaders.train_loader):
        # x: (batch_size, 1, seq_length, input_size)) -> (batch_size, seq_length, input_size)
        #img = imgs          
        #img = img.reshape(-1, sequence_length, input_size).to(device)        
        # y: (batch_size, num_classes)
        #label = labels.to(device)  # y  # Variable(labels)
        
        texts = data.text.to(device)  # (batch_size, length)
        labels = data.label.to(device)  # (batch_size, num_classes)
        
        print("[%d]" %i)
        print(texts.shape)
        print(labels.shape)
        
        # Forward prop.
        outputs = model(texts)
        loss = loss_func(outputs, labels)
        
        # Backward prop. & optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        
#         if (i+1) % 2 == 0:          
#             print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
#                    .format(epoch+1, num_epochs, i+1, total_step, 
#                            loss.item()))    
            
        if (i+1) % 10 == 0:          
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Accr: {:.2f}' 
                   .format(epoch+1, num_epochs, i+1, total_step, 
                           loss.item(), 
                           ComputeAccr(loaders.valid_loader, model)))   

[0]
torch.Size([128, 9])
torch.Size([128])
[1]
torch.Size([10, 60])
torch.Size([10])
[2]
torch.Size([128, 6])
torch.Size([128])
[3]
torch.Size([128, 25])
torch.Size([128])
[4]
torch.Size([128, 14])
torch.Size([128])
[5]
torch.Size([128, 12])
torch.Size([128])
[6]
torch.Size([128, 5])
torch.Size([128])
[7]
torch.Size([128, 7])
torch.Size([128])
[8]
torch.Size([128, 16])
torch.Size([128])
[9]
torch.Size([128, 55])
torch.Size([128])
Epoch [1/10], Step [10/30], Loss: 0.1272, Accr: 86.34
[10]
torch.Size([128, 7])
torch.Size([128])
[11]
torch.Size([128, 28])
torch.Size([128])
[12]
torch.Size([128, 8])
torch.Size([128])
[13]
torch.Size([128, 8])
torch.Size([128])
[14]
torch.Size([128, 4])
torch.Size([128])
[15]
torch.Size([128, 5])
torch.Size([128])
[16]
torch.Size([128, 15])
torch.Size([128])
[17]
torch.Size([128, 11])
torch.Size([128])
[18]
torch.Size([128, 12])
torch.Size([128])
[19]
torch.Size([128, 19])
torch.Size([128])
Epoch [1/10], Step [20/30], Loss: 0.3930, Accr: 86.34
[20]
torch.Si

## 7. 테스트


In [32]:
print("Accuracy of Valid Data: %.2f" %ComputeAccr(loaders.valid_loader, model))

Accuracy of Valid Data: 97.74


## 8. 학습된 파라미터 저장

In [33]:
netname = './nets/rnn_weight.pkl'
torch.save(model, netname,)

  "type " + obj.__name__ + ". It won't be checked "


## 9. 학습된 파라미터 로드
실무에서 학습된 파림터 로드하고 싶다면

In [34]:
netname = './nets/rnn_weight.pkl'
model = torch.load(netname)

In [36]:
print("Accuracy of Valid Data: %.2f" %ComputeAccr(loaders.valid_loader, model))

Accuracy of Valid Data: 97.74
