In [6]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F

import nltk
from konlpy.tag import Kkma
kor_tagger = Kkma()

## Tokenize 

In [7]:
token = nltk.word_tokenize("Hi, my name is sungdong. What's your name?")
print(token)

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  Searched in:
    - 'C:\\Users\\a/nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - 'C:\\cosmos\\envs\\pytorch\\nltk_data'
    - 'C:\\cosmos\\envs\\pytorch\\share\\nltk_data'
    - 'C:\\cosmos\\envs\\pytorch\\lib\\nltk_data'
    - 'C:\\Users\\a\\AppData\\Roaming\\nltk_data'
    - ''
**********************************************************************


In [3]:
token = kor_tagger.morphs("안녕하세요! 저는 파이토치를 공부하는 중입니다.")
print(token)

['안녕', '하', '세요', '!', '저', '는', '파이', '토치', '를', '공부', '하', '는', '중', '이', 'ㅂ니다', '.']


## Build Vocab 

In [4]:
token

['안녕',
 '하',
 '세요',
 '!',
 '저',
 '는',
 '파이',
 '토치',
 '를',
 '공부',
 '하',
 '는',
 '중',
 '이',
 'ㅂ니다',
 '.']

In [5]:
word2index={} # dictionary for indexing
for vo in token:
    if word2index.get(vo) == None:
        word2index[vo]=len(word2index)
print(word2index)

{'안녕': 0, '하': 1, '세요': 2, '!': 3, '저': 4, '는': 5, '파이': 6, '토치': 7, '를': 8, '공부': 9, '중': 10, '이': 11, 'ㅂ니다': 12, '.': 13}


## One-hot Encoding 

In [9]:
def one_hot_encoding(word,word2index):
    tensor = torch.zeros(len(word2index))
    index = word2index[word] 
    tensor[index]=1.
    return tensor

In [10]:
torch_vector = one_hot_encoding("토치",word2index)
print(torch_vector)


 0
 0
 0
 0
 0
 0
 0
 1
 0
 0
 0
 0
 0
 0
[torch.FloatTensor of size 14]



In [13]:
py_vector = one_hot_encoding("파이",word2index)
py_vector.dot(torch_vector)

0.0

## Bag-of-Words 

In [14]:
train_data = [["배고프다 밥줘","FOOD"],
              ["뭐 먹을만한거 없냐","FOOD"],
              ["맛집 추천","FOOD"],
              ["이 근처 맛있는 음식점 좀","FOOD"],
              ["밥줘","FOOD"],
              ["뭐 먹지?","FOOD"],
              ["삼겹살 먹고싶어","FOOD"],
              ["영화 보고싶다","MEDIA"],
              ["요즘 볼만한거 있어?","MEDIA"],
              ["영화나 예능 추천","MEDIA"],
              ["재밌는 드라마 보여줘","MEDIA"],
              ["신과 함께 줄거리 좀 알려줘","MEDIA"],
              ["고등랩퍼 다시보기 좀","MEDIA"],
              ["재밌는 영상 하이라이트만 보여줘","MEDIA"]]

test_data = [["쭈꾸미 맛집 좀 찾아줘","FOOD"],
             ["매콤한 떡볶이 먹고싶다","FOOD"],
             ["강남 씨지비 조조 영화 스케줄표 좀","MEDIA"],
             ["효리네 민박 보고싶엉","MEDIA"]]

### Preprocessing 

In [15]:
train_X,train_y = list(zip(*train_data))

In [17]:
list(zip([1, 2, 3], 
         [4, 5, 6]))

[(1, 4), (2, 5), (3, 6)]

### 1. Tokenize 

In [18]:
train_X = [kor_tagger.morphs(x) for x in train_X] # Tokenize morphemes 형태소

In [16]:
train_X

[['배고프', '다', '밥', '주', '어'],
 ['뭐', '먹', '을', '만하', 'ㄴ', '거', '없', '냐'],
 ['맛', '집', '추천'],
 ['이', '근처', '맛있', '는', '음식', '점', '좀'],
 ['밥', '주', '어'],
 ['뭐', '먹', '지', '?'],
 ['삼겹살', '먹', '고', '싶', '어'],
 ['영화', '보', '고', '싶', '다'],
 ['요즘', '볼만', '하', 'ㄴ', '거', '있', '어', '?'],
 ['영화', '나', '예능', '추천'],
 ['재밌', '는', '드라마', '보여주', '어'],
 ['신', '과', '함께', '줄거리', '좀', '알려주', '어'],
 ['고등', '랩', '푸', '어', '다시', '보', '기', '좀'],
 ['재밌', '는', '영상', '하이라이트', '만', '보여주', '어']]

### 2. Build Vocab 

In [20]:
word2index={'<unk>' : 0}
for x in train_X:
    for token in x:
        if word2index.get(token)==None:
            word2index[token]=len(word2index)
            
class2index = {'FOOD' : 0, 'MEDIA' : 1}
print(word2index)
print(class2index)

{'<unk>': 0, '배고프': 1, '다': 2, '밥': 3, '주': 4, '어': 5, '뭐': 6, '먹': 7, '을': 8, '만하': 9, 'ㄴ': 10, '거': 11, '없': 12, '냐': 13, '맛': 14, '집': 15, '추천': 16, '이': 17, '근처': 18, '맛있': 19, '는': 20, '음식': 21, '점': 22, '좀': 23, '지': 24, '?': 25, '삼겹살': 26, '고': 27, '싶': 28, '영화': 29, '보': 30, '요즘': 31, '볼만': 32, '하': 33, '있': 34, '나': 35, '예능': 36, '재밌': 37, '드라마': 38, '보여주': 39, '신': 40, '과': 41, '함께': 42, '줄거리': 43, '알려주': 44, '고등': 45, '랩': 46, '푸': 47, '다시': 48, '기': 49, '영상': 50, '하이라이트': 51, '만': 52}
{'FOOD': 0, 'MEDIA': 1}


In [39]:
len(word2index)

53

### 3. Prepare tensor 

In [25]:
word2index.get("패스트"), word2index.get("삼겹살")

(None, 26)

In [26]:
def make_BoW(seq, word2index):
    tensor = torch.zeros(len(word2index))
    for w in seq:
        index = word2index.get(w)
        if index!=None:
            tensor[index]+=1.
        else:
            index = word2index['<unk>']
            tensor[index]+=1.
    
    return tensor

In [27]:
train_X = torch.cat([Variable(make_BoW(x,word2index)).view(1,-1) for x in train_X])
train_y = torch.cat([Variable(torch.LongTensor([class2index[y]])) for y in train_y])

In [29]:
print(train_X.size(), train_y.size())

torch.Size([14, 53]) torch.Size([14])


## view(*args) → Tensor
### Returns a new tensor with the same data as the self tensor but of a different size.

The returned tensor shares the same data and must have the same number of elements, <br/>
but may have a different size. <br/>
For a tensor to be viewed, the new view size must be compatible with its original size and stride

In [32]:
x = torch.randn(4, 4)
print(x.size())

y = x.view(16)
print(y.size())

z = x.view(-1, 8)  # the size -1 is inferred from other dimensions
print(z.size())


torch.Size([4, 4])
torch.Size([16])
torch.Size([2, 8])


### 4. Modeling

In [33]:
train_X.size()

torch.Size([14, 53])

In [34]:
class BoWClassifier(nn.Module):
    def __init__(self,vocab_size,output_size):
        super(BoWClassifier,self).__init__()
        
        self.linear = nn.Linear(vocab_size,output_size)
    
    def forward(self,inputs):
        return self.linear(inputs)

### 5. Train 

In [35]:
STEP = 100
LR = 0.1
model = BoWClassifier(len(word2index),2)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(),lr=LR)

In [36]:
for step in range(STEP):
    model.zero_grad()
    preds = model(train_X)
    loss = loss_function(preds,train_y)
    if step % 10 == 0:
        print(loss.data[0])
    loss.backward()
    optimizer.step()

0.7514253854751587
0.5621613264083862
0.44524627923965454
0.36636489629745483
0.30981746315956116
0.267465204000473
0.2346717119216919
0.20860494673252106
0.18743959069252014
0.169948011636734


### 6. Test 

In [37]:
index2class = {v:k for k,v in class2index.items()}

In [38]:
for test in test_data:
    X = kor_tagger.morphs(test[0])
    X = Variable(make_BoW(X,word2index)).view(1,-1)
    
    pred = model(X)
    pred = pred.max(1)[1].data[0]
    print("Input : %s" % test[0])
    print("Prediction : %s" % index2class[pred])
    print("Truth : %s" % test[1])
    print("\n")

Input : 쭈꾸미 맛집 좀 찾아줘
Prediction : FOOD
Truth : FOOD


Input : 매콤한 떡볶이 먹고싶다
Prediction : FOOD
Truth : FOOD


Input : 강남 씨지비 조조 영화 스케줄표 좀
Prediction : MEDIA
Truth : MEDIA


Input : 효리네 민박 보고싶엉
Prediction : MEDIA
Truth : MEDIA


