In [8]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F

import nltk
from konlpy.tag import Kkma
kor_tagger = Kkma()

## Word Similarity

In [9]:
pytorch = torch.Tensor([0.6,-0.2,0.7,0.3,0.7,-0.2,0.1,0.1])
tensorflow = torch.Tensor([0.4,-0.1,0.6,-0.2,0.6,-0.2,0.3,0.4])
cat = torch.Tensor([-0.3,0.2,0.1,0.2,-0.2,0.1,-0.3,0.1])

In [10]:
pytorch.dot(tensorflow) # inner product = word similarity

1.1500000953674316

In [11]:
pytorch.dot(cat)

-0.26999998092651367

Returns cosine similarity between x1 and x2, computed along dim.<br/>

\\(similarity= \frac{x_{1}\cdot x_{2}}{max(\left\|x_{1}\right\|_{2}\cdot\left\|x_2\right\|_2,\epsilon)}\\)

<br/>

Parameters:	<br/>
- x1 (Tensor) – First input.<br/>
- x2 (Tensor) – Second input (of size matching x1).<br/>
- dim (int, optional) – Dimension of vectors. Default: 1<br/>
- eps (float, optional) – Small value to avoid division by zero. Default: 1e-8

In [12]:
F.cosine_similarity(pytorch.view(1,-1),tensorflow.view(1,-1))


 0.8417
[torch.FloatTensor of size 1]

In [20]:
F.cosine_similarity(pytorch.view(1,-1),cat.view(1,-1))


-0.3800
[torch.FloatTensor of size 1]

## nn.Embedding() 
##### torch.nn.Embedding(num_embeddings, embedding_dim, padding_idx=None, max_norm=None, norm_type=2, scale_grad_by_freq=False, sparse=False, _weight=None)

A simple lookup table that stores embeddings of a fixed dictionary and size.<br/>
This module is often used to store word embeddings and retrieve them using indices. <br/>The input to the module is a list of indices, and the output is the corresponding word embeddings.

> Parameters:	
- num_embeddings (int) – size of the dictionary of embeddings
- embedding_dim (int) – the size of each embedding vector
- padding_idx (int, optional) – If given, pads the output with the embedding vector at - padding_idx (initialized to zeros) whenever it encounters the index.
- max_norm (float, optional) – If given, will renormalize the embeddings to always have a norm lesser than this
- norm_type (float, optional) – The p of the p-norm to compute for the max_norm option
- scale_grad_by_freq (bool, optional) – if given, this will scale gradients by the frequency of the words in the mini-batch.
- sparse (bool, optional) – if True, gradient w.r.t. weight matrix will be a sparse tensor. See Notes for more details regarding sparse gradients.

Output: (*, embedding_dim), where * is the input shape

In [14]:
embed = nn.Embedding(7,5) # 총 단어 갯수, 임베딩 시킬 차원
embed.weight

Parameter containing:
 2.1016 -1.0317  0.1484  0.1075  1.0484
-0.6429  0.8057 -0.3392 -0.1415 -1.1187
 0.6939  0.4873 -1.1391 -0.5029  0.7813
-0.2998 -0.7556  0.4820 -0.8570  0.1664
-0.6996 -0.0582  1.9101  0.6987  1.2526
 1.4845 -0.3605 -0.8905 -0.9029  0.8305
-0.3070 -1.1148 -0.0758 -2.7965 -0.0669
[torch.FloatTensor of size 7x5]

In [15]:
apple = torch.LongTensor([2]) # 2은 사과의 인덱스
embed(Variable(apple))

Variable containing:
 0.6939  0.4873 -1.1391 -0.5029  0.7813
[torch.FloatTensor of size 1x5]

In [16]:
sentence = torch.LongTensor([1,3,5]) # 단어의 시퀀스(문장)
embed(Variable(sentence))

Variable containing:
-0.6429  0.8057 -0.3392 -0.1415 -1.1187
-0.2998 -0.7556  0.4820 -0.8570  0.1664
 1.4845 -0.3605 -0.8905 -0.9029  0.8305
[torch.FloatTensor of size 3x5]

### nn.EmbeddingBag 

각 워드 벡터를 합하거나 평균해서 하나의 벡터를 반환 ==> Sentence Embedding

In [17]:
embedding_sum = nn.EmbeddingBag(7, 5, mode='mean') # mean or sum
sentence = Variable(torch.LongTensor([[1,3,5]])) # 토큰 1,3,5로 이루어진 문장
embedding_sum(sentence)

Variable containing:
-1.1083 -1.0966 -0.0888  0.5519  0.1315
[torch.FloatTensor of size 1x5]

## TODO 

- 다음의 코퍼스를 형태소로 tokenize하고 Vocabulary를 구축하라(Word2index)
- 각 단어를 10차원으로 임베딩하기 위한 nn.Embedding을 선언하라
- "토치"와 "텐서플로우"라는 단어를 Embedding matrix에서 읽어서 두 벡터를 내적하라

In [18]:
corpus = ["안녕하세요 저는 파이토치를 공부중입니다.","파이토치는 딥러닝 라이브러리이다", "파이토치와 유사한 것으로 텐서플로우와 케라스 등이 있다",
         "파이토치는 정말 쉽다","텐서플로우는 구글이 만들었다","페북에서는 파이토치를 만들었다", "파이썬과 쉽게 사용할 수 있는 것이 장점이다",
         "그 중 특히 자연어처리할 때 파이토치가 좋다","하지만 아직 베타 버전인게 파이토치의 단점이다","원래는 루아라는 언어로 만들어진 토치라는 프레임워크였다",
         "텐서플로우나 파이토치는 자동미분 기능을 제공한다", "이를 이용하면 딥러닝 모델을 쉽게 만들 수 있다"]

In [19]:
tokenized = [kor_tagger.morphs(c) for c in corpus]

In [20]:
tokenized

[['안녕', '하', '세요', '저', '는', '파이', '토치', '를', '공부', '중', '이', 'ㅂ니다', '.'],
 ['파이', '토치', '는', '딥', '러닝', '라이브러리', '이', '다'],
 ['파이',
  '토치',
  '와',
  '유사',
  '하',
  'ㄴ',
  '것',
  '으로',
  '텐서플로우',
  '와',
  '하',
  '게',
  '라스',
  '등',
  '이',
  '있',
  '다'],
 ['파이', '토치', '는', '정말', '쉽', '다'],
 ['텐서플로우', '는', '구', '글', '이', '만들', '었', '다'],
 ['페북에서', '는', '파이', '토치', '를', '만들', '었', '다'],
 ['파이',
  '썰',
  'ㄴ',
  '과',
  '쉽',
  '게',
  '사용',
  '하',
  'ㄹ',
  '수',
  '있',
  '는',
  '것',
  '이',
  '장점',
  '이',
  '다'],
 ['그', '중', '특히', '자연어', '처리', '하', 'ㄹ', '때', '파이', '토치', '가', '좋', '다'],
 ['하지만',
  '아직',
  '베타',
  '버전',
  '이',
  'ㄴ',
  '것',
  '이',
  '파이',
  '토치',
  '의',
  '단점',
  '이',
  '다'],
 ['원래',
  '는',
  '루',
  '아',
  '이',
  '라는',
  '언어',
  '로',
  '만들',
  '어',
  '지',
  'ㄴ',
  '토치',
  '이',
  '라는',
  '프레임',
  '워크',
  '이',
  '었',
  '다'],
 ['텐서플로우', '나', '파이', '토치', '는', '자동', '미분', '기능', '을', '제공', '하', 'ㄴ다'],
 ['이르',
  'ㄹ',
  '이용',
  '하',
  '면',
  '딥',
  '러닝',
  '모델',
  '을',
  '쉽',
  '게',
  '만

In [27]:
word2index={}
for sent in tokenized:
    for word in sent:
        if word2index.get(word)==None:
            word2index[word]=len(word2index)

In [28]:
word2index

{'안녕': 0,
 '하': 1,
 '세요': 2,
 '저': 3,
 '는': 4,
 '파이': 5,
 '토치': 6,
 '를': 7,
 '공부': 8,
 '중': 9,
 '이': 10,
 'ㅂ니다': 11,
 '.': 12,
 '딥': 13,
 '러닝': 14,
 '라이브러리': 15,
 '다': 16,
 '와': 17,
 '유사': 18,
 'ㄴ': 19,
 '것': 20,
 '으로': 21,
 '텐서플로우': 22,
 '게': 23,
 '라스': 24,
 '등': 25,
 '있': 26,
 '정말': 27,
 '쉽': 28,
 '구': 29,
 '글': 30,
 '만들': 31,
 '었': 32,
 '페북에서': 33,
 '썰': 34,
 '과': 35,
 '사용': 36,
 'ㄹ': 37,
 '수': 38,
 '장점': 39,
 '그': 40,
 '특히': 41,
 '자연어': 42,
 '처리': 43,
 '때': 44,
 '가': 45,
 '좋': 46,
 '하지만': 47,
 '아직': 48,
 '베타': 49,
 '버전': 50,
 '의': 51,
 '단점': 52,
 '원래': 53,
 '루': 54,
 '아': 55,
 '라는': 56,
 '언어': 57,
 '로': 58,
 '어': 59,
 '지': 60,
 '프레임': 61,
 '워크': 62,
 '나': 63,
 '자동': 64,
 '미분': 65,
 '기능': 66,
 '을': 67,
 '제공': 68,
 'ㄴ다': 69,
 '이르': 70,
 '이용': 71,
 '면': 72,
 '모델': 73}

In [22]:
len(word2index)

74

In [23]:
matrix = nn.Embedding(len(word2index),10)

In [24]:
matrix.weight.size()

torch.Size([74, 10])

In [29]:
word2index['러닝']

14

In [36]:
pytorch_tensor = Variable(torch.LongTensor([word2index['러닝']]))
tensorflow_tensor = Variable(torch.LongTensor([word2index['자연어']]))
pytorch_tensor, tensorflow_tensor

(Variable containing:
  14
 [torch.LongTensor of size 1], Variable containing:
  42
 [torch.LongTensor of size 1])

In [37]:
pytorch_vector = matrix(pytorch_tensor)
tensorflow_vector = matrix(tensorflow_tensor)
pytorch_vector,tensorflow_vector

(Variable containing:
  0.7360  0.0456  1.3025  0.3129  0.2624  1.8678  1.0026 -0.5596 -0.3155 -0.4390
 [torch.FloatTensor of size 1x10], Variable containing:
 -1.0361  0.3765 -0.0546 -0.1214  0.3796 -1.2015  0.9841  1.5352 -0.9129  0.2730
 [torch.FloatTensor of size 1x10])

In [32]:
print(pytorch_vector.size(),tensorflow_vector.size())

torch.Size([1, 10]) torch.Size([1, 10])


In [33]:
pytorch_vector.dot(tensorflow_vector)

Variable containing:
-2.7035
[torch.FloatTensor of size 1]

In [38]:
F.cosine_similarity(pytorch_vector,tensorflow_vector)

Variable containing:
-0.3715
[torch.FloatTensor of size 1]