# 2 실습: Word2Vec

1. 주어진 단어들을 word2vec 모델에 들어갈 수 있는 형태로 만듭니다.
2. CBOW, Skip-gram 모델을 각각 구현합니다.
3. 모델을 실제로 학습해보고 결과를 확인합니다.

<br>

## 2.1 필요 패키지 import

In [None]:
!pip install konlpy

Collecting konlpy
  Downloading konlpy-0.5.2-py2.py3-none-any.whl (19.4 MB)
[K     |████████████████████████████████| 19.4 MB 60.7 MB/s 
Collecting beautifulsoup4==4.6.0
  Downloading beautifulsoup4-4.6.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 4.3 MB/s 
[?25hCollecting colorama
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Collecting JPype1>=0.7.0
  Downloading JPype1-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (448 kB)
[K     |████████████████████████████████| 448 kB 29.1 MB/s 
Installing collected packages: JPype1, colorama, beautifulsoup4, konlpy
  Attempting uninstall: beautifulsoup4
    Found existing installation: beautifulsoup4 4.6.3
    Uninstalling beautifulsoup4-4.6.3:
      Successfully uninstalled beautifulsoup4-4.6.3
Successfully installed JPype1-1.3.0 beautifulsoup4-4.6.0 colorama-0.4.4 konlpy-0.5.2


In [None]:
from tqdm import tqdm
from konlpy.tag import Okt
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict

import torch
import copy
import numpy as np

<br>

## 2.2 데이터 전처리

- 데이터를 확인하고 Word2Vec 형식에 맞게 전처리합니다.  
- 학습 데이터는 1번 실습과 동일하고, 테스트를 위한 단어를 아래와 같이 가정해봅시다.

In [None]:
train_data = [
  "정말 맛있습니다. 추천합니다.",
  "기대했던 것보단 별로였네요.",
  "다 좋은데 가격이 너무 비싸서 다시 가고 싶다는 생각이 안 드네요.",
  "완전 최고입니다! 재방문 의사 있습니다.",
  "음식도 서비스도 다 만족스러웠습니다.",
  "위생 상태가 좀 별로였습니다. 좀 더 개선되기를 바랍니다.",
  "맛도 좋았고 직원분들 서비스도 너무 친절했습니다.",
  "기념일에 방문했는데 음식도 분위기도 서비스도 다 좋았습니다.",
  "전반적으로 음식이 너무 짰습니다. 저는 별로였네요.",
  "위생에 조금 더 신경 썼으면 좋겠습니다. 조금 불쾌했습니다."       
]

test_words = ["음식", "맛", "서비스", "위생", "가격"]

<br>

- Tokenization과 vocab을 만드는 과정은 이전 실습과 유사합니다.

In [None]:
tokenizer = Okt()

In [None]:
def make_tokenized(data):
    """Tokenize every sentence in ``data`` with the module-level ``tokenizer``.

    Each sentence is split with ``tokenizer.morphs(..., stem=True)``. The
    iteration is wrapped in ``tqdm`` so a progress bar is displayed.

    Args:
        data: Iterable of sentences to tokenize.

    Returns:
        A list holding one token list per input sentence, in input order.
    """
    return [tokenizer.morphs(sentence, stem=True) for sentence in tqdm(data)]

In [None]:
train_tokenized = make_tokenized(train_data)

100%|██████████| 10/10 [00:06<00:00,  1.50it/s]


In [None]:
# Count how often each token occurs across all tokenized training sentences.
word_count = defaultdict(int)

for sentence_tokens in tqdm(train_tokenized):
    for tok in sentence_tokens:
        word_count[tok] = word_count[tok] + 1

100%|██████████| 10/10 [00:00<00:00, 26035.41it/s]


In [None]:
# Rebind word_count to a list of (token, count) pairs ordered from most to
# least frequent. sorted() is stable, so tokens with equal counts keep the
# order in which they were first seen.
word_count = sorted(word_count.items(), key=lambda pair: pair[1], reverse=True)
print(word_count)

[('.', 14), ('도', 7), ('이다', 4), ('좋다', 4), ('별로', 3), ('다', 3), ('이', 3), ('너무', 3), ('음식', 3), ('서비스', 3), ('하다', 2), ('방문', 2), ('위생', 2), ('좀', 2), ('더', 2), ('에', 2), ('조금', 2), ('정말', 1), ('맛있다', 1), ('추천', 1), ('기대하다', 1), ('것', 1), ('보단', 1), ('가격', 1), ('비싸다', 1), ('다시', 1), ('가다', 1), ('싶다', 1), ('생각', 1), ('안', 1), ('드네', 1), ('요', 1), ('완전', 1), ('최고', 1), ('!', 1), ('재', 1), ('의사', 1), ('있다', 1), ('만족스럽다', 1), ('상태', 1), ('가', 1), ('개선', 1), ('되다', 1), ('기르다', 1), ('바라다', 1), ('맛', 1), ('직원', 1), ('분들', 1), ('친절하다', 1), ('기념일', 1), ('분위기', 1), ('전반', 1), ('적', 1), ('으로', 1), ('짜다', 1), ('저', 1), ('는', 1), ('신경', 1), ('써다', 1), ('불쾌하다', 1)]


In [None]:
# Build the word -> index vocabulary. Ids are handed out in the order of
# word_count, so the most frequent token receives id 0.
w2i = {}
for word, _count in tqdm(word_count):
    # setdefault only inserts when the word has no id yet.
    w2i.setdefault(word, len(w2i))

100%|██████████| 60/60 [00:00<00:00, 480263.82it/s]


In [None]:
print(train_tokenized)

[['정말', '맛있다', '.', '추천', '하다', '.'], ['기대하다', '것', '보단', '별로', '이다', '.'], ['다', '좋다', '가격', '이', '너무', '비싸다', '다시', '가다', '싶다', '생각', '이', '안', '드네', '요', '.'], ['완전', '최고', '이다', '!', '재', '방문', '의사', '있다', '.'], ['음식', '도', '서비스', '도', '다', '만족스럽다', '.'], ['위생', '상태', '가', '좀', '별로', '이다', '.', '좀', '더', '개선', '되다', '기르다', '바라다', '.'], ['맛', '도', '좋다', '직원', '분들', '서비스', '도', '너무', '친절하다', '.'], ['기념일', '에', '방문', '하다', '음식', '도', '분위기', '도', '서비스', '도', '다', '좋다', '.'], ['전반', '적', '으로', '음식', '이', '너무', '짜다', '.', '저', '는', '별로', '이다', '.'], ['위생', '에', '조금', '더', '신경', '써다', '좋다', '.', '조금', '불쾌하다', '.']]


In [None]:
print(w2i)

{'.': 0, '도': 1, '이다': 2, '좋다': 3, '별로': 4, '다': 5, '이': 6, '너무': 7, '음식': 8, '서비스': 9, '하다': 10, '방문': 11, '위생': 12, '좀': 13, '더': 14, '에': 15, '조금': 16, '정말': 17, '맛있다': 18, '추천': 19, '기대하다': 20, '것': 21, '보단': 22, '가격': 23, '비싸다': 24, '다시': 25, '가다': 26, '싶다': 27, '생각': 28, '안': 29, '드네': 30, '요': 31, '완전': 32, '최고': 33, '!': 34, '재': 35, '의사': 36, '있다': 37, '만족스럽다': 38, '상태': 39, '가': 40, '개선': 41, '되다': 42, '기르다': 43, '바라다': 44, '맛': 45, '직원': 46, '분들': 47, '친절하다': 48, '기념일': 49, '분위기': 50, '전반': 51, '적': 52, '으로': 53, '짜다': 54, '저': 55, '는': 56, '신경': 57, '써다': 58, '불쾌하다': 59}


<br>

## 2.3 Dataset 클래스 정의

- 실제 모델에 들어가기 위한 input을 만들기 위해 `Dataset` 클래스를 정의합니다.

In [None]:
class CBOWDataset(Dataset):
    """CBOW samples: the context word ids as input, the center word id as target.

    Tokens are converted to ids through the module-level ``w2i`` mapping.
    A position only yields a sample when a full window fits on both sides
    of it inside the sentence.
    """

    def __init__(self, train_tokenized, window_size=2):
        """Collect (context, center) samples from every sentence.

        Args:
            train_tokenized: Iterable of token lists, one per sentence.
            window_size: Number of context tokens taken on each side of
                the center token.
        """
        contexts = []
        centers = []

        for tokens in tqdm(train_tokenized):
            token_ids = [w2i[token] for token in tokens]
            for pos, center_id in enumerate(token_ids):
                left = pos - window_size
                right = pos + window_size
                # Skip positions whose window would leave the sentence.
                if left < 0 or right >= len(token_ids):
                    continue
                contexts.append(token_ids[left:pos] + token_ids[pos + 1:right + 1])
                centers.append(center_id)

        self.x = torch.LongTensor(contexts)  # (number of samples, 2 * window_size)
        self.y = torch.LongTensor(centers)  # (number of samples)

    def __len__(self):
        """Return the number of samples."""
        return self.x.size(0)

    def __getitem__(self, idx):
        """Return the ``(context ids, center id)`` pair stored at ``idx``."""
        context = self.x[idx]
        center = self.y[idx]
        return context, center

In [None]:
class SkipGramDataset(Dataset):
    """Skip-gram samples: a center word id as input, one context word id as target.

    Tokens are converted to ids through the module-level ``w2i`` mapping.
    A position only contributes samples when a full window fits on both
    sides of it inside the sentence; it then yields one (center, context)
    pair per context token.
    """

    def __init__(self, train_tokenized, window_size=2):
        """Collect (center, context) pairs from every sentence.

        Args:
            train_tokenized: Iterable of token lists, one per sentence.
            window_size: Number of context tokens taken on each side of
                the center token.
        """
        centers = []
        neighbors = []

        for tokens in tqdm(train_tokenized):
            token_ids = [w2i[token] for token in tokens]
            for pos, center_id in enumerate(token_ids):
                left = pos - window_size
                right = pos + window_size
                # Skip positions whose window would leave the sentence.
                if left < 0 or right >= len(token_ids):
                    continue
                neighbors.extend(token_ids[left:pos])
                neighbors.extend(token_ids[pos + 1:right + 1])
                # Repeat the center id once per context token.
                centers.extend([center_id] * 2 * window_size)

        self.x = torch.LongTensor(centers)  # (number of samples)
        self.y = torch.LongTensor(neighbors)  # (number of samples)

    def __len__(self):
        """Return the number of samples."""
        return self.x.size(0)

    def __getitem__(self, idx):
        """Return the ``(center id, context id)`` pair stored at ``idx``."""
        center = self.x[idx]
        neighbor = self.y[idx]
        return center, neighbor

<br>

- 각 모델에 맞는 Dataset 객체를 생성합니다.

In [None]:
cbow_set = CBOWDataset(train_tokenized)
skipgram_set = SkipGramDataset(train_tokenized)

100%|██████████| 10/10 [00:00<00:00, 33689.19it/s]
100%|██████████| 10/10 [00:00<00:00, 47393.27it/s]


In [None]:
print(list(skipgram_set))

[(tensor(0), tensor(17)), (tensor(0), tensor(18)), (tensor(0), tensor(19)), (tensor(0), tensor(10)), (tensor(19), tensor(18)), (tensor(19), tensor(0)), (tensor(19), tensor(10)), (tensor(19), tensor(0)), (tensor(22), tensor(20)), (tensor(22), tensor(21)), (tensor(22), tensor(4)), (tensor(22), tensor(2)), (tensor(4), tensor(21)), (tensor(4), tensor(22)), (tensor(4), tensor(2)), (tensor(4), tensor(0)), (tensor(23), tensor(5)), (tensor(23), tensor(3)), (tensor(23), tensor(6)), (tensor(23), tensor(7)), (tensor(6), tensor(3)), (tensor(6), tensor(23)), (tensor(6), tensor(7)), (tensor(6), tensor(24)), (tensor(7), tensor(23)), (tensor(7), tensor(6)), (tensor(7), tensor(24)), (tensor(7), tensor(25)), (tensor(24), tensor(6)), (tensor(24), tensor(7)), (tensor(24), tensor(25)), (tensor(24), tensor(26)), (tensor(25), tensor(7)), (tensor(25), tensor(24)), (tensor(25), tensor(26)), (tensor(25), tensor(27)), (tensor(26), tensor(24)), (tensor(26), tensor(25)), (tensor(26), tensor(27)), (tensor(26), tens

<br>

## 2.4 모델 Class 구현

- 차례대로 두 가지 Word2Vec 모델을 구현합니다.  
- `self.embedding`: `vocab_size` 크기의 one-hot vector를 특정 크기의 `dim` 차원으로 embedding 시키는 layer.
- `self.linear`: 변환된 embedding vector를 다시 원래 `vocab_size`로 바꾸는 layer.

In [None]:
class CBOW(nn.Module):
    """CBOW model: embed the context ids, sum them, project to vocabulary scores.

    Args:
        vocab_size: Number of rows in the embedding table and number of
            output scores.
        dim: Size of each embedding vector.
    """

    def __init__(self, vocab_size, dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, dim, sparse=True)
        self.linear = nn.Linear(dim, vocab_size)

    # B: batch size, W: window size, d_w: word embedding size, V: vocab size
    def forward(self, x):  # x: (B, 2W)
        """Return one score per vocabulary entry for each row of context ids."""
        context_vectors = self.embedding(x)  # (B, 2W, d_w)
        summed = context_vectors.sum(dim=1)  # (B, d_w)
        return self.linear(summed)  # (B, V)

In [None]:
class SkipGram(nn.Module):
    """Skip-gram model: embed the center id and project to vocabulary scores.

    Args:
        vocab_size: Number of rows in the embedding table and number of
            output scores.
        dim: Size of each embedding vector.
    """

    def __init__(self, vocab_size, dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, dim, sparse=True)
        self.linear = nn.Linear(dim, vocab_size)

    # B: batch size, W: window size, d_w: word embedding size, V: vocab size
    def forward(self, x):  # x: (B)
        """Return one score per vocabulary entry for each center id."""
        center_vectors = self.embedding(x)  # (B, d_w)
        return self.linear(center_vectors)  # (B, V)

<br>

- 두 가지 모델을 생성합니다.

In [None]:
# Both models are built with the same vocabulary size and embedding dimension.
model_kwargs = dict(vocab_size=len(w2i), dim=256)
cbow = CBOW(**model_kwargs)
skipgram = SkipGram(**model_kwargs)

<br>

## 2.5 모델 학습

### 2.5.1 DataLoader 정의

- 다음과 같이 hyperparameter를 세팅하고 `DataLoader` 객체를 만듭니다.

In [None]:
# Training hyperparameters shared by both models.
batch_size = 4
learning_rate = 5e-4
num_epochs = 5

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One loader per dataset; no shuffle argument is passed.
cbow_loader = DataLoader(cbow_set, batch_size=batch_size)
skipgram_loader = DataLoader(skipgram_set, batch_size=batch_size)

<br>

### 2.5.2 CBOW 모델 학습

In [None]:
# Train the CBOW model with SGD, using cross-entropy between the model's
# (B, V) scores and the center word ids.
cbow.train()
cbow = cbow.to(device)
optim = torch.optim.SGD(cbow.parameters(), lr=learning_rate)
loss_function = nn.CrossEntropyLoss()

for epoch in range(num_epochs):
    print("#" * 50)
    print(f"Epoch: {epoch + 1}")
    for x, y in tqdm(cbow_loader):
        x = x.to(device)  # (B, 2W) context ids
        y = y.to(device)  # (B) center ids

        # Clear gradients from the previous step before the new backward pass.
        optim.zero_grad()
        logits = cbow(x)  # (B, V)
        loss = loss_function(logits, y)
        loss.backward()
        optim.step()

        print(f"Train loss: {loss.item()}")

print("Finished.")

##################################################
Epoch: 1


100%|██████████| 16/16 [00:00<00:00, 138.24it/s]


Train loss: 4.054550647735596
Train loss: 5.67555046081543
Train loss: 5.198138236999512
Train loss: 4.369802951812744
Train loss: 5.133298873901367
Train loss: 5.997370719909668
Train loss: 5.482280731201172
Train loss: 5.186917304992676
Train loss: 4.893204212188721
Train loss: 4.759692192077637
Train loss: 5.073273181915283
Train loss: 6.199517726898193
Train loss: 4.551670074462891
Train loss: 5.207858085632324
Train loss: 5.625341892242432
Train loss: 4.308653354644775
##################################################
Epoch: 2


100%|██████████| 16/16 [00:00<00:00, 627.15it/s]


Train loss: 3.9145796298980713
Train loss: 5.526880264282227
Train loss: 5.061664581298828
Train loss: 4.24098014831543
Train loss: 4.999307155609131
Train loss: 5.667733192443848
Train loss: 5.278000831604004
Train loss: 5.0507612228393555
Train loss: 4.755553722381592
Train loss: 4.5520219802856445
Train loss: 4.918086528778076
Train loss: 5.777098178863525
Train loss: 4.378524303436279
Train loss: 5.077892303466797
Train loss: 5.428253173828125
Train loss: 4.175396919250488
##################################################
Epoch: 3


100%|██████████| 16/16 [00:00<00:00, 736.45it/s]


Train loss: 3.7776684761047363
Train loss: 5.380146503448486
Train loss: 4.926895618438721
Train loss: 4.114384174346924
Train loss: 4.8666672706604
Train loss: 5.349849224090576
Train loss: 5.0769147872924805
Train loss: 4.916474342346191
Train loss: 4.622776508331299
Train loss: 4.352174282073975
Train loss: 4.770570278167725
Train loss: 5.36696720123291
Train loss: 4.20764684677124
Train loss: 4.949923515319824
Train loss: 5.233956336975098
Train loss: 4.04570198059082
##################################################
Epoch: 4


100%|██████████| 16/16 [00:00<00:00, 616.01it/s]


Train loss: 3.643686056137085
Train loss: 5.235286712646484
Train loss: 4.793839454650879
Train loss: 3.9900174140930176
Train loss: 4.735400676727295
Train loss: 5.0454254150390625
Train loss: 4.879087448120117
Train loss: 4.784059524536133
Train loss: 4.494814872741699
Train loss: 4.160479545593262
Train loss: 4.630369186401367
Train loss: 4.970902442932129
Train loss: 4.039138317108154
Train loss: 4.823968887329102
Train loss: 5.042507171630859
Train loss: 3.9195713996887207
##################################################
Epoch: 5


100%|██████████| 16/16 [00:00<00:00, 712.05it/s]

Train loss: 3.5125274658203125
Train loss: 5.092247009277344
Train loss: 4.662514686584473
Train loss: 3.867877960205078
Train loss: 4.605532169342041
Train loss: 4.756099700927734
Train loss: 4.684622764587402
Train loss: 4.653524398803711
Train loss: 4.371607780456543
Train loss: 3.9773011207580566
Train loss: 4.4969658851623535
Train loss: 4.590827465057373
Train loss: 3.8731191158294678
Train loss: 4.700056076049805
Train loss: 4.853995323181152
Train loss: 3.7970173358917236
Finished.





<br>

### 2.5.3 Skip-gram 모델 학습

In [None]:
# Train the Skip-gram model with SGD, using cross-entropy between the model's
# (B, V) scores and the context word ids.
skipgram.train()
skipgram = skipgram.to(device)
optim = torch.optim.SGD(skipgram.parameters(), lr=learning_rate)
loss_function = nn.CrossEntropyLoss()

for epoch in range(num_epochs):
    print("#" * 50)
    print(f"Epoch: {epoch + 1}")
    for x, y in tqdm(skipgram_loader):
        x = x.to(device)  # (B) center ids
        y = y.to(device)  # (B) context ids

        # Clear gradients from the previous step before the new backward pass.
        optim.zero_grad()
        logits = skipgram(x)  # (B, V)
        loss = loss_function(logits, y)
        loss.backward()
        optim.step()

        print(f"Train loss: {loss.item()}")

print("Finished.")

##################################################
Epoch: 1


100%|██████████| 64/64 [00:00<00:00, 842.44it/s]


Train loss: 3.9033303260803223
Train loss: 4.652159214019775
Train loss: 4.0611982345581055
Train loss: 4.882054328918457
Train loss: 3.982480049133301
Train loss: 4.266542911529541
Train loss: 4.451872825622559
Train loss: 3.9264135360717773
Train loss: 4.0522918701171875
Train loss: 3.8711843490600586
Train loss: 3.8621044158935547
Train loss: 4.104809284210205
Train loss: 4.287799835205078
Train loss: 5.089155197143555
Train loss: 3.946296453475952
Train loss: 3.6794650554656982
Train loss: 4.245267391204834
Train loss: 4.141561508178711
Train loss: 4.5511393547058105
Train loss: 4.4197821617126465
Train loss: 3.967581272125244
Train loss: 4.299617767333984
Train loss: 4.084833145141602
Train loss: 4.1435227394104
Train loss: 4.628061294555664
Train loss: 4.901200771331787
Train loss: 3.741903066635132
Train loss: 4.589731693267822
Train loss: 4.560562610626221
Train loss: 4.7752275466918945
Train loss: 3.953383445739746
Train loss: 4.116762638092041
Train loss: 4.284392356872559
Tr

100%|██████████| 64/64 [00:00<00:00, 757.27it/s]


Train loss: 3.8831441402435303
Train loss: 4.602764129638672
Train loss: 4.03122615814209
Train loss: 4.822909355163574
Train loss: 3.951694965362549
Train loss: 4.2340569496154785
Train loss: 4.414535045623779
Train loss: 3.9008073806762695
Train loss: 4.021469593048096
Train loss: 3.8466506004333496
Train loss: 3.8413734436035156
Train loss: 4.075228691101074
Train loss: 4.263861656188965
Train loss: 5.0533552169799805
Train loss: 3.921790838241577
Train loss: 3.6568098068237305
Train loss: 4.208268642425537
Train loss: 4.109065055847168
Train loss: 4.522836685180664
Train loss: 4.388424873352051
Train loss: 3.876401662826538
Train loss: 4.210205078125
Train loss: 4.0284504890441895
Train loss: 4.114841461181641
Train loss: 4.593697547912598
Train loss: 4.8444929122924805
Train loss: 3.7071638107299805
Train loss: 4.5620927810668945
Train loss: 4.523254871368408
Train loss: 4.736072063446045
Train loss: 3.928731918334961
Train loss: 4.092088222503662
Train loss: 4.254177093505859
Tra

  0%|          | 0/64 [00:00<?, ?it/s]

Train loss: 3.8634145259857178
Train loss: 4.553588390350342
Train loss: 4.0014238357543945
Train loss: 4.7642669677734375
Train loss: 3.921130418777466
Train loss: 4.201992511749268


100%|██████████| 64/64 [00:00<00:00, 690.38it/s]


Train loss: 4.377692222595215
Train loss: 3.8753621578216553
Train loss: 3.9908132553100586
Train loss: 3.8222789764404297
Train loss: 3.820754051208496
Train loss: 4.04581880569458
Train loss: 4.24029016494751
Train loss: 5.0176801681518555
Train loss: 3.8974289894104004
Train loss: 3.6345226764678955
Train loss: 4.171518325805664
Train loss: 4.076775550842285
Train loss: 4.494784832000732
Train loss: 4.357227325439453
Train loss: 3.7870168685913086
Train loss: 4.12297248840332
Train loss: 3.972623109817505
Train loss: 4.086327075958252
Train loss: 4.5595903396606445
Train loss: 4.7882537841796875
Train loss: 3.6729190349578857
Train loss: 4.534857273101807
Train loss: 4.486227989196777
Train loss: 4.6972575187683105
Train loss: 3.9042255878448486
Train loss: 4.0675458908081055
Train loss: 4.224115371704102
Train loss: 4.327514171600342
Train loss: 4.236458778381348
Train loss: 4.138345241546631
Train loss: 3.7953412532806396
Train loss: 4.224229335784912
Train loss: 4.248685836791992

  0%|          | 0/64 [00:00<?, ?it/s]

Train loss: 3.844132423400879
Train loss: 4.504638195037842
Train loss: 3.971794605255127
Train loss: 4.7061309814453125
Train loss: 3.8907878398895264

100%|██████████| 64/64 [00:00<00:00, 774.48it/s]



Train loss: 4.170349597930908
Train loss: 4.341348171234131
Train loss: 3.8500795364379883
Train loss: 3.9603271484375
Train loss: 3.798069953918457
Train loss: 3.800245523452759
Train loss: 4.0165815353393555
Train loss: 4.217080593109131
Train loss: 4.982129096984863
Train loss: 3.873210906982422
Train loss: 3.612604856491089
Train loss: 4.135018825531006
Train loss: 4.044693946838379
Train loss: 4.466983318328857
Train loss: 4.3261895179748535
Train loss: 3.6995816230773926
Train loss: 4.038067817687988
Train loss: 3.917372465133667
Train loss: 4.057980060577393
Train loss: 4.5257415771484375
Train loss: 4.732486724853516
Train loss: 3.639176845550537
Train loss: 4.508022308349609
Train loss: 4.449483871459961
Train loss: 4.658787250518799
Train loss: 3.879864454269409
Train loss: 4.043135643005371
Train loss: 4.194211006164551
Train loss: 4.288120269775391
Train loss: 4.198476791381836
Train loss: 4.113302707672119
Train loss: 3.7414159774780273
Train loss: 4.176581859588623
Train

  0%|          | 0/64 [00:00<?, ?it/s]

Train loss: 3.825288772583008
Train loss: 4.455923080444336
Train loss: 3.94234037399292
Train loss: 4.648508071899414
Train loss: 3.860668420791626
Train loss: 4.139127254486084
Train loss: 4.305509567260742
Train loss: 3.824960231781006
Train loss: 3.9300127029418945


100%|██████████| 64/64 [00:00<00:00, 639.78it/s]

Train loss: 3.7740259170532227
Train loss: 3.779848575592041
Train loss: 3.987520217895508
Train loss: 4.194230079650879
Train loss: 4.946702003479004
Train loss: 3.8491363525390625
Train loss: 3.591054916381836
Train loss: 4.0987749099731445
Train loss: 4.012825012207031
Train loss: 4.439432621002197
Train loss: 4.295313835144043
Train loss: 3.6142609119415283
Train loss: 3.955641508102417
Train loss: 3.862720012664795
Train loss: 4.029802322387695
Train loss: 4.49215030670166
Train loss: 4.67719841003418
Train loss: 3.6059460639953613
Train loss: 4.481581211090088
Train loss: 4.413023471832275
Train loss: 4.620664119720459
Train loss: 3.85564923286438
Train loss: 4.018856048583984
Train loss: 4.164465427398682
Train loss: 4.249178886413574
Train loss: 4.160789489746094
Train loss: 4.088322639465332
Train loss: 3.689164400100708
Train loss: 4.130793571472168
Train loss: 4.180795669555664
Train loss: 3.760241746902466
Train loss: 4.104795455932617
Train loss: 4.411996841430664
Train lo




<br>

## 2.6 테스트

- 학습된 각 모델을 이용하여 test 단어들의 word embedding을 확인합니다.

In [None]:
# Print the CBOW embedding vector of each test word.
for word in test_words:
    word_index = w2i[word]
    input_id = torch.LongTensor([word_index]).to(device)
    vector = cbow.embedding(input_id).squeeze(0)  # drop the leading size-1 axis

    print(f"Word: {word}")
    print(vector)
    print()

Word: 음식
tensor([ 1.7712e-01,  1.7069e+00,  2.4660e-01,  9.0252e-01, -1.3213e-01,
        -1.1808e+00,  2.3575e+00, -9.1700e-01, -5.4660e-01,  1.1838e+00,
        -1.2626e+00, -1.1808e-01, -3.8100e-02,  7.3559e-01,  1.1051e+00,
         2.1922e+00,  1.0338e+00, -1.3936e+00,  7.9383e-01,  4.5002e-01,
         1.6132e+00, -2.0661e+00,  3.6536e-02,  9.0191e-01, -1.5138e-01,
        -5.7717e-01,  4.6874e-01,  8.3352e-01, -8.9390e-01, -1.0004e+00,
         6.9371e-01, -9.0990e-01, -1.6480e-01,  1.1343e+00, -5.5651e-01,
        -1.7073e-01,  8.8515e-01,  3.2971e-01, -1.0262e+00, -1.0740e-01,
         1.1418e+00, -1.0126e+00,  1.2385e+00, -1.5794e+00,  2.3313e-01,
         1.1163e-01, -1.0098e-01, -1.3666e+00,  9.5331e-01,  9.7725e-01,
         1.0431e+00,  3.2246e+00,  2.9030e-01, -6.6350e-01,  3.5262e-01,
        -1.3325e+00,  1.8576e+00,  1.7077e-01, -1.0483e+00, -6.3113e-01,
        -3.8485e-01, -1.2340e-02, -1.5933e+00, -4.6251e-01, -2.6286e-02,
        -4.1586e-01,  1.3341e+00,  8.2086e

In [None]:
# Print only the largest component of each test word's Skip-gram embedding.
for word in test_words:
    word_index = w2i[word]
    input_id = torch.LongTensor([word_index]).to(device)
    vector = skipgram.embedding(input_id).squeeze(0)  # drop the leading size-1 axis

    print(f"Word: {word}")
    # Builtin max iterates over the vector's elements and returns the largest.
    print(max(vector))
    print()

Word: 음식
tensor(4.1343, grad_fn=<UnbindBackward>)

Word: 맛
tensor(2.7054, grad_fn=<UnbindBackward>)

Word: 서비스
tensor(2.6690, grad_fn=<UnbindBackward>)

Word: 위생
tensor(2.6545, grad_fn=<UnbindBackward>)

Word: 가격
tensor(2.9495, grad_fn=<UnbindBackward>)



<br>

## 2.7 notebook to pdf

In [None]:
!apt-get install -qq texlive texlive-xetex texlive-latex-extra pandoc
!pip install -qq pypandoc

Extracting templates from packages: 100%
Preconfiguring packages ...
Selecting previously unselected package fonts-droid-fallback.
(Reading database ... 148492 files and directories currently installed.)
Preparing to unpack .../00-fonts-droid-fallback_1%3a6.0.1r16-1.1_all.deb ...
Unpacking fonts-droid-fallback (1:6.0.1r16-1.1) ...
Selecting previously unselected package fonts-lato.
Preparing to unpack .../01-fonts-lato_2.0-2_all.deb ...
Unpacking fonts-lato (2.0-2) ...
Selecting previously unselected package poppler-data.
Preparing to unpack .../02-poppler-data_0.4.8-2_all.deb ...
Unpacking poppler-data (0.4.8-2) ...
Selecting previously unselected package tex-common.
Preparing to unpack .../03-tex-common_6.09_all.deb ...
Unpacking tex-common (6.09) ...
Selecting previously unselected package fonts-lmodern.
Preparing to unpack .../04-fonts-lmodern_2.004.5-3_all.deb ...
Unpacking fonts-lmodern (2.004.5-3) ...
Selecting previously unselected package fonts-noto-mono.
Preparing to unpack .

In [None]:
!jupyter nbconvert --to PDF '/content/drive/MyDrive/boostcamp/04_U-stage/004_NLP/02강 - Word Embedding.ipynb'

[NbConvertApp] Converting notebook /content/drive/MyDrive/boostcamp/04_U-stage/004_NLP/02강 - Word Embedding.ipynb to PDF
[NbConvertApp] Writing 114350 bytes to ./notebook.tex
[NbConvertApp] Building PDF
[NbConvertApp] Running xelatex 3 times: [u'xelatex', u'./notebook.tex', '-quiet']
[NbConvertApp] CRITICAL | xelatex failed: [u'xelatex', u'./notebook.tex', '-quiet']
This is XeTeX, Version 3.14159265-2.6-0.99998 (TeX Live 2017/Debian) (preloaded format=xelatex)
 restricted \write18 enabled.
entering extended mode
(./notebook.tex
LaTeX2e <2017-04-15>
Babel <3.18> and hyphenation patterns for 3 language(s) loaded.
(/usr/share/texlive/texmf-dist/tex/latex/base/article.cls
Document Class: article 2014/09/29 v1.4h Standard LaTeX document class
(/usr/share/texlive/texmf-dist/tex/latex/base/size11.clo))
(/usr/share/texlive/texmf-dist/tex/latex/tcolorbox/tcolorbox.sty
(/usr/share/texlive/texmf-dist/tex/latex/pgf/basiclayer/pgf.sty
(/usr/share/texlive/texmf-dist/tex/latex/pgf/utilities/pgfrcs.