## **2. Word2Vec**
1. 주어진 단어들을 word2vec 모델에 들어갈 수 있는 형태로 만듭니다.
2. CBOW, Skip-gram 모델을 각각 구현합니다.
3. 모델을 실제로 학습해보고 결과를 확인합니다.

### **필요 패키지 import**

In [1]:
!pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 19.4/19.4 MB 47.5 MB/s eta 0:00:00
Collecting JPype1>=0.7.0 (from konlpy)
  Downloading JPype1-1.4.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (465 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 465.3/465.3 kB 36.0 MB/s eta 0:00:00
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.4.1 konlpy-0.6.0


In [2]:
from tqdm.auto import tqdm

from konlpy.tag import Okt

from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader

from collections import defaultdict

import torch
import copy
import numpy as np

### **데이터 전처리**

데이터를 확인하고 Word2Vec 형식에 맞게 전처리합니다.  
학습 데이터는 1번 실습과 동일하고, 테스트를 위한 단어를 아래와 같이 가정해봅시다.

In [3]:
# Ten short Korean restaurant reviews used as the training corpus.
train_data = [
    "정말 맛있습니다. 추천합니다.",
    "기대했던 것보단 별로였네요.",
    "다 좋은데 가격이 너무 비싸서 다시 가고 싶다는 생각이 안 드네요.",
    "완전 최고입니다! 재방문 의사 있습니다.",
    "음식도 서비스도 다 만족스러웠습니다.",
    "위생 상태가 좀 별로였습니다. 좀 더 개선되기를 바랍니다.",
    "맛도 좋았고 직원분들 서비스도 너무 친절했습니다.",
    "기념일에 방문했는데 음식도 분위기도 서비스도 다 좋았습니다.",
    "전반적으로 음식이 너무 짰습니다. 저는 별로였네요.",
    "위생에 조금 더 신경 썼으면 좋겠습니다. 조금 불쾌했습니다.",
]

# Words whose learned embeddings are printed at the end of the notebook.
test_words = [
    "음식",
    "맛",
    "서비스",
    "위생",
    "가격",
]

Tokenization과 vocab을 만드는 과정은 이전 실습과 유사합니다.

In [4]:
# Okt tagger imported from konlpy.tag; its `morphs` method does the tokenization below.
tokenizer = Okt()

In [5]:
def make_tokenized(data):
    """Tokenize every sentence in ``data`` with the module-level ``tokenizer``.

    Each sentence is passed to ``tokenizer.morphs`` with ``stem = True`` while
    a tqdm progress bar tracks the iteration.

    Returns:
        A list holding one token list per input sentence, in input order.
    """
    return [tokenizer.morphs(sentence, stem = True) for sentence in tqdm(data)]

In [6]:
# Tokenize the training sentences: one token list per sentence.
train_tokenized = make_tokenized(train_data)

  0%|          | 0/10 [00:00<?, ?it/s]

In [7]:
# Frequency of every token across all tokenized training sentences.
# defaultdict(int) makes an unseen token start from 0.
word_count = defaultdict(int)

for tokens in tqdm(train_tokenized):
    for token in tokens:
        word_count[token] = word_count[token] + 1


  0%|          | 0/10 [00:00<?, ?it/s]

In [8]:
# Rebind word_count to a list of (word, count) pairs, most frequent first.
# Python's sort is stable, so words with equal counts keep their first-seen order.
word_count = sorted(
    word_count.items(),
    key = lambda item: item[1],
    reverse = True,
)

print(word_count)

[('.', 14), ('도', 7), ('이다', 4), ('좋다', 4), ('별로', 3), ('다', 3), ('이', 3), ('너무', 3), ('음식', 3), ('서비스', 3), ('하다', 2), ('방문', 2), ('위생', 2), ('좀', 2), ('더', 2), ('에', 2), ('조금', 2), ('정말', 1), ('맛있다', 1), ('추천', 1), ('기대하다', 1), ('것', 1), ('보단', 1), ('가격', 1), ('비싸다', 1), ('다시', 1), ('가다', 1), ('싶다', 1), ('생각', 1), ('안', 1), ('드네', 1), ('요', 1), ('완전', 1), ('최고', 1), ('!', 1), ('재', 1), ('의사', 1), ('있다', 1), ('만족스럽다', 1), ('상태', 1), ('가', 1), ('개선', 1), ('되다', 1), ('기르다', 1), ('바라다', 1), ('맛', 1), ('직원', 1), ('분들', 1), ('친절하다', 1), ('기념일', 1), ('분위기', 1), ('전반', 1), ('적', 1), ('으로', 1), ('짜다', 1), ('저', 1), ('는', 1), ('신경', 1), ('써다', 1), ('불쾌하다', 1)]


In [9]:
# Word -> integer id mapping. Ids follow the frequency-sorted order of
# word_count, so the most frequent word receives id 0.
w2i = {}

for pair in tqdm(word_count):
    # setdefault only inserts when the word is new; the id is the current vocab size.
    w2i.setdefault(pair[0], len(w2i))

  0%|          | 0/60 [00:00<?, ?it/s]

In [10]:
# Inspect the tokenized sentences and the word-to-id mapping.
print(train_tokenized)

print(w2i)

[['정말', '맛있다', '.', '추천', '하다', '.'], ['기대하다', '것', '보단', '별로', '이다', '.'], ['다', '좋다', '가격', '이', '너무', '비싸다', '다시', '가다', '싶다', '생각', '이', '안', '드네', '요', '.'], ['완전', '최고', '이다', '!', '재', '방문', '의사', '있다', '.'], ['음식', '도', '서비스', '도', '다', '만족스럽다', '.'], ['위생', '상태', '가', '좀', '별로', '이다', '.', '좀', '더', '개선', '되다', '기르다', '바라다', '.'], ['맛', '도', '좋다', '직원', '분들', '서비스', '도', '너무', '친절하다', '.'], ['기념일', '에', '방문', '하다', '음식', '도', '분위기', '도', '서비스', '도', '다', '좋다', '.'], ['전반', '적', '으로', '음식', '이', '너무', '짜다', '.', '저', '는', '별로', '이다', '.'], ['위생', '에', '조금', '더', '신경', '써다', '좋다', '.', '조금', '불쾌하다', '.']]
{'.': 0, '도': 1, '이다': 2, '좋다': 3, '별로': 4, '다': 5, '이': 6, '너무': 7, '음식': 8, '서비스': 9, '하다': 10, '방문': 11, '위생': 12, '좀': 13, '더': 14, '에': 15, '조금': 16, '정말': 17, '맛있다': 18, '추천': 19, '기대하다': 20, '것': 21, '보단': 22, '가격': 23, '비싸다': 24, '다시': 25, '가다': 26, '싶다': 27, '생각': 28, '안': 29, '드네': 30, '요': 31, '완전': 32, '최고': 33, '!': 34, '재': 35, '의사': 36, '있다': 37, '만족스럽다': 38, '상태

실제 모델에 들어가기 위한 input을 만들기 위해 `Dataset` 클래스를 정의합니다.

In [13]:
# (context, center)
class CBOWDataset(Dataset):
    """CBOW samples: surrounding token ids as input, the center token id as target.

    Token ids are looked up in the module-level ``w2i`` mapping. A position
    only yields a sample when it has ``window_size`` tokens on both sides, so
    every input row holds exactly ``2 * window_size`` ids.

    Args:
        train_tokenized: iterable of token lists, one per sentence.
        window_size: number of context tokens taken on each side of the center.
    """

    def __init__(self, train_tokenized, window_size = 2):
        contexts = []
        centers = []

        for sentence in tqdm(train_tokenized):
            ids = [w2i[tok] for tok in sentence]

            for pos, center in enumerate(ids):
                # Skip positions whose window would run past either end of the sentence.
                if pos - window_size < 0 or pos + window_size >= len(ids):
                    continue

                left = ids[pos - window_size:pos]
                right = ids[pos + 1:pos + window_size + 1]
                contexts.append(left + right)
                centers.append(center)

        self.x = torch.LongTensor(contexts)  # (num_samples, 2 * window_size)
        self.y = torch.LongTensor(centers)   # (num_samples)

    def __len__(self):
        """Return the number of stored samples."""
        return self.x.size(0)

    def __getitem__(self, idx):
        """Return the ``(context_ids, center_id)`` tensors stored at ``idx``."""
        context, center = self.x[idx], self.y[idx]
        return context, center


In [14]:
# (center, context)
class SkipGramDataset(Dataset):
    """Skip-gram samples: one (center id, context id) pair per context token.

    Token ids are looked up in the module-level ``w2i`` mapping. A position
    only contributes when it has ``window_size`` tokens on both sides; it then
    produces ``2 * window_size`` pairs that all share the same center id.

    Args:
        train_tokenized: iterable of token lists, one per sentence.
        window_size: number of context tokens taken on each side of the center.
    """

    def __init__(self, train_tokenized, window_size = 2):
        centers = []
        neighbours = []

        for sentence in tqdm(train_tokenized):
            ids = [w2i[tok] for tok in sentence]

            for pos, center in enumerate(ids):
                # Skip positions whose window would run past either end of the sentence.
                if pos - window_size < 0 or pos + window_size >= len(ids):
                    continue

                window = ids[pos - window_size:pos] + ids[pos + 1:pos + window_size + 1]
                # Repeat the center id once for every context id it is paired with.
                centers.extend([center] * 2 * window_size)
                neighbours.extend(window)

        self.x = torch.LongTensor(centers)     # (num_samples)
        self.y = torch.LongTensor(neighbours)  # (num_samples)

    def __len__(self):
        """Return the number of stored (center, context) pairs."""
        return self.x.size(0)

    def __getitem__(self, idx):
        """Return the ``(center_id, context_id)`` tensors stored at ``idx``."""
        center, context = self.x[idx], self.y[idx]
        return center, context

각 모델에 맞는 `Dataset` 객체를 생성합니다.

In [15]:
# Build both datasets from the same tokenized sentences (default window_size = 2).
cbow_set = CBOWDataset(train_tokenized)

skipgram_set = SkipGramDataset(train_tokenized)

# Materialize the skip-gram dataset to inspect its (center id, context id) pairs.
print(list(skipgram_set))

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

[(tensor(0), tensor(17)), (tensor(0), tensor(18)), (tensor(0), tensor(19)), (tensor(0), tensor(10)), (tensor(19), tensor(18)), (tensor(19), tensor(0)), (tensor(19), tensor(10)), (tensor(19), tensor(0)), (tensor(22), tensor(20)), (tensor(22), tensor(21)), (tensor(22), tensor(4)), (tensor(22), tensor(2)), (tensor(4), tensor(21)), (tensor(4), tensor(22)), (tensor(4), tensor(2)), (tensor(4), tensor(0)), (tensor(23), tensor(5)), (tensor(23), tensor(3)), (tensor(23), tensor(6)), (tensor(23), tensor(7)), (tensor(6), tensor(3)), (tensor(6), tensor(23)), (tensor(6), tensor(7)), (tensor(6), tensor(24)), (tensor(7), tensor(23)), (tensor(7), tensor(6)), (tensor(7), tensor(24)), (tensor(7), tensor(25)), (tensor(24), tensor(6)), (tensor(24), tensor(7)), (tensor(24), tensor(25)), (tensor(24), tensor(26)), (tensor(25), tensor(7)), (tensor(25), tensor(24)), (tensor(25), tensor(26)), (tensor(25), tensor(27)), (tensor(26), tensor(24)), (tensor(26), tensor(25)), (tensor(26), tensor(27)), (tensor(26), tens

### **모델 Class 구현**

차례대로 두 가지 Word2Vec 모델을 구현합니다.  


*   `self.embedding`: `vocab_size` 크기의 one-hot vector를 특정 크기의 `dim` 차원으로 embedding 시키는 layer.
*   `self.linear`: 변환된 embedding vector를 다시 원래 `vocab_size`로 바꾸는 layer.

In [22]:
class CBOW(nn.Module):
    """Continuous bag-of-words model: context token ids -> logits over the vocabulary.

    Shape legend used below:
        B: batch size, W: window size, d_w: embedding size, V: vocabulary size.
    """

    def __init__(self, vocab_size, dim):
        """Create the embedding table (V x d_w) and the output projection (d_w -> V)."""
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=dim,
            sparse=True,
        )
        self.linear = nn.Linear(in_features=dim, out_features=vocab_size)

    def forward(self, x):
        """Map context ids ``x`` of shape (B, 2W) to logits of shape (B, V)."""
        context_vectors = self.embedding(x)        # (B, 2W, d_w)
        # Sum the context embeddings into a single vector per sample.
        pooled = context_vectors.sum(dim=1)        # (B, d_w)
        return self.linear(pooled)                 # (B, V)

In [17]:
class SkipGram(nn.Module):
    """Skip-gram model: a center token id -> logits over the vocabulary.

    Shape legend used below:
        B: batch size, d_w: embedding size, V: vocabulary size.
    """

    def __init__(self, vocab_size, dim):
        """Create the embedding table (V x d_w) and the output projection (d_w -> V)."""
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=dim,
            sparse=True,
        )
        self.linear = nn.Linear(in_features=dim, out_features=vocab_size)

    def forward(self, x):
        """Map center ids ``x`` of shape (B) to logits of shape (B, V)."""
        center_vectors = self.embedding(x)   # (B, d_w)
        return self.linear(center_vectors)   # (B, V)

두 가지 모델을 생성합니다.

In [23]:
# Both models get one output logit per vocabulary entry and 256-dimensional embeddings.
cbow = CBOW(vocab_size = len(w2i), dim = 256)

skipgram = SkipGram(vocab_size = len(w2i), dim = 256)

In [19]:
# Vocabulary size; the notebook echoes this value as the cell output.
len(w2i)

60

### **모델 학습**

다음과 같이 hyperparameter를 세팅하고 `DataLoader` 객체를 만듭니다.

In [24]:
# Training hyperparameters shared by both models.
batch_size = 4
learning_rate = 5e-4
num_epochs = 5

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Mini-batch loaders over the two datasets (default DataLoader settings otherwise).
cbow_loader = DataLoader(dataset=cbow_set, batch_size=batch_size)
skipgram_loader = DataLoader(dataset=skipgram_set, batch_size=batch_size)

첫번째로 CBOW 모델 학습입니다.

In [25]:
# Train the CBOW model: plain SGD on the cross-entropy between the predicted
# vocabulary logits and the center-word id.
cbow.train()

cbow = cbow.to(device)

optim = torch.optim.SGD(cbow.parameters(), lr = learning_rate)

loss_function = nn.CrossEntropyLoss()

for e in range(1,num_epochs + 1):

    print("#"*50)
    print(f"Epoch: {e}")

    for batch in tqdm(cbow_loader):

        x,y = batch
        x,y = x.to(device), y.to(device) # x: (B, 2W), y: (B)
        output = cbow(x) # (B, V)

        # Clear gradients left from the previous step before backpropagating.
        optim.zero_grad()
        loss = loss_function(output, y)
        loss.backward()
        optim.step()

        # Loss of the current mini-batch (printed once per batch).
        print(f"Train loss: {loss.item()}")

print("Finished.")

##################################################
Epoch: 1


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 4.998669624328613
Train loss: 5.055840492248535
Train loss: 4.695262908935547
Train loss: 6.018867015838623
Train loss: 5.089680194854736
Train loss: 4.704496383666992
Train loss: 4.442612648010254
Train loss: 4.824930191040039
Train loss: 5.106001377105713
Train loss: 4.790855407714844
Train loss: 3.6926157474517822
Train loss: 3.951298236846924
Train loss: 4.242554187774658
Train loss: 4.4338178634643555
Train loss: 4.319639682769775
Train loss: 4.885290145874023
##################################################
Epoch: 2


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 4.836484432220459
Train loss: 4.90249490737915
Train loss: 4.565328598022461
Train loss: 5.880249500274658
Train loss: 4.958094596862793
Train loss: 4.455463886260986
Train loss: 4.288689136505127
Train loss: 4.694179534912109
Train loss: 4.990781784057617
Train loss: 4.606451034545898
Train loss: 3.560987949371338
Train loss: 3.618292808532715
Train loss: 4.093495845794678
Train loss: 4.328906536102295
Train loss: 4.167192459106445
Train loss: 4.751956939697266
##################################################
Epoch: 3


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 4.676461696624756
Train loss: 4.7511396408081055
Train loss: 4.436976909637451
Train loss: 5.74268102645874
Train loss: 4.827861309051514
Train loss: 4.214081287384033
Train loss: 4.137861251831055
Train loss: 4.565401554107666
Train loss: 4.879560470581055
Train loss: 4.426187515258789
Train loss: 3.4344606399536133
Train loss: 3.2994627952575684
Train loss: 3.946713924407959
Train loss: 4.2258124351501465
Train loss: 4.017054557800293
Train loss: 4.622084617614746
##################################################
Epoch: 4


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 4.518679618835449
Train loss: 4.601819038391113
Train loss: 4.310239791870117
Train loss: 5.606167793273926
Train loss: 4.698976516723633
Train loss: 3.9818387031555176
Train loss: 3.9902877807617188
Train loss: 4.438615798950195
Train loss: 4.772646903991699
Train loss: 4.250661849975586
Train loss: 3.314117670059204
Train loss: 2.998009443283081
Train loss: 3.8022937774658203
Train loss: 4.124486923217773
Train loss: 3.8693246841430664
Train loss: 4.495598793029785
##################################################
Epoch: 5


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 4.363227844238281
Train loss: 4.454581260681152
Train loss: 4.185157299041748
Train loss: 5.47071647644043
Train loss: 4.57144021987915
Train loss: 3.7603940963745117
Train loss: 3.8461337089538574
Train loss: 4.313839912414551
Train loss: 4.670224666595459
Train loss: 4.08051061630249
Train loss: 3.2010574340820312
Train loss: 2.717442750930786
Train loss: 3.6603312492370605
Train loss: 4.02488899230957
Train loss: 3.724113702774048
Train loss: 4.372406005859375
Finished.


다음으로 Skip-gram 모델 학습입니다.

In [26]:
# Train the Skip-gram model: plain SGD on the cross-entropy between the
# predicted vocabulary logits and the context-word id.
skipgram.train()
skipgram = skipgram.to(device)

optim = torch.optim.SGD(skipgram.parameters(), lr = learning_rate)

loss_function = nn.CrossEntropyLoss()

for e in range(1,num_epochs + 1):

    print("#"*50)
    print(f"Epoch: {e}")

    for batch in tqdm(skipgram_loader):

        x,y = batch
        x,y = x.to(device), y.to(device) # x: (B), y: (B)

        output = skipgram(x) #(B,V)

        # Clear gradients left from the previous step before backpropagating.
        optim.zero_grad()
        loss = loss_function(output, y)
        loss.backward()
        optim.step()

        # Loss of the current mini-batch (printed once per batch).
        print(f"Train loss: {loss.item()}")

print("Finished.")

##################################################
Epoch: 1


  0%|          | 0/64 [00:00<?, ?it/s]

Train loss: 4.517376899719238
##################################################
Epoch: 2


  0%|          | 0/64 [00:00<?, ?it/s]

Train loss: 4.47509765625
##################################################
Epoch: 3


  0%|          | 0/64 [00:00<?, ?it/s]

Train loss: 4.433172225952148
##################################################
Epoch: 4


  0%|          | 0/64 [00:00<?, ?it/s]

Train loss: 4.391600608825684
##################################################
Epoch: 5


  0%|          | 0/64 [00:00<?, ?it/s]

Train loss: 4.350382328033447
Finished.


### **테스트**

학습된 각 모델을 이용하여 test 단어들의 word embedding을 확인합니다.

In [27]:
# Print the learned CBOW embedding vector of every test word.
for word in test_words:

    # Single-element id tensor, so the embedding lookup returns shape (1, dim).
    input_id = torch.LongTensor([w2i[word]]).to(device)
    emb = cbow.embedding(input_id) # feed the id into the CBOW model's self.embedding

    print(f"Word: {word}")
    print(emb.squeeze(0)) # drop the leading size-1 dimension -> (dim)

Word: 음식
tensor([-5.5071e-01,  6.0320e-01,  9.1886e-01,  1.2881e+00, -1.0419e+00,
         6.2107e-01,  1.6309e-01, -2.4791e-01, -1.1016e+00,  2.1013e+00,
         5.7410e-01, -2.7454e-01, -1.1777e+00, -2.5627e-01, -5.5529e-01,
        -1.6289e+00,  2.7615e-02,  1.2069e+00,  1.8319e+00, -1.0153e+00,
        -1.5085e+00,  9.1160e-01,  1.5129e+00,  8.6093e-01, -2.0467e-02,
        -1.6544e-01, -1.0389e+00,  7.0622e-02, -9.7334e-01, -1.4998e-01,
         7.5131e-01,  5.4421e-01,  1.5134e+00, -1.7924e+00, -4.8499e-01,
        -2.9823e-02,  7.1889e-01,  9.6634e-01, -1.8919e+00, -1.1779e+00,
        -2.2674e-01,  2.0760e-01,  7.9796e-01, -1.5152e+00, -1.8259e+00,
         2.2202e-01,  5.9232e-01,  6.0727e-01, -1.6640e+00, -8.8632e-01,
        -2.1129e-01, -1.4111e+00, -4.7432e-03,  8.7209e-02, -9.4779e-01,
         3.9153e-01,  4.3499e-01, -3.3298e-02,  1.3447e+00,  1.0951e+00,
        -6.1913e-01, -3.0500e-02, -4.1859e-01, -2.2061e-01, -3.6907e-01,
         1.1116e+00,  1.3195e+00, -3.2619e

In [28]:
# Print the learned Skip-gram embedding vector of every test word.
for word in test_words:

    # Single-element id tensor, so the embedding lookup returns shape (1, dim).
    input_id = torch.LongTensor([w2i[word]]).to(device)
    emb = skipgram.embedding(input_id)

    print(f"Word: {word}")
    print(emb.squeeze(0)) # drop the leading size-1 dimension -> (dim)

Word: 음식
tensor([ 1.0847e+00, -1.8312e+00,  3.4982e-01, -1.2369e+00, -3.1424e-01,
         5.5184e-01,  6.0820e-01,  1.2232e-01, -1.4413e-01,  1.4185e-01,
        -3.5492e-01,  3.4940e-01,  8.2831e-01, -9.6391e-01, -4.4457e-01,
         8.2745e-01, -4.8808e-01, -1.0765e+00, -6.5935e-01, -6.3846e-01,
         7.0555e-01,  1.3240e+00, -5.1978e-01, -1.7075e-03, -4.5028e-01,
        -1.0363e+00,  1.4185e+00, -1.0740e+00,  1.2054e+00,  5.4588e-02,
        -3.6388e-01, -1.7409e+00, -4.9656e-01, -1.9145e+00, -1.2126e+00,
        -1.4036e+00, -5.2734e-01,  3.5698e-01, -7.3549e-01, -5.5043e-01,
        -1.0936e-01,  4.0335e-01,  1.3138e-01,  2.9741e-02,  1.8077e-01,
        -5.7740e-01, -2.7002e-01, -1.0702e+00, -1.0108e+00,  2.0020e+00,
        -1.1820e+00, -8.6610e-01,  4.5532e-01, -1.4719e+00,  6.2662e-01,
        -1.0821e+00,  1.1757e+00,  7.1022e-02, -3.9596e-01,  2.4023e-01,
        -1.1542e+00, -1.0567e+00, -1.7434e-01, -2.3581e-01, -7.7791e-01,
         2.3312e-02,  2.3060e-01,  9.6101e