In [1]:
from tqdm import tqdm
from konlpy.tag import Okt
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict

import torch
import copy
import numpy as np

### 데이터 구성

In [2]:
# Toy training corpus: ten short Korean review sentences.
train_data = [
  "정말 맛있습니다. 추천합니다.",
  "기대했던 것보단 별로였네요.",
  "다 좋은데 가격이 너무 비싸서 다시 가고 싶다는 생각이 안 드네요.",
  "완전 최고입니다! 재방문 의사 있습니다.",
  "음식도 서비스도 다 만족스러웠습니다.",
  "위생 상태가 좀 별로였습니다. 좀 더 개선되기를 바랍니다.",
  "맛도 좋았고 직원분들 서비스도 너무 친절했습니다.",
  "기념일에 방문했는데 음식도 분위기도 서비스도 다 좋았습니다.",
  "전반적으로 음식이 너무 짰습니다. 저는 별로였네요.",
  "위생에 조금 더 신경 썼으면 좋겠습니다. 조금 불쾌했습니다."       
]

# Probe words whose learned embeddings are looked up and printed after training.
test_words = ["음식", "맛", "서비스", "위생", "가격"]

In [3]:
# Okt tagger from konlpy; make_tokenized() calls its morphs() method to split sentences into tokens.
tokenizer = Okt()

In [4]:
def make_tokenized(data):
    """Tokenize every sentence in ``data`` with the module-level ``tokenizer``.

    Args:
        data: Iterable of sentence strings.

    Returns:
        A list holding one token list per sentence, in input order. Each token
        list is the result of ``tokenizer.morphs(sentence, stem=True)``.
    """
    # tqdm only wraps the iteration to draw a progress bar.
    return [tokenizer.morphs(sentence, stem=True) for sentence in tqdm(data)]

In [5]:
# Tokenize the corpus once; the vocabulary and both datasets are built from these token lists.
train_tokenized = make_tokenized(train_data)

100%|██████████| 10/10 [00:02<00:00,  4.43it/s]


In [6]:
# Tally how often each token occurs across the whole tokenized corpus.
# defaultdict(int) starts every unseen token at 0.
word_count = defaultdict(int)

for sentence_tokens in tqdm(train_tokenized):
    for tok in sentence_tokens:
        word_count[tok] = word_count[tok] + 1

100%|██████████| 10/10 [00:00<00:00, 73326.99it/s]


In [7]:
# Turn the counts into a list of (token, count) pairs ordered by descending
# count. sorted() is stable, so tokens with equal counts keep their
# first-seen order.
# Going through dict() keeps this cell re-runnable: on the first run
# word_count is the defaultdict built by the counting cell, on any later run
# it is already the sorted list of pairs (which has no .items()); dict()
# accepts both and preserves their order.
word_count = sorted(dict(word_count).items(), key=lambda x: x[1], reverse=True)


In [8]:
# Build the token -> index vocabulary. word_count is ordered by descending
# frequency, so the most frequent token receives index 0.
w2i = {}
for word, _count in tqdm(word_count):
    # setdefault leaves an existing entry untouched, so a token keeps the
    # index it was given the first time it appeared.
    w2i.setdefault(word, len(w2i))
print(w2i)

100%|██████████| 60/60 [00:00<00:00, 2762.80it/s]

{'.': 0, '도': 1, '이다': 2, '좋다': 3, '별로': 4, '다': 5, '이': 6, '너무': 7, '음식': 8, '서비스': 9, '하다': 10, '방문': 11, '위생': 12, '좀': 13, '더': 14, '에': 15, '조금': 16, '정말': 17, '맛있다': 18, '추천': 19, '기대하다': 20, '것': 21, '보단': 22, '가격': 23, '비싸다': 24, '다시': 25, '가다': 26, '싶다': 27, '생각': 28, '안': 29, '드네': 30, '요': 31, '완전': 32, '최고': 33, '!': 34, '재': 35, '의사': 36, '있다': 37, '만족스럽다': 38, '상태': 39, '가': 40, '개선': 41, '되다': 42, '기르다': 43, '바라다': 44, '맛': 45, '직원': 46, '분들': 47, '친절하다': 48, '기념일': 49, '분위기': 50, '전반': 51, '적': 52, '으로': 53, '짜다': 54, '저': 55, '는': 56, '신경': 57, '써다': 58, '불쾌하다': 59}





In [42]:
class CBOWDataset(Dataset):
    """CBOW training pairs: surrounding context token ids -> centre token id.

    Token ids are looked up in the module-level ``w2i`` mapping. A position is
    used as a centre only when a full window of ``window_size`` tokens exists
    on both sides of it, so every sample carries exactly ``2 * window_size``
    context ids; positions too close to either sentence edge are skipped.

    Args:
        train_tokenized: Iterable of token lists, one per sentence.
        window_size: Number of context tokens taken from each side.
    """

    def __init__(self, train_tokenized, window_size = 2):
        contexts = []
        centers = []

        for sentence in tqdm(train_tokenized):
            ids = [w2i[tok] for tok in sentence]
            # First position that no longer has a full right-hand window.
            upper = len(ids) - window_size
            for pos, center_id in enumerate(ids):
                if pos < window_size or pos >= upper:
                    continue
                left = ids[pos - window_size:pos]
                right = ids[pos + 1:pos + window_size + 1]
                contexts.append(left + right)
                centers.append(center_id)

        self.x = torch.LongTensor(contexts)  # one row of context ids per sample
        self.y = torch.LongTensor(centers)   # centre id of each sample

    def __len__(self):
        """Number of (context, centre) samples."""
        return self.x.size(0)

    def __getitem__(self, idx):
        """Return the ``(context_ids, centre_id)`` pair stored at ``idx``."""
        return self.x[idx], self.y[idx]

In [43]:
class SkipGramDataset(Dataset):
    """SkipGram training pairs: centre token id -> one context token id.

    Token ids are looked up in the module-level ``w2i`` mapping. A position is
    used as a centre only when a full window of ``window_size`` tokens exists
    on both sides of it; each such centre contributes ``2 * window_size``
    pairs, one per context token.

    Args:
        train_tokenized: Iterable of token lists, one per sentence.
        window_size: Number of context tokens taken from each side.
    """

    def __init__(self, train_tokenized, window_size = 2 ):
        centers = []
        contexts = []

        for sentence in tqdm(train_tokenized):
            ids = [w2i[tok] for tok in sentence]
            # First position that no longer has a full right-hand window.
            upper = len(ids) - window_size
            for pos, center_id in enumerate(ids):
                if pos < window_size or pos >= upper:
                    continue
                neighbours = ids[pos - window_size:pos] + ids[pos + 1:pos + window_size + 1]
                contexts.extend(neighbours)
                # Repeat the centre id once per neighbour so x and y stay aligned.
                centers.extend([center_id] * 2 * window_size)

        self.x = torch.LongTensor(centers)   # centre id of each pair
        self.y = torch.LongTensor(contexts)  # context id of each pair

    def __len__(self):
        """Number of (centre, context) pairs."""
        return self.x.size(0)

    def __getitem__(self, idx):
        """Return the ``(centre_id, context_id)`` pair stored at ``idx``."""
        return self.x[idx], self.y[idx]
                    

In [44]:
# Build both datasets from the same tokenized corpus, using the default window_size of 2.
cbow_set = CBOWDataset(train_tokenized)
skipgram_set = SkipGramDataset(train_tokenized)

100%|██████████| 10/10 [00:00<00:00, 11134.34it/s]
100%|██████████| 10/10 [00:00<00:00, 56223.91it/s]


In [45]:
class CBOW(nn.Module):
    """Continuous bag-of-words model: summed context embeddings -> vocabulary logits.

    Args:
        vocab_size: Number of rows in the embedding table and number of output logits.
        dim: Embedding dimension.
    """

    def __init__(self, vocab_size, dim):
        super().__init__()
        # sparse=True makes the embedding weight receive sparse gradients.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=dim, sparse=True)
        self.linear = nn.Linear(in_features=dim, out_features=vocab_size)

    def forward(self, x):
        """Return vocabulary logits for a batch of context windows.

        ``x`` holds token ids; the training loop feeds it as (B, W). The W
        context embeddings are summed along dim 1 before the linear layer,
        which yields (B, vocab_size).
        """
        context_sum = self.embedding(x).sum(dim=1)
        return self.linear(context_sum)

In [46]:
class SkipGram(nn.Module):
    """SkipGram model: centre-word embedding -> vocabulary logits.

    Args:
        vocab_size: Number of rows in the embedding table and number of output logits.
        dim: Embedding dimension.
    """

    def __init__(self, vocab_size, dim):
        super().__init__()
        # sparse=True makes the embedding weight receive sparse gradients.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=dim, sparse=True)
        self.linear = nn.Linear(in_features=dim, out_features=vocab_size)

    def forward(self, x):
        """Return vocabulary logits for the token ids in ``x``.

        Each id is embedded and projected independently, so the output has
        the shape of ``x`` with a trailing ``vocab_size`` axis appended.
        """
        return self.linear(self.embedding(x))

### 모델 생성

In [47]:
# Seed PyTorch's RNG before the models are created so that the randomly
# initialised embedding and linear weights can be reproduced on a fresh
# Restart & Run All. The value 42 is arbitrary.
torch.manual_seed(42)

# Both models cover the full vocabulary and use 256-dimensional embeddings.
cbow = CBOW(vocab_size=len(w2i), dim=256)
skipgram = SkipGram(vocab_size=len(w2i), dim=256)

In [48]:
# Training hyper-parameters shared by the CBOW and SkipGram runs.
batch_size = 4
learning_rate = 5e-4
num_epochs = 5

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# No shuffle argument is passed, so both loaders yield batches in dataset order.
cbow_loader = DataLoader(dataset=cbow_set, batch_size=batch_size)
skipgram_loader = DataLoader(dataset=skipgram_set, batch_size=batch_size)

In [49]:
# Train CBOW with plain SGD, using cross-entropy over the vocabulary logits.
cbow = cbow.to(device)
cbow.train()
optim = torch.optim.SGD(cbow.parameters(), lr=learning_rate)
loss_function = nn.CrossEntropyLoss()

for e in range(1, num_epochs+1):
    print("#" * 50)
    print(f"Epoch: {e}")
    for batch in tqdm(cbow_loader):
        # x: (B, W) context ids, y: (B,) centre ids -- both moved to the training device.
        x, y = (t.to(device) for t in batch)

        # Clear gradients left over from the previous step before the new backward pass.
        optim.zero_grad()
        output = cbow(x)  # (B, V) logits
        loss = loss_function(output, y)
        loss.backward()
        optim.step()

        print(f"Train loss: {loss.item()}")
print("Finished")

100%|██████████| 16/16 [00:00<00:00, 199.79it/s]
100%|██████████| 16/16 [00:00<00:00, 1025.77it/s]
100%|██████████| 16/16 [00:00<00:00, 849.12it/s]
100%|██████████| 16/16 [00:00<00:00, 858.51it/s]
100%|██████████| 16/16 [00:00<00:00, 389.20it/s]

##################################################
Epoch: 1
Train loss: 4.06065559387207
Train loss: 3.2805368900299072
Train loss: 5.238283157348633
Train loss: 5.654989242553711
Train loss: 4.657295227050781
Train loss: 5.284740447998047
Train loss: 6.04366397857666
Train loss: 3.4834835529327393
Train loss: 4.7581586837768555
Train loss: 4.869666576385498
Train loss: 4.379453659057617
Train loss: 4.701166152954102
Train loss: 3.7360734939575195
Train loss: 5.297375679016113
Train loss: 4.67045783996582
Train loss: 4.100661277770996
##################################################
Epoch: 2
Train loss: 3.9094510078430176
Train loss: 3.1645917892456055
Train loss: 5.103442192077637
Train loss: 5.521780967712402
Train loss: 4.538938999176025
Train loss: 4.990396022796631
Train loss: 5.808369159698486
Train loss: 3.3620362281799316
Train loss: 4.640562057495117
Train loss: 4.712558269500732
Train loss: 4.196480751037598
Train loss: 4.322608470916748
Train loss: 3.595561981201172
Train 




In [None]:
# Train SkipGram with plain SGD, using cross-entropy over the vocabulary logits.
skipgram = skipgram.to(device)
skipgram.train()
optim = torch.optim.SGD(skipgram.parameters(), lr=learning_rate)
loss_function = nn.CrossEntropyLoss()

for e in range(1, num_epochs+1):
    print("#" * 50)
    print(f"Epoch: {e}")
    for batch in tqdm(skipgram_loader):
        # x: (B,) centre ids, y: (B,) context ids -- SkipGramDataset yields one
        # scalar id per side, so there is no window axis here.
        x, y = (t.to(device) for t in batch)

        # Clear gradients left over from the previous step before the new backward pass.
        optim.zero_grad()
        output = skipgram(x)  # (B, V) logits
        loss = loss_function(output, y)
        loss.backward()
        optim.step()

        print(f"Train loss: {loss.item()}")

print("Finished.")

In [None]:
# Print the CBOW embedding vector of every probe word.
for word in test_words:
    # The embedding layer is given a one-element id tensor, so emb is (1, dim).
    input_id = torch.LongTensor([w2i[word]]).to(device)
    emb = cbow.embedding(input_id)

    # squeeze(0) drops the leading axis so the bare vector is shown.
    print(f"Word: {word}", emb.squeeze(0), sep="\n")

In [None]:
# Print the SkipGram embedding vector of every probe word.
# The whole vector is printed (rather than only its largest element) so the
# output can be compared directly with the CBOW embeddings printed by the
# preceding cell.
for word in test_words:
    # The embedding layer is given a one-element id tensor, so emb is (1, dim).
    input_id = torch.LongTensor([w2i[word]]).to(device)
    emb = skipgram.embedding(input_id)

    print(f"Word: {word}")
    print(emb.squeeze(0))

In [79]:
# Scratch check: push one CBOW sample through a freshly initialised embedding layer.
# Index the dataset directly instead of materialising every sample with list().
x, y = cbow_set[2]
print(x)
print(y)
# Size the table from the vocabulary rather than hard-coding its length.
embedding = nn.Embedding(num_embeddings=len(w2i), embedding_dim=10, sparse=True)
a = embedding(x)
print(a.shape)
# x is a single unbatched sample here, so a is (W, 10) and dim=1 sums over the
# embedding axis, giving one value per context token. CBOW.forward receives
# batched (B, W) input, where dim=1 is the context axis instead.
torch.sum(a, dim = 1)

tensor([20, 21,  4,  2])
tensor(22)
torch.Size([4, 10])


tensor([ 1.5033, -0.6327,  1.9857,  1.4902], grad_fn=<SumBackward1>)

In [81]:
# Scratch check: push one SkipGram sample through a freshly initialised embedding layer.
# Index the dataset directly instead of materialising every sample with list().
x, y = skipgram_set[2]
print(x)
print(y)
# Size the table from the vocabulary rather than hard-coding its length.
embedding = nn.Embedding(num_embeddings=len(w2i), embedding_dim=10, sparse=True)
# x is a scalar id here, so the lookup returns a single 10-element vector.
a = embedding(x)
print(a.shape)

tensor(0)
tensor(19)
torch.Size([10])


In [75]:
# Sample counts: every CBOW sample expands into 2 * window_size SkipGram pairs.
len(cbow_set), len(skipgram_set)

(64, 256)

In [86]:
# Reshape the embedding vector `a` into a single column; -1 lets view() infer the row count.
a.view(-1,1)

tensor([[-0.9207],
        [ 0.0544],
        [ 1.0189],
        [-0.0885],
        [ 0.5778],
        [ 0.1303],
        [ 1.1415],
        [ 0.4251],
        [ 1.6866],
        [-0.8761]], grad_fn=<ViewBackward>)

In [87]:
# Display the embedding vector `a` itself.
a

tensor([-0.9207,  0.0544,  1.0189, -0.0885,  0.5778,  0.1303,  1.1415,  0.4251,
         1.6866, -0.8761], grad_fn=<EmbeddingBackward>)