# 1. Tokenization

In [1]:
vocab = [
    'king', 'queen', 'man', 'woman',
    'apple', 'orange', 'fruit',
    'dog', 'cat', 'animal',
]
# Map each word to its position in the vocabulary list.
word_to_idx = dict(zip(vocab, range(len(vocab))))

# Print every word next to its integer index
for word in vocab:
    idx = word_to_idx[word]
    print(f"{word:8}: {idx}")

king    : 0
queen   : 1
man     : 2
woman   : 3
apple   : 4
orange  : 5
fruit   : 6
dog     : 7
cat     : 8
animal  : 9


# 2. Embedding

In [2]:
import torch
import torch.nn as nn

# Seed PyTorch's global RNG before creating the embedding table. nn.Embedding
# initialises its weights randomly, so without a fixed seed the vectors printed
# here (and every similarity / loss value derived from them in later cells)
# would change on each Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

# One trainable row of `embedding_dim` floats per vocabulary word.
embedding_dim = 5
embedding = nn.Embedding(len(vocab), embedding_dim)

# Show the initial (untrained) vector assigned to each word.
for word, idx in word_to_idx.items():
    vector = embedding.weight[idx]
    print(f"{word:8} (index {idx}): {vector.tolist()}")

king     (index 0): [-0.9843325614929199, 1.1391586065292358, -0.9179290533065796, 0.7631993889808655, 0.1090579703450203]
queen    (index 1): [0.6273496747016907, 1.3188352584838867, -0.328174889087677, -0.24093422293663025, 0.7450929284095764]
man      (index 2): [1.5874661207199097, -0.984518826007843, -1.4914472103118896, 0.8793133497238159, -0.8942144513130188]
woman    (index 3): [-0.5352128148078918, -0.47506096959114075, -0.9547194242477417, 0.2387956827878952, -0.7360312938690186]
apple    (index 4): [0.6903787851333618, 1.5064266920089722, -1.8485966920852661, -1.1875098943710327, -0.8520223498344421]
orange   (index 5): [0.9441819190979004, 0.0930686667561531, -0.14445760846138, 1.0427806377410889, 0.7968577146530151]
fruit    (index 6): [2.117994785308838, -0.38488301634788513, 1.0496927499771118, 0.09504067152738571, -0.1869911551475525]
dog      (index 7): [0.6117108464241028, 0.8869150280952454, -0.6443786025047302, 1.357578992843628, -0.044430434703826904]
cat      (ind

# 3. Cosine Similarity

In [3]:
import torch.nn.functional as F

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two single vectors as a Python float.

    Args:
        vec1: Tensor holding one vector, either 1-D with shape ``(dim,)`` or a
            single-row batch with shape ``(1, dim)`` (what ``nn.Embedding``
            returns for a one-element index tensor).
        vec2: Tensor holding the other vector, in either of the same layouts.

    Returns:
        float: The cosine similarity of the two vectors.

    Raises:
        RuntimeError/ValueError (from ``Tensor.item()``): if the inputs form a
            batch with more than one row, because the result is then not a
            single scalar.
    """
    # F.cosine_similarity reduces over dim=1, which does not exist for a plain
    # 1-D vector; promote such inputs to a one-row batch so both layouts work.
    row1 = vec1.unsqueeze(0) if vec1.dim() == 1 else vec1
    row2 = vec2.unsqueeze(0) if vec2.dim() == 1 else vec2
    return F.cosine_similarity(row1, row2, dim=1).item()

In [4]:
import torch

# Pick the reference word and fetch its embedding as a one-row batch
base_word = 'king'
base_idx = torch.tensor([word_to_idx[base_word]], dtype=torch.long)
base_vec = embedding(base_idx)

# Compute the similarity between the reference word and every other word
print(f"\n'{base_word}'와 다른 단어들의 유사도:")
for word in vocab:
    if word != base_word:
        idx = torch.tensor([word_to_idx[word]], dtype=torch.long)
        vec = embedding(idx)
        sim = cosine_similarity(base_vec, vec)
        print(f"{word:8}: {sim:>7.4f}")


'king'와 다른 단어들의 유사도:
queen   :  0.3333
man     : -0.1427
woman   :  0.3522
apple   :  0.3128
orange  :  0.0614
fruit   : -0.7424
dog     :  0.5706
cat     :  0.6080
animal  :  0.1530


# 4. Training

In [5]:
import torch.optim as optim

# Training configuration
NUM_EPOCHS = 500
LOG_EVERY = 20        # print the summed loss every LOG_EVERY epochs
LEARNING_RATE = 0.01

optimizer = optim.SGD(embedding.parameters(), lr=LEARNING_RATE)
loss_fn = nn.CosineEmbeddingLoss()

# Toy training data: hand-picked word pairs that should be treated as similar
pairs = [
    ('king', 'queen'),
    ('king', 'man'),
    ('queen', 'woman'),
    ('man', 'woman'),
    ('apple', 'orange'),
    ('dog', 'cat'),
    ('fruit', 'apple'),
    ('fruit', 'orange'),
    ('animal', 'dog'),
    ('animal', 'cat')
]

# The index tensors and the target never change between epochs, so build them
# once here instead of on every inner-loop iteration.
pair_indices = [
    (
        torch.tensor([word_to_idx[w1]], dtype=torch.long),
        torch.tensor([word_to_idx[w2]], dtype=torch.long),
    )
    for w1, w2 in pairs
]
# A target of 1.0 marks a pair as similar.
# NOTE: every pair is trained with target 1.0; no dissimilar (target -1) pairs
# are supplied, so nothing in this loop explicitly pushes unrelated words apart.
target = torch.tensor([1.0])

for epoch in range(NUM_EPOCHS):
    total_loss = 0.0
    # One SGD step per pair (no batching), in the order listed in `pairs`.
    for idx1, idx2 in pair_indices:
        vec1 = embedding(idx1)
        vec2 = embedding(idx2)
        loss = loss_fn(vec1, vec2, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    if (epoch + 1) % LOG_EVERY == 0:
        print(f"Epoch {epoch+1:4}, Loss: {total_loss:.4f}")

Epoch   20, Loss: 6.6738
Epoch   40, Loss: 5.7269
Epoch   60, Loss: 4.8702
Epoch   80, Loss: 4.1160
Epoch  100, Loss: 3.4666
Epoch  120, Loss: 2.9158
Epoch  140, Loss: 2.4524
Epoch  160, Loss: 2.0638
Epoch  180, Loss: 1.7380
Epoch  200, Loss: 1.4645
Epoch  220, Loss: 1.2344
Epoch  240, Loss: 1.0406
Epoch  260, Loss: 0.8772
Epoch  280, Loss: 0.7394
Epoch  300, Loss: 0.6231
Epoch  320, Loss: 0.5250
Epoch  340, Loss: 0.4422
Epoch  360, Loss: 0.3724
Epoch  380, Loss: 0.3136
Epoch  400, Loss: 0.2640
Epoch  420, Loss: 0.2222
Epoch  440, Loss: 0.1870
Epoch  460, Loss: 0.1574
Epoch  480, Loss: 0.1325
Epoch  500, Loss: 0.1115


In [6]:
# Reference word whose neighbours are inspected again, now that training is done
base_word = 'king'
base_idx = torch.tensor([word_to_idx[base_word]], dtype=torch.long)
base_vec = embedding(base_idx)

# Similarity of the reference word to each remaining vocabulary word
print(f"\n'{base_word}'와 다른 단어들의 유사도:")
for word in vocab:
    is_reference = (word == base_word)
    if not is_reference:
        idx = torch.tensor([word_to_idx[word]], dtype=torch.long)
        vec = embedding(idx)
        sim = cosine_similarity(base_vec, vec)
        print(f"{word:8}: {sim:>7.4f}")


'king'와 다른 단어들의 유사도:
queen   :  0.9988
man     :  0.9706
woman   :  0.9947
apple   :  0.5313
orange  :  0.4586
fruit   :  0.3885
dog     :  0.7412
cat     :  0.7405
animal  :  0.7401
