In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import defaultdict


In [2]:
# Datos
text = "The black cat sat on the couch and the brown dog slept on the rug".lower().split()

In [3]:
print(text)

['the', 'black', 'cat', 'sat', 'on', 'the', 'couch', 'and', 'the', 'brown', 'dog', 'slept', 'on', 'the', 'rug']


In [4]:
# Crear vocabulario
vocab = set(text)
word2idx = {w: idx for idx, w in enumerate(vocab)}
idx2word = {idx: w for w, idx in word2idx.items()}
vocab_size = len(vocab)

In [5]:
print(word2idx)

{'and': 0, 'black': 1, 'on': 2, 'slept': 3, 'brown': 4, 'cat': 5, 'sat': 6, 'the': 7, 'rug': 8, 'couch': 9, 'dog': 10}


In [6]:
print(idx2word)

{0: 'and', 1: 'black', 2: 'on', 3: 'slept', 4: 'brown', 5: 'cat', 6: 'sat', 7: 'the', 8: 'rug', 9: 'couch', 10: 'dog'}


In [7]:
print(vocab_size)

11


In [8]:
# Crear datos de entrenamiento (skip-gram con ventana de contexto 2)
def generate_skipgram_data(text, window_size=2):
    pairs = []
    for i, target in enumerate(text):
        for j in range(-window_size, window_size + 1):
            if j == 0 or i + j < 0 or i + j >= len(text):
                continue
            context = text[i + j]
            pairs.append((target, context))
    return pairs

pairs = generate_skipgram_data(text)

In [9]:
print(pairs)

[('the', 'black'), ('the', 'cat'), ('black', 'the'), ('black', 'cat'), ('black', 'sat'), ('cat', 'the'), ('cat', 'black'), ('cat', 'sat'), ('cat', 'on'), ('sat', 'black'), ('sat', 'cat'), ('sat', 'on'), ('sat', 'the'), ('on', 'cat'), ('on', 'sat'), ('on', 'the'), ('on', 'couch'), ('the', 'sat'), ('the', 'on'), ('the', 'couch'), ('the', 'and'), ('couch', 'on'), ('couch', 'the'), ('couch', 'and'), ('couch', 'the'), ('and', 'the'), ('and', 'couch'), ('and', 'the'), ('and', 'brown'), ('the', 'couch'), ('the', 'and'), ('the', 'brown'), ('the', 'dog'), ('brown', 'and'), ('brown', 'the'), ('brown', 'dog'), ('brown', 'slept'), ('dog', 'the'), ('dog', 'brown'), ('dog', 'slept'), ('dog', 'on'), ('slept', 'brown'), ('slept', 'dog'), ('slept', 'on'), ('slept', 'the'), ('on', 'dog'), ('on', 'slept'), ('on', 'the'), ('on', 'rug'), ('the', 'slept'), ('the', 'on'), ('the', 'rug'), ('rug', 'on'), ('rug', 'the')]


In [10]:
# Convertir a índices
train_data = [(word2idx[target], word2idx[context]) for target, context in pairs]

In [11]:
print(train_data)

[(7, 1), (7, 5), (1, 7), (1, 5), (1, 6), (5, 7), (5, 1), (5, 6), (5, 2), (6, 1), (6, 5), (6, 2), (6, 7), (2, 5), (2, 6), (2, 7), (2, 9), (7, 6), (7, 2), (7, 9), (7, 0), (9, 2), (9, 7), (9, 0), (9, 7), (0, 7), (0, 9), (0, 7), (0, 4), (7, 9), (7, 0), (7, 4), (7, 10), (4, 0), (4, 7), (4, 10), (4, 3), (10, 7), (10, 4), (10, 3), (10, 2), (3, 4), (3, 10), (3, 2), (3, 7), (2, 10), (2, 3), (2, 7), (2, 8), (7, 3), (7, 2), (7, 8), (8, 2), (8, 7)]


In [None]:
# Modelo Skip-Gram
class SkipGramModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGramModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.output = nn.Linear(embedding_dim, vocab_size)
    
    def forward(self, input_word):
        embed = self.embeddings(input_word)
        out = self.output(embed)
        return out

embedding_dim = 15
model = SkipGramModel(vocab_size, embedding_dim)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [32]:
print(model)

SkipGramModel(
  (embeddings): Embedding(11, 15)
  (output): Linear(in_features=15, out_features=11, bias=True)
)


In [33]:
# Entrenamiento
for epoch in range(100):
    total_loss = 0
    for input_idx, target_idx in train_data:
        input_tensor = torch.tensor([input_idx], dtype=torch.long)
        target_tensor = torch.tensor([target_idx], dtype=torch.long)

        optimizer.zero_grad()
        output = model(input_tensor)
        loss = loss_fn(output, target_tensor)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {total_loss:.4f}")

Epoch 0, Loss: 142.2775
Epoch 10, Loss: 142.2472
Epoch 20, Loss: 142.2176
Epoch 30, Loss: 142.1879
Epoch 40, Loss: 142.1582
Epoch 50, Loss: 142.1286
Epoch 60, Loss: 142.0990
Epoch 70, Loss: 142.0694
Epoch 80, Loss: 142.0399
Epoch 90, Loss: 142.0104


In [22]:
# Obtener embeddings de "black" y "brown"
black_idx = word2idx["black"]
brown_idx = word2idx["brown"]

black_embedding = model.embeddings(torch.tensor([black_idx])).detach().numpy()
brown_embedding = model.embeddings(torch.tensor([brown_idx])).detach().numpy()

print(f"Embedding de 'black': {black_embedding}")
print(f"Embedding de 'brown': {brown_embedding}")

Embedding de 'black': [[ 0.6081965  -0.27331546 -1.2913392  -0.15249166  1.9993314   0.8663557
  -2.3744898   0.54114026 -1.1109648   2.085534    1.1988776   2.0892808
  -0.1214344   1.9393797   0.85767335]]
Embedding de 'brown': [[-0.2402617   2.252105    1.9834756   0.45905763 -0.96503824 -0.57575446
   2.0313282  -1.4046042   0.6116017   1.3756244  -1.3545616   1.3084308
   0.6670409  -1.4410648  -0.3797013 ]]


In [23]:
#  Guardar modelo entrenado
torch.save(model.state_dict(), f"skipgram_model_{embedding_dim}d.pth")
print(f"Modelo guardado como skipgram_model_{embedding_dim}d.pth")


# Cargar modelo guardado
loaded_model = SkipGramModel(vocab_size=vocab_size, embedding_dim=embedding_dim)
loaded_model.load_state_dict(torch.load(f"skipgram_model_{embedding_dim}d.pth"))
loaded_model.eval()
print(f"Modelo cargado desde skipgram_model_{embedding_dim}d.pth")

Modelo guardado como skipgram_model_15d.pth
Modelo cargado desde skipgram_model_15d.pth


In [24]:
# Obtener embeddings de "black" y "brown"
black_idx = word2idx["black"]
brown_idx = word2idx["brown"]

black_embedding = loaded_model.embeddings(torch.tensor([black_idx])).detach().numpy()
brown_embedding = loaded_model.embeddings(torch.tensor([brown_idx])).detach().numpy()

print(f"Embedding de 'black': {black_embedding}")
print(f"Embedding de 'brown': {brown_embedding}")

Embedding de 'black': [[ 0.6081965  -0.27331546 -1.2913392  -0.15249166  1.9993314   0.8663557
  -2.3744898   0.54114026 -1.1109648   2.085534    1.1988776   2.0892808
  -0.1214344   1.9393797   0.85767335]]
Embedding de 'brown': [[-0.2402617   2.252105    1.9834756   0.45905763 -0.96503824 -0.57575446
   2.0313282  -1.4046042   0.6116017   1.3756244  -1.3545616   1.3084308
   0.6670409  -1.4410648  -0.3797013 ]]
