In [1]:
import torch
import torch.nn as nn
import os
import torch.optim as optim
import sys
import os
# Locate the project root: start from this file's directory (or from the
# working directory when __file__ is not defined) and climb towards the
# filesystem root until a directory containing 'nmt_cmr_parallels' is found.
if "__file__" in locals():
    current_dir = os.path.dirname(os.path.abspath(__file__))
else:
    current_dir = os.getcwd()

root_dir = current_dir
while True:
    if os.path.isdir(os.path.join(root_dir, "nmt_cmr_parallels")):
        break
    parent_dir = os.path.dirname(root_dir)
    # dirname() of the topmost directory returns itself: nowhere left to climb.
    if parent_dir == root_dir:
        raise FileNotFoundError("Could not find 'nmt_cmr_parallels' folder in any parent directories")
    root_dir = parent_dir

# Add project root to sys.path so the nmt_cmr_parallels imports below resolve.
if root_dir not in sys.path:
    sys.path.append(root_dir)
    
from nmt_cmr_parallels.data.word_vectors import load_pretrained_inverse_embedding
from torch.utils.data import DataLoader, TensorDataset

from nmt_cmr_parallels.data.sequence_data import load_cached_vocabulary, create_pretrained_semantic_embedding
# Per-user data directory: '.seq_nlp_data' under the home directory, with '~'
# expanded. Passed to create_pretrained_semantic_embedding below.
DEFAULT_DATA_PATH = os.path.expanduser(os.path.join('~', '.seq_nlp_data'))

In [13]:
# Pretrained word vectors, loaded via DEFAULT_DATA_PATH. The 50 is presumably
# the vector dimension (it matches the width of the nn.Embedding built from
# these vectors) -- TODO confirm against create_pretrained_semantic_embedding.
pretrained_embedding = create_pretrained_semantic_embedding(DEFAULT_DATA_PATH, 50)

# Vocabulary: the cached word list, lowercased, with '<null>' prepended and
# the '<SoS>' / '<EoS>' markers appended.
vocab = [
    '<null>',
    *(token.lower() for token in load_cached_vocabulary("peers_vocab.json")),
    '<SoS>',
    '<EoS>',
]

In [14]:
# Embedding table with one 50-dimensional row per vocabulary entry.
embedding_layer = nn.Embedding(len(vocab), 50)

# Initialize the weights of the Embedding layer with the GloVe vectors.
# Words that have no entry in pretrained_embedding keep the default
# initialisation nn.Embedding gave their row.
word_to_index = {word: idx for idx, word in enumerate(vocab)}
# no_grad() permits writing into the parameter in place, so the legacy
# `.data` attribute is not needed.
with torch.no_grad():
    for word, index in word_to_index.items():
        if word in pretrained_embedding:
            # as_tensor converts lists/arrays and passes tensors through
            # without the extra copy (and warning) torch.tensor would incur.
            embedding_layer.weight[index] = torch.as_tensor(
                pretrained_embedding[word], dtype=torch.float32
            )

# Freeze the embedding layer: none of its parameters receive gradients.
embedding_layer.requires_grad_(False)

In [15]:
# Step 1: Data Preparation
def create_data(embeddings, vocab_size):
    """Build the (input, target) tensors for the inverse-embedding task.

    Args:
        embeddings: Module exposing a ``weight`` tensor (e.g. ``nn.Embedding``).
        vocab_size: Side length of the identity matrix used as targets.

    Returns:
        Tuple ``(x, y)`` where ``x`` is a gradient-free copy of
        ``embeddings.weight`` and ``y`` is ``torch.eye(vocab_size)``, so row
        ``i`` of ``y`` is the one-hot target for index ``i``.
    """
    # clone() gives the caller its own storage, independent of the layer.
    return embeddings.weight.detach().clone(), torch.eye(vocab_size)

# One sample per vocabulary entry: x holds the embedding rows, y the matching
# one-hot rows returned by create_data.
vocab_size = len(vocab)
x, y = create_data(embeddings=embedding_layer, vocab_size=vocab_size)

# Pair inputs with targets and serve them in shuffled mini-batches of 32.
dataset = TensorDataset(x, y)
dataloader = DataLoader(dataset=dataset, batch_size=32, shuffle=True)

# Step 2: MLP Model Definition
class InverseEmbeddingMLP(nn.Module):
    """Two-layer MLP mapping an embedding vector to one score per vocabulary entry.

    Architecture: ``Linear(embedding_dim, hidden_dim) -> ReLU ->
    Linear(hidden_dim, vocab_size)``. The output is the raw result of the
    final linear layer; no softmax is applied.

    Args:
        embedding_dim: Size of the input embedding vectors.
        vocab_size: Number of output scores (one per vocabulary entry).
        hidden_dim: Width of the hidden layer. Defaults to 512, the value
            that was previously hard-coded, so existing callers and saved
            weights are unaffected.
    """

    def __init__(self, embedding_dim, vocab_size, hidden_dim=512):
        super().__init__()
        # The attribute names fc1 / relu / fc2 determine the state_dict keys;
        # they are kept unchanged so previously saved weights still load.
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Return the vocabulary scores for ``x``.

        Args:
            x: Tensor whose last dimension has size ``embedding_dim``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size ``vocab_size``.
        """
        return self.fc2(self.relu(self.fc1(x)))

# Build the MLP with the embedding layer's width as input size and one output
# per vocabulary entry, then pass it to load_pretrained_inverse_embedding;
# the value that call returns replaces the freshly built model.
inverse_embedding = InverseEmbeddingMLP(
    embedding_dim=embedding_layer.embedding_dim,
    vocab_size=vocab_size,
)
inverse_embedding = load_pretrained_inverse_embedding(inverse_embedding, seq_tokens=True)

In [17]:
# Evaluation: the pretrained vector of every regular vocabulary word must be
# mapped back to that word's own index by the inverse-embedding model.
special_tokens = {'<SoS>', '<EoS>', '<null>'}
mismatches = []  # (word, expected index, predicted index)

# Inference only, so skip autograd bookkeeping.
with torch.no_grad():
    for word in vocab:
        if word in special_tokens:
            continue
        word_index = word_to_index[word]
        # Cast to float32 explicitly, as is done when the embedding layer is
        # filled, so a float64 source vector cannot clash with the model's
        # float32 weights.
        embedded_word = torch.as_tensor(pretrained_embedding[word], dtype=torch.float32)
        retrieved = inverse_embedding(embedded_word)
        predicted_index = torch.argmax(retrieved, dim=0).item()
        if predicted_index != word_index:
            mismatches.append((word, word_index, predicted_index))

# Report every failing word at once instead of stopping at the first one.
assert not mismatches, (
    "{} word(s) were not recovered by the inverse embedding "
    "(word, expected index, predicted index); first few: {}".format(
        len(mismatches), mismatches[:10]
    )
)