# Training a Simple GAN Model for Sentence Embeddings

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
from torch.utils.tensorboard import SummaryWriter  # to print to tensorboard

MAX_LENGTH = 768


class Discriminator(nn.Module):
    def __init__(self, in_features):
        super().__init__()
        self.disc = nn.Sequential(
            nn.Linear(in_features, 128),
            nn.LeakyReLU(0.01),
            nn.Linear(128, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        return self.disc(x)

class Generator(nn.Module):
    def __init__(self, z_dim, emb_dim):
        super().__init__()
        self.gen = nn.Sequential(
            nn.Linear(z_dim, 256),
            nn.LeakyReLU(0.01),
            nn.Linear(256, emb_dim),
            nn.Tanh(),  # Assuming you want to normalize the outputs
        )

    def forward(self, x):
        return self.gen(x)

class SentenceEmbeddingsDataset(Dataset):
    def __init__(self, embeddings):
        self.embeddings = embeddings

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        return self.embeddings[idx]


# Hyperparameters etc.
device = "cuda" if torch.cuda.is_available() else "cpu"
lr = 3e-4
z_dim = 64
image_dim = 28 * 28 * 1  # 784
batch_size = 32
num_epochs = 50

disc = Discriminator(image_dim).to(device)
gen = Generator(z_dim, image_dim).to(device)
fixed_noise = torch.randn((batch_size, z_dim)).to(device)
transforms = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ]
)

# Assuming embeddings is a 768 x 768 tensor of sentence embeddings (768 examples of 768-dimensional sentence embeddings)
embeddings = torch.randn(768, 768)  # Placeholder for actual sentence embeddings
labels = torch.zeros(768, 1)  # Dummy labels, not used in training but required to create the dataset

dataset = TensorDataset(embeddings, labels)  # Create a dataset of embeddings
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Optimizers
opt_disc = optim.Adam(disc.parameters(), lr=lr)
opt_gen = optim.Adam(gen.parameters(), lr=lr)

# Loss function
criterion = nn.BCELoss()

# TensorBoard writers
writer_fake = SummaryWriter(f"logs/fake")
writer_real = SummaryWriter(f"logs/real")
step = 0

for epoch in range(num_epochs):
    for batch_idx, (real, _) in enumerate(loader):
        # Adjust for embeddings size
        real = real.to(device)
        batch_size = real.shape[0]

        ### Train Discriminator: max log(D(x)) + log(1 - D(G(z)))
        noise = torch.randn(batch_size, z_dim).to(device)
        fake = gen(noise)
        disc_real = disc(real).view(-1)
        lossD_real = criterion(disc_real, torch.ones_like(disc_real))
        disc_fake = disc(fake).view(-1)
        lossD_fake = criterion(disc_fake, torch.zeros_like(disc_fake))
        lossD = (lossD_real + lossD_fake) / 2
        disc.zero_grad()
        lossD.backward(retain_graph=True)
        opt_disc.step()

        ### Train Generator: min log(1 - D(G(z))) <-> max log(D(G(z))
        output = disc(fake).view(-1)
        lossG = criterion(output, torch.ones_like(output))
        gen.zero_grad()
        lossG.backward()
        opt_gen.step()

        if batch_idx == 0:
            print(
                f"Epoch [{epoch}/{num_epochs}] Batch {batch_idx}/{len(loader)} \
                      Loss D: {lossD:.4f}, loss G: {lossG:.4f}"
            )


## Tokenizing Ancient Greek Texts

In [7]:
from transformers import AutoTokenizer, AutoModel

# Initialize the tokenizer and model
aristoberto_tokenizer = AutoTokenizer.from_pretrained("Jacobo/aristoBERTo")
aristoberto_model = AutoModel.from_pretrained("Jacobo/aristoBERTo")

Some weights of BertModel were not initialized from the model checkpoint at Jacobo/aristoBERTo and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
sample_text = """ οἳ μὲν γὰρ Δρακάνῳ σ᾽, οἳ δ᾽ Ἰκάρῳ ἠνεμοέσσῃ
φάσ᾽, οἳ δ᾽ ἐν Νάξῳ, δῖον γένος, εἰραφιῶτα,
οἳ δέ σ᾽ ἐπ᾽ Ἀλφειῷ ποταμῷ βαθυδινήεντι
κυσαμένην Σεμέλην τεκέειν Διὶ τερπικεραύνῳ:
5ἄλλοι δ᾽ ἐν Θήβῃσιν, ἄναξ, σε λέγουσι γενέσθαι,
ψευδόμενοι: σὲ δ᾽ ἔτικτε πατὴρ ἀνδρῶν τε θεῶν τε
πολλὸν ἀπ᾽ ἀνθρώπων, κρύπτων λευκώλενον Ἥρην.
ἔστι δέ τις Νύση, ὕπατον ὄρος, ἀνθέον ὕλῃ,
τηλοῦ Φοινίκης, σχεδὸν Αἰγύπτοιο ῥοάων,
10... καί οἱ ἀναστήσουσιν ἀγάλματα πόλλ᾽ ἐνὶ νηοῖς.
ὣς δὲ τὰ μὲν τρία, σοὶ πάντως τριετηρίσιν αἰεὶ
ἄνθρωποι ῥέξουσι τεληέσσας ἑκατόμβας"""

inputs = aristoberto_tokenizer(sample_text, return_tensors="pt", padding=True, truncation=True, max_length = MAX_LENGTH)

# Generate embeddings
with torch.no_grad():
    outputs = aristoberto_model(**inputs)

# outputs.last_hidden_state will contain the token-level embeddings
# For sentence-level embeddings, you can average the token embeddings
sentence_embedding = outputs.last_hidden_state.mean(dim=1)

In [12]:
sentence_embedding.shape

torch.Size([1, 768])

In [13]:
sentence_embedding

tensor([[ 1.1584e-01, -4.3876e-01,  7.6951e-02,  2.2028e-01, -7.1418e-01,
          3.3325e-01,  1.9584e-01, -1.3508e-02,  1.1990e-01, -4.0512e-01,
         -4.2947e-01, -3.4709e-01, -4.0955e-02,  4.4730e-02, -8.5780e-02,
          2.2162e-01,  2.5557e-01, -6.5016e-01,  2.3086e-02,  3.0195e-01,
          4.6583e-01, -4.7642e-01, -1.0109e-01,  1.0951e-01, -2.7022e-01,
          3.3001e-01,  2.7636e-01, -1.5663e-01,  5.1859e-01, -2.0124e-01,
          5.7697e-01,  1.5916e-01,  4.5630e-01, -7.6024e-02,  2.4529e-01,
          4.8486e-01, -2.8936e-01,  3.5514e-01, -1.5313e-01,  2.5443e-01,
         -1.4930e-01, -3.8183e-01,  8.2846e-02,  2.7116e-01,  1.7931e-01,
          5.4954e-02, -1.2382e-01, -6.5090e-02, -3.3930e-01,  4.2462e-01,
          8.6823e-02, -2.3818e-01,  3.4314e-01,  1.3457e-02, -9.2304e-02,
          4.3891e-03, -1.6361e-01, -2.4262e-01,  3.0555e-01, -2.3122e-02,
          1.2961e-02, -8.5956e-02,  3.4778e-01,  8.8530e-02,  4.4103e-02,
         -2.1675e-01,  1.0899e-01, -2.

# Using Ancient-Greek BERT

In [16]:
tokeniser = AutoTokenizer.from_pretrained("pranaydeeps/Ancient-Greek-BERT")
model = AutoModel.from_pretrained("pranaydeeps/Ancient-Greek-BERT")

input_ids = tokeniser.encode('τοῦ βίου τοῦ καθ ΄ εαυτοὺς πολλὰ γίνεσθαι συγχωροῦν [MASK]')
tokens = tokeniser.convert_ids_to_tokens(input_ids)
idx = tokens.index("[MASK]")
print(idx, tokens)
outputs = model(torch.tensor([input_ids]))[0]
outputs.shape

13 ['[CLS]', 'του', 'βιου', 'του', 'καθ', '΄', 'εαυτους', 'πολλα', 'γινε', '##σθαι', 'συγχ', '##ωρου', '##ν', '[MASK]', '[SEP]']


torch.Size([1, 15, 768])