# Training a Simple GAN Model for Sentence Embeddings

In [28]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.datasets as datasets
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as transforms
from torch.utils.tensorboard import SummaryWriter  # to print to tensorboard
import pandas as pd

MAX_LENGTH = 100
MAX_SEQUENCE_LENGTH = 100


class Discriminator(nn.Module):
    def __init__(self, in_features):
        super().__init__()
        self.disc = nn.Sequential(
            nn.Linear(in_features, 128),
            nn.LeakyReLU(0.01),
            nn.Linear(128, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        return self.disc(x)

class Generator(nn.Module):
    def __init__(self, z_dim, emb_dim):
        super().__init__()
        self.gen = nn.Sequential(
            nn.Linear(z_dim, 256),
            nn.LeakyReLU(0.01),
            nn.Linear(256, emb_dim),
            nn.Tanh(),  # Assuming you want to normalize the outputs
        )

    def forward(self, x):
        return self.gen(x)


class CustomDataset(Dataset):
    def __init__(self, embeddings):
        self.embeddings = embeddings

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        return self.embeddings[idx]



# Hyperparameters etc.
device = "cuda" if torch.cuda.is_available() else "cpu"
lr = 3e-4
z_dim = 64
embed_dim = MAX_LENGTH  # 784
batch_size = 32
num_epochs = 50

disc = Discriminator(embed_dim).to(device)
gen = Generator(z_dim, embed_dim).to(device)
fixed_noise = torch.randn((batch_size, z_dim)).to(device)
transforms = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ]
)


# Preprocessing Data

In [2]:
from transformers import AutoTokenizer, TFAutoModel
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

model_names = ["Jacobo/aristoBERTo", "pranaydeeps/Ancient-Greek-BERT"]

max_length = MAX_SEQUENCE_LENGTH
model_name = model_names[1]

def get_cls_token(sentence):
    
    # Initialize tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained("pranaydeeps/Ancient-Greek-BERT")
    model = TFAutoModel.from_pretrained("pranaydeeps/Ancient-Greek-BERT")

    model.trainable = True

    # Tokenize the input sentence and prepare input tensors
    inputs = tokenizer(sentence, 
                       max_length=max_length,
                       truncation=True,
                       padding='max_length',
                       return_tensors="tf")  # Ensure to use "tf" for TensorFlow models

    bert_inputs = {'input_ids': inputs.input_ids,
                   'token_type_ids': inputs.token_type_ids,
                   'attention_mask': inputs.attention_mask}

    # Pass the inputs directly to the model
    outputs = model(bert_inputs)

    cls_token = outputs[0][:, 0, :]

    hidden = cls_token

    #Apply Dense Layer to bring down CLS token to max length tokens
    hidden = tf.keras.layers.Dense(max_length, activation='relu', name='hidden_layer')(cls_token)

    return hidden


In [17]:
#Limit number of rows for experimentation
num_rows = 2

df = pd.read_csv('ancient_greek.csv')[:num_rows]
df['encoded_author'] = le.fit_transform(df['author'])
df

Unnamed: 0.1,Unnamed: 0,author,title,line_idx,line_txt,encoded_author
0,0,Homer,Iliad,1,μῆνιν ἄειδε θεὰ Πηληϊάδεω Ἀχιλῆος,0
1,1,Homer,Iliad,2,"οὐλομένην, ἣ μυρίʼ Ἀχαιοῖς ἄλγεʼ ἔθηκε,",0


In [18]:
df['agc_embeddings'] = df['line_txt'].apply(get_cls_token)
df

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['embeddings.position_ids']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['embeddings.position_ids']
- This IS expected if you are initializing TFBertModel from a PyTorch model trai

Unnamed: 0.1,Unnamed: 0,author,title,line_idx,line_txt,encoded_author,agc_embeddings
0,0,Homer,Iliad,1,μῆνιν ἄειδε θεὰ Πηληϊάδεω Ἀχιλῆος,0,"((tf.Tensor(0.521089, shape=(), dtype=float32)..."
1,1,Homer,Iliad,2,"οὐλομένην, ἣ μυρίʼ Ἀχαιοῖς ἄλγεʼ ἔθηκε,",0,"((tf.Tensor(0.0, shape=(), dtype=float32), tf...."


In [22]:
# Load the text lines
texts = df['line_txt'].tolist()

#Wrapper function to turn texts to lists of tensors
def texts_to_embeddings(texts):

    return [get_cls_token(_) for _ in texts]

embeddings = texts_to_embeddings(texts)

#Turn EagerTensors list to Normal Tensors list
embeddings_pytorch = [torch.tensor(e.numpy()) for e in embeddings]

# Convert list of tensors to a single tensor
embeddings_tensor = torch.stack(embeddings_pytorch).squeeze(1)  # Adjust dimensions as needed

embeddings_tensor

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['embeddings.position_ids']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['embeddings.position_ids']
- This IS expected if you are initializing TFBertModel from a PyTorch model trai

torch.Size([2, 100])

In [11]:
#Will change this to include the actual dataset when it's done tokenizing
embeddings = torch.randn(768, 768)  # Placeholder for actual sentence embeddings

# Instantiate the custom dataset
dataset = CustomDataset(embeddings, labels)

In [25]:
#Initialize Dataset
embeddings = embeddings_tensor 

# Instantiate the custom dataset
dataset = CustomDataset(embeddings)

# Training the Actual GAN model

In [31]:
# Now, create the DataLoader using the dataset
batch_size = 64  # Or any other batch size you wish to use
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Optimizers
opt_disc = optim.Adam(disc.parameters(), lr=lr)
opt_gen = optim.Adam(gen.parameters(), lr=lr)

# Loss function
criterion = nn.BCELoss()

# TensorBoard writers
writer_fake = SummaryWriter(f"logs/fake")
writer_real = SummaryWriter(f"logs/real")
step = 0

# Assuming the generator (gen), discriminator (disc), and their optimizers (opt_gen, opt_disc) are defined
# Also assuming a loss function (criterion) is defined
# z_dim is the dimensionality of the latent space (noise vector)

num_epochs = 10  # Number of epochs to train for

for epoch in range(num_epochs):
    for batch_idx, real_embeddings in enumerate(loader):
        batch_size = real_embeddings[0].size(0)
        real_embeddings = real_embeddings[0].to(device)

        # Train Discriminator
        # Generate fake embeddings
        noise = torch.randn(batch_size, z_dim, device=device)
        fake_embeddings = gen(noise)

        # Get discriminator predictions on real and fake data
        disc_real = disc(real_embeddings).view(-1)
        disc_fake = disc(fake_embeddings.detach()).view(-1)

        # Calculate loss on real and fake
        lossD_real = criterion(disc_real, torch.ones_like(disc_real))
        lossD_fake = criterion(disc_fake, torch.zeros_like(disc_fake))
        lossD = (lossD_real + lossD_fake) / 2

        # Update discriminator
        opt_disc.zero_grad()
        lossD.backward()
        opt_disc.step()

        # Train Generator
        # Generate fake embeddings
        output = disc(fake_embeddings).view(-1)
        lossG = criterion(output, torch.ones_like(output))

        # Update generator
        opt_gen.zero_grad()
        lossG.backward()
        opt_gen.step()

        # Optional: Print out loss values or save models/checkpoints here

       
        print(f"Epoch [{epoch}/{num_epochs}] Batch {batch_idx}/{len(loader)} \
              Loss D: {lossD:.4f}, loss G: {lossG:.4f}")


Epoch [0/10] Batch 0/1               Loss D: 0.6856, loss G: 0.6371
Epoch [1/10] Batch 0/1               Loss D: 0.6744, loss G: 0.6371
Epoch [2/10] Batch 0/1               Loss D: 0.6662, loss G: 0.6320
Epoch [3/10] Batch 0/1               Loss D: 0.6587, loss G: 0.6270
Epoch [4/10] Batch 0/1               Loss D: 0.7147, loss G: 0.6209
Epoch [5/10] Batch 0/1               Loss D: 0.6481, loss G: 0.6122
Epoch [6/10] Batch 0/1               Loss D: 0.7076, loss G: 0.6111
Epoch [7/10] Batch 0/1               Loss D: 0.7027, loss G: 0.6075
Epoch [8/10] Batch 0/1               Loss D: 0.6337, loss G: 0.6010
Epoch [9/10] Batch 0/1               Loss D: 0.6937, loss G: 0.5985


# Generating a Fake CLS Token

In [34]:
#Generating a single sample
noise = torch.randn(1, z_dim)

with torch.no_grad():  # We don't need to track gradients for generation
    fake_data = gen(noise)  # For generating a single sample
    
fake_data.shape

torch.Size([1, 100])

## Tokenizing Ancient Greek Texts

In [None]:
from transformers import AutoTokenizer, AutoModel

# Initialize the tokenizer and model
aristoberto_tokenizer = AutoTokenizer.from_pretrained("Jacobo/aristoBERTo")
aristoberto_model = AutoModel.from_pretrained("Jacobo/aristoBERTo")

In [10]:
sample_text = """ οἳ μὲν γὰρ Δρακάνῳ σ᾽, οἳ δ᾽ Ἰκάρῳ ἠνεμοέσσῃ
φάσ᾽, οἳ δ᾽ ἐν Νάξῳ, δῖον γένος, εἰραφιῶτα,
οἳ δέ σ᾽ ἐπ᾽ Ἀλφειῷ ποταμῷ βαθυδινήεντι
κυσαμένην Σεμέλην τεκέειν Διὶ τερπικεραύνῳ:
5ἄλλοι δ᾽ ἐν Θήβῃσιν, ἄναξ, σε λέγουσι γενέσθαι,
ψευδόμενοι: σὲ δ᾽ ἔτικτε πατὴρ ἀνδρῶν τε θεῶν τε
πολλὸν ἀπ᾽ ἀνθρώπων, κρύπτων λευκώλενον Ἥρην.
ἔστι δέ τις Νύση, ὕπατον ὄρος, ἀνθέον ὕλῃ,
τηλοῦ Φοινίκης, σχεδὸν Αἰγύπτοιο ῥοάων,
10... καί οἱ ἀναστήσουσιν ἀγάλματα πόλλ᾽ ἐνὶ νηοῖς.
ὣς δὲ τὰ μὲν τρία, σοὶ πάντως τριετηρίσιν αἰεὶ
ἄνθρωποι ῥέξουσι τεληέσσας ἑκατόμβας"""

inputs = aristoberto_tokenizer(sample_text, return_tensors="pt", padding=True, truncation=True, max_length = MAX_LENGTH)

# Generate embeddings
with torch.no_grad():
    outputs = aristoberto_model(**inputs)

# outputs.last_hidden_state will contain the token-level embeddings
# For sentence-level embeddings, you can average the token embeddings
sentence_embedding = outputs.last_hidden_state.mean(dim=1)

In [12]:
sentence_embedding.shape

torch.Size([1, 768])

In [13]:
sentence_embedding

tensor([[ 1.1584e-01, -4.3876e-01,  7.6951e-02,  2.2028e-01, -7.1418e-01,
          3.3325e-01,  1.9584e-01, -1.3508e-02,  1.1990e-01, -4.0512e-01,
         -4.2947e-01, -3.4709e-01, -4.0955e-02,  4.4730e-02, -8.5780e-02,
          2.2162e-01,  2.5557e-01, -6.5016e-01,  2.3086e-02,  3.0195e-01,
          4.6583e-01, -4.7642e-01, -1.0109e-01,  1.0951e-01, -2.7022e-01,
          3.3001e-01,  2.7636e-01, -1.5663e-01,  5.1859e-01, -2.0124e-01,
          5.7697e-01,  1.5916e-01,  4.5630e-01, -7.6024e-02,  2.4529e-01,
          4.8486e-01, -2.8936e-01,  3.5514e-01, -1.5313e-01,  2.5443e-01,
         -1.4930e-01, -3.8183e-01,  8.2846e-02,  2.7116e-01,  1.7931e-01,
          5.4954e-02, -1.2382e-01, -6.5090e-02, -3.3930e-01,  4.2462e-01,
          8.6823e-02, -2.3818e-01,  3.4314e-01,  1.3457e-02, -9.2304e-02,
          4.3891e-03, -1.6361e-01, -2.4262e-01,  3.0555e-01, -2.3122e-02,
          1.2961e-02, -8.5956e-02,  3.4778e-01,  8.8530e-02,  4.4103e-02,
         -2.1675e-01,  1.0899e-01, -2.

# Using Ancient-Greek BERT

In [16]:
tokeniser = AutoTokenizer.from_pretrained("pranaydeeps/Ancient-Greek-BERT")
model = AutoModel.from_pretrained("pranaydeeps/Ancient-Greek-BERT")

input_ids = tokeniser.encode('τοῦ βίου τοῦ καθ ΄ εαυτοὺς πολλὰ γίνεσθαι συγχωροῦν [MASK]')
tokens = tokeniser.convert_ids_to_tokens(input_ids)
idx = tokens.index("[MASK]")
print(idx, tokens)
outputs = model(torch.tensor([input_ids]))[0]
outputs.shape

13 ['[CLS]', 'του', 'βιου', 'του', 'καθ', '΄', 'εαυτους', 'πολλα', 'γινε', '##σθαι', 'συγχ', '##ωρου', '##ν', '[MASK]', '[SEP]']


torch.Size([1, 15, 768])