In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import random

In [None]:
# Constants
# Every dataset/vocabulary file lives in one directory; change DATA_DIR to
# relocate them instead of editing eight separate paths.
DATA_DIR = "/content/drive/MyDrive/IIITH/COGS-main/output_path"
TRAIN_SOURCE_FILE = f"{DATA_DIR}/train_source.txt"
TRAIN_TARGET_FILE = f"{DATA_DIR}/train_target.txt"
DEV_SOURCE_FILE = f"{DATA_DIR}/dev_source.txt"
DEV_TARGET_FILE = f"{DATA_DIR}/dev_target.txt"
GEN_SOURCE_FILE = f"{DATA_DIR}/gen_source.txt"
GEN_TARGET_FILE = f"{DATA_DIR}/gen_target.txt"
SOURCE_VOCAB_FILE = f"{DATA_DIR}/source_vocab.txt"
TARGET_VOCAB_FILE = f"{DATA_DIR}/target_vocab.txt"

In [None]:
# Training hyperparameters
BATCH_SIZE = 32  # sequences per mini-batch (passed to every DataLoader)
LEARNING_RATE = 0.001  # learning rate handed to the Adam optimizer
NUM_EPOCHS = 10  # number of passes over the training loader

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
class Seq2SeqDataset(Dataset):
    """Parallel source/target corpus of whitespace-tokenised lines.

    Line ``i`` of ``source_file`` is paired with line ``i`` of ``target_file``.
    Each item is a pair of ``torch.long`` tensors of length ``max_seq_length``:
    token indices, truncated or right-padded with the ``'<pad>'`` index.

    Vocabulary files hold one token per line; a token's index is its line
    number (0-based). Tokens missing from a vocabulary map to ``'<unk>'``,
    which therefore must be present in the vocabulary file if out-of-vocabulary
    tokens can occur.

    Raises:
        ValueError: if the source and target files have different line counts.
    """

    def __init__(self, source_file, target_file, source_vocab_file, target_vocab_file):
        self.source_data = self.load_data(source_file)
        self.target_data = self.load_data(target_file)
        if len(self.source_data) != len(self.target_data):
            raise ValueError(
                f"source/target line counts differ: "
                f"{len(self.source_data)} vs {len(self.target_data)}"
            )
        self.source_vocab = self.load_vocab(source_vocab_file)
        self.target_vocab = self.load_vocab(target_vocab_file)
        self.max_seq_length = 100  # Example value, adjust based on your data

        # process_sequence pads BOTH sides with vocab['<pad>'], so both
        # vocabularies need the token (previously only the target got it and
        # padding a source sequence could raise KeyError).
        for vocab in (self.source_vocab, self.target_vocab):
            if '<pad>' not in vocab:
                vocab['<pad>'] = len(vocab)

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.source_data)

    def __getitem__(self, idx):
        """Return ``(source_ids, target_ids)`` for pair ``idx``."""
        source_seq = self.process_sequence(self.source_data[idx], self.source_vocab)
        target_seq = self.process_sequence(self.target_data[idx], self.target_vocab)
        return source_seq, target_seq

    def load_data(self, file_path):
        """Read ``file_path`` and return one token list per line."""
        with open(file_path, 'r', encoding='utf-8') as file:
            return [line.strip().split() for line in file]

    def load_vocab(self, vocab_file):
        """Read ``vocab_file`` and return a ``{token: line_index}`` dict."""
        with open(vocab_file, 'r', encoding='utf-8') as file:
            return {token.strip(): idx for idx, token in enumerate(file)}

    def process_sequence(self, sequence, vocab):
        """Map tokens to indices, then truncate/right-pad to ``max_seq_length``."""
        indexed_seq = [vocab[token] if token in vocab else vocab['<unk>'] for token in sequence]
        indexed_seq = indexed_seq[:self.max_seq_length]
        # A non-positive pad count multiplies the list by <= 0, i.e. adds nothing.
        indexed_seq += [vocab['<pad>']] * (self.max_seq_length - len(indexed_seq))
        return torch.tensor(indexed_seq, dtype=torch.long)

In [None]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to a sequence-first tensor.

    Args:
        d_model: feature size of the input (odd sizes are supported).
        max_len: largest sequence length that can be encoded.
        dropout: dropout probability applied after adding the encoding.

    ``forward`` indexes positions along dim 0, i.e. it expects input laid out
    as ``(seq_len, batch, d_model)``.
    """

    def __init__(self, d_model, max_len=5000, dropout=0.1):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one cosine column fewer than sine
        # columns, so trim div_term to avoid a shape mismatch.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Shape (max_len, 1, d_model): broadcasts over the batch dimension.
        pe = pe.unsqueeze(1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:seq_len])`` for ``x`` of shape (seq_len, batch, d_model).

        Raises:
            ValueError: if ``x`` is longer than ``max_len``.
        """
        seq_len = x.size(0)
        if seq_len > self.pe.size(0):
            raise ValueError(
                f"sequence length {seq_len} exceeds positional-encoding max_len {self.pe.size(0)}"
            )
        x = x + self.pe[:seq_len, :]
        return self.dropout(x)

In [None]:
class UniversalTransformer(nn.Module):
    """Encoder-only tagger: embeds source ids, encodes them, and projects every
    position to ``output_vocab_size`` logits.

    Input is batch-first, ``(batch, seq_len)`` token ids, which is what a
    ``DataLoader`` yields. Output is ``(batch, seq_len, output_vocab_size)``,
    so the output length always equals the input length.
    """

    def __init__(self, input_vocab_size, output_vocab_size, d_model=512, num_heads=4, num_layers=2):
        super(UniversalTransformer, self).__init__()
        self.embedding = nn.Embedding(input_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model)

        # Transformer Encoder Layers
        # batch_first=True: nn.TransformerEncoderLayer defaults to sequence-first
        # input; without it the batch axis was treated as the sequence axis and
        # attention mixed unrelated examples instead of tokens.
        encoder_layers = nn.TransformerEncoderLayer(d_model, num_heads, dim_feedforward=512, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)

        self.fc = nn.Linear(d_model, output_vocab_size)

    def forward(self, src):
        """Map ``(batch, seq_len)`` ids to ``(batch, seq_len, output_vocab_size)`` logits."""
        src = self.embedding(src)  # (batch, seq_len, d_model)
        # PositionalEncoding indexes positions along dim 0, so present the
        # tensor sequence-first to it and switch back afterwards.
        src = self.positional_encoding(src.transpose(0, 1)).transpose(0, 1)
        output = self.transformer_encoder(src)
        output = self.fc(output)
        return output

In [None]:
def train_epoch(model, optimizer, criterion, train_loader, device):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: maps a ``(batch, seq)`` id tensor to ``(batch, seq, vocab)`` logits.
        optimizer: optimizer bound to ``model``'s parameters.
        criterion: loss taking ``(N, vocab)`` logits and ``(N,)`` targets. If it
            has an ``ignore_index`` attribute, targets equal to it are excluded
            from the accuracy as well.
        train_loader: iterable of ``(src, tgt)`` tensor batches.
        device: device the batches are moved to.

    Returns:
        ``(mean_batch_loss, token_accuracy)`` where accuracy is in ``[0, 1]``
        and is computed over non-ignored target tokens only.
    """
    model.train()
    ignore_index = getattr(criterion, 'ignore_index', -100)
    total_loss = 0.0
    total_correct = 0
    total_tokens = 0
    num_batches = 0

    for src, tgt in train_loader:
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()

        output = model(src)
        # reshape (not view) so non-contiguous model outputs are accepted too
        loss = criterion(output.reshape(-1, output.shape[-1]), tgt.reshape(-1))
        loss.backward()
        optimizer.step()

        # criterion already averages within the batch, so average over batches.
        total_loss += loss.item()
        num_batches += 1

        # Score only real tokens; padding would otherwise inflate the count.
        scored = tgt != ignore_index
        total_correct += ((output.argmax(-1) == tgt) & scored).sum().item()
        total_tokens += scored.sum().item()

    return total_loss / max(num_batches, 1), total_correct / max(total_tokens, 1)


In [None]:
def evaluate(model, criterion, data_loader, device):
    """Evaluate ``model`` on ``data_loader`` without updating parameters.

    Args:
        model: maps a ``(batch, seq)`` id tensor to ``(batch, seq, vocab)`` logits.
        criterion: loss taking ``(N, vocab)`` logits and ``(N,)`` targets. If it
            has an ``ignore_index`` attribute, targets equal to it are excluded
            from the accuracy as well.
        data_loader: iterable of ``(src, tgt)`` tensor batches.
        device: device the batches are moved to.

    Returns:
        ``(mean_batch_loss, token_accuracy)`` where accuracy is in ``[0, 1]``
        and is computed over non-ignored target tokens only.
    """
    model.eval()
    ignore_index = getattr(criterion, 'ignore_index', -100)
    total_loss = 0.0
    total_correct = 0
    total_tokens = 0
    num_batches = 0

    with torch.no_grad():
        for src, tgt in data_loader:
            src, tgt = src.to(device), tgt.to(device)
            output = model(src)
            # reshape (not view) so non-contiguous model outputs are accepted too
            loss = criterion(output.reshape(-1, output.shape[-1]), tgt.reshape(-1))

            # criterion already averages within the batch, so average over batches.
            total_loss += loss.item()
            num_batches += 1

            # Score only real tokens; padding would otherwise inflate the count.
            scored = tgt != ignore_index
            total_correct += ((output.argmax(-1) == tgt) & scored).sum().item()
            total_tokens += scored.sum().item()

    return total_loss / max(num_batches, 1), total_correct / max(total_tokens, 1)

In [None]:
# Load datasets
# One dataset per split (train / dev / gen); all three share the same pair of
# vocabulary files.
train_dataset, dev_dataset, gen_dataset = [
    Seq2SeqDataset(source_path, target_path, SOURCE_VOCAB_FILE, TARGET_VOCAB_FILE)
    for source_path, target_path in (
        (TRAIN_SOURCE_FILE, TRAIN_TARGET_FILE),
        (DEV_SOURCE_FILE, DEV_TARGET_FILE),
        (GEN_SOURCE_FILE, GEN_TARGET_FILE),
    )
]

In [None]:
# Batch each split. Only the training loader shuffles; dev and gen are read
# in file order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False)
gen_loader = DataLoader(gen_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Vocabulary sizes are taken from the training dataset's token->index dicts;
# they size the model's embedding table and output projection.
source_vocab_size = len(train_dataset.source_vocab)
target_vocab_size = len(train_dataset.target_vocab)

In [None]:
# Model, optimizer and loss. The loss skips target positions holding the
# '<pad>' index, so padding does not contribute to the gradient.
model = UniversalTransformer(source_vocab_size, target_vocab_size).to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=train_dataset.target_vocab['<pad>'])



In [None]:
# Train for NUM_EPOCHS epochs, reporting train and dev metrics after each one.
for epoch in range(NUM_EPOCHS):
    train_loss, train_acc = train_epoch(model, optimizer, criterion, train_loader, DEVICE)
    dev_loss, dev_acc = evaluate(model, criterion, dev_loader, DEVICE)

    print(
        f"Epoch {epoch + 1}:",
        f"  Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}",
        f"  Dev Loss: {dev_loss:.4f} | Dev Acc: {dev_acc:.4f}",
        sep="\n",
    )

# Final pass over the gen split with the trained model.
gen_loss, gen_acc = evaluate(model, criterion, gen_loader, DEVICE)
print("Generated Data:", f"  Gen Loss: {gen_loss:.4f} | Gen Acc: {gen_acc:.4f}", sep="\n")


Epoch 1:
  Train Loss: 0.0956 | Train Acc: 8.4623
  Dev Loss: 0.0946 | Dev Acc: 8.8867
Epoch 2:
  Train Loss: 0.0943 | Train Acc: 8.7081
  Dev Loss: 0.0939 | Dev Acc: 9.1677
Epoch 3:
  Train Loss: 0.0946 | Train Acc: 8.6733
  Dev Loss: 0.0945 | Dev Acc: 8.9097
Epoch 4:
  Train Loss: 0.0947 | Train Acc: 8.6134
  Dev Loss: 0.0940 | Dev Acc: 8.8867
Epoch 5:
  Train Loss: 0.0941 | Train Acc: 8.7987
  Dev Loss: 0.0942 | Dev Acc: 9.0857
Epoch 6:
  Train Loss: 0.0941 | Train Acc: 8.8043
  Dev Loss: 0.0940 | Dev Acc: 8.8233


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import time

# Constants
# Every dataset/vocabulary file lives in one directory; change DATA_DIR to
# relocate them instead of editing eight separate paths.
DATA_DIR = "/content/drive/MyDrive/IIITH/COGS-main/output_path"
TRAIN_SOURCE_FILE = f"{DATA_DIR}/train_source.txt"
TRAIN_TARGET_FILE = f"{DATA_DIR}/train_target.txt"
DEV_SOURCE_FILE = f"{DATA_DIR}/dev_source.txt"
DEV_TARGET_FILE = f"{DATA_DIR}/dev_target.txt"
GEN_SOURCE_FILE = f"{DATA_DIR}/gen_source.txt"
GEN_TARGET_FILE = f"{DATA_DIR}/gen_target.txt"
SOURCE_VOCAB_FILE = f"{DATA_DIR}/source_vocab.txt"
TARGET_VOCAB_FILE = f"{DATA_DIR}/target_vocab.txt"


# Training hyperparameters
BATCH_SIZE = 64  # Increase batch size for faster training
LEARNING_RATE = 0.001  # initial learning rate handed to Adam
NUM_EPOCHS = 20  # number of passes over the training loader
MAX_SEQ_LENGTH = 100  # every sequence is truncated/padded to this many tokens

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset class
class Seq2SeqDataset(Dataset):
    """Parallel source/target corpus of whitespace-tokenised lines.

    Line ``i`` of ``source_file`` is paired with line ``i`` of ``target_file``.
    Each item is a pair of ``torch.long`` tensors of fixed length: token
    indices, truncated or right-padded with the ``'<pad>'`` index.

    Vocabulary files hold one token per line; a token's index is its line
    number (0-based). Tokens missing from a vocabulary map to ``'<unk>'``,
    which therefore must be present in the vocabulary file if out-of-vocabulary
    tokens can occur.

    Args:
        source_file, target_file: parallel text files.
        source_vocab_file, target_vocab_file: one-token-per-line vocabularies.
        max_seq_length: fixed item length; defaults to the module-level
            ``MAX_SEQ_LENGTH`` when omitted.

    Raises:
        ValueError: if the source and target files have different line counts.
    """

    def __init__(self, source_file, target_file, source_vocab_file, target_vocab_file, max_seq_length=None):
        self.source_data = self.load_data(source_file)
        self.target_data = self.load_data(target_file)
        if len(self.source_data) != len(self.target_data):
            raise ValueError(
                f"source/target line counts differ: "
                f"{len(self.source_data)} vs {len(self.target_data)}"
            )
        self.source_vocab = self.load_vocab(source_vocab_file)
        self.target_vocab = self.load_vocab(target_vocab_file)
        self.max_seq_length = MAX_SEQ_LENGTH if max_seq_length is None else max_seq_length

        # process_sequence pads BOTH sides with vocab['<pad>'], so both
        # vocabularies need the token (previously only the target got it and
        # padding a source sequence could raise KeyError).
        for vocab in (self.source_vocab, self.target_vocab):
            if '<pad>' not in vocab:
                vocab['<pad>'] = len(vocab)

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.source_data)

    def __getitem__(self, idx):
        """Return ``(source_ids, target_ids)`` for pair ``idx``."""
        source_seq = self.process_sequence(self.source_data[idx], self.source_vocab)
        target_seq = self.process_sequence(self.target_data[idx], self.target_vocab)
        return source_seq, target_seq

    def load_data(self, file_path):
        """Read ``file_path`` and return one token list per line."""
        with open(file_path, 'r', encoding='utf-8') as file:
            return [line.strip().split() for line in file]

    def load_vocab(self, vocab_file):
        """Read ``vocab_file`` and return a ``{token: line_index}`` dict."""
        with open(vocab_file, 'r', encoding='utf-8') as file:
            return {token.strip(): idx for idx, token in enumerate(file)}

    def process_sequence(self, sequence, vocab):
        """Map tokens to indices, then truncate/right-pad to the fixed length."""
        indexed_seq = [vocab[token] if token in vocab else vocab['<unk>'] for token in sequence]
        indexed_seq = indexed_seq[:self.max_seq_length]
        # A non-positive pad count multiplies the list by <= 0, i.e. adds nothing.
        indexed_seq += [vocab['<pad>']] * (self.max_seq_length - len(indexed_seq))
        return torch.tensor(indexed_seq, dtype=torch.long)

# Positional Encoding class
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to a sequence-first tensor.

    Args:
        d_model: feature size of the input (odd sizes are supported).
        max_len: largest sequence length that can be encoded.
        dropout: dropout probability applied after adding the encoding.

    ``forward`` indexes positions along dim 0, i.e. it expects input laid out
    as ``(seq_len, batch, d_model)``.
    """

    def __init__(self, d_model, max_len=5000, dropout=0.1):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one cosine column fewer than sine
        # columns, so trim div_term to avoid a shape mismatch.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Shape (max_len, 1, d_model): broadcasts over the batch dimension.
        pe = pe.unsqueeze(1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:seq_len])`` for ``x`` of shape (seq_len, batch, d_model).

        Raises:
            ValueError: if ``x`` is longer than ``max_len``.
        """
        seq_len = x.size(0)
        if seq_len > self.pe.size(0):
            raise ValueError(
                f"sequence length {seq_len} exceeds positional-encoding max_len {self.pe.size(0)}"
            )
        x = x + self.pe[:seq_len, :]
        return self.dropout(x)

# Universal Transformer class
class UniversalTransformer(nn.Module):
    """Encoder-only tagger: embeds source ids, encodes them, and projects every
    position to ``output_vocab_size`` logits.

    Input is batch-first, ``(batch, seq_len)`` token ids, which is what a
    ``DataLoader`` yields. Output is ``(batch, seq_len, output_vocab_size)``,
    so the output length always equals the input length.
    """

    def __init__(self, input_vocab_size, output_vocab_size, d_model=512, num_heads=4, num_layers=2):
        super(UniversalTransformer, self).__init__()
        self.embedding = nn.Embedding(input_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model)

        # batch_first=True: nn.TransformerEncoderLayer defaults to sequence-first
        # input; without it the batch axis was treated as the sequence axis and
        # attention mixed unrelated examples instead of tokens.
        encoder_layers = nn.TransformerEncoderLayer(
            d_model, num_heads, dim_feedforward=512, dropout=0.1, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)

        self.fc = nn.Linear(d_model, output_vocab_size)

    def forward(self, src):
        """Map ``(batch, seq_len)`` ids to ``(batch, seq_len, output_vocab_size)`` logits."""
        src = self.embedding(src)  # (batch, seq_len, d_model)
        # PositionalEncoding indexes positions along dim 0, so present the
        # tensor sequence-first to it and switch back afterwards.
        src = self.positional_encoding(src.transpose(0, 1)).transpose(0, 1)
        output = self.transformer_encoder(src)
        output = self.fc(output)
        return output

# Training function
def train_epoch(model, optimizer, criterion, train_loader, device, scheduler, scaler):
    """Run one mixed-precision optimisation pass over ``train_loader``.

    Args:
        model: maps a ``(batch, seq)`` id tensor to ``(batch, seq, vocab)`` logits.
        optimizer: optimizer bound to ``model``'s parameters.
        criterion: loss taking ``(N, vocab)`` logits and ``(N,)`` targets. If it
            has an ``ignore_index`` attribute, targets equal to it are excluded
            from the accuracy as well.
        train_loader: iterable of ``(src, tgt)`` tensor batches.
        device: device the batches are moved to; autocast is enabled only
            when this is a CUDA device.
        scheduler: learning-rate scheduler, stepped ONCE at the end of the epoch.
        scaler: ``torch.cuda.amp.GradScaler`` used for loss scaling.

    Returns:
        ``(mean_batch_loss, token_accuracy)`` where accuracy is in ``[0, 1]``
        and is computed over non-ignored target tokens only.
    """
    model.train()
    ignore_index = getattr(criterion, 'ignore_index', -100)
    use_amp = torch.device(device).type == 'cuda'
    total_loss = 0.0
    total_correct = 0
    total_count = 0
    num_batches = 0

    for src, tgt in train_loader:
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()

        with torch.cuda.amp.autocast(enabled=use_amp):
            output = model(src)
            # reshape (not view) so non-contiguous model outputs are accepted too
            loss = criterion(output.reshape(-1, output.shape[-1]), tgt.reshape(-1))

        scaler.scale(loss).backward()
        # Gradients are still multiplied by the loss scale here; unscale them
        # first so max_norm=1.0 applies to the true gradient norm.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()

        # criterion already averages within the batch, so average over batches.
        total_loss += loss.item()
        num_batches += 1

        # Score only real tokens; padding would otherwise inflate the count.
        scored = tgt != ignore_index
        total_correct += ((output.argmax(-1) == tgt) & scored).sum().item()
        total_count += scored.sum().item()

    # Step the schedule per epoch. Stepping per batch decays the learning rate
    # hundreds of times per epoch and drives it to ~0 almost immediately.
    scheduler.step()

    return total_loss / max(num_batches, 1), total_correct / max(total_count, 1)

# Load dataset and create DataLoader
# Training split only; batches are shuffled and loaded by 4 worker processes
# into pinned host memory.
train_dataset = Seq2SeqDataset(TRAIN_SOURCE_FILE, TRAIN_TARGET_FILE, SOURCE_VOCAB_FILE, TARGET_VOCAB_FILE)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4, pin_memory=True)

# Create model, optimizer, and loss function
# The model is sized from the dataset's source/target vocabularies.
model = UniversalTransformer(len(train_dataset.source_vocab), len(train_dataset.target_vocab)).to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# StepLR with step_size=1 multiplies the learning rate by 0.95 on every
# scheduler.step() call.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)
# Target positions holding the '<pad>' index are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=train_dataset.target_vocab['<pad>'])

# Loss scaler used by the mixed-precision training step.
scaler = torch.cuda.amp.GradScaler()

# Training loop
# Train for NUM_EPOCHS epochs, timing each one and reporting its metrics.
for epoch in range(NUM_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train_epoch(model, optimizer, criterion, train_loader, DEVICE, scheduler, scaler)
    elapsed_time = time.time() - start_time
    print(
        f"Epoch {epoch + 1}/{NUM_EPOCHS} | Time: {elapsed_time:.2f}s",
        f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}",
        sep="\n",
    )

Epoch 1/20 | Time: 8.76s
Train Loss: 0.0497, Train Accuracy: 0.1772
Epoch 2/20 | Time: 9.00s
Train Loss: 0.0491, Train Accuracy: 0.1805
Epoch 3/20 | Time: 9.02s
Train Loss: 0.0491, Train Accuracy: 0.1801
Epoch 4/20 | Time: 8.94s
Train Loss: 0.0491, Train Accuracy: 0.1806
Epoch 5/20 | Time: 10.23s
Train Loss: 0.0491, Train Accuracy: 0.1803
Epoch 6/20 | Time: 7.94s
Train Loss: 0.0491, Train Accuracy: 0.1805
Epoch 7/20 | Time: 8.58s
Train Loss: 0.0491, Train Accuracy: 0.1805
Epoch 8/20 | Time: 8.14s
Train Loss: 0.0491, Train Accuracy: 0.1804
Epoch 9/20 | Time: 8.45s
Train Loss: 0.0491, Train Accuracy: 0.1807
Epoch 10/20 | Time: 8.58s
Train Loss: 0.0491, Train Accuracy: 0.1802
Epoch 11/20 | Time: 8.05s
Train Loss: 0.0491, Train Accuracy: 0.1803
Epoch 12/20 | Time: 9.10s
Train Loss: 0.0491, Train Accuracy: 0.1805
Epoch 13/20 | Time: 8.48s
Train Loss: 0.0491, Train Accuracy: 0.1805
Epoch 14/20 | Time: 8.30s
Train Loss: 0.0491, Train Accuracy: 0.1808
Epoch 15/20 | Time: 8.64s
Train Loss: 0.04

In [None]:
!pip install tensorflow
!pip install tensorflow-text


Collecting tensorflow-text
  Downloading tensorflow_text-2.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m56.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tensorflow<2.17,>=2.16.1 (from tensorflow-text)
  Downloading tensorflow-2.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (589.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m589.8/589.8 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting h5py>=3.10.0 (from tensorflow<2.17,>=2.16.1->tensorflow-text)
  Downloading h5py-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.3/5.3 MB[0m [31m103.0 MB/s[0m eta [36m0:00:00[0m
Collecting ml-dtypes~=0.3.1 (from tensorflow<2.17,>=2.16.1->tensorflow-text)
  Downloading ml_dtypes-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.2 MB

In [None]:
!pip install tokenizers



In [None]:
pip install torch transformers tqdm

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [None]:
import tensorflow as tf
import numpy as np

# Function to load vocabularies
def load_vocab(filepath):
    """Return the lines of the UTF-8 file ``filepath`` as a list, without line endings."""
    with open(filepath, mode='r', encoding='utf-8') as vocab_file:
        return vocab_file.read().splitlines()

# Load source and target vocabularies
source_vocab = load_vocab("/content/drive/MyDrive/IIITH/COGS-main/output_path/source_vocab.txt")
target_vocab = load_vocab("/content/drive/MyDrive/IIITH/COGS-main/output_path/target_vocab.txt")

# Tokenizer initialization
# filters='' disables punctuation stripping; unknown words map to '<unk>'.
# NOTE(review): `lower` is left at its default — presumably tokens are
# lower-cased, which would merge case-distinct vocabulary entries; confirm.
source_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<unk>')
target_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<unk>')

# Fit tokenizer on vocabularies
# Each vocabulary line is treated as one "text", so indices are assigned by
# the tokenizer, not by line number in the vocab file.
source_tokenizer.fit_on_texts(source_vocab)
target_tokenizer.fit_on_texts(target_vocab)

# Vocabulary sizes
# +1 presumably reserves index 0 for padding (the padded sequences printed
# by this cell are zero-filled) — TODO confirm.
source_vocab_size = len(source_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1

# Function to load and preprocess data
def load_data(source_file, target_file, source_tokenizer, target_tokenizer, max_seq_length=None):
    """Read two parallel text files and return them as padded id matrices.

    Each file is split into lines, each line is converted to a list of ids by
    the matching tokenizer, and the lists are right-padded ('post'). When
    ``max_seq_length`` is truthy it is used as the fixed width; otherwise each
    side is padded to its own longest sequence.

    Returns:
        ``(source_sequences, target_sequences)`` as returned by ``pad_sequences``.
    """
    def _read_lines(path):
        # Whole file at once, split on line boundaries.
        with open(path, 'r', encoding='utf-8') as handle:
            return handle.read().splitlines()

    def _pad(sequences):
        pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
        if max_seq_length:
            return pad_sequences(sequences, padding='post', maxlen=max_seq_length)
        return pad_sequences(sequences, padding='post')

    source_lines = _read_lines(source_file)
    target_lines = _read_lines(target_file)

    source_ids = source_tokenizer.texts_to_sequences(source_lines)
    target_ids = target_tokenizer.texts_to_sequences(target_lines)

    return _pad(source_ids), _pad(target_ids)

# File paths
train_source_file = "/content/drive/MyDrive/IIITH/COGS-main/output_path/train_source.txt"
train_target_file = "/content/drive/MyDrive/IIITH/COGS-main/output_path/train_target.txt"
gen_source_file = "/content/drive/MyDrive/IIITH/COGS-main/output_path/gen_source.txt"
gen_target_file = "/content/drive/MyDrive/IIITH/COGS-main/output_path/gen_target.txt"

# Load and preprocess training and generation data
train_source, train_target = load_data(train_source_file, train_target_file, source_tokenizer, target_tokenizer)
gen_source, gen_target = load_data(gen_source_file, gen_target_file, source_tokenizer, target_tokenizer)

# Token ids are integer indices (they feed embedding lookups and sparse
# labels), so convert to int32 tensors rather than float32.
train_source = tf.cast(train_source, dtype=tf.int32)
train_target = tf.cast(train_target, dtype=tf.int32)
gen_source = tf.cast(gen_source, dtype=tf.int32)
gen_target = tf.cast(gen_target, dtype=tf.int32)

# Display shape of loaded data
print(f"Training Source shape: {train_source.shape}, Training Target shape: {train_target.shape}")
print(f"Generation Source shape: {gen_source.shape}, Generation Target shape: {gen_target.shape}")

# Example of how to access tokenized sequences
print("Example of tokenized source sequence:")
print(train_source[0])
print("Example of tokenized target sequence:")
print(train_target[0])


Training Source shape: (24155, 22), Training Target shape: (24155, 153)
Generation Source shape: (21000, 61), Generation Target shape: (21000, 480)
Example of tokenized source sequence:
tf.Tensor(
[  2. 521. 246. 570. 397.   2. 428. 701.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.], shape=(22,), dtype=float32)
Example of tokenized target sequence:
tf.Tensor(
[516.  91. 485. 569. 254. 282. 662. 634. 685. 181.  91. 485. 569.  60.
 101. 485. 569. 254. 282. 662. 634. 685. 170.  91. 485. 569.  60. 101.
 485. 569. 256. 282. 662. 418.  91. 485. 569. 256. 282.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0

In [None]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import LayerNormalization, MultiHeadAttention, Dropout, Dense

# Positional Encoding function
def positional_encoding(position, d_model):
    """Build a sinusoidal positional-encoding table.

    Args:
        position: number of positions (rows) to generate.
        d_model: encoding width (columns).

    Returns:
        float32 tensor of shape ``(1, position, d_model)``; even columns hold
        sines and odd columns cosines of ``pos / 10000^(2*(i//2)/d_model)``.
    """
    # (An unused `angle_rads = np.linspace(...)` computation was removed; it
    # never contributed to the result.)
    pos = np.arange(position).reshape(position, 1)
    pos_encoding = pos / np.power(10000, (2 * (np.arange(d_model) // 2)) / np.float32(d_model))
    pos_encoding[:, 0::2] = np.sin(pos_encoding[:, 0::2])
    pos_encoding[:, 1::2] = np.cos(pos_encoding[:, 1::2])
    # Add a leading broadcast (batch) axis and make sure the dtype is float32.
    pos_encoding = tf.cast(pos_encoding[np.newaxis, ...], dtype=tf.float32)
    return pos_encoding

# Universal Transformer model
class UniversalTransformer(tf.keras.Model):
    """Encoder-decoder transformer over integer token ids.

    ``call`` accepts either
      * a single source tensor ``(batch, src_len)`` -> returns the encoder
        output ``(batch, src_len, d_model)`` (the previous behaviour), or
      * a pair ``(source, target_in)`` -> runs encoder and decoder and returns
        logits ``(batch, tgt_len, target_vocab_size)``.

    The final layer has no activation, so the outputs are logits; pair them
    with a loss configured with ``from_logits=True``.

    Args:
        num_layers: number of encoder layers and of decoder layers.
        d_model: embedding / hidden width.
        num_heads: attention heads per layer.
        dff: width of the feed-forward sub-layer.
        input_vocab_size: size of the source embedding table.
        target_vocab_size: size of the target embedding table and of the output.
        pe_input, pe_target: positional-encoding tables, sliced as
            ``[:, :seq_len, :]`` and added to the embeddings.
        rate: dropout rate passed to every layer.
    """

    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, pe_input, pe_target, rate=0.1):
        super(UniversalTransformer, self).__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
        # Target ids come from a different vocabulary; looking them up in the
        # source table is wrong and can index out of range.
        self.target_embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        self.pos_encoding_input = tf.cast(pe_input, dtype=tf.float32)
        self.pos_encoding_target = tf.cast(pe_target, dtype=tf.float32)
        self.encoder_layers = [TransformerEncoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
        self.decoder_layers = [TransformerDecoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
        self.final_layer = Dense(target_vocab_size)

    def _encode(self, inp, training):
        """Embed source ids, add positions, and run the encoder stack."""
        seq_len = tf.shape(inp)[1]
        inp = tf.cast(self.embedding(inp), dtype=tf.float32) + self.pos_encoding_input[:, :seq_len, :]

        for layer in self.encoder_layers:
            inp = layer(inp, training=training)

        return inp

    def call(self, inp, training=True):
        """Encode ``inp``; if ``inp`` is a ``(source, target_in)`` pair, also decode to logits."""
        if isinstance(inp, (tuple, list)):
            source, target_in = inp
            enc_output = self._encode(source, training)
            return self.decode(target_in, enc_output, training=training)
        return self._encode(inp, training)

    def decode(self, tar, enc_output, training=True):
        """Run the decoder stack on target ids ``tar`` attending to ``enc_output``; returns logits."""
        seq_len = tf.shape(tar)[1]
        tar = tf.cast(self.target_embedding(tar), dtype=tf.float32) + self.pos_encoding_target[:, :seq_len, :]

        for layer in self.decoder_layers:
            tar = layer(tar, enc_output, training=training)

        final_output = self.final_layer(tar)
        return final_output

# Transformer Encoder Layer
class TransformerEncoderLayer(tf.keras.layers.Layer):
    """One encoder block: self-attention, then a two-layer feed-forward
    network, each followed by dropout, a residual add and layer norm."""

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super().__init__()
        per_head_dim = d_model // num_heads
        self.multi_head_attention = MultiHeadAttention(num_heads=num_heads, key_dim=per_head_dim)
        self.dropout1 = Dropout(rate)
        self.layer_norm1 = LayerNormalization(epsilon=1e-6)
        self.dense1 = Dense(dff, activation='relu')
        self.dense2 = Dense(d_model)
        self.dropout2 = Dropout(rate)
        self.layer_norm2 = LayerNormalization(epsilon=1e-6)

    def call(self, x, training=True):
        """Apply the block to ``x``; ``training`` toggles both dropouts."""
        # Self-attention sub-block (query = value = x).
        attended = self.multi_head_attention(x, x, return_attention_scores=False)
        attended = self.dropout1(attended, training=training)
        normed_attention = self.layer_norm1(x + attended)

        # Position-wise feed-forward sub-block.
        hidden = self.dense1(normed_attention)
        projected = self.dropout2(self.dense2(hidden), training=training)
        return self.layer_norm2(normed_attention + projected)

# Transformer Decoder Layer
class TransformerDecoderLayer(tf.keras.layers.Layer):
    """One decoder block: causal self-attention, cross-attention over the
    encoder output, then a two-layer feed-forward network. Each sub-block is
    followed by dropout, a residual add and layer norm."""

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(TransformerDecoderLayer, self).__init__()
        self.multi_head_attention1 = MultiHeadAttention(num_heads=num_heads, key_dim=d_model // num_heads)
        self.dropout1 = Dropout(rate)
        self.layer_norm1 = LayerNormalization(epsilon=1e-6)
        self.multi_head_attention2 = MultiHeadAttention(num_heads=num_heads, key_dim=d_model // num_heads)
        self.dropout2 = Dropout(rate)
        self.layer_norm2 = LayerNormalization(epsilon=1e-6)
        self.dense1 = Dense(dff, activation='relu')
        self.dense2 = Dense(d_model)
        self.dropout3 = Dropout(rate)
        self.layer_norm3 = LayerNormalization(epsilon=1e-6)

    def call(self, x, enc_output, training=True):
        """Decode ``x`` (batch, tgt_len, d_model) against ``enc_output`` (batch, src_len, d_model)."""
        # Causal mask: position t may only attend to positions <= t, otherwise
        # teacher-forced training can read the tokens it is asked to predict.
        attn1 = self.multi_head_attention1(x, x, use_causal_mask=True, return_attention_scores=False)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layer_norm1(attn1 + x)

        # Cross-attention: the decoder state is the QUERY and the encoder
        # output supplies keys/values. (MultiHeadAttention takes (query, value);
        # the arguments were previously swapped, which yields a src_len-long
        # result that cannot be added to out1.)
        attn2 = self.multi_head_attention2(out1, enc_output, return_attention_scores=False)
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layer_norm2(attn2 + out1)

        ffn_output = self.dense2(self.dense1(out2))
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layer_norm3(out2 + ffn_output)

        return out3



# Assuming train_source, train_target, gen_source, gen_target are TensorFlow tensors
# Token ids must be integers for the embedding lookup and sparse labels.
train_source = tf.cast(train_source, dtype=tf.int32)
train_target = tf.cast(train_target, dtype=tf.int32)
gen_source = tf.cast(gen_source, dtype=tf.int32)
gen_target = tf.cast(gen_target, dtype=tf.int32)

# Model hyperparameters
num_layers = 2
d_model = 128
num_heads = 4
dff = 512
# Sizes come from the fitted tokenizers; +1 presumably accounts for the
# padding index 0 — TODO confirm.
input_vocab_size = len(source_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1
dropout_rate = 0.1

# Initialize positional encodings
# Table lengths (10000 source / 6000 target positions) are upper bounds; the
# model slices them down to the actual sequence length.
pe_input = positional_encoding(10000, d_model)
pe_target = positional_encoding(6000, d_model)

# Initialize model
transformer = UniversalTransformer(num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size,
                                   pe_input=pe_input,
                                   pe_target=pe_target,
                                   rate=dropout_rate)

# Compile and fit model
# NOTE(review): the recorded run of this cell fails inside fit() with
# "Dimensions must be equal, but are 153 and 22": only the source tensor is
# fed to the model, so its output has the source length (22) while the labels
# have the target length (153). The target sequence is never passed to the
# model here.
# NOTE(review): the string loss 'sparse_categorical_crossentropy' presumably
# expects probabilities rather than logits — confirm against the model output.
transformer.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
history = transformer.fit(train_source, train_target, epochs=10, validation_data=(gen_source, gen_target))

# Evaluate model
loss, accuracy = transformer.evaluate(gen_source, gen_target)
print(f'Evaluation loss: {loss}, accuracy: {accuracy}')


Epoch 1/10


ValueError: in user code:

    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1401, in train_function  *
        return step_function(self, iterator)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1384, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1373, in run_step  **
        outputs = model.train_step(data)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1155, in train_step
        return self.compute_metrics(x, y, y_pred, sample_weight)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1249, in compute_metrics
        self.compiled_metrics.update_state(y, y_pred, sample_weight)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/compile_utils.py", line 620, in update_state
        metric_obj.update_state(y_t, y_p, sample_weight=mask)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/metrics_utils.py", line 77, in decorated
        result = update_state_fn(*args, **kwargs)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/metrics/base_metric.py", line 140, in update_state_fn
        return ag_update_state(*args, **kwargs)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/metrics/base_metric.py", line 723, in update_state  **
        matches = ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/metrics/accuracy_metrics.py", line 459, in sparse_categorical_accuracy
        matches = metrics_utils.sparse_categorical_matches(y_true, y_pred)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/metrics_utils.py", line 969, in sparse_categorical_matches
        matches = tf.cast(tf.equal(y_true, y_pred), backend.floatx())

    ValueError: Dimensions must be equal, but are 153 and 22 for '{{node Equal}} = Equal[T=DT_FLOAT, incompatible_shape_error=true](Cast_3, Cast_5)' with input shapes: [?,153], [?,22].


In [None]:
import tensorflow as tf
import numpy as np

# Parameters
# 22 and 153 match the padded widths of the training source/target data
# printed by the earlier data-loading cell.
max_source_length = 22  # Length of the source sentences
max_target_length = 153  # Length of the target logical forms
# Used for both the embedding table and the output layer of the model below.
vocab_size = 10000  # Adjust based on your vocabulary size
embedding_dim = 256  # embedding width; also used as key_dim of each attention layer
num_heads = 4  # attention heads per layer
num_layers = 2  # number of attention + layer-norm blocks
dropout_rate = 0.1  # dropout applied to each attention output

# Load your data
def load_data(source_file, target_file, encoding='utf-8'):
    """Read two parallel text files.

    Args:
        source_file: path of the source-side file.
        target_file: path of the target-side file.
        encoding: text encoding of both files. Defaults to UTF-8 so the result
            does not depend on the platform's locale (the other loaders in
            this notebook also read UTF-8).

    Returns:
        ``(source_data, target_data)``: two lists of raw lines, each line
        still carrying its trailing newline.
    """
    with open(source_file, 'r', encoding=encoding) as f:
        source_data = f.readlines()
    with open(target_file, 'r', encoding=encoding) as f:
        target_data = f.readlines()
    return source_data, target_data

# Convert text to sequences
def text_to_sequences(text, vocab):
    """Convert each line of ``text`` into a list of vocabulary indices.

    Lines are stripped and split on whitespace; words absent from ``vocab``
    map to ``vocab['<unk>']``.
    """
    return [
        [vocab.get(word, vocab['<unk>']) for word in line.strip().split()]
        for line in text
    ]

# Load vocab files
def load_vocab(vocab_file, encoding='utf-8'):
    """Read a one-token-per-line vocabulary file.

    Args:
        vocab_file: path of the vocabulary file.
        encoding: text encoding of the file. Defaults to UTF-8 so the result
            does not depend on the platform's locale.

    Returns:
        Dict mapping each stripped token to its 0-based line number.
    """
    with open(vocab_file, 'r', encoding=encoding) as f:
        vocab = {word.strip(): i for i, word in enumerate(f)}
    return vocab

# Token -> index dictionaries; a token's index is its line number in the file.
source_vocab = load_vocab('/content/drive/MyDrive/IIITH/COGS-main/output_path/source_vocab.txt')
target_vocab = load_vocab('/content/drive/MyDrive/IIITH/COGS-main/output_path/target_vocab.txt')

# Load data
train_source, train_target = load_data('/content/drive/MyDrive/IIITH/COGS-main/output_path/train_source.txt', '/content/drive/MyDrive/IIITH/COGS-main/output_path/train_target.txt')

# Convert text to sequences
train_source_sequences = text_to_sequences(train_source, source_vocab)
train_target_sequences = text_to_sequences(train_target, target_vocab)

# Pad sequences
# NOTE(review): pad_sequences presumably fills with 0, while load_vocab also
# assigns index 0 to the first line of each vocab file — padding would then be
# indistinguishable from that token unless the first entry is a pad token; verify.
train_source_sequences = tf.keras.preprocessing.sequence.pad_sequences(train_source_sequences, maxlen=max_source_length, padding='post')
train_target_sequences = tf.keras.preprocessing.sequence.pad_sequences(train_target_sequences, maxlen=max_target_length, padding='post')

# Prepare the target sequences for training the decoder
# Shifted by one position: the input drops the last token and the expected
# output drops the first, so both are max_target_length - 1 wide.
train_target_input = train_target_sequences[:, :-1]  # all tokens except the last one
train_target_output = train_target_sequences[:, 1:]  # all tokens except the first one
class UniversalTransformer(tf.keras.Model):
    """Stack of self-attention blocks over embedded token ids.

    Each block is multi-head self-attention, dropout, a residual add and layer
    norm. A final dense layer (no activation) maps every position to
    ``vocab_size`` logits, so the output is ``(batch, seq_len, vocab_size)``
    with the same sequence length as the input.
    """

    def __init__(self, vocab_size, embedding_dim, num_heads, num_layers, dropout_rate):
        super(UniversalTransformer, self).__init__()
        # Keep the depth on the instance so call() does not depend on a
        # module-level variable that may differ from the constructor argument.
        self.num_layers = num_layers
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.transformer_layers = [
            tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim) for _ in range(num_layers)
        ]
        self.layer_norm = [tf.keras.layers.LayerNormalization(epsilon=1e-6) for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(dropout_rate)
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, x, training=None):
        """Map ``(batch, seq_len)`` ids to ``(batch, seq_len, vocab_size)`` logits.

        ``training`` toggles dropout; when left as ``None`` Keras decides from
        the calling context.
        """
        x = self.embedding(x)
        for attention, norm in zip(self.transformer_layers, self.layer_norm):
            attn_output = attention(x, x)
            attn_output = self.dropout(attn_output, training=training)
            x = norm(x + attn_output)
        x = self.dense(x)
        return x

# Create and compile the model
model = UniversalTransformer(vocab_size, embedding_dim, num_heads, num_layers, dropout_rate)
# NOTE(review): training below uses a custom loop with its own optimizer and
# loss objects, so this compile() configuration does not appear to be used by
# the visible code — confirm before relying on it.
model.compile(optimizer=tf.keras.optimizers.Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# Prepare the data for training
# Each element is a (source, decoder input, decoder target) triple.
batch_size = 64
train_dataset = tf.data.Dataset.from_tensor_slices((train_source_sequences, train_target_input, train_target_output))
train_dataset = train_dataset.batch(batch_size)

# Custom training loop
epochs = 10
optimizer = tf.keras.optimizers.Adam()
# from_logits=True: the loss is applied to raw scores, not probabilities.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Running aggregates updated by every training step.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

@tf.function
def train_step(source_seq, target_inp, target_out):
    """Run one gradient step on a batch and update the running loss/accuracy metrics.

    Uses the module-level ``model``, ``optimizer``, ``loss_object``,
    ``train_loss`` and ``train_accuracy``. ``target_inp`` is accepted but not
    used by the body.

    NOTE(review): the recorded run of this cell fails at the reshape below:
    ``target_out`` is ``[64, 152]`` while ``predictions`` is ``[?, 22, 10000]``,
    so the labels cannot be reshaped to the predictions' shape. The model is
    fed only ``source_seq``, so its output has the source length, not the
    target length.
    """
    with tf.GradientTape() as tape:
        predictions = model(source_seq, training=True)
        # Match the target_out shape to predictions shape
        target_out = tf.reshape(target_out, [-1, tf.shape(predictions)[1], tf.shape(predictions)[2]])
        loss = loss_object(target_out, predictions)
    # Back-propagate and apply the update to every trainable variable.
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    # Accumulate the epoch-level metrics.
    train_loss(loss)
    train_accuracy(target_out, predictions)

for epoch in range(epochs):
    # Start every epoch with empty running metrics. reset_state() is the
    # current spelling; reset_states() is the deprecated alias.
    train_loss.reset_state()
    train_accuracy.reset_state()

    for source_seq, target_inp, target_out in train_dataset:
        train_step(source_seq, target_inp, target_out)

    print(f'Epoch {epoch + 1}, Loss: {train_loss.result()}, Accuracy: {train_accuracy.result() * 100}')

# Save the model
# A subclassed tf.keras.Model cannot be serialised as a whole to HDF5
# (model.save('*.h5') is only supported for Functional/Sequential models), so
# persist the weights instead. Rebuild the model with the same constructor
# arguments and call load_weights() to restore it.
model.save_weights('universal_transformer.weights.h5')


ValueError: in user code:

    File "<ipython-input-11-8dc5a5d352f7>", line 91, in train_step  *
        target_out = tf.reshape(target_out, [-1, tf.shape(predictions)[1], tf.shape(predictions)[2]])

    ValueError: Dimension size must be evenly divisible by 220000 but is 9728 for '{{node Reshape}} = Reshape[T=DT_INT32, Tshape=DT_INT32](target_out, Reshape/shape)' with input shapes: [64,152], [3] and with input tensors computed as partial shapes: input[1] = [?,22,10000].
