In [1]:
import os

os.environ['HF_HOME'] = "./hf/"

os.environ['WANDB_DISABLED'] = 'true'

os.environ['CUDA_VISIBLE_DEVICES'] = '0'



In [2]:
import torch
from torch.utils.data import DataLoader
from torch.optim import Adam
# for Mbart
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from transformers import BertGenerationEncoder , BertGenerationDecoder, BertTokenizerFast
from transformers import EncoderDecoderModel, EncoderDecoderConfig


In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [4]:
# model_name = "facebook/mbart-large-50"
# tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
# autoencoder_model_1 = MBartForConditionalGeneration.from_pretrained(model_name).to(device)
# autoencoder_model_2 = MBartForConditionalGeneration.from_pretrained(model_name).to(device)

In [5]:
model_name = "google-bert/bert-base-multilingual-cased"
tokenizer = BertTokenizerFast.from_pretrained(model_name)

In [6]:
encoder_1 = BertGenerationEncoder.from_pretrained(model_name , bos_token_id=101, eos_token_id=102).to(device)
# add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
decoder_1 = BertGenerationDecoder.from_pretrained(model_name,add_cross_attention=True, is_decoder=True, bos_token_id=101, eos_token_id=102).to(device)

autoencoder_model_1 = EncoderDecoderModel(encoder=encoder_1,decoder=decoder_1).to(device)

autoencoder_model_1.config.decoder_start_token_id = tokenizer.cls_token_id
autoencoder_model_1.config.pad_token_id = tokenizer.pad_token_id
autoencoder_model_1.config.vocab_size = autoencoder_model_1.config.decoder.vocab_size



You are using a model of type bert to instantiate a model of type bert-generation. This is not supported for all configurations of models and can yield errors.
You are using a model of type bert to instantiate a model of type bert-generation. This is not supported for all configurations of models and can yield errors.
Some weights of BertGenerationDecoder were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.en

In [7]:
encoder_2 = BertGenerationEncoder.from_pretrained(model_name , bos_token_id=101, eos_token_id=102).to(device)
# add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
decoder_2 = BertGenerationDecoder.from_pretrained(model_name,add_cross_attention=True, is_decoder=True, bos_token_id=101, eos_token_id=102).to(device)


autoencoder_model_2 = EncoderDecoderModel(encoder=encoder_2,decoder=decoder_2).to(device)

autoencoder_model_2.config.decoder_start_token_id = tokenizer.cls_token_id
autoencoder_model_2.config.pad_token_id = tokenizer.pad_token_id
autoencoder_model_2.config.vocab_size = autoencoder_model_2.config.decoder.vocab_size



You are using a model of type bert to instantiate a model of type bert-generation. This is not supported for all configurations of models and can yield errors.
You are using a model of type bert to instantiate a model of type bert-generation. This is not supported for all configurations of models and can yield errors.
Some weights of BertGenerationDecoder were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.en

In [8]:
optimizer = Adam(list(autoencoder_model_1.parameters()) + list(autoencoder_model_2.parameters()), lr=1e-5)

In [10]:
# Contrastive loss function
def contrastive_loss(output1, output2):
    return 1 - torch.nn.functional.cosine_similarity(output1, output2 , dim=-1).mean()


In [11]:
from datasets import load_dataset

dataset = load_dataset("cfilt/iitb-english-hindi")

In [12]:
from datasets import Dataset
def generate_dataset(dataset , split):
    filtered_dataset = dataset[split]['translation']
    english_dataset = [data['en'] for data in filtered_dataset]
    hindi_dataset = [data['hi'] for data in filtered_dataset]
    data_dictionary = {
        "english" : english_dataset,
        "hindi" : hindi_dataset
    }
    return Dataset.from_dict(data_dictionary)


In [13]:
train_dataset = generate_dataset(dataset, "train")
train_dataset

Dataset({
    features: ['english', 'hindi'],
    num_rows: 1659083
})

In [14]:
test_dataset = generate_dataset(dataset , "test")
test_dataset

Dataset({
    features: ['english', 'hindi'],
    num_rows: 2507
})

In [15]:
validation_dataset = generate_dataset(dataset , "validation")
validation_dataset

Dataset({
    features: ['english', 'hindi'],
    num_rows: 520
})

In [16]:
# def tokenize_example(example , lang):
#     return tokenizer(example[lang], truncation=True)

# def tokenize_dataset(example):
#     english_tokens = tokenize_example(example, "english")
#     # english_tokens['english_tokens'] = english_tokens['input_ids']
#     english_tokens['english_attention_mask'] = english_tokens['attention_mask']
#     hindi_tokens = tokenize_example(example , "hindi")
#     english_tokens['labels'] = hindi_tokens['input_ids']
#     english_tokens['hindi_attention_mask'] = hindi_tokens['attention_mask']
#     return english_tokens



def tokenize_dataset_new(example):
    model_inputs = tokenizer(example["english"], max_length=512, truncation=True)
    labels = tokenizer(example["hindi"], max_length=512, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    model_inputs['hindi_attn_mask'] = labels['attention_mask']

    return model_inputs

In [17]:
train_tokenised_dataset = train_dataset.map(tokenize_dataset_new , batched=True , num_proc=5)
train_tokenised_dataset = train_tokenised_dataset.remove_columns(['english' , 'hindi' , 'hindi_attn_mask'])
train_tokenised_dataset

Map (num_proc=5):   0%|          | 0/1659083 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 1659083
})

In [18]:
test_tokenised_dataset = test_dataset.map(tokenize_dataset_new , batched=True , num_proc=5)
test_tokenised_dataset = test_tokenised_dataset.remove_columns(['english' , 'hindi' , 'hindi_attn_mask'])
test_tokenised_dataset

Map (num_proc=5):   0%|          | 0/2507 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 2507
})

In [19]:
validation_tokenised_dataset = validation_dataset.map(tokenize_dataset_new , batched=True , num_proc=5)
validation_tokenised_dataset = validation_tokenised_dataset.remove_columns(['english' , 'hindi' , 'hindi_attn_mask'])
validation_tokenised_dataset

Map (num_proc=5):   0%|          | 0/520 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 520
})

In [20]:
from transformers import DataCollatorForSeq2Seq
data_collector = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [21]:
from torch.utils.data import DataLoader

train_batch_size = 15
test_batch_size = 15
train_dataloader = DataLoader(train_tokenised_dataset , shuffle=True,
                                batch_size = train_batch_size,
                                collate_fn = data_collector
                                )

test_dataloader = DataLoader(test_tokenised_dataset , shuffle=True,
                                batch_size = test_batch_size,
                                collate_fn = data_collector
                                )

train_dataloader, test_dataloader

(<torch.utils.data.dataloader.DataLoader at 0x7f0455e208f0>,
 <torch.utils.data.dataloader.DataLoader at 0x7f04547b6600>)

In [25]:

for batch in test_dataloader:
    print({k:v.shape for k,v in batch.items()})
    labels = batch['labels'].to(device)
    labels = torch.where(labels != -100, labels, tokenizer.pad_token_id)
    inputs = labels
    print(inputs[1])
    value = autoencoder_model_1(input_ids = inputs , labels = inputs)
    loss = value.loss
    print(value.logits.shape)
    print("loss is " , loss)
    break

{'input_ids': torch.Size([15, 54]), 'token_type_ids': torch.Size([15, 54]), 'attention_mask': torch.Size([15, 54]), 'labels': torch.Size([15, 101])}
tensor([  101, 41993, 13088,   888, 35133, 15778, 12213,   885, 11845, 69002,
        11549,   887, 11549, 35127, 17203, 24734,   896, 41937,   889, 22078,
        15168,   920,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0], device='cuda:0')
torch.Size([15, 101, 119547])
loss is 

  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)


In [27]:
from tqdm import tqdm
import time
from transformers import get_scheduler
# Training loop
def train(model1 = autoencoder_model_1 , model2 = autoencoder_model_2, epochs=1):
    model1.train()
    model2.train()
    num_training_steps = epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        "linear",
        optimizer = optimizer,
        num_warmup_steps= 100,
        num_training_steps = num_training_steps
    )
    steps = 0
    for epoch in range(epochs):
        print(f"epoch : {epoch}")
        total_autoencoder1_loss = 0.0
        total_autoencoder2_loss = 0.0
        total_contrastive_loss = 0.0
        report_time = time.time()

        for batch in tqdm(train_dataloader , desc = "batches"):
            steps += 1
            # english 
            src_input_ids = batch['input_ids'].to(device)
            # src_attention_mask = batch['attention_mask']
            # hindi
            tgt_input_ids = batch['labels'].to(device)
            
            tgt_input_ids = torch.where(tgt_input_ids != -100, tgt_input_ids, tokenizer.pad_token_id)
            

            # Forward pass through autoencoder 1 (source) english
            outputs1 = model1(input_ids=src_input_ids, labels=src_input_ids,
                            #   attention_mask=src_attention_mask, decoder_attention_mask=src_attention_mask
                            )
            
            autoencoder1_loss = outputs1.loss

            # Get encoder output for source
            # print("print shape is " , outputs1.encoder_last_hidden_state.shape)
            encoded_output1 = outputs1.encoder_last_hidden_state[:, 0, :]  # Assuming BERT-like model , taking cls token

            # Forward pass through autoencoder 2 (target)
            outputs2 = model2(input_ids=tgt_input_ids, labels=tgt_input_ids,
                            #   attention_mask=tgt_attention_mask, decoder_attention_mask=tgt_attention_mask
                            )
            autoencoder2_loss = outputs2.loss

            # Get encoder output for target
            encoded_output2 = outputs2.encoder_last_hidden_state[:, 0, :]  # Assuming BERT-like model, taking cls token

            # Calculate contrastive loss
            contrastive_loss_value = contrastive_loss(encoded_output1, encoded_output2)

            # Total loss
            loss = autoencoder1_loss + autoencoder2_loss + contrastive_loss_value

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()

            # Accumulate losses
            total_autoencoder1_loss += autoencoder1_loss.item()
            total_autoencoder2_loss += autoencoder2_loss.item()
            total_contrastive_loss += contrastive_loss_value.item()

            if steps % 100 == 0:
                print("Total Loss " , loss.item() ,f"Contrastive Loss: {contrastive_loss_value.item()}")
                with open("log.txt" , "a") as log:
                    log_message = f"""
                    Time = {time.time()}
                    f"Autoencoder 1 Loss: {autoencoder1_loss}
                    f"Autoencoder 2 Loss: {autoencoder2_loss}
                    f"Contrastive Loss: {contrastive_loss_value}
                    Total Loss: {loss.item()} 
                    *****************
                    """
                    log.write(log_message)
            # save checkpoint every thousand steps
            if (steps+1) % 1000 == 0:
                model_1_checkpoint = f"checkpoints/autoencoder_model_1_checkpoint_{steps+1}"
                model_2_checkpoint = f"checkpoints/autoencoder_model_2_checkpoint_{steps+1}"
                autoencoder_model_1.save_pretrained(model_1_checkpoint)
                autoencoder_model_2.save_pretrained(model_2_checkpoint)



        # Print epoch statistics
        print(f"Epoch {epoch + 1}:")
        print(f"Autoencoder 1 Loss: {total_autoencoder1_loss / len(dataloader)}")
        print(f"Autoencoder 2 Loss: {total_autoencoder2_loss / len(dataloader)}")
        print(f"Contrastive Loss: {total_contrastive_loss / len(dataloader)}")


In [29]:
train()

epoch : 0


batches:   0%|          | 25/110606 [00:10<12:56:45,  2.37it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 3.42 GiB. GPU 0 has a total capacity of 79.15 GiB of which 1.88 GiB is free. Process 59248 has 4.88 GiB memory in use. Including non-PyTorch memory, this process has 72.38 GiB memory in use. Of the allocated memory 67.63 GiB is allocated by PyTorch, and 4.25 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

: 

In [None]:
autoencoder_model_1.save_pretrained("./autoencoder_model_1")
autoencoder_model_2.save_pretrained("./autoencoder_model_2")

Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}


In [9]:
model1 = EncoderDecoderModel.from_pretrained("checkpoints/mBERT/autoencoder_model_1_checkpoint_17000")

model2 = EncoderDecoderModel.from_pretrained("checkpoints/mBERT/autoencoder_model_2_checkpoint_17000")


In [14]:
# Inference
def translate(src_sentence):
    print(src_sentence)
    tokens = tokenizer(src_sentence, return_tensors='pt')
    # print(tokens)
    encoded_src = model1.encoder(input_ids = tokens.input_ids , attention_mask = tokens.attention_mask).last_hidden_state
    # print(encoded_src)
    generated_tgt = model2.decoder.generate(encoder_hidden_states=encoded_src)
    # print(generated_tgt)
    decoded_tgt = tokenizer.decode(generated_tgt[0], skip_special_tokens=True)
    return decoded_tgt

In [15]:
src_sentence = "this is a test sentence"
translate(src_sentence)

this is a test sentence




''

In [13]:
model1.encoder()

ValueError: You have to specify either input_ids or inputs_embeds