In [1]:
# exp 4.1

In [2]:
# best submission generated via this notebook: 
# dev1: submission_7_exp41_max_ckp.zip            (358645)
# dev2: submission_8_exp41_maxckp_genv2.zip       (358690)

# test1: submission_3_exp4dot1_max_ckp.zip        (359202)
# test2: submission_4_exp4dot1_max_ckp_genv2.zip  (359211) 

In [32]:
import os
from dataclasses import dataclass
import torch
import numpy as np
import math
from tqdm import tqdm
import json 
import pandas as pd

from src.tokenizers.eng import EnglishTokenizer
from src.tokenizers.indic import IndicTokenizer
from src.components import TransformerMT as selftx
from src.torchlayers import TransformerMT as torchtx

In [57]:
def translate_batch(model, eng_tokenizer, indic_tokenizer, english_texts, device='cuda', verbose=False,
                    max_length=30, temperature=0.8):
    """Translate a batch of English sentences with ``model``.

    Args:
        model: translation model exposing ``generatev2(src, max_length=..., temperature=...)``
            plus the usual ``eval()`` / ``train(mode)`` / ``training`` module API.
        eng_tokenizer: source tokenizer. The result of ``texts_to_sequences`` is passed
            straight to ``torch.tensor``, so it has to be rectangular
            (presumably the tokenizer pads/truncates to a fixed length -- TODO confirm).
        indic_tokenizer: target tokenizer; ``sequences_to_texts`` decodes the generated ids.
        english_texts: sequence of source sentences.
        device: device on which the source id tensor is created.
        verbose: when True, print the batch size and the shape of the generated ids.
        max_length: forwarded to ``model.generatev2`` (default keeps the previous hard-coded 30).
        temperature: forwarded to ``model.generatev2`` (default keeps the previous hard-coded 0.8).

    Returns:
        Whatever ``indic_tokenizer.sequences_to_texts`` returns for the generated ids;
        an empty list when ``english_texts`` is empty.
    """
    # nothing to translate: avoid building an empty tensor and calling the model on it
    if len(english_texts) == 0:
        return []

    # all texts to sequences at once
    english_ids = eng_tokenizer.texts_to_sequences(english_texts)
    english_tensor = torch.tensor(english_ids, device=device)

    # A freshly constructed module is in training mode, which keeps dropout active.
    # Run generation in eval mode and restore the caller's mode afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # generate translations for the entire batch from model.generatev2
            translation_ids = model.generatev2(
                english_tensor, max_length=max_length, temperature=temperature
            )
    finally:
        model.train(was_training)

    if verbose:
        print(f"Batch size: {len(english_texts)}")
        print(f"Generated shape: {translation_ids.shape}")

    # decode with the indic tokenizer
    translation_array = translation_ids.cpu().numpy()
    indic_texts = indic_tokenizer.sequences_to_texts(translation_array)

    return indic_texts

In [58]:
# validation phase (disabled): uncomment to run on the validation split instead of the test split
# test_data = json.load(open(os.path.join("data", "raw", "val_data1.json")))
# key = 'Validation'

In [59]:
# test phase: load the final test set; `key` names the split looked up under each language pair
# (context manager closes the file; JSON is read explicitly as UTF-8 instead of the locale default)
with open(os.path.join("data", "raw", "test_data1_final.json"), encoding="utf-8") as test_file:
    test_data = json.load(test_file)
key = 'Test'

### ENG 2 HINDI

In [60]:
@dataclass
class TransformerConfig:
    """Hyper-parameter defaults declared in the English-to-Hindi section.

    NOTE(review): no cell visible in this notebook instantiates this class; the
    model is built from ``checkpoint['model_config']``. Presumably the class has
    to exist so ``torch.load`` can unpickle ``checkpoint['config']`` -- TODO confirm.
    """

    # --- vocabulary ---
    SRC_VOCAB_SIZE: int = 30000     # size of the source-language vocabulary
    TGT_VOCAB_SIZE: int = 30000     # size of the target-language vocabulary

    # --- sequence lengths ---
    SRC_MAX_LENGTH: int = 256       # longest source sequence
    TGT_MAX_LENGTH: int = 256       # longest target sequence

    # --- architecture ---
    D_MODEL: int = 128              # embedding width
    N_HEADS: int = 8                # attention heads
    N_LAYERS: int = 6               # transformer blocks
    D_FF: int = 4 * 128             # feed-forward width (4x the embedding width)
    MAX_SEQ_LEN: int = 256
    DROPOUT: float = 0.1

    # --- training loop ---
    BATCH_SIZE: int = 32
    EVAL_STEPS: int = 100
    EPOCHS: int = 30

In [61]:
hindi_checkpoint_path = os.path.join("checkpoints", "eng_hindi", "exp4.1-eng-hindi-transformer-built-in")
# SECURITY: weights_only=False uses the full pickle loader, which can execute arbitrary code;
# only load checkpoints from a trusted source.
# map_location="cpu" keeps the checkpoint (including the optimizer state, which inference does
# not need) off the GPU; load_state_dict later copies the weights onto the model's own device.
checkpoint = torch.load(
    os.path.join(hindi_checkpoint_path, "tx_epoch_21_step_45500.pt"),
    map_location="cpu",
    weights_only=False,
)

In [62]:
# show the top-level entries stored in the loaded checkpoint
checkpoint.keys()

dict_keys(['step', 'epoch', 'model_state_dict', 'optimizer_state_dict', 'loss', 'config', 'tr_lossi', 'val_lossi', 'eng_tokenizer', 'indic_tokenizer', 'model_config'])

In [63]:
# Rebuild the English tokenizer from the state saved inside the checkpoint:
# construct it with the stored size limits, then restore the vocabulary tables.
eng_state = checkpoint['eng_tokenizer']
eng_tok = EnglishTokenizer(eng_state['max_vocab_size'], eng_state['max_length'])
eng_tok.word2idx, eng_tok.idx2word, eng_tok.vocab_size = (
    eng_state['word2idx'],
    eng_state['idx2word'],
    eng_state['vocab_size'],
)

In [64]:
# Rebuild the Indic (target-language) tokenizer from the state saved inside the checkpoint:
# construct it with the stored size limits, then restore the vocabulary tables.
indic_state = checkpoint['indic_tokenizer']
indic_tok = IndicTokenizer(indic_state['max_vocab_size'], indic_state['max_length'])
indic_tok.word2idx, indic_tok.idx2word, indic_tok.vocab_size = (
    indic_state['word2idx'],
    indic_state['idx2word'],
    indic_state['vocab_size'],
)

In [65]:
# Rebuild the network from the hyper-parameters stored with the checkpoint.
model_config = checkpoint['model_config']
model = torchtx(**model_config)
model.to('cuda')
# A freshly built module starts in training mode (assumes torchtx is a torch.nn.Module);
# switch to eval so dropout is disabled for inference.
model.eval()
# Kept as the last expression so the cell still displays the key-matching report.
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [66]:
# Smoke test: translate a single sentence with the Hindi model.
translation = translate_batch(
    model=model,
    eng_tokenizer=eng_tok,
    indic_tokenizer=indic_tok,
    english_texts=["Hi how are you"],
    device='cuda',
)

In [67]:
# Display the smoke-test result.
# NOTE(review): the recorded output keeps producing tokens after the first sentence;
# presumably generation/decoding is not cut at an end-of-sequence token -- TODO confirm.
translation

['हाय आप कैसे हैं? RD _ PUNC एम. out करो? 11? अलग हैं!']

In [68]:
# Translate the whole English-Hindi split in fixed-size batches.
batch_size = 32  # sentences per translate_batch call

# ids and English source sentences, in the order they appear in the data file
hindi_split = test_data['English-Hindi'][key]
ids = list(hindi_split.keys())
texts = [entry['source'] for entry in hindi_split.values()]

print(f"Total samples: {len(texts)}")
print(f"Processing in batches of {batch_size}")

all_translations = []
for i in tqdm(range(0, len(texts), batch_size), desc="Processing batches"):
    batch_texts = texts[i:i+batch_size]
    batch_translations = translate_batch(model, eng_tok, indic_tok, batch_texts, device='cuda')
    all_translations.extend(batch_translations)

# Build the frame only after every batch has finished: if the loop is interrupted,
# `hindi_output` is left untouched instead of being replaced by an empty frame that
# a later concat would silently accept.
hindi_output = pd.DataFrame({'ID': ids, 'Translation': all_translations})

Total samples: 23085
Processing in batches of 32


Processing batches:   1%|▌                                                                | 6/722 [00:03<07:32,  1.58it/s]


KeyboardInterrupt: 

### ENG 2 BENGALI

In [45]:
@dataclass
class TransformerConfig:
    """Hyper-parameter defaults declared in the English-to-Bengali section.

    This re-declares ``TransformerConfig`` and therefore replaces the definition
    from the Hindi section when both cells are run; the two differ in ``N_HEADS``
    and ``EVAL_STEPS``.

    NOTE(review): no cell visible in this notebook instantiates this class; the
    model is built from ``checkpoint['model_config']``. Presumably the class has
    to exist so ``torch.load`` can unpickle ``checkpoint['config']`` -- TODO confirm.
    """

    # --- vocabulary ---
    SRC_VOCAB_SIZE: int = 30000     # size of the source-language vocabulary
    TGT_VOCAB_SIZE: int = 30000     # size of the target-language vocabulary

    # --- sequence lengths ---
    SRC_MAX_LENGTH: int = 256       # longest source sequence
    TGT_MAX_LENGTH: int = 256       # longest target sequence

    # --- architecture ---
    D_MODEL: int = 128              # embedding width
    N_HEADS: int = 4                # attention heads
    N_LAYERS: int = 6               # transformer blocks
    D_FF: int = 4 * 128             # feed-forward width (4x the embedding width)
    MAX_SEQ_LEN: int = 256
    DROPOUT: float = 0.1

    # --- training loop ---
    BATCH_SIZE: int = 32
    EVAL_STEPS: int = 500
    EPOCHS: int = 30

In [46]:
bengali_checkpoint_path = os.path.join("checkpoints", "eng_bengali", "exp4.1-eng-bengali-transformer-built-in")
# SECURITY: weights_only=False uses the full pickle loader, which can execute arbitrary code;
# only load checkpoints from a trusted source.
# map_location="cpu" keeps the checkpoint contents off the GPU; load_state_dict later
# copies the weights onto the model's own device.
checkpoint = torch.load(
    os.path.join(bengali_checkpoint_path, "tx_epoch_13_step_25000.pt"),
    map_location="cpu",
    weights_only=False,
)

In [47]:
# Rebuild the English tokenizer from the state saved inside the Bengali checkpoint:
# construct it with the stored size limits, then restore the vocabulary tables.
eng_state = checkpoint['eng_tokenizer']
eng_tok = EnglishTokenizer(eng_state['max_vocab_size'], eng_state['max_length'])
eng_tok.word2idx, eng_tok.idx2word, eng_tok.vocab_size = (
    eng_state['word2idx'],
    eng_state['idx2word'],
    eng_state['vocab_size'],
)

In [48]:
# Rebuild the Indic (target-language) tokenizer from the state saved inside the Bengali
# checkpoint: construct it with the stored size limits, then restore the vocabulary tables.
indic_state = checkpoint['indic_tokenizer']
indic_tok = IndicTokenizer(indic_state['max_vocab_size'], indic_state['max_length'])
indic_tok.word2idx, indic_tok.idx2word, indic_tok.vocab_size = (
    indic_state['word2idx'],
    indic_state['idx2word'],
    indic_state['vocab_size'],
)

In [49]:
# Rebuild the network from the hyper-parameters stored with the checkpoint.
model_config = checkpoint['model_config']
model = torchtx(**model_config)
model.to('cuda')
# A freshly built module starts in training mode (assumes torchtx is a torch.nn.Module);
# switch to eval so dropout is disabled for inference.
model.eval()
# Kept as the last expression so the cell still displays the key-matching report.
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [50]:
# Smoke test: translate a single sentence with the Bengali model.
translation = translate_batch(
    model=model,
    eng_tokenizer=eng_tok,
    indic_tokenizer=indic_tok,
    english_texts=["Hi how are you?"],
    device='cuda',
)

In [51]:
# Display the smoke-test result.
# NOTE(review): translate_batch passes a temperature to the generator, so the output is
# presumably sampled and will differ between runs unless a seed is fixed -- TODO confirm.
translation

['হাই আপনি কিভাবে কত করছেন? মি? মি? "?" তুমি কি তুমি কি তুমি টা পারো']

In [52]:
# Translate the whole English-Bengali split in fixed-size batches.
batch_size = 32  # sentences per translate_batch call

# ids and English source sentences, in the order they appear in the data file
bengali_split = test_data['English-Bengali'][key]
ids = list(bengali_split.keys())
texts = [entry['source'] for entry in bengali_split.values()]

print(f"Total samples: {len(texts)}")
print(f"Processing in batches of {batch_size}")

all_translations = []
for i in tqdm(range(0, len(texts), batch_size), desc="Processing batches"):
    batch_texts = texts[i:i+batch_size]
    batch_translations = translate_batch(
        model, eng_tok, indic_tok, batch_texts, device='cuda'
    )
    all_translations.extend(batch_translations)

# Build the frame only after every batch has finished: if the loop is interrupted,
# `bengali_output` is left untouched instead of being replaced by an empty frame that
# a later concat would silently accept.
bengali_output = pd.DataFrame({'ID': ids, 'Translation': all_translations})

Total samples: 19672
Processing in batches of 32


Processing batches: 100%|███████████████████████████████████████████████████████████████| 615/615 [04:26<00:00,  2.30it/s]


## Final output for submission

In [53]:
import csv

In [54]:
# Stack the Bengali rows on top of the Hindi rows and renumber the index from 0.
output = pd.concat([bengali_output, hindi_output], ignore_index=True)

In [55]:
# output['Translation'] = output['Translation'].str.replace(",", "")

In [56]:
# Write the submission file: tab-separated, header row, no index column,
# every field quoted, embedded quotes doubled, "\n" line endings.
output.to_csv(
    "answers/test/answer4dot1genv2.csv",
    index=False,
    header=True,
    sep="\t",
    lineterminator="\n",
    quoting=csv.QUOTE_ALL,
    doublequote=True,
)

In [36]:
# output.to_csv("answers/val/answer1.csv", index=False)

In [93]:
# answer = "answers/val/answer1.csv"
# with open(answer, "w", newline="", encoding="utf-8") as f:
#     writer = csv.writer(f, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
#     writer.writerow(["ID", "Translation"])  # header
#     for i in range(output.shape[0]):
#         writer.writerow([output["ID"][i], output["Translation"][i]])