<a href="https://colab.research.google.com/github/varun29-git/translation_model/blob/main/inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Requirements
import os
import sys
import torch
from tokenizers import Tokenizer
from google.colab import drive
from pathlib import Path


# Mount Google Drive; the trained checkpoint loaded later lives under this
# mount point.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)


In [2]:
# Clone Repo
REPO_NAME = "translation_model"
if not os.path.exists(f'/content/{REPO_NAME}'):
    !git clone https://github.com/varun29-git/{REPO_NAME}
sys.path.append(f'/content/{REPO_NAME}')

In [3]:
from model import build_transformer
from config import get_config
from train import greedy_decode, causal_mask

In [4]:
# Load the project configuration and choose the compute device:
# the GPU when CUDA is available, otherwise the CPU.
config = get_config()
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

In [5]:
# Load the source and target tokenizers from the cloned repository.
# Per the author, these are the same tokenizer files that were used in training.
repo_root = f"/content/{REPO_NAME}"
tokenizer_src = Tokenizer.from_file(f"{repo_root}/tokenizer_en.json")
tokenizer_tgt = Tokenizer.from_file(f"{repo_root}/tokenizer_hi.json")

In [6]:
# Build the Transformer: vocabulary sizes come from the loaded tokenizers,
# every other hyper-parameter comes from the project config.
model_kwargs = dict(
    src_vocab_size=tokenizer_src.get_vocab_size(),
    tar_vocab_size=tokenizer_tgt.get_vocab_size(),
    src_seq_len=config["seq_len"],
    tar_seq_len=config["seq_len"],
    d_model=config["d_model"],
    N=config["N"],
    h=config["h"],
    d_ff=config["d_ff"],
    dropout=config["dropout"],
)
model = build_transformer(**model_kwargs)

# Move the freshly built model onto the selected device.
model = model.to(device)


In [7]:
weights_path = "/content/drive/MyDrive/Hindi_Translator_Project/t_model_24.pt"

# Fail fast when the checkpoint is missing. Continuing would leave the model
# with its random initial weights, and every translation produced by the
# later cells would be meaningless while still looking like valid output.
if not os.path.exists(weights_path):
    raise FileNotFoundError(
        f"Weights not found at {weights_path}. "
        "Make sure Google Drive is mounted and the checkpoint path is correct."
    )

# Weights import
print(f"Loading weights from {weights_path}...")
# NOTE(review): torch.load may unpickle arbitrary objects depending on the
# PyTorch version / weights_only setting; only load checkpoints you trust.
checkpoint = torch.load(weights_path, map_location=device)
# The checkpoint is a dict; the model parameters are stored under this key.
model.load_state_dict(checkpoint['model_state_dict'])
# Switch to inference mode.
model.eval()
print("Weights loaded")

Loading weights from /content/drive/MyDrive/Hindi_Translator_Project/t_model_24.pt...
Weights loaded


In [8]:
# Convert raw text into padded token IDs
def encode_sentence(sentence: str, tokenizer, seq_len: int) -> torch.Tensor:
    """Tokenize ``sentence`` and return a fixed-length batch of token IDs.

    The resulting sequence is laid out as
    ``[SOS] + tokens + [EOS] + [PAD] * k`` where the tokens are truncated to
    at most ``seq_len - 2`` entries so that the total length is exactly
    ``seq_len``.

    Args:
        sentence: Raw text to encode.
        tokenizer: Object exposing ``encode(text).ids`` and
            ``token_to_id(token)``; it must know the special tokens
            ``[SOS]``, ``[EOS]`` and ``[PAD]``.
        seq_len: Total length of the returned sequence (must be >= 2 to leave
            room for ``[SOS]`` and ``[EOS]``).

    Returns:
        An int64 tensor of shape ``(1, seq_len)``.

    Raises:
        ValueError: If ``seq_len`` is smaller than 2, or if the tokenizer does
            not define one of the required special tokens.
    """
    # With seq_len < 2 the slice below would use a negative bound and the
    # result would end up longer than seq_len, so reject it explicitly.
    if seq_len < 2:
        raise ValueError(
            f"seq_len must be at least 2 to hold [SOS] and [EOS], got {seq_len}"
        )

    # token_to_id yields None for unknown tokens; catch that here instead of
    # letting torch.tensor fail later with an opaque error.
    special_ids = {}
    for name in ("[SOS]", "[EOS]", "[PAD]"):
        token_id = tokenizer.token_to_id(name)
        if token_id is None:
            raise ValueError(f"Tokenizer has no ID for special token {name}")
        special_ids[name] = token_id

    # Leave two slots free for the start and end markers.
    body = tokenizer.encode(sentence).ids[:seq_len - 2]
    tokens = [special_ids["[SOS]"]] + body + [special_ids["[EOS]"]]

    padding = [special_ids["[PAD]"]] * (seq_len - len(tokens))

    # unsqueeze(0) adds the leading batch dimension.
    return torch.tensor(tokens + padding, dtype=torch.long).unsqueeze(0)


In [9]:
@torch.no_grad()
def translate(sentence):
    """Translate ``sentence`` with the notebook's global model and tokenizers.

    The sentence is encoded with the source tokenizer, decoded with
    ``greedy_decode`` (imported from the project's ``train`` module) and the
    resulting token IDs are turned back into text with the target tokenizer.

    Args:
        sentence: Raw source-language text.

    Returns:
        The decoded target text with any literal "[SOS]" / "[EOS]" markers
        removed and surrounding whitespace stripped.
    """
    model.eval()

    seq_len = config["seq_len"]

    # Source sentence -> token IDs with a leading batch dimension, on the model's device.
    src = encode_sentence(sentence, tokenizer_src, seq_len).to(device)

    # Padding mask: True where the position holds a real (non-[PAD]) token.
    # Two singleton dimensions are inserted after the batch dimension.
    pad_id = tokenizer_src.token_to_id("[PAD]")
    src_mask = (src != pad_id).unsqueeze(1).unsqueeze(2)

    # Greedy decoding.
    # NOTE(review): presumably returns a 1-D tensor of target token IDs --
    # confirm against train.greedy_decode.
    output_tokens = greedy_decode(
        model,
        src,
        src_mask,
        max_len=seq_len,
        device=device,
        tokenizer_src=tokenizer_src,
        tokenizer_tar=tokenizer_tgt,
    )

    # Token IDs -> text, then drop any special markers left in the string.
    text = tokenizer_tgt.decode(output_tokens.tolist())
    for marker in ("[SOS]", "[EOS]"):
        text = text.replace(marker, "")
    return text.strip()

In [10]:
# Everyday conversational sentences are where the model translates best.
for sentence in ("How are you doing today?", "Do you know who I am?"):
    print(translate(sentence))


आज तुम क्या कर रहे हो ?
क्या तुम जानते हो कि मैं कौन हूँ ?


In [11]:
# Proper nouns are translated less reliably because they are
# under-represented in the training data.
for sentence in ("My name is Varun", "I am a writer"):
    print(translate(sentence))

मेरा नाम है
मैं 5 करता हूँ


In [12]:
# An out-of-distribution input, to show how the model behaves on text unlike
# its training data.
ood_translation = translate("Namaste")
print(ood_translation)

. kgm
