### Install git repo (and restart instance when prompted)

Reference Links:

https://github.com/VarunGumma/IndicTransTokenizer

https://github.com/AI4Bharat/IndicTrans2


In [None]:
# Shell escape: clone the IndicTransTokenizer sources into the current working
# directory (creates an `IndicTransTokenizer/` folder there).
!git clone https://github.com/VarunGumma/IndicTransTokenizer

In [12]:
# Enter the cloned repository so that the `./` path used by the install cell
# refers to it.
# NOTE(review): the path is relative, and the recorded output below shows a
# nested IndicTransTokenizer/IndicTransTokenizer directory — apparently this
# cell was run more than once. Confirm the working directory before installing.
%cd IndicTransTokenizer

/content/IndicTransTokenizer/IndicTransTokenizer


In [None]:
pip install --editable ./

### Use Hugging Face model checkpoints

In [13]:
import torch
from transformers import AutoModelForSeq2SeqLM
from IndicTransTokenizer import IndicProcessor, IndicTransTokenizer

# Shared objects: the cells further down reuse `tokenizer`, `ip` and `model`.
tokenizer = IndicTransTokenizer(direction="en-indic")
ip = IndicProcessor(inference=True)
# trust_remote_code=True allows transformers to run the modelling code that is
# shipped inside the checkpoint repository.
model = AutoModelForSeq2SeqLM.from_pretrained(
    "ai4bharat/indictrans2-en-indic-dist-200M",
    trust_remote_code=True,
)

# Three English inputs of increasing length; the last one mixes in a phone
# number, an e-mail address and a date.
sentences = [
    "This is a test sentence.",
    "This is another longer different test sentence.",
    "Please send an SMS to 9876543210 and an email on newemail123@xyz.com by 15th October, 2023.",
]

# English -> Hindi: preprocess the raw text, then tokenize it as source-side
# input and get PyTorch tensors back.
batch = tokenizer(
    ip.preprocess_batch(sentences, src_lang="eng_Latn", tgt_lang="hin_Deva"),
    src=True,
    return_tensors="pt",
)

# Beam search settings for generation.
generation_kwargs = dict(num_beams=5, num_return_sequences=1, max_length=256)
with torch.inference_mode():
    generated = model.generate(**batch, **generation_kwargs)

# Decode the generated ids as target-side text, then post-process for Hindi.
outputs = ip.postprocess_batch(
    tokenizer.batch_decode(generated, src=False),
    lang="hin_Deva",
)
print(outputs)

['यह एक परीक्षण वाक्य है।', 'यह एक और लंबा अलग परीक्षण वाक्य है।', 'कृपया 9876543210 पर एक एस. एम. एस. भेजें और 15 अक्टूबर, 2023 तक newemail123@xyz.com पर एक ईमेल भेजें।']


### Testing Transliterations

In [43]:
from copy import deepcopy

# A single-sentence batch.
sent = ["my name is harsh vardhan and i am a teacher"]  # harsh, harsha

# English -> Kannada: preprocess, then tokenize as source-side input.
batch = tokenizer(
    ip.preprocess_batch(sent, src_lang="eng_Latn", tgt_lang="kan_Knda"),
    src=True,
    return_tensors="pt",
)

with torch.inference_mode():
    output = model.generate(
        **batch,
        num_beams=4,
        num_return_sequences=1,
        max_length=256,
    )

# Post-process the same generated ids once per script (Kannada, Devanagari,
# Telugu). Each language gets its own copy of the generated tensor.
langs = ["kan_Knda", "hin_Deva", "tel_Telu"]
outputs = [deepcopy(output) for _ in langs]

# `out` is left bound after the loop; the following cell displays it.
for out, lang in zip(outputs, langs):
    decoded = tokenizer.batch_decode(out, src=False)
    first_sentence = decoded[0]
    # No placeholders to restore here, hence the empty entity map.
    print(ip._postprocess(first_sentence, placeholder_entity_map={}, lang=lang))


ನನ್ನ ಹೆಸರು ಕಠೋರ ವರ್ಧನ್ ಮತ್ತು ನಾನು ಶಿಕ್ಷಕನಾಗಿದ್ದೇನೆ.
नन्न हॆसरु कठोर वर्धन् मत्तु नानु शिक्षकनागिद्देनॆ.
నన్న హెసరు కఠోర వర్ధన్ మత్తు నాను శిక్షకనాగిద్దేనె.


In [47]:
# `out` is the loop variable left over from the transliteration loop above,
# i.e. the last copy of the generated token-id tensor.
out # tensor output

tensor([[    2,  1586,  5004,  5815, 34620,   117,    41,   358,  3009,  6842,
          2508,     4,     2]])

### Trying out a CMS question...

In [54]:
# A physics word problem, given to the model as one multi-line string.
sent = ["""The loop ABCD is moving with velocity v towards right.
The magnetic field is 4T. The loop is connected to a resistance of 8Ω.
If steady current of 2A flows in the loop, then value of v,
if loop has resistance of 4Ω, is (given, AB=30cm,AD=30cm)"""]

# Alternative input kept from an earlier experiment:
# sent = ["my name is harsh nisar and i am not a terrorist"]

# English -> Hindi: preprocess, then tokenize as source-side input.
batch = tokenizer(
    ip.preprocess_batch(sent, src_lang="eng_Latn", tgt_lang="hin_Deva"),
    src=True,
    return_tensors="pt",
)

with torch.inference_mode():
    generated = model.generate(
        **batch,
        num_beams=4,
        num_return_sequences=1,
        max_length=256,
    )

# Decode as target-side text and post-process for Hindi; `output` ends up as
# the list of final translated strings.
output = ip.postprocess_batch(
    tokenizer.batch_decode(generated, src=False),
    lang="hin_Deva",
)

# Print the first translation one sentence per line.
# A line break is inserted after every danda ("।") that is already in the text,
# rather than splitting on it and appending a new one. That way:
#   * a final segment without a danda is shown exactly as the model produced it,
#   * text that does end with a danda no longer yields a stray lone "।" line,
#   * the space that follows each danda is stripped from the next line.
# we can use placeholder entity map option to replace "टी" with "T"
for line in output[0].replace("।", "।\n").splitlines():
    line = line.strip()
    if line:
        print(line)


लूप ए. बी. सी. डी. वेग v के साथ दाईं ओर आगे बढ़ रहा है।
 चुंबकीय क्षेत्र 4टी है।
 लूप 8 ओहम के प्रतिरोध से जुड़ा हुआ है।
 यदि लूप में 2ए की स्थिर धारा बहती है, तो v का मान, यदि लूप में 4 ओहम का प्रतिरोध है, (दिया गया है, एबी = 30 सेमी, एडी = 30 सेमी)।


In [None]:
# Google Translate gives better output here, but this model has its own advantages, e.g. translating multiple sentences at once.