# **Installing required dependencies**

In [1]:
!pip install nltk transformers sentencepiece -q

In [2]:
import nltk
from nltk import word_tokenize, pos_tag
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline



# **Model Instantiation**

In [3]:
# Prefer the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [4]:
# Checkpoint id that the model and tokenizer are loaded from.
checkpoint = 'facebook/nllb-200-3.3B'
# Alternative checkpoint ids (plain ASCII quotes and hyphens, so they can be uncommented as-is):
# checkpoint = 'facebook/nllb-200-1.3B'
# checkpoint = 'facebook/nllb-200-distilled-1.3B'

In [5]:
# Load the seq2seq model weights and the matching tokenizer for `checkpoint`.
# The files are fetched on first use (three weight shards, roughly 17.6 GB in total
# according to the progress output below).
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/808 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00003.bin:   0%|          | 0.00/6.93G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00003.bin:   0%|          | 0.00/8.55G [00:00<?, ?B/s]

Downloading (…)l-00003-of-00003.bin:   0%|          | 0.00/2.10G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

In [6]:
# Language codes handed to the translation pipeline as src_lang / tgt_lang:
# English (Latin script) -> Hindi (Devanagari script).
source_lang = "eng_Latn"
target_lang = "hin_Deva"

# **Setting up and configuring *translation* pipeline**

In [7]:
# Build the translation pipeline on the device chosen in the `device` cell
# (GPU when available, otherwise CPU) instead of hard-coding GPU 0, so the
# notebook also runs on CPU-only machines.
# max_length caps the length of each generated translation.
translator = pipeline(
    'translation',
    model=model,
    tokenizer=tokenizer,
    src_lang=source_lang,
    tgt_lang=target_lang,
    device=device,
    max_length=400,
)

# **Helper functions**

In [8]:
# extract the nouns of an English sentence
def return_nouns(text : str) -> list:
    """Return the nouns found in ``text`` as ``(word, tag)`` pairs.

    The text is tokenized and part-of-speech tagged; only tokens tagged
    NN, NNS, NNP or NNPS are kept, in their original order.
    """
    noun_tags = ("NN", "NNS", "NNP", "NNPS")
    tagged_tokens = nltk.pos_tag(word_tokenize(text))
    return [pair for pair in tagged_tokens if pair[1] in noun_tags]

In [9]:
#map each English noun to its Hindi translation
def key_value_pair(nouns: list) -> dict:
    """Translate every noun and return an ``{english_word: hindi_word}`` mapping.

    Args:
        nouns: Sequence of ``(word, tag)`` pairs; only ``word`` is used.

    Returns:
        Dict keyed by the English word. Words whose translation comes back
        empty are left out, because an empty string would later match every
        token of the Hindi sentence.
    """
    translations = dict()
    for noun in nouns:
        word = noun[0]
        if word in translations:
            # Same noun seen again: reuse the earlier result instead of
            # running the model a second time.
            continue
        # A trailing full stop is appended so the model sees a complete "sentence".
        value = translator(word + ".")[0]['translation_text']
        # endswith() is safe on an empty string, unlike value[-1].
        if value.endswith((".", "।")):
            value = value[:-1]
        # Whitespace left around the word would prevent it from matching a
        # whitespace-split token later on.
        value = value.strip()
        if value:
            translations[word] = value

    return translations

In [10]:
#function for converting hindi text to hinglish
def hindi_to_hinglish(en_text: str, hi_text: str) -> str:
    """Put the English nouns back into the Hindi translation of a sentence.

    Args:
        en_text: Original English sentence; its nouns are extracted with
            ``return_nouns`` and translated with ``key_value_pair``.
        hi_text: Hindi translation of ``en_text``.

    Returns:
        ``hi_text`` where each whitespace-separated token containing a noun's
        Hindi translation is replaced by the English noun. Punctuation at the
        end of a replaced token is kept. Tokens are joined with single spaces.
    """
    # Sentence punctuation that may be glued to the end of a token.
    trailing_marks = ".,;:!?।"

    #find nouns from english text and pair each with its hindi translation
    noun_translations = key_value_pair(return_nouns(en_text))

    tokens = hi_text.split()

    for english_word, hindi_word in noun_translations.items():
        if not hindi_word:
            # "" is a substring of every token and would replace the whole sentence.
            continue
        for index, token in enumerate(tokens):
            # Substring match so inflected forms of the noun are caught too.
            if hindi_word in token:
                core = token.rstrip(trailing_marks)
                tokens[index] = english_word + token[len(core):]

    return " ".join(tokens)

# **Main function**

In [11]:
#entry point: English sentence in, Hinglish sentence out
def english_to_hinglish(en_text: str) -> str:
    """Translate ``en_text`` to Hindi, then restore its nouns in English.

    The sentence is run through the translation pipeline and the resulting
    Hindi text is handed, together with the original English, to
    ``hindi_to_hinglish``.
    """
    translation = translator(en_text)
    return hindi_to_hinglish(en_text, translation[0]['translation_text'])

# **Test Cases**

In [12]:
# The nouns "feedback" and "comment" stay in English; the rest of the sentence is Hindi.
english_to_hinglish("Definitely share your feedback in the comment section.")

'comment अनुभाग में अपनी feedback अवश्य साझा करें।'

In [13]:
# Longer sentence with a comma: "video" and "products" stay in English.
english_to_hinglish("So even if it's a big video, I will clearly mention all the products.")

'तो भले ही यह एक बड़ा video हो, मैं स्पष्ट रूप से सभी products का उल्लेख करूंगा।'

In [14]:
# Short sentence with a single noun: "bag" stays in English.
english_to_hinglish("I was waiting for my bag.")

'मैं अपने bag की प्रतीक्षा कर रहा था।'