# Question 1

## IndicTrans2 

In [1]:
# for reading a file
def read_file(path):
    """Read a UTF-8 text file and return its lines as a list of strings.

    The content is split on line feeds only. The single empty entry that a
    terminating line feed would otherwise produce is dropped, so a file whose
    last line ends with a newline does not yield a spurious empty sentence.
    Blank lines inside the file are preserved.

    Args:
        path: Path of the file to read.

    Returns:
        list[str]: One entry per line, without the line-feed characters.
        An empty file yields an empty list.
    """
    with open(path, 'r', encoding='utf-8') as file:
        file_contents = file.read()
    lines = file_contents.split('\n')
    # str.split always returns at least one element, so lines[-1] is safe.
    if lines[-1] == '':
        lines.pop()
    return lines

# for writing a data to a file
def write_to_file(file_path, string_list):
    """Write each string of `string_list` to `file_path`, one per line.

    The file is opened in write mode with UTF-8 encoding, so any existing
    content is replaced. Every entry is followed by a newline character.
    """
    with open(file_path, mode='w', encoding='utf-8') as sink:
        # Entries are emitted lazily, each terminated by a newline.
        sink.writelines(entry + '\n' for entry in string_list)

In [2]:
# Load the three corpora used throughout the notebook: one English file, one in
# the author's mother tongue and one in another Indic language.
rand_hin, rand_eng, rand_guj = map(
    read_file,
    (
        '/kaggle/input/data-for-indictrans/random_hin_1000.txt',
        '/kaggle/input/data-for-indictrans/random_eng_1000.txt',
        '/kaggle/input/data-for-indictrans/random_guj_1000.txt',
    ),
)

### Setup

In [None]:
%%capture
# Clone the IndicTrans2 repository into the current working directory.
# %%capture silences the command output of this cell.
!git clone https://github.com/AI4Bharat/IndicTrans2.git

In [None]:
%%capture
# Switch into the huggingface_interface folder of the IndicTrans2 checkout.
# NOTE(review): this is an absolute "/content" path, while the data and output
# paths used elsewhere in this notebook live under "/kaggle" — confirm the clone
# really ends up here; %%capture would hide any error message from %cd.
%cd /content/IndicTrans2/huggingface_interface

In [None]:
%%capture
!python3 -m pip install nltk sacremoses pandas regex mock transformers>=4.33.2 mosestokenizer
!python3 -c "import nltk; nltk.download('punkt')"
!python3 -m pip install bitsandbytes scipy accelerate datasets
!python3 -m pip install sentencepiece

!git clone https://github.com/VarunGumma/IndicTransTokenizer
%cd IndicTransTokenizer
!python3 -m pip install --editable ./
%cd ..

In [3]:
import torch
from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig
from IndicTransTokenizer import IndicProcessor, IndicTransTokenizer

# Number of sentences handed to the model per translation step.
BATCH_SIZE = 4

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Quantization mode passed to initialize_model_and_tokenizer
# ("4-bit", "8-bit", or None for no quantization).
quantization = None

In [4]:
def initialize_model_and_tokenizer(ckpt_dir, direction, quantization):
    """Load a seq2seq checkpoint together with its IndicTrans tokenizer.

    Args:
        ckpt_dir: Model identifier or path handed to
            ``AutoModelForSeq2SeqLM.from_pretrained``.
        direction: Translation direction passed to ``IndicTransTokenizer``
            (this notebook uses "en-indic", "indic-en" and "indic-indic").
        quantization: "4-bit" or "8-bit" to load the model with a
            ``BitsAndBytesConfig``; any other value (e.g. ``None`` or "")
            loads the model without quantization.

    Returns:
        tuple: ``(tokenizer, model)``, with the model switched to eval mode.
    """
    if quantization == "4-bit":
        qconfig = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization == "8-bit":
        # NOTE(review): the two bnb_8bit_* keyword names below do not appear to be
        # documented BitsAndBytesConfig options — confirm they have any effect.
        qconfig = BitsAndBytesConfig(
            load_in_8bit=True,
            bnb_8bit_use_double_quant=True,
            bnb_8bit_compute_dtype=torch.bfloat16,
        )
    else:
        qconfig = None

    tokenizer = IndicTransTokenizer(direction=direction)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        ckpt_dir,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        quantization_config=qconfig,
    )

    # Only an unquantized model is moved to DEVICE explicitly, and converted to
    # half precision when running on CUDA.
    if qconfig is None:
        model = model.to(DEVICE)
        if DEVICE == "cuda":
            model.half()

    model.eval()

    return tokenizer, model


def batch_translate(input_sentences, src_lang, tgt_lang, model, tokenizer, ip):
    """Translate `input_sentences` in chunks of BATCH_SIZE.

    Each chunk is preprocessed with `ip`, tokenized, run through
    `model.generate` (beam search with 5 beams, max_length 256), decoded and
    postprocessed with `ip`. The per-chunk results are concatenated in input
    order.

    Args:
        input_sentences: Sequence of source sentences.
        src_lang: Source language code forwarded to `ip.preprocess_batch`.
        tgt_lang: Target language code forwarded to `ip.preprocess_batch` and
            `ip.postprocess_batch`.
        model: Object exposing `generate(...)`.
        tokenizer: Callable tokenizer that also exposes `batch_decode(...)`.
        ip: Processor exposing `preprocess_batch` and `postprocess_batch`.

    Returns:
        list: The translations of all chunks, in order.
    """
    results = []
    for start in range(0, len(input_sentences), BATCH_SIZE):
        stop = start + BATCH_SIZE

        # Preprocessing step of the processor for the current chunk.
        chunk = ip.preprocess_batch(
            input_sentences[start:stop], src_lang=src_lang, tgt_lang=tgt_lang
        )

        # Encode the chunk as padded tensors and move them to the target device.
        encoded = tokenizer(
            chunk,
            src=True,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        )
        encoded = encoded.to(DEVICE)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            output_ids = model.generate(
                **encoded,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=5,
                num_return_sequences=1,
            )

        # Bring the generated ids back to the CPU and turn them into text.
        id_lists = output_ids.detach().cpu().tolist()
        decoded = tokenizer.batch_decode(id_lists, src=False)

        # Postprocessing step of the processor, then collect the results.
        results.extend(ip.postprocess_batch(decoded, lang=tgt_lang))

        # Drop the per-chunk tensors before asking CUDA to release cached memory.
        del encoded, output_ids
        torch.cuda.empty_cache()

    return results

### English to Hindi

In [None]:
# Checkpoint for English -> Indic translation.
en_indic_ckpt_dir = "ai4bharat/indictrans2-en-indic-1B"  # ai4bharat/indictrans2-en-indic-dist-200M
en_indic_tokenizer, en_indic_model = initialize_model_and_tokenizer(
    ckpt_dir=en_indic_ckpt_dir,
    direction="en-indic",
    quantization=quantization,
)
ip = IndicProcessor(inference=True)

# Translate the English corpus into Hindi and store the result.
src_lang = "eng_Latn"
tgt_lang = "hin_Deva"
hi_translations = batch_translate(
    input_sentences=rand_eng,
    src_lang=src_lang,
    tgt_lang=tgt_lang,
    model=en_indic_model,
    tokenizer=en_indic_tokenizer,
    ip=ip,
)
write_to_file(
    file_path='/kaggle/working/indicTrans_translated_eng_to_hin.txt',
    string_list=hi_translations,
)

# Drop the references to the tokenizer and model so their memory can be reclaimed.
del en_indic_tokenizer
del en_indic_model

### Hindi to English

In [None]:
# Checkpoint for Indic -> English translation.
indic_en_ckpt_dir = "ai4bharat/indictrans2-indic-en-1B"  # ai4bharat/indictrans2-indic-en-dist-200M
# Use the shared `quantization` setting (as the other translation cells do)
# rather than a hard-coded "", so changing the setting affects every direction.
indic_en_tokenizer, indic_en_model = initialize_model_and_tokenizer(indic_en_ckpt_dir, "indic-en", quantization)

ip = IndicProcessor(inference=True)

# Translate the Hindi corpus into English and store the result.
src_lang, tgt_lang = "hin_Deva", "eng_Latn"
en_translations = batch_translate(rand_hin, src_lang, tgt_lang, indic_en_model, indic_en_tokenizer, ip)

write_to_file('/kaggle/working/indicTrans_translated_hin_to_eng.txt', en_translations)

# flush the models to free the GPU memory
del indic_en_tokenizer, indic_en_model

### Hindi to Gujarati

In [None]:
# Checkpoint for Indic -> Indic translation.
indic_indic_ckpt_dir = "ai4bharat/indictrans2-indic-indic-1B"  # ai4bharat/indictrans2-indic-indic-dist-320M
indic_indic_tokenizer, indic_indic_model = initialize_model_and_tokenizer(
    ckpt_dir=indic_indic_ckpt_dir,
    direction="indic-indic",
    quantization=quantization,
)

ip = IndicProcessor(inference=True)

# Translate the Hindi corpus into Gujarati and store the result.
src_lang = "hin_Deva"
tgt_lang = "guj_Gujr"
guj_translations = batch_translate(
    input_sentences=rand_hin,
    src_lang=src_lang,
    tgt_lang=tgt_lang,
    model=indic_indic_model,
    tokenizer=indic_indic_tokenizer,
    ip=ip,
)

write_to_file(
    file_path='/kaggle/working/indicTrans_translated_hin_to_guj.txt',
    string_list=guj_translations,
)

# Drop the references to the tokenizer and model so their memory can be reclaimed.
del indic_indic_tokenizer
del indic_indic_model

### Gujarati to Hindi

In [5]:
# Checkpoint for Indic -> Indic translation.
indic_indic_ckpt_dir = "ai4bharat/indictrans2-indic-indic-1B"  # ai4bharat/indictrans2-indic-indic-dist-320M
indic_indic_tokenizer, indic_indic_model = initialize_model_and_tokenizer(
    ckpt_dir=indic_indic_ckpt_dir,
    direction="indic-indic",
    quantization=quantization,
)

ip = IndicProcessor(inference=True)

# Translate the Gujarati corpus into Hindi and store the result.
src_lang = "guj_Gujr"
tgt_lang = "hin_Deva"
hin_translations = batch_translate(
    input_sentences=rand_guj,
    src_lang=src_lang,
    tgt_lang=tgt_lang,
    model=indic_indic_model,
    tokenizer=indic_indic_tokenizer,
    ip=ip,
)

write_to_file(
    file_path='/kaggle/working/indicTrans_translated_guj_to_hin.txt',
    string_list=hin_translations,
)

# Drop the references to the tokenizer and model so their memory can be reclaimed.
del indic_indic_tokenizer
del indic_indic_model

config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

configuration_indictrans.py:   0%|          | 0.00/14.1k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-indic-indic-1B:
- configuration_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_indictrans.py:   0%|          | 0.00/61.3k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-indic-indic-1B:
- modeling_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin:   0%|          | 0.00/4.83G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

## ChatGPT

In [6]:
# Keep only the first 50 sentences of each corpus.
rand_50_eng, rand_50_hin, rand_50_guj = (
    sentences[:50] for sentences in (rand_eng, rand_hin, rand_guj)
)

In [8]:
# List the English subset with 1-based numbering.
for position, sentence in enumerate(rand_50_eng, start=1):
    print(position, ". ", sentence)

1 .  He will release a document highlighting the achievements of the Government.
2 .  I am particularly grateful to Prime Minister Abe for his encouragement and support at a personal level.
3 .  In the next two months, we will be busy in the hurly-burly of the general elections.
4 .  He said the Union Government has got 5 crore toilets built in three years.
5 .  The unique festival Chhath Pooja is deeply linked with nature & worshiping nature.
6 .  This Agreement shall enter into force on the first day of the second month after the Contracting Parties notify each other in through diplomatic channels, that the necessary national legal requirements for entry into force of this Agreement have been fulfilled.
7 .  I greatly welcome the growing ties between our parliamentarians, and between our states and Japan’s prefectures.
8 .  India is now ripe, for a sweet revolution.
9 .  The MoU covers the following areas of cooperation:-
10 .  The Prime Minister added that these facilities will also

In [9]:
# List the Hindi subset with 1-based numbering.
for position, sentence in enumerate(rand_50_hin, start=1):
    print(position, ". ", sentence)

1 .  प्रधानमंत्री सरकार की उपलब्धियों को उजागर करने वाला एक दस्‍तावेज जारी करेंगे।
2 .  मैं प्रधानमंत्री आबे के प्रति उनके प्रोत्‍साहन और व्‍यक्तिगत स्‍तर पर समर्थन के लिए खास तौर पर आभारी हूं।
3 .  अगले दो महीने, हम सभी चुनाव की गहमा-गहमी में व्यस्त होगें।
4 .  उन्होंने कहा कि केंद्र सरकार ने पिछले 3 वर्षों में पांच करोड़ शौचालयों का निर्माण किया है।
5 .  छठ-पूजा का अनुपम-पर्व प्रकृति से और प्रकृति की उपासना से पूरी तरह जुड़ा हुआ है।
6 .  सीमा शुल्‍क मामलों में सहयोग और परस्‍पर सहायता पर भारत गणराज्‍य की सरकार और अर्मेनिया गणराज्‍य की सरकार के बीच करार पर भारत सरकार की ओर से हस्‍ताक्षर किए जाएंगे और यह क‍ि इस करार के लागू होने के लिए आवश्‍यक राष्‍ट्रीय कानूनी अपेक्षाओं को पूरा कर लिया गया है।
7 .  मैं अपने सांसदों, अपने राज्यों और जापान के प्रान्तों के बीच बढ़ते रिश्तों का हृदय से स्वागत करता हूँ।
8 .  भारत अब एक sweet revolution की ओर बढ़ रहा है।
9 .  समझौता-ज्ञापन के दायरे में निम्‍नलिखित सहयोग क्षेत्र है:-
10 .  प्रधानमंत्री ने यह भी कहा कि ये सुविधाएं पर्यटकों को भी आकर्षित करेंगी।

In [10]:
# List the Gujarati subset with 1-based numbering.
for position, sentence in enumerate(rand_50_guj, start=1):
    print(position, ". ", sentence)

1 .  તેઓ સરકારની ઉપલબ્ધિઓ દર્શાવતા દસ્તાવેજ પ્રસિદ્ધ કરશે.
2 .  હું પ્રધાનમંત્રી આબેનો વ્યક્તિગત સ્તરે પ્રોત્સાહન અને સાથસહકાર આપવા બદલ વિશેષ આભારી છું.
3 .  આવતા બે મહિના આપણે બધા ચૂંટણીની દોડાદોડમાં વ્યસ્ત હોઇશું.
4 .  તેમણે કહ્યું હતું કે, કેન્દ્ર સરકારે ત્રણ વર્ષમાં 5 કરોડ શૌચાલયો બનાવ્યાં છે.
5 .  છઠ પૂજાનું અનુપમ પર્વ પ્રકૃતિથી અને પ્રકૃતિની ઉપાસના સાથે પૂરી રીતે જોડાયેલું છે.
6 .  કરાર કરનાર પક્ષકારો આ કરારમાં જરૂરી રાષ્ટ્રીય કાનૂની જરૂરિયાતો સંતોષવામાં આવી હોવા અંગે પોતાની ડિપ્લોમેટિક ચેનલો મારફતે નોટિફાય કરે તે પછી આ કરારને બીજા મહિનાના પ્રથમ દિવસથી અમલી બનાવવામાં આવશે.
7 .  હું મારા સાંસદો, પોતાનાં રાજ્યો અને જાપાનનાં પ્રાંતો વચ્ચે ગાઢ થઈ રહેલાં સંબંધોનું હૃદયપૂર્વક સ્વાગત કરું છું.
8 .  ભારત મધ ક્રાંતિ કરી રહ્યું છે.
9 .  માનવ સંસાધન વિકાસમાં સહાયતા અને સ્વાસ્થ્ય સુવિધાઓની સ્થાપના.
10 .  પ્રધાનમંત્રીએ ઉમેર્યું હતું કે, આ સુવિધાઓ પ્રવાસીઓને પણ આકર્ષશે.
11 .  વિશ્વ બેંકના વ્યાપાર-વાણિજ્ય માટે સરળતા માટેના 2018ના અહેવાલમાં ભારતે છેલ્લા ત્રણ વર્ષમાં 42 ક્રમની છલાંગ લગાવી છે અને 

In [12]:
# Save the 50-sentence subset of each language for use in the evaluation part.
write_to_file(
    file_path='/kaggle/working/eng_50_for_ChatGPT.txt',
    string_list=rand_50_eng,
)
write_to_file(
    file_path='/kaggle/working/hin_50_for_ChatGPT.txt',
    string_list=rand_50_hin,
)
write_to_file(
    file_path='/kaggle/working/guj_50_for_ChatGPT.txt',
    string_list=rand_50_guj,
)