### Installing All The Dependency

In [None]:
!git clone https://github.com/VarunGumma/indic_nlp_library.git

%cd indic_nlp_library
!pip install --editable ./

In [None]:
!pip install setuptools==68.2.2
!pip install torch
!pip install sacremoses
!pip install sentencepiece
!pip install transformers

In [None]:
!pip install git+https://github.com/VarunGumma/IndicTransTokenizer

In [None]:
!pip install sacrebleu

In [None]:
pip install rouge

In [None]:
pip install nltk

In [14]:
# pip install --upgrade nltk


### Importing the required files

In [20]:
import torch
from transformers import AutoModelForSeq2SeqLM
from IndicTransTokenizer import IndicProcessor, IndicTransTokenizer

## Loding IndicTrans2-en-indic model

In [22]:
tokenizer_eng = IndicTransTokenizer(direction="en-indic")
ip = IndicProcessor(inference=True)
model_eng = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/indictrans2-en-indic-dist-200M", trust_remote_code=True)


## Loding IndicTrans2-indic-en model

In [23]:
tokenizer_indic = IndicTransTokenizer(direction="indic-en")
ip = IndicProcessor(inference=True)
model_indic = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/indictrans2-indic-en-dist-200M", trust_remote_code=True)


## Loding IndicTrans2-indic-indic model

In [24]:
tokenizer_indic_indic = IndicTransTokenizer(direction="indic-indic")
ip = IndicProcessor(inference=True)
model_indic_indic = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/indictrans2-indic-indic-dist-320M", trust_remote_code=True)


## Reading English Data

In [27]:
with open("C:\\Users\\kanoj\\IITK\\Academic\\Computational Linguistic\\Assignment3\\A3_sol\\test.en","r",encoding="utf-8") as file:
    eng_sentences=file.readlines()

eng_sentences=[sentence for sentence in eng_sentences]


## Reading Hindi Data

In [30]:
with open("C:\\Users\\kanoj\\IITK\\Academic\\Computational Linguistic\\Assignment3\\A3_sol\\test.hi","r",encoding="utf-8") as file:
    hin_sentences=file.readlines()

hin_sentences=[sentence for sentence in hin_sentences]


## Reading Gujarati Data

In [31]:
with open("C:\\Users\\kanoj\\IITK\\Academic\\Computational Linguistic\\Assignment3\\A3_sol\\test.gu","r",encoding="utf-8") as file:
    guj_sentences=file.readlines()

guj_sentences=[sentence for sentence in guj_sentences]


## Sampling Random 1000 Sentences for Each Lang 

In [32]:
import random
sample_indices = random.sample(range(len(eng_sentences)), 10)
eng_sample = [eng_sentences[i] for i in sample_indices]
hin_sample = [hin_sentences[i] for i in sample_indices]
guj_sample = [guj_sentences[i] for i in sample_indices]

## Translating English to Hindi using IndicTrans2 eng-indic model

In [33]:
batch = ip.preprocess_batch(eng_sample, src_lang="eng_Latn", tgt_lang="hin_Deva")
batch = tokenizer_eng(batch, src=True, return_tensors="pt")

with torch.inference_mode():
    outputs = model_eng.generate(**batch, num_beams=5, num_return_sequences=1, max_length=256)

outputs = tokenizer_eng.batch_decode(outputs, src=False)
translated_eng_to_hin = ip.postprocess_batch(outputs, lang="hin_Deva")


In [34]:
# printing translated sentence
# translated_eng_to_hin

##### Saving Translated English to Hindi Sentences

In [35]:
with open("Eng_to_Hindi.txt", "w",encoding="utf-8") as file:
    # Write each sentence to the file followed by a newline character
    for sentence in translated_eng_to_hin:
        file.write(sentence + "\n")

## Translating Hindi to English using IndicTrans2 indic-eng model

In [36]:
batch = ip.preprocess_batch(hin_sample, src_lang="hin_Deva", tgt_lang="eng_Latn")
batch = tokenizer_indic(batch, src=True, return_tensors="pt")

with torch.inference_mode():
    outputs = model_indic.generate(**batch, num_beams=5, num_return_sequences=1, max_length=256)

outputs = tokenizer_indic.batch_decode(outputs, src=False)
translated_hin_to_eng = ip.postprocess_batch(outputs, lang="eng_Latn")


In [37]:
# printing translated sentence
# translated_hin_to_eng

#### Saving Translated Hindi To English Sentences

In [38]:
with open("Hindi_to_Eng.txt", "w",encoding="utf-8") as file:
    # Write each sentence to the file followed by a newline character
    for sentence in translated_hin_to_eng:
        file.write(sentence + "\n")

## Translating Hindi to Gujarati using IndicTrans2 indic-indic model

In [39]:
batch = ip.preprocess_batch(hin_sample, src_lang="hin_Deva", tgt_lang="guj_Gujr")
batch = tokenizer_indic_indic(batch, src=True, return_tensors="pt")

with torch.inference_mode():
    outputs = model_indic_indic.generate(**batch, num_beams=5, num_return_sequences=1, max_length=256)

outputs = tokenizer_indic_indic.batch_decode(outputs, src=False)
translated_hin_to_guj = ip.postprocess_batch(outputs, lang="guj_Gujr")


In [40]:
# printing translated sentence
#  translated_hin_to_guj

#### Saving Translated Hindi To Gujarati Sentences

In [41]:
with open("Hindi_to_Guj.txt", "w",encoding="utf-8") as file:
    # Write each sentence to the file followed by a newline character
    for sentence in translated_hin_to_guj:
        file.write(sentence + "\n")

## Translating Gujarati To Hindi using IndicTrans2 indic-indic model

In [42]:
batch = ip.preprocess_batch(guj_sample, src_lang="guj_Gujr", tgt_lang="hin_Deva")
batch = tokenizer_indic_indic(batch, src=True, return_tensors="pt")

with torch.inference_mode():
    outputs = model_indic_indic.generate(**batch, num_beams=5, num_return_sequences=1, max_length=256)

outputs = tokenizer_indic_indic.batch_decode(outputs, src=False)
translated_guj_to_hin = ip.postprocess_batch(outputs, lang="hin_Deva")


In [43]:
# printing translated sentence
# translated_guj_to_hin

#### Saving Translated Gujarati To Hindi Sentences

In [44]:
with open("Guj_to_Hindi.txt", "w",encoding="utf-8") as file:
    # Write each sentence to the file followed by a newline character
    for sentence in translated_guj_to_hin:
        file.write(sentence + "\n")

# Calculating BLEU and ROUG Scores

## BLEU Score

In [45]:
from nltk.translate.bleu_score import corpus_bleu

#BLEU Score for english to hindi conversion

In [46]:
bleuScore_eng_to_hin=corpus_bleu([[ref] for ref in hin_sample], translated_eng_to_hin)
print("BLEU Score English to Hindi:",bleuScore_eng_to_hin)

BLEU Score English to Hindi: 0.7037931419190754


#BLEU Score for hindi to english conversion

In [48]:
bleuScore_hin_to_eng=corpus_bleu([[ref] for ref in eng_sample], translated_hin_to_eng)
print("BLEU Score Hindi to English:",bleuScore_hin_to_eng)

BLEU Score Hindi to English: 0.7063899387952451


#BLEU Score for gujarati to hindi conversion

In [49]:
bleuScore_guj_to_hin=corpus_bleu([[ref] for ref in hin_sample], translated_guj_to_hin)
print("BLEU Score Gujarati to Hindi:",bleuScore_guj_to_hin)

BLEU Score Gujarati to Hindi: 0.7140860979518312


#BLEU Score for hindi to gujarati conversion

In [50]:
bleuScore_hin_to_guj=corpus_bleu([[ref] for ref in guj_sample], translated_hin_to_guj)
print("BLEU Score Hindi to Gujarati:",bleuScore_hin_to_guj)

BLEU Score Hindi to Gujarati: 0.6204324483594849


## ROUGE Score

In [53]:
from rouge import Rouge
def calculate_rouge_scores(reference_sentences, translated_sentences):
    rouge = Rouge()
    scores = rouge.get_scores(translated_sentences, reference_sentences, avg=True)
    return scores

#ROUGE Score for English to Hindi Translation

In [54]:
rouge_eng_to_hin=calculate_rouge_scores(hin_sample,translated_eng_to_hin)
print("ROUGE Score English to Hindi:")
print("ROUGE_1:",rouge_eng_to_hin['rouge-1'])
print("ROUGE_2:",rouge_eng_to_hin['rouge-2'])
print("ROUGE_L:",rouge_eng_to_hin['rouge-l'])

ROUGE Score English to Hindi:
ROUGE_1: {'r': 0.6826372549019608, 'p': 0.741045150755677, 'f': 0.7089113360953545}
ROUGE_2: {'r': 0.38201771336553947, 'p': 0.41190943043884226, 'f': 0.3944655143937074}
ROUGE_L: {'r': 0.6112397504456327, 'p': 0.6626279524568999, 'f': 0.6343637550214576}


#ROUGE Score for Hindi to English Translation

In [55]:
rouge_hin_to_eng=calculate_rouge_scores(eng_sample,translated_hin_to_eng)
print("ROUGE Score Hindi to English:")
print("ROUGE_1:",rouge_hin_to_eng['rouge-1'])
print("ROUGE_2:",rouge_hin_to_eng['rouge-2'])
print("ROUGE_L:",rouge_hin_to_eng['rouge-l'])

ROUGE Score Hindi to English:
ROUGE_1: {'r': 0.5659251859251859, 'p': 0.5617795469064819, 'f': 0.5616502297649105}
ROUGE_2: {'r': 0.3394565217391305, 'p': 0.3421175369253172, 'f': 0.337913466084995}
ROUGE_L: {'r': 0.516874236874237, 'p': 0.5133733844414958, 'f': 0.5130982733234192}


#ROUGE Score for Gujarati to Hindi Translation

In [56]:
rouge_guj_to_hin=calculate_rouge_scores(hin_sample,translated_guj_to_hin)
print("ROUGE Score Gujarati to Hindi:")
print("ROUGE_1:",rouge_guj_to_hin['rouge-1'])
print("ROUGE_2:",rouge_guj_to_hin['rouge-2'])
print("ROUGE_L:",rouge_guj_to_hin['rouge-l'])

ROUGE Score Gujarati to Hindi:
ROUGE_1: {'r': 0.6923262032085561, 'p': 0.7327754158316818, 'f': 0.7053108563033385}
ROUGE_2: {'r': 0.4669412238325282, 'p': 0.4931216931216931, 'f': 0.4735361625302832}
ROUGE_L: {'r': 0.6273983957219251, 'p': 0.6727781560582071, 'f': 0.6429930540910391}


#ROUGE Score for Hindi to Gujarati Translation

In [57]:
rouge_hin_to_guj=calculate_rouge_scores(guj_sample,translated_hin_to_guj)
print("ROUGE Score Hindi to Gujarati:")
print("ROUGE_1:",rouge_hin_to_guj['rouge-1'])
print("ROUGE_2:",rouge_hin_to_guj['rouge-2'])
print("ROUGE_L:",rouge_hin_to_guj['rouge-l'])

ROUGE Score Hindi to Gujarati:
ROUGE_1: {'r': 0.49223426212590304, 'p': 0.5012865497076023, 'f': 0.4920218007582878}
ROUGE_2: {'r': 0.24851668862538423, 'p': 0.2531998556998557, 'f': 0.24772485729818516}
ROUGE_L: {'r': 0.4817079463364293, 'p': 0.4907602339181286, 'f': 0.48149548496881406}
