### Installing All The Dependency

In [None]:
!pip install --editable ./

Obtaining file:///content/drive/MyDrive/IndicTransTokenizer
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting indic-nlp-library-IT2@ git+https://github.com/VarunGumma/indic_nlp_library (from IndicTransTokenizer==0.1.3)
  Cloning https://github.com/VarunGumma/indic_nlp_library to /tmp/pip-install-uy5pfa5p/indic-nlp-library-it2_66439cc9f000427f92b653cce9d5f5d7
  Running command git clone --filter=blob:none --quiet https://github.com/VarunGumma/indic_nlp_library /tmp/pip-install-uy5pfa5p/indic-nlp-library-it2_66439cc9f000427f92b653cce9d5f5d7
  Resolved https://github.com/VarunGumma/indic_nlp_library to commit 1dd6683a6dd77be3c1dbe03c226201661235c72b
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting setuptools==68.2.2 (from IndicTransTokenizer==0.1.3)
  Downloading setuptools-68.2.2-py3-none-any.whl (807 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m807.9/807.9 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Collecting sacremoses (from IndicT

In [None]:
!pip install rouge
!pip install nltk
!pip install --upgrade nltk


Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


### Importing the required files

In [2]:
import torch
from transformers import AutoModelForSeq2SeqLM
from IndicTransTokenizer import IndicProcessor, IndicTransTokenizer

## Loding all three version of IndicTrans2 model

In [1]:
def load_model(direction,name):
    tokenizer = IndicTransTokenizer(direction=direction)
    ip = IndicProcessor(inference=True)
    model = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/indictrans2-"+name, trust_remote_code=True)
    return tokenizer,ip,model


In [4]:
en_indic_tokenizer,ip,en_indic_model = load_model("en-indic","en-indic-dist-200M")
indic_en_tokenizer,ip,indic_en_model = load_model("indic-en","indic-en-dist-200M")
indic_indic_tokenizer,ip,indic_indic_model = load_model("indic-indic","indic-indic-dist-320M")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


## Data Loading

In [2]:
def load_data(path):
    with open(path,"r",encoding = "utf-8") as file:
        return [line.strip() for line in file]



In [3]:
en_true = load_data("./data/test.en")
hi_true = load_data("./data/test.hi")
mar_true = load_data("./data/test.mr")

In [4]:
len(en_true),len(hi_true),len(mar_true)

(2390, 2390, 2390)

## sample 1000 sentences randomly for each language

In [5]:
import random
random.seed(27)
sample_indices = random.sample(range(len(hi_true)), 1000)
def random_sentences(language):
    random_sentences = [language[i] for i in sample_indices]
    return random_sentences


In [6]:
eng_1k_sentences = random_sentences(en_true)
hi_1k_sentences =  random_sentences(hi_true)
mar_1k_sentences = random_sentences(mar_true)


In [11]:
len(hi_1k_sentences)

1000

## Translation Using IndicTrans

In [9]:
def translating(source,target,tokenizer,model,sample_sentences):
    batch = ip.preprocess_batch(sample_sentences, src_lang=source, tgt_lang=target)
    batch = tokenizer(batch, src=True, return_tensors="pt")
    with torch.inference_mode():
        outputs = model.generate(**batch, num_beams=5, num_return_sequences=1, max_length=256)
    outputs = tokenizer.batch_decode(outputs, src=False)
    translated_sentence = ip.postprocess_batch(outputs, lang=target)
    return translated_sentence


### (1) Hindi to English

In [None]:
translated_sentence_hindi_to_eng = []
for i in range(0,20):
    hi_true_50 = hi_1k_sentences[i*50:(i+1)*50]
    output  = translating("hin_Deva","eng_Latn",indic_en_tokenizer,indic_en_model,hi_true_50)
    translated_sentence_hindi_to_eng.extend(output)


#### Saving Translated Hindi to English Sentences

In [None]:
with open('/content/drive/MyDrive/Hindi_to_Eng.txt', 'w') as file:
    file.write(str(translated_sentence_hindi_to_eng))

### (2) English to Hindi translation

In [None]:
translated_sentence_eng_to_hindi = []
for i in range(0,20):
    en_true_50 = eng_1k_sentences[i*50:(i+1)*50]
    output = translating("eng_Latn","hin_Deva",en_indic_tokenizer,en_indic_model,en_true_50)
    translated_sentence_eng_to_hindi.extend(output)


#### Saving Translated English to Hindi translation

In [None]:
with open('./Translation/Eng_to_hindi.txt', 'w') as file:
    file.write(str(translated_sentence_eng_to_hindi))

### (3) Marathi to Hindi translation

In [None]:
translated_sentence_mar_to_hindi = []
for i in range(0,20):
    mar_true_50 = mar_1k_sentences[i*50:(i+1)*50]
    output = translating("mar_Deva","hin_Deva",indic_indic_tokenizer,indic_indic_model,mar_true_50)
    translated_sentence_mar_to_hindi.extend(output)


In [None]:
with open('./Translation/Mar_to_hindi.txt', 'w') as file:
    file.write(str(translated_sentence_mar_to_hindi))

### (4) Hindi to marathi translation

In [None]:
translated_sentence_hindi_to_mar = []
for i in range(0,40):
    print(i,  "  ")
    hi_true_50 = hi_1k_sentences[i*25:(i+1)*25]
    output = translating("hin_Deva","mar_Deva",indic_indic_tokenizer,indic_indic_model,hi_true_50)
    translated_sentence_hindi_to_mar.extend(output)


### saving Translated hindi to marathi


In [13]:
with open('./Translation/Hindi_to_marathi.txt', 'w') as file:
    file.write(str(translated_sentence_hindi_to_mar))

## Loading translated data

In [7]:
import ast
def load_translated_data(path):
    with open(path, 'r', encoding='utf-8') as file:
    # Read the content of the file
        content = file.read()
        # Parse the string representation of the list
        sentences_list = ast.literal_eval(content)
    return sentences_list

In [8]:
translated_sentence_eng_to_hindi = load_translated_data("./Translation/Eng_to_Hindi.txt")
translated_sentence_hindi_to_eng = load_translated_data("./Translation/Hindi_to_Eng.txt")
translated_sentence_hin_to_mar = load_translated_data("./Translation/Hindi_to_Mar.txt")
translated_sentence_mar_to_hindi = load_translated_data("./Translation/Mar_to_Hindi.txt")

## BLEU Scores

In [27]:
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu,SmoothingFunction
import string

In [28]:


def courpus_level_bleu_score(original,translated):
      bleuScore_eng_to_hin=corpus_bleu([[ref] for ref in original], translated)
      return bleuScore_eng_to_hin


In [42]:
eng_to_hindi_blue = courpus_level_bleu_score(hi_1k_sentences,translated_sentence_eng_to_hindi)
hindi_to_eng_blue = courpus_level_bleu_score(eng_1k_sentences,translated_sentence_hindi_to_eng)
hindi_to_mar_blue = courpus_level_bleu_score(mar_1k_sentences,translated_sentence_hin_to_mar)
mar_to_hindi_blue = courpus_level_bleu_score(hi_1k_sentences,translated_sentence_mar_to_hindi)

print(f"Corpus Level Blue Scores:\nEnglish to hindi: {eng_to_hindi_blue}\nHindi to English: {hindi_to_eng_blue}\nHindi to Marathi: {hindi_to_mar_blue}\nMarathi to Hindi: {mar_to_hindi_blue}")

Corpus Level Blue Scores:
English to hindi: 0.6926573633438512
Hindi to English: 0.7479832197361774
Hindi to Marathi: 0.6054683949885025
Marathi to Hindi: 0.6086900050748287


In [30]:
def clean_sentence(sentence):
    # Convert to lowercase
    cleaned_sentence = sentence.lower()
    
    # Remove punctuation
    cleaned_sentence = cleaned_sentence.translate(str.maketrans('', '', string.punctuation))
    
    # Remove extra whitespace
    cleaned_sentence = ' '.join(cleaned_sentence.split())
    
    return cleaned_sentence

In [39]:
def sentence_level_bleu_score(original, translation):
    bleu_scores = []
    smoothing_function = SmoothingFunction()
    for original_sentence, translation_sentence in zip(original, translation):
        original_sentence = clean_sentence(original_sentence)
        translation_sentence = clean_sentence(translation_sentence)
        original_tokens = original_sentence.split()
        translation_tokens = translation_sentence.split()

        # Calculate BLEU score
        bleu_score = sentence_bleu([original_tokens], translation_tokens,smoothing_function=smoothing_function.method2)
        bleu_scores.append(bleu_score)
    
    # Calculate average BLEU score
    overall_bleu_score = sum(bleu_scores) / len(bleu_scores)
    return overall_bleu_score




In [41]:
eng_to_hindi_blue = sentence_level_bleu_score(hi_1k_sentences,translated_sentence_eng_to_hindi)
hindi_to_eng_blue = sentence_level_bleu_score(eng_1k_sentences,translated_sentence_hindi_to_eng)
hindi_to_mar_blue = sentence_level_bleu_score(mar_1k_sentences,translated_sentence_hin_to_mar)
mar_to_hindi_blue = sentence_level_bleu_score(hi_1k_sentences,translated_sentence_mar_to_hindi)

print(f"Sentence Level Blue Scores:\nEnglish to hindi: {eng_to_hindi_blue}\nHindi to English: {hindi_to_eng_blue}\nHindi to Marathi: {hindi_to_mar_blue}\nMarathi to Hindi: {mar_to_hindi_blue}")

Sentence Level Blue Scores:
English to hindi: 0.3542139158937472
Hindi to English: 0.4450780577789042
Hindi to Marathi: 0.2146926372363359
Marathi to Hindi: 0.2687288100239655


## ROUGE Scores

In [16]:
from rouge import Rouge
def calculate_rouge_scores(original, translated):
    rouge = Rouge()
    scores = rouge.get_scores(translated, original, avg=True)
    return scores

#### English to Hindi Translation

In [17]:
rouge_eng_to_hin=calculate_rouge_scores(hi_1k_sentences,translated_sentence_eng_to_hindi)
print("ROUGE Score English to Hindi:")
print("ROUGE_1:",rouge_eng_to_hin['rouge-1'])
print("ROUGE_2:",rouge_eng_to_hin['rouge-2'])
print("ROUGE_L:",rouge_eng_to_hin['rouge-l'])

ROUGE Score English to Hindi:
ROUGE_1: {'r': 0.618272792679341, 'p': 0.623610505812477, 'f': 0.6177257530350299}
ROUGE_2: {'r': 0.3824409654525232, 'p': 0.3856898573440865, 'f': 0.38205775566133976}
ROUGE_L: {'r': 0.5825824910050512, 'p': 0.588555397863421, 'f': 0.582574785948368}


#### Hindi to English Translation

In [18]:
rouge_hin_to_eng=calculate_rouge_scores(eng_1k_sentences,translated_sentence_hindi_to_eng)
print("ROUGE Score Hindi to English:")
print("ROUGE_1:",rouge_hin_to_eng['rouge-1'])
print("ROUGE_2:",rouge_hin_to_eng['rouge-2'])
print("ROUGE_L:",rouge_hin_to_eng['rouge-l'])

ROUGE Score Hindi to English:
ROUGE_1: {'r': 0.6680526377978149, 'p': 0.6646624038681456, 'f': 0.6628035764437721}
ROUGE_2: {'r': 0.4555270206828759, 'p': 0.448890067850876, 'f': 0.44954301315232514}
ROUGE_L: {'r': 0.63436574860399, 'p': 0.6314363307974323, 'f': 0.6295756281285899}


#### Marathi to Hindi Translation

In [19]:
rouge_mar_to_hin=calculate_rouge_scores(hi_1k_sentences,translated_sentence_mar_to_hindi)
print("ROUGE Score Marathi to Hindi:")
print("ROUGE_1:",rouge_mar_to_hin['rouge-1'])
print("ROUGE_2:",rouge_mar_to_hin['rouge-2'])
print("ROUGE_L:",rouge_mar_to_hin['rouge-l'])

ROUGE Score Marathi to Hindi:
ROUGE_1: {'r': 0.5250729086785814, 'p': 0.5311842353709779, 'f': 0.5235051923314874}
ROUGE_2: {'r': 0.28386135335808316, 'p': 0.28718706792869236, 'f': 0.28288370879544933}
ROUGE_L: {'r': 0.4875967145937427, 'p': 0.4941243243945781, 'f': 0.4866813769372026}


#### Hindi to Marathi Translation

In [20]:
rouge_hin_to_mar=calculate_rouge_scores(mar_1k_sentences,translated_sentence_hin_to_mar)
print("ROUGE Score Hindi to Marathi:")
print("ROUGE_1:",rouge_hin_to_mar['rouge-1'])
print("ROUGE_2:",rouge_hin_to_mar['rouge-2'])
print("ROUGE_L:",rouge_hin_to_mar['rouge-l'])

ROUGE Score Hindi to Marathi:
ROUGE_1: {'r': 0.4147498463812588, 'p': 0.41411767940796157, 'f': 0.41120537037461863}
ROUGE_2: {'r': 0.18501212805608805, 'p': 0.18358572556084948, 'f': 0.1828575181355645}
ROUGE_L: {'r': 0.38550992609229445, 'p': 0.38462095831206716, 'f': 0.38208192255912904}


#                                      END