## Loading data

In [129]:
def load_data(path):
    with open(path,"r",encoding = "utf-8") as file:
        return [line.strip() for line in file]




In [130]:
en_true = load_data("./data/test.en")
hi_true = load_data("./data/test.hi")
mar_true = load_data("./data/test.mr")


## selecting random sentences

In [131]:
import random
random.seed(27)
sample_indices = random.sample(range(len(hi_true)), 1000)
def random_sentences(language):
    random_sentences = [language[i] for i in sample_indices]
    return random_sentences


In [132]:
eng_1k_sentences = random_sentences(en_true)
hi_1k_sentences =  random_sentences(hi_true)
mar_1k_sentences = random_sentences(mar_true)


In [115]:
# eng_1k_sentences

# Translation using NLLB

In [116]:
# Use a pipeline as a high-level helper
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer


In [117]:
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
def NLLB(source,target,sentences):
    tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M",src=source,)

    inputs = tokenizer(sentences, return_tensors="pt",padding = "longest")

    translated_tokens = model.generate(
        **inputs, forced_bos_token_id=tokenizer.lang_code_to_id[target], max_length=30
    )
    return tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)

### (1) Hindi to English

In [18]:
len(hi_1k_sentences)

1000

In [128]:
translated_sentence_hindi_to_eng = []
for i in range(0,2):
    hi_true_50 = hi_1k_sentences[i*50:(i+1)*50]
    output = NLLB("hin_Deva","eng_Latn",hi_true_50)
    translated_sentence_hindi_to_eng.extend(output)


In [None]:
len(translated_sentence_hindi_to_eng)

In [None]:
translated_sentence_hindi_to_eng

In [12]:
with open('/content/drive/My Drive/NLLB_hi_to_en.txt', 'w') as file:
    file.write(str(translated_sentence_hindi_to_eng))

### (2) English to Hindi

In [13]:
translated_sentence_eng_to_hindi = []
for i in range(0,20):
    en_true_50 = eng_1k_sentences[i*50:(i+1)*50]
    output = NLLB("eng_Latn","hin_Deva",en_true_50)
    translated_sentence_eng_to_hindi.extend(output)


In [14]:
len(translated_sentence_eng_to_hindi)

1000

In [15]:
with open('/content/drive/My Drive/NLLB_en_to_hi.txt', 'w') as file:
    file.write(str(translated_sentence_eng_to_hindi))

### (3) Hindi to Marathi

In [16]:
translated_sentence_hin_to_mar = []
for i in range(0,20):
    hi_true_50 = hi_1k_sentences[i*50:(i+1)*50]
    output = NLLB("hin_Deva","mar_Deva",hi_true_50)
    translated_sentence_hin_to_mar.extend(output)


In [17]:
len(translated_sentence_hin_to_mar)

1000

In [18]:
with open('/content/drive/My Drive/NLLB_hi_to_mar.txt', 'w') as file:
    file.write(str(translated_sentence_hin_to_mar))

### (4) Marathi to Hindi

In [20]:
translated_sentence_mar_to_hindi = []
for i in range(0,20):
    mar_true_50 = mar_1k_sentences[i*50:(i+1)*50]
    output = NLLB("mar_Deva","hin_Deva",mar_true_50)
    translated_sentence_mar_to_hindi.extend(output)


In [21]:
len(translated_sentence_mar_to_hindi)

1000

In [22]:
with open('/content/drive/My Drive/NLLB_mar_to_hi.txt', 'w') as file:
    file.write(str(translated_sentence_mar_to_hindi))

## loading translated data

In [133]:
import ast
def load_translated_data(path):
    with open(path, 'r', encoding='utf-8') as file:
    # Read the content of the file
        content = file.read()
        # Parse the string representation of the list
        sentences_list = ast.literal_eval(content)
    return sentences_list

In [134]:
translated_sentence_eng_to_hindi = load_translated_data("./Translation/english_to_hindi.txt")
translated_sentence_hindi_to_eng = load_translated_data("./Translation/hindi_to_english.txt")
translated_sentence_hin_to_mar = load_translated_data("./Translation/hindi_to_marathi.txt")
translated_sentence_mar_to_hindi = load_translated_data("./Translation/marathi_to_hindi.txt")

## BLEU Scores

In [135]:
# !pip install nltk

In [136]:
from nltk.translate.bleu_score import corpus_bleu,sentence_bleu,SmoothingFunction
import string

## corpus Level bleu score

In [137]:

def corpus_level_bleu_score(original,translated):
      bleuScore=corpus_bleu([[ref] for ref in original], translated)
      return bleuScore


In [138]:
eng_to_hindi_blue = corpus_level_bleu_score(hi_1k_sentences,translated_sentence_eng_to_hindi)
hindi_to_eng_blue = corpus_level_bleu_score(eng_1k_sentences,translated_sentence_hindi_to_eng)
hindi_to_mar_blue = corpus_level_bleu_score(mar_1k_sentences,translated_sentence_hin_to_mar)
mar_to_hindi_blue = corpus_level_bleu_score(hi_1k_sentences,translated_sentence_mar_to_hindi)

print(f"Corpus Level bleu score:\nEnglish to hindi: {eng_to_hindi_blue}\nHindi to English: {hindi_to_eng_blue}\nHindi to Marathi: {hindi_to_mar_blue}\nMarathi to Hindi: {mar_to_hindi_blue}")

Corpus Level bleu score:
English to hindi: 0.623305403326375
Hindi to English: 0.6752860907113375
Hindi to Marathi: 0.5501861354870586
Marathi to Hindi: 0.5505502649480352


## Sentence lavel bleu score

In [139]:
def clean_sentence(sentence):
    # Convert to lowercase
    cleaned_sentence = sentence.lower()
    
    # Remove punctuation
    cleaned_sentence = cleaned_sentence.translate(str.maketrans('', '', string.punctuation))
    
    # Remove extra whitespace
    cleaned_sentence = ' '.join(cleaned_sentence.split())
    
    return cleaned_sentence

In [140]:
def sentence_level_bleu_score(original, translation):
    bleu_scores = []
    smoothing_function = SmoothingFunction()
    for original_sentence, translation_sentence in zip(original, translation):
        original_sentence = clean_sentence(original_sentence)
        translation_sentence = clean_sentence(translation_sentence)
        original_tokens = original_sentence.split()
        translation_tokens = translation_sentence.split()

        # Calculate BLEU score
        bleu_score = sentence_bleu([original_tokens], translation_tokens,smoothing_function=smoothing_function.method2)
        bleu_scores.append(bleu_score)
    
    # Calculate average BLEU score
    overall_bleu_score = sum(bleu_scores) / len(bleu_scores)
    return overall_bleu_score




In [141]:
eng_to_hindi_blue = sentence_level_bleu_score(hi_1k_sentences,translated_sentence_eng_to_hindi)
hindi_to_eng_blue = sentence_level_bleu_score(eng_1k_sentences,translated_sentence_hindi_to_eng)
hindi_to_mar_blue = sentence_level_bleu_score(mar_1k_sentences,translated_sentence_hin_to_mar)
mar_to_hindi_blue = sentence_level_bleu_score(hi_1k_sentences,translated_sentence_mar_to_hindi)

print(f"Sentence level bleu Scores:\nEnglish to hindi: {eng_to_hindi_blue}\nHindi to English: {hindi_to_eng_blue}\nHindi to Marathi: {hindi_to_mar_blue}\nMarathi to Hindi: {mar_to_hindi_blue}")

Sentence level bleu Scores:
English to hindi: 0.3109705562655547
Hindi to English: 0.37704726819361367
Hindi to Marathi: 0.2006750070607821
Marathi to Hindi: 0.2313678622705313


## Rouge Score

In [142]:
# !pip install rouge

In [143]:
from rouge import Rouge
def calculate_rouge_scores(original, translated):
    rouge = Rouge()
    scores = rouge.get_scores(translated, original, avg=True)
    return scores

#### English to Hindi Translation

In [144]:
rouge_eng_to_hin=calculate_rouge_scores(hi_1k_sentences,translated_sentence_eng_to_hindi)
print("ROUGE Score English to Hindi:")
print("ROUGE_1:",rouge_eng_to_hin['rouge-1'])
print("ROUGE_2:",rouge_eng_to_hin['rouge-2'])
print("ROUGE_L:",rouge_eng_to_hin['rouge-l'])

ROUGE Score English to Hindi:
ROUGE_1: {'r': 0.5628643812289592, 'p': 0.6035171125800287, 'f': 0.5777951832073754}
ROUGE_2: {'r': 0.33137017849860567, 'p': 0.35671377851206043, 'f': 0.34069722186098916}
ROUGE_L: {'r': 0.525031141586268, 'p': 0.5630904909956712, 'f': 0.5390959827056208}


#### Hindi to English Translation

In [145]:
rouge_hin_to_eng=calculate_rouge_scores(eng_1k_sentences,translated_sentence_hindi_to_eng)
print("ROUGE Score Hindi to English:")
print("ROUGE_1:",rouge_hin_to_eng['rouge-1'])
print("ROUGE_2:",rouge_hin_to_eng['rouge-2'])
print("ROUGE_L:",rouge_hin_to_eng['rouge-l'])

ROUGE Score Hindi to English:
ROUGE_1: {'r': 0.6066762392449336, 'p': 0.6210699012524098, 'f': 0.6089889156702791}
ROUGE_2: {'r': 0.38571847623571365, 'p': 0.3905454996543592, 'f': 0.3847888482389781}
ROUGE_L: {'r': 0.5768846028422651, 'p': 0.5901536129561941, 'f': 0.5789018084610669}


#### Hindi to Marathi Translation




In [146]:
rouge_hin_to_mar=calculate_rouge_scores(mar_1k_sentences,translated_sentence_hin_to_mar)
print("ROUGE Score Hindi to Marathi:")
print("ROUGE_1:",rouge_hin_to_mar['rouge-1'])
print("ROUGE_2:",rouge_hin_to_mar['rouge-2'])
print("ROUGE_L:",rouge_hin_to_mar['rouge-l'])

ROUGE Score Hindi to Marathi:
ROUGE_1: {'r': 0.37816880654578217, 'p': 0.4053010590974208, 'f': 0.3872151192054459}
ROUGE_2: {'r': 0.16338782702974608, 'p': 0.1749336662434575, 'f': 0.16716151408499125}
ROUGE_L: {'r': 0.35351643890904333, 'p': 0.37860253430350144, 'f': 0.3618957693048932}


#### Marathi to Hindi Translation

In [147]:
rouge_mar_to_hin=calculate_rouge_scores(hi_1k_sentences,translated_sentence_mar_to_hindi)
print("ROUGE Score Marathi to Hindi:")
print("ROUGE_1:",rouge_mar_to_hin['rouge-1'])
print("ROUGE_2:",rouge_mar_to_hin['rouge-2'])
print("ROUGE_L:",rouge_mar_to_hin['rouge-l'])

ROUGE Score Marathi to Hindi:
ROUGE_1: {'r': 0.4708997023939873, 'p': 0.5108553496493046, 'f': 0.485097041203127}
ROUGE_2: {'r': 0.23502861995567412, 'p': 0.2545893446362097, 'f': 0.24188603513129844}
ROUGE_L: {'r': 0.43606401473640055, 'p': 0.473325077250901, 'f': 0.4494431068532833}


#                                                  END