In [1]:
!pip install rouge



In [3]:
import random

def select_and_save_random_sentences(english_file, hindi_file, gujarati_file, num_sentences=5):

    # Read all sentences from each file
    with open(english_file, "r") as f:
        english_sentences = f.readlines()
    with open(hindi_file, "r") as f:
        hindi_sentences = f.readlines()
    with open(gujarati_file, "r") as f:
        gujarati_sentences = f.readlines()

  # Ensure all files have the same number of sentences
    if len(english_sentences) != len(hindi_sentences) or len(english_sentences) != len(gujarati_sentences):
        raise ValueError("All files must have the same number of sentences")

  # Select random sentences (use set to avoid duplicates)
    selected_indices = random.sample(range(len(english_sentences)), num_sentences)

  # Create lists to store selected sentences with newlines
    selected_english = []
    selected_hindi = []
    selected_gujarati = []

  # Extract selected sentences from each list and add newlines
    for index in selected_indices:
        selected_english.append(english_sentences[index].strip())
        selected_hindi.append(hindi_sentences[index].strip())
        selected_gujarati.append(gujarati_sentences[index].strip())
    return(selected_english, selected_hindi, selected_gujarati)
    print(f"Successfully selected and saved {num_sentences} random sentences from each file (each sentence on a new line).")

# Replace with your actual file paths
english_file = "test.en"
hindi_file = "test.hi"
gujarati_file = "test.gu"

en, hi, gu = select_and_save_random_sentences(english_file, hindi_file, gujarati_file)


In [4]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")


def translate(lang, l):
  a = []
  for i in range(len(l)):
    inputs = tokenizer(l[i], return_tensors="pt")

    translated_tokens = model.generate(
        **inputs, forced_bos_token_id=tokenizer.lang_code_to_id[lang], max_length=30
    )
    a.append(tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0])
  return a

In [9]:
hin = "hin_Deva"
eng = "eng_Latn"
guj = "guj_Gujr"
en_hi = translate(hin, en)
hi_en = translate(eng, hi)
hi_gu = translate(guj, hi)
gu_hi = translate(hin, gu)

In [10]:
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from rouge import Rouge

def calculate_scores(references, hypotheses):
    # Calculate BLEU score
    bleu_score = corpus_bleu([[ref] for ref in references], hypotheses)

    # Initialize rouge scorer
    rouge = Rouge()

    # Calculate ROUGE scores
    rouge_scores = rouge.get_scores(hypotheses, references, avg=True)

    return bleu_score, rouge_scores

In [11]:
BLEU_en_hi, ROUGE_en_hi = calculate_scores(hi, en_hi)
BLEU_hi_en, ROUGE_hi_en = calculate_scores(en, hi_en)
BLEU_hi_gu, ROUGE_hi_gu = calculate_scores(gu, hi_gu)
BLEU_gu_hi, ROUGE_gu_hi = calculate_scores(hi, gu_hi)
print("BLEU, ROUGE scores of en_hi", BLEU_en_hi, ROUGE_en_hi)
print("BLEU, ROUGE scores of hi_en", BLEU_hi_en, ROUGE_hi_en)
print("BLEU, ROUGE scores of hi_gu", BLEU_hi_gu, ROUGE_hi_gu)
print("BLEU, ROUGE scores of gu_hi", BLEU_gu_hi, ROUGE_gu_hi)

BLEU, ROUGE scores of en_hi 0.6780296262972747 {'rouge-1': {'r': 0.5818939393939393, 'p': 0.6337556561085972, 'f': 0.6031457496936823}, 'rouge-2': {'r': 0.35884920634920636, 'p': 0.381905744754042, 'f': 0.36872053374864844}, 'rouge-l': {'r': 0.5518939393939395, 'p': 0.5984615384615385, 'f': 0.57071331726125}}
BLEU, ROUGE scores of hi_en 0.7227394339376957 {'rouge-1': {'r': 0.6662745098039216, 'p': 0.6110079119367045, 'f': 0.6347054357794892}, 'rouge-2': {'r': 0.36921568627450985, 'p': 0.35804953560371516, 'f': 0.3633147813782457}, 'rouge-l': {'r': 0.6662745098039216, 'p': 0.6110079119367045, 'f': 0.6347054357794892}}
BLEU, ROUGE scores of hi_gu 0.5666891869662894 {'rouge-1': {'r': 0.4813888888888888, 'p': 0.5165418894830659, 'f': 0.494095700664276}, 'rouge-2': {'r': 0.27305764411027567, 'p': 0.28059829059829056, 'f': 0.27429691382786076}, 'rouge-l': {'r': 0.40694444444444444, 'p': 0.4412477718360071, 'f': 0.4195580050739488}}
BLEU, ROUGE scores of gu_hi 0.6586831835575037 {'rouge-1': {

In [12]:
def save_list_to_file(lst, filename):
    with open(f'{filename}.txt', 'w') as f:
        for item in lst:
            f.write(f'{item}\n')

save_list_to_file(en_hi, 'en_hi')
save_list_to_file(hi_en, 'hi_en')
save_list_to_file(hi_gu, 'hi_gu')
save_list_to_file(gu_hi, 'gu_hi')
save_list_to_file(hi, 'hi')
save_list_to_file(gu, 'gu')
save_list_to_file(en, 'en')