In [1]:
%%capture
!git clone https://github.com/AI4Bharat/IndicTrans2.git

In [2]:
%%capture
%cd /content/IndicTrans2/huggingface_interface

In [3]:
%%capture
!python3 -m pip install nltk sacremoses pandas regex mock transformers>=4.33.2 mosestokenizer
!python3 -c "import nltk; nltk.download('punkt')"
!python3 -m pip install bitsandbytes scipy accelerate datasets
!python3 -m pip install sentencepiece

!git clone https://github.com/VarunGumma/IndicTransTokenizer
%cd IndicTransTokenizer
!python3 -m pip install --editable ./
%cd ..

In [1]:
import torch
from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig
from IndicTransTokenizer import IndicProcessor, IndicTransTokenizer

BATCH_SIZE = 4
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
quantization = None

In [2]:
import random

def select_and_save_random_sentences(english_file, hindi_file, gujarati_file, num_sentences=1000):

    # Read all sentences from each file
    with open(english_file, "r") as f:
        english_sentences = f.readlines()
    with open(hindi_file, "r") as f:
        hindi_sentences = f.readlines()
    with open(gujarati_file, "r") as f:
        gujarati_sentences = f.readlines()

  # Ensure all files have the same number of sentences
    if len(english_sentences) != len(hindi_sentences) or len(english_sentences) != len(gujarati_sentences):
        raise ValueError("All files must have the same number of sentences")

  # Select random sentences (use set to avoid duplicates)
    selected_indices = random.sample(range(len(english_sentences)), num_sentences)

  # Create lists to store selected sentences with newlines
    selected_english = []
    selected_hindi = []
    selected_gujarati = []

  # Extract selected sentences from each list and add newlines
    for index in selected_indices:
        selected_english.append(english_sentences[index].strip())
        selected_hindi.append(hindi_sentences[index].strip())
        selected_gujarati.append(gujarati_sentences[index].strip())
    return(selected_english, selected_hindi, selected_gujarati)
    print(f"Successfully selected and saved {num_sentences} random sentences from each file (each sentence on a new line).")

# Replace with your actual file paths
english_file = "/content/test.en"
hindi_file = "/content/test.hi"
gujarati_file = "/content/test.gu"

en, hi, gu = select_and_save_random_sentences(english_file, hindi_file, gujarati_file)


In [3]:
def initialize_model_and_tokenizer(ckpt_dir, direction, quantization):
    if quantization == "4-bit":
        qconfig = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization == "8-bit":
        qconfig = BitsAndBytesConfig(
            load_in_8bit=True,
            bnb_8bit_use_double_quant=True,
            bnb_8bit_compute_dtype=torch.bfloat16,
        )
    else:
        qconfig = None

    tokenizer = IndicTransTokenizer(direction=direction)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        ckpt_dir,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        quantization_config=qconfig,
    )

    if qconfig == None:
        model = model.to(DEVICE)
        if DEVICE == "cuda":
            model.half()

    model.eval()

    return tokenizer, model


def batch_translate(input_sentences, src_lang, tgt_lang, tokenizer, model, ip):


    translations = []
    for i in range(0, len(input_sentences), BATCH_SIZE):
        batch = input_sentences[i : i + BATCH_SIZE]

        # Preprocess the batch and extract entity mappings
        batch = ip.preprocess_batch(batch, src_lang=src_lang, tgt_lang=tgt_lang)

        # Tokenize the batch and generate input encodings
        inputs = tokenizer(
            batch,
            src=True,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(DEVICE)

        # Generate translations using the model
        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=5,
                num_return_sequences=1,
            )

        # Decode the generated tokens into text
        generated_tokens = tokenizer.batch_decode(generated_tokens.detach().cpu().tolist(), src=False)

        # Postprocess the translations, including entity replacement
        translations += ip.postprocess_batch(generated_tokens, lang=tgt_lang)

        del inputs
        torch.cuda.empty_cache()

    return translations

In [4]:
en_indic_ckpt_dir = "ai4bharat/indictrans2-en-indic-1B"  # ai4bharat/indictrans2-en-indic-dist-200M
en_indic_tokenizer, en_indic_model = initialize_model_and_tokenizer(en_indic_ckpt_dir, "en-indic", quantization)
ip = IndicProcessor(inference=True)

hin = "hin_Deva"
eng = "eng_Latn"
guj = "guj_Gujr"

en_hi = batch_translate(en, eng, hin, en_indic_tokenizer, en_indic_model, ip)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

configuration_indictrans.py:   0%|          | 0.00/14.1k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-en-indic-1B:
- configuration_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_indictrans.py:   0%|          | 0.00/61.3k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-en-indic-1B:
- modeling_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/4.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

In [5]:
indic_en_ckpt_dir = "ai4bharat/indictrans2-indic-en-1B"  # ai4bharat/indictrans2-indic-en-dist-200M
indic_en_tokenizer, indic_en_model = initialize_model_and_tokenizer(indic_en_ckpt_dir, "indic-en", "")

ip = IndicProcessor(inference=True)
hi_en = batch_translate(hi, hin, eng, indic_en_tokenizer, indic_en_model, ip)


config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

configuration_indictrans.py:   0%|          | 0.00/14.1k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-indic-en-1B:
- configuration_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_indictrans.py:   0%|          | 0.00/61.3k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-indic-en-1B:
- modeling_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin:   0%|          | 0.00/4.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

In [6]:
indic_indic_ckpt_dir = "ai4bharat/indictrans2-indic-indic-1B"  # ai4bharat/indictrans2-indic-indic-dist-320M
indic_indic_tokenizer, indic_indic_model = initialize_model_and_tokenizer(indic_indic_ckpt_dir, "indic-indic", quantization)

ip = IndicProcessor(inference=True)

hi_gu = batch_translate(hi, hin, guj, indic_indic_tokenizer, indic_indic_model, ip)


config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

configuration_indictrans.py:   0%|          | 0.00/14.1k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-indic-indic-1B:
- configuration_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_indictrans.py:   0%|          | 0.00/61.3k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-indic-indic-1B:
- modeling_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin:   0%|          | 0.00/4.83G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [7]:
indic_indic_ckpt_dir = "ai4bharat/indictrans2-indic-indic-1B"  # ai4bharat/indictrans2-indic-indic-dist-320M
indic_indic_tokenizer, indic_indic_model = initialize_model_and_tokenizer(indic_indic_ckpt_dir, "indic-indic", quantization)

ip = IndicProcessor(inference=True)

gu_hi = batch_translate(gu, guj, hin, indic_indic_tokenizer, indic_indic_model, ip)

In [9]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [10]:
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from rouge import Rouge

def calculate_scores(references, hypotheses):
    # Calculate BLEU score
    bleu_score = corpus_bleu([[ref] for ref in references], hypotheses)

    # Initialize rouge scorer
    rouge = Rouge()

    # Calculate ROUGE scores
    rouge_scores = rouge.get_scores(hypotheses, references, avg=True)

    return bleu_score, rouge_scores

In [11]:
BLEU_en_hi, ROUGE_en_hi = calculate_scores(hi, en_hi)
BLEU_hi_en, ROUGE_hi_en = calculate_scores(en, hi_en)
BLEU_hi_gu, ROUGE_hi_gu = calculate_scores(gu, hi_gu)
BLEU_gu_hi, ROUGE_gu_hi = calculate_scores(hi, gu_hi)
print("BLEU, ROUGE scores of en_hi", BLEU_en_hi, ROUGE_en_hi)
print("BLEU, ROUGE scores of hi_en", BLEU_hi_en, ROUGE_hi_en)
print("BLEU, ROUGE scores of hi_gu", BLEU_hi_gu, ROUGE_hi_gu)
print("BLEU, ROUGE scores of gu_hi", BLEU_gu_hi, ROUGE_gu_hi)

BLEU, ROUGE scores of en_hi 0.6975413764072621 {'rouge-1': {'r': 0.624242368524997, 'p': 0.6317329040621728, 'f': 0.6244974218384414}, 'rouge-2': {'r': 0.3939754508253442, 'p': 0.3985052838418069, 'f': 0.39393648824704036}, 'rouge-l': {'r': 0.588354695437273, 'p': 0.5958652290415527, 'f': 0.588846638745394}}
BLEU, ROUGE scores of hi_en 0.7530227737724526 {'rouge-1': {'r': 0.6732253732025327, 'p': 0.6692542058842587, 'f': 0.6675442736707977}, 'rouge-2': {'r': 0.4576924058980923, 'p': 0.45360715809208935, 'f': 0.4528552277863599}, 'rouge-l': {'r': 0.6382942924770795, 'p': 0.6349945508899312, 'f': 0.6332410689846868}}
BLEU, ROUGE scores of hi_gu 0.637047552681751 {'rouge-1': {'r': 0.5028705045102323, 'p': 0.5095664360958262, 'f': 0.5025922580962816}, 'rouge-2': {'r': 0.25285907802716945, 'p': 0.2547839281455134, 'f': 0.25196485961817905}, 'rouge-l': {'r': 0.47926919636037135, 'p': 0.48576887480056236, 'f': 0.479107714615445}}
BLEU, ROUGE scores of gu_hi 0.6725586571556171 {'rouge-1': {'r'

In [13]:
def save_list_to_file(lst, filename):
    with open(f'{filename}.txt', 'w') as f:
        for item in lst:
            f.write(f'{item}\n')

save_list_to_file(en_hi, 'en_hi')
save_list_to_file(hi_en, 'hi_en')
save_list_to_file(hi_gu, 'hi_gu')
save_list_to_file(gu_hi, 'gu_hi')
save_list_to_file(hi, 'hi')
save_list_to_file(gu, 'gu')
save_list_to_file(en, 'en')