In [1]:
#Checking if GPU is running or not

!nvidia-smi

Mon Jan  6 11:38:39 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   40C    P8              9W /   70W |       1MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                       Off |   00

In [29]:
!pip install datasets transformers[sentencepiece] sacrebleu -q

In [30]:
import os
import sys
import transformers
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import AdamWeightDecay
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

In [31]:
model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"

## Helsinki-NLP/opus-mt-en-hi model

source: https://huggingface.co/Helsinki-NLP/opus-mt-en-hi



# The Dataset

Source: https://huggingface.co/datasets/cfilt/iitb-english-hindi

In [32]:
raw_datasets = load_dataset("cfilt/iitb-english-hindi")

In [33]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 1659083
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 520
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2507
    })
})

In [78]:
raw_datasets['test']

Dataset({
    features: ['translation'],
    num_rows: 2507
})

#Preprocessing the data

In [35]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [36]:
tokenizer("Hello, this is a sentence!")

{'input_ids': [12110, 2, 90, 23, 19, 8800, 61, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [37]:
tokenizer(["Hello, this is a sentence!", "This is another sentence."])

{'input_ids': [[12110, 2, 90, 23, 19, 8800, 61, 0], [239, 23, 414, 8800, 3, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}

In [38]:
with tokenizer.as_target_tokenizer():
    print(tokenizer(["एक्सेर्साइसर पहुंचनीयता अन्वेषक"]))

{'input_ids': [[26618, 16155, 346, 33383, 0]], 'attention_mask': [[1, 1, 1, 1, 1]]}


In [39]:
max_input_length = 128
max_target_length = 128

source_lang = "en"
target_lang = "hi"


def preprocess_function(examples):
    inputs = [ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [40]:
preprocess_function(raw_datasets["train"][:2])

{'input_ids': [[3872, 85, 2501, 132, 15441, 36398, 0], [32643, 28541, 36253, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1]], 'labels': [[63, 2025, 18, 16155, 346, 20311, 24, 2279, 679, 0], [26618, 16155, 346, 33383, 0]]}

In [41]:
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

In [42]:
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-en-hi.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [43]:
batch_size = 16
learning_rate = 2e-5
weight_decay = 0.01
num_train_epochs = 10

In [44]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

In [45]:
generation_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf", pad_to_multiple_of=128)

In [46]:
# Limit the training dataset to the first 20,000 examples
limited_train_dataset = tokenized_datasets["train"].select(range(100000))

# Prepare the dataset for training
train_dataset = model.prepare_tf_dataset(
    limited_train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)


In [47]:
validation_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    batch_size=batch_size,
    shuffle=False,
    collate_fn=data_collator,
)

In [48]:
generation_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    batch_size=8,
    shuffle=False,
    collate_fn=generation_data_collator,
)

In [49]:
optimizer = AdamWeightDecay(learning_rate=learning_rate, weight_decay_rate=weight_decay)
model.compile(optimizer=optimizer)

In [None]:
model.fit(train_dataset, validation_data=validation_dataset, epochs=2)

Epoch 1/2
Epoch 2/2

# Model Testing

In [100]:
tokenized = tokenizer(raw_datasets['test']['translation'][9]['en'], return_tensors='np')
out = model.generate(**tokenized, max_length=128)
print(out)

tf.Tensor(
[[61949   281  3247    47   664    11   719   200     2    95    18  5091
    396    78   273     5    25    60  6119  1625   604   163  1029    57
   1527     5    61     0 61949]], shape=(1, 29), dtype=int32)


In [101]:
with tokenizer.as_target_tokenizer():
    pred=tokenizer.decode(out[0], skip_special_tokens=True)

In [103]:
print(pred)

इन्हें भी ध्यान में रखते हुए, कुछ को अनुमानित किया जाता है कि यह गाड़ी कितनी अच्छी तरह बंद कर देती है!


In [102]:
test=raw_datasets['test']['translation'][9]['hi']

In [105]:
test

'हजारों मोटर-चालकों ने टेस्ट ड्राइव के लिए पहले ही ब्लैक बॉक्स ले लिया है, जिसमें से कुछ में जी.पी.एस. मॉनीटरिंग है।'

In [98]:
import nltk
from nltk.translate.bleu_score import sentence_bleu

In [104]:
reference_tokens = test.split()
candidate_tokens = pred.split()

# Calculate BLEU score
bleu_score = sentence_bleu([test], pred)

print(f"BLEU score: {bleu_score}")

BLEU score: 0.17346220559819894


In [69]:
model.save_pretrained("tf_model3/")

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61949]], 'forced_eos_token_id': 0}


In [25]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = TFAutoModelForSeq2SeqLM.from_pretrained("tf_model2/")

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at tf_model2/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.
