In [18]:
import pandas as pd
import torch
import transformers
from transformers import MarianMTModel
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

In [19]:
# Read the English/Hindi sentence-pair CSV and preview its first rows.
corpus_csv = '/content/Hindi_English_Truncated_Corpus.csv'
df = pd.read_csv(corpus_csv)
df.head()

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,indic2012,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,indic2012,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [20]:
# Keep only the TED-sourced sentence pairs.
# - One .loc call selects rows and columns together, avoiding chained indexing.
# - dropna() discards pairs with a missing sentence, so a non-string NaN can
#   never be handed to the tokenizer later on.
df_ted = df.loc[df['source'] == 'ted', ['english_sentence', 'hindi_sentence']].dropna()
df_ted.head()

Unnamed: 0,english_sentence,hindi_sentence
0,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
3,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
7,"And who are we to say, even, that they are wrong",और हम होते कौन हैं यह कहने भी वाले कि वे गलत हैं
13,So there is some sort of justice,तो वहाँ न्याय है


In [28]:
class CSVtranslationdatast(Dataset):
  """Map-style dataset of tokenized English -> Hindi sentence pairs.

  Every item is built from one row of ``dataframe`` (columns
  ``english_sentence`` and ``hindi_sentence``) and is padded/truncated to
  ``max_length`` tokens.

  Args:
    dataframe: frame holding the two sentence columns. Rows are addressed by
      position (``iloc``), so a non-contiguous index is fine.
    tokenizer: callable tokenizer accepting ``text_target=`` and returning a
      mapping with ``input_ids`` and ``attention_mask`` tensors.
    max_length: token length used for padding and truncation.
  """

  def __init__(self, dataframe, tokenizer, max_length=32):
    self.data = dataframe
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    """Return the number of sentence pairs."""
    return len(self.data)

  def __getitem__(self, idx):
    """Tokenize the pair at position ``idx``.

    Returns:
      dict with ``src_input_ids``, ``src_attention_mask``, ``tgt_input_ids``
      and ``tgt_attention_mask``, each with the leading batch axis removed.
    """
    # Fetch the row once instead of once per column.
    row = self.data.iloc[idx]
    encode_kwargs = dict(
        padding='max_length',
        truncation=True,
        max_length=self.max_length,
        return_tensors='pt',
    )

    src = self.tokenizer(row['english_sentence'], **encode_kwargs)
    # Marian tokenizers keep separate source/target SentencePiece models;
    # text_target routes the Hindi sentence through the target-side one
    # instead of encoding it as if it were English input.
    tgt = self.tokenizer(text_target=row['hindi_sentence'], **encode_kwargs)

    # squeeze(0) removes only the batch axis; a bare squeeze() would also
    # collapse the sequence axis whenever max_length == 1.
    return {
            "src_input_ids": src['input_ids'].squeeze(0),
            "src_attention_mask": src['attention_mask'].squeeze(0),
            "tgt_input_ids": tgt['input_ids'].squeeze(0),
            "tgt_attention_mask": tgt['attention_mask'].squeeze(0),
        }

In [24]:
# Both the tokenizer and the model come from the same pretrained checkpoint.
checkpoint = "Helsinki-NLP/opus-mt-en-hi"

# Tokenizer -> dataset over the TED pairs -> shuffled batches of 32.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
dataset = CSVtranslationdatast(df_ted, tokenizer)
dataloader = DataLoader(dataset, shuffle=True, batch_size=32)

model = MarianMTModel.from_pretrained(checkpoint)

pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [25]:
# Put the model in evaluation mode before running inference. As the cell's
# last expression, the returned module is displayed as the architecture repr.
model.eval()

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(61950, 512, padding_idx=61949)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(61950, 512, padding_idx=61949)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

In [33]:
# Smoke test: translate a single English sentence with the loaded checkpoint.
sample_text = "How are you so cute?"
input_ids = tokenizer.encode(
    sample_text,
    max_length=32,
    truncation=True,
    padding=True,
    return_tensors="pt",
)

# Generate the translation, then decode the first returned sequence without
# its special tokens.
output = model.generate(input_ids=input_ids, max_length=32)
translated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(
    f"English Input: {sample_text}",
    f"Hindi Output: {translated_text}",
    sep="\n",
)

English Input: How are you so cute?
Hindi Output: तुम इतने प्यारा कैसे हो?
