In [None]:
#import necessary libraries
from google.colab import drive
import json
import numpy as np
import pandas as pd
import tensorflow as tf
import spacy
from collections import Counter
import torch
import torchtext
import torch.nn as nn
import torch.optim as optim
import random

In [None]:
!pip install torchtext==0.6.0 --quiet
from torchtext.data import Field, BucketIterator

In [None]:
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [None]:
# Read JSON data from file
with open('/gdrive/MyDrive/train_data1.json', 'r') as file:
    data = json.load(file)

In [None]:
# Read JSON data from file
with open('/gdrive/MyDrive/test_data1_final.json', 'r') as file:
    test_data = json.load(file)

In [None]:
import string

# Define the remove_punctuations function
def remove_punctuations(sentence):
    punctuations = list(string.punctuation)
    punctuations.append('।')
    punctuations.append('৷')
    punctuations.append('’')
    punctuations.append('‘')
    cleaned = ""
    for letter in sentence:
        if letter not in punctuations:
            cleaned += letter
    return cleaned


In [None]:
import re

# Define a function to check if a sentence contains English words
def contains_english_words(sentence):
    return bool(re.search(r'[a-zA-Z]', sentence))

In [None]:
# Process JSON data
source_sentences_train = []
target_sentences_train = []
len_bengali = []
len_english = []

id_train = []

for language_pair, language_data in data.items():
    if(language_pair == "English-Bengali"):
      print(f"Language Pair: {language_pair}")
      for data_type, data_entries in language_data.items():
          print(f"  Data Type: {data_type}")
          for entry_id, entry_data in data_entries.items():
              source = entry_data["source"].lower()
              target = entry_data["target"].lower()
              source = remove_punctuations(source)
              target = remove_punctuations(target)
              if not contains_english_words(target):
                source_sentences_train.append(source)
                target_sentences_train.append(target)
                id_train.append(entry_id)
                len_bengali.append(len(source.split(' ')))
                len_english.append(len(target.split(' ')))

Language Pair: English-Bengali
  Data Type: Train


In [None]:
# Calculate the maximum length of bengali and English sentences
max_length_bengali = max(len_bengali)
max_length_english = max(len_english)

print(f"Maximum Length of bengali Sentence: {max_length_bengali} words")
print(f"Maximum Length of English Sentence: {max_length_english} words")

Maximum Length of bengali Sentence: 100 words
Maximum Length of English Sentence: 84 words


In [None]:
len(target_sentences_train)

68849

In [None]:
test_source = []
valid_id = []
len_val=[]


for language_pair, language_data in test_data.items():
    if(language_pair == "English-Bengali"):
      print(f"Language Pair: {language_pair}")
      for data_type, data_entries in language_data.items():
          print(f"  Data Type: {data_type}")
          for entry_id, entry_data in data_entries.items():
              source = entry_data["source"].lower()
              source = remove_punctuations(source)
              test_source.append(source)
              valid_id.append(entry_id)
              len_val.append(len(source.split(' ')))

Language Pair: English-Bengali
  Data Type: Validation


In [None]:
test_source[1]

'on one side is the spiti valley and to the other are numerous c b chandrabhaga  range peaks'

In [None]:
print(max(len_val))

175


In [None]:
nlp_en = spacy.load('en_core_web_sm')
# Tokenize English sentences
tokenized_source_sentences = []
for sentence in source_sentences_train:
    doc = nlp_en(sentence)
    tokens = [token.text for token in doc]
    tokenized_source_sentences.append(tokens)

In [None]:
tokenized_test_source = []
for sentence in test_source:
    doc = nlp_en(sentence)
    tokens = [token.text for token in doc]
    tokenized_test_source.append(tokens)

In [None]:
pip install bnlp_toolkit



In [None]:
from bnlp import NLTKTokenizer
bnlp_tokenizer = NLTKTokenizer()
tokenized_target_sentences=[]
for sentence in target_sentences_train:
  token_sen = bnlp_tokenizer.word_tokenize(sentence)
  tokenized_target_sentences.append(token_sen)

In [None]:
from torchtext.data import Dataset, Example, Field

def tokenize_english(text):
    return text

def tokenize_bengali(text):
    return text

class CustomTranslationDataset(Dataset):
    def __init__(self, source_sentences, target_sentences, source_field, target_field):
        fields = [('source', source_field), ('target', target_field)]
        examples = []
        for src, tgt in zip(source_sentences, target_sentences):
            examples.append(Example.fromlist([src, tgt], fields))
        super().__init__(examples, fields)

# Create Fields for English (source) and bengali (target) text
english = Field(tokenize=tokenize_english,
                lower=True,
                init_token="<sos>",
                eos_token="<eos>")

bengali = Field(tokenize=tokenize_bengali,
              lower=True,
              init_token="<sos>",
              eos_token="<eos>")


train_dataset = CustomTranslationDataset(tokenized_source_sentences, tokenized_target_sentences, english, bengali)

english.build_vocab(train_dataset,max_size=30000, min_freq=3)
bengali.build_vocab(train_dataset,max_size=30000, min_freq=3)

In [None]:
print(len(english.vocab), len(bengali.vocab))

20608 26261


In [None]:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE = 32

train_iterator = BucketIterator(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    sort_key=lambda x: len(x.source),
    device=device
)


In [None]:
a = list(english.vocab.__dict__.values())
word_2_idx = dict(a[3])
idx_2_word = {}
for i,j in word_2_idx.items():
  idx_2_word[j] = i

In [None]:
class EncoderLSTM(nn.Module):
  def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
    super(EncoderLSTM, self).__init__()
    self.hidden_size = hidden_size

    self.num_layers = num_layers

    self.dropout = nn.Dropout(p)
    self.tag = True

    self.embedding = nn.Embedding(input_size, embedding_size)
    self.LSTM = nn.LSTM(embedding_size, hidden_size, num_layers, dropout = p)

  def forward(self, x):

    embedding = self.dropout(self.embedding(x))
    outputs, (hidden_state, cell_state) = self.LSTM(embedding)
    return hidden_state, cell_state


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

input_size_encoder = len(english.vocab)
encoder_embedding_size = 300
hidden_size = 1024
num_layers = 2
encoder_dropout = 0.5

encoder_lstm = EncoderLSTM(input_size_encoder, encoder_embedding_size,
                           hidden_size, num_layers, encoder_dropout).to(device)
print(encoder_lstm)

EncoderLSTM(
  (dropout): Dropout(p=0.5, inplace=False)
  (embedding): Embedding(20608, 300)
  (LSTM): LSTM(300, 1024, num_layers=2, dropout=0.5)
)


In [None]:
class DecoderLSTM(nn.Module):
  def __init__(self, input_size, embedding_size, hidden_size, num_layers, p, output_size):
    super(DecoderLSTM, self).__init__()
    self.hidden_size = hidden_size

    self.num_layers = num_layers

    self.output_size = output_size

    self.dropout = nn.Dropout(p)

    self.embedding = nn.Embedding(input_size, embedding_size)

    self.LSTM = nn.LSTM(embedding_size, hidden_size, num_layers, dropout = p)
    self.fc = nn.Linear(hidden_size, output_size)


  def forward(self, x, hidden_state, cell_state):
    x = x.unsqueeze(0)
    embedding = self.dropout(self.embedding(x))
    outputs, (hidden_state, cell_state) = self.LSTM(embedding, (hidden_state, cell_state))
    predictions = self.fc(outputs)
    predictions = predictions.squeeze(0)
    return predictions, hidden_state, cell_state

In [None]:
input_size_decoder = len(bengali.vocab)
decoder_embedding_size = 300
hidden_size = 1024
num_layers = 2
decoder_dropout = 0.5
output_size = len(bengali.vocab)

decoder_lstm = DecoderLSTM(input_size_decoder, decoder_embedding_size,
                           hidden_size, num_layers, decoder_dropout, output_size).to(device)
print(decoder_lstm)

DecoderLSTM(
  (dropout): Dropout(p=0.5, inplace=False)
  (embedding): Embedding(26261, 300)
  (LSTM): LSTM(300, 1024, num_layers=2, dropout=0.5)
  (fc): Linear(in_features=1024, out_features=26261, bias=True)
)


In [None]:
for batch in train_iterator:
  print(batch.source.shape)
  print(batch.target.shape)
  break

x = batch.target[1]
print(x)

torch.Size([37, 32])
torch.Size([40, 32])
tensor([ 6755,  3114,  5376,   754,     6,   222,   275,    80,   283,  4351,
         3930,     6,   178,  1470,    42,    45,   164,     0,    96,     0,
         1188,  4999,     0,   126, 19340,     6,   590, 12934,  1533, 11097,
          201,     0], device='cuda:0')


In [None]:

class Seq2Seq(nn.Module):
  def __init__(self, Encoder_LSTM, Decoder_LSTM):
    super(Seq2Seq, self).__init__()
    self.Encoder_LSTM = Encoder_LSTM
    self.Decoder_LSTM = Decoder_LSTM

  def forward(self, source, target, tfr=0.5):
    batch_size = source.shape[1]
    target_len = target.shape[0]
    target_vocab_size = len(bengali.vocab)
    outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)
    hidden_state, cell_state = self.Encoder_LSTM(source)

    x = target[0]

    for i in range(1, target_len):
      output, hidden_state, cell_state = self.Decoder_LSTM(x, hidden_state, cell_state)
      outputs[i] = output
      best_guess = output.argmax(1)
      x = target[i] if random.random() < tfr else best_guess
    return outputs

In [None]:

# Hyperparameters
learning_rate = 0.001
step = 0

model = Seq2Seq(encoder_lstm, decoder_lstm).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

pad_idx = bengali.vocab.stoi[""]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [None]:
model

Seq2Seq(
  (Encoder_LSTM): EncoderLSTM(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(20608, 300)
    (LSTM): LSTM(300, 1024, num_layers=2, dropout=0.5)
  )
  (Decoder_LSTM): DecoderLSTM(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(26261, 300)
    (LSTM): LSTM(300, 1024, num_layers=2, dropout=0.5)
    (fc): Linear(in_features=1024, out_features=26261, bias=True)
  )
)

In [None]:
def pred_trans(model, tokens, english, bengali, device, max_length=175):
    tokens.insert(0, english.init_token)
    tokens.append(english.eos_token)
    text_to_indices = [english.vocab.stoi[token] for token in tokens]
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    # Build encoder hidden, cell state
    with torch.no_grad():
        hidden, cell = model.Encoder_LSTM(sentence_tensor)

    outputs = [bengali.vocab.stoi[""]]

    for _ in range(max_length):
        previous_word = torch.LongTensor([outputs[-1]]).to(device)

        with torch.no_grad():
            output, hidden, cell = model.Decoder_LSTM(previous_word, hidden, cell)
            best_guess = output.argmax(1).item()

        outputs.append(best_guess)

        if output.argmax(1).item() == bengali.vocab.stoi[""]:
            break

    translated_sentence = [bengali.vocab.itos[idx] for idx in outputs]
    return translated_sentence[1:]

In [None]:
import time
from tqdm import tqdm
epoch_loss = 0.0
num_epochs = 10
sentence1 = "on one side is the spiti valley and to the other are numerous chandrabhaga range peaks"


for epoch in range(num_epochs):
  start_time = time.time()
  print("Epoch - {} / {}".format(epoch+1, num_epochs))
  model.eval()
  doc = nlp_en(sentence1)
  tok = [token.text for token in doc]
  translated_sentence1 = pred_trans(model, tok, english, bengali, device, max_length=128)
  print(f"Translated example sentence 1: \n {translated_sentence1}")

  model.train(True)
  for batch_idx, batch in tqdm(enumerate(train_iterator), total=len(train_iterator)):
    input = batch.source.to(device)
    target = batch.target.to(device)

    # Pass the input and target for model's forward method
    output = model(input, target)
    output = output[1:].reshape(-1, output.shape[2])
    target = target[1:].reshape(-1)

    # Clear the accumulating gradients
    optimizer.zero_grad()
    loss = criterion(output, target)
    loss.backward()

    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

    # Update the weights values using the gradients we calculated using bp
    optimizer.step()
    step += 1
    epoch_loss += loss.item()

  # Calculate the time taken for the epoch
  end_time = time.time()
  epoch_time = end_time - start_time
  print(f"Time taken for epoch {epoch + 1}: {epoch_time:.2f} seconds")

  print("Epoch_Loss - {}".format(loss.item()))
  print()

print(epoch_loss / len(train_iterator))

Epoch - 1 / 10
Translated example sentence 1: 
 ['শিলিগুড়ি', 'বিক্রমকে', 'পারসিয়ান', 'ছাঁচে', 'কখনো', 'নন্দার', 'তাইই', 'তাইই', 'ঝোলা', 'ঝোলা', 'চিনাবাদামের', 'ইনফেকশন', 'থাকবেন', 'গ্যানিমা', 'লাইফস্টাইল', 'লাইফস্টাইল', 'তত্ত্বাবধানে', 'কটরা', 'এলিশান', 'এলিশান', 'এমনকি', 'ফলে', 'ফলে', 'মিথ্যে', 'মিথ্যে', 'লাইফস্টাইল', 'লাইফস্টাইল', 'তত্ত্বাবধানে', 'এলিশান', 'এলিশান', 'এমনকি', 'এমনকি', 'ফলে', 'কখনো', 'মায়ানমার', 'মায়ানমার', 'মায়ানমার', 'হাওয়াতে', 'বালুকাপাথরের', 'বালুকাপাথরের', 'বালুকাপাথরের', 'প্রতিনিধিত্বমূলক', 'পিএইচডির', 'যাদুঘর', 'যাদুঘর', 'যাদুঘর', 'যাদুঘর', 'যাদুঘর', 'রাজস্ব', 'রামায়ণ', 'টিকিটের', 'যোজনার', 'দৈত্য', 'অ্যাসিড', 'টিকিটের', 'অ্যাসিড', 'টিকিটের', 'পরি', 'পরি', 'পরি', 'পুরনো', 'জড়িয়ে', 'নগরহোল', 'আশি', 'শক্তিদায়ক', 'যতক্ষণ', 'গৌতম', 'নন্দার', 'সত্যিকারের', 'ঠাণ্ডায়', 'পেরিস', 'পেরিস', 'উশু', 'উশু', 'উশু', 'উশু', 'ভাবপ্রদর্শন', 'মনোযোগ', 'হিন্দিভাষার', 'হিন্দিভাষার', 'ছিলেন', 'ছিলেন', 'লাওসের', 'পরিবর্তে', 'ছবিটির', 'ইণ্ডিয়ান', 'ইণ্ডিয়ান', 'ম্যাজিস্ট্রেটের', 'ম্

100%|██████████| 2152/2152 [09:07<00:00,  3.93it/s]


Time taken for epoch 1: 549.03 seconds
Epoch_Loss - 5.547065734863281

Epoch - 2 / 10
Translated example sentence 1: 
 ['এই', 'হল', 'এবং', 'এবং', 'এবং', 'এবং', 'এবং', 'এবং', 'এবং', 'এবং', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<

100%|██████████| 2152/2152 [09:06<00:00,  3.94it/s]


Time taken for epoch 2: 546.96 seconds
Epoch_Loss - 2.793395757675171

Epoch - 3 / 10
Translated example sentence 1: 
 ['একটি', 'হল', 'এবং', 'এবং', 'এবং', 'এবং', 'এবং', 'এবং', 'এবং', 'এবং', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', 

100%|██████████| 2152/2152 [09:05<00:00,  3.95it/s]


Time taken for epoch 3: 545.79 seconds
Epoch_Loss - 4.809362888336182

Epoch - 4 / 10
Translated example sentence 1: 
 ['একটি', 'একটি', 'ও', 'একটি', 'ও', 'এবং', 'এবং', 'এবং', 'এবং', 'এবং', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '

100%|██████████| 2152/2152 [09:04<00:00,  3.95it/s]


Time taken for epoch 4: 546.15 seconds
Epoch_Loss - 4.2409772872924805

Epoch - 5 / 10
Translated example sentence 1: 
 ['একটি', 'একটি', 'হল', 'এবং', 'এবং', 'এবং', 'এবং', 'এবং', 'এবং', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad

100%|██████████| 2152/2152 [09:05<00:00,  3.94it/s]


Time taken for epoch 5: 546.27 seconds
Epoch_Loss - 2.325906276702881

Epoch - 6 / 10
Translated example sentence 1: 
 ['একটি', 'সুন্দর', 'এবং', 'এবং', 'এবং', 'এবং', 'এবং', 'এবং', 'এবং', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<p

100%|██████████| 2152/2152 [09:06<00:00,  3.94it/s]


Time taken for epoch 6: 547.32 seconds
Epoch_Loss - 3.6644577980041504

Epoch - 7 / 10
Translated example sentence 1: 
 ['একটি', 'জলপ্রপাত', 'হল', 'একটি', 'এবং', 'এবং', 'এবং', 'এবং', 'এবং', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', 

100%|██████████| 2152/2152 [09:06<00:00,  3.94it/s]


Time taken for epoch 7: 546.87 seconds
Epoch_Loss - 3.4456684589385986

Epoch - 8 / 10
Translated example sentence 1: 
 ['একটি', 'জলপ্রপাত', 'এবং', 'এবং', 'এবং', 'এবং', 'এবং', 'এবং', 'এবং', 'এবং', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<

100%|██████████| 2152/2152 [09:07<00:00,  3.93it/s]


Time taken for epoch 8: 548.61 seconds
Epoch_Loss - 3.19138503074646

Epoch - 9 / 10
Translated example sentence 1: 
 ['একদিকে', 'এবং', 'এবং', 'এবং', 'এবং', 'এবং', 'এবং', 'এবং', 'এবং', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad

100%|██████████| 2152/2152 [09:01<00:00,  3.98it/s]


Time taken for epoch 9: 541.80 seconds
Epoch_Loss - 3.630573034286499

Epoch - 10 / 10
Translated example sentence 1: 
 ['একদিকে', 'এবং', 'এবং', 'এবং', 'এবং', 'এবং', 'এবং', 'এবং', 'এবং', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<p

100%|██████████| 2152/2152 [09:06<00:00,  3.93it/s]

Time taken for epoch 10: 547.72 seconds
Epoch_Loss - 3.5685863494873047

36.99560723269573





In [None]:
s = tokenized_test_source[10]
print(s)
translation = pred_trans(model, s, english, bengali, device, max_length=128)
cleaned_tokens = [token for token in translation if token not in ['<eos>', '<pad>']]
readable_translation = ' '.join(cleaned_tokens)
print(readable_translation)

['by', 'feeding', 'medicines', 'made', 'of', 'sarpgandha', 'patient', 'gets', 'a', 'lot', 'of', 'benefit', 'in', 'malikholia']
রোগীর রোগীর ফলে রোগীর ফলে ফলে ফলে ফলে আরাম পাওয়া যায়


In [None]:
bengali_pred = []

for tokens in tokenized_test_source:
  translation = pred_trans(model, tokens, english, bengali, device, max_length=175)
  cleaned_tokens = [token for token in translation if token not in ['<eos>', '<pad>','।','৷']]
  pred = ' '.join(cleaned_tokens)
  bengali_pred.append(pred)
  print(pred)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
এক ব্রাহ্মণের মতে মহাকাব্যের সেন ছিলেন ব্রোঞ্জ তাঁর কুস্তির ছিলেন যেখানে তাঁর তাঁর
এয়ার নাড়ু কোন কোন ওষুধের ছাড় আছে
প্রাচীন ৩য় সাতবাহন শতাব্দী শতাব্দী ১৪শ শাসক সঙ্গীত শুরু করে
একবার আপনি আপনি একটি একটি পারেন আপনি আপনি এবং এবং এবং বাঁচাতে গিয়ার করতে পারেন
সূর্যে পাতার অংশ গাছের গাছের চিবালে সূর্যের রশ্মিতে হয়
যদিও শিশুরা বাচ্চাদের নিমজ্জিত নিমজ্জিত থেকে এবং এবং এবং না না না এবং না না না এবং
ভাইরাল হল একটি একটি একটি একটি হয় যার মধ্যে ন্যূনতম এবং এবং এবং এবং এবং এবং
অন্যান্য শহরে মধ্যে এলিফ্যান্টা স্থানগুলি হল চম্বল এবং এবং এবং
মেরিন রাষ্ট্রীয় উদ্যান বিমানপথে সদাহরিত সদাহরিত ׀
এটা এটাকে এটাকে এটি এটি এটি দিউকে সাম্রাজ্য দ্বারা
নতুন নতুন নতুন নতুন নতুন শুভ করুন
এই এই না প্রতিদিনের জন্য না না জন্য জন্য জন্য
এর ফলে স্মরণশক্তি পরিবর্তণ করে
মর্ককোর৬ ׃ হাড়ে এবং দাঁতে ঘা আর উর্জা
বৃদ্ধাবস্থার রোগকে অ্যালোপ্যাথি চিকিত্সায় অনুভব হয়
তিনি সিংহের সময় তাঁর উপর দরজায় রান করে এবং এবং এবং এবং খোসায় খোসায় খোসায় এবং এবং এবং একটি একটি এক

In [None]:
import csv

# Create a list of dictionaries where each dictionary represents a row
data = [{'valid_id': valid_id[i], 'bengali_pred': bengali_pred[i]} for i in range(len(valid_id))]

# Specify the CSV file path
csv_file_path = '/gdrive/MyDrive/bengali3.csv'

# Define the column names
fields = ['valid_id', 'bengali_pred']

# Write the data to the CSV file
with open(csv_file_path, 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fields)
    writer.writeheader()
    writer.writerows(data)

print(f'Saved predictions to {csv_file_path}')

Saved predictions to /gdrive/MyDrive/bengali2.csv


In [None]:
print(len(bengali_pred))
print(len(test_source))

9836
9836
