In [None]:
from google.colab import drive
import json
import numpy as np
import pandas as pd
import spacy
from collections import Counter
import torch
import torchtext
import torch.nn as nn
import torch.optim as optim
import random

In [None]:
# Install the pinned torchtext release (0.6.0) and pull in the Field / BucketIterator
# classes that the rest of the notebook builds its data pipeline on.
# NOTE(review): `torchtext` is already imported in the first cell, before this install
# runs; on a fresh runtime confirm that the pinned version is the one actually loaded
# (a runtime restart after installing may be needed).
!pip install torchtext==0.6.0 --quiet
from torchtext.data import Field, BucketIterator

In [None]:
# Mount Google Drive at /gdrive; the dataset files are read from /gdrive/MyDrive below.
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [None]:
# Read the training JSON from Google Drive.
# The encoding is stated explicitly: the file holds Malayalam text, and without it
# open() falls back to whatever the platform's default encoding happens to be.
with open('/gdrive/MyDrive/train_data1.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [None]:
# Read the validation JSON from Google Drive.
# The encoding is stated explicitly: the file holds non-ASCII text, and without it
# open() falls back to whatever the platform's default encoding happens to be.
with open('/gdrive/MyDrive/val_data1 (1).json', 'r', encoding='utf-8') as file:
    validation_data = json.load(file)

In [None]:
import string

# Translation table that deletes ASCII punctuation, the two danda characters
# and the curly single quotes. It is built once instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation + '।৷’‘')


def remove_punctuations(sentence):
    """Return ``sentence`` with every punctuation character removed.

    The removed characters are ``string.punctuation`` plus '।', '৷', '’' and '‘'.
    All other characters, including whitespace and digits, are kept in order.

    Args:
        sentence: the text to clean.

    Returns:
        A new string without the punctuation characters ('' for empty input).
    """
    # str.translate drops the mapped characters in a single linear pass, replacing
    # the per-character list scan and repeated string concatenation.
    return sentence.translate(_PUNCTUATION_TABLE)


In [None]:
import re

# Detect Latin-script letters inside a sentence
def contains_english_words(sentence):
    """Return True when ``sentence`` holds at least one ASCII letter (a-z or A-Z)."""
    latin_match = re.search(r'[a-zA-Z]', sentence)
    return latin_match is not None

In [None]:
# Build the parallel training corpus from the "English-Malayalam" section of the JSON.
# `source` is the English sentence and `target` the Malayalam one; both are
# lower-cased and stripped of punctuation before being stored.
source_sentences_train = []
target_sentences_train = []
len_malyalam = []
len_english = []

id_train = []

for language_pair, language_data in data.items():
    if language_pair != "English-Malayalam":
        continue
    print(f"Language Pair: {language_pair}")
    for data_type, data_entries in language_data.items():
        print(f"  Data Type: {data_type}")
        for entry_id, entry_data in data_entries.items():
            source = remove_punctuations(entry_data["source"].lower())
            target = remove_punctuations(entry_data["target"].lower())
            # Drop pairs whose Malayalam side still contains Latin letters.
            if contains_english_words(target):
                continue
            source_sentences_train.append(source)
            target_sentences_train.append(target)
            id_train.append(entry_id)
            # Word counts per language. The source is English and the target is
            # Malayalam; the two lists used to be filled the other way round.
            len_english.append(len(source.split(' ')))
            len_malyalam.append(len(target.split(' ')))

Language Pair: English-Malayalam
  Data Type: Train


In [None]:
# Longest sentence (in space-separated words) recorded in each length list
max_length_malyalam, max_length_english = max(len_malyalam), max(len_english)

print(f"Maximum Length of malyalam Sentence: {max_length_malyalam} words")
print(f"Maximum Length of English Sentence: {max_length_english} words")

Maximum Length of malyalam Sentence: 107 words
Maximum Length of English Sentence: 108 words


In [None]:
# Number of training pairs kept after filtering.
len(target_sentences_train)

52562

In [None]:
# Collect the validation sentences; only the "source" side of each entry is read.
validation_source, valid_id, len_val = [], [], []


for language_pair, language_data in validation_data.items():
    if language_pair != "English-Malayalam":
        continue
    print(f"Language Pair: {language_pair}")
    for data_type, data_entries in language_data.items():
        print(f"  Data Type: {data_type}")
        for entry_id, entry_data in data_entries.items():
            # Same cleaning as the training sources: lower-case, then strip punctuation.
            source = remove_punctuations(entry_data["source"].lower())
            validation_source.append(source)
            valid_id.append(entry_id)
            len_val.append(len(source.split(' ')))

Language Pair: English-Malayalam
  Data Type: Validation


In [None]:
# Number of validation sentences collected.
len(validation_source)

7723

In [None]:
# Longest validation sentence, in space-separated words.
print(max(len_val))

76


In [None]:
nlp_en = spacy.load('en_core_web_sm')
# Tokenize English sentences.
# Only the token texts are used, so the sentences are streamed through the tokenizer
# alone instead of running the full pipeline on each one.
tokenized_source_sentences = [
    [token.text for token in doc]
    for doc in nlp_en.tokenizer.pipe(source_sentences_train)
]

In [None]:
# Tokenize the validation sentences with the same English tokenizer.
# Only the token texts are used, so the sentences are streamed through the tokenizer
# alone instead of running the full pipeline on each one.
tokenized_validation_source = [
    [token.text for token in doc]
    for doc in nlp_en.tokenizer.pipe(validation_source)
]

In [None]:
# Install NLTK (imported in the Malayalam tokenization cell below).
# NOTE(review): the version is not pinned — consider pinning for reproducibility.
!pip install nltk



In [None]:
# Install the Indic NLP library, which provides `indicnlp.tokenize.indic_tokenize`.
# NOTE(review): the version is not pinned — consider pinning for reproducibility.
pip install indic-nlp-library



In [None]:
import nltk

from indicnlp.tokenize import indic_tokenize


# Tokenize malyalam sentences.
# No NLTK download is performed here: "indic_nltk" is not a package in the NLTK index
# (requesting it only logs an error), and the tokenizer used below comes from indicnlp.
tokenized_target_sentences = [
    indic_tokenize.trivial_tokenize(sentence)
    for sentence in target_sentences_train
]

[nltk_data] Error loading indic_nltk: Package 'indic_nltk' not found
[nltk_data]     in index


In [None]:
# Show one cleaned Malayalam sentence next to its token list as a sanity check.
print(target_sentences_train[5])
print(tokenized_target_sentences[5])

ആദ്യ വാരാന്ത്യത്തിൽ ഈ ചിത്രം 7 ദശലക്ഷം രൂപ 92000 യുഎസ് ഡോളർ നേടി
['ആദ്യ', 'വാരാന്ത്യത്തിൽ', 'ഈ', 'ചിത്രം', '7', 'ദശലക്ഷം', 'രൂപ', '92000', 'യുഎസ്', 'ഡോളർ', 'നേടി']


In [None]:
from torchtext.data import Dataset, Example, Field

# Define your custom tokenizer function for English text
def tokenize_english(text):
    """Identity tokenizer given to the English Field: returns ``text`` unchanged.

    NOTE(review): the dataset is built from sentences that are already token lists,
    so presumably no further splitting is wanted here — confirm.
    """
    return text

def tokenize_malyalam(text):
    """Identity tokenizer given to the Malayalam Field: returns ``text`` unchanged.

    NOTE(review): the dataset is built from sentences that are already token lists,
    so presumably no further splitting is wanted here — confirm.
    """
    return text

# Translation dataset assembled from parallel lists of sentences
class CustomTranslationDataset(Dataset):
    """Pair source and target sentences as torchtext examples.

    Every example exposes a ``source`` attribute handled by ``source_field`` and a
    ``target`` attribute handled by ``target_field``.  The two sentence lists are
    paired with ``zip``, so surplus items on the longer list are ignored.
    """

    def __init__(self, source_sentences, target_sentences, source_field, target_field):
        fields = [('source', source_field), ('target', target_field)]
        examples = [
            Example.fromlist([src_tokens, tgt_tokens], fields)
            for src_tokens, tgt_tokens in zip(source_sentences, target_sentences)
        ]
        super().__init__(examples, fields)
# Fields for the English (source) and malyalam (target) text. Both lower-case
# their tokens and wrap each sequence in <sos> ... <eos>.
english = Field(tokenize=tokenize_english, lower=True, init_token="<sos>", eos_token="<eos>")
malyalam = Field(tokenize=tokenize_malyalam, lower=True, init_token="<sos>", eos_token="<eos>")

train_dataset = CustomTranslationDataset(
    source_sentences=tokenized_source_sentences,
    target_sentences=tokenized_target_sentences,
    source_field=english,
    target_field=malyalam,
)

# One vocabulary per Field, both with the same size cap and frequency cut-off
english.build_vocab(train_dataset, max_size=30000, min_freq=3)
malyalam.build_vocab(train_dataset, max_size=30000, min_freq=3)



In [None]:
# Vocabulary sizes: English first, then Malayalam.
print(len(english.vocab), len(malyalam.vocab))

15784 25404


In [None]:

# Use the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE = 32


def _source_length(example):
    """Sort key for bucketing: token count of the example's source sentence."""
    return len(example.source)


# Batches group sentences of similar source length and are sorted within each batch.
train_iterator = BucketIterator(
    train_dataset,
    batch_size=BATCH_SIZE,
    sort_key=_source_length,
    sort_within_batch=True,
    device=device,
)


In [None]:
# Inspect a single training batch.
# The first batch is taken directly instead of looping over the whole iterator,
# and it is bound to its own name so the JSON dict `data` is not overwritten.
sample_batch = next(iter(train_iterator))

print("Shapes", sample_batch.source.shape, sample_batch.target.shape)
print()
print("English - ", *sample_batch.source, " Length - ", len(sample_batch.source))
print()
print("malyalam - ", *sample_batch.target, " Length - ", len(sample_batch.target))
temp_eng = sample_batch.source
temp_malyalam = sample_batch.target


Shapes torch.Size([6, 32]) torch.Size([13, 32])

English -  tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2], device='cuda:0') tensor([   91,   166,  1899,   157,   249,   268,   591,  2565,   431,    14,
          479,    29,     0,    91,     8,  2436,   104,   168, 11559,  2150,
          668,     8,   156, 14010,   264,   264,    67,  4170,   110,   395,
           91,  9298], device='cuda:0') tensor([   8,   55,  384,   55,    6,   80, 1024,    0, 1828,   22,   52,   53,
        1325,   10,    4,   80,   15,  429,   15,    5,   15,   17,   10,    4,
        1089,  163,   28,   52,   52,   12,   23,   46], device='cuda:0') tensor([  616,    68,     6,  1112,   287,     4,     6,   280, 10265,   414,
          483,   205,  2674,  1493,   678,   509,  9171,   451,    25,  2607,
           25,  1075,   196,  9665,    10,     6,    92,  1819,   385,    81,
          222,    12], device='cuda:0') tensor([  94, 2170,   17,  7

In [None]:
# Index tensors of the sampled batch as NumPy arrays on the host
temp_eng_idx = temp_eng.detach().cpu().numpy()
temp_malyalam_idx = temp_malyalam.detach().cpu().numpy()

In [None]:

# Summarise the vocabulary object rather than printing every attribute in full:
# dumping the complete word tables exceeds the notebook's output rate limit.
print(english.vocab.__dict__.keys())
e = list(english.vocab.__dict__.values())  # retained for any cell that still reads `e`
for attr_name, attr_value in english.vocab.__dict__.items():
    # Sized attributes are reported by length; scalars and None are shown as they are.
    summary = len(attr_value) if hasattr(attr_value, '__len__') else attr_value
    print(attr_name, type(attr_value).__name__, summary)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:

# English word -> index mapping, read from the vocabulary's `stoi` attribute by name
# rather than by its position among the vocabulary object's attributes.
word_2_idx = dict(english.vocab.stoi)
# Inverse mapping: index -> word.
idx_2_word = {index: word for word, index in word_2_idx.items()}


In [None]:
class EncoderLSTM(nn.Module):
  """Embed a source token sequence and summarise it with a stacked LSTM.

  Args:
    input_size: number of rows in the embedding table (source vocabulary size).
    embedding_size: width of each token embedding.
    hidden_size: width of the LSTM hidden and cell states.
    num_layers: number of stacked LSTM layers.
    p: dropout probability, used on the embeddings and between LSTM layers.
  """

  def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
    super().__init__()
    self.hidden_size, self.num_layers = hidden_size, num_layers

    # Regularisation applied to the embedded tokens
    self.dropout = nn.Dropout(p)
    self.tag = True

    self.embedding = nn.Embedding(input_size, embedding_size)
    self.LSTM = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)

  def forward(self, x):
    """Encode ``x`` (token indices, sequence-first) and return the final states.

    Returns:
      ``(hidden_state, cell_state)`` of the LSTM, each shaped
      (num_layers, batch, hidden_size). The per-step outputs are discarded.
    """
    embedded = self.dropout(self.embedding(x))
    _, final_states = self.LSTM(embedded)
    hidden_state, cell_state = final_states
    return hidden_state, cell_state


In [None]:
# Encoder configuration and instantiation
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

input_size_encoder = len(english.vocab)
encoder_embedding_size = 300
hidden_size = 1024
num_layers = 2
encoder_dropout = 0.5

encoder_lstm = EncoderLSTM(
    input_size=input_size_encoder,
    embedding_size=encoder_embedding_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    p=encoder_dropout,
)
encoder_lstm = encoder_lstm.to(device)
print(encoder_lstm)

EncoderLSTM(
  (dropout): Dropout(p=0.5, inplace=False)
  (embedding): Embedding(15784, 300)
  (LSTM): LSTM(300, 1024, num_layers=2, dropout=0.5)
)


In [None]:
class DecoderLSTM(nn.Module):
  """Single-step LSTM decoder: previous target token in, next-token scores out.

  Args:
    input_size: number of rows in the embedding table (target vocabulary size).
    embedding_size: width of each token embedding.
    hidden_size: width of the LSTM hidden and cell states.
    num_layers: number of stacked LSTM layers.
    p: dropout probability, used on the embeddings and between LSTM layers.
    output_size: number of scores produced per step (one per target token).
  """

  def __init__(self, input_size, embedding_size, hidden_size, num_layers, p, output_size):
    super().__init__()

    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.output_size = output_size

    # Regularisation applied to the embedded token
    self.dropout = nn.Dropout(p)

    # Token index -> embedding vector
    self.embedding = nn.Embedding(input_size, embedding_size)

    # Stacked LSTM advanced one time step per forward call
    self.LSTM = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)

    # Projects each LSTM output onto the target vocabulary
    self.fc = nn.Linear(hidden_size, output_size)

  def forward(self, x, hidden_state, cell_state):
    """Advance the decoder by one time step.

    Args:
      x: previous token indices, shape (batch,).
      hidden_state, cell_state: LSTM states, each (num_layers, batch, hidden_size).

    Returns:
      ``(predictions, hidden_state, cell_state)`` where ``predictions`` has shape
      (batch, output_size) and the states are the updated LSTM states.
    """
    # Add a sequence axis of length one: (batch,) -> (1, batch)
    step_input = x.unsqueeze(0)

    # (1, batch, embedding_size)
    embedded = self.dropout(self.embedding(step_input))

    # step_output: (1, batch, hidden_size)
    step_output, (hidden_state, cell_state) = self.LSTM(embedded, (hidden_state, cell_state))

    # (1, batch, output_size) -> (batch, output_size)
    predictions = self.fc(step_output).squeeze(0)

    return predictions, hidden_state, cell_state

In [None]:
# Decoder configuration and instantiation; input and output both span the Malayalam vocabulary
input_size_decoder = output_size = len(malyalam.vocab)
decoder_embedding_size = 300
hidden_size = 1024
num_layers = 2
decoder_dropout = 0.5

decoder_lstm = DecoderLSTM(
    input_size=input_size_decoder,
    embedding_size=decoder_embedding_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    p=decoder_dropout,
    output_size=output_size,
)
decoder_lstm = decoder_lstm.to(device)
print(decoder_lstm)

DecoderLSTM(
  (dropout): Dropout(p=0.5, inplace=False)
  (embedding): Embedding(25404, 300)
  (LSTM): LSTM(300, 1024, num_layers=2, dropout=0.5)
  (fc): Linear(in_features=1024, out_features=25404, bias=True)
)


In [None]:

# Look at the first training batch only
batch = next(iter(train_iterator))
print(batch.source.shape)
print(batch.target.shape)

# Second row of the target tensor, i.e. the tokens at position 1 for every sentence in the batch
x = batch.target[1]
print(x)

torch.Size([25, 32])
torch.Size([21, 32])
tensor([ 5648,  1562,     0,     0,    44,  3549,  3784,  1824, 14294,   880,
            5,     5, 23044, 23280,  5939,     0,  5571,   848,   481,     0,
            0,     0,  1598,     0,    52,  5840,  2419,  2723,   176,     5,
         2994,    41], device='cuda:0')


In [None]:

class Seq2Seq(nn.Module):
  """Encoder-decoder wrapper that decodes step by step with teacher forcing.

  Args:
    Encoder_LSTM: module mapping a source batch to ``(hidden_state, cell_state)``.
    Decoder_LSTM: module mapping ``(tokens, hidden_state, cell_state)`` to
      ``(scores, hidden_state, cell_state)``; it must expose ``output_size``.
  """

  def __init__(self, Encoder_LSTM, Decoder_LSTM):
    super(Seq2Seq, self).__init__()
    self.Encoder_LSTM = Encoder_LSTM
    self.Decoder_LSTM = Decoder_LSTM

  def forward(self, source, target, tfr=0.5):
    """Score every target position given ``source``.

    Args:
      source: source token indices, shape (source_len, batch).
      target: target token indices, shape (target_len, batch); row 0 is the
        start token fed to the decoder first.
      tfr: teacher-forcing ratio, the probability of feeding the reference
        token rather than the model's own prediction at each step.

    Returns:
      Tensor of shape (target_len, batch, vocab_size). Row 0 is left at zero
      because no prediction is made for the start token.
    """
    batch_size = source.shape[1]
    target_len = target.shape[0]

    # Take the vocabulary size and the device from the model and its input instead
    # of notebook globals, so the module does not depend on names defined elsewhere.
    target_vocab_size = self.Decoder_LSTM.output_size
    outputs = torch.zeros(target_len, batch_size, target_vocab_size, device=source.device)

    # Final encoder states initialise the decoder.
    hidden_state, cell_state = self.Encoder_LSTM(source)

    # First decoder input: the start token of every sentence.
    x = target[0]

    for i in range(1, target_len):
      # output: (batch, vocab_size)
      output, hidden_state, cell_state = self.Decoder_LSTM(x, hidden_state, cell_state)
      outputs[i] = output
      best_guess = output.argmax(1)  # highest-scoring token per sentence
      # Feed either the reference token or the model's own guess to the next step.
      x = target[i] if random.random() < tfr else best_guess

    return outputs

In [None]:

# Hyperparameters

learning_rate = 0.001
step = 0

model = Seq2Seq(encoder_lstm, decoder_lstm).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Padding positions must not contribute to the loss. The index is looked up through
# the Field's own pad token: an empty-string key is not in the vocabulary, so it
# resolves to the unknown-token index and would mask <unk> targets instead of padding.
pad_idx = malyalam.vocab.stoi[malyalam.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [None]:
# Display the assembled encoder-decoder architecture.
model

Seq2Seq(
  (Encoder_LSTM): EncoderLSTM(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(15784, 300)
    (LSTM): LSTM(300, 1024, num_layers=2, dropout=0.5)
  )
  (Decoder_LSTM): DecoderLSTM(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(25404, 300)
    (LSTM): LSTM(300, 1024, num_layers=2, dropout=0.5)
    (fc): Linear(in_features=1024, out_features=25404, bias=True)
  )
)

In [None]:
def _load_english_tokenizer():
    """Return the spaCy English pipeline, loading it on first use only."""
    if _load_english_tokenizer.cached is None:
        _load_english_tokenizer.cached = spacy.load('en_core_web_sm')
    return _load_english_tokenizer.cached


_load_english_tokenizer.cached = None


def translate_sentence(model, sentence, english, malyalam, device, max_length=108):
    """Greedily translate one English sentence into Malayalam tokens.

    Args:
        model: object exposing ``Encoder_LSTM`` and ``Decoder_LSTM`` callables.
        sentence: a raw string (tokenized here with spaCy) or a list of tokens.
        english: source Field; supplies ``init_token``, ``eos_token`` and ``vocab``.
        malyalam: target Field; supplies ``init_token``, ``eos_token`` and ``vocab``.
        device: device on which the index tensors are created.
        max_length: maximum number of decoding steps.

    Returns:
        The predicted tokens without the leading start token. The list ends with
        the end-of-sentence token when the model produced one within ``max_length``.
    """
    if isinstance(sentence, str):
        # Lower-case before tokenizing so the tokens match the lower-cased
        # vocabulary. The pipeline is cached because reloading it on every call
        # dominates the running time.
        nlp_en = _load_english_tokenizer()
        tokens = [token.text for token in nlp_en.tokenizer(sentence.lower())]
    else:
        tokens = [token.lower() for token in sentence]

    tokens = [english.init_token] + tokens + [english.eos_token]
    text_to_indices = [english.vocab.stoi[token] for token in tokens]
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    # Start and stop indices come from the Field's own special tokens. Looking up
    # an empty string yields the unknown-token index, which made decoding start
    # from <unk> and never stop at <eos>.
    sos_idx = malyalam.vocab.stoi[malyalam.init_token]
    eos_idx = malyalam.vocab.stoi[malyalam.eos_token]

    outputs = [sos_idx]
    with torch.no_grad():
        # Build encoder hidden, cell state
        hidden, cell = model.Encoder_LSTM(sentence_tensor)

        for _ in range(max_length):
            previous_word = torch.LongTensor([outputs[-1]]).to(device)
            output, hidden, cell = model.Decoder_LSTM(previous_word, hidden, cell)
            best_guess = output.argmax(1).item()
            outputs.append(best_guess)

            # Model predicts it's the end of the sentence
            if best_guess == eos_idx:
                break

    translated_sentence = [malyalam.vocab.itos[idx] for idx in outputs]
    return translated_sentence[1:]

def bleu(data, model, english, malyalam, device):
    """Compute corpus-level BLEU of the model's greedy translations.

    Args:
        data: iterable of examples exposing ``source`` and ``target`` token lists.
        model: translation model handed to ``translate_sentence``.
        english: source Field.
        malyalam: target Field.
        device: device used for inference.

    Returns:
        The score returned by torchtext's ``bleu_score``.
    """
    # Imported here because the notebook never imports it at file level.
    from torchtext.data.metrics import bleu_score

    targets = []
    outputs = []

    for example in data:
        src = vars(example)["source"]
        trg = vars(example)["target"]

        prediction = translate_sentence(model, src, english, malyalam, device)
        # Strip the end-of-sentence token only when it is present: a translation cut
        # off at max_length ends with a real word that must be kept.
        if prediction and prediction[-1] == malyalam.eos_token:
            prediction = prediction[:-1]

        targets.append([trg])
        outputs.append(prediction)

    return bleu_score(outputs, targets)


In [None]:
def pred_trans(model, tokens, english, malyalam, device, max_length=108):
    """Greedily translate an already tokenized English sentence.

    Args:
        model: object exposing ``Encoder_LSTM`` and ``Decoder_LSTM`` callables.
        tokens: list of source tokens; it is not modified.
        english: source Field; supplies ``init_token``, ``eos_token`` and ``vocab``.
        malyalam: target Field; supplies ``init_token``, ``eos_token`` and ``vocab``.
        device: device on which the index tensors are created.
        max_length: maximum number of decoding steps.

    Returns:
        The predicted tokens without the leading start token. The list ends with
        the end-of-sentence token when the model produced one within ``max_length``.
    """
    # Work on a new list: inserting the special tokens into the caller's list would
    # add them again on every repeated call with the same sentence.
    wrapped_tokens = [english.init_token] + list(tokens) + [english.eos_token]
    text_to_indices = [english.vocab.stoi[token] for token in wrapped_tokens]
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    # Start and stop indices come from the Field's own special tokens. Looking up
    # an empty string yields the unknown-token index, which made decoding start
    # from <unk> and never stop at <eos>.
    sos_idx = malyalam.vocab.stoi[malyalam.init_token]
    eos_idx = malyalam.vocab.stoi[malyalam.eos_token]

    outputs = [sos_idx]
    with torch.no_grad():
        # Build encoder hidden, cell state
        hidden, cell = model.Encoder_LSTM(sentence_tensor)

        for _ in range(max_length):
            previous_word = torch.LongTensor([outputs[-1]]).to(device)
            output, hidden, cell = model.Decoder_LSTM(previous_word, hidden, cell)
            best_guess = output.argmax(1).item()
            outputs.append(best_guess)

            # Model predicts it's the end of the sentence
            if best_guess == eos_idx:
                break

    translated_sentence = [malyalam.vocab.itos[idx] for idx in outputs]
    return translated_sentence[1:]

In [None]:
import time
from tqdm import tqdm
epoch_loss = 0.0
num_epochs = 10
best_loss = 999999
best_epoch = -1
sentence1 = "avoid alcohol and illicit drugs"
ts1  = []

for epoch in range(num_epochs):
  start_time = time.time()
  print("Epoch - {} / {}".format(epoch+1, num_epochs))

  # Translate a fixed example before each epoch to watch progress qualitatively.
  model.eval()
  translated_sentence1 = translate_sentence(model, sentence1, english, malyalam,device, max_length=108)
  print(f"Translated example sentence 1: \n {translated_sentence1}")
  ts1.append(translated_sentence1)

  model.train(True)
  # Restart the running total so the value reported below belongs to this epoch only.
  epoch_loss = 0.0
  for batch_idx, batch in tqdm(enumerate(train_iterator), total=len(train_iterator)):
    source_batch = batch.source.to(device)
    target = batch.target.to(device)

    # Pass the input and target for model's forward method
    output = model(source_batch, target)
    # Position 0 holds the start token and is never predicted; flatten the rest
    # so every (step, sentence) pair becomes one classification example.
    output = output[1:].reshape(-1, output.shape[2])
    target = target[1:].reshape(-1)

    # Clear the accumulating gradients
    optimizer.zero_grad()

    # Loss of this batch
    loss = criterion(output, target)

    # Calculate the gradients for weights & biases using back-propagation
    loss.backward()

    # Clip the gradient norm to at most 1
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

    # Update the weights values using the gradients we calculated using bp
    optimizer.step()
    step += 1
    epoch_loss += loss.item()

  # Calculate the time taken for the epoch
  end_time = time.time()
  epoch_time = end_time - start_time
  print(f"Time taken for epoch {epoch + 1}: {epoch_time:.2f} seconds")

  # Report the mean loss over the epoch rather than the loss of its last batch,
  # and remember the best epoch seen so far.
  mean_epoch_loss = epoch_loss / len(train_iterator)
  if mean_epoch_loss < best_loss:
    best_loss, best_epoch = mean_epoch_loss, epoch + 1
  print("Epoch_Loss - {}".format(mean_epoch_loss))
  print()

# Mean loss of the final epoch
print(epoch_loss / len(train_iterator))


Epoch - 1 / 10
Translated example sentence 1: 
 ['സിനിമയിൽ', 'ആരംഭിച്ചപ്പോള്\u200d', 'ആരംഭിച്ചപ്പോള്\u200d', 'ഗണേഷ്', 'പുരാവസ്തു', 'പുരാവസ്തു', 'പ്രാധാന്യമുള്ളതാണ്', 'ഹായ്', 'ഹായ്', 'പുറത്തിറക്കിയത്', 'ഗുളികകളും', 'ഗുളികകളും', 'ഗുളികകളും', 'തുടരുന്നത്', 'കൂടാന്\u200d', 'ജലം', 'വിഷയമായി', 'കാര്യാലയം', 'മുകളിലായി', 'അവകാശം', 'ഡൗൺലോഡ്', 'ഉത്തർപ്രദേശ്', 'പട്ടേൽ', 'കടയില്\u200d', 'ഹിൽസ്', 'വീശുന്ന', 'റോഷ്നി', 'റോഷ്നി', 'കൃഷിക്കും', 'കൃഷിക്കും', 'സുഗന്ധവ്യഞ്ജനത്തിന്\u200dറെ', 'കൃഷിക്കും', 'ആശുപത്രിയിലെ', 'ശേഖരിച്ച്', 'കടയിലേക്ക്', 'കേരളം', 'കേരളം', 'നിറവേറ്റുന്നു', 'ഓർമ്മിപ്പിക്കാൻ', 'അർത്ഥമില്ല', 'സംരക്ഷിക്കാന്\u200d', 'ഡാറ്റ', 'പറയട്ടെ', 'കഥയാണ്', 'ആക്ഷൻ', 'വെണ്ണയും', 'ആഗ്രഹിക്കുന്നുവെങ്കില്\u200d', 'ഇടാന്\u200d', 'അർപ്പിക്കാൻ', 'അർപ്പിക്കാൻ', 'അവസ്ഥയിൽ', 'ഞരമ്പിന്\u200dറെ', 'കൃഷിക്കും', 'കൃഷിക്കും', 'സുഗന്ധവ്യഞ്ജനത്തിന്\u200dറെ', 'കൃഷിക്കും', 'ആശുപത്രിയിലെ', 'ശേഖരിച്ച്', 'കടയിലേക്ക്', 'കേരളം', 'കേരളം', 'അടിക്കാൻ', 'അതിനനുസരിച്ച്', 'ഓർമ്മിപ്പിക്കാൻ', 'തീർത്ഥാടന', 'സംരക്ഷണത്തിനു', 'അർത്ഥമില

100%|██████████| 1643/1643 [05:07<00:00,  5.34it/s]


Time taken for epoch 1: 308.26 seconds
Epoch_Loss - 3.9716806411743164

Epoch - 2 / 10
Translated example sentence 1: 
 ['ഇവിടെ', 'ഏറ്റവും', '<eos>', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pa

 67%|██████▋   | 1093/1643 [03:24<02:06,  4.34it/s]

In [None]:
# Translate a single validation sentence and print it without the special tokens
s = validation_source[10]
translation = translate_sentence(model, s, english, malyalam, device, max_length=128)
cleaned_tokens = list(filter(lambda token: token not in ('<eos>', '<pad>'), translation))
readable_translation = ' '.join(cleaned_tokens)
print(readable_translation)

In [None]:
# Translate every validation sentence and keep the detokenized predictions.
malyalam_pred = []

for tokens in tokenized_validation_source:
  # Hand over a copy so the stored tokenization is never modified by the call;
  # this keeps the cell safe to run more than once.
  translation = pred_trans(model, list(tokens), english, malyalam, device, max_length=114)
  # Drop the special tokens and the danda characters from the output.
  cleaned_tokens = [token for token in translation if token not in ['<eos>', '<pad>','।','৷']]
  pred = ' '.join(cleaned_tokens)
  print(pred)
  malyalam_pred.append(pred)


In [None]:
import csv

# Every validation id needs exactly one prediction; fail early with a clear message
# instead of an IndexError part-way through building the rows.
assert len(valid_id) == len(malyalam_pred), "valid_id and malyalam_pred differ in length"

# One dictionary per CSV row. A dedicated name is used so the notebook-level
# variable `data` is not overwritten.
rows = [
    {'valid_id': entry_id, 'malyalam_pred': prediction}
    for entry_id, prediction in zip(valid_id, malyalam_pred)
]

# Specify the CSV file path
csv_file_path = '/gdrive/MyDrive/malyalam2.csv'

# Define the column names
fields = ['valid_id', 'malyalam_pred']

# Write the data to the CSV file. The encoding is explicit because the predictions
# are Malayalam text and the platform default is not guaranteed to be UTF-8.
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fields)
    writer.writeheader()
    writer.writerows(rows)

print(f'Saved predictions to {csv_file_path}')

In [None]:
# Sanity check: the number of predictions should equal the number of validation sentences.
print(len(malyalam_pred))
print(len(validation_source))