In [None]:
from google.colab import drive
import json
import pickle
import math
import numpy as np
import pandas as pd
import spacy
from collections import Counter
import torch
import torchtext
import torch.nn as nn
import torch.optim as optim
from torch import Tensor
import random
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torchtext.vocab import vocab
from torch.nn import TransformerEncoder, TransformerDecoder, TransformerEncoderLayer, TransformerDecoderLayer
import io
import time

In [None]:
# Mount Google Drive at /gdrive; the dataset files below are read from /gdrive/MyDrive.
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
# Read the training JSON from Drive.
# The encoding is passed explicitly so the non-ASCII (Kannada) text is decoded
# the same way regardless of the runtime's default locale encoding.
with open('/gdrive/MyDrive/train_data2.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [None]:
# Read the test JSON from Drive.
# The encoding is passed explicitly so the non-ASCII (Kannada) text is decoded
# the same way regardless of the runtime's default locale encoding.
with open('/gdrive/MyDrive/Test_data2_final.json', 'r', encoding='utf-8') as file:
    val_data = json.load(file)

In [None]:
import string

# Define the remove_punctuations function
def remove_punctuations(sentence):
    """Return ``sentence`` with every character of ``string.punctuation`` removed.

    Only the ASCII punctuation characters listed in ``string.punctuation`` are
    dropped; all other characters (letters, digits, whitespace, non-ASCII
    symbols) are kept in their original order.

    Args:
        sentence: the text to clean.

    Returns:
        A new string without the punctuation characters.
    """
    # A set gives constant-time membership tests, and join() builds the result
    # once instead of growing a string one character at a time.
    punctuation_chars = frozenset(string.punctuation)
    return "".join(letter for letter in sentence if letter not in punctuation_chars)

In [None]:
# Collect the English-Kannada training pairs from the loaded JSON.
# For every entry: strip punctuation from both sides, lower-case the target,
# and record the entry id plus the space-separated word count of each side.
source_sentences_train, target_sentences_train = [], []
len_kannada, len_english = [], []
id_train = []

for language_pair, language_data in data.items():
    # Every other language pair in the file is ignored.
    if language_pair != "English-Kannada":
        continue
    print(f"Language Pair: {language_pair}")
    for data_type, data_entries in language_data.items():
        print(f"  Data Type: {data_type}")
        for entry_id, entry_data in data_entries.items():
            source = remove_punctuations(entry_data["source"])
            target = remove_punctuations(entry_data["target"].lower())
            source_sentences_train.append(source)
            target_sentences_train.append(target)
            id_train.append(entry_id)
            len_kannada.append(len(source.split(' ')))
            len_english.append(len(target.split(' ')))

Language Pair: English-Kannada
  Data Type: Train


In [None]:
# Collect the English-Kannada test sources from the loaded JSON: strip
# punctuation and record the entry id and the space-separated word count.
test_source, valid_id, len_val = [], [], []

for language_pair, language_data in val_data.items():
    # Every other language pair in the file is ignored.
    if language_pair != "English-Kannada":
        continue
    print(f"Language Pair: {language_pair}")
    for data_type, data_entries in language_data.items():
        print(f"  Data Type: {data_type}")
        for entry_id, entry_data in data_entries.items():
            source = remove_punctuations(entry_data["source"])
            test_source.append(source)
            valid_id.append(entry_id)
            len_val.append(len(source.split(' ')))

Language Pair: English-Kannada
  Data Type: Test


In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
# Later cells use both spellings, so both names are bound to the same device.
device = DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Longest Kannada (source) and English (target) training sentence, measured in
# space-separated words as counted during preprocessing.
max_length_kannada, max_length_english = max(len_kannada), max(len_english)

print(f"Maximum Length of kannada Sentence: {max_length_kannada} words")
print(f"Maximum Length of English Sentence: {max_length_english} words")

Maximum Length of kannada Sentence: 86 words
Maximum Length of English Sentence: 114 words


In [None]:
pip install indic-nlp-library

Collecting indic-nlp-library
  Downloading indic_nlp_library-0.92-py3-none-any.whl (40 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/40.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sphinx-argparse (from indic-nlp-library)
  Downloading sphinx_argparse-0.4.0-py3-none-any.whl (12 kB)
Collecting sphinx-rtd-theme (from indic-nlp-library)
  Downloading sphinx_rtd_theme-1.3.0-py2.py3-none-any.whl (2.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting morfessor (from indic-nlp-library)
  Downloading Morfessor-2.0.6-py3-none-any.whl (35 kB)
Collecting sphinxcontrib-jquery<5,>=4 (from sphinx-rtd-theme->indic-nlp-library)
  Downloading sphinxcontrib_jquery-4.1-py2.py3-none-any.whl (121 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

In [None]:
from indicnlp.tokenize import indic_tokenize
# Tokenize the Kannada training sentences with the Indic NLP trivial tokenizer.
tokenized_source_sentences = [
    indic_tokenize.trivial_tokenize(sentence) for sentence in source_sentences_train
]

In [None]:
# Tokenize the Kannada test sentences the same way as the training sources.
tok_val = [indic_tokenize.trivial_tokenize(sentence) for sentence in test_source]

In [None]:
# Tokenize English sentences.
# Only the token texts are used, so each sentence is passed to the pipeline's
# tokenizer alone instead of running every pipeline component on it.
# NOTE(review): this assumes no component of en_core_web_sm merges or splits
# tokens after tokenization, so the token texts are unchanged — confirm.
nlp_en = spacy.load('en_core_web_sm')
tokenized_target_sentences = [
    [token.text for token in nlp_en.tokenizer(sentence)]
    for sentence in target_sentences_train
]

In [None]:
def build_vocab(tokenized_sentences):
    """Build a torchtext vocabulary from tokenized sentences.

    Args:
        tokenized_sentences: iterable of token lists; every token occurrence
            is counted.

    Returns:
        The vocabulary created by ``vocab`` from the token counts, with the
        special tokens '<unk>', '<pad>', '<bos>' and '<eos>' registered and
        '<unk>' set as the default index.
    """
    counter = Counter()
    for sentence in tokenized_sentences:
        counter.update(sentence)
    v = vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'], min_freq=1)
    # Without a default index, indexing the vocabulary with an unseen token
    # raises; map such tokens to '<unk>' so lookups at inference time are safe.
    v.set_default_index(v['<unk>'])
    return v

# Build source and target vocabularies
# Each vocabulary is built only from the training sentences of its own side.
source_vocab = build_vocab(tokenized_source_sentences)
target_vocab = build_vocab(tokenized_target_sentences)

In [None]:
# Size of the source vocabulary, special tokens included.
print(len(source_vocab))

100485


In [None]:
def data_process(tokenized_source, tokenized_target, source_vocab, target_vocab):
    """Convert parallel token lists into pairs of index tensors.

    Args:
        tokenized_source: sequence of source-side token lists.
        tokenized_target: sequence of target-side token lists, aligned with
            ``tokenized_source``.
        source_vocab: mapping indexed by source token to obtain its id.
        target_vocab: mapping indexed by target token to obtain its id.

    Returns:
        A list with one ``(source_tensor, target_tensor)`` tuple per sentence
        pair; both tensors are 1-D with dtype ``torch.long``.
    """
    def to_ids(tokens, lookup):
        # One id per token, in sentence order.
        return torch.tensor([lookup[tok] for tok in tokens], dtype=torch.long)

    return [
        (to_ids(src_tokens, source_vocab), to_ids(tgt_tokens, target_vocab))
        for src_tokens, tgt_tokens in zip(tokenized_source, tokenized_target)
    ]

# Turn the tokenized training pairs into (source ids, target ids) tensor pairs.
train_data = data_process(
    tokenized_source=tokenized_source_sentences,
    tokenized_target=tokenized_target_sentences,
    source_vocab=source_vocab,
    target_vocab=target_vocab,
)

In [None]:
# Number of sentence pairs per training batch.
batch_size = 32
# Special-token ids are read from the source vocabulary and reused for the
# target sequences as well (see generate_batch).
# NOTE(review): this assumes both vocabularies assign the same ids to
# '<pad>', '<bos>' and '<eos>' — confirm.
pad_idx = source_vocab['<pad>']
bos_idx = source_vocab['<bos>']
eos_idx = source_vocab['<eos>']

def generate_batch(data_batch):
    """Collate (source, target) index tensors into two padded batch tensors.

    Every sequence is wrapped as ``[bos_idx] + ids + [eos_idx]``; the wrapped
    sequences of each side are then padded to a common length with ``pad_idx``
    through ``pad_sequence``.

    Args:
        data_batch: iterable of ``(source_item, target_item)`` 1-D tensors.

    Returns:
        ``(source_batch, target_batch)`` padded tensors.
    """
    def wrap(ids):
        # Surround one sequence with the begin/end-of-sentence markers.
        return torch.cat([torch.tensor([bos_idx]), ids, torch.tensor([eos_idx])], dim=0)

    wrapped = [(wrap(src_ids), wrap(tgt_ids)) for src_ids, tgt_ids in data_batch]
    sources = [src for src, _ in wrapped]
    targets = [tgt for _, tgt in wrapped]
    return (
        pad_sequence(sources, padding_value=pad_idx),
        pad_sequence(targets, padding_value=pad_idx),
    )

# Shuffled training batches; generate_batch adds the <bos>/<eos> markers and pads each batch.
train_iter = DataLoader(train_data, batch_size=batch_size,shuffle=True, collate_fn=generate_batch)

In [None]:
class Seq2SeqTransformer(nn.Module):
    """Encoder-decoder Transformer for sequence-to-sequence translation.

    The model embeds source and target token ids, adds positional encodings,
    runs a ``TransformerEncoder`` / ``TransformerDecoder`` stack and projects
    the decoder output to target-vocabulary logits with a linear layer.

    Args:
        num_encoder_layers: number of stacked encoder layers.
        num_decoder_layers: number of stacked decoder layers.
        emb_size: embedding size, also used as ``d_model`` of every layer.
        src_vocab_size: number of rows in the source embedding table.
        tgt_vocab_size: number of rows in the target embedding table and size
            of the output projection.
        dim_feedforward: hidden size of the feed-forward sub-layers.
        dropout: dropout probability used by the positional encoding and by
            the encoder/decoder layers.
        nhead: number of attention heads. When omitted, the notebook-level
            ``nhead`` variable is used, which is what earlier versions of this
            class always read.

    Raises:
        NameError: if ``nhead`` is omitted and no notebook-level ``nhead``
            variable exists.
    """

    def __init__(self, num_encoder_layers: int, num_decoder_layers: int, emb_size: int, src_vocab_size: int, tgt_vocab_size: int, dim_feedforward:int = 512, dropout:float = 0.1, nhead=None):
        super(Seq2SeqTransformer, self).__init__()
        if nhead is None:
            # Backward compatibility: existing callers do not pass nhead and
            # rely on the module-level variable of the same name.
            try:
                nhead = globals()['nhead']
            except KeyError:
                raise NameError("name 'nhead' is not defined; pass nhead=... explicitly") from None
        # dropout is forwarded so the layers and the positional encoding share
        # one setting instead of the layers silently keeping their default.
        encoder_layer = TransformerEncoderLayer(d_model=emb_size, nhead=nhead,
                                                dim_feedforward=dim_feedforward,
                                                dropout=dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)
        decoder_layer = TransformerDecoderLayer(d_model=emb_size, nhead=nhead,
                                                dim_feedforward=dim_feedforward,
                                                dropout=dropout)
        self.transformer_decoder = TransformerDecoder(decoder_layer, num_layers=num_decoder_layers)

        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def forward(self, src: Tensor, trg: Tensor, src_mask: Tensor, tgt_mask: Tensor, src_padding_mask: Tensor, tgt_padding_mask: Tensor, memory_key_padding_mask: Tensor):
        """Run encoder and decoder and return target-vocabulary logits.

        Args:
            src: source token ids (presumably sequence-first, as produced by
                the collate function — confirm against the caller).
            trg: target token ids fed to the decoder.
            src_mask: attention mask for the encoder self-attention.
            tgt_mask: attention mask for the decoder self-attention.
            src_padding_mask: key-padding mask for the source sequence.
            tgt_padding_mask: key-padding mask for the target sequence.
            memory_key_padding_mask: key-padding mask applied to the encoder
                output inside the decoder.

        Returns:
            The output of the final linear projection applied to the decoder
            states.
        """
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        memory = self.transformer_encoder(src_emb, src_mask, src_padding_mask)
        # The fourth positional argument (memory mask) is intentionally None.
        outs = self.transformer_decoder(tgt_emb, memory, tgt_mask, None, tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        """Embed ``src`` and return the encoder output (no padding mask is applied)."""
        return self.transformer_encoder(self.positional_encoding(self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        """Embed ``tgt`` and return the decoder states computed against ``memory``."""
        return self.transformer_decoder(self.positional_encoding(self.tgt_tok_emb(tgt)), memory, tgt_mask)

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position signals to token embeddings, then apply dropout.

    A table of ``maxlen`` positions is precomputed: even embedding channels
    hold sines and odd channels hold cosines of the position scaled by
    geometrically spaced frequencies. The table is stored as the buffer
    ``pos_embedding`` with a singleton middle dimension.
    """

    def __init__(self, emb_size: int, dropout, maxlen: int = 5000):
        super().__init__()
        # One frequency per (sin, cos) channel pair.
        inv_freq = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        positions = torch.arange(0, maxlen).reshape(maxlen, 1)
        angles = positions * inv_freq

        table = torch.zeros((maxlen, emb_size))
        table[:, 0::2] = torch.sin(angles)
        table[:, 1::2] = torch.cos(angles)

        self.dropout = nn.Dropout(dropout)
        # Registered as a buffer: saved and moved with the module, but not trained.
        self.register_buffer('pos_embedding', table.unsqueeze(-2))

    def forward(self, token_embedding: Tensor):
        """Return ``token_embedding`` plus the encodings of its first-dimension positions, after dropout."""
        seq_len = token_embedding.size(0)
        with_positions = token_embedding + self.pos_embedding[:seq_len, :]
        return self.dropout(with_positions)

class TokenEmbedding(nn.Module):
    """Embedding lookup whose output is scaled by ``sqrt(emb_size)``."""

    def __init__(self, vocab_size: int, emb_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        """Look up the embeddings of ``tokens`` (cast to long) and scale them."""
        scale = math.sqrt(self.emb_size)
        vectors = self.embedding(tokens.long())
        return vectors * scale

In [None]:
def generate_square_subsequent_mask(sz):
    """Return a ``(sz, sz)`` float mask that blocks attention to later positions.

    Entries on and below the main diagonal are ``0.0``; entries above it are
    ``-inf``. The tensor is created on ``DEVICE``.
    """
    blocked = torch.full((sz, sz), float('-inf'), device=DEVICE)
    # triu with diagonal=1 keeps the strictly-upper part and zeroes the rest.
    return torch.triu(blocked, diagonal=1)

def create_mask(src, tgt):
    """Build the attention and padding masks for one source/target batch.

    Args:
        src: source id tensor; its first dimension is used as sequence length.
        tgt: target id tensor; its first dimension is used as sequence length.

    Returns:
        ``(src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)`` where
        ``src_mask`` is an all-False boolean square mask, ``tgt_mask`` is the
        subsequent-position mask, and the padding masks mark positions equal
        to ``pad_idx`` with the first two dimensions swapped.
    """
    src_len, tgt_len = src.shape[0], tgt.shape[0]

    # All-False: no source position is hidden from any other.
    src_mask = torch.zeros((src_len, src_len), device=DEVICE).type(torch.bool)
    tgt_mask = generate_square_subsequent_mask(tgt_len)

    src_padding_mask = (src == pad_idx).transpose(0, 1)
    tgt_padding_mask = (tgt == pad_idx).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

In [None]:
# Model and training hyper-parameters.
src_vocab_size, tgt_vocab_size = len(source_vocab), len(target_vocab)
emb_size = 512
nhead = 8
ffn_hid_dim = 512
num_encoder_layers = num_decoder_layers = 3
num_epochs = 30

transformer = Seq2SeqTransformer(
    num_encoder_layers=num_encoder_layers,
    num_decoder_layers=num_decoder_layers,
    emb_size=emb_size,
    src_vocab_size=src_vocab_size,
    tgt_vocab_size=tgt_vocab_size,
    dim_feedforward=ffn_hid_dim,
)

# Xavier-initialise every parameter with more than one dimension; 1-D
# parameters keep their default initialisation.
for p in transformer.parameters():
    if p.dim() <= 1:
        continue
    nn.init.xavier_uniform_(p)

transformer = transformer.to(device)

# Padding positions do not contribute to the loss.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx)

optimizer = torch.optim.Adam(
    transformer.parameters(),
    lr=0.0001,
    betas=(0.9, 0.98),
    eps=1e-9,
)



In [None]:
def train_epoch(model, train_iter, optimizer):
    """Train ``model`` for one pass over ``train_iter`` and return the mean batch loss.

    Args:
        model: the sequence-to-sequence model to optimise.
        train_iter: iterable of ``(src, tgt)`` batches that also supports ``len``.
        optimizer: optimizer stepping the model parameters.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0
    for src, tgt in train_iter:
        src, tgt = src.to(device), tgt.to(device)

        # Teacher forcing: the decoder input drops the last position and the
        # expected output drops the first one.
        tgt_input, tgt_out = tgt[:-1, :], tgt[1:, :]
        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        optimizer.zero_grad()
        logits = model(src, tgt_input, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask)
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    return running_loss / len(train_iter)

In [None]:
# Train for num_epochs epochs, reporting the mean training loss and the
# wall-clock duration of every epoch.
for epoch in range(1, num_epochs+1):
  start_time = time.time()
  train_loss = train_epoch(transformer, train_iter, optimizer)
  end_time = time.time()
  print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))



Epoch: 1, Train loss: 6.600, Epoch time = 155.542s
Epoch: 2, Train loss: 5.776, Epoch time = 157.602s
Epoch: 3, Train loss: 5.349, Epoch time = 158.629s
Epoch: 4, Train loss: 4.964, Epoch time = 159.050s
Epoch: 5, Train loss: 4.621, Epoch time = 158.787s
Epoch: 6, Train loss: 4.308, Epoch time = 159.620s
Epoch: 7, Train loss: 4.028, Epoch time = 159.617s
Epoch: 8, Train loss: 3.768, Epoch time = 159.674s
Epoch: 9, Train loss: 3.529, Epoch time = 159.846s
Epoch: 10, Train loss: 3.308, Epoch time = 159.739s
Epoch: 11, Train loss: 3.106, Epoch time = 159.056s
Epoch: 12, Train loss: 2.917, Epoch time = 159.160s
Epoch: 13, Train loss: 2.743, Epoch time = 159.195s
Epoch: 14, Train loss: 2.586, Epoch time = 159.484s
Epoch: 15, Train loss: 2.438, Epoch time = 159.883s
Epoch: 16, Train loss: 2.302, Epoch time = 160.493s
Epoch: 17, Train loss: 2.179, Epoch time = 161.183s
Epoch: 18, Train loss: 2.073, Epoch time = 160.697s
Epoch: 19, Train loss: 1.987, Epoch time = 161.072s
Epoch: 20, Train loss

In [None]:
# Persist only the trained weights (state dict) to the file 'inference_model'
# in the current working directory.
torch.save(transformer.state_dict(), 'inference_model')

In [None]:
def greedy_decode(model, src, src_mask, max_len, start_symbol, device):
    """Greedily decode one target sequence for a single source sequence.

    Starting from ``start_symbol``, the most probable next token is appended
    at every step until ``eos_idx`` is produced or the sequence holds
    ``max_len`` tokens.

    Args:
        model: model exposing ``encode``, ``decode`` and ``generator``.
        src: source id tensor (a single sequence; the caller builds it with
            shape ``(num_tokens, 1)``).
        src_mask: attention mask passed to the encoder.
        max_len: maximum number of tokens in the returned sequence, including
            the start symbol.
        start_symbol: id of the first decoder token.
        device: device on which decoding runs.

    Returns:
        Long tensor of shape ``(length, 1)`` holding the generated ids,
        starting with ``start_symbol``.
    """
    src = src.to(device)
    src_mask = src_mask.to(device)
    # Decoding is inference only: disabling autograd avoids building a graph
    # for every step and keeps memory use flat.
    with torch.no_grad():
        # The encoder output does not change between steps, so it is computed
        # and moved to the device once.
        memory = model.encode(src, src_mask).to(device)
        ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(device)
        for step in range(max_len-1):
            # Boolean mask: True marks the later positions that must not be attended to.
            tgt_mask = (generate_square_subsequent_mask(ys.size(0)).type(torch.bool)).to(device)
            out = model.decode(ys, memory, tgt_mask)
            out = out.transpose(0, 1)
            # Score the vocabulary using the state of the last generated position.
            scores = model.generator(out[:, -1])
            _, next_word = torch.max(scores, dim=1)
            next_word = next_word.item()
            ys = torch.cat([ys, torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
            if next_word == eos_idx:
                break
    return ys

# def translate(model, src, src_vocab, tgt_vocab, device):
#     model.eval()
#     tokens = [bos_idx] + [src_vocab.get_stoi()[tok] for tok in src] + [eos_idx]
#     num_tokens = len(tokens)
#     src = (torch.LongTensor(tokens).reshape(num_tokens, 1)).to(device)
#     src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).to(device)
#     tgt_tokens = greedy_decode(model, src, src_mask, max_len=num_tokens + 5, start_symbol=bos_idx, device=device).flatten()
#     translated_sentence= " ".join([tgt_vocab.get_itos()[tok] for tok in tgt_tokens]).replace("<bos>", "").replace("<eos>", "")
#     translated_sentence = " ".join(translated_sentence.split())
#     return translated_sentence

In [None]:
def translate(model, src, src_vocab, tgt_vocab, device):
    """Translate one tokenized source sentence into a target-language string.

    Args:
        model: trained sequence-to-sequence model; it is switched to eval mode.
        src: list of source tokens.
        src_vocab: source vocabulary (supports ``in``, indexing by token).
        tgt_vocab: target vocabulary (supports ``get_itos``).
        device: device on which decoding runs.

    Returns:
        The decoded tokens joined by single spaces, with the '<bos>' and
        '<eos>' markers removed.
    """
    model.eval()
    # Direct lookup; avoids fetching the complete token-to-index dictionary
    # from get_stoi() just to read one entry.
    unk_token_idx = src_vocab['<unk>']

    # Tokens unseen during training are mapped to the unknown-token id.
    src_indices = [src_vocab[token] if token in src_vocab else unk_token_idx for token in src]

    tokens = [bos_idx] + src_indices + [eos_idx]
    num_tokens = len(tokens)
    src = torch.LongTensor(tokens).reshape(num_tokens, 1).to(device)

    src_mask = torch.zeros(num_tokens, num_tokens).to(device, dtype=torch.bool)
    # Allow the output to be a few tokens longer than the input.
    tgt_tokens = greedy_decode(model, src, src_mask, max_len=num_tokens + 5, start_symbol=bos_idx, device=device).flatten()

    # Fetch the index-to-token list once instead of once per generated token.
    itos = tgt_vocab.get_itos()
    translated_tokens = [itos[idx] if idx != unk_token_idx else '<unk>' for idx in tgt_tokens.tolist()]

    translated_sentence = " ".join(translated_tokens).replace("<bos>", "").replace("<eos>", "")
    # Collapse the extra spaces left behind by the removed markers.
    translated_sentence = " ".join(translated_sentence.split())
    return translated_sentence

In [None]:
# Inspect one tokenized training source sentence.
print(tokenized_source_sentences[3])

['ಕ್ಯಾಲೆಂಡರ್\u200cಗೆ', 'ನಾಳೆಯ', 'ಊಟದ', 'ದಿನಾಂಕವನ್ನು', 'ಸೇರಿಸಿ']


In [None]:
# Create an instance of the model
loaded_transformer = Seq2SeqTransformer(num_encoder_layers, num_decoder_layers,
                                       emb_size, src_vocab_size, tgt_vocab_size,
                                       ffn_hid_dim)

# Load the saved state dictionary
# map_location lets a checkpoint saved from GPU tensors be restored on a
# CPU-only runtime as well.
loaded_transformer.load_state_dict(torch.load('inference_model', map_location=device))

# Keep the restored model on the same device as the inputs built at inference time.
loaded_transformer = loaded_transformer.to(device)

# Ensure the model is in evaluation mode
loaded_transformer.eval()



Seq2SeqTransformer(
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-2): 3 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=512, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (transformer_decoder): TransformerDecoder(
    (layers): ModuleList(
      (0-2): 3 x TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )

In [None]:
# Number of tokenized test sentences to translate.
print(len(tok_val))

13370


In [None]:
# Translate every tokenized test sentence and collect the predictions in order.
# NOTE(review): this uses the in-memory `transformer`, not the
# `loaded_transformer` restored from disk — confirm that is intended.
english_pred = []
for sentence in tok_val:
  pred = translate(transformer, sentence, source_vocab, target_vocab, DEVICE)
  print(pred)
  english_pred.append(pred)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
on february and march the whole south eastern air force emerged
today the following happened to me i had a meeting with george
gets out of these plants and not possible to festival grown from a started whereas as tripura
from this reason we should be able to prevent our farmers from cheap import
there are many types of visiting places in kerala
i am going to an umbrella later
in karnataka he gets affected by the view of weather and influenced by anti also
on the point of view of industry not being rain but with various aspects of the world gets influenced by the also in the month of also every year in
later he used to make noise of loud
to save the loan from debt that a good investment does not reduce financial services
i have a meeting tomorrow at two p m with john put that in my calendar
the taking business of this region is a commercial pool for trade and valleys where people used to go to a purchase and purchase
this 

In [None]:
import csv

# Every test id must have exactly one prediction; fail loudly instead of
# writing a truncated or misaligned file.
if len(valid_id) != len(english_pred):
    raise ValueError(
        f"Got {len(english_pred)} predictions for {len(valid_id)} ids"
    )

# One dictionary per CSV row. A dedicated name is used so the `data` variable
# holding the loaded training JSON is not overwritten, which would break
# re-running the preprocessing cell.
prediction_rows = [
    {'valid_id': entry_id, 'english_pred': pred}
    for entry_id, pred in zip(valid_id, english_pred)
]

# Specify the CSV file path
csv_file_path = '/gdrive/MyDrive/phase2_kan2.csv'

# Define the column names
fields = ['valid_id', 'english_pred']

# Write the data to the CSV file (explicit encoding keeps the output
# independent of the runtime's default locale encoding)
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fields)
    writer.writeheader()
    writer.writerows(prediction_rows)

print(f'Saved predictions to {csv_file_path}')

Saved predictions to /gdrive/MyDrive/phase2_kan2.csv


In [None]:
# Number of predictions produced; expected to match the number of test sentences.
print(len(english_pred))

13370
