In [1]:
!pip install -q torch==2.2.0 torchtext==0.17.0

In [2]:
!pip show torch torchtext

Name: torch
Version: 2.2.0
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3
Location: C:\Users\Dell\anaconda3\Lib\site-packages
Requires: filelock, fsspec, jinja2, networkx, sympy, typing-extensions
Required-by: torchdata, torchtext
---
Name: torchtext
Version: 0.17.0
Summary: Text utilities, models, transforms, and datasets for PyTorch.
Home-page: https://github.com/pytorch/text
Author: PyTorch Text Team
Author-email: packages@pytorch.org
License: BSD
Location: C:\Users\Dell\anaconda3\Lib\site-packages
Requires: numpy, requests, torch, torchdata, tqdm
Required-by: 


In [3]:
pip install xlrd

Note: you may need to restart the kernel to use updated packages.


In [5]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence

# 1. Load and Prepare Your Dataset
data_path = r'C:\Users\Dell\Documents\Capstone\Working\dict.csv'  # Replace with your dataset path
df = pd.read_csv(data_path, encoding = "utf-8")
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

sanskrit_sentences = df['Sanskrit'].tolist()  # Column name containing Sanskrit sentences
english_sentences = df['English'].tolist()  # Column name containing English sentences

# 2. Tokenize and Build Vocabulary
sanskrit_tokenizer = get_tokenizer('basic_english')  # Customize tokenizer if needed
english_tokenizer = get_tokenizer('basic_english')

def yield_tokens(data, tokenizer):
    for text in data:
        yield tokenizer(text)

sanskrit_vocab = build_vocab_from_iterator(yield_tokens(sanskrit_sentences, sanskrit_tokenizer), specials=["<unk>", "<pad>", "<bos>", "<eos>"])
english_vocab = build_vocab_from_iterator(yield_tokens(english_sentences, english_tokenizer), specials=["<unk>", "<pad>", "<bos>", "<eos>"])

sanskrit_vocab.set_default_index(sanskrit_vocab["<unk>"])
english_vocab.set_default_index(english_vocab["<unk>"])

In [6]:
print(df)

      English Sanskrit
0           I     अहम्
1          me     माम्
2         you    त्वम्
3          go     गच्छ
4        went  अगच्छत्
...       ...      ...
1604   breath   श्वासः
1605  breathe   श्वसति
1606    brick      इटः
1607   bridge    पुलम्
1608    brief    संक्ष

[1609 rows x 2 columns]


In [7]:
def process_text(text, tokenizer, vocab):
    return [vocab["<bos>"]] + [vocab[token] for token in tokenizer(text)] + [vocab["<eos>"]]

sanskrit_data = [process_text(sentence, sanskrit_tokenizer, sanskrit_vocab) for sentence in sanskrit_sentences]
english_data = [process_text(sentence, english_tokenizer, english_vocab) for sentence in english_sentences]

# Padding sequences
sanskrit_data = pad_sequence([torch.tensor(seq) for seq in sanskrit_data], batch_first=True, padding_value=sanskrit_vocab["<pad>"])
english_data = pad_sequence([torch.tensor(seq) for seq in english_data], batch_first=True, padding_value=english_vocab["<pad>"])

# 3. Create Dataset and DataLoader
class TranslationDataset(Dataset):
    def __init__(self, src_data, tgt_data):
        self.src_data = src_data
        self.tgt_data = tgt_data

    def __len__(self):
        return len(self.src_data)

    def __getitem__(self, idx):
        return self.src_data[idx], self.tgt_data[idx]

train_dataset = TranslationDataset(sanskrit_data, english_data)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)

In [8]:
# 4. Define GRU-based Seq2Seq Model
class EncoderGRU(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers):
        super(EncoderGRU, self).__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, batch_first=True)

    def forward(self, src):
        embedded = self.embedding(src)  # embedded: [batch_size, src_len, emb_dim]
        outputs, hidden = self.rnn(embedded)  # outputs: [batch_size, src_len, hid_dim], hidden: [n_layers, batch_size, hid_dim]
        return hidden

class DecoderGRU(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers):
        super(DecoderGRU, self).__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, batch_first=True)
        self.fc_out = nn.Linear(hid_dim, output_dim)

    def forward(self, tgt, hidden):
        embedded = self.embedding(tgt)  # embedded: [batch_size, tgt_len, emb_dim]
        outputs, hidden = self.rnn(embedded, hidden)  # outputs: [batch_size, tgt_len, hid_dim], hidden: [n_layers, batch_size, hid_dim]
        predictions = self.fc_out(outputs)  # predictions: [batch_size, tgt_len, output_dim]
        return predictions, hidden

class Seq2SeqGRU(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2SeqGRU, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, tgt):
        hidden = self.encoder(src)  # hidden: [n_layers, batch_size, hid_dim]
        outputs, _ = self.decoder(tgt, hidden)  # outputs: [batch_size, tgt_len, output_dim]
        return outputs


In [9]:
# 5. Initialize and Train the Model
INPUT_DIM = len(sanskrit_vocab)
OUTPUT_DIM = len(english_vocab)
EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
N_EPOCHS = 20
LEARNING_RATE = 0.001

encoder_gru = EncoderGRU(INPUT_DIM, EMB_DIM, HID_DIM, N_LAYERS)
decoder_gru = DecoderGRU(OUTPUT_DIM, EMB_DIM, HID_DIM, N_LAYERS)
model_gru = Seq2SeqGRU(encoder_gru, decoder_gru)

criterion = nn.CrossEntropyLoss(ignore_index=english_vocab["<pad>"])
optimizer = torch.optim.Adam(model_gru.parameters(), lr=LEARNING_RATE)

for epoch in range(N_EPOCHS):
    model_gru.train()
    epoch_loss = 0
    for src, tgt in train_loader:
        optimizer.zero_grad()
        output = model_gru(src, tgt[:, :-1])  # Exclude the last token for the target
        output = output.view(-1, output.shape[-1])
        tgt = tgt[:, 1:].contiguous().view(-1)  # Exclude the first token for the target
        loss = criterion(output, tgt)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    print(f"Epoch {epoch+1}/{N_EPOCHS}, Loss: {epoch_loss/len(train_loader):.4f}")

Epoch 1/20, Loss: 3.8338
Epoch 2/20, Loss: 3.7985
Epoch 3/20, Loss: 3.4379
Epoch 4/20, Loss: 2.3905
Epoch 5/20, Loss: 0.8677
Epoch 6/20, Loss: 0.3440
Epoch 7/20, Loss: 0.3194
Epoch 8/20, Loss: 0.2575
Epoch 9/20, Loss: 0.2374
Epoch 10/20, Loss: 0.2193
Epoch 11/20, Loss: 0.2217
Epoch 12/20, Loss: 0.2052
Epoch 13/20, Loss: 0.1916
Epoch 14/20, Loss: 0.1627
Epoch 15/20, Loss: 0.1577
Epoch 16/20, Loss: 0.1553
Epoch 17/20, Loss: 0.1511
Epoch 18/20, Loss: 0.1416
Epoch 19/20, Loss: 0.1369
Epoch 20/20, Loss: 0.1400


In [15]:
state = {
    "epoch": 20,
    "model": model_gru,
    "model_state_dict": model_gru.state_dict(),
    "optimizer": optimizer.state_dict()
}
torch.save(state, "temp.pth")

In [16]:
import torch
from torch.nn.utils.rnn import pad_sequence
import pandas as pd
from torchtext.data.utils import get_tokenizer

# Ensure model is in evaluation mode
model_gru.eval()

def predict_translation(model, input_sentence, sanskrit_tokenizer, sanskrit_vocab, english_vocab, max_len=50):
    model.eval()  # Set the model to evaluation mode

    # Preprocess the input sentence
    tokens = sanskrit_tokenizer(input_sentence.lower())
    input_indices = [sanskrit_vocab["<bos>"]] + [sanskrit_vocab[token] for token in tokens] + [sanskrit_vocab["<eos>"]]
    input_tensor = torch.tensor(input_indices).unsqueeze(0)  # Add batch dimension

    # Encode the input sentence
    with torch.no_grad():
        hidden = model.encoder(input_tensor)

    # Initialize the target sequence with the <bos> token
    tgt_indices = [english_vocab["<bos>"]]
    tgt_tensor = torch.tensor(tgt_indices).unsqueeze(0)  # Add batch dimension

    # Prepare to store the predicted sentence
    predicted_sentence = []

    for _ in range(max_len):
        # Decode the current token
        with torch.no_grad():
            output, hidden = model.decoder(tgt_tensor, hidden)

        # Get the predicted next token
        predicted_token_index = output.argmax(2)[:, -1].item()
        predicted_sentence.append(predicted_token_index)

        # If <eos> token is generated, stop the prediction loop
        if predicted_token_index == english_vocab["<eos>"]:
            break

        # Update the target sequence with the predicted token
        tgt_tensor = torch.cat((tgt_tensor, torch.tensor([[predicted_token_index]])), dim=1)

    # Convert predicted indices back to words
    translated_words = [english_vocab.get_itos()[idx] for idx in predicted_sentence]

    # Remove <eos> if it's in the translated words
    if "<eos>" in translated_words:
        translated_words.remove("<eos>")

    return ' '.join(translated_words)


In [1]:

# Example
input_sentence = "फलम"
translation = predict_translation(model_gru, input_sentence, sanskrit_tokenizer, sanskrit_vocab, english_vocab)
print(f"Translated: {translation}")



NameError: name 'predict_translation' is not defined

In [None]:
import streamlit as st
st.write("Sanskrit to English Translation")
input_sentence = st.text_input("Word to be translated")
translation = predict_translation(model_gru, input_sentence, sanskrit_tokenizer, sanskrit_vocab, english_vocab)
st.write(f"Translated: {translation}")
