In this notebook, machine translation is performed by using two **deep learning** approaches: a **Recurrent Neural Network (RNN)** and **Transformer**.

The anki data for **Chinese Mandarin to English translation** is trained using **sequence-to-sequence models**.

## I. Load Packages

In [1]:
import pandas as pd
import unicodedata
import re
from torch.utils.data import Dataset
import torch
import math
import random
import os

import torch.nn as nn
import torch.nn.functional as F
import time
from tqdm.notebook import tqdm
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction, corpus_bleu

rnn_encoder, rnn_encoder, transformer_encoder, transformer_decoder = None, None, None, None
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if __name__=='__main__':
    print('Using device:', DEVICE)

Using device: cuda


## II. Download and Prepare the Data

### Helper Functions

In [2]:
# Converts the unicode file to ascii
def unicode_to_ascii(s):
    """Normalizes latin chars with accent to their canonical decomposition"""
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

def preprocess_sentence_chn(w):
  w = re.sub(r'([?.!,。？！，])', r' \1 ', w)
  w = ' '.join([c for c in w])
  w = re.sub(r'[" "]+', ' ', w)
  w = ' '.join( [t if t not in '1234567890`~@#$%^&*()_-+={}[];\':"/<>\\|' else '' for t in w.split(' ')] )
  w = unicode_to_ascii(w.lower().strip())
  w = '<start> ' + w + ' <end>'
  return w

def preprocess_sentence_general(w):
    '''
    Preprocess the sentence to add the start, end tokens and make them lower-case
    '''
    w = unicode_to_ascii(w.lower().strip())  # Normalize and lowercase
    w = re.sub(r'([?.!,])', r' \1 ', w)      # Add space around punctuation
    w = re.sub(r'[" "]+', ' ', w)            # Replace multiple spaces with a single space
    w = ' '.join([t if t not in '1234567890`~@#$%^&*()_-+={}[];\':"/<>\\|' else '' for t in w.split(' ')])  # Remove unwanted characters

    w = w.rstrip().strip()                   # Trim any remaining spaces
    w = '<start> ' + w + ' <end>'            # Add start and end tokens
    return w

def preprocess_sentence(w, lang):
    assert lang in ["chn", "general"]
    if lang == "chn":
      w = preprocess_sentence_chn(w)
    elif lang == "general":
      w = preprocess_sentence_general(w)

    w = w.rstrip().strip()
    return w

def pad_sequences(x, max_len):
    padded = np.zeros((max_len), dtype=np.int64)
    if len(x) > max_len:
        padded[:] = x[:max_len]
    else:
        padded[:len(x)] = x
    return padded


def preprocess_data_to_tensor(dataframe, src_vocab, trg_vocab, src_lang, trg_lang):
    # Vectorize the input and target languages
    src_tensor = [[src_vocab.word2idx[s if s in src_vocab.vocab else '<unk>'] for s in es.split(' ')] for es in dataframe[src_lang].values.tolist()]
    trg_tensor = [[trg_vocab.word2idx[s if s in trg_vocab.vocab else '<unk>'] for s in eng.split(' ')] for eng in dataframe[trg_lang].values.tolist()]

    # Calculate the max_length of input and output tensor for padding
    max_length_src, max_length_trg = max(len(t) for t in src_tensor), max(len(t) for t in trg_tensor)
    print('max_length_src: {}, max_length_trg: {}'.format(max_length_src, max_length_trg))

    # Pad all the sentences in the dataset with the max_length
    src_tensor = [pad_sequences(x, max_length_src) for x in src_tensor]
    trg_tensor = [pad_sequences(x, max_length_trg) for x in trg_tensor]

    return src_tensor, trg_tensor, max_length_src, max_length_trg


def train_test_split(src_tensor, trg_tensor):
    '''
    Create training and test sets.
    '''
    total_num_examples = len(src_tensor) - int(0.2*len(src_tensor))
    src_tensor_train, src_tensor_test = src_tensor[:int(0.75*total_num_examples)], src_tensor[int(0.75*total_num_examples):total_num_examples]
    trg_tensor_train, trg_tensor_test = trg_tensor[:int(0.75*total_num_examples)], trg_tensor[int(0.75*total_num_examples):total_num_examples]

    return src_tensor_train, src_tensor_test, trg_tensor_train, trg_tensor_test

### Evaluation Functions
These functions will be used to evaluate both the RNN and Transformer Models.

In [3]:
def get_reference_candidate(target, pred, trg_vocab):
    def _to_token(sentence):
        lis = []
        for s in sentence[1:]:
            x = trg_vocab.idx2word[s]
            if x == "<end>": break
            lis.append(x)
        return lis
    reference = _to_token(list(target.numpy()))
    candidate = _to_token(list(pred.numpy()))
    return reference, candidate

def compute_bleu_scores(target_tensor_val, target_output, final_output, trg_vocab):
    bleu_1 = 0.0
    bleu_2 = 0.0
    bleu_3 = 0.0
    bleu_4 = 0.0

    smoother = SmoothingFunction()
    save_reference = []
    save_candidate = []
    for i in range(len(target_tensor_val)):
        reference, candidate = get_reference_candidate(target_output[i], final_output[i], trg_vocab)

        bleu_1 += sentence_bleu(reference, candidate, weights=(1,), smoothing_function=smoother.method1)
        bleu_2 += sentence_bleu(reference, candidate, weights=(1/2, 1/2), smoothing_function=smoother.method1)
        bleu_3 += sentence_bleu(reference, candidate, weights=(1/3, 1/3, 1/3), smoothing_function=smoother.method1)
        bleu_4 += sentence_bleu(reference, candidate, weights=(1/4, 1/4, 1/4, 1/4), smoothing_function=smoother.method1)

        save_reference.append(reference)
        save_candidate.append(candidate)

    bleu_1 = bleu_1/len(target_tensor_val)
    bleu_2 = bleu_2/len(target_tensor_val)
    bleu_3 = bleu_3/len(target_tensor_val)
    bleu_4 = bleu_4/len(target_tensor_val)

    scores = {"bleu_1": bleu_1, "bleu_2": bleu_2, "bleu_3": bleu_3, "bleu_4": bleu_4}
    print('BLEU 1-gram: %f' % (bleu_1))
    print('BLEU 2-gram: %f' % (bleu_2))
    print('BLEU 3-gram: %f' % (bleu_3))
    print('BLEU 4-gram: %f' % (bleu_4))

    return save_candidate, scores

### Download and Visualize the Data

In [4]:
lang_path = 'cmn-eng'

os.system(f"wget http://www.manythings.org/anki/{lang_path}.zip")
os.system(f"unzip -o {lang_path}.zip")
src_script, trg_script = "chn", "general"
src_lang, trg_lang = lang_path.split('-')[0], lang_path.split('-')[1]

In [5]:
total_num_examples = 50000
dat = pd.read_csv(f'{src_lang}.txt',
                sep="\t",
                header=None,
                usecols=[0,1],
                names=[f'{trg_lang}', f'{src_lang}'],
                nrows=total_num_examples,
                encoding="UTF-8"
).sample(frac=1).reset_index().drop(['index'], axis=1)

dat # Visualize the data

Unnamed: 0,eng,cmn
0,I ate breakfast on my balcony.,我在陽臺上吃的早飯。
1,Tom tried to kill Mary.,汤姆试着杀死玛丽。
2,This rug is handmade.,这块地毯是手工制作的。
3,The road was wet from the rain.,這條路因為下雨所以是濕的。
4,Did you open a window?,你打開窗戶了嗎?
...,...,...
29904,The train was delayed because of heavy snowfall.,火车因大雪被耽搁了。
29905,Did you check all the items on the shopping list?,你把購物單上的全都檢查過了嗎？
29906,She set out on a trip last week.,她上週去旅行了。
29907,I think this machine is in need of repair.,我认为这机器需要修理。


### Preprocess the data

In [6]:
data = dat.copy()

data[trg_lang] = dat[trg_lang].apply(lambda w: preprocess_sentence(w, trg_script))
data[src_lang] = dat[src_lang].apply(lambda w: preprocess_sentence(w, src_script))
data # Visualizing the data

Unnamed: 0,eng,cmn
0,<start> i ate breakfast on my balcony . <end>,<start> 我 在 陽 臺 上 吃 的 早 飯 。 <end>
1,<start> tom tried to kill mary . <end>,<start> 汤 姆 试 着 杀 死 玛 丽 。 <end>
2,<start> this rug is handmade . <end>,<start> 这 块 地 毯 是 手 工 制 作 的 。 <end>
3,<start> the road was wet from the rain . <end>,<start> 這 條 路 因 為 下 雨 所 以 是 濕 的 。 <end>
4,<start> did you open a window ? <end>,<start> 你 打 開 窗 戶 了 嗎 ? <end>
...,...,...
29904,<start> the train was delayed because of heavy...,<start> 火 车 因 大 雪 被 耽 搁 了 。 <end>
29905,<start> did you check all the items on the sho...,<start> 你 把 購 物 單 上 的 全 都 檢 查 過 了 嗎 ？ <end>
29906,<start> she set out on a trip last week . <end>,<start> 她 上 週 去 旅 行 了 。 <end>
29907,<start> i think this machine is in need of rep...,<start> 我 认 为 这 机 器 需 要 修 理 。 <end>


### Vocabulary & Dataloader Classes

A class fro managing the vocabulary is created. There is a seperate class for the vocabulary because there are two dfiferent vacabularies - one for source language and one for target language.

Then the dataloader is prepared and return the source sentence and target sentence.

In [7]:
class Vocab_Lang():
    def __init__(self, vocab):
        self.word2idx = {'<pad>': 0, '<unk>': 1}
        self.idx2word = {0: '<pad>', 1: '<unk>'}
        self.vocab = vocab

        for index, word in enumerate(vocab):
            self.word2idx[word] = index + 2 # +2 because of <pad> and <unk> token
            self.idx2word[index + 2] = word

    def __len__(self):
        return len(self.word2idx)

    def __repr__(self):
        if len(self.vocab) <= 5:
            return str(self.vocab)
        else:
            return f'Vocab_Lang object with {len(self.vocab)} words'

class MyData(Dataset):
    def __init__(self, X, y):
        self.length = torch.LongTensor([np.sum(1 - np.equal(x, 0)) for x in X])
        self.data = torch.LongTensor(X)
        self.target = torch.LongTensor(y)

    def __getitem__(self, index):
        x = self.data[index]
        y = self.target[index]
        return x, y

    def __len__(self):
        return len(self.data)

In [8]:
import numpy as np
import random
from torch.utils.data import DataLoader

In [9]:
#adjust hyperparameters
BATCH_SIZE = 64
EMBEDDING_DIM = 256

### Build Vocabulary

In [10]:
def build_vocabulary(pd_dataframe):
    sentences = [sen.split() for sen in pd_dataframe]
    vocab = {}
    for sen in sentences:
        for word in sen:
            if word not in vocab:
                vocab[word] = 1
    return list(vocab.keys())

In [11]:
src_vocab_list = build_vocabulary(data[src_lang])
trg_vocab_list = build_vocabulary(data[trg_lang])

### Instantiate Datasets
The train and test datasets are now instantiated.

In [12]:
src_vocab = Vocab_Lang(src_vocab_list)
trg_vocab = Vocab_Lang(trg_vocab_list)

src_tensor, trg_tensor, max_length_src, max_length_trg = preprocess_data_to_tensor(data, src_vocab, trg_vocab, src_lang, trg_lang)
src_tensor_train, src_tensor_val, trg_tensor_train, trg_tensor_val = train_test_split(src_tensor, trg_tensor)

# create train and val datasets
train_dataset = MyData(src_tensor_train, trg_tensor_train)
train_dataset = DataLoader(train_dataset, batch_size=BATCH_SIZE, drop_last=True, shuffle=True)

test_dataset = MyData(src_tensor_val, trg_tensor_val)
test_dataset = DataLoader(test_dataset, batch_size=BATCH_SIZE, drop_last=True, shuffle=False)

max_length_src: 46, max_length_trg: 36


  self.data = torch.LongTensor(X)


In [13]:
idxes = random.choices(range(len(train_dataset.dataset)), k=5)
src, trg =  train_dataset.dataset[idxes]
print('Source:', src)
print('Source Dimensions: ', src.size())
print('Target:', trg)
print('Target Dimensions: ', trg.size())

Source: tensor([[   2,    3,  152, 1713,   81,   12,   13,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
        [   2,  260,  263, 1448,  377,  502,   12,   13,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
        [   2,   41,    4, 1564,   50,   12,   13,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
        [   2,    3,  153,   41, 1735,   12,   13,   

In [14]:
!pip install gradio

Collecting gradio
  Downloading gradio-4.41.0-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.112.0-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.3.0 (from gradio)
  Downloading gradio_client-1.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.9 (from gradi

## III. Encoder and Decoder Models

In [15]:
def create_positional_embedding(max_len, embed_dim):
    '''
    Args:
        max_len: The maximum length supported for positional embeddings
        embed_dim: The size of your embeddings
    Returns:
        pe: [max_len, 1, embed_dim] computed as in the formulae above
    '''
    pe = torch.zeros(max_len, embed_dim)
    pos = torch.arange(0, max_len).unsqueeze(1)
    div_term = torch.exp((torch.arange(0, embed_dim, 2, dtype=torch.float) * (-math.log(10000.0) / embed_dim)))
    pe[:, 0::2] = torch.sin(pos.float() * div_term)
    pe[:, 1::2] = torch.cos(pos.float() * div_term)
    pe = pe.unsqueeze(0).transpose(0,1)

    return pe

### Encoder Model


In [16]:
class TransformerEncoder(nn.Module):
    def __init__(self, src_vocab, embedding_dim, num_heads,
        num_layers, dim_feedforward, max_len_src, device, dropout=0.1):
        super(TransformerEncoder, self).__init__()
        self.device = device
        """
        Args:
            src_vocab: Vocab_Lang, the source vocabulary
            embedding_dim: the dimension of the embedding (also the number of expected features for the input of the Transformer)
            num_heads: The number of attention heads
            num_layers: the number of Transformer Encoder layers
            dim_feedforward: the dimension of the feedforward network models in the Transformer
            max_len_src: maximum length of the source sentences
            device: the working device (you may need to map your postional embedding to this device)
            dropout: the dropout to be applied. Default=0.1.
        """
        self.src_vocab = src_vocab
        src_vocab_size = len(src_vocab)

        # Create positional embedding matrix
        self.position_embedding = create_positional_embedding(max_len_src, embedding_dim).to(device)
        self.register_buffer('positional_embedding', self.position_embedding) # this informs the model that position_embedding is not a learnable parameter

        # Initialize embedding layer
        self.embedding = nn.Embedding(src_vocab_size, embedding_dim)

        # Dropout layer
        self.dropout = nn.Dropout()

        # Initialize a nn.TransformerEncoder model (use embedding_dim,
        # num_layers, num_heads, & dim_feedforward here)
        enc_model = nn.TransformerEncoderLayer(embedding_dim, num_heads, dim_feedforward)
        self.transformer_encoder = nn.TransformerEncoder(enc_model, num_layers).to(device)

    def make_src_mask(self, src):
        """
        Args:
            src: [max_len, batch_size]
        Returns:
            Boolean matrix of size [batch_size, max_len] indicating which indices are padding
        """
        assert len(src.shape) == 2, 'src must have exactly 2 dimensions'
        src_mask = src.transpose(0, 1) == 0 # padding idx
        return src_mask.to(self.device) # [batch_size, max_src_len]

    def forward(self, x):
        """
        Args:
            x: [max_len, batch_size]
        Returns:
            output: [max_len, batch_size, embed_dim]
        Steps (note: x refers to the original input to this function throughout the pseudo-code):
        - Pass x through the word embedding
        - Add positional embedding to the word embedding, then apply dropout
        - Call make_src_mask(x) to compute a mask: this tells us which indexes in x
          are padding, which we want to ignore for the self-attention
        - Call the encoder, with src_key_padding_mask = src_mask
        """

        embedding = self.embedding(x).to(self.device)
        embedding = self.dropout(embedding + self.position_embedding[:embedding.size(0)])
        embedding_mask = self.make_src_mask(x)

        output = self.transformer_encoder(embedding, src_key_padding_mask=embedding_mask)

        return output

### Decoder Model


In [17]:
class TransformerDecoder(nn.Module):
    def __init__(self, trg_vocab, embedding_dim, num_heads,
        num_layers, dim_feedforward, max_len_trg, device, dropout=0.1):
        super(TransformerDecoder, self).__init__()
        self.device = device
        """
        Args:
            trg_vocab: Vocab_Lang, the target vocabulary
            embedding_dim: the dimension of the embedding (also the number of expected features for the input of the Transformer)
            num_heads: The number of attention heads
            num_layers: the number of Transformer Decoder layers
            dim_feedforward: the dimension of the feedforward network models in the Transformer
            max_len_trg: maximum length of the target sentences
            device: the working device
            dropout: the dropout to be applied. Default=0.1.
        """
        self.trg_vocab = trg_vocab
        trg_vocab_size = len(trg_vocab)

        # Create positional embedding matrix
        self.position_embedding = create_positional_embedding(max_len_trg, embedding_dim).to(self.device)
        self.register_buffer('positional_embedding', self.position_embedding) # this informs the model that positional_embedding is not a learnable parameter

        # Initialize embedding layer
        self.embedding = nn.Embedding(trg_vocab_size, embedding_dim)

        # Dropout layer
        self.dropout = nn.Dropout()

        # Initialize a nn.TransformerDecoder model (you'll need to use embedding_dim,
        # num_layers, num_heads, & dim_feedforward here)
        decoder_model = nn.TransformerDecoderLayer(embedding_dim, num_heads, dim_feedforward).to(self.device)
        self.transfomer_decoder = nn.TransformerDecoder(decoder_model,num_layers)

        # Final fully connected layer
        self.fc = nn.Linear(embedding_dim,trg_vocab_size)

    def generate_square_subsequent_mask(self, sz):
        """Generate a square mask for the sequence. The masked positions are filled with float('-inf').
            Unmasked positions are filled with float(0.0).
        """
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)).to(self.device)
        return mask

    def forward(self, dec_in, enc_out):
        """
        Args:
            dec_in: [sequence length, batch_size]
            enc_out: [max_len, batch_size, embed_dim]
        Returns:
            output: [sequence length, batch_size, trg_vocab_size]
        Steps:
        - Compute input word and positional embeddings in similar manner to encoder
        - Call generate_square_subsequent_mask() to compute a mask: this time,
          the mask is to prevent the decoder from attending to tokens in the "future".
          In other words, at time step i, the decoder should only attend to tokens
          1 to i-1.
        - Call the decoder, with tgt_mask = trg_mask
        - Run the output through the fully-connected layer and return it
        """
        embedding = self.embedding(dec_in.to(self.device))
        embedding = self.dropout(embedding + self.position_embedding[:embedding.size(0)])

        trg_mask = self.generate_square_subsequent_mask(dec_in.size(0))
        output = self.transfomer_decoder(embedding, enc_out, tgt_mask=trg_mask)

        output = self.fc(output)

        return output

## IV. Demo Interface

In [19]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [32]:
import torch
import gradio as gr

encoder = torch.load('/content/drive/MyDrive/transformer_encoder_cmn.pt')
decoder = torch.load('/content/drive/MyDrive/transformer_decoder_cmn.pt')

encoder.to(DEVICE)
decoder.to(DEVICE)

encoder.eval()
decoder.eval()

TransformerDecoder(
  (embedding): Embedding(7368, 256)
  (dropout): Dropout(p=0.5, inplace=False)
  (transfomer_decoder): TransformerDecoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
        )
        (multihead_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
        )
        (linear1): Linear(in_features=256, out_features=1024, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=1024, out_features=256, bias=True)
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm3): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Drop

In [46]:
import torch
def tokenize_input(input_sentence, vocab):
    """
    Tokenize and convert input sentence to indices using the provided vocabulary.
    """
    return [vocab.word2idx.get(char, vocab.word2idx['<unk>']) for char in input_sentence]


def detokenize_output(output_indices, vocab):
    """
    Convert output indices to words and remove special tokens.
    """
    words = []
    for idx in output_indices:
        if idx not in {vocab.word2idx['<pad>'], vocab.word2idx['<start>'], vocab.word2idx['<end>']}:
            word = vocab.idx2word.get(idx, '<unk>')
            if word == '<unk>':
                word = '[UNK]'
            words.append(word)
    return ' '.join(words)


def decode_transformer_model(encoder, decoder, src, max_decode_len, device):
    trg_vocab = decoder.trg_vocab
    batch_size = src.size(1)
    curr_output = torch.zeros((batch_size, max_decode_len), dtype=torch.long).to(device)
    curr_predictions = torch.zeros((batch_size, max_decode_len, len(trg_vocab.idx2word))).to(device)

    # Start decoding with the start token
    dec_input = torch.tensor([[trg_vocab.word2idx['<start>']]] * batch_size, device=device)
    curr_output[:, 0] = dec_input.squeeze(1)

    # Encoder output
    enc_output = encoder(src.to(device))

    for t in range(1, max_decode_len):
        # Prepare decoder input
        dec_in = curr_output[:, :t]

        # Ensure the correct dimensions for decoder input
        dec_in = dec_in.transpose(0, 1).to(device)

        # Decode
        predictions = decoder(dec_in, enc_output)

        # Extract the last prediction
        predictions = predictions[-1, :, :]

        # Store predictions
        curr_predictions[:, t, :] = predictions

        # Get the next token
        next_token = torch.argmax(predictions, dim=1)
        curr_output[:, t] = next_token

        # Break if all sequences have generated the <end> token
        if torch.all(next_token == trg_vocab.word2idx['<end>']):
            break

    return curr_output, curr_predictions, enc_output

def translate_sentence(input_sentence):
    """
    Translate an input sentence using the preloaded Transformer model.
    """
    encoder.eval()
    decoder.eval()

    try:
        # Tokenize the input sentence
        src_indices = tokenize_input(input_sentence, encoder.src_vocab)
        src_tensor = torch.tensor(src_indices).unsqueeze(1).to(DEVICE)  # Shape: [max_len, 1]

        # Decode the sentence
        curr_output, _, _ = decode_transformer_model(encoder, decoder, src_tensor, max_decode_len=50, device=DEVICE)

        # Convert output indices to words
        translated_sentence = detokenize_output(curr_output.squeeze(0).tolist(), decoder.trg_vocab)

        return translated_sentence

    except Exception as e:
        print(f"Error during translation: {str(e)}")
        return str(e)

In [45]:
import gradio as gr

examples = [
    ["你今天在做什么呢？"],
    ["我今天过得很开心。"],
    ["我非常喜欢夏天。"]
]

interface = gr.Interface(
    fn=translate_sentence,
    inputs=gr.Textbox(label="Chinese"),
    outputs=gr.Textbox(label="English"),
    title="Neural Machine Translation App",
    description="Enter a sentence in Chinese and get the translation in English.",
    examples=examples
)

interface.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://516d35b9845ccda268.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


