## Requirements

In [None]:
# Install the Indic NLP library (it provides the tokenizer used during preprocessing).
# The version is pinned to the release this notebook was run with so that a fresh
# run resolves the same package, and `%pip` is used so the install targets the
# environment of the running kernel.
%pip install -q indic-nlp-library==0.71

Collecting indic-nlp-library
  Downloading https://files.pythonhosted.org/packages/2f/51/f4e4542a226055b73a621ad442c16ae2c913d6b497283c99cae7a9661e6c/indic_nlp_library-0.71-py3-none-any.whl
Collecting morfessor
  Downloading https://files.pythonhosted.org/packages/39/e6/7afea30be2ee4d29ce9de0fa53acbb033163615f849515c0b1956ad074ee/Morfessor-2.0.6-py3-none-any.whl
Installing collected packages: morfessor, indic-nlp-library
Successfully installed indic-nlp-library-0.71 morfessor-2.0.6


In [None]:
%%bash
# Thanks to: https://stackoverflow.com/a/48133859/14938928
#
# NOTE: `%%bash` has to be the very first line of the cell for IPython to treat it
# as a cell magic, which is why the attribution lives below it as a shell comment.
#
# Each download takes two requests: the first one stores Google Drive's cookie in
# ./cookie, the second one replays that cookie together with the confirmation
# token that awk pulls out of the cookie file.

# Training set -> train.csv
fileid="1TSQWZCxZIbpMjzxt4Tw2pdljoJz6ddn_"
curl -c ./cookie -s -L "https://drive.google.com/uc?export=download&id=${fileid}" > /dev/null
curl -Lb ./cookie "https://drive.google.com/uc?export=download&confirm=`awk '/download/ {print $NF}' ./cookie`&id=${fileid}" -o train.csv

# Development set -> dev.csv
fileid="1IodW8rvwGfDY52ngrd4zn5B1KkDfSM4P"
curl -c ./cookie -s -L "https://drive.google.com/uc?export=download&id=${fileid}" > /dev/null
curl -Lb ./cookie "https://drive.google.com/uc?export=download&confirm=`awk '/download/ {print $NF}' ./cookie`&id=${fileid}" -o dev.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100   408    0   408    0     0    581      0 --:--:-- --:--:-- --:--:--   581100   408    0   408    0     0    581      0 --:--:-- --:--:-- --:--:--   580
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
100 10.3M    0 10.3M    0     0  7268k      0 --:--:--  0:00:01 --:--:-- 7268k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0100   408    0   408    0     0    240      0 -

## Importing Libraries

In [None]:
from tqdm import tqdm
import random
import re

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence

import csv

from indicnlp.tokenize import indic_tokenize, indic_detokenize

# Seed the random number generators this notebook relies on (Python's `random`
# and PyTorch) so that sampling, batch shuffling, weight initialization and the
# teacher-forcing coin flip are repeatable after "Restart & Run All".
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# The notebook still runs on the CPU; the warning only nudges towards a GPU runtime.
if device.type == 'cpu':
    print('⚠️⚠️⚠️ You may want to use a GPU ⚠️⚠️⚠️')

cuda


## Splitting Dataset into Train & Dev Set

In [None]:
def _read_sentence_pairs(path):
    """Read [source, target] sentence pairs from a three-column CSV file

    Parameters
    ----------
    path : str
        Path of the CSV file; its first record is a header and is skipped

    Returns
    -------
    pairs : list
        One [src_sentence, tgt_sentence] list per record (the third column is dropped)
    """

    # newline='' leaves line handling to the csv module, and the explicit encoding
    # keeps the non-ASCII characters in the data (e.g. 'km²') intact regardless of
    # the platform's default encoding.
    with open(path, 'r', encoding='utf-8', newline='') as fin:
        csv_reader = csv.reader(fin, delimiter=',')

        # Skip header
        next(csv_reader, None)

        return [[src_sentence, tgt_sentence] for src_sentence, tgt_sentence, _ in csv_reader]


# Read the dataset
train_set = _read_sentence_pairs('train.csv')
dev_set = _read_sentence_pairs('dev.csv')

In [None]:
# Number of sentence pairs available in each split
train_size, val_size = len(train_set), len(dev_set)

print(f'Train and Val set of {train_size} and {val_size} sentence pairs respectively')

Train and Val set of 56355 and 8421 sentence pairs respectively


In [None]:
# Checking if splitting preserved integrity: print ten training pairs picked
# by a dedicated, fixed-seed generator
rng = random.Random(24)
for draw in range(10):
    print(rng.choice(train_set))

['Which Pick # has a Nationality of canada, and a Team from of sudbury wolves?', 'SELECT MIN Pick # FROM table WHERE Nationality = canada AND Team from = sudbury wolves']
['What school did draft pick from round 3 go to?', 'SELECT  School FROM table WHERE Round = 3']
['What is the company that made the chassis for the entrant danka arrows yamaha?', 'SELECT  Chassis FROM table WHERE Entrant = danka arrows yamaha']
['Which driver has a Time/Retired of 2:45:46.2?', 'SELECT  Driver FROM table WHERE Time/Retired = 2:45:46.2']
['What is the area where population in 2010 is 38062?', 'SELECT  Area (km²) FROM table WHERE Population (2010) = 38062']
['How few runs does the 97.00 average have?', 'SELECT MIN Runs FROM table WHERE Average = 97.00']
['List the players of the year for the tournament held in matadome ( northridge, california )?', 'SELECT  Conference Player of the Year FROM table WHERE Tournament Venue (City) = Matadome ( Northridge, California )']
['How many schools left in 2002-03?', 

## Preprocessing

In [None]:
class Preprocessing:
    """
    A class containing utility methods for preprocessing
    """

    def normalize(self, sentence, lang_name):
        """Lowercase, strip and tokenize a given sentence

        Parameters
        ----------
        sentence : str
            Sentence to be normalized and tokenized
        lang_name : str
            Language name of the given sentence, forwarded to the tokenizer

        Returns
        -------
        sentence : str
            Normalized and tokenized words separated by a single space
        """

        lowered = sentence.lower().strip()
        tokens = indic_tokenize.trivial_tokenize(lowered, lang_name)
        return ' '.join(tokens)

    def filter_pairs(self, pairs, max_length):
        """Keep only the pairs in which both sentences are shorter than max_length

        A sentence's length is its number of space-separated words. A pair is
        dropped as soon as either sentence has max_length words or more.

        Parameters
        ----------
        pairs : iterable
            Iterable object containing sentence pairs
        max_length : int
            Exclusive upper bound on the number of words of each sentence in a pair

        Returns
        -------
        filtered_pairs : list
            Filtered sentence pairs, in their original order
        """

        def is_short_enough(sentence):
            # Strict comparison: a sentence of exactly max_length words is rejected
            return len(sentence.split(' ')) < max_length

        filtered_pairs = [
            pair for pair in pairs
            if is_short_enough(pair[0]) and is_short_enough(pair[1])
        ]
        return filtered_pairs

## Building Vocabulary for Source & Target Language

In [None]:
class Language:
    """
    A class to build vocabulary of source and target language
    """

    def __init__(self, lang_name):
        """Constructor to initialize the object

        Parameters
        ----------
        lang_name : str
            Name of the language
        """
        self.name = lang_name
        # The four special tokens always occupy the first four indices
        self.PAD, self.SOS, self.EOS, self.UNK = 0, 1, 2, 3
        self.word2idx = {'<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3}
        self.idx2word = {0: '<pad>', 1: '<sos>', 2: '<eos>', 3: '<unk>'}
        self.n_words = 4

    @staticmethod
    def _words(sentence):
        """Split a sentence on single spaces, discarding empty tokens"""
        # An empty sentence or repeated spaces would otherwise yield '' "words"
        # that end up as vocabulary entries.
        return [word for word in sentence.split(' ') if word]

    def add_sentence(self, sentence):
        """Populate the vocabulary with words in the sentence

        Parameters
        ----------
        sentence : str
            Sentence to be used to populate the vocabulary
        """

        for word in self._words(sentence):
            if word not in self.word2idx:
                self.word2idx[word] = self.n_words
                self.idx2word[self.n_words] = word
                self.n_words += 1

    def sentence_to_indices(self, sentence):
        """Converts a sentence to sequence of index

        Parameters
        ----------
        sentence : str
            Sentence for which sequence of index is to be calculated

        Returns
        -------
        indices : list
            Sequence of index appended by EOS token index; words missing from
            the vocabulary are mapped to the UNK token index
        """

        indices = [self.word2idx.get(word, self.UNK) for word in self._words(sentence)]
        indices.append(self.EOS)

        return indices

    def indices_to_sentence(self, indices):
        """Converts sequence of index to corresponding sentence

        Parameters
        ----------
        indices : iterable
            Sequence of index

        Returns
        -------
        sentence : str
            Sentence corresponding to given sequence of index

        Raises
        ------
        KeyError
            If an index is not part of the vocabulary
        """

        sentence = ' '.join(self.idx2word[index] for index in indices)
        return sentence

In [None]:
# Helper object used to normalize and filter sentences
preprocess = Preprocessing()

# Source and target vocabularies (both sides are tagged 'en')
src_lang = Language('en')
tgt_lang = Language('en')

In [None]:
# Normalize both sides of every training pair with the tokenizer of its language
train_sentences = []
for src_sentence, tgt_sentence in train_set:
    train_sentences.append([
        preprocess.normalize(src_sentence, src_lang.name),
        preprocess.normalize(tgt_sentence, tgt_lang.name),
    ])
print(f'Normalized {len(train_sentences)} sentence pairs')

# Drop the pairs in which either sentence is too long
train_sentences = preprocess.filter_pairs(train_sentences, max_length=36)
print(f'Filtered to {len(train_sentences)} sentence pairs')

Normalized 56355 sentence pairs
Filtered to 56172 sentence pairs


In [None]:
# Grow each vocabulary from its own side of the training pairs
for pair in train_sentences:
    src_lang.add_sentence(pair[0])
    tgt_lang.add_sentence(pair[1])

print(f'No. of words in source language: {src_lang.n_words}')
print(f'No. of words in target language: {tgt_lang.n_words}')

# Show one normalized pair as a sanity check
print(random.choice(train_sentences))

No. of words in source language: 38137
No. of words in target language: 35872
['how many records were made on the game that ended with score w 121–119 ( ot )', 'select count record from table where score = w 121–119 ( ot )']


## Convert sentences
Converting sentences to integer sequences according to their vocabulary and then transforming them into tensors for input to the PyTorch Encoder-Decoder model

In [None]:
# Map every training sentence to its index sequence (each sequence ends with EOS)
train_indices = []
for src_sentence, tgt_sentence in train_sentences:
    src_indices = src_lang.sentence_to_indices(src_sentence)
    tgt_indices = tgt_lang.sentence_to_indices(tgt_sentence)
    train_indices.append([src_indices, tgt_indices])

# One index pair per sentence pair
assert len(train_indices) == len(train_sentences)

In [None]:
def to_tensor(indices):
    """Builds an integer tensor from a sequence of index

    Parameters
    ----------
    indices : iterable
        Sequence of index

    Returns
    -------
    Pytorch tensor of dtype long holding the given sequence of index,
    created on the notebook-wide `device`
    """

    tensor = torch.tensor(indices, device=device, dtype=torch.long)
    return tensor

# Turn both index sequences of every training pair into tensors
train_tensors = [
    [to_tensor(sequence) for sequence in index_pair]
    for index_pair in train_indices
]

In [None]:
def collate(batch):
    """Utility function for batching via DataLoader

    Splits a batch of (source, target) tensor pairs into its two sides and pads
    every side to the length of its longest sequence.

    Parameters
    ----------
    batch : iterable
        Batch of tensor pairs

    Returns
    -------
    padded_source_tensors : 2-D tensor (batch_size, seq_len)
        Source language tensors padded by source language PAD token
    padded_target_tensors : 2-D tensor (batch_size, seq_len)
        Target language tensors padded by target language PAD token
    """

    src_batch, tgt_batch = map(list, zip(*batch))

    padded_src_tensors = pad_sequence(src_batch, batch_first=True, padding_value=src_lang.PAD)
    padded_tgt_tensors = pad_sequence(tgt_batch, batch_first=True, padding_value=tgt_lang.PAD)

    return padded_src_tensors, padded_tgt_tensors

## Seq2Seq Model using PyTorch

In [None]:
class Encoder(nn.Module):
    """
    Bi-GRU Encoder whose outputs and final state are projected to hidden_size
    """
    def __init__(self, vocab_size, embedding_dim, hidden_size):
        """Builds the embedding, the bidirectional GRU and both projections

        Parameters
        ----------
        vocab_size : int
            Vocabulary size of source language
        embedding_dim : int
            Size of embedding vectors of words in source language
        hidden_size : int
            Size of hidden state vectors of Encoder
        """

        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_size, batch_first=True, bidirectional=True)

        # Both projections map the concatenated forward/backward features
        # (2 * hidden_size) down to hidden_size.
        self.W1 = nn.Linear(2*hidden_size, hidden_size)
        self.W2 = nn.Linear(2*hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Runs the Encoder over a batch of source sequences

        Parameters
        ----------
        input : 2-D tensor (batch_size, seq_len)
            Source tensors
        hidden : 3-D tensor (2, batch_size, hidden_size)
            Initial hidden state, one slice per GRU direction

        Returns
        -------
        output : 3-D tensor (batch_size, seq_len, hidden_size)
            Projected Encoder hidden states of all timesteps
        hidden : 3-D tensor (1, batch_size, hidden_size)
            Projection of the concatenated final states of both directions
        """

        embedded = self.embedding(input)
        states, final_states = self.gru(embedded, hidden)

        # Per-timestep features of both directions -> hidden_size
        output = self.W1(states)

        # Final forward and backward states side by side -> hidden_size
        merged = torch.cat((final_states[0], final_states[1]), dim=1)
        hidden = self.W2(merged.unsqueeze(0))

        return output, hidden

    def init_hidden(self, batch_size):
        """Creates the all-zero initial hidden state of the Bi-GRU

        Parameters
        ----------
        batch_size : int
            Batch size

        Returns
        -------
        Tensor initialized with all zeroes of shape (2, batch_size, hidden_size)
        """

        shape = (2, batch_size, self.hidden_size)
        return torch.zeros(shape, device=device)

In [None]:
class Decoder(nn.Module):
    """
    Single-step GRU Decoder with Bahdanau (additive) attention
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size):
        """Builds the embedding, attention, GRU and output layers

        Parameters
        ----------
        vocab_size : int
            Vocabulary size of target language
        embedding_dim : int
            Size of embedding vectors of words in target language
        hidden_size : int
            Size of hidden state vectors of Decoder
        """

        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # Additive attention: score = V(tanh(W1(decoder state) + W2(encoder output)))
        self.W1 = nn.Linear(hidden_size, hidden_size)
        self.W2 = nn.Linear(hidden_size, hidden_size)
        self.V = nn.Linear(hidden_size, 1)

        # The GRU consumes the token embedding concatenated with the context vector
        self.gru = nn.GRU(embedding_dim + hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, vocab_size)
        self.softmax = nn.LogSoftmax(dim=2)

    def forward(self, input, hidden, encoder_outputs):
        """Decodes one timestep for every sequence in the batch

        Parameters
        ----------
        input : 2-D tensor (batch_size, 1)
            Target token index fed at this timestep
        hidden : 3-D tensor (1, batch_size, hidden_size)
            Hidden state vector
        encoder_outputs : 3-D tensor (batch_size, seq_len, hidden_size)
            Output of each timestep of Encoder

        Returns
        -------
        output : 3-D tensor (batch_size, 1, vocab_size)
            Log-probabilities over the target vocabulary for this timestep
        hidden : 3-D tensor (1, batch_size, hidden_size)
            Decoder hidden state vector
        """

        embedded = self.embedding(input)

        # (1, batch, hidden) -> (batch, 1, hidden) so the state broadcasts over
        # the source positions of encoder_outputs
        query = hidden.transpose(0, 1)
        energy = torch.tanh(self.W1(query) + self.W2(encoder_outputs))
        # Normalize the scores across the source positions (dim=1)
        attn_weights = F.softmax(self.V(energy), dim=1)

        # Attention-weighted sum of the encoder outputs: (batch, 1, hidden)
        context_vector = (attn_weights * encoder_outputs).sum(dim=1, keepdim=True)
        gru_input = torch.cat((embedded, context_vector), dim=-1)

        gru_output, hidden = self.gru(gru_input, hidden)
        log_probs = self.softmax(self.out(gru_output))
        return log_probs, hidden

    def init_hidden(self, batch_size):
        """Creates the all-zero initial hidden state of the Decoder

        Parameters
        ----------
        batch_size : int
            Batch size

        Returns
        -------
        Tensor initialized with all zeroes of shape (1, batch_size, hidden_size)
        """
        shape = (1, batch_size, self.hidden_size)
        return torch.zeros(shape, device=device)

In [None]:
class Seq2Seq(nn.Module):
    """
    A class implementing end-to-end seq2seq model
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, hidden_size, embedding_dim, SOS, EOS):
        """Constructor to initialize seq2seq object

        Parameters
        ----------
        src_vocab_size : int
            Source vocabulary size
        tgt_vocab_size : int
            Target vocabulary size
        hidden_size : int
            Hidden state vector size same for both Encoder and Decoder
        embedding_dim : int
            Embedding dimension vector size same for both Encoder and Decoder
        SOS : int
            Index value of Start of Sentence token
        EOS : int
            Index value of End of Sentence token
        """

        super(Seq2Seq, self).__init__()
        self.encoder = Encoder(
            vocab_size=src_vocab_size,
            embedding_dim=embedding_dim,
            hidden_size=hidden_size
        ).to(device)

        self.decoder = Decoder(
            vocab_size=tgt_vocab_size,
            embedding_dim=embedding_dim,
            hidden_size=hidden_size
        ).to(device)

        self.SOS = SOS
        self.EOS = EOS

    def init_hidden(self, batch_size):
        """Initializes hidden state vectors of Encoder and Decoder

        Parameters
        ----------
        batch_size : int
            Batch size

        Returns
        -------
        Encoder and Decoder initial hidden state tensor
        """

        return self.encoder.init_hidden(batch_size), self.decoder.init_hidden(batch_size)

    def forward(self, src_tensors, tgt_tensors, criterion, tf):
        """Implements forward pass of end-to-end seq2seq model

        Parameters
        ----------
        src_tensors : 2-D tensor (batch_size, seq_len)
            Source tensors
        tgt_tensors : 2-D tensor (batch_size, seq_len)
            Target tensors
        criterion
            loss function, applied once per target timestep
        tf : float
            Teacher-forcing probability. One random draw per call decides whether
            the whole batch is decoded from the ground-truth tokens (draw < tf)
            or from the Decoder's own greedy predictions.

        Returns
        -------
        Training loss incurred, averaged over the target timesteps
        """

        batch_size = src_tensors.size(0)

        # Only the Encoder needs a fresh initial state: the Decoder starts from
        # the Encoder's projected final state.
        encoder_hidden = self.encoder.init_hidden(batch_size)
        encoder_outputs, encoder_hidden = self.encoder(src_tensors, encoder_hidden)

        decoder_input = torch.full((batch_size, 1), self.SOS, dtype=torch.long,
                                   device=src_tensors.device)
        decoder_hidden = encoder_hidden

        # Decided once per batch, not once per timestep
        use_teacher_forcing = random.random() < tf

        loss = 0
        n_timesteps = tgt_tensors.size(1)

        for timestep in range(n_timesteps):
            decoder_output, decoder_hidden = self.decoder(decoder_input,
                                                          decoder_hidden,
                                                          encoder_outputs)
            target = tgt_tensors[:, timestep]
            loss += criterion(decoder_output.squeeze(dim=1), target)

            if use_teacher_forcing:
                # Teacher forcing: feed the ground-truth token
                decoder_input = target.unsqueeze(1)
            else:
                # Feed back the most probable token predicted at this timestep
                _, topi = torch.topk(decoder_output, 1)
                decoder_input = topi.view(batch_size, 1)

        return loss / n_timesteps

In [None]:
def train(model, train_loader, epochs, lr, tf, pad_idx=0):
    """Implements training of given seq2seq model

    Parameters
    ----------
    model : Seq2Seq
        A seq2seq model
    train_loader : DataLoader
        DataLoader for getting batches of (source, target) tensors
    epochs : int
        Number of epochs
    lr : float
        Learning rate
    tf : float
        Teacher-forcing probability, forwarded to the model for every batch
    pad_idx : int, optional
        Target index used for padding. Target positions holding it are left out
        of the loss so that padding neither contributes gradients nor deflates
        the reported loss. Pass -100 (the NLLLoss default) to count every
        position.
    """

    # NOTE: every parameter with more than one dimension is re-initialized on
    # each call, so calling train() again does not resume a previous run.
    for parameter in model.parameters():
        if parameter.dim() > 1:
            nn.init.xavier_uniform_(parameter)

    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.NLLLoss(ignore_index=pad_idx)

    for epoch in range(epochs):
        epoch_loss = 0

        with tqdm(train_loader, unit='batch') as tr:
            # The description is constant within an epoch, so set it once
            tr.set_description(f'Epoch {(epoch + 1):>2}')

            for no_of_batches, (src_tensors, tgt_tensors) in enumerate(tr, start=1):
                optimizer.zero_grad()

                loss = model(src_tensors, tgt_tensors, criterion, tf)

                loss.backward()
                optimizer.step()

                epoch_loss += loss.item()

                # Running mean of the batch losses seen so far in this epoch
                tr.set_postfix(loss=f'{epoch_loss / no_of_batches:.4f}')

In [None]:
def predict(model, src_tensor, max_length):
    """Implements greedy prediction for a given source tensor

    The start and end token indices are taken from the model itself (model.SOS,
    model.EOS) and new tensors are created on the device of src_tensor, so the
    function does not depend on notebook-level globals.

    Parameters
    ----------
    model : Seq2Seq
        Trained seq2seq model
    src_tensor: 2-D tensor
        Source tensor of shape (1, seq_len)
    max_length: int
        Maximum length of predicted target sentence

    Returns
    -------
    prediction : list
        Sequence of target index, without the EOS token index
    """

    with torch.no_grad():
        encoder_hidden, _ = model.init_hidden(1)

        # The Decoder starts from the Encoder's final hidden state
        encoder_outputs, decoder_hidden = model.encoder(src_tensor, encoder_hidden)

        decoder_input = torch.tensor([[model.SOS]], dtype=torch.long, device=src_tensor.device)

        prediction = []

        for timestep in range(max_length):
            decoder_output, decoder_hidden = model.decoder(decoder_input, decoder_hidden, encoder_outputs)

            # Greedy decoding: keep the single most probable token
            token = torch.topk(decoder_output, 1).indices.item()

            if token == model.EOS:
                break

            prediction.append(token)
            decoder_input = torch.tensor([[token]], dtype=torch.long, device=src_tensor.device)

        return prediction

In [None]:
# Hyperparameters of seq2seq model
embedding_dim, hidden_size = 512, 512
batch_size, epochs = 64, 11
lr = 0.001
tf = 1  # teacher-forcing probability; with 1 every batch is teacher-forced

# Seq2Seq model object, sized from the two vocabularies built above
model_kwargs = dict(
    src_vocab_size=src_lang.n_words,
    tgt_vocab_size=tgt_lang.n_words,
    hidden_size=hidden_size,
    embedding_dim=embedding_dim,
    SOS=tgt_lang.SOS,
    EOS=tgt_lang.EOS,
)
model = Seq2Seq(**model_kwargs)
model = model.to(device)

## Training

Skip this section if the model does not need to be trained, i.e., if it is only going to be used for inference.

In [None]:
# Shuffled mini-batches; `collate` pads every batch to its longest sequence
train_loader = DataLoader(
    train_tensors,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate,
)

# Put the model in training mode, then run the training loop
model.train()
train(model, train_loader, epochs, lr, tf)

Epoch  1: 100%|██████████| 878/878 [07:04<00:00,  2.07batch/s, loss=1.9763]
Epoch  2: 100%|██████████| 878/878 [07:06<00:00,  2.06batch/s, loss=0.9917]
Epoch  3: 100%|██████████| 878/878 [07:05<00:00,  2.06batch/s, loss=0.5471]
Epoch  4: 100%|██████████| 878/878 [07:05<00:00,  2.07batch/s, loss=0.3066]
Epoch  5: 100%|██████████| 878/878 [07:05<00:00,  2.06batch/s, loss=0.1906]
Epoch  6: 100%|██████████| 878/878 [07:05<00:00,  2.07batch/s, loss=0.1438]
Epoch  7: 100%|██████████| 878/878 [07:04<00:00,  2.07batch/s, loss=0.1187]
Epoch  8: 100%|██████████| 878/878 [07:04<00:00,  2.07batch/s, loss=0.0984]
Epoch  9: 100%|██████████| 878/878 [07:03<00:00,  2.07batch/s, loss=0.0931]
Epoch 10: 100%|██████████| 878/878 [07:05<00:00,  2.06batch/s, loss=0.0830]
Epoch 11: 100%|██████████| 878/878 [07:07<00:00,  2.06batch/s, loss=0.0688]


In [None]:
# Persist only the model's state dict; the Inferencing section reloads it from this file
torch.save(model.state_dict(), 'seq2seq_model.pt')

## Inferencing

In [None]:
# Restore the saved weights. map_location remaps the stored tensors onto the
# device selected at the top of the notebook, so a checkpoint written on a GPU
# machine can also be loaded on a CPU-only one.
model.load_state_dict(torch.load('seq2seq_model.pt', map_location=device))
# Switch to evaluation mode for inference
model.eval()

In [None]:
# Split the validation pairs into their source and target sides
val_sources, tgt_sentences = zip(*dev_set)

# Normalize the validation source sentences exactly like the training ones
preprocess = Preprocessing()
src_sentences = []
for raw_sentence in val_sources:
    src_sentences.append(preprocess.normalize(raw_sentence, src_lang.name))
print(f'Normalized {len(src_sentences)} source sentences')

# Validation set source tensors
src_tensors = []
for src_sentence in src_sentences:
    src_tensors.append(to_tensor(src_lang.sentence_to_indices(src_sentence)))

Normalized 8421 source sentences


In [None]:
def predictions(model, src_loader, tgt_lang, max_length=36):
    """Validation or test set predictions

    Parameters
    ----------
    model : Seq2Seq
        Trained seq2seq model
    src_loader : DataLoader
        Loader yielding one source sentence per batch, either as a bare tensor
        of shape (1, seq_len) or as a (source, target) tuple/list of tensors
    tgt_lang : Language
        Target language used to turn predicted indices back into words
    max_length : int, optional
        Maximum number of tokens to predict per sentence

    Returns
    -------
    tgt_predictions : list
        Predicted target sentence for every batch of the loader, in order
    """
    tgt_predictions = []

    with tqdm(src_loader, unit='sentences') as tr:
        for batch in tr:
            # A source-only loader yields the tensor itself. Unpacking it as a
            # pair would iterate over its single row and raise a ValueError.
            src_tensor = batch[0] if isinstance(batch, (list, tuple)) else batch

            tgt_indices = predict(model, src_tensor, max_length)
            tgt_prediction = tgt_lang.indices_to_sentence(tgt_indices)
            tgt_predictions.append(tgt_prediction)

    return tgt_predictions

In [None]:
# One sentence per batch, because `predict` handles a single source sentence at a time
val_loader = DataLoader(dataset=src_tensors, batch_size=1)
tgt_predictions = predictions(model=model, src_loader=val_loader, tgt_lang=tgt_lang)

100%|██████████| 8421/8421 [03:10<00:00, 44.21sentences/s]


In [None]:
# Pick five validation examples with a dedicated, fixed-seed generator
rng = random.Random(42)
indices = [rng.randrange(len(dev_set)) for i in range(5)]

# '>' source sentence, '=' reference target, '<' model prediction
for idx in indices:
    print(
        f'> {src_sentences[idx]}',
        f'= {tgt_sentences[idx]}',
        f'< {tgt_predictions[idx]}',
        '',
        sep='\n',
    )

> how many pole positions for round 20 ?
= SELECT COUNT Pole Position FROM table WHERE Round = 20
< select count pole position from table where round = 20

> what is the pa when the skip is colleen jones ?
= SELECT  PA FROM table WHERE Skip = Colleen Jones
< select entities from table where skip = erkki jones

> what is the kosal with hatibandha as the sambalpuri cinema ?
= SELECT  Kosal FROM table WHERE Sambalpuri Cinema = hatibandha
< select detriment from table where 07:00 = yak

> in the game on or before week 9 , who was the opponent when the attendance was 61,626 ?
= SELECT  Opponent FROM table WHERE Week < 9 AND Attendance = 61,626
< select opponent from table where week < 9 and attendance = 914

> which competition has a venue of estadio alfonso lastras , san luis potosí , mexico , and a goal larger than 15 ?
= SELECT  Competition FROM table WHERE Venue = estadio alfonso lastras, san luis potosí, mexico AND Goal > 15
< select sum competition from table where venue = estadio ciu

In [None]:
# nltk is pinned to a fixed release; the scoring cell below uses its BLEU and
# METEOR helpers from nltk.translate.
# NOTE(review): that cell passes plain strings to single_meteor_score, which is
# presumably why the version is pinned — confirm before bumping it.
!pip install "nltk==3.4.5"

import nltk
# NOTE(review): the WordNet corpus is presumably required by the METEOR score — confirm
nltk.download('wordnet')

Collecting nltk==3.4.5
[?25l  Downloading https://files.pythonhosted.org/packages/f6/1d/d925cfb4f324ede997f6d47bea4d9babba51b49e87a767c170b77005889d/nltk-3.4.5.zip (1.5MB)
[K     |████████████████████████████████| 1.5MB 5.5MB/s 
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Created wheel for nltk: filename=nltk-3.4.5-cp37-none-any.whl size=1449910 sha256=06cd45584681423f12e912f38f6897da0f1a8bb2d27adf51aab89175f791d6d3
  Stored in directory: /root/.cache/pip/wheels/96/86/f6/68ab24c23f207c0077381a5e3904b2815136b879538a24b483
Successfully built nltk
Installing collected packages: nltk
  Found existing installation: nltk 3.2.5
    Uninstalling nltk-3.2.5:
      Successfully uninstalled nltk-3.2.5
Successfully installed nltk-3.4.5


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
from nltk.translate.meteor_score import single_meteor_score

def print_scores(true_sentences, pred_sentences):
    """Print BLEU-4 (macro and corpus level) and METEOR scores of predictions

    Both sentence lists are lower-cased and split on whitespace and a set of
    punctuation characters before scoring. The input lists are left untouched.

    Parameters
    ----------
    true_sentences : list
        Reference sentences
    pred_sentences : list
        Predicted sentences, aligned with true_sentences

    Returns
    -------
    None. The scores are printed; an error line is printed instead when the
    lists differ in length or are empty.
    """
    if len(true_sentences) != len(pred_sentences):
        print(f'E: Number of sentences do not match. True: {len(true_sentences)} Pred: {len(pred_sentences)}')
        return

    if not true_sentences:
        # Nothing to average over; stop here instead of dividing by zero below
        print('E: No sentences to score.')
        return

    # some punctuations from string.punctuation
    splitter = re.compile(r'[\s!"#$%&\()+,-./:;<=>?@\\^_`{|}~]+')

    def clean(sentence):
        # Lower-case, split on whitespace/punctuation, drop empty pieces, rejoin
        return ' '.join(filter(None, splitter.split(sentence.lower())))

    true_sentences_joined = [clean(sentence) for sentence in true_sentences]
    pred_sentences_joined = [clean(sentence) for sentence in pred_sentences]
    n_sentences = len(true_sentences_joined)

    print(f'Number of sentences: {n_sentences}')

    smoothing = SmoothingFunction().method2
    scores = {}

    # Macro-averaged BLEU-4 score.
    scores['bleu_4_macro'] = 0
    for ref, hyp in zip(true_sentences_joined, pred_sentences_joined):
        scores['bleu_4_macro'] += sentence_bleu(
            [ref.split()],
            hyp.split(),
            smoothing_function=smoothing
        )
    scores['bleu_4_macro'] /= n_sentences

    # BLEU-4 score.
    scores['bleu_4'] = corpus_bleu(
        [[ref.split()] for ref in true_sentences_joined],
        [hyp.split() for hyp in pred_sentences_joined],
        smoothing_function=smoothing
    )

    # METEOR score.
    scores['meteor'] = 0
    for ref, hyp in zip(true_sentences_joined, pred_sentences_joined):
        scores['meteor'] += single_meteor_score(ref, hyp)
    scores['meteor'] /= n_sentences

    # Print out scores.
    for key in scores:
        print(f'{key}: {scores[key]}')

# tgt_sentences is a tuple (it came from zip), so print_scores is handed a list built from it
print_scores(list(tgt_sentences), tgt_predictions)

Number of sentences: 8421
bleu_4_macro: 0.6424026061807562
bleu_4: 0.5977420791225594
meteor: 0.7844590430266942


## Test Set

In [None]:
# Test set source sentences
# newline='' leaves line handling to the csv module and the explicit encoding
# keeps the read independent of the platform's default encoding.
with open('testhindistatements.csv', 'r', encoding='utf-8', newline='') as fin:
    csv_reader = csv.reader(fin, delimiter=',')

    # Skip header
    next(csv_reader, None)

    # Only the third column of every record is kept
    src_sentences = [src_sentence for _, _, src_sentence in csv_reader]

In [None]:
# Normalize the test set source sentences exactly like the training ones
preprocess = Preprocessing()
normalized_sentences = []
for raw_sentence in src_sentences:
    normalized_sentences.append(preprocess.normalize(raw_sentence, src_lang.name))
src_sentences = normalized_sentences
print(f'Normalized {len(src_sentences)} source sentences')

# Test set source tensors
src_tensors = []
for src_sentence in src_sentences:
    src_tensors.append(to_tensor(src_lang.sentence_to_indices(src_sentence)))

# One tensor per sentence
assert len(src_sentences) == len(src_tensors)

Normalized 24102 source sentences


In [None]:
# One sentence per batch, because `predict` handles a single source sentence at a time
test_loader = DataLoader(dataset=src_tensors, batch_size=1)
tgt_predictions = predictions(model=model, src_loader=test_loader, tgt_lang=tgt_lang)

100%|██████████| 24102/24102 [05:08<00:00, 78.22sentences/s] 


In [None]:
# Generate answer.txt, one prediction per line.
# The encoding is explicit because predictions can contain non-ASCII characters,
# which would fail to encode on a platform whose default encoding is not UTF-8.
with open('answer.txt', 'w', encoding='utf-8') as fout:
    for tgt_prediction in tqdm(tgt_predictions):
        fout.write(f'{tgt_prediction}\n')

100%|██████████| 24102/24102 [00:00<00:00, 943656.73it/s]


**References**
- Neural Machine Translation by Jointly Learning to Align and Translate  
[arXiv:1409.0473](https://arxiv.org/abs/1409.0473)