<a href="https://colab.research.google.com/github/szhan227/FinalProject2022FallCSCI2470/blob/main/Copy_of_Semantic_Parsing_Stencil.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Pre-processing

In [None]:
# data from https://www.cs.utexas.edu/~ai-lab/pubs/cocktail-ecml-01.pdf
# and https://www.cs.utexas.edu/users/ml/nldata.html 

import regex as re
from nltk.stem import SnowballStemmer
from urllib.request import urlopen
from contextlib import closing
from sklearn.model_selection import train_test_split

ss = SnowballStemmer('english')

# Parallel lists: inputs[i] is the stemmed token list of the i-th utterance and
# queries[i] is the token list of its logical-form query.
inputs = []
queries = []

with closing(urlopen('ftp://ftp.cs.utexas.edu/pub/mooney/nl-ilp-data/jobsystem/jobqueries640')) as r:
  for line in r.readlines():
    line = line.decode('utf-8')
    if not line.strip():
      # Tolerate blank lines instead of failing on the unpacking below.
      continue
    # Each record must contain exactly one '],' separating the utterance part
    # from the query part. (`utterance` is used rather than `input` so the
    # builtin input() is not shadowed.)
    utterance, query = line.lower().split('],')

    # parse input: already lowercased above; drop the 7-character prefix and the
    # 2-character suffix, split on commas and stem every token with nltk.
    # No <s>/</s> markers are added on the input side.
    utterance = utterance[7:-2].split(',')
    utterance = [ss.stem(x) for x in utterance]
    inputs.append(utterance)

    # parse query: strip '.' and newline characters from both ends, tokenize
    # into words and single punctuation symbols, then wrap in <s> ... </s>.
    query = query.strip('.\n')
    # https://stackoverflow.com/questions/43092970/tokenize-by-using-regular-expressions-parenthesis
    query = re.findall(r"\w+(?:'\w+)?|[^\w\s]", query)
    query = ["<s>"] + query + ["</s>"]
    queries.append(query)

# do train test split of 500 training and 140 test instances
inputs_train, inputs_test, queries_train, queries_test = train_test_split(inputs, queries, test_size=140, random_state=8)

In [None]:
# Inspect the stemmed training utterances (one list of tokens per example).
inputs_train

[['what', 'job', 'are', 'there', 'use', 'tcl/tk'],
 ['show',
  'me',
  'the',
  'job',
  'use',
  'c++',
  'that',
  'requir',
  'a',
  'bscs',
  'but',
  'desir',
  'a',
  'mscs'],
 ['what', 'job', 'are', 'there', 'for', 'a', 'network', 'specialist'],
 ['give', 'me', 'the', 'job', 'in', 'visual', 'c++'],
 ['list', 'job', 'requir', 'bsee'],
 ['give',
  'me',
  'the',
  'job',
  'for',
  'a',
  'databas',
  'specialist',
  'in',
  'usa'],
 ['what',
  'job',
  'can',
  'a',
  'delphi',
  'develop',
  'find',
  'in',
  'san',
  'antonio',
  'on',
  'window'],
 ['list', 'job', 'requir', 'ba'],
 ['i',
  'wonder',
  'what',
  'jpl',
  'doe',
  'on',
  'unix',
  'with',
  'prolog',
  'and',
  'vax'],
 ['what', 'job', 'are', 'avail', 'for', 'a', 'solari', 'system', 'administr'],
 ['what',
  'job',
  'requir',
  'a',
  'bscs',
  'degre',
  'and',
  'desir',
  'an',
  'mscs',
  'degre'],
 ['are',
  'there',
  'ani',
  'comput',
  'job',
  'in',
  'the',
  'field',
  'of',
  'statist'],
 ['list',

In [None]:
from collections import Counter

# Count how often every stemmed token occurs in the training utterances.
input_counts = Counter(token for sentence in inputs_train for token in sentence)

# Tokens seen at least twice get an index in first-seen order; the two special
# symbols are appended at the end of the index space.
frequent_tokens = [token for token, freq in input_counts.items() if freq >= 2]
input_word2idx = {token: position for position, token in enumerate(frequent_tokens)}
input_word2idx['<UNK>'] = len(input_word2idx)
input_word2idx['<PAD>'] = len(input_word2idx)
input_idx2word = {position: token for token, position in input_word2idx.items()}

input_vocab = list(input_word2idx)

# The query side keeps every training token (no frequency cut-off); the special
# symbols are registered with a count of zero so they also receive an index.
query_vocab = Counter(token for query_tokens in queries_train for token in query_tokens)
query_vocab['<UNK>'] = 0
query_vocab['<PAD>'] = 0
query_idx2word = dict(enumerate(query_vocab))
query_word2idx = {token: position for position, token in query_idx2word.items()}

In [None]:
# Report the size and the full token -> index mapping of both vocabularies.
for title, mapping in (('input vocab size:', input_word2idx),
                       ('query vocab size:', query_word2idx)):
  print(title, len(mapping))
  print(mapping)

input vocab size: 229
{'what': 0, 'job': 1, 'are': 2, 'there': 3, 'use': 4, 'show': 5, 'me': 6, 'the': 7, 'c++': 8, 'that': 9, 'requir': 10, 'a': 11, 'bscs': 12, 'but': 13, 'desir': 14, 'mscs': 15, 'for': 16, 'network': 17, 'specialist': 18, 'give': 19, 'in': 20, 'visual': 21, 'list': 22, 'bsee': 23, 'databas': 24, 'usa': 25, 'can': 26, 'develop': 27, 'find': 28, 'san': 29, 'antonio': 30, 'on': 31, 'window': 32, 'ba': 33, 'i': 34, 'jpl': 35, 'doe': 36, 'unix': 37, 'with': 38, 'prolog': 39, 'and': 40, 'vax': 41, 'avail': 42, 'solari': 43, 'system': 44, 'administr': 45, 'degre': 46, 'an': 47, 'ani': 48, 'comput': 49, 'of': 50, 'java': 51, '2': 52, 'year': 53, 'experi': 54, 'texa': 55, 'ai': 56, 'cobol': 57, 'ibm': 58, 'machin': 59, 'pay': 60, '70000': 61, 'posit': 62, 'onli': 63, 'austin': 64, 'tell': 65, 'mfc': 66, 'data': 67, 'wareh': 68, '60000': 69, 'locat': 70, 'powerbuild': 71, 'as': 72, 'senior': 73, 'softwar': 74, 'houston': 75, 'not': 76, 'nashvill': 77, 'experien': 78, 'ii': 79

In [None]:
def _to_indices(sentences, word2idx):
  """Map every token to its index in word2idx, using the '<UNK>' index for unknown tokens."""
  unk_idx = word2idx['<UNK>']
  return [[word2idx.get(token, unk_idx) for token in sentence] for sentence in sentences]

# Both splits are converted with the vocabularies built from the training split.
inputs_train_tokens = _to_indices(inputs_train, input_word2idx)
inputs_test_tokens = _to_indices(inputs_test, input_word2idx)

queries_train_tokens = _to_indices(queries_train, query_word2idx)
queries_test_tokens = _to_indices(queries_test, query_word2idx)

def pad(input_seq, max_len, pad_token_idx):
  """Return a copy of input_seq cut to at most max_len items and right-filled with pad_token_idx.

  Args:
    input_seq: list of token indices.
    max_len: length of the returned list (for non-negative max_len).
    pad_token_idx: value appended until the list reaches max_len.

  Returns:
    A new list; input_seq itself is not modified.
  """
  clipped = input_seq[:max_len]
  missing = max_len - len(clipped)
  # A non-positive `missing` multiplies to an empty list, so nothing is appended.
  return clipped + [pad_token_idx] * missing

# Inputs are padded/cut to the longest training utterance.
input_pad_idx = input_word2idx['<PAD>']
inputs_max_target_len = max(map(len, inputs_train_tokens))
inputs_train_tokens = [pad(seq, inputs_max_target_len, input_pad_idx) for seq in inputs_train_tokens]
inputs_test_tokens = [pad(seq, inputs_max_target_len, input_pad_idx) for seq in inputs_test_tokens]

# Queries are padded/cut to 1.5x the longest training query (rounded down).
query_pad_idx = query_word2idx['<PAD>']
queries_max_target_len = int(max(map(len, queries_train_tokens)) * 1.5)
queries_train_tokens = [pad(seq, queries_max_target_len, query_pad_idx) for seq in queries_train_tokens]
queries_test_tokens = [pad(seq, queries_max_target_len, query_pad_idx) for seq in queries_test_tokens]

# Data Loading

In [None]:
from torch.utils.data import Dataset, DataLoader, default_collate
import torch
import torch.nn as nn
class JobsDataset(Dataset):
  """Dataset pairing each tokenized input with the tokenized query at the same position."""

  def __init__(self, inputs, queries):
    # Both sequences are stored as given; no copy or conversion is made.
    self.inputs, self.queries = inputs, queries

  def __len__(self):
    # The example count is taken from the inputs only.
    return len(self.inputs)

  def __getitem__(self, idx):
    example = (self.inputs[idx], self.queries[idx])
    return example

def build_datasets():
  """Wrap the padded train and test token lists in JobsDataset objects.

  Returns:
    (train_dataset, test_dataset), built from the module-level *_tokens lists.
  """
  train_split = JobsDataset(inputs_train_tokens, queries_train_tokens)
  test_split = JobsDataset(inputs_test_tokens, queries_test_tokens)
  return train_split, test_split

def collate(batch):
  """Collate (input, query) examples into two sequence-first index tensors.

  For list-valued examples, default_collate yields one tensor per sequence
  position; stacking those tensors gives a (seq_len, batch) tensor for each side.
  """
  src_steps, tgt_steps = default_collate(batch)
  stacked_src = torch.stack(src_steps)
  stacked_tgt = torch.stack(tgt_steps)
  return stacked_src, stacked_tgt

def build_dataloaders(dataset_train, dataset_test, train_batch_size):
  """Create the train and test DataLoaders, both using `collate`.

  The training loader shuffles and uses train_batch_size; the test loader keeps
  the dataset order and yields one example per batch.

  Returns:
    (train_dataloader, test_dataloader)
  """
  train_loader = DataLoader(
      dataset_train,
      batch_size=train_batch_size,
      shuffle=True,
      collate_fn=collate,
  )
  test_loader = DataLoader(
      dataset_test,
      batch_size=1,
      shuffle=False,
      collate_fn=collate,
  )
  return train_loader, test_loader

# Todo: Define model

In [None]:
def create_model():
  """Build a Seq2Seq model sized to the input and query vocabularies.

  Uses 256-dimensional embeddings and 2-layer LSTMs with 128 hidden units for
  both the encoder and the decoder.
  """
  emb_sz, hidden_sz, nlayers = 256, 128, 2
  query_vocab_sz = len(query_word2idx)

  encoder = Encoder(len(input_word2idx), emb_sz, hidden_sz, nlayers)
  decoder = Decoder(query_vocab_sz, emb_sz, hidden_sz, nlayers)
  return Seq2Seq(encoder, decoder, hidden_sz, query_vocab_sz)

class Encoder(nn.Module):
  """Embeds source token indices and runs them through a multi-layer LSTM."""

  def __init__(self, vocab_sz, emb_sz, hidden_sz, nlayers=2):
    super().__init__()
    self.hidden_sz, self.nlayers = hidden_sz, nlayers
    self.embedding = nn.Embedding(num_embeddings=vocab_sz, embedding_dim=emb_sz)
    self.lstm = nn.LSTM(input_size=emb_sz, hidden_size=hidden_sz, num_layers=nlayers)

  def forward(self, inputs):
    """Return (per-step LSTM outputs, final hidden state, final cell state)."""
    embedded = self.embedding(inputs)
    outputs, final_state = self.lstm(embedded)
    hidden_state, cell = final_state
    return outputs, hidden_state, cell


class Decoder(nn.Module):
  """Embeds query token indices and runs them through an LSTM started from a given state."""

  def __init__(self, vocab_sz, emb_sz, hidden_sz, nlayers=2):
    super().__init__()
    self.hidden_sz, self.vocab_sz = hidden_sz, vocab_sz
    self.embedding = nn.Embedding(num_embeddings=vocab_sz, embedding_dim=emb_sz)
    self.lstm = nn.LSTM(input_size=emb_sz, hidden_size=hidden_sz, num_layers=nlayers)

  def forward(self, inputs, hidden_state, cell):
    """Return (per-step LSTM outputs, final hidden state, final cell state).

    hidden_state and cell are passed to the LSTM as its initial state.
    """
    embedded = self.embedding(inputs)
    initial_state = (hidden_state, cell)
    outputs, final_state = self.lstm(embedded, initial_state)
    decoder_hidden_state, decoder_cell = final_state
    return outputs, decoder_hidden_state, decoder_cell

  
class Seq2Seq(nn.Module):
  """Encoder-decoder wrapper that adds dot-product attention over the encoder outputs.

  Assumes the encoder and decoder produce sequence-first tensors, i.e.
  (seq_len, batch, hidden), as the LSTM-based Encoder/Decoder in this notebook do.
  """

  def __init__(self, encoder, decoder, hidden_sz, output_vocab_sz):
    super().__init__()
    self.encoder = encoder
    self.decoder = decoder

    # h_att = tanh(W1 * decoder_output + W2 * context); logits = W0 * h_att
    self.W1 = nn.Linear(hidden_sz, hidden_sz)
    self.W2 = nn.Linear(hidden_sz, hidden_sz)
    self.W0 = nn.Linear(hidden_sz, output_vocab_sz)

  def decode(self, labels, encoder_outputs, encoder_state, encoder_cell):
    """Run the decoder over `labels` and return attention-based vocabulary logits.

    Attention is computed separately for every decoder time step from that
    step's own decoder output, so the logits at step t depend only on
    labels[:t + 1]. This keeps teacher-forced training consistent with
    step-by-step greedy decoding.

    Args:
      labels: decoder input token indices, (tgt_len, batch).
      encoder_outputs: per-step encoder outputs, (src_len, batch, hidden).
      encoder_state: initial decoder hidden state (the encoder's final one).
      encoder_cell: initial decoder cell state (the encoder's final one).

    Returns:
      Logits of shape (tgt_len, batch, output_vocab_sz).
    """
    decoder_outputs, _, _ = self.decoder(labels, encoder_state, encoder_cell)

    # Dot product between every decoder step and every encoder step:
    # (tgt_len, batch, hidden) x (src_len, batch, hidden) -> (tgt_len, batch, src_len)
    scores = torch.einsum('tbh,sbh->tbs', decoder_outputs, encoder_outputs)
    # softmax is the numerically stable form of exp(x) / sum(exp(x)).
    weights = torch.softmax(scores, dim=-1)

    # Attention-weighted sum of encoder outputs: one context vector per decoder step.
    context = torch.einsum('tbs,sbh->tbh', weights, encoder_outputs)

    h_attns = torch.tanh(self.W1(decoder_outputs) + self.W2(context))

    return self.W0(h_attns)

  def forward(self, inputs, labels):
    """Encode `inputs`, then decode `labels` with attention; returns the logits of decode()."""
    encoder_outputs, encoder_state, encoder_cell = self.encoder(inputs)
    return self.decode(labels, encoder_outputs, encoder_state, encoder_cell)



In [None]:
# Materialise the datasets and loaders at notebook level for the shape check below.
jobs_train, jobs_test = build_datasets()
dataloader_train, dataloader_test = build_dataloaders(
    dataset_train=jobs_train,
    dataset_test=jobs_test,
    train_batch_size=20,
)

In [None]:
# Smoke test: push the first test batch through a freshly initialised model and
# print the tensor shapes. The batch is bound to src_batch/tgt_batch so the
# builtin input() is not shadowed.
for src_batch, tgt_batch in dataloader_test:
  print(src_batch.shape, tgt_batch.shape)
  model = create_model()
  # Only shapes are inspected, so no gradients are needed.
  with torch.no_grad():
    output = model(src_batch, tgt_batch)
  print(output.shape)
  break

torch.Size([22, 1]) torch.Size([166, 1])
torch.Size([166, 1, 503])


# Todo: Training and testing loops

In [None]:
# Indices of the special query-side symbols, looked up once for reuse below.
QUERY_SOS_INDEX, QUERY_EOS_INDEX, QUERY_PAD_INDEX = (
    query_word2idx[symbol] for symbol in ('<s>', '</s>', '<PAD>')
)

def padding_mask(idx_tensor):
  """Return a boolean mask of query <PAD> positions with axes 0 and 1 swapped.

  An entry is True where idx_tensor equals QUERY_PAD_INDEX.
  """
  is_pad = idx_tensor == QUERY_PAD_INDEX
  return torch.transpose(is_pad, 0, 1)

# Cross-entropy over query tokens; targets equal to the <PAD> index are ignored.
loss_fn = nn.CrossEntropyLoss(ignore_index=QUERY_PAD_INDEX)

In [None]:
def train(model, train_dataloader, num_epochs, device="cuda"):
  """Train `model` with teacher forcing and Adam, printing the mean loss after every epoch.

  Args:
    model: called as model(src, tgt_input); must return logits whose last axis
      is the vocabulary.
    train_dataloader: yields (src, tgt) index tensors; tgt is sliced along axis 0,
      which is treated as the time axis.
    num_epochs: number of passes over train_dataloader.
    device: device every batch is moved to.

  Returns:
    The same model object, after training.
  """
  # Create the optimizer once: re-creating it every epoch would throw away Adam's
  # running moment estimates and restart its bias correction.
  optimizer = torch.optim.Adam(model.parameters())
  for i in range(num_epochs):
    model.train()
    total_loss = 0
    for src, tgt in train_dataloader:
      src, tgt = src.to(device), tgt.to(device)
      # Teacher forcing: feed every token but the last, predict every token but the first.
      tgt_input = tgt[:-1, :]
      tgt_out = tgt[1:, :]

      optimizer.zero_grad()
      logits = model(src, tgt_input)

      # Flatten time and batch so each position is one classification example.
      loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
      loss.backward()

      optimizer.step()
      total_loss += loss.item()
    print((f"Epoch: {i}, Train loss: {total_loss / len(train_dataloader):.3f}"))
  return model

In [None]:
# This function is obsolete, NEVER USE. evaluate() below is the supported metric;
# this one scores teacher-forced predictions instead of greedily decoded ones.
def evaluate2(model, dataloader, device="cuda"):
    """Return the teacher-forced per-token accuracy over non-<PAD> target positions.

    The model is fed the gold prefix tgt[:-1] and its arg-max prediction at every
    position is compared with tgt[1:]. Works for any batch size.

    Args:
      model: called as model(src, tgt_input); returns logits whose last axis is
        the vocabulary.
      dataloader: yields (src, tgt) index tensors with time on axis 0.
      device: device every batch is moved to.

    Returns:
      correct / counted as a float, or 0.0 if no non-<PAD> position was seen.
    """
    accr = 0
    count = 0
    # Inference only, so skip autograd bookkeeping.
    with torch.no_grad():
        for src, tgt in dataloader:
            src, tgt = src.to(device), tgt.to(device)
            tgt_input = tgt[:-1, :]
            tgt_out = tgt[1:, :]

            logits = model(src, tgt_input)
            # Arg-max over the vocabulary axis. No softmax is applied: normalising
            # over a different axis (as before, dim=0) changes which token wins.
            prediction = torch.argmax(logits, dim=-1)

            # Score only real tokens; prediction and tgt_out share the same shape.
            keep = tgt_out != QUERY_PAD_INDEX
            accr += ((prediction == tgt_out) & keep).sum().item()
            count += keep.sum().item()
            if count:
                print(f'\rTest Accuracy: {accr / count} ', end='')
    print()
    return accr / count if count else 0.0

# This is the correct evaluation function
def evaluate(model, dataloader, device='cuda'):
  """Greedily decode every example and return the per-token accuracy.

  Decoding starts from <s> and repeatedly appends the arg-max token until </s>
  is produced or queries_max_target_len tokens have been generated. The
  prediction is then compared position by position with the target, stopping
  at the first <PAD> position of the target.

  The next token is read with .item() and decoding is seeded with a (1, 1)
  tensor, so the dataloader must yield batches of size 1.

  NOTE(review): positions are compared only while both sequences have tokens,
  and position 0 is <s> on both sides, so a prediction that stops early is not
  penalised for the tokens it omits. Confirm this is the intended metric.

  Args:
    model: object exposing encoder(src) and
      decode(labels, encoder_outputs, encoder_state, encoder_cell).
    dataloader: yields (src, tgt) index tensors with time on axis 0.
    device: device every batch is moved to.

  Returns:
    correct / compared as a float, or 0.0 if nothing was compared.
  """
  accr = 0
  total = 0
  was_training = model.training
  model.eval()
  try:
    # Inference only: skip autograd bookkeeping for the repeated decoder passes.
    with torch.no_grad():
      for src, tgt in dataloader:
        src, tgt = src.to(device), tgt.to(device)
        tgt_padding_mask = padding_mask(tgt)
        encoder_outputs, encoder_state, encoder_cell = model.encoder(src)
        ys = torch.ones(1, 1).fill_(QUERY_SOS_INDEX).type(torch.long).to(device)
        for _ in range(queries_max_target_len - 1):
          # The whole prefix is re-decoded each step; only the last position is used.
          out = model.decode(ys, encoder_outputs, encoder_state, encoder_cell)
          # Arg-max of the logits equals arg-max of their softmax, so no softmax is needed.
          next_word = torch.argmax(out[-1], dim=-1).item()

          ys = torch.cat([ys, torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
          if next_word == QUERY_EOS_INDEX:
            break

        pred = ys.reshape(-1).tolist()
        gold = tgt.reshape(-1).tolist()
        is_pad = tgt_padding_mask.reshape(-1).tolist()
        for p, t, m in zip(pred, gold, is_pad):
          if m:
            break
          if p == t:
            accr += 1
          total += 1
        if total:
          print(f'\rEvaluating: per-token accuracy: {accr / total}', end='')
  finally:
    # Leave the model in whatever mode the caller had it in.
    model.train(was_training)
  print()
  return accr / total if total else 0.0

# Run this!

Your outputs should look something like this (not exactly the same numbers, just in a similar ballpark and format).

```
Epoch: 1, Train loss: 4.590
Epoch: 2, Train loss: 1.871
Epoch: 3, Train loss: 1.424
...
Test Accuracy: 0.5195115804672241
```



In [None]:
def main():
    """Build the data pipeline, train for 25 epochs and print the test accuracy.

    Runs on CUDA when available and on the CPU otherwise.
    """
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda"

    train_set, test_set = build_datasets()
    train_loader, test_loader = build_dataloaders(train_set, test_set, train_batch_size=20)

    model = create_model()
    model.to(device)
    model = train(model, train_loader, num_epochs=25, device=device)

    test_accuracy = evaluate(model, test_loader, device=device)
    print(f'Test Accuracy: {test_accuracy}')

main()

Epoch: 0, Train loss: 4.167
Epoch: 1, Train loss: 2.893
Epoch: 2, Train loss: 2.318
Epoch: 3, Train loss: 1.727
Epoch: 4, Train loss: 1.322
Epoch: 5, Train loss: 1.094
Epoch: 6, Train loss: 0.985
Epoch: 7, Train loss: 0.930
Epoch: 8, Train loss: 0.873
Epoch: 9, Train loss: 0.823
Epoch: 10, Train loss: 0.813
Epoch: 11, Train loss: 0.781
Epoch: 12, Train loss: 0.724
Epoch: 13, Train loss: 0.695
Epoch: 14, Train loss: 0.670
Epoch: 15, Train loss: 0.643
Epoch: 16, Train loss: 0.607
Epoch: 17, Train loss: 0.621
Epoch: 18, Train loss: 0.617
Epoch: 19, Train loss: 0.562
Epoch: 20, Train loss: 0.543
Epoch: 21, Train loss: 0.557
Epoch: 22, Train loss: 0.545
Epoch: 23, Train loss: 0.528
Epoch: 24, Train loss: 0.525
Evaluating: per-token accuracy: 0.5654699049630412
Test Accuracy: 0.5654699049630412


Link to this colab: [https://colab.research.google.com/drive/1mj4U9LaUk_BF_8oNaX_tXNj6VqH0p-2v?usp=sharing](https://colab.research.google.com/drive/1mj4U9LaUk_BF_8oNaX_tXNj6VqH0p-2v?usp=sharing)