<a href="https://colab.research.google.com/github/vlordier/colabs/blob/main/NGL_NMT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (Colab-only helper module).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).




# Data


In [None]:
import torch.nn as nn
import os
# Work from the Drive notebook folder; later cells use relative paths such as 'data' and 'model.pt'.
os.chdir("/content/drive/MyDrive/Colab Notebooks")

class RNNModel(nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder.

    The encoder is a token embedding, the recurrent module is an LSTM/GRU/RNN
    stack and the decoder is a linear projection back to vocabulary logits.

    Args:
        rnn_type: one of 'LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU'.
        ntoken: vocabulary size (embedding rows and decoder outputs).
        ninp: embedding dimension.
        nhid: hidden units per recurrent layer.
        nlayers: number of stacked recurrent layers.
        dropout: dropout probability used on the embeddings, inside the
            recurrent stack and on the recurrent output.
        tie_weights: share the embedding matrix with the decoder weight.

    Raises:
        ValueError: if ``rnn_type`` is unknown, or if ``tie_weights`` is set
            while ``nhid != ninp``.
    """

    def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False):
        super().__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        if rnn_type in ['LSTM', 'GRU']:
            self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout)
        else:
            try:
                nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type]
            except KeyError:
                # Single-line message: the old triple-quoted literal leaked a
                # newline and its source indentation into the error text.
                raise ValueError(
                    "An invalid option for `--model` was supplied, "
                    "options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']") from None
            self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)

        # Optionally tie weights as in:
        # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
        # https://arxiv.org/abs/1608.05859
        # and
        # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
        # https://arxiv.org/abs/1611.01462
        if tie_weights:
            if nhid != ninp:
                raise ValueError('When using the tied flag, nhid must be equal to emsize')
            self.decoder.weight = self.encoder.weight

        self.init_weights()

        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers

    def init_weights(self):
        """Initialise embedding/decoder weights uniformly in [-0.1, 0.1] and zero the decoder bias."""
        initrange = 0.1
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)
        nn.init.zeros_(self.decoder.bias)
        # With tied weights this re-draws the shared matrix, as the original code did.
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, input, hidden):
        """Run one chunk through the model.

        Returns a tuple ``(logits, hidden)`` where ``logits`` keeps the first
        two dimensions of the recurrent output and has the decoder's output
        size as its last dimension.
        """
        emb = self.drop(self.encoder(input))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        # Flatten the two leading dimensions for the linear layer, then restore them.
        decoded = self.decoder(output.view(output.size(0) * output.size(1), output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden

    def init_hidden(self, bsz):
        """Return a zero hidden state for batch size ``bsz`` (a pair of tensors for LSTM)."""
        # Borrow dtype/device from an existing parameter.
        weight = next(self.parameters())
        if self.rnn_type == 'LSTM':
            return (weight.new_zeros(self.nlayers, bsz, self.nhid),
                    weight.new_zeros(self.nlayers, bsz, self.nhid))
        else:
            return weight.new_zeros(self.nlayers, bsz, self.nhid)

In [None]:
import os
from io import open
import torch

class Dictionary(object):
    """Two-way mapping between words and consecutive integer ids."""

    def __init__(self):
        # idx2word[i] is the word with id i; word2idx is the inverse lookup.
        self.idx2word = []
        self.word2idx = {}

    def add_word(self, word):
        """Return the id of ``word``, assigning the next free id if it is new."""
        try:
            return self.word2idx[word]
        except KeyError:
            new_id = len(self.idx2word)
            self.idx2word.append(word)
            self.word2idx[word] = new_id
            return new_id

    def __len__(self):
        """Number of distinct words registered so far."""
        return len(self.idx2word)


class Corpus(object):
    """Train/validation/test token-id tensors that share one Dictionary.

    Args:
        path: directory containing ``train.txt``, ``dev.txt`` and ``test.txt``.
    """

    def __init__(self, path):
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(self._validation_path(path))
        self.test = self.tokenize(os.path.join(path, 'test.txt'))

    @staticmethod
    def _validation_path(path):
        """Return the validation file path, preferring ``dev.txt``.

        The data-preparation cell of this notebook writes ``valid.txt``, so
        that name is accepted as a fallback when ``dev.txt`` is absent.
        """
        dev_path = os.path.join(path, 'dev.txt')
        valid_path = os.path.join(path, 'valid.txt')
        if not os.path.exists(dev_path) and os.path.exists(valid_path):
            return valid_path
        return dev_path

    def tokenize(self, path):
        """Tokenizes a text file.

        Every line is split on whitespace and terminated with ``<eos>``; unseen
        words are added to the shared dictionary. Returns a 1-D int64 tensor
        of word ids in file order.
        """
        chunks = []
        # Single pass: add_word returns the id, so no second read is needed.
        with open(path, 'r', encoding="utf8") as f:
            for line in f:
                words = line.split() + ['<eos>']
                line_ids = [self.dictionary.add_word(word) for word in words]
                chunks.append(torch.tensor(line_ids, dtype=torch.long))

        if not chunks:
            # torch.cat rejects an empty list; an empty file yields an empty tensor.
            return torch.empty(0, dtype=torch.long)
        return torch.cat(chunks)

In [None]:
# coding: utf-8
import argparse
import time
import math
import os
import torch
import torch.nn as nn
import torch.onnx

# import data
# import model

# parser = argparse.ArgumentParser(description='PyTorch Wikitext-2 RNN/LSTM Language Model')
# parser.add_argument('--data', type=str, default='./data',
#                     help='location of the data corpus')
# parser.add_argument('--model', type=str, default='LSTM',
#                     help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU)')
# parser.add_argument('--emsize', type=int, default=200,
#                     help='size of word embeddings')
# parser.add_argument('--nhid', type=int, default=200,
#                     help='number of hidden units per layer')
# parser.add_argument('--nlayers', type=int, default=2,
#                     help='number of layers')
# parser.add_argument('--lr', type=float, default=20,
#                     help='initial learning rate')
# parser.add_argument('--clip', type=float, default=0.25,
#                     help='gradient clipping')
# parser.add_argument('--epochs', type=int, default=40,
#                     help='upper epoch limit')
# parser.add_argument('--batch_size', type=int, default=20, metavar='N',
#                     help='batch size')
# parser.add_argument('--bptt', type=int, default=35,
#                     help='sequence length')
# parser.add_argument('--dropout', type=float, default=0.2,
#                     help='dropout applied to layers (0 = no dropout)')
# parser.add_argument('--tied', action='store_true',
#                     help='tie the word embedding and softmax weights')
# parser.add_argument('--seed', type=int, default=1111,
#                     help='random seed')
# parser.add_argument('--cuda', action='store_true',
#                     help='use CUDA')
# parser.add_argument('--log-interval', type=int, default=200, metavar='N',
#                     help='report interval')
# parser.add_argument('--save', type=str, default='model.pt',
#                     help='path to save the final model')
# parser.add_argument('--onnx-export', type=str, default='',
#                     help='path to export the final model in onnx format')
# args = parser.parse_args()

# Hyper-parameters for training; this dict stands in for the argparse
# namespace of the original command-line script.
args = {
    'data': 'data',
    'model': 'LSTM',
    'emsize': 200,
    'nhid': 200,
    'nlayers': 2,
    'lr': 20,
    'clip': 0.25,
    'epochs': 1,
    'batch_size': 20,
    'bptt': 35,
    'dropout': 0.2,
    'tied': True,
    'seed': 1111,
    'cuda': True,
    'log_interval': 200,
    'save': 'model.pt',
    'onnx_export': '',
}

# Set the random seed manually for reproducibility.
torch.manual_seed(args['seed'])
if torch.cuda.is_available():
    if not args['cuda']:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")
elif args['cuda']:
    print("WARNING: CUDA was requested but no CUDA device is available; falling back to CPU")

# Only select the GPU when one is actually present; otherwise every later
# .to(device) call would fail on a CPU-only runtime.
device = torch.device("cuda" if args['cuda'] and torch.cuda.is_available() else "cpu")

###############################################################################
# Load data
###############################################################################

# Tokenize the text files found under args['data'] into id tensors.
corpus = Corpus(args['data'])

# Starting from sequential data, batchify arranges the dataset into columns.
# For instance, with the alphabet as the sequence and batch size 4, we'd get
# ┌ a g m s ┐
# │ b h n t │
# │ c i o u │
# │ d j p v │
# │ e k q w │
# └ f l r x ┘.
# These columns are treated as independent by the model, which means that the
# dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient
# batch processing.

def batchify(data, bsz):
    """Fold a 1-D token stream into ``bsz`` columns and move it to ``device``.

    Tokens that do not fill a complete row are dropped from the end. The
    result has ``data.size(0) // bsz`` rows and ``bsz`` columns, each column
    being one contiguous stretch of the original stream.
    """
    rows = data.size(0) // bsz
    # Keep only the prefix that divides evenly into bsz parts.
    usable = data[:rows * bsz]
    columns = usable.view(bsz, -1).t()
    return columns.contiguous().to(device)

# Validation and test data use their own batch size, separate from training.
eval_batch_size = 10
train_data = batchify(corpus.train, args['batch_size'])
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

###############################################################################
# Build the model
###############################################################################

# The vocabulary size fixes both the embedding table and the output layer width.
ntokens = len(corpus.dictionary)
model = RNNModel(args['model'], ntokens, args['emsize'], args['nhid'], args['nlayers'], args['dropout'], args['tied']).to(device)

# Loss applied to the model's flattened per-position logits in train()/evaluate().
criterion = nn.CrossEntropyLoss()

###############################################################################
# Training code
###############################################################################

def repackage_hidden(h):
    """Detach a hidden state from its autograd history.

    ``h`` is either a tensor or an iterable of hidden states (e.g. the LSTM
    pair); iterables are processed recursively and returned as tuples.
    """
    if not isinstance(h, torch.Tensor):
        return tuple(repackage_hidden(part) for part in h)
    return h.detach()


# get_batch subdivides the source data into chunks of length args['bptt'].
# If source is equal to the example output of the batchify function, with
# a bptt-limit of 2, we'd get the following two Variables for i = 0:
# ┌ a g m s ┐ ┌ b h n t ┐
# └ b h n t ┘ └ c i o u ┘
# Note that despite the name of the function, the subdivision of data is not
# done along the batch dimension (i.e. dimension 1), since that was handled
# by the batchify function. The chunks are along dimension 0, corresponding
# to the seq_len dimension in the LSTM.

def get_batch(source, i):
    """Return the input chunk starting at row ``i`` and its next-token targets.

    The chunk spans at most ``args['bptt']`` rows and is shortened near the
    end so that a target row exists for every input row. Targets are the
    same rows shifted down by one, flattened to 1-D.
    """
    steps = min(args['bptt'], len(source) - 1 - i)
    stop = i + steps
    inputs = source[i:stop]
    targets = source[i + 1:stop + 1].view(-1)
    return inputs, targets


def evaluate(data_source):
    """Return the average loss of the global ``model`` over ``data_source``.

    Args:
        data_source: batchified 2-D tensor (rows are time steps, columns are
            independent streams), as produced by ``batchify``.

    Returns:
        Sum of chunk losses weighted by chunk length, divided by
        ``len(data_source) - 1`` (the number of rows that have a target).
    """
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.
    ntokens = len(corpus.dictionary)
    # Size the hidden state from the data itself so any batch width works,
    # not only data batchified with the global eval_batch_size.
    hidden = model.init_hidden(data_source.size(1))
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, args['bptt']):
            data, targets = get_batch(data_source, i)
            output, hidden = model(data, hidden)
            output_flat = output.view(-1, ntokens)
            # Weight each chunk's loss by its length: the last chunk may be shorter.
            total_loss += len(data) * criterion(output_flat, targets).item()
            hidden = repackage_hidden(hidden)
    return total_loss / (len(data_source) - 1)


def train():
    """Run one epoch of truncated-BPTT training over ``train_data``.

    Uses plain SGD with gradient-norm clipping. Reads the module-level
    ``model``, ``criterion``, ``args``, ``lr`` and ``epoch``, and prints a
    progress line every ``args['log_interval']`` batches.
    """
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args['batch_size'])
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args['bptt'])):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args['clip'])
        # Manual SGD step. The keyword form of add_ replaces the deprecated
        # add_(Number, Tensor) overload; parameters without a gradient are skipped.
        with torch.no_grad():
            for p in model.parameters():
                if p.grad is not None:
                    p.add_(p.grad, alpha=-lr)

        total_loss += loss.item()

        if batch % args['log_interval'] == 0 and batch > 0:
            cur_loss = total_loss / args['log_interval']
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // args['bptt'], lr,
                elapsed * 1000 / args['log_interval'], cur_loss, math.exp(cur_loss)))
            # Restart the running average and timer for the next logging window.
            total_loss = 0
            start_time = time.time()


def export_onnx(path, batch_size, seq_len):
    """Export the global ``model`` to ONNX at ``path``.

    The export is traced with an all-zero token tensor of shape
    ``(seq_len, batch_size)`` and a fresh hidden state.
    """
    # Report the path actually written, not the global config entry.
    print('The model is also exported in ONNX format at {}'.
          format(os.path.realpath(path)))
    model.eval()
    dummy_input = torch.zeros(seq_len, batch_size, dtype=torch.long, device=device)
    hidden = model.init_hidden(batch_size)
    torch.onnx.export(model, (dummy_input, hidden), path)


# Loop over epochs.
lr = args['lr']
best_val_loss = None

# At any point you can hit Ctrl + C to break out of training early.
try:
    for epoch in range(1, args['epochs']+1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        # Compare against None explicitly so a best loss of 0.0 is not mistaken for "unset".
        if best_val_loss is None or val_loss < best_val_loss:
            with open(args['save'], 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            lr /= 4.0
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

# Load the best saved model.
# NOTE(review): this unpickles a whole module; recent PyTorch releases default
# torch.load to weights_only=True, which may reject it - confirm against the
# installed version. Only load checkpoints you trust.
with open(args['save'], 'rb') as f:
    model = torch.load(f)
    # after load the rnn params are not a continuous chunk of memory
    # this makes them a continuous chunk, and will speed up forward pass
    model.rnn.flatten_parameters()

# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

if len(args['onnx_export']) > 0:
    # Export the model in ONNX format.
    export_onnx(args['onnx_export'], batch_size=1, seq_len=args['bptt'])

| epoch   1 |   200/  766 batches | lr 20.00 | ms/batch 25.97 | loss  8.06 | ppl  3176.30
| epoch   1 |   400/  766 batches | lr 20.00 | ms/batch 25.65 | loss  6.84 | ppl   932.36
| epoch   1 |   600/  766 batches | lr 20.00 | ms/batch 25.81 | loss  6.41 | ppl   606.93
-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 21.85s | valid loss  6.19 | valid ppl   490.14
-----------------------------------------------------------------------------------------
| End of training | test loss  6.18 | test ppl   484.93


# create_data.py

In [None]:
import sys
import csv
import os

# Raise the csv field size limit to the largest value so very long fields parse.
csv.field_size_limit(sys.maxsize)
# Index of the CSV column that is copied into the text files.
NAME_IND = 5
input_file = '/content/drive/MyDrive/Colab Notebooks/data/the_office/the_office_scripts.csv'
train_file = '/content/drive/MyDrive/Colab Notebooks/data/train.txt'
# NOTE(review): Corpus in this notebook looks for 'dev.txt' as the validation
# file, not 'valid.txt' - confirm which name is intended.
valid_file = '/content/drive/MyDrive/Colab Notebooks/data/valid.txt'
test_file = '/content/drive/MyDrive/Colab Notebooks/data/test.txt'

def get_num_lines(path=None):
    """Count the data rows (header excluded) of a CSV file.

    Args:
        path: CSV file to read; defaults to the module-level ``input_file``.

    Returns:
        Number of rows after the header; 0 for an empty file.
    """
    if path is None:
        path = input_file
    with open(path, encoding='ISO-8859-1') as csv_file:
        reader = csv.reader(csv_file)
        # Skip the header; the default avoids StopIteration on an empty file.
        next(reader, None)
        return sum(1 for _ in reader)

def create(num_lines):
    """Split column ``NAME_IND`` of ``input_file`` into train/valid/test text files.

    Rows are assigned by position: the first 80% go to ``train_file``, the
    next 10% to ``valid_file`` and the rest to ``test_file``. Each value is
    stripped and written on its own line.

    Args:
        num_lines: total number of data rows, as returned by ``get_num_lines``.
    """
    train_end = 0.8 * num_lines
    valid_end = 0.9 * num_lines
    # One with-statement closes every file even if a row fails midway. The
    # outputs are written as utf8 because Corpus reads them back as utf8.
    with open(input_file, encoding='ISO-8859-1') as csv_file, \
            open(train_file, 'w', encoding='utf8') as train, \
            open(valid_file, 'w', encoding='utf8') as valid, \
            open(test_file, 'w', encoding='utf8') as test:
        reader = csv.reader(csv_file)
        next(reader, None)  # skip the header row
        for i, row in enumerate(reader):
            if i < train_end:
                target = train
            elif i < valid_end:
                target = valid
            else:
                target = test
            target.write(row[NAME_IND].strip() + '\n')

def main():
    """Create the train/valid/test text files unless ``train_file`` already has content."""
    try:
        needs_build = os.stat(train_file).st_size == 0
    except FileNotFoundError:
        # A missing train file is exactly the case where the split must be built.
        needs_build = True

    if needs_build:
        print('Adding names to data files')
        num_lines = get_num_lines()
        create(num_lines)
    else:
        print('Already added names to data files')

if __name__ == '__main__':
    main()

Already added names to data files


# generate.py

In [None]:
import argparse

import torch

# import data

# parser = argparse.ArgumentParser(description='PyTorch Wikitext-2 Language Model')

# # Model parameters.
# parser.add_argument('--data', type=str, default='./data',
#                     help='location of the data corpus')
# parser.add_argument('--checkpoint', type=str, default='./model.pt',
#                     help='model checkpoint to use')
# parser.add_argument('--outf', type=str, default='generated.txt',
#                     help='output file for generated text')
# parser.add_argument('--words', type=int, default='1000',
#                     help='number of words to generate')
# parser.add_argument('--seed', type=int, default=1111,
#                     help='random seed')
# parser.add_argument('--cuda', action='store_true',
#                     help='use CUDA')
# parser.add_argument('--temperature', type=float, default=1.0,
#                     help='temperature - higher will increase diversity')
# parser.add_argument('--log-interval', type=int, default=100,
#                     help='reporting interval')
# args = parser.parse_args()

# Generation settings; this dict stands in for the argparse namespace of the
# original command-line script.
args = {
    'data': 'data',
    'checkpoint': './model.pt',
    'outf': 'generated.txt',
    'words': 1000,
    'seed': 1111,
    'cuda': True,
    'temperature': 1.0,
    'log_interval': 100,
}


# Set the random seed manually for reproducibility.
torch.manual_seed(args['seed'])
if torch.cuda.is_available():
    if not args['cuda']:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")
elif args['cuda']:
    print("WARNING: CUDA was requested but no CUDA device is available; falling back to CPU")

# Only select the GPU when one is actually present.
device = torch.device("cuda" if args['cuda'] and torch.cuda.is_available() else "cpu")

if args['temperature'] < 1e-3:
    # There is no argparse parser in the notebook, so raise directly instead
    # of calling the undefined parser.error().
    raise ValueError("--temperature has to be greater or equal 1e-3")

# NOTE(review): torch.load unpickles a whole module - only load checkpoints you trust.
with open(args['checkpoint'], 'rb') as f:
    # map_location lets a checkpoint saved on another device load onto `device`.
    model = torch.load(f, map_location=device).to(device)
model.eval()

corpus = Corpus(args['data'])
ntokens = len(corpus.dictionary)
hidden = model.init_hidden(1)
# Seed generation with one random token id.
# NOTE(review): this name shadows the builtin input() for the rest of the notebook.
input = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)

# Generated words may be non-ASCII, so write the file as utf8 (Corpus reads utf8 too).
with open(args['outf'], 'w', encoding='utf8') as outf:
    with torch.no_grad():  # no tracking history
        for i in range(args['words']):
            output, hidden = model(input, hidden)
            # Temperature-scaled, exponentiated logits act as unnormalised sampling weights.
            word_weights = output.squeeze().div(args['temperature']).exp().cpu()
            word_idx = torch.multinomial(word_weights, 1)[0].item()
            # Feed the sampled word back in as the next input.
            input.fill_(word_idx)
            word = corpus.dictionary.idx2word[word_idx]

            # Break the output into lines of 20 words.
            outf.write(word + ('\n' if i % 20 == 19 else ' '))

            if i % args['log_interval'] == 0:
                print('| Generated {}/{} words'.format(i, args['words']))

| Generated 0/1000 words
| Generated 100/1000 words
| Generated 200/1000 words
| Generated 300/1000 words
| Generated 400/1000 words
| Generated 500/1000 words
| Generated 600/1000 words
| Generated 700/1000 words
| Generated 800/1000 words
| Generated 900/1000 words


# NMT

## utils.py

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
CS224N 2018-19: Homework 5
nmt.py: NMT Model
Pencheng Yin <pcyin@cs.cmu.edu>
Sahil Chopra <schopra8@stanford.edu>
"""

import math
from typing import List

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

def pad_sents_char(sents, char_pad_token, max_word_length=21):
    """ Pad list of sentences according to the longest sentence in the batch and max_word_length.
    @param sents (list[list[list[int]]]): list of sentences, result of `words2charindices()`
        from `vocab.py`
    @param char_pad_token (int): index of the character-padding token
    @param max_word_length (int): number of characters every word is padded or
        truncated to (defaults to 21)
    @returns sents_padded (list[list[list[int]]]): list of sentences where sentences/words shorter
        than the max length sentence/word are padded out with the appropriate pad token, such that
        each sentence in the batch now has same number of words and each word has an equal
        number of characters. An empty batch yields an empty list.
        Output shape: (batch_size, max_sentence_length, max_word_length)
    """
    # default=0 keeps an empty batch from raising in max().
    max_sent_length = max((len(s) for s in sents), default=0)

    sents_padded = []
    for s in sents:
        words_padded = []
        for w in s:
            # Truncate long words, then right-pad short ones.
            chars = list(w[:max_word_length])
            chars.extend([char_pad_token] * (max_word_length - len(chars)))
            words_padded.append(chars)
        # Padding words are full-width vectors of pad characters (one fresh list each).
        words_padded.extend([char_pad_token] * max_word_length
                            for _ in range(max_sent_length - len(s)))
        sents_padded.append(words_padded)

    return sents_padded


def pad_sents(sents, pad_token):
    """ Pad list of sentences according to the longest sentence in the batch.
    @param sents (list[list[int]]): list of sentences, where each sentence
                                    is represented as a list of words
    @param pad_token (int): padding token
    @returns sents_padded (list[list[int]]): list of sentences where sentences shorter
        than the max length sentence are padded out with the pad_token, such that
        each sentences in the batch now has equal length. An empty batch yields
        an empty list.
        Output shape: (batch_size, max_sentence_length)
    """
    # default=0 keeps an empty batch from raising in max().
    max_len = max((len(s) for s in sents), default=0)
    return [list(s) + [pad_token] * (max_len - len(s)) for s in sents]

def read_corpus(file_path, source):
    """ Read file, where each sentence is delineated by a `\n`.
    @param file_path (str): path to file containing corpus
    @param source (str): "tgt" or "src" indicating whether text
        is of the source language or target language
    @returns data (list[list[str]]): one token list per line; target sentences
        are wrapped in `<s>` ... `</s>`
    """
    data = []
    # The with-block closes the file instead of leaving it to the garbage collector.
    with open(file_path) as corpus_file:
        for line in corpus_file:
            sent = line.strip().split(' ')
            # only append <s> and </s> to the target sentence
            if source == 'tgt':
                sent = ['<s>'] + sent + ['</s>']
            data.append(sent)

    return data

def read_corpus_nlg(file_path):
    """ Read a dialogue file with one utterance per line and pair consecutive lines.
    Every line after the first becomes a target (wrapped in `<s>` ... `</s>`)
    whose source is the tokenised previous line.
    @param file_path (str): path to file containing corpus
    @returns speakers (list[str]): distinct speaker names in order of first appearance;
        the speaker is the first token containing ':' with its last character removed
    @returns src (list[list[str]]): source sentences
    @returns tgt (list[list[str]]): target sentences
    """
    speakers = []
    src = []
    tgt = []
    prev_tokens = None
    with open(file_path) as corpus_file:
        for line in corpus_file:
            tokens = line.strip().split(' ')

            # Use the same speaker rule for every line, the first included.
            # A line without any ':' token registers no speaker instead of
            # running off the end of the token list.
            speaker_token = next((tok for tok in tokens if ':' in tok), None)
            if speaker_token is not None:
                speaker = speaker_token[:-1]
                if speaker not in speakers:
                    speakers.append(speaker)

            if prev_tokens is not None:
                # only append <s> and </s> to the target sentence
                tgt.append(['<s>'] + tokens + ['</s>'])
                src.append(prev_tokens)
            prev_tokens = tokens

    print(speakers)
    return speakers, src, tgt

def batch_iter(data, batch_size, shuffle=False):
    """ Generate mini-batches of (source sentences, target sentences).
    Within each batch the pairs are ordered by source length, longest first.
    @param data (list of (src_sent, tgt_sent)): list of tuples containing source and target sentence
    @param batch_size (int): batch size
    @param shuffle (boolean): whether to randomly shuffle the dataset
    """
    n_batches = math.ceil(len(data) / batch_size)
    order = list(range(len(data)))

    if shuffle:
        np.random.shuffle(order)

    def src_length(pair):
        return len(pair[0])

    for b in range(n_batches):
        start = b * batch_size
        chunk = [data[j] for j in order[start:start + batch_size]]
        # sorted() is stable, so pairs with equal source length keep their order.
        chunk = sorted(chunk, key=src_length, reverse=True)

        yield [pair[0] for pair in chunk], [pair[1] for pair in chunk]

## vocab.py

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
CS224N 2018-19: Homework 5
vocab.py: Vocabulary Generation
Pencheng Yin <pcyin@cs.cmu.edu>
Sahil Chopra <schopra8@stanford.edu>

Usage:
    vocab.py --train-src=<file> --train-tgt=<file> [options] VOCAB_FILE

Options:
    -h --help                  Show this screen.
    --train-src=<file>         File of training source sentences
    --train-tgt=<file>         File of training target sentences
    --size=<int>               vocab size [default: 50000]
    --freq-cutoff=<int>        frequency cutoff [default: 2]
"""

from collections import Counter
from docopt import docopt
from itertools import chain
import json
import torch
from typing import List
# from utils import read_corpus, pad_sents, pad_sents_char

class VocabEntry(object):
    """ Vocabulary Entry, i.e. structure containing either
    src or tgt language terms, together with a fixed character vocabulary.
    """
    def __init__(self, word2id=None):
        """ Init VocabEntry Instance.
        @param word2id (dict): dictionary mapping words 2 indices; when omitted
            (or empty) a mapping holding only the four special tokens is created
        """
        if word2id:
            self.word2id = word2id
        else:
            self.word2id = dict()
            self.word2id['<pad>'] = 0   # Pad Token
            self.word2id['<s>'] = 1 # Start Token
            self.word2id['</s>'] = 2    # End Token
            self.word2id['<unk>'] = 3   # Unknown Token
        self.unk_id = self.word2id['<unk>']
        self.id2word = {v: k for k, v in self.word2id.items()}

        ## Additions to the A4 code:
        self.char_list = list("""ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]""")
        self.char_list.append('\t')
        self.char2id = dict() # Converts characters to integers
        self.char2id['<pad>'] = 0
        self.char2id['{'] = 1
        self.char2id['}'] = 2
        self.char2id['<unk>'] = 3
        # Each listed character takes the next free id after the four specials.
        for c in self.char_list:
            self.char2id[c] = len(self.char2id)
        self.char_unk = self.char2id['<unk>']
        self.start_of_word = self.char2id["{"]
        self.end_of_word = self.char2id["}"]
        assert self.start_of_word+1 == self.end_of_word

        self.id2char = {v: k for k, v in self.char2id.items()} # Converts integers to characters
        ## End additions to the A4 code

    def __getitem__(self, word):
        """ Retrieve word's index. Return the index for the unk
        token if the word is out of vocabulary.
        @param word (str): word to look up.
        @returns index (int): index of word
        """
        return self.word2id.get(word, self.unk_id)

    def __contains__(self, word):
        """ Check if word is captured by VocabEntry.
        @param word (str): word to look up
        @returns contains (bool): whether word is contained
        """
        return word in self.word2id

    def __setitem__(self, key, value):
        """ Raise error, if one tries to edit the VocabEntry.
        """
        raise ValueError('vocabulary is readonly')

    def __len__(self):
        """ Compute number of words in VocabEntry.
        @returns len (int): number of words in VocabEntry
        """
        return len(self.word2id)

    def __repr__(self):
        """ Representation of VocabEntry to be used
        when printing the object.
        """
        return 'Vocabulary[size=%d]' % len(self)

    # NOTE(review): __init__ assigns an instance attribute with this same name
    # (the id -> word dict), which hides this method on instances; kept so the
    # class-level attribute is unchanged.
    def id2word(self, wid):
        """ Return mapping of index to word.
        @param wid (int): word index
        @returns word (str): word corresponding to index
        """
        return self.id2word[wid]

    def add(self, word):
        """ Add word to VocabEntry, if it is previously unseen.
        @param word (str): word to add to VocabEntry
        @return index (int): index that the word has been assigned
        """
        if word not in self:
            wid = self.word2id[word] = len(self)
            self.id2word[wid] = word
            return wid
        else:
            return self[word]

    def words2charindices(self, sents):
        """ Convert list of sentences of words into list of list of list of character indices.
        Every word is wrapped in the start-of-word and end-of-word indices;
        characters missing from the character vocabulary map to `char_unk`.
        @param sents (list[list[str]]): sentence(s) in words
        @return word_ids (list[list[list[int]]]): sentence(s) in indices
        """
        char2id = self.char2id
        unk = self.char_unk
        # .get() with the unk id keeps out-of-alphabet characters (e.g.
        # accented letters) from raising KeyError.
        return [[[self.start_of_word]
                 + [char2id.get(c, unk) for c in w]
                 + [self.end_of_word]
                 for w in s]
                for s in sents]

    def words2indices(self, sents):
        """ Convert list of sentences of words into list of list of indices.
        @param sents (list[list[str]]): sentence(s) in words
        @return word_ids (list[list[int]]): sentence(s) in indices
        """
        return [[self[w] for w in s] for s in sents]

    def indices2words(self, word_ids):
        """ Convert list of indices into words.
        @param word_ids (list[int]): list of word ids
        @return sents (list[str]): list of words
        """
        return [self.id2word[w_id] for w_id in word_ids]

    def to_input_tensor_char(self, sents: List[List[str]], device: torch.device) -> torch.Tensor:
        """ Convert list of sentences (words) into tensor with necessary padding for
        shorter sentences.

        @param sents (List[List[str]]): list of sentences (words)
        @param device: device on which to load the tensor, i.e. CPU or GPU

        @returns sents_var: tensor of (max_sentence_length, batch_size, max_word_length)
        """
        char_ids = self.words2charindices(sents)
        padded = pad_sents_char(char_ids, self.char2id['<pad>'])
        tensor = torch.tensor(padded, dtype=torch.long, device=device)
        # Padded data is batch-major; swap the first two axes to sentence-position-major.
        return tensor.permute(1, 0, 2)

    def to_input_tensor(self, sents: List[List[str]], device: torch.device) -> torch.Tensor:
        """ Convert list of sentences (words) into tensor with necessary padding for
        shorter sentences.

        @param sents (List[List[str]]): list of sentences (words)
        @param device: device on which to load the tensor, i.e. CPU or GPU

        @returns sents_var: tensor of (max_sentence_length, batch_size)
        """
        word_ids = self.words2indices(sents)
        sents_t = pad_sents(word_ids, self['<pad>'])
        sents_var = torch.tensor(sents_t, dtype=torch.long, device=device)
        return torch.t(sents_var)

    @staticmethod
    def from_corpus(corpus, size, freq_cutoff=2):
        """ Given a corpus construct a Vocab Entry.
        @param corpus (list[str]): corpus of text produced by read_corpus function
        @param size (int): # of words in vocabulary
        @param freq_cutoff (int): if word occurs n < freq_cutoff times, drop the word
        @returns vocab_entry (VocabEntry): VocabEntry instance produced from provided corpus
        """
        vocab_entry = VocabEntry()
        word_freq = Counter(chain(*corpus))
        valid_words = [w for w, v in word_freq.items() if v >= freq_cutoff]
        print('number of word types: {}, number of word types w/ frequency >= {}: {}'
              .format(len(word_freq), freq_cutoff, len(valid_words)))
        # Most frequent first; at most `size` words are kept.
        top_k_words = sorted(valid_words, key=lambda w: word_freq[w], reverse=True)[:size]
        for word in top_k_words:
            vocab_entry.add(word)
        return vocab_entry


class Vocab(object):
    """ Vocab encapsulating src and target languages.

    Holds one VocabEntry per language and can persist both word-to-index
    mappings to, and restore them from, a single JSON file.
    """
    def __init__(self, src_vocab: 'VocabEntry', tgt_vocab: 'VocabEntry'):
        """ Init Vocab.
        @param src_vocab (VocabEntry): VocabEntry for source language
        @param tgt_vocab (VocabEntry): VocabEntry for target language
        """
        self.src = src_vocab
        self.tgt = tgt_vocab

    @staticmethod
    def build(src_sents, tgt_sents, vocab_size, freq_cutoff) -> 'Vocab':
        """ Build Vocabulary.
        @param src_sents (list[str]): Source sentences provided by read_corpus() function
        @param tgt_sents (list[str]): Target sentences provided by read_corpus() function
        @param vocab_size (int): Size of vocabulary for both source and target languages
        @param freq_cutoff (int): if word occurs n < freq_cutoff times, drop the word.
        @returns Vocab object holding one VocabEntry per language
        """

        # The corpora are expected to be parallel (one target per source sentence).
        assert len(src_sents) == len(tgt_sents)

        print('initialize source vocabulary ..')
        src = VocabEntry.from_corpus(src_sents, vocab_size, freq_cutoff)

        print('initialize target vocabulary ..')
        tgt = VocabEntry.from_corpus(tgt_sents, vocab_size, freq_cutoff)

        return Vocab(src, tgt)

    def save(self, file_path):
        """ Save Vocab to file as JSON dump.
        @param file_path (str): file path to vocab file
        """
        payload = dict(src_word2id=self.src.word2id, tgt_word2id=self.tgt.word2id)
        # The context manager guarantees the file is flushed and closed even
        # if serialisation fails half-way.
        with open(file_path, 'w', encoding='utf-8') as vocab_file:
            json.dump(payload, vocab_file, indent=2)

    @staticmethod
    def load(file_path):
        """ Load vocabulary from JSON dump.
        @param file_path (str): file path to vocab file
        @returns Vocab object loaded from JSON dump
        """
        with open(file_path, 'r', encoding='utf-8') as vocab_file:
            entry = json.load(vocab_file)
        src_word2id = entry['src_word2id']
        tgt_word2id = entry['tgt_word2id']

        return Vocab(VocabEntry(src_word2id), VocabEntry(tgt_word2id))

    def __repr__(self):
        """ Representation of Vocab to be used
        when printing the object.
        """
        return 'Vocab(source %d words, target %d words)' % (len(self.src), len(self.tgt))



if __name__ == '__main__':
    # Settings are hard-coded here instead of being parsed with docopt(__doc__).
    args = {
        'train-src': 'en_es_data/train.es',
        'train-tgt': 'en_es_data/train.en',
        'size': 200,
        'freq-cutoff': 1,
        'VOCAB_FILE': 'vocab.json',
    }

    for label, key in (('source', 'train-src'), ('target', 'train-tgt')):
        print('read in ' + label + ' sentences: %s' % args[key])

    # Load both sides of the parallel corpus.
    src_sents = read_corpus(args['train-src'], source='src')
    tgt_sents = read_corpus(args['train-tgt'], source='tgt')

    # Build the per-language vocabularies and report their sizes.
    vocab = Vocab.build(src_sents, tgt_sents, int(args['size']), int(args['freq-cutoff']))
    print('generated vocabulary, source %d words, target %d words' % (len(vocab.src), len(vocab.tgt)))

    # Persist the vocabulary as JSON.
    vocab.save(args['VOCAB_FILE'])
    print('vocabulary saved to %s' % args['VOCAB_FILE'])

read in source sentences: en_es_data/train.es
read in target sentences: en_es_data/train.en
initialize source vocabulary ..
number of word types: 14256, number of word types w/ frequency >= 1: 14256
initialize target vocabulary ..
number of word types: 17724, number of word types w/ frequency >= 1: 17724
generated vocabulary, source 204 words, target 202 words
vocabulary saved to vocab.json


## cnn.py

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
CS224N 2018-19: Homework 5
"""

### YOUR CODE HERE for part 1i
from collections import namedtuple
import sys
from typing import List, Tuple, Dict, Set, Union
import torch
import torch.nn as nn
import torch.nn.utils
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence

class CNN(nn.Module):
    """ 1-D convolution followed by ReLU and a max over the last dimension. """

    def __init__(self, e_char, filters, kernel_size=5):
        """ Init the convolutional layer.

        @param e_char (int): number of input channels of the convolution
        @param filters (int): number of output channels of the convolution
        @param kernel_size (int): width of the convolution window
        """
        super(CNN, self).__init__()

        self.e_char, self.filters, self.kernel_size = e_char, filters, kernel_size
        self.conv_layer = nn.Conv1d(
            in_channels=e_char,
            out_channels=filters,
            kernel_size=kernel_size,
            bias=True,
        )

    def forward(self, x_reshaped) -> torch.Tensor:
        """ Convolve, rectify and pool the input.

        @param x_reshaped (Tensor): input accepted by nn.Conv1d with e_char channels
        @returns x_conv_out (Tensor): the rectified convolution output reduced
                                      with a max over dimension 2
        """
        activated = F.relu(self.conv_layer(x_reshaped))
        pooled, _ = torch.max(activated, dim=2)
        return pooled

### END YOUR CODE

## highway.py

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
CS224N 2018-19: Homework 5
"""

### YOUR CODE HERE for part 1h
from collections import namedtuple
import sys
from typing import List, Tuple, Dict, Set, Union
import torch
import torch.nn as nn
import torch.nn.utils
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence

class Highway(nn.Module):
    """ Highway layer: a learned gate mixes a ReLU projection of the input
    with the unchanged input (skip connection).
    """

    def __init__(self, e_word):
        """ Init the projection and gate layers.

        @param e_word (int): size of the input and of the output vectors
        """
        super(Highway, self).__init__()

        self.e_word = e_word

        self.proj = nn.Linear(self.e_word, self.e_word, bias=True)
        self.gate = nn.Linear(self.e_word, self.e_word, bias=True)

    def forward(self, x_conv_out) -> torch.Tensor:
        """ Apply the highway transformation.

        @param x_conv_out (Tensor): input whose last dimension is e_word
        @returns x_highway (Tensor): gate * relu(proj(x)) + (1 - gate) * x,
                                     same shape as the input
        """
        x_gate = torch.sigmoid(self.gate(x_conv_out))
        x_proj = F.relu(self.proj(x_conv_out))
        # The carry term must be the *input*: weighting x_proj on both sides
        # would cancel the gate and reduce the layer to a plain ReLU projection.
        x_highway = x_gate * x_proj + (1 - x_gate) * x_conv_out
        return x_highway

### END YOUR CODE 

## model_embeddings.py

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
CS224N 2018-19: Homework 5
model_embeddings.py: Embeddings for the NMT model
Pencheng Yin <pcyin@cs.cmu.edu>
Sahil Chopra <schopra8@stanford.edu>
Anand Dhoot <anandd@stanford.edu>
Michael Hahn <mhahn2@stanford.edu>
"""

import torch.nn as nn

# Do not change these imports; your module names should be
#   `CNN` in the file `cnn.py`
#   `Highway` in the file `highway.py`
# Uncomment the following two imports once you're ready to run part 1(j)

# from cnn import CNN
# from highway import Highway

# End "do not change" 

class ModelEmbeddings(nn.Module):
    """
    Class that converts input words to their CNN-based embeddings.
    """
    def __init__(self, embed_size, vocab):
        """
        Init the Embedding layer for one language
        @param embed_size (int): Embedding size (dimensionality) for the output
        @param vocab (VocabEntry): VocabEntry object. See vocab.py for documentation.
        """
        super(ModelEmbeddings, self).__init__()

        pad_token_idx = vocab.char2id['<pad>']
        self.e_char = 50  # size of the character embeddings fed to the CNN
        self.embed_size = embed_size
        self.vocab = vocab

        self.embeddings = nn.Embedding(len(self.vocab.char2id), self.e_char, padding_idx=pad_token_idx)
        self.cnn = CNN(self.e_char, self.embed_size)
        self.highway = Highway(self.embed_size)
        self.dropout = nn.Dropout(0.3)

    def forward(self, input):
        """
        Looks up character-based CNN embeddings for the words in a batch of sentences.
        @param input: Tensor of integers of shape (sentence_length, batch_size, max_word_length) where
            each integer is an index into the character vocabulary

        @returns output: Tensor of shape (sentence_length, batch_size, embed_size), containing the
            CNN-based embeddings for each word of the sentences in the batch
        """
        sentence_length = input.size(0)
        batch_size = input.size(1)

        # (sentence_length, batch_size, max_word_length, e_char)
        char_emb = self.embeddings(input)
        # Put the embedding axis before the character axis, as expected by
        # the convolution; contiguous() is required before view().
        char_emb = char_emb.permute(0, 1, 3, 2).contiguous()
        # Fold sentence and batch axes together: every word is one CNN sample.
        words = char_emb.view(-1, char_emb.size(2), char_emb.size(3))

        # Call the sub-modules themselves (not .forward) so that any hooks
        # registered on them are executed.
        x_conv_out = self.cnn(words)
        x_highway = self.highway(x_conv_out)
        x_word_emb = self.dropout(x_highway)

        # Restore the (sentence_length, batch_size) layout.
        return x_word_emb.view(sentence_length, batch_size, self.embed_size)

## char_decoder.py

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
CS224N 2018-19: Homework 5
"""

import torch
import torch.nn as nn

class CharDecoder(nn.Module):
    """ Character-level LSTM decoder that spells out words one character at a time. """

    def __init__(self, hidden_size, char_embedding_size=50, target_vocab=None):
        """ Init Character Decoder.

        @param hidden_size (int): Hidden size of the decoder LSTM
        @param char_embedding_size (int): dimensionality of character embeddings
        @param target_vocab (VocabEntry): vocabulary for the target language. See vocab.py for documentation.
                                          Must be provided: its char2id mapping sizes the layers below.
        """
        super(CharDecoder, self).__init__()

        self.target_vocab = target_vocab
        self.charDecoder = nn.LSTM(char_embedding_size, hidden_size, bidirectional=False)
        self.char_output_projection = nn.Linear(hidden_size, len(self.target_vocab.char2id), bias=True)
        pad_token_idx = self.target_vocab.char2id['<pad>']
        self.decoderCharEmb = nn.Embedding(len(self.target_vocab.char2id), char_embedding_size, padding_idx=pad_token_idx)

    def forward(self, input, dec_hidden=None):
        """ Forward pass of character decoder.

        @param input: tensor of integers, shape (length, batch)
        @param dec_hidden: internal state of the LSTM before reading the input characters. A tuple of two tensors of shape (1, batch, hidden_size)

        @returns scores: called s_t in the PDF, shape (length, batch, len(target_vocab.char2id))
        @returns dec_hidden: internal state of the LSTM after reading the input characters. A tuple of two tensors of shape (1, batch, hidden_size)
        """
        char_embeddings = self.decoderCharEmb(input)
        # nn.LSTM returns (per-step outputs, (h_n, c_n)); the scores are
        # computed from the per-step outputs, the state tuple is passed on.
        outputs, dec_hidden = self.charDecoder(char_embeddings, dec_hidden)
        scores = self.char_output_projection(outputs)

        return (scores, dec_hidden)

    def train_forward(self, char_sequence, dec_hidden=None):
        """ Forward computation during training.

        @param char_sequence: tensor of integers, shape (length, batch). Note that "length" here and in forward() need not be the same.
        @param dec_hidden: initial internal state of the LSTM, obtained from the output of the word-level decoder. A tuple of two tensors of shape (1, batch, hidden_size)

        @returns The cross-entropy loss, computed as the *sum* of cross-entropy losses of all the words in the batch.
        """
        # Inputs are x_1 .. x_n, targets are the same sequence shifted by one.
        inputs = char_sequence[:-1]
        targets = char_sequence[1:]

        scores, _ = self.forward(inputs, dec_hidden)

        # Padding characters must not contribute to the loss.
        loss_fn = nn.CrossEntropyLoss(reduction='sum', ignore_index=self.target_vocab.char2id['<pad>'])
        return loss_fn(scores.reshape(-1, scores.size(2)), targets.reshape(-1))

    def decode_greedy(self, initialStates, device, max_length=21):
        """ Greedy decoding
        @param initialStates: initial internal state of the LSTM, a tuple of two tensors of size (1, batch, hidden_size)
        @param device: torch.device (indicates whether the model is on CPU or GPU)
        @param max_length: maximum length of words to decode

        @returns decodedWords: a list (of length batch) of strings, each of which has length <= max_length.
                              The decoded strings should NOT contain the start-of-word and end-of-word characters.
        """
        batch_size = initialStates[0].size(1)
        decoded_ids = [[] for _ in range(batch_size)]

        # Built once; afterwards the argmax output is already a tensor on the
        # right device and is fed back directly (no torch.tensor re-wrapping).
        current_chars = torch.tensor([[self.target_vocab.start_of_word] * batch_size],
                                     dtype=torch.long, device=device)
        dec_state = initialStates
        for _ in range(max_length):
            scores, dec_state = self.forward(current_chars, dec_hidden=dec_state)
            # Softmax is monotonic, so the argmax of the raw scores is the
            # most probable character.
            current_chars = torch.argmax(scores, dim=2)
            for word_ids, char_id in zip(decoded_ids, current_chars[0].tolist()):
                word_ids.append(char_id)

        end_id = self.target_vocab.end_of_word
        pad_id = self.target_vocab.char2id['<pad>']
        id2char = self.target_vocab.id2char

        output_words = []
        for word_ids in decoded_ids:
            chars = []
            for char_id in word_ids:
                # Truncate at the first end-of-word (or padding) character.
                if char_id == end_id or char_id == pad_id:
                    break
                chars.append(id2char[char_id])
            output_words.append(''.join(chars))

        return output_words

## nmt_model.py

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
CS224N 2018-19: Homework 5
nmt_model.py: NMT Model
Pencheng Yin <pcyin@cs.cmu.edu>
Sahil Chopra <schopra8@stanford.edu>
"""
from collections import namedtuple
import sys
from typing import List, Tuple, Dict, Set, Union
import torch
import torch.nn as nn
import torch.nn.utils
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
import random

# from model_embeddings import ModelEmbeddings
# from char_decoder import CharDecoder

# One beam-search result: `value` is the decoded target sentence (a list of
# words) and `score` is its log-likelihood.
Hypothesis = namedtuple('Hypothesis', ['value', 'score'])

import random

class NMT(nn.Module):
    """ Simple Neural Machine Translation Model:
        - Bidrectional LSTM Encoder
        - Unidirection LSTM Decoder
        - Global Attention Model (Luong, et al. 2015)
    """
    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2, no_char_decoder=False):
        """ Build all sub-modules of the NMT model.

        @param embed_size (int): Embedding size (dimensionality)
        @param hidden_size (int): Hidden Size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for documentation.
        @param dropout_rate (float): Dropout probability, for attention
        @param no_char_decoder (bool): when True no CharDecoder is created and
                                       self.charDecoder is set to None
        """
        super(NMT, self).__init__()

        # One embedding module per language.
        self.model_embeddings_source = ModelEmbeddings(embed_size, vocab.src)
        self.model_embeddings_target = ModelEmbeddings(embed_size, vocab.tgt)

        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        # The encoder is bidirectional, so its outputs are twice as wide.
        enc_out_size = 2 * hidden_size

        self.encoder = nn.LSTM(embed_size, hidden_size, bidirectional=True)
        # The decoder input is the word embedding concatenated with the
        # previous combined output.
        self.decoder = nn.LSTMCell(embed_size + hidden_size, hidden_size)

        self.h_projection = nn.Linear(enc_out_size, hidden_size, bias=False)
        self.c_projection = nn.Linear(enc_out_size, hidden_size, bias=False)
        self.att_projection = nn.Linear(enc_out_size, hidden_size, bias=False)
        self.combined_output_projection = nn.Linear(enc_out_size + hidden_size, hidden_size, bias=False)
        self.target_vocab_projection = nn.Linear(hidden_size, len(vocab.tgt), bias=False)
        self.dropout = nn.Dropout(self.dropout_rate)

        self.charDecoder = None if no_char_decoder else CharDecoder(hidden_size, target_vocab=vocab.tgt)

    def forward(self, source: List[List[str]], target: List[List[str]]) -> torch.Tensor:
        """ Take a mini-batch of source and target sentences, compute the log-likelihood of
        target sentences under the language models learned by the NMT system.

        @param source (List[List[str]]): list of source sentence tokens
        @param target (List[List[str]]): list of target sentence tokens, wrapped by `<s>` and `</s>`

        @returns scores (Tensor): scalar tensor: the word-level log-likelihood of the gold-standard
                                    target sentences summed over the whole batch, minus the
                                    character-decoder loss when a character decoder is present.
        """
        # Compute sentence lengths
        source_lengths = [len(s) for s in source]

        # Character-level tensors feed the encoder / decoder; the word-level
        # target tensor is kept to score the predictions. Target sentences
        # are encoded with the *target* vocabulary entry.
        source_padded_chars = self.vocab.src.to_input_tensor_char(source, device=self.device)
        target_padded_chars = self.vocab.tgt.to_input_tensor_char(target, device=self.device).contiguous()
        target_padded = self.vocab.tgt.to_input_tensor(target, device=self.device)

        enc_hiddens, dec_init_state = self.encode(source_padded_chars, source_lengths)
        enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths)
        combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state, target_padded_chars)

        P = F.log_softmax(self.target_vocab_projection(combined_outputs), dim=-1)

        # Zero out, probabilities for which we have nothing in the target text
        target_masks = (target_padded != self.vocab.tgt['<pad>']).float()

        # Compute log probability of generating true target words
        target_gold_words_log_prob = torch.gather(P, index=target_padded[1:].unsqueeze(-1), dim=-1).squeeze(-1) * target_masks[1:]
        scores = target_gold_words_log_prob.sum()

        if self.charDecoder is not None:
            max_word_len = target_padded_chars.shape[-1]

            # One row per target word (the first position of every sentence is dropped).
            target_chars = target_padded_chars[1:].reshape(-1, max_word_len)
            # Use the configured hidden size rather than a hard-coded width.
            target_outputs = combined_outputs.view(-1, self.hidden_size)

            # The combined output of each word initialises both the hidden
            # and the cell state of the character decoder.
            char_init_state = (target_outputs.unsqueeze(0), target_outputs.unsqueeze(0))
            oovs_losses = self.charDecoder.train_forward(target_chars.t(), char_init_state)
            scores = scores - oovs_losses

        return scores


    def encode(self, source_padded: torch.Tensor, source_lengths: List[int]) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """ Run the encoder over the source batch and derive the decoder's initial state.

        @param source_padded (Tensor): Tensor of padded source sentences with shape (src_len, b, max_word_length), where
                                        b = batch_size, src_len = maximum source sentence length. Note that
                                       these have already been sorted in order of longest to shortest sentence.
        @param source_lengths (List[int]): List of actual lengths for each of the source sentences in the batch
        @returns enc_hiddens (Tensor): Tensor of hidden units with shape (b, src_len, h*2), where
                                        b = batch size, src_len = maximum source sentence length, h = hidden size.
        @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors representing the decoder's initial
                                                hidden state and cell.
        """
        embedded = self.model_embeddings_source(source_padded)

        # Pack so the LSTM skips padded positions, then unpack its output.
        packed_output, (final_hidden, final_cell) = self.encoder(
            pack_padded_sequence(embedded, source_lengths))
        padded_output, _ = pad_packed_sequence(packed_output)
        # Swap the first two axes of the padded output.
        enc_hiddens = padded_output.transpose(0, 1)

        # Concatenate the final states of the two directions and project them.
        both_hidden = torch.cat((final_hidden[0], final_hidden[1]), dim=1)
        both_cell = torch.cat((final_cell[0], final_cell[1]), dim=1)
        dec_init_state = (self.h_projection(both_hidden), self.c_projection(both_cell))

        return enc_hiddens, dec_init_state


    def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor,
                dec_init_state: Tuple[torch.Tensor, torch.Tensor], target_padded: torch.Tensor) -> torch.Tensor:
        """Run the decoder over the whole target batch, one time step at a time.
        @param enc_hiddens (Tensor): Hidden states (b, src_len, h*2), where
                                     b = batch size, src_len = maximum source sentence length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks (b, src_len), where
                                     b = batch size, src_len = maximum source sentence length.
        @param dec_init_state (tuple(Tensor, Tensor)): Initial state and cell for decoder
        @param target_padded (Tensor): Gold-standard padded target sentences (tgt_len, b, max_word_length), where
                                       tgt_len = maximum target sentence length, b = batch size.
        @returns combined_outputs (Tensor): combined output tensor  (tgt_len, b,  h), where
                                        tgt_len = maximum target sentence length, b = batch_size,  h = hidden size
        """
        # Drop the last position: nothing has to be predicted after it.
        decoder_inputs = target_padded[:-1]

        # Attention projection of the encoder states is the same at every step.
        enc_hiddens_proj = self.att_projection(enc_hiddens)
        Y = self.model_embeddings_target(decoder_inputs)

        # The previous combined output o_{t-1} starts as zeros.
        o_prev = torch.zeros(enc_hiddens.size(0), self.hidden_size, device=self.device)
        dec_state = dec_init_state

        step_outputs = []
        for Y_slice in torch.split(Y, 1, dim=0):
            # Feed the current word embedding together with o_{t-1}.
            Ybar_t = torch.cat([Y_slice.squeeze(0), o_prev], dim=-1)
            dec_state, o_prev, _ = self.step(Ybar_t, dec_state, enc_hiddens, enc_hiddens_proj, enc_masks)
            step_outputs.append(o_prev)

        return torch.stack(step_outputs)


    def step(self, Ybar_t: torch.Tensor,
            dec_state: Tuple[torch.Tensor, torch.Tensor],
            enc_hiddens: torch.Tensor,
            enc_hiddens_proj: torch.Tensor,
            enc_masks: torch.Tensor) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
        """ Compute one forward step of the LSTM decoder, including the attention computation.
        @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder,
                                where b = batch size, e = embedding size, h = hidden size.
        @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's prev hidden state, second tensor is decoder's prev cell.
        @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size,
                                    src_len = maximum source length, h = hidden size.
        @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h),
                                    where b = batch size, src_len = maximum source length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len),
                                    where b = batch size, src_len is maximum source length.
                                    Non-zero entries mark padded source positions; may be None.
        @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's new hidden state, second tensor is decoder's new cell.
        @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size.
        @returns e_t (Tensor): Tensor of shape (b, src_len). It is attention scores distribution
                                (masked positions hold -inf).
                                Note: You will not use this outside of this function.
                                      We are simply returning this value so that we can sanity check
                                      your implementation.
        """
        dec_state = self.decoder(Ybar_t, dec_state)
        dec_hidden, _ = dec_state

        # Attention scores: dot product of every projected encoder state with
        # the new decoder hidden state.
        e_t = torch.bmm(enc_hiddens_proj, dec_hidden.unsqueeze(2)).squeeze(2)

        # Set e_t to -inf where enc_masks has 1. masked_fill needs a boolean
        # mask (uint8 masks are deprecated / rejected by current PyTorch), and
        # the out-of-place form avoids editing `.data` behind autograd's back.
        if enc_masks is not None:
            e_t = e_t.masked_fill(enc_masks.bool(), -float('inf'))

        alpha_t = F.softmax(e_t, dim=-1)
        # Attention output: weighted sum of the encoder hidden states.
        a_t = torch.bmm(alpha_t.unsqueeze(1), enc_hiddens).squeeze(1)
        U_t = torch.cat([dec_hidden, a_t], 1)
        V_t = self.combined_output_projection(U_t)
        combined_output = self.dropout(torch.tanh(V_t))

        return dec_state, combined_output, e_t

    def generate_sent_masks(self, enc_hiddens: torch.Tensor, source_lengths: List[int]) -> torch.Tensor:
        """ Build a float mask flagging the padded positions of every source sentence.

        @param enc_hiddens (Tensor): encodings of shape (b, src_len, 2*h), where b = batch size,
                                     src_len = max source length, h = hidden size.
        @param source_lengths (List[int]): List of actual lengths for each of the sentences in the batch.

        @returns enc_masks (Tensor): Tensor of sentence masks of shape (b, src_len),
                                    where src_len = max source length, h = hidden size.
                                    Entries are 1 at positions past a sentence's length, 0 elsewhere.
        """
        n_sents, max_len = enc_hiddens.size(0), enc_hiddens.size(1)
        masks = torch.zeros(n_sents, max_len, dtype=torch.float)
        for row, n_words in enumerate(source_lengths):
            # Everything after the last real word of the sentence is padding.
            masks[row, n_words:] = 1
        return masks.to(self.device)


    def beam_search(self, src_sent: List[str], beam_size: int=100, max_decoding_time_step: int=70) -> List[Hypothesis]:
        """ Given a single source sentence, decode candidate sentences in the target language.

        Despite its name this is a sampling decoder: at every time step the continuations
        that survive are drawn with `torch.multinomial` from a softmax over the cumulative
        log-probabilities of all (hypothesis, next word) pairs, not taken with top-k.
        A hypothesis is completed when the drawn word is '</s>' or contains '.', '?' or '!'.
        Decoding stops once `beam_size` hypotheses are completed or after
        `max_decoding_time_step` steps, whichever comes first, so fewer than `beam_size`
        hypotheses may be returned.

        @param src_sent (List[str]): a single source sentence (words)
        @param beam_size (int): number of hypotheses to sample and to collect
        @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN
        @returns hypotheses (List[Hypothesis]): completed hypotheses sorted by descending score
                (never empty: if nothing was completed, the first live hypothesis is returned).
                Each hypothesis has two fields:
                value: List[str]: the decoded target sentence, represented as a list of words
                score: float: the log-likelihood of the target sentence
        """
        src_sents_var = self.vocab.src.to_input_tensor_char([src_sent], self.device)

        src_encodings, dec_init_vec = self.encode(src_sents_var, [len(src_sent)])
        src_encodings_att_linear = self.att_projection(src_encodings)

        h_tm1 = dec_init_vec
        att_tm1 = torch.zeros(1, self.hidden_size, device=self.device)

        vocab_size = len(self.vocab.tgt)

        hypotheses = [['<s>']]
        hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=self.device)
        completed_hypotheses = []

        t = 0
        # The step limit guarantees termination even if no sentence-ending word is ever drawn.
        while len(completed_hypotheses) < beam_size and t < max_decoding_time_step:
            t += 1
            hyp_num = len(hypotheses)

            # Share the single source encoding across all live hypotheses.
            exp_src_encodings = src_encodings.expand(hyp_num,
                                                     src_encodings.size(1),
                                                     src_encodings.size(2))

            exp_src_encodings_att_linear = src_encodings_att_linear.expand(hyp_num,
                                                                           src_encodings_att_linear.size(1),
                                                                           src_encodings_att_linear.size(2))

            # Feed the last word of every live hypothesis (character-level input tensor).
            y_tm1 = self.vocab.tgt.to_input_tensor_char(list([hyp[-1]] for hyp in hypotheses), device=self.device)
            y_t_embed = self.model_embeddings_target(y_tm1)
            y_t_embed = torch.squeeze(y_t_embed, dim=0)

            x = torch.cat([y_t_embed, att_tm1], dim=-1)

            (h_t, cell_t), att_t, _ = self.step(x, h_tm1,
                                                exp_src_encodings, exp_src_encodings_att_linear, enc_masks=None)

            # log probabilities over target words
            log_p_t = F.log_softmax(self.target_vocab_projection(att_t), dim=-1)

            live_hyp_num = beam_size - len(completed_hypotheses)
            # Flattened scores: position = hypothesis index * vocab_size + word id.
            continuing_hyp_scores = (hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1)
            sampling_probs = F.softmax(continuing_hyp_scores, dim=0)

            # Draw the continuations to keep and look up their scores in one indexing op.
            top_cand_hyp_pos = torch.multinomial(sampling_probs, live_hyp_num)
            top_cand_hyp_scores = continuing_hyp_scores[top_cand_hyp_pos]

            # Integer (floor) division keeps the hypothesis indices integral; `/` on integer
            # tensors is true division in current PyTorch and would yield floats.
            prev_hyp_ids = torch.div(top_cand_hyp_pos, vocab_size, rounding_mode='floor')
            hyp_word_ids = top_cand_hyp_pos % vocab_size

            new_hypotheses = []
            live_hyp_ids = []
            new_hyp_scores = []

            unk_decoder_states = []
            for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(prev_hyp_ids.tolist(),
                                                                    hyp_word_ids.tolist(),
                                                                    top_cand_hyp_scores.tolist()):
                hyp_word = self.vocab.tgt.id2word[hyp_word_id]

                # Record output layer in case UNK was generated; the numbered placeholder
                # is swapped for the character decoder's word after this loop.
                if hyp_word == "<unk>":
                    hyp_word = "<unk>" + str(len(unk_decoder_states))
                    unk_decoder_states.append(att_t[prev_hyp_id])

                new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word]
                # Sentence-final punctuation ends the hypothesis just like an explicit '</s>'.
                if any(mark in hyp_word for mark in ".?!"):
                    new_hyp_sent.append('</s>')
                    hyp_word = '</s>'

                if hyp_word == '</s>':
                    # Strip the leading '<s>' and the trailing '</s>'.
                    completed_hypotheses.append(Hypothesis(value=new_hyp_sent[1:-1],
                                                           score=cand_new_hyp_score))
                else:
                    new_hypotheses.append(new_hyp_sent)
                    live_hyp_ids.append(prev_hyp_id)
                    new_hyp_scores.append(cand_new_hyp_score)

            if len(unk_decoder_states) > 0 and self.charDecoder is not None:  # decode UNKs
                unk_decoder_states = torch.stack(unk_decoder_states, dim=0)
                decoded_words = self.charDecoder.decode_greedy((unk_decoder_states.unsqueeze(0), unk_decoder_states.unsqueeze(0)), max_length=21, device=self.device)
                assert len(decoded_words) == unk_decoder_states.size()[0], "Incorrect number of decoded words"
                for hyp in new_hypotheses:
                    if hyp[-1].startswith("<unk>"):
                        # The digits after "<unk>" index into decoded_words.
                        hyp[-1] = decoded_words[int(hyp[-1][5:])]

            if len(completed_hypotheses) == beam_size:
                break

            # Carry only the decoder states of hypotheses that are still live.
            live_hyp_ids = torch.tensor(live_hyp_ids, dtype=torch.long, device=self.device)
            h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids])
            att_tm1 = att_t[live_hyp_ids]

            hypotheses = new_hypotheses
            hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float, device=self.device)

        # Nothing finished (step limit hit or beam_size < 1): fall back to the first live hypothesis.
        if len(completed_hypotheses) == 0:
            completed_hypotheses.append(Hypothesis(value=hypotheses[0][1:],
                                                   score=hyp_scores[0].item()))

        completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)
        return completed_hypotheses

    @property
    def device(self) -> torch.device:
        """ Report the device on which the model's tensors are placed (CPU or GPU).

        @returns (torch.device): device of the weight of `self.att_projection`.
        """
        projection_weight = self.att_projection.weight
        return projection_weight.device

    @staticmethod
    def load(model_path: str, no_char_decoder=False):
        """ Rebuild a model from a checkpoint file holding 'args', 'vocab' and 'state_dict'.
        @param model_path (str): path to model
        @param no_char_decoder (bool): forwarded unchanged to the NMT constructor
        @returns model (NMT): model constructed from the stored arguments and vocab,
                              with the stored state dict loaded
        """
        # The identity map_location leaves every storage where it was deserialized.
        checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage)
        hyperparams = checkpoint['args']

        restored = NMT(vocab=checkpoint['vocab'], no_char_decoder=no_char_decoder, **hyperparams)
        restored.load_state_dict(checkpoint['state_dict'])
        return restored

    def save(self, path: str):
        """ Save the model to a file.

        The checkpoint is a dict with the constructor arguments ('args'), the vocabulary
        ('vocab') and the parameters ('state_dict').

        @param path (str): path to the model
        """
        print('save model parameters to [%s]' % path, file=sys.stderr)

        hyperparams = {
            'embed_size': self.model_embeddings_source.embed_size,
            'hidden_size': self.hidden_size,
            'dropout_rate': self.dropout_rate,
        }
        checkpoint = {
            'args': hyperparams,
            'vocab': self.vocab,
            'state_dict': self.state_dict(),
        }

        torch.save(checkpoint, path)

## nlg_model.py

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from collections import namedtuple
import sys
from typing import List, Tuple, Dict, Set, Union
import torch
import torch.nn as nn
import torch.nn.utils
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence

# from nmt_model import NMT
import os

class NLG(nn.Module):
    """ Natural Language Generation model using a Neural Machine Translation context.
    https://arxiv.org/abs/1702.07826

    Holds one NMT model, one Adam optimizer and one learning rate per speaker, kept in
    parallel containers that share the same index:
        NMT_speakers   (List[str]):     normalized speaker names
        NMT_models     (nn.ModuleList): one NMT model per speaker
        NMT_optimizers (List):          one optimizer per model
        lrs            (List[float]):   current learning rate of each optimizer
    """

    def __init__(self, speakers, embed_size, hidden_size, dropout_rate, vocab, no_char_decoder, lr, clip_grad, lr_decay):
        """ Build one NMT model and optimizer for every speaker.

        @param speakers: iterable of speaker names (str)
        @param embed_size, hidden_size, dropout_rate, vocab, no_char_decoder: forwarded to NMT
        @param lr (float): initial learning rate of every optimizer
        @param clip_grad (float): max gradient norm used in `forward`
        @param lr_decay (float): factor the learning rate is multiplied by after every `forward`
        """
        super(NLG, self).__init__()
        self.NMT_speakers = []
        # A ModuleList (not a plain list) registers the sub-models, so .to(), .parameters(),
        # .state_dict() and .train()/.eval() on this module reach them.
        self.NMT_models = nn.ModuleList()
        self.NMT_optimizers = []
        self.clip_grad = clip_grad
        self.lrs = []
        self.lr_decay = lr_decay
        # find a way to not have to hard-code speakers?
        for speaker in speakers:
            model = NMT(embed_size=embed_size,
                        hidden_size=hidden_size,
                        dropout_rate=dropout_rate,
                        vocab=vocab, no_char_decoder=no_char_decoder)
            self.NMT_speakers.append(self._normalize_speaker(speaker))
            self.NMT_models.append(model)
            self.NMT_optimizers.append(torch.optim.Adam(model.parameters(), lr=lr))
            self.lrs.append(lr)

    @staticmethod
    def _normalize_speaker(speaker):
        """ Map a speaker name to the form stored in NMT_speakers ('/' and ' ' become '-'). """
        return speaker.replace("/", "-").replace(" ", "-")

    # change for double training?
    # def forward(self, speaker: str, source: List[List[str]], target: List[List[str]]) -> torch.Tensor:
    def forward(self, speaker: str, source: List[str], target: List[str]):
        """ Run one training step of the speaker's model on a single (source, target) pair.

        Performs backward, gradient clipping and the optimizer step, then multiplies that
        optimizer's learning rate by `lr_decay`.

        @param speaker (str): speaker name, raw or already normalized
        @param source (List[str]): source sentence (words)
        @param target (List[str]): target sentence (words)
        @returns the summed loss tensor of the example, or the int 0 if the speaker is unknown
        """
        speaker = self._normalize_speaker(speaker)
        if speaker not in self.NMT_speakers:
            return 0

        i = self.NMT_speakers.index(speaker)
        model = self.NMT_models[i]
        optimizer = self.NMT_optimizers[i]

        optimizer.zero_grad()

        batch_size = 1

        example_losses = -model([source], [target])  # (batch_size,)
        batch_loss = example_losses.sum()
        loss = batch_loss / batch_size

        loss.backward()

        # clip gradient
        torch.nn.utils.clip_grad_norm_(model.parameters(), self.clip_grad)

        optimizer.step()

        # Decay this speaker's learning rate after every step.
        lr = optimizer.param_groups[0]['lr'] * self.lr_decay
        self.lrs[i] = lr
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        return batch_loss

    def beam_search(self, speaker, src_sent, beam_size=50, max_decoding_time_step=70):
        """ Decode `src_sent` with the speaker's model in eval mode and without gradients.

        @param speaker (str): speaker name, raw or already normalized
        @param src_sent (List[str]): source sentence (words)
        @param beam_size (int): forwarded to the model's beam_search
        @param max_decoding_time_step (int): forwarded to the model's beam_search
        @returns whatever the model's beam_search returns
        @raises ValueError: if the speaker has no model
        """
        i = self.NMT_speakers.index(self._normalize_speaker(speaker))
        model = self.NMT_models[i]

        was_training = model.training
        model.eval()
        try:
            with torch.no_grad():
                return model.beam_search(src_sent, beam_size=beam_size,
                                         max_decoding_time_step=max_decoding_time_step)
        finally:
            # Restore training mode even if decoding raised.
            if was_training:
                model.train()

    # NOTE: this class defines no load()/save() of its own; the individual models in
    # NMT_models can be persisted with their own save() method.

## run.py (train)

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
CS224N 2018-19: Homework 5
run.py: Run Script for Simple NMT Model
Pencheng Yin <pcyin@cs.cmu.edu>
Sahil Chopra <schopra8@stanford.edu>

Usage:
    run.py train --train-src=<file> --train-tgt=<file> --dev-src=<file> --dev-tgt=<file> --vocab=<file> [options]
    run.py decode [options] MODEL_PATH TEST_SOURCE_FILE OUTPUT_FILE
    run.py decode [options] MODEL_PATH TEST_SOURCE_FILE TEST_TARGET_FILE OUTPUT_FILE

Options:
    -h --help                               show this screen.
    --cuda                                  use GPU
    --train-src=<file>                      train source file
    --train-tgt=<file>                      train target file
    --dev-src=<file>                        dev source file
    --dev-tgt=<file>                        dev target file
    --vocab=<file>                          vocab file
    --seed=<int>                            seed [default: 0]
    --batch-size=<int>                      batch size [default: 32]
    --embed-size=<int>                      embedding size [default: 256]
    --hidden-size=<int>                     hidden size [default: 256]
    --clip-grad=<float>                     gradient clipping [default: 5.0]
    --log-every=<int>                       log every [default: 10]
    --max-epoch=<int>                       max epoch [default: 30]
    --input-feed                            use input feeding
    --patience=<int>                        wait for how many iterations to decay learning rate [default: 5]
    --max-num-trial=<int>                   terminate training after how many trials [default: 5]
    --lr-decay=<float>                      learning rate decay [default: 0.5]
    --beam-size=<int>                       beam size [default: 50]
    --sample-size=<int>                     sample size [default: 5]
    --lr=<float>                            learning rate [default: 0.001]
    --uniform-init=<float>                  uniformly initialize all parameters [default: 0.1]
    --save-to=<file>                        model save path [default: model.bin]
    --valid-niter=<int>                     perform validation after how many iterations [default: 2000]
    --dropout=<float>                       dropout [default: 0.3]
    --max-decoding-time-step=<int>          maximum number of decoding time steps [default: 70]
    --no-char-decoder                       do not use the character decoder
"""
import math
import sys
import pickle
import time


from docopt import docopt
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu, SmoothingFunction
# from nmt_model import Hypothesis, NMT
import numpy as np
from typing import List, Tuple, Dict, Set, Union
from tqdm import tqdm
# from utils import read_corpus, batch_iter
# from vocab import Vocab, VocabEntry

import torch
import torch.nn.utils

import random


def evaluate_ppl(model, dev_data, batch_size=32):
    """ Compute the perplexity of the model on the dev sentences.
    @param model (NMT): NMT Model
    @param dev_data (list of (src_sent, tgt_sent)): list of tuples containing source and target sentence
    @param batch_size (batch size)
    @returns ppl (perplexity on dev sentences)
    """
    restore_train_mode = model.training
    model.eval()

    total_loss = 0.
    total_tgt_words = 0.

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for src_batch, tgt_batch in batch_iter(dev_data, batch_size):
            batch_loss = -model(src_batch, tgt_batch).sum()
            total_loss += batch_loss.item()
            # The leading `<s>` of each target sentence is not predicted, so it is not counted.
            total_tgt_words += sum(len(sent[1:]) for sent in tgt_batch)

        perplexity = np.exp(total_loss / total_tgt_words)

    if restore_train_mode:
        model.train()

    return perplexity


def compute_corpus_level_bleu_score(references: List[List[str]], hypotheses: List[Hypothesis]) -> float:
    """ Compute the corpus-level BLEU score of decoding results against reference sentences.
    @param references (List[List[str]]): a list of gold-standard reference target sentences
    @param hypotheses (List[Hypothesis]): a list of hypotheses, one for each reference
    @returns bleu_score: corpus-level BLEU score
    """
    # If the first reference starts with `<s>`, drop the first and last token of every reference.
    wrapped_in_markers = references[0][0] == '<s>'
    if wrapped_in_markers:
        references = [ref[1:-1] for ref in references]

    # corpus_bleu takes a list of references per hypothesis; here there is exactly one each.
    reference_sets = [[ref] for ref in references]
    decoded_sents = [hyp.value for hyp in hypotheses]
    return corpus_bleu(reference_sets, decoded_sents)


def train(args: Dict):
    """ Train the NMT Model with maximum-likelihood training and periodic validation.

    Every `--valid-niter` iterations the dev perplexity is computed. A model that improves
    on all previous validations is saved to `--save-to` (optimizer state to
    `<save-to>.optim`). After `--patience` consecutive validations without improvement the
    best checkpoint is reloaded and the learning rate is multiplied by `--lr-decay`.
    Training returns early once that has happened `--max-num-trial` times, and otherwise
    after `--max-epoch` epochs.

    @param args (Dict): args from cmd line
    """
    train_data_src = read_corpus(args['--train-src'], source='src')
    train_data_tgt = read_corpus(args['--train-tgt'], source='tgt')

    dev_data_src = read_corpus(args['--dev-src'], source='src')
    dev_data_tgt = read_corpus(args['--dev-tgt'], source='tgt')

    train_data = list(zip(train_data_src, train_data_tgt))
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    # Parse every option once, up front, instead of inside the training loop.
    train_batch_size = int(args['--batch-size'])
    clip_grad = float(args['--clip-grad'])
    valid_niter = int(args['--valid-niter'])
    log_every = int(args['--log-every'])
    model_save_path = args['--save-to']
    max_epoch = int(args['--max-epoch'])
    max_patience = int(args['--patience'])
    max_num_trial = int(args['--max-num-trial'])
    lr_decay = float(args['--lr-decay'])

    vocab = Vocab.load(args['--vocab'])

    model = NMT(embed_size=int(args['--embed-size']),
                hidden_size=int(args['--hidden-size']),
                dropout_rate=float(args['--dropout']),
                vocab=vocab, no_char_decoder=args['--no-char-decoder'])
    model.train()

    uniform_init = float(args['--uniform-init'])
    if np.abs(uniform_init) > 0.:
        print('uniformly initialize parameters [-%f, +%f]' % (uniform_init, uniform_init), file=sys.stderr)
        for p in model.parameters():
            p.data.uniform_(-uniform_init, uniform_init)

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
    print('use device: %s' % device, file=sys.stderr)

    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=float(args['--lr']))

    # report_* counters are reset at every log line, cum_* counters at every validation.
    num_trial = 0
    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    cum_examples = report_examples = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('begin Maximum Likelihood training')

    for epoch in range(1, max_epoch + 1):
        for src_sents, tgt_sents in batch_iter(train_data, batch_size=train_batch_size, shuffle=True):
            train_iter += 1

            optimizer.zero_grad()

            batch_size = len(src_sents)

            example_losses = -model(src_sents, tgt_sents)  # (batch_size,)
            batch_loss = example_losses.sum()
            loss = batch_loss / batch_size

            loss.backward()

            # clip gradient
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)

            optimizer.step()

            batch_losses_val = batch_loss.item()
            report_loss += batch_losses_val
            cum_loss += batch_losses_val

            tgt_words_num_to_predict = sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
            report_tgt_words += tgt_words_num_to_predict
            cum_tgt_words += tgt_words_num_to_predict
            report_examples += batch_size
            cum_examples += batch_size

            if train_iter % log_every == 0:
                print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' \
                      'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter,
                                                                                         report_loss / report_examples,
                                                                                         math.exp(report_loss / report_tgt_words),
                                                                                         cum_examples,
                                                                                         report_tgt_words / (time.time() - train_time),
                                                                                         time.time() - begin_time), file=sys.stderr)

                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.

            # perform validation
            if train_iter % valid_niter == 0:
                print('epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d' % (epoch, train_iter,
                                                                                         cum_loss / cum_examples,
                                                                                         np.exp(cum_loss / cum_tgt_words),
                                                                                         cum_examples), file=sys.stderr)

                cum_loss = cum_examples = cum_tgt_words = 0.

                print('begin validation ...', file=sys.stderr)

                # Negated dev perplexity, so that a higher metric means a better model.
                dev_ppl = evaluate_ppl(model, dev_data, batch_size=128)   # dev batch size can be a bit larger
                valid_metric = -dev_ppl

                print('validation: iter %d, dev. ppl %f' % (train_iter, dev_ppl), file=sys.stderr)

                is_better = len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores)
                hist_valid_scores.append(valid_metric)

                if is_better:
                    patience = 0
                    print('save currently the best model to [%s]' % model_save_path, file=sys.stderr)
                    model.save(model_save_path)

                    # also save the optimizers' state
                    torch.save(optimizer.state_dict(), model_save_path + '.optim')
                elif patience < max_patience:
                    patience += 1
                    print('hit patience %d' % patience, file=sys.stderr)

                    if patience == max_patience:
                        num_trial += 1
                        print('hit #%d trial' % num_trial, file=sys.stderr)
                        if num_trial == max_num_trial:
                            print('early stop!', file=sys.stderr)
                            # Return instead of calling the interactive exit() helper, which
                            # would also shut down a notebook kernel.
                            return

                        # decay lr, and restore from previously best checkpoint
                        lr = optimizer.param_groups[0]['lr'] * lr_decay
                        print('load previously best model and decay learning rate to %f' % lr, file=sys.stderr)

                        # load model
                        params = torch.load(model_save_path, map_location=lambda storage, loc: storage)
                        model.load_state_dict(params['state_dict'])
                        model = model.to(device)

                        print('restore parameters of the optimizers', file=sys.stderr)
                        optimizer.load_state_dict(torch.load(model_save_path + '.optim'))

                        # set new lr (the restored optimizer state carries the old one)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr

                        # reset patience
                        patience = 0


def decode(args: Dict[str, str]):
    """ Decode a test set and write the first hypothesis of every sentence to a file.

    If the target gold-standard sentences are given, the function also prints a
    corpus-level BLEU score. The BLEU score is computed on one randomly chosen hypothesis
    per sentence (`random.choice`), whereas the output file receives `hyps[0]`, so the
    reported score does not describe the written sentences.

    @param args (Dict): args from cmd line
    """
    source_path = args['TEST_SOURCE_FILE']
    print("load test source sentences from [{}]".format(source_path), file=sys.stderr)
    test_data_src = read_corpus(source_path, source='src')

    target_path = args['TEST_TARGET_FILE']
    if target_path:
        print("load test target sentences from [{}]".format(target_path), file=sys.stderr)
        test_data_tgt = read_corpus(target_path, source='tgt')

    model_path = args['MODEL_PATH']
    print("load model from {}".format(model_path), file=sys.stderr)
    model = NMT.load(model_path, no_char_decoder=args['--no-char-decoder'])

    if args['--cuda']:
        model = model.to(torch.device("cuda:0"))

    hypotheses = beam_search(model, test_data_src,
                             beam_size=int(args['--beam-size']),
                             max_decoding_time_step=int(args['--max-decoding-time-step']))

    if target_path:
        sampled_hypotheses = [random.choice(hyps) for hyps in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt, sampled_hypotheses)
        print('Corpus BLEU: {}'.format(bleu_score * 100), file=sys.stderr)

    # One line per source sentence: the words of its first hypothesis joined by spaces.
    with open(args['OUTPUT_FILE'], 'w') as out_file:
        for _, hyps in zip(test_data_src, hypotheses):
            out_file.write(' '.join(hyps[0].value) + '\n')


def beam_search(model: NMT, test_data_src: List[List[str]], beam_size: int, max_decoding_time_step: int) -> List[List[Hypothesis]]:
    """ Decode every source sentence with the model's own beam_search method.

    The model is put in eval mode and gradients are disabled while decoding; training
    mode is switched back on afterwards if it was on before. Progress is shown with tqdm
    on stdout.

    @param model (NMT): NMT Model
    @param test_data_src (List[List[str]]): List of sentences (words) in source language, from test set.
    @param beam_size (int): beam_size (# of hypotheses to hold for a translation at every step)
    @param max_decoding_time_step (int): maximum sentence length that Beam search can produce
    @returns hypotheses (List[List[Hypothesis]]): List of Hypothesis translations for every source sentence.
    """
    restore_train_mode = model.training
    model.eval()

    with torch.no_grad():
        hypotheses = [
            model.beam_search(src_sent, beam_size=beam_size, max_decoding_time_step=max_decoding_time_step)
            for src_sent in tqdm(test_data_src, desc='Decoding', file=sys.stdout)
        ]

    if restore_train_mode:
        model.train(restore_train_mode)

    return hypotheses


def main():
    """ Main func: seed the random number generators and run training.

    The options are hard-coded here (notebook use) instead of being parsed from the
    command line with `docopt(__doc__)`; the keys mirror the docopt option names.

    @raises RuntimeError: if neither the 'train' nor the 'decode' mode is enabled
    """
    args = {
        '--seed': 0,
        '--cuda': True,
        'train': True,
        'decode': False,
        '--train-src': 'en_es_data/train.es',
        '--train-tgt': 'en_es_data/train.en',
        '--dev-src': 'en_es_data/dev.es',
        '--dev-tgt': 'en_es_data/dev.en',
        '--batch-size': 32,
        '--clip-grad': 5,
        '--valid-niter': 2000,
        '--log-every': 10,
        '--save-to': 'model.bin',
        '--vocab': 'vocab.json',
        '--embed-size': 256,
        '--hidden-size': 256,
        '--dropout': 0.3,
        '--no-char-decoder': False,
        '--uniform-init': 0.1,
        '--lr': 0.001,
        '--patience': 5,
        '--max-num-trial': 5,
        '--lr-decay': 0.5,
        '--max-epoch': 30,
        'TEST_SOURCE_FILE': 'en_es_data/test.es',
        'TEST_TARGET_FILE': 'en_es_data/test.en',
        'MODEL_PATH': 'model.bin',
        '--beam-size': 50,
        '--max-decoding-time-step': 70,
        'OUTPUT_FILE': 'outputs/test_outputs.txt',
    }

    # seed the random number generators (Python's `random` included: this file imports
    # and uses it, so leaving it unseeded makes runs non-reproducible)
    seed = int(args['--seed'])
    random.seed(seed)
    torch.manual_seed(seed)
    if args['--cuda']:
        torch.cuda.manual_seed(seed)
    np.random.seed(seed * 13 // 7)

    if args['train']:
        train(args)
    elif args['decode']:
        decode(args)
    else:
        raise RuntimeError('invalid run mode')

# Entry point: call main() only when this code runs as the top-level module.
if __name__ == '__main__':
    main()

uniformly initialize parameters [-0.100000, +0.100000]
use device: cuda:0


begin Maximum Likelihood training


epoch 1, iter 10, avg. loss 398.06, avg. ppl 130996977918.85 cum. examples 320, speed 1650.63 words/sec, time elapsed 3.01 sec
epoch 1, iter 20, avg. loss 296.39, avg. ppl 491692464.78 cum. examples 640, speed 1494.25 words/sec, time elapsed 6.19 sec
epoch 1, iter 30, avg. loss 278.17, avg. ppl 248193937.95 cum. examples 960, speed 1606.24 words/sec, time elapsed 9.05 sec
epoch 1, iter 40, avg. loss 316.37, avg. ppl 132122493.92 cum. examples 1280, speed 1753.87 words/sec, time elapsed 12.14 sec
epoch 1, iter 50, avg. loss 285.99, avg. ppl 111510735.34 cum. examples 1600, speed 1648.00 words/sec, time elapsed 15.14 sec
epoch 1, iter 60, avg. loss 310.07, avg. ppl 86292903.67 cum. examples 1920, speed 1522.91 words/sec, time elapsed 18.70 sec
epoch 1, iter 70, avg. loss 314.70, avg. ppl 81301355.95 cum. examples 2240, speed 1892.83 words/sec, time elapsed 21.62 sec
epoch 1, iter 80, avg. loss 280.87, avg. ppl 57568637.13 cum. examples 2560, speed 1710.43 words/sec, time elapsed 24.57 sec

## run.py (test)

In [None]:
def main():
    """Entry point for the test (decode) run.

    Builds a hard-coded option dictionary in place of docopt command-line
    parsing, seeds the torch and NumPy random number generators, and then
    dispatches to ``train`` or ``decode`` according to the run-mode flags.
    In this configuration 'train' is False and 'decode' is True.

    Raises:
        RuntimeError: if neither the 'train' nor the 'decode' flag is set.
    """
    # Replaces the original `args = docopt(__doc__)` call.
    args = {
        '--seed': 0,
        '--cuda': True,
        'train': False,
        'decode': True,
        '--train-src': 'en_es_data/train.es',
        '--train-tgt': 'en_es_data/train.en',
        '--dev-src': 'en_es_data/dev.es',
        '--dev-tgt': 'en_es_data/dev.en',
        '--batch-size': 32,
        '--clip-grad': 5,
        '--valid-niter': 2000,
        '--log-every': 10,
        '--save-to': 'model.bin',
        '--vocab': 'vocab.json',
        '--embed-size': 256,
        '--hidden-size': 256,
        '--dropout': 0.3,
        '--no-char-decoder': False,
        '--uniform-init': 0.1,
        '--lr': 0.001,
        '--patience': 5,
        '--max-num-trial': 5,
        '--lr-decay': 0.5,
        '--max-epoch': 1,
        'TEST_SOURCE_FILE': 'en_es_data/test.es',
        'TEST_TARGET_FILE': 'en_es_data/test.en',
        'MODEL_PATH': 'model.bin',
        '--beam-size': 50,
        '--max-decoding-time-step': 70,
        'OUTPUT_FILE': 'outputs/test_outputs.txt',
    }

    # The PyTorch version check is intentionally left disabled:
    # assert(torch.__version__ == "1.0.0"), "Please update your installation of PyTorch. You have {} and you should have version 1.0.0".format(torch.__version__)

    # Seed every random number generator from the single '--seed' option.
    rng_seed = int(args['--seed'])
    torch.manual_seed(rng_seed)
    if args['--cuda']:
        torch.cuda.manual_seed(rng_seed)
    np.random.seed(rng_seed * 13 // 7)

    # Dispatch on run mode; 'train' takes precedence over 'decode'.
    if args['train']:
        train(args)
        return
    if args['decode']:
        decode(args)
        return
    raise RuntimeError('invalid run mode')

# Run main() only when this code is executed as the top-level program
# (__name__ is '__main__'), not when it is imported as a module.
if __name__ == '__main__':
    main()

load test source sentences from [en_es_data/test.es]
load test target sentences from [en_es_data/test.en]
load model from model.bin


Decoding:   0%|          | 0/1607 [00:00<?, ?it/s]



Decoding: 100%|██████████| 1607/1607 [53:37<00:00,  2.00s/it]


Corpus BLEU: 0.11854739275395311


## michael.py

In [None]:
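# NOTE: everything below is a disabled (fully commented-out) data-preparation
# script. It reads en_es_data/the_office_scripts.csv and, for every row whose
# speaker is "Michael" and whose previous row has the same `episode` value,
# writes the previous row's line to a source file (*.es) and Michael's line to
# the matching target file (*.en), choosing train/test/dev by the `episode`
# value. It also builds word-to-id vocabularies and dumps them to vocab.json.
# import csv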
# import sys
# import json

# csv.field_size_limit(sys.maxsize)

# episode = 2
# line_text = 4
# speaker = 5
# d = {}

# name = "en_es_data/the_office_scripts.csv"
# train_speaker = open("en_es_data/train.es", "w")
# test_speaker = open("en_es_data/test.es", "w")
# dev_speaker = open("en_es_data/dev.es", "w")
# train_michael = open("en_es_data/train.en", "w")
# test_michael = open("en_es_data/test.en", "w")
# dev_michael = open("en_es_data/dev.en", "w")

# f = open(name)

# reader = csv.reader(f)
# next(reader)
# prevRow = next(reader)

# speaker_vocab = {"<pad>": 0, "<s>": 1, "</s>": 2, "<unk>": 3}
# michael_vocab = {"<pad>": 0, "<s>": 1, "</s>": 2, "<unk>": 3}
# data = {"src_word2id": speaker_vocab, "tgt_word2id": michael_vocab}
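# NOTE(review): both counters start at 1, so the first new words are assigned
# ids 1, 2, 3, which collide with the reserved ids of "<s>", "</s>" and "<unk>"
# above; starting at 4 would avoid the collision. Confirm before re-enabling.
# speaker_count = 1
# michael_count = 1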

# for currRow in reader:
#     if currRow[speaker] == "Michael" and prevRow[episode] == currRow[episode]:
#         if int(currRow[episode]) <= 15:
#             train_speaker.write(prevRow[line_text] + "\n")
#             train_michael.write(currRow[line_text] + "\n")
#         elif int(currRow[episode]) <= 19:
#             test_speaker.write(prevRow[line_text] + "\n")
#             test_michael.write(currRow[line_text] + "\n")
#         else:
#             dev_speaker.write(prevRow[line_text] + "\n")
#             dev_michael.write(currRow[line_text] + "\n")
#     for word in prevRow[line_text].split(" "):
#         if word not in speaker_vocab.keys():
#             speaker_vocab[word] = speaker_count
#             speaker_count += 1
#     for word in currRow[line_text].split(" "):
#         if word not in michael_vocab.keys():
#             michael_vocab[word] = michael_count
#             michael_count += 1
#     prevRow = currRow

# write_file = open("vocab.json", "w")
# json.dump(data, write_file)