In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import ast

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
%matplotlib inline
import numpy as np

use_cuda = torch.cuda.is_available()

In [2]:
seperator = " +++$+++ "
movie_conversations_path = "/home/tyler/data/text/cornell_movie_dialogs_corpus/movie_conversations.txt"
movie_lines = "/home/tyler/data/text/cornell_movie_dialogs_corpus/movie_lines_converted.txt"
MAX_LENGTH = 69 # including EOS tag

In [3]:
def get_lines_to_words(path):
    """
    path: the path to the file with the words for lines
    returns a dictionary mapping from line number to the actually words
    """
    lines_dict = {}
    with open(path, "r") as f:
        for line in f:
            columns = line.split(seperator)
            lines_dict[columns[0]] = columns[-1]
    return lines_dict

In [4]:
lines_dict = get_lines_to_words(movie_lines)

In [5]:
def get_context_response_pairs(conversations_path, lines_dict):
    """
    conversations_path: the path to the conversation lines
    lines_dict: the dictionary mapping from lines to words
    returns: list of tuples (context, response)
    
    Code loops over all lines in a conversation taking the first as the 
    context and the next as the response. Thus, loop doesn't need to get to
    the last line.
    """
    with open(conversations_path, "r") as f:
        context_response_tuples = []
        for line in f:
            columns = line.split(seperator)
            convs = ast.literal_eval(columns[-1])
            for i, spoken_line in enumerate(convs[:-1]):
                context = lines_dict[convs[i]]
                response = lines_dict[convs[i+1]]
                context_response_tuples.append((context, response))
        return context_response_tuples

In [6]:
context_response_tuples = get_context_response_pairs(movie_conversations_path,
                                                    lines_dict)

In [7]:
def normalizeString(s):
    #put a space between punctuation, so not included in word
    s = s.strip().lower()
    s = re.sub(r"([.!?])", r" \1", s)
    #remove things that are not letters or punctuation
    s = re.sub(r"[^a-zA-Z.!?']+", r" ", s)
    return s

def clean_pairs(list_of_pairs, max_length=MAX_LENGTH-1):
    """
    list_of_pairs: list of context, response pairs as raw text
    max_length: max length of context or response. 99 percentile is 68
    returns list of tuples but each tuple is a list of tokenized words
    """
    pairs = []
    for pair in list_of_pairs:
        context = pair[0]
        response = pair[1]
        context_clean_tokens = normalizeString(context).split(" ")
        response_clean_tokens = normalizeString(response).split(" ")
        if len(context_clean_tokens) > max_length or len(response_clean_tokens) > max_length:
            continue
        pairs.append((context_clean_tokens, response_clean_tokens))
    return pairs

In [8]:
clean_tuples = clean_pairs(context_response_tuples)

In [9]:
class Words:
    def __init__(self):
        self.SOS_token = 0
        self.EOS_token = 1
        self.word2index = {}
        self.index2word = {self.SOS_token: "SOS", self.EOS_token: "EOS"}
        self.word2count = {}
        self.n_words = 2
        
    def __addArray(self, array):
        for word in array:
            self.__addWord(word)
            
    def addArrayOfTuples(self, array_of_tuples):
        for pair in array_of_tuples:
            self.__addArray(pair[0])
            self.__addArray(pair[1])
    
    def __addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [10]:
words = Words()
words.addArrayOfTuples(clean_tuples)

In [93]:
class EncoderRNN(nn.Module):
    """
    Simple encoder network that embeds the character and then feeds through a GRU
    """
    def __init__(self, input_size, hidden_size, batch_size, max_length=MAX_LENGTH):
        super(EncoderRNN, self).__init__()
        self.input_size = input_size
        self.max_length = max_length
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        
    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, self.batch_size, -1)
        output, hidden = self.gru(embedded, hidden)
        return output, hidden
    
    def initHidden(self):
        result = torch.zeros(1, self.batch_size, self.hidden_size)
        if use_cuda:
            return result.cuda()
        else:
            return result
        
class AttnDecoderRNN(nn.Module):
    """
    Attn Decoder
    1. Need max length because learning which input words to attend to
    And thus need to know the maximum number of words could attend to
    2. The attn_weights tell us how much to weight each input word - in this case French,
       In order to predict the english word.
    """
    def __init__(self, input_size, hidden_size, batch_size,
                 dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.dropout = nn.Dropout(dropout_p)
        self.relu = nn.ReLU()
        self.gru = nn.GRU(hidden_size, hidden_size)
        # note input and output same size
        self.linear = nn.Linear(hidden_size, input_size)
        self.softmax = nn.LogSoftmax(dim=1)
        self.attn_layer = nn.Linear(2 * self.hidden_size, MAX_LENGTH)
        self.out_layer = nn.Linear(self.hidden_size, input_size)
        self.attn_combined_layer = nn.Linear(2 * self.hidden_size, self.hidden_size)
    
    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, self.batch_size, -1)
        embedded = self.dropout(embedded)
        attn = self.attn_layer(torch.cat((embedded[0], hidden[0]),dim=1))
        attn_weights = self.softmax(attn)
        attn_applied = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs) #shape: bx1xh
        attn_combined = torch.cat((embedded[0], attn_applied[:,0,:]), 1)
        attn_combined = self.relu(self.attn_combined_layer(attn_combined).unsqueeze(0))
        output, hidden = self.gru(attn_combined, hidden)
        output = self.softmax(self.out_layer(output[0]))
        return output, hidden, attn_weights
    
    def initHidden(self):
        result = torch.zeros(1, self.batch_size, self.hidden_size)
        if use_cuda:
            return result.cuda()
        else:
            return result        

In [94]:
hidden_size = 512
batch_size = 1
encoder = EncoderRNN(words.n_words, hidden_size, batch_size)
decoder = AttnDecoderRNN(words.n_words, hidden_size, batch_size)

encoder.load_state_dict(torch.load("./models/chat_encoder.state"))
decoder.load_state_dict(torch.load("./models/chat_decoder.state"))

if use_cuda:
    encoder = encoder.cuda()
    decoder = decoder.cuda() 

In [95]:
def indexesFromSentence(sentence):
    return [words.word2index[word] for word in sentence]

def tensorFromSentence(sentence):
    indexes = indexesFromSentence(sentence)
    indexes.append(words.EOS_token)
    # make it 1 column with number of rows equal to words in sentence
    result = torch.LongTensor(indexes).view(-1, 1)
    pad_amount = MAX_LENGTH - result.size(0)
    if pad_amount > 0:
        result = F.pad(result, (0,0,0,pad_amount), value=words.EOS_token).data
    result = result.cuda() if use_cuda else result
    return result
    
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    
    encoder_hidden = encoder.initHidden()
    input_variable = tensorFromSentence(normalizeString(sentence).split(" "))
    input_variable = input_variable.unsqueeze(0)
    input_length = input_variable.size()[1]
    
    encoder_outputs = torch.zeros((batch_size, MAX_LENGTH, encoder.hidden_size))
    encoder_outputs = encoder_outputs.cuda() if use_cuda else encoder_outputs
    
    for i in range(input_length):
        encoder_ouput, encoder_hidden = encoder.forward(input_variable[:,i,:], encoder_hidden)
        encoder_outputs[:,i,:] = encoder_ouput[0]
        
        
    decoder_hidden = encoder_hidden
    decoder_input = torch.LongTensor([[words.SOS_token]]*batch_size)
    decoder_input = decoder_input.cuda() if use_cuda else decoder_input
    decoded_words = []
        
    for i in range(MAX_LENGTH):
        print(decoder_input)
        decoder_output, decoder_hidden, attn_weights = decoder.forward(decoder_input, 
                                                         decoder_hidden,
                                                        encoder_outputs)
        print(decoder_output.data)
        topv, topi = decoder_output.data.topk(1)
        top_predicted = topi[0][0]
        decoder_input = torch.LongTensor([[top_predicted]])
        decoder_input = decoder_input.cuda() if use_cuda else decoder_input
        if top_predicted == words.EOS_token:
            decoded_words.append("<EOS>")
            break
        else:
            decoded_words.append(output_lang.index2word[top_predicted])
    return decoded_words

In [97]:
evaluate(encoder, decoder, "can we make")

tensor([[0]], device='cuda:0')
tensor([[-31.0352,  -0.0924,  -7.6506,  ..., -29.2614, -17.7012, -22.7200]],
       device='cuda:0')


['<EOS>']

In [15]:
clean_tuples[0]

(['can',
  'we',
  'make',
  'this',
  'quick',
  '?',
  'roxanne',
  'korrine',
  'and',
  'andrew',
  'barrett',
  'are',
  'having',
  'an',
  'incredibly',
  'horrendous',
  'public',
  'break',
  'up',
  'on',
  'the',
  'quad',
  '.',
  'again',
  '.'],
 ['well',
  'i',
  'thought',
  "we'd",
  'start',
  'with',
  'pronunciation',
  'if',
  "that's",
  'okay',
  'with',
  'you',
  '.'])