# Preprocessor

This notebook performs all of the non-RNN steps: the ones that only need to be run once.

In [1]:
import numpy as np
from numpy import array
from numpy.random import shuffle
from pickle import load
from pickle import dump

# Preprocessing

Our preprocessing method opens our data file and separates each line into pairs of utterances and responses.

In [2]:
######################
# Preprocessing Methods
######################

##### Load the raw dataset #####
def load_data(filename):
    """Read the raw dataset file and return its entire contents as one string.

    Args:
        filename: Path of the text file to read.

    Returns:
        The full text of the file.
    """
    # The context manager closes the file even if read() raises.
    with open(filename, mode="rt") as file:
        return file.read()

##### Split data into utterance-response pairs #####
def split_to_pairs(data):
    """Split the raw dataset text into [utterance, response] pairs.

    Every non-blank line is split on tabs. The first two fields (speech tag
    and utterance text) are re-joined with a tab to form the utterance; the
    next two fields are re-joined the same way to form the response.

    Args:
        data: The whole dataset as a single newline-separated string.

    Returns:
        A list of two-element lists: [utterance, response].

    Raises:
        IndexError: If a non-blank line has fewer than four tab-separated fields.
    """
    pairs = []
    for line in data.split("\n"):
        # A trailing newline or stray blank line produces an empty entry here;
        # skip it rather than failing on tokens[1].
        if not line.strip():
            continue
        tokens = line.split("\t")
        utterance = tokens[0] + "\t" + tokens[1]
        response = tokens[2] + "\t" + tokens[3]
        pairs.append([utterance, response])
    return pairs

##### Clean the data ######
def clean_data(pairs):
    """Pack the pairs into a NumPy array without altering any text.

    No normalisation (lowercasing, punctuation stripping, ...) is applied at
    the moment; the dataset is kept in its native form. When every pair has
    two entries the result is a 2D array whose rows are [utterance, response].

    Args:
        pairs: Iterable of iterables of strings.

    Returns:
        A NumPy array built from a copy of ``pairs``.
    """
    # list(row) copies each entry unchanged; this is where per-utterance
    # cleaning would go if it is added later.
    return array([list(row) for row in pairs])

##### Save pairs to file #####
def save_pairs(pairs, new_filename):
    """Pickle ``pairs`` to ``new_filename`` and print a confirmation line.

    Args:
        pairs: The object to serialise.
        new_filename: Destination path; the file is opened in binary write mode.
    """
    # Close the handle deterministically so the pickle is fully flushed to
    # disk before anything tries to read it back.
    with open(new_filename, "wb") as file:
        dump(pairs, file)
    print("Saved: %s" % new_filename)

In [3]:
#Run preprocessing
filepath = "resources/"
filename = filepath + "clean_dataset.txt"
data = load_data(filename)
pairs = split_to_pairs(data)
clean_pairs = clean_data(pairs)
save_pairs(clean_pairs, filepath + "utt-resp.pkl")

#Check our dataset
#you should see:
# [fp	and I'm calling from Garland, Texas.] => [b	Yeah,], etc.
#Iterating over a slice (instead of indexing rows 0..9) also works when
#fewer than 10 pairs were loaded.
for utterance, response in clean_pairs[:10]:
    print('[%s] => [%s]' % (utterance, response))

Saved: resources/utt-resp.pkl
[fp	and I'm calling from Garland, Texas.] => [b	Yeah,]
[co^t	so. anyway, let me press one.] => [aa	Okay .]
[sd	and, it was an experience that I won't do again .] => [qw	How big a family do you have?]
[sd	We saw people we hadn't see in a while] => [qy	Did you have people coming from far away?]
[sd(^q)	and we're going, my gosh.] => [sv	Well you have]
[b	Yeah.] => [sv	And if, they come from far away, they take it more seriously]
[aa	I think you're right.] => [b	Yeah.]
[b	Yeah.] => [sd	My family's not very big]
[qw^d	Your family's from where?] => [sd	Well, I have a, a brother lives in Indianapolis, a sister lives in Chicago, and my folks live back in Buffalo, New York.]
[ba	no.] => [sd	I guess we have reunions about once a year or so.]


# Load Datasets and Split into train and test sets.

In [22]:
######################
# Load dataset methods
######################
# Hold-out interval read by split_dataset: items at indices 0, n, 2n, ...
# go to the test set, everything else to the training set.
n = 10

def load_sentences(filename):
    """Load and return the object pickled in ``filename``.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    that come from a trusted source (e.g. ones written by this notebook).

    Args:
        filename: Path of the pickle file to read.

    Returns:
        The unpickled object.
    """
    # Context manager so the handle is closed even if unpickling fails.
    with open(filename, "rb") as file:
        return load(file)

def save_sentences(sentences, filename):
    """Pickle ``sentences`` to ``filename`` and print a confirmation line.

    Args:
        sentences: The object to serialise.
        filename: Destination path; the file is opened in binary write mode.
    """
    # Close the handle deterministically so the data is flushed to disk.
    with open(filename, "wb") as file:
        dump(sentences, file)
    print("Saved: %s" % filename)
    
def split_dataset(dataset, num_sentences, test_every=None):
    """Split ``dataset`` into train and test sets by holding out every n-th item.

    Items at indices 0, step, 2*step, ... form the test set; all remaining
    items form the training set. The first rows of each set are printed for
    a visual comparison.

    Args:
        dataset: Sliceable sequence of items (in this notebook, a NumPy array
            of [utterance, response] rows).
        num_sentences: Unused; kept so existing callers keep working.
        test_every: Hold-out interval. When omitted, the module-level ``n``
            is used.

    Returns:
        A ``(train, test)`` tuple. ``train`` is a NumPy array of the kept
        items; ``test`` is ``dataset[::step]``.

    Raises:
        ValueError: If the hold-out interval is smaller than 1.
    """
    step = n if test_every is None else test_every
    if step < 1:
        raise ValueError("hold-out interval must be >= 1, got %r" % (step,))

    # Take every nth item from the dataset to test on
    test  = dataset[::step]
    train = np.asarray([item for i, item in enumerate(dataset) if i % step != 0])

    # Compare Data
    print(dataset[:10])
    print()
    print(test[:10])
    print()
    print(train[:10])
    print()
    print("Train entries: ", len(train))
    print("Test entries: ",  len(test))
    return train, test

In [23]:
#For testing purposes, you can change n_sentences, the number of trained sentences, to a smaller number.
raw_dataset = load_sentences(filepath + "utt-resp.pkl")
print("Number of raw data pairs: ", len(raw_dataset))
n_sentences = 30000
# Copy the slice: slicing a NumPy array yields a view, so shuffling it in
# place would also reorder the rows of raw_dataset.
dataset = raw_dataset[:n_sentences, :].copy()
# Shuffle with a seeded, local generator so the saved train/test files are
# the same on every run and the global NumPy random state is left untouched.
split_seed = 42
np.random.RandomState(split_seed).shuffle(dataset)
train, test = split_dataset(dataset, n_sentences)
save_sentences(dataset, filepath + "utt-resp-both.pkl")
save_sentences(train, filepath + "utt-resp-train.pkl")
save_sentences(test, filepath + "utt-resp-test.pkl")

Number of raw data pairs:  46464
[['sd\tand it was just .' 'sv\tWell, that remark in itself is a slam .']
 ["sv\tThat's nice."
  "qy^g\tNow, being from Philadelphia, I don't expect your dress code to be quite that relaxed, right?"]
 ["sv\tIt's just a matter of education, I think." 'aa\tYeah,']
 ["sd\tWell I'm going to try to clean up the house after my two children for about an hour see if we can walk around."
  'b\tOkay']
 ['b\tUh huh.' 'b\tbut, yeah.']
 ['qw\tis that, what is that,' 'sd\tPalo Alto.']
 ["sd\tyou could say I'm in Baltimore." 'b\tYeah,']
 ['bf\tPart of it is technology, yeah.'
  'sv\tBut when, When you get into trouble like that in a place like Vietnam, you do tend to analyze the problems that you get into,']
 ['sd\tIt was wonderful.'
  'sv\tthey can do things to shrimp that no one else can.']
 ['sd\tYou know, and once a week we drive up into the mountains, you know, usually once a week, once every other week.'
  'ba\tWow,']]

[['sd\tand it was just .' 'sv\tWell, that r