In [4]:
import os, sys, string
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
import nltk
from nltk.tokenize import sent_tokenize
from nltk import word_tokenize
from keras.preprocessing.sequence import pad_sequences
import pickle
from collections import Counter

In [5]:
# Load the word whitelist and the raw corpus, then turn the corpus into a flat
# list of lower-cased tokens in which every non-whitelisted word is 'UNK'.
#
# SECURITY NOTE: pickle.load can execute arbitrary code; only load
# 'valid_words.pickle' if it comes from a trusted source.
with open('valid_words.pickle', 'rb') as f:
    valid_words = pickle.load(f)
exclude = set(string.punctuation)
# Read through a context manager so the file handle is closed deterministically.
with open('CPRFD10.TXT', encoding='utf-8') as f:
    raw_texts = f.read()
raw_texts = raw_texts.strip().lower()
# Drop ASCII punctuation characters. (The former `type(ch) != 'str'` test
# compared a type object with a string literal, so it was always True and has
# been removed without changing the result.)
raw_texts = ''.join(ch for ch in raw_texts if ch not in exclude)
raw_words = nltk.word_tokenize(raw_texts)
# Replace every token outside the whitelist with the 'UNK' placeholder.
raw_words = [w if w in valid_words else 'UNK' for w in raw_words]

In [29]:
# Build the vocabulary from the distinct corpus tokens, in sorted order.
words = sorted(set(raw_words))

# Word ids start at 1, so id 0 is never assigned to a word
# (presumably reserved for padding -- TODO confirm).
word_to_index = {word: idx for idx, word in enumerate(words, start=1)}
# Reverse lookup: id -> word.
index_to_word = {idx: word for word, idx in word_to_index.items()}

# One more than the number of distinct words, to account for the unused id 0.
n_vocab = 1 + len(words)
print('Total number of words:', n_vocab)

Total number of words: 4476
4461


In [7]:
# Slide a fixed-size window over the token stream: every run of `sent_len`
# consecutive word ids is one input pattern, and the id of the word that
# immediately follows the window is its target.
sent_len = 10
dataX = []
dataY = []
n_windows = len(raw_words) - sent_len
for i in range(n_windows):
    window = raw_words[i:i + sent_len]
    target = raw_words[i + sent_len]
    dataX.append([word_to_index[w] for w in window])
    dataY.append(word_to_index[target])
n_patterns = len(dataX)
print('Total Patterns:', n_patterns)

Total Patterns: 356178


In [8]:
# Shape the inputs as (samples, time steps, features) with a single feature
# per time step, and scale the integer word ids by the vocabulary size.
X = np.reshape(dataX, (n_patterns, sent_len, 1))
X = X/float(n_vocab)
# One-hot encode the targets. The class count is pinned to n_vocab so the
# width of `y` (and therefore of the model's softmax layer) does not depend on
# which word ids happen to occur as targets; without it, to_categorical infers
# the width from the largest label present in dataY.
y = np_utils.to_categorical(dataY, num_classes=n_vocab)

In [9]:
# Two stacked 1024-unit LSTM layers, each followed by 20% dropout, ending in a
# softmax layer with one unit per column of the one-hot target matrix `y`.
# The first LSTM returns the full sequence so the second one can consume it.
model = Sequential([
    LSTM(1024, input_shape=X.shape[1:], return_sequences=True),
    Dropout(0.2),
    LSTM(1024),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

# Restore weights from a saved checkpoint file instead of training here.
filename = "weights-improvement-49-0.2259.hdf5"
model.load_weights(filename)

model.compile(optimizer='adam', loss='categorical_crossentropy')

In [31]:
# Parse the tab-separated test file into three parallel lists:
#   testingX - one list of word ids per question (the context words)
#   options  - the candidate answer words for each question (raw strings)
#   testingY - the index of the correct option (0 for 'a', 1 for 'b', ...)
#
# Each data row has the form: <context> TAB <option> ... TAB <answer letter>.
# The context field has its first and last character stripped and is split on
# commas; each item then has one character stripped from both ends
# (presumably list brackets and quotes -- TODO confirm against the file).
#
# Loop-local names are deliberately distinct from the training tensor `X`, so
# running this cell no longer overwrites it. The stray `i += 1` counter, which
# was never read and relied on `i` leaking from an earlier cell, is removed.
testingX, options, testingY = [], [], []
with open('parsed_testing_data.csv', 'r') as f:
    next(f, None)  # skip the header row (no-op on an empty file)
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        tokens = line.split('\t')
        context_words = tokens[0][1:-1].split(',')
        # Context words are looked up directly: an unknown word raises
        # KeyError, as before.
        testingX.append([word_to_index[w.strip()[1:-1]] for w in context_words])
        options.append(tokens[1:-1])
        testingY.append(ord(tokens[-1]) - ord('a'))

In [43]:
# For every test question, run the model on the context word ids and choose the
# candidate option to which the model assigns the highest next-word probability.
predictions = []
for test, option_words in zip(testingX, options):
    # Map each candidate word to its id. Membership is checked against the
    # vocabulary actually built from the corpus (word_to_index) rather than the
    # whitelist, so a whitelisted word that never occurred in the corpus falls
    # back to 'UNK' instead of raising KeyError.
    option_ids = [
        word_to_index[w] if w in word_to_index else word_to_index['UNK']
        for w in option_words
    ]
    # Same shaping and scaling as the training inputs: (1, time steps, 1),
    # word ids divided by the vocabulary size.
    x = np.reshape(test, (1, len(test), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    # Keep only the probabilities of the candidate options and take the best.
    option_probs = np.array([prediction[0, idx] for idx in option_ids])
    predict_ans = np.argmax(option_probs)
    predictions.append(predict_ans)


In [45]:
# Accuracy: the share of test questions whose predicted option index equals the
# labelled answer index. The bare last expression is the cell's displayed output.
matches = [guess for guess, answer in zip(predictions, testingY) if guess == answer]
len(matches) / len(testingY)

0.17307692307692307