In [1]:
%matplotlib inline
import keras
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.layers import Dropout, Input, Activation, Dense, dot, concatenate
from keras.layers import LSTM
import os
import numpy as np
import pandas as pd
import re
import IPython
import matplotlib.pyplot as plt
import tensorflow as tf
import random

Using TensorFlow backend.


In [3]:
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.95)
config = tf.ConfigProto(log_device_placement=True,gpu_options=gpu_options)
sess = tf.Session(config = config)
keras.backend.set_session(sess)

Device mapping:
/job:localhost/replica:0/task:0/device:XLA_CPU:0 -> device: XLA_CPU device
/job:localhost/replica:0/task:0/device:XLA_GPU:0 -> device: XLA_GPU device
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: Tesla K80, pci bus id: 0000:00:04.0, compute capability: 3.7



In [0]:
TRAIN_PATH = './DataSet/Train.csv'
TEST_PATH = './DataSet/Results.csv'
train_epochs = 100
batch_size = 32
lstm_units = 64
np.random.seed(42)

In [0]:
def tokenize(sent):
    return [ x.strip() for x in re.split('(\W+)?', sent) if x.strip()]

In [0]:
def cleanup(sent):
    bad_chars = [';', ':', '!', "*", '\\','/', '~', '|', '@', '#', '%', '&', '(', ')', ',', '>', '<', '+', '=', '-', '_','.','`', '.', '[', ']','$', "'", '"', '^']
    for i in bad_chars:
        sent = sent.replace(i, '')
    
    return str(sent)

In [0]:
def parse_stories(lines, sentences = None):
    
    data = []
    for index, row in lines.iterrows():
        ques, ans, dis = cleanup(row['question'].lower()), cleanup(row['answer_text'].lower()), row['distractor'].lower()
        sentences.append(ques)
        sentences.append(ans)
        dis_list = re.findall("[\"|\']+(.*?)\.\s*[\"|\']+", dis)
        
        if len(dis_list) == 0:
            dis_list = re.findall("[\"|\']+(.*?)\.?\s*[\"|\']+", dis)
        
        for dis in dis_list[:2]:
            dis = cleanup(dis)
            sentences.append(dis)
            data.append([tokenize(ques), tokenize(ans), tokenize("START " + dis + " END")])
    
    return data

In [8]:
train_file = pd.read_csv(TRAIN_PATH)
train_file.head(10)

Unnamed: 0,question,answer_text,distractor
0,Meals can be served,in rooms at 9:00 p. m.,"'outside the room at 3:00 p. m.', 'in the dini..."
1,It can be inferred from the passage that,The local government can deal with the problem...,"'If some tragedies occur again ', ' relevant d..."
2,The author called Tommy 's parents in order to,help them realize their influence on Tommy,"'blame Tommy for his failing grades', 'blame T..."
3,It can be inferred from the passage that,the writer is not very willing to use idioms,'idioms are the most important part in a langu...
4,How can we deal with snake wounds according to...,Stay calm and do n't move .,'Cut the wound and suck the poison out .'
5,What was the writer 's problem when she studie...,She missed her family very much .,"""She did n't like her new school ."", ""She did ..."
6,Who were killed on February 5 in a small town ...,Chen Jianqing and one of her partners,"'Chen Jianqing and her husband', 'Chen Jingmin..."
7,"According to the writer , which of the followi...","Soccer is popular all over the world , but tru...",'Millions of people all over the world are pla...
8,During a fire children often,panic,'know certain steps'
9,What 's the title of the passage ?,Five children died in a kindergarten bus accid...,"'A bus accident in Deng zhou .', 'All primary ..."


In [0]:
sentences = []

In [10]:
train_sent = parse_stories(train_file, sentences)
len(sentences)

  return _compile(pattern, flags).split(string, maxsplit)


115981

In [11]:
len(train_sent)

52983

In [12]:
vocab = set()
words_list = []
for sent in sentences:
    for word in tokenize(sent):
        words_list.append(word)
for i in words_list:
    vocab.add(i)
vocab.add('START')
vocab.add('END')
vocab = sorted(vocab)

  return _compile(pattern, flags).split(string, maxsplit)


In [13]:
vocab_size = len(vocab)
vocab_size

23535

In [0]:
word_idx = {vocab[i]:i for i in range(len(vocab))}
idx_word = {i:vocab[i] for i in range(len(vocab))}

In [18]:
word_idx["animalsbehaviors"]

1708

In [15]:
ques_maxlen = 0
ans_maxlen = 0
dis_maxlen = 0

for i in train_sent: #+ test_sent
    ques_maxlen = max(ques_maxlen, len(i[0]))
    ans_maxlen = max(ans_maxlen, len(i[1]))
    
    if len(i) > 2:
        for j in i[2:]:
            dis_maxlen = max(dis_maxlen, len(j))

print(ques_maxlen, ans_maxlen, dis_maxlen)

ques_maxlen = 20
ans_maxlen = 20
dis_maxlen = 20

51 105 68


In [0]:
def vectorize(data, word_idx, ques_maxlen, ans_maxlen, dis_maxlen = 0):
    vec_ques = []
    vec_dis = []
    vec_ans = []
    
    for w in data:
        a = []
        q = []
        d = []
        
        for i in w[0]:
            q.append(word_idx[i])
        
        for i in w[1]:
            a.append(word_idx[i])
            
        for i in w[2]:    
            d.append(word_idx[i])
            
        vec_dis.append(d)
        vec_ques.append(q)
        vec_ans.append(a)
        
    return [pad_sequences(vec_ques, maxlen = ques_maxlen, padding='post'), pad_sequences(vec_ans, maxlen = ans_maxlen, padding='post'), pad_sequences(vec_dis, maxlen = dis_maxlen, padding='post')]

In [0]:
# idx = np.random.randint(0,len(train_sent),10000).astype(int)
random.shuffle(train_sent)
train_ques, train_ans, train_dis = vectorize(train_sent[:10000], word_idx, ques_maxlen, ans_maxlen, dis_maxlen)

In [30]:
train_ques.shape

(10000, 20)

In [31]:
train_dis[3]

array([  783, 20995, 21669,  8912,   784, 22252, 16148, 10426, 14700,
        8517,   782,     0,     0,     0,     0,     0,     0,     0,
           0,     0], dtype=int32)

In [0]:
decode_output_data = np.zeros((len(train_dis), dis_maxlen, len(word_idx)), dtype='float32')
for i, vec in enumerate(train_dis):
    for t, word_vec in enumerate(vec):
        if t>0:
            decode_output_data[i, t-1, word_vec] = 1

In [33]:
decode_output_data.shape

(10000, 20, 23535)

In [34]:
input_ans = Input((ans_maxlen,))
input_ques = Input((ques_maxlen,))

encoder_ans = Embedding(input_dim = vocab_size,output_dim = 64, input_length = ans_maxlen)
encoder_ques = Embedding(input_dim = vocab_size,output_dim = 64, input_length = ques_maxlen)

encoded_ans = encoder_ans(input_ans)
encoded_ques = encoder_ques(input_ques)

dot_layer = dot([encoded_ans, encoded_ques], axes = (2,2))
concat_layer = concatenate([encoded_ans, dot_layer])
encoder_output,state_h,state_c = LSTM(lstm_units,return_state=True)(concat_layer)

encoder_states = [state_h,state_c]

decoder_input = Input(shape=(None,))
decoder_em = Embedding(len(word_idx), 64)
decoder_embed = decoder_em(decoder_input)
decoder = LSTM(lstm_units,return_sequences=True,return_state=True)
decoder_output,_,_ = decoder(decoder_embed,initial_state=encoder_states)

decoder_dense = Dense(len(word_idx),activation='softmax')
decoder_output = decoder_dense(decoder_output)

model = Model([input_ans, input_ques ,decoder_input],decoder_output)







In [35]:
rmsprop = keras.optimizers.RMSprop(lr=0.01, rho=0.9)
model.compile(optimizer=rmsprop, loss='categorical_crossentropy', metrics=['acc'])





In [36]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 20, 64)       1506240     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 20, 64)       1506240     input_2[0][0]                    
____________________________________________________________________________________________

In [37]:
text_model = model.fit([train_ans, train_ques, train_dis],decode_output_data,batch_size=32,epochs=100, validation_split=0.1)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 9000 samples, validate on 1000 samples
Epoch 1/100




Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epo

In [38]:
encoder_model = Model([input_ans, input_ques], encoder_states)
encoder_model.summary()

decoder_state_input_h = Input(shape=(64,))
decoder_state_input_c = Input(shape=(64,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_embed2 = decoder_em(decoder_input)

decoder_outputs2, state_h2, state_c2 = decoder(decoder_embed2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

decoder_model = Model([decoder_input] + decoder_states_inputs, [decoder_outputs2] + decoder_states2)

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 20, 64)       1506240     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 20, 64)       1506240     input_2[0][0]                    
____________________________________________________________________________________________

In [0]:
def decode_sequence(ques_seq, ans_seq):
    states_value = encoder_model.predict([ans_seq, ques_seq])
    target_seq = np.zeros((1,1))
    target_seq[0, 0] = word_idx['START']
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)# Sample a token
        
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = idx_word[sampled_token_index]
        decoded_sentence += ' '+sampled_word# Exit condition: either hit max length
        # or find stop character.
        if (sampled_word == 'END' or
           len(decoded_sentence) > dis_maxlen):
            stop_condition = True    # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index    # Update states
        states_value = [h, c]
        
    return decoded_sentence

In [40]:
for seq_index in range(120,140):
    ques_seq = train_ques[seq_index:seq_index+1]
    ans_seq = train_ans[seq_index:seq_index+1]
    translated_sent = decode_sequence(ques_seq, ans_seq)
    print('-')
    print('Input sentence:', train_sent[seq_index])
    print('Decoded sentence:', translated_sent)

-
Input sentence: [['what', 'can', 'be', 'inferred', 'about', 'titanicii', 'from', 'the', 'passage', '?'], ['titanicii', 'will', 'have', 'more', 'space', 'in', 'its', 'lifeboats', 'than', 'the', 'titanic'], ['START', 'titanicii', 'will', 'allow', 'different', 'classes', 'of', 'passengers', 'to', 'mingle', 'END']]
Decoded sentence:  the driver world in the
-
Input sentence: [['from', 'this', 'passage', 'we', 'can', 'know', 'that', 'a', 'good', 'guide'], ['should', 'explain', 'something', 'that', 'visitors', 'ca', 'nt', 'understand'], ['START', 'must', 'take', 'visitors', 'to', 'the', 'taj', 'mahal', 'and', 'the', 'tiger', 'reserve', 'END']]
Decoded sentence:  can be more important
-
Input sentence: [['the', 'author', 'implies', 'that', 'it', 'is', 'very', 'easy', 'to', 'enter', 'a', 'bookshop', 'and', 'buy'], ['a', 'book', 'that', 'unexpectedly', 'fascinates', 'you'], ['START', 'a', 'book', 'on', 'ancient', 'coins', 'END']]
Decoded sentence:  a book must be get into
-
Input sentence: [[

In [None]:
def parse_stories_test(lines, sentences = None):
    
    data = []
    for index, row in lines.iterrows():
        ques, ans = cleanup(row['question'].lower()), cleanup(row['answer_text'].lower())
        sentences.append(ques)
        sentences.append(ans)
        data.append([tokenize(ques), tokenize(ans)])
    
    return data

In [53]:
test_file = pd.read_csv(TEST_PATH)
test_sent = parse_stories_test(test_file, sentences)

  return _compile(pattern, flags).split(string, maxsplit)


In [0]:
def vectorize_test(data, word_idx, ques_maxlen, ans_maxlen):
    vec_ques = []
    vec_ans = []
    
    for w in data:
        a = []
        q = []
        
        for i in w[0]:
            try:
                q.append(word_idx[i])
            except:
                q.append(np.random.randint(0,len(word_idx)))
        
        for i in w[1]:
            try:
                a.append(word_idx[i])
            except:
                a.append(np.random.randint(0,len(word_idx)))
            
        vec_ques.append(q)
        vec_ans.append(a)
        
    return [pad_sequences(vec_ques, maxlen = ques_maxlen, padding='post'), pad_sequences(vec_ans, maxlen = ans_maxlen, padding='post')]

In [0]:
test_ques, test_ans = vectorize_test(test_sent, word_idx, ques_maxlen, ans_maxlen)

In [0]:
dis_list = []
for seq_index in range(len(test_file)):
    ques_seq = test_ques[seq_index:seq_index+1]
    ans_seq = test_ans[seq_index:seq_index+1]
    translated_sent = decode_sequence(ques_seq, ans_seq)
    dis_list.append("'" + translated_sent + "'")
    

In [0]:
test_file['distractor'] = dis_list

In [0]:
test_file.to_csv("subs.csv", index=False)