In [1]:
import glob
import pandas as pd
import numpy as np
import re
import spacy
import gensim

from spacy.lang.en.stop_words import STOP_WORDS
from collections import Counter
from nltk.tokenize import TreebankWordTokenizer

# Treebank-style word tokenizer instance from NLTK.
kTOKENIZER = TreebankWordTokenizer()

# spaCy English pipeline. The bare 'en' name is a shortcut link that only
# resolves on spaCy 2.x installs; spaCy 3 removed shortcut links and raises
# OSError for it, so fall back to the explicit package name in that case.
try:
    nlp = spacy.load('en')
except OSError:
    nlp = spacy.load('en_core_web_sm')



In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [3]:
# Load the ROCStories (spring 2016) training CSV. Later cells read the
# columns sentence1 .. sentence5 from this frame.
training_set = pd.read_csv('./data/ROCStories__spring2016 - ROCStories_spring2016.csv')

In [4]:
# Preview the first rows to check the column layout.
training_set.head()

Unnamed: 0,storyid,storytitle,sentence1,sentence2,sentence3,sentence4,sentence5
0,9a51198e-96f1-42c3-b09d-a3e1e067d803,Overweight Kid,Dan's parents were overweight.,Dan was overweight as well.,The doctors told his parents it was unhealthy.,His parents understood and decided to make a c...,They got themselves and Dan on a diet.
1,617e7ada-3878-488d-bd56-40695b91f053,The Bike Accident,Carrie had just learned how to ride a bike.,She didn't have a bike of her own.,Carrie would sneak rides on her sister's bike.,She got nervous on a hill and crashed into a w...,The bike frame bent and Carrie got a deep gash...
2,79b0da1f-e460-4173-ba58-8c9e2553c53a,Beach,Morgan enjoyed long walks on the beach.,She and her boyfriend decided to go for a long...,"After walking for over a mile, something happe...",Morgan decided to propose to her boyfriend.,Her boyfriend was upset he didn't propose to h...
3,d173b7de-4611-4cdf-934c-912834755e41,The bad customer.,Jane was working at a diner.,"Suddenly, a customer barged up to the counter.",He began yelling about how long his food was t...,Jane didn't know how to react.,"Luckily, her coworker intervened and calmed th..."
4,af0fd5a4-de36-47ba-8aa2-e99d10986d7a,Being Patient,I was talking to my crush today.,She continued to complain about guys flirting ...,I decided to agree with what she says and list...,"After I got home, I got a text from her.",She asked if we can hang out tomorrow.


In [5]:
# Names of the five sentence columns: sentence1 .. sentence5.
SENTENCE_COLUMNS = ['sentence' + str(k + 1) for k in range(5)]

# Build the training corpus: one token list per sentence, in story order
# (story 0 sentences 1-5, then story 1 sentences 1-5, ...).
# The five columns are pulled out as one 2-D array and flattened row by row,
# which keeps that order while avoiding a pandas row object being built for
# every single `.iloc[i][column]` lookup.
corpus = [
    gensim.utils.simple_preprocess(sentence)
    for sentence in training_set[SENTENCE_COLUMNS].values.ravel()
]

In [218]:
from gensim.models import Doc2Vec
# Load a previously trained Doc2Vec model from disk; it is used below to
# infer a vector for every sentence.
# NOTE(review): the file name suggests 300-dimensional vectors, matching the
# `dimension=300` used later in this notebook — confirm against the script
# that trained it.
# NOTE(review): model files are presumably deserialised with pickle — only
# load files from a trusted source; confirm.
doc2vec_model = Doc2Vec.load('./word2vec_model/m_300.d2v')

In [219]:
# Infer one Doc2Vec vector per tokenised sentence; row k of `encoded_list`
# is the vector for corpus[k].
encoded_list = np.array(list(map(doc2vec_model.infer_vector, corpus)))
# Integer ids 0 .. N-1, one per sentence. Batching is done on these ids and
# the vectors are looked up from `encoded_list` afterwards.
encoded = np.arange(encoded_list.shape[0])

In [10]:
def get_batches(arr, batch_size, seq_length=5):
    """Yield ``(x, y)`` windows cut from an array of ids.

    The array is truncated to a whole number of batches, reshaped to
    ``batch_size`` rows, and walked left to right in steps of ``seq_length``
    columns.

    Args:
        arr: numpy array of ids (it is sliced and reshaped here).
        batch_size: number of rows in every yielded window.
        seq_length: number of columns in every yielded window.

    Yields:
        Tuples ``(x, y)`` of shape ``(batch_size, seq_length)``. ``y`` is ``x``
        shifted one position to the left; its last column is the id that
        follows the window in the same row, or the row's first id when the
        window is the last one.
    """
    ids_per_batch = batch_size * seq_length
    full_batches = len(arr) // ids_per_batch

    # Drop the tail that cannot fill a complete batch, then lay the rest out
    # as `batch_size` parallel rows.
    rows = arr[:full_batches * ids_per_batch].reshape((batch_size, -1))
    row_length = rows.shape[1]

    for start in range(0, row_length, seq_length):
        stop = start + seq_length
        x = rows[:, start:stop]

        y = np.zeros_like(x)
        y[:, :-1] = x[:, 1:]
        # The final target comes from the column right after the window; the
        # last window has no such column and wraps around to column 0.
        y[:, -1] = rows[:, stop] if stop < row_length else rows[:, 0]
        yield x, y

In [69]:
def get_encode(arr, dimension=300):
    """Replace every sentence id in ``arr`` with its embedding vector.

    Ids are looked up in the module-level ``encoded_list`` table (one vector
    per row). The lookup is a single vectorised gather, so ``arr`` may have
    any number of axes, not just two.

    Args:
        arr: integer array of row indices into ``encoded_list``.
        dimension: expected length of each embedding vector.

    Returns:
        A new float32 array of shape ``arr.shape + (dimension,)``.

    Raises:
        ValueError: if the rows of ``encoded_list`` are not ``dimension`` long.
    """
    ids = np.asarray(arr)
    table = np.asarray(encoded_list)
    if table.ndim != 2 or table.shape[1] != dimension:
        raise ValueError(
            'encoded_list has shape %s, expected (n, %d)' % (table.shape, dimension)
        )

    # Integer-array indexing gathers all rows at once; the result already has
    # shape ids.shape + (dimension,). astype always returns a fresh copy.
    return table[ids].astype(np.float32)

In [12]:
class LSTMModel(nn.Module):
    """Two-layer LSTM followed by dropout and a linear projection.

    Every time step of the input is mapped to an output vector of the same
    size as the input vectors.

    Args:
        length: size of each input vector and of each output vector.
        hidden_dim: number of hidden units in each LSTM layer.
    """

    def __init__(self,length,hidden_dim):
        super(LSTMModel, self).__init__()
        self.hidden_dim = hidden_dim

        # batch_first=True: tensors are laid out as (batch, seq_len, features).
        self.lstm1 = nn.LSTM(length, hidden_dim, num_layers=2,dropout=0.5, batch_first=True)

        self.dropout = nn.Dropout(p=0.5)

        self.linear = nn.Linear(self.hidden_dim, length)

    def forward(self, input,hidden=None):
        """Run the network over a batch of sequences.

        Args:
            input: tensor of shape (batch, seq_len, length).
            hidden: optional ``(h_0, c_0)`` initial state for the LSTM. When
                omitted, ``nn.LSTM`` starts from an all-zero state on the
                input's own device, so no state is built by hand here (the
                earlier hand-built state was never passed to the LSTM and its
                unconditional ``.cuda()`` call failed on CPU-only machines).

        Returns:
            ``(output, hidden)`` where ``output`` has shape
            (batch * seq_len, length) and ``hidden`` is the LSTM's final
            ``(h_n, c_n)`` state.
        """
        output, hidden = self.lstm1(input, hidden)

        output = self.dropout(output)
        # Flatten (batch, seq_len, hidden_dim) to one row per time step.
        # contiguous() keeps view() valid if the LSTM output is strided.
        output = self.linear(output.contiguous().view(-1, self.hidden_dim))
        return output, hidden

## Train LSTM model

In [220]:
# Size of each sentence vector fed to the model, and of each predicted vector.
dimension = 300
# Hidden units per LSTM layer.
HIDDEN_DIM = 256

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = LSTMModel(dimension, HIDDEN_DIM)
# Trailing expression: moves the model to `device` and displays its layers.
model.to(device)

LSTMModel(
  (lstm1): LSTM(300, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5)
  (linear): Linear(in_features=256, out_features=300, bias=True)
)

In [221]:
# Training hyper-parameters.
batch_size=100
seq_length=5
print_every_iters=250
n_epochs=20

optimizer = torch.optim.Adam(model.parameters(), lr=0.002)
# Cosine-embedding loss: with an all-ones label it rewards predicted vectors
# that point in the same direction as their target vectors.
loss_fn = nn.CosineEmbeddingLoss()

model.train()
for epoch in range(n_epochs):
    for i, (x, y) in enumerate(get_batches(encoded, batch_size, seq_length)):
        # Swap sentence ids for their vectors: (batch, seq, dimension).
        inputs = torch.from_numpy(get_encode(x, dimension)).to(device)
        # Targets are flattened to one row per time step, like the outputs.
        targets = torch.from_numpy(get_encode(y, dimension)).to(device).view(-1, dimension)

        optimizer.zero_grad()
        outputs,_ = model(inputs)

        # One "+1" label per predicted vector. It is sized from the model
        # output rather than hard-coded to 500, so changing batch_size or
        # seq_length no longer breaks the loss call.
        same_direction = torch.ones(outputs.size(0), device=device)
        loss = loss_fn(outputs, targets, same_direction)
        loss.backward()
        # Clip the gradient norm at 5 before the update.
        nn.utils.clip_grad_norm_(model.parameters(), 5)

        optimizer.step()
        if i % print_every_iters == 0:
            print ('Epoch: %d [%d], loss: %1.3f' \
                   % (epoch, i, loss.item()
                      ))
    # Checkpoint the whole model after every epoch (overwrites the same file).
    torch.save(model, './lstm_model/lstm300.mdl')


Epoch: 0 [0], loss: 1.013
Epoch: 0 [250], loss: 0.643


  "type " + obj.__name__ + ". It won't be checked "


Epoch: 1 [0], loss: 0.629
Epoch: 1 [250], loss: 0.629
Epoch: 2 [0], loss: 0.622
Epoch: 2 [250], loss: 0.625
Epoch: 3 [0], loss: 0.620
Epoch: 3 [250], loss: 0.623
Epoch: 4 [0], loss: 0.618
Epoch: 4 [250], loss: 0.621
Epoch: 5 [0], loss: 0.618
Epoch: 5 [250], loss: 0.622
Epoch: 6 [0], loss: 0.617
Epoch: 6 [250], loss: 0.619
Epoch: 7 [0], loss: 0.616
Epoch: 7 [250], loss: 0.620
Epoch: 8 [0], loss: 0.615
Epoch: 8 [250], loss: 0.618
Epoch: 9 [0], loss: 0.615
Epoch: 9 [250], loss: 0.618
Epoch: 10 [0], loss: 0.614
Epoch: 10 [250], loss: 0.617
Epoch: 11 [0], loss: 0.613
Epoch: 11 [250], loss: 0.617
Epoch: 12 [0], loss: 0.613
Epoch: 12 [250], loss: 0.616
Epoch: 13 [0], loss: 0.612
Epoch: 13 [250], loss: 0.617
Epoch: 14 [0], loss: 0.610
Epoch: 14 [250], loss: 0.616
Epoch: 15 [0], loss: 0.610
Epoch: 15 [250], loss: 0.616
Epoch: 16 [0], loss: 0.611
Epoch: 16 [250], loss: 0.617
Epoch: 17 [0], loss: 0.611
Epoch: 17 [250], loss: 0.614
Epoch: 18 [0], loss: 0.611
Epoch: 18 [250], loss: 0.614
Epoch: 19 

In [None]:
# model = torch.load('./lstm_model/lstm.mdl')

## Compute test accuracy

In [48]:
# Load the cloze test CSV. Later cells read InputSentence4, the two
# RandomFifthSentenceQuiz columns and AnswerRightEnding from this frame.
test_data = pd.read_csv('./data/cloze_test_test__spring2016 - cloze_test_ALL_test.csv')

In [49]:
# Preview the first rows to check the column layout.
test_data.head()

Unnamed: 0,InputStoryid,InputSentence1,InputSentence2,InputSentence3,InputSentence4,RandomFifthSentenceQuiz1,RandomFifthSentenceQuiz2,AnswerRightEnding
0,b929f263-1dcd-4a0b-b267-5d5ff2fe65bb,My friends all love to go to the club to dance.,They think it's a lot of fun and always invite.,I finally decided to tag along last Saturday.,I danced terribly and broke a friend's toe.,My friends decided to keep inviting me out as ...,"The next weekend, I was asked to please stay h...",2
1,7cbbc0af-bcce-4f56-871d-963f9bb6a99d,I tried going to the park the other day.,The weather seemed nice enough for a walk.,Within minutes of getting there I started snee...,My eyes were watery and it was hard to breathe.,My allergies were too bad and I had to go back...,It reminded me of how much I loved spring flow...,1
2,4745d627-be9b-45f2-ad92-99c82cc83f85,Avery was married with children.,She was tired of her boring life.,"One day, she decided to meet up with an old bo...",She made poor decisions that night and was unf...,Avery thought her children would be happy with...,Avery regretted what she did the next day.,2
3,6bc5a855-3a02-454c-8d8b-4428b1864f95,Josh loved when his mom baked apple pie.,He hated how he always had to wait until after...,So he decided this time he would sneak a piece...,The eggs his mom used must have been bad though.,Josh thought that the pie was delicious.,Josh got sick.,2
4,2f74e81d-957b-4541-9e66-59e5eaa6aef1,John was writing lyrics for his new album.,He started experiencing writer's block.,He tried to force himself to write but it woul...,"He took a walk, hung out with some friends, an...",He felt inspiration and then went back home to...,John then got an idea for his painting.,1


In [222]:
def _embed_sentence(sentence):
    """Tokenise one raw sentence and infer its Doc2Vec vector."""
    return doc2vec_model.infer_vector(gensim.utils.simple_preprocess(sentence))


# Per test story: the vector of the fourth context sentence and the vectors
# of the two candidate endings.
test = []
choice_1 = []
choice_2 = []

# Walk the three columns in lockstep (positional, so it does not depend on
# the frame's index labels) instead of rebuilding a row object three times
# per story with `test_data.loc[i][column]`.
story_columns = zip(
    test_data['InputSentence4'],
    test_data["RandomFifthSentenceQuiz1"],
    test_data["RandomFifthSentenceQuiz2"],
)
for last_context, ending_1, ending_2 in story_columns:
    # Same per-story call order as before (context, ending 1, ending 2), in
    # case vector inference depends on the order of calls.
    test.append(_embed_sentence(last_context))
    choice_1.append(_embed_sentence(ending_1))
    choice_2.append(_embed_sentence(ending_2))

In [210]:
def cosine(a, b):
    """Return the cosine similarity of two 1-D tensors as a 0-d tensor.

    Computed as the dot product divided by the product of the two norms; no
    guard is applied for zero-length vectors.
    """
    norm_product = a.norm() * b.norm()
    return a.dot(b) / norm_product

In [223]:
# Stack the per-story context vectors into one 2-D array (one row per story).
test = np.array(list(test))
# Integer ids 0 .. N-1, one per test story.
test_arrange = np.arange(len(test))
def test_get_encode(arr, dimension=300):
    
    # Initialize the the encoded array
    one_hot = np.zeros((np.multiply(*arr.shape), dimension), dtype=np.float32)
    
    for i in range(one_hot.shape[0]):
        one_hot[i]=test[arr.reshape(-1,1)[i]]
        
    one_hot = one_hot.reshape((*arr.shape, dimension))
    return one_hot

In [206]:
def get_answer(out,choice1,choice2):
    """Pick the candidate ending whose vector is closer to the prediction.

    Args:
        out: model output tensor; only its first row is used.
        choice1: numpy vector for the first candidate ending.
        choice2: numpy vector for the second candidate ending.

    Returns:
        1 if the first candidate has the strictly higher cosine similarity
        to ``out[0]``, otherwise 2 (ties go to 2).
    """
    predicted = out[0]
    score_1 = cosine(predicted, torch.from_numpy(choice1).to(device)).item()
    score_2 = cosine(predicted, torch.from_numpy(choice2).to(device)).item()
    return 1 if score_1 > score_2 else 2

In [224]:
# Predicted ending (1 or 2) for every test story, in test order.
answer=[]
# Evaluation mode switches dropout off.
model.eval()
# Inference only, so skip gradient tracking.
with torch.no_grad():
    for i in range(len(test)):
        # A (1, 1) id array becomes a (1, 1, dimension) batch: one story,
        # one time step holding the fourth context sentence.
        arr=np.array([i]).reshape(1,1)
        # Named `context` rather than `input` so the builtin input() is not
        # shadowed in the notebook namespace.
        context=torch.from_numpy(test_get_encode(arr,dimension)).to(device)
        out,_=model(context)
        answer.append(get_answer(out,choice_1[i],choice_2[i]))

In [225]:
# Gold labels: which of the two candidate endings is the right one.
test_answer_true = np.array(test_data["AnswerRightEnding"].tolist())
# Accuracy: fraction of stories whose predicted ending matches the gold label.
(np.array(answer) == test_answer_true).mean()

0.5387493319080705

lstm accuracy: 54.14%

m300,lstm300,53.87%