In [1]:
from data import *
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import numpy as np
import os 
import data
import pickle
import sys
import pandas as pd
from sklearn.metrics import cohen_kappa_score
import Model
import random


# This is the iterator we'll use during training. 
# It's a generator that gives you one batch at a time.
def data_iter(source, batch_size):
    dataset_size = len(source)
    start = -1 * batch_size
    order = list(range(dataset_size))
    random.shuffle(order)

    while True:
        start += batch_size
        if start > dataset_size - batch_size:
            # Start another epoch.
            start = 0
            random.shuffle(order)   
        batch_indices = order[start:start + batch_size]
        batch = [source[index] for index in batch_indices]
        yield [source[index] for index in batch_indices]

# This is the iterator we use when we're evaluating our model. 
# It gives a list of batches that you can then iterate through.
def eval_iter(source, batch_size):
    batches = []
    dataset_size = len(source)
    start = -1 * batch_size
    order = list(range(dataset_size))
    random.shuffle(order)

    while start < dataset_size - batch_size:
        start += batch_size
        batch_indices = order[start:start + batch_size]
        batch = [source[index] for index in batch_indices]
        if len(batch) == batch_size:
            batches.append(batch)
        else:
            continue
        
    return batches

# The following function gives batches of vectors and labels, 
# these are the inputs to your model and loss function
def get_batch(batch):
    vectors = []
    labels = []
    for dict in batch:
        vectors.append(dict["text_index_sequence"])
        labels.append(dict["label"])
    return vectors, labels

def repackage_hidden(h):
    if type(h) == Variable:
        return Variable(h.data)
    else:
        return tuple(repackage_hidden(v) for v in h)

def training_loop(batch_size, num_epochs, model, loss_, optim, training_iter, dev_iter, train_eval_iter):
    step = 0
    epoch = 0
    total_batches = int(len(training_set) / batch_size)
    total_samples = total_batches * batch_size
    hidden = model.init_hidden(batch_size)
    while epoch <= num_epochs:
        epoch_loss = 0
        model.train()

        vectors, labels = get_batch(next(training_iter)) 
        vectors = torch.stack(vectors).squeeze()
        vectors = vectors.transpose(1, 0)
        
        labels = Variable(torch.stack(labels).squeeze().type('torch.FloatTensor')) 
        vectors = Variable(vectors)
        
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(vectors, hidden)
        lossy = loss_(output, labels)
        epoch_loss += lossy.data[0] * batch_size

        lossy.backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 5.0)
        optim.step()

        if step % total_batches == 0:
            loss_train = evaluate(model, train_eval_iter,batch_size)
            loss_dev = evaluate(model, dev_iter,batch_size)
            kappa_dev = evaluate_kappa(model, dev_iter,batch_size)
            with open("test.txt", "a") as myfile:
                myfile.write("Epoch %i; Step %i; Avg Loss %f; Train loss: %f; Dev loss: %f; Dev kappa: %f\n" 
                  %(epoch, step, epoch_loss/total_samples, loss_train, loss_dev, kappa_dev))
            print("Epoch %i; Step %i; Avg Loss %f; Train loss: %f; Dev loss: %f; Dev kappa: %f" 
                  %(epoch, step, epoch_loss/total_samples, loss_train, loss_dev, kappa_dev))
            epoch += 1
            
        if step % 5 == 0:
            with open("test.txt", "a") as myfile:
                myfile.write("Epoch %i; Step %i; loss %f\n" %(epoch, step, lossy.data[0]))
            print("Epoch %i; Step %i; loss %f" %(epoch, step, lossy.data[0]))
        step += 1

# This function outputs the accuracy on the dataset, we will use it during training.
def evaluate(model, data_iter, batch_size):
    model.eval()
    correct = 0
    total = 0
    evalloss = 0.0
    hidden = model.init_hidden(batch_size)
    for i in range(len(data_iter)):
        vectors, labels = get_batch(data_iter[i])
        vectors = torch.stack(vectors).squeeze()
        vectors = vectors.transpose(1, 0)
        
        labels = Variable(torch.stack(labels).squeeze().type('torch.FloatTensor'))
        vectors = Variable(vectors)

        hidden = repackage_hidden(hidden)
        output, hidden = model(vectors, hidden)
        evalloss += F.mse_loss(output, labels).data[0]
    return evalloss/len(data_iter)


def evaluate_kappa(model, data_iter, batch_size):
    model.eval()
    predicted_labels = []
    true_labels = []
    hidden = model.init_hidden(batch_size)
    for i in range(len(data_iter)):
        vectors, labels = get_batch(data_iter[i])
        vectors = torch.stack(vectors).squeeze()
        vectors = vectors.transpose(1, 0)

        vectors = Variable(vectors)
        
        hidden = repackage_hidden(hidden)
        output, hidden = model(vectors, hidden)

        predicted = [int(round(float(num))) for num in output.data.cpu().numpy()]
        predicted_labels.extend([round(float(num)) for num in output.data.cpu().numpy()])
        labels = [int(label[0]) for label in labels]
        true_labels.extend(labels)

    return cohen_kappa_score(true_labels, predicted_labels, weights = "quadratic")

In [2]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

class LSTM(nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder."""

    def __init__(self, rnn_type, vocab_size, embedding_dim, hidden_size, num_layers, dropout=0.2, bidirectional = False, pre_emb=None):
        super(LSTM, self).__init__()
        self.encoder = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = getattr(nn, rnn_type)(embedding_dim, hidden_size, num_layers, bias=False, dropout=dropout, bidirectional=bidirectional)
        self.decoder = nn.Linear(hidden_size, 1)
        self.decoder_bi = nn.Linear(hidden_size*2, 1)
        self.bidirectional = bidirectional
        self.rnn_type = rnn_type
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.init_weights(pre_emb)
        
    def init_weights(self, pretrained_embedding):
            initrange = 0.1
            if(pretrained_embedding is not None):
                pretrained_embedding = pretrained_embedding.astype(np.float32)
                pretrained_embedding = torch.from_numpy(pretrained_embedding)
                self.encoder.weight.data = pretrained_embedding
            else:
                self.encoder.weight.data.uniform_(-initrange, initrange)
            self.decoder.bias.data.fill_(0)
            self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, inputs, hidden):
        emb = self.encoder(inputs)
        output, hidden = self.rnn(emb, hidden)
        # mot here
        self.mot = torch.mean(output, 0)
        self.mot = torch.squeeze(output)
        print(self.mot.size())
        if self.bidirectional:
            decoded = self.decoder_bi(output)
        else:
            decoded = self.decoder(output)
        return decoded, hidden
    
    def init_hidden(self, batch_size):
        ##next(rnn.parameters()).data=rnn.encoder.weight.data
        weight = next(self.parameters()).data
        if self.bidirectional == True:
            return (Variable(weight.new(self.num_layers * 2, batch_size, self.hidden_size).zero_()),
                    Variable(weight.new(self.num_layers * 2, batch_size, self.hidden_size).zero_()))
        else:
            return (Variable(weight.new(self.num_layers, batch_size, self.hidden_size).zero_()),
                    Variable(weight.new(self.num_layers, batch_size, self.hidden_size).zero_()))            



In [3]:
import nltk

In [8]:
raw_data = pd.read_csv("../data/training_final.csv", sep=',',header=0, index_col=0)
data_set = data.get_data(raw_data)
print('Finished Loading!')

#get max sequence length
max_seq_length = max(list(map(lambda x:len(x.split()),raw_data.essay)))
print('max seq length: ', max_seq_length)

# split to train/val/test
data_size = len(data_set)
print('data_size',data_size)
training_set = data_set[:int(data_size*0.8)]
dev_set = data_set[int(data_size*0.8):int(data_size*0.9)]
test_set = data_set[int(data_size*0.9):]

Finished Loading!
max seq length:  1064
data_size 12977


In [12]:
#test_set.to_csv('test_set.csv', index = False)
pickle.dump(test_set,open('test_set.pk', 'wb'))



In [6]:
# convert and formatting
word_to_ix, index_to_word, vocab_size = data.build_dictionary([training_set])
#print('vocab size', vocab_size)
data.sentences_to_padded_index_sequences(word_to_ix, [training_set, dev_set], max_seq_length)
print('Finished Converting!')

Finished Converting!


In [7]:
data_set[0]

{'essay_set': 6, 'label': 
  9
 [torch.LongTensor of size 1], 'text': 'in the excerpt from @organization2\'s the mooring mast, the builders of the empire state building faced many obstacles in allowing dirigibles to dock there. for example, "the lack of a suitable landing area." architects cannot just drop a mooring mast on top of the empire state building\'s roof. the building would have had an extreme amount of pressure on it. in addition, along with the pressure, the dirigibles would "add stress to the building\'s frame." the builders needed to modify and strengthen the steel frame of the empire state building. this also cost a great deal of money. furthermore, "the greatest reason was safety." the slightest mistake in building the mast, could affect every person in the building. in order to allow dirigibles to dock there, the builders needed to be aware of all the citizens surrounding the building. all in all, these obstacles determined the fate of the mast.', 'text_index_sequence'

In [27]:

#######
# Train

# Hyper Parameters 
model = 'LSTM'
input_size = vocab_size
hidden_dim = 24
embedding_dim = 50
batch_size = 100
learning_rate = 0.1
num_epochs = 1
num_layer = 1
bi_direction = True


matrix = np.zeros((2, int(embedding_dim)))

oov=0
glove = {}
filtered_glove = {}
glove_path = '../data/filtered_glove_50.p'
if(os.path.isfile(glove_path)):
    print("Reusing glove dictionary to save time")
    pretrained_embedding = pickle.load(open(glove_path,'rb'))
else:
    #print('loading glove embedding')
    with open('../data/glove.6B.50d.txt') as f:
        lines = f.readlines()
        for l in lines:
            vec = l.split(' ')
            glove[vec[0].lower()] = np.array(vec[1:])
    print('glove size={}'.format(len(glove)))
    print("Finished making glove dictionary")

    for i in range(2, len(index_to_word)):
        word = index_to_word[i]
        if(word in glove):
            vec = glove[word]
            filtered_glove[word] = glove[word]
            matrix = np.vstack((matrix,vec))
        else:
            oov+=1
            random_init = np.random.uniform(low=-0.01,high=0.01, size=(1,embedding_dim))
            matrix = np.vstack((matrix,random_init))

    pickle.dump(matrix, open("../data/filtered_glove_50.p", "wb"))
    #print(matrix.shape)
    pretrained_embedding = matrix
    #print("word_to_ix", len(word_to_ix))
    #print("oov={}".format(oov))
    #print("Saving glove vectors")
    print("Saving glove vectors")







# Build, initialize, and train model
rnn = Model.LSTM(model, vocab_size, embedding_dim, hidden_dim, num_layer, dropout=0.2, bidirectional=bi_direction, 
pre_emb=None)

# Loss and Optimizer
loss = nn.MSELoss()
optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)

# Train the model
training_iter = data_iter(training_set, batch_size)
train_eval_iter = eval_iter(training_set, batch_size)
dev_iter = eval_iter(dev_set, batch_size)
print('start training:')
training_loop(batch_size, num_epochs, rnn, loss, optimizer, training_iter, dev_iter, train_eval_iter)


Reusing glove dictionary to save time
start training:
Epoch 0; Step 0; Avg Loss 0.731220; Train loss: 114.863277; Dev loss: 110.861959; Dev kappa: -0.036373
Epoch 1; Step 0; loss 75.315620
Epoch 1; Step 5; loss 96.604324
Epoch 1; Step 10; loss 85.804970
Epoch 1; Step 15; loss 46.120274
Epoch 1; Step 20; loss 106.849983
Epoch 1; Step 25; loss 38.159824
Epoch 1; Step 30; loss 72.105881
Epoch 1; Step 35; loss 66.949478
Epoch 1; Step 40; loss 37.453758
Epoch 1; Step 45; loss 50.282639
Epoch 1; Step 50; loss 40.439720
Epoch 1; Step 55; loss 21.929771
Epoch 1; Step 60; loss 15.146441
Epoch 1; Step 65; loss 15.030107
Epoch 1; Step 70; loss 24.175129
Epoch 1; Step 75; loss 7.333912
Epoch 1; Step 80; loss 14.395830
Epoch 1; Step 85; loss 12.380017
Epoch 1; Step 90; loss 6.457202
Epoch 1; Step 95; loss 9.085093
Epoch 1; Step 100; loss 8.120253
Epoch 1; Step 103; Avg Loss 0.083561; Train loss: 9.765585; Dev loss: 11.773623; Dev kappa: 0.921521


In [28]:
dev_iter = eval_iter(dev_set, batch_size)

In [30]:
len(dev_iter)

12

In [32]:
get_batch(next(training_iter))

{'label': 
  6
 [torch.LongTensor of size 1],
 'text': "In this world you never know what someone might writing. There're many materials in this world that I think aren't appropiate for someone to be reading, watching or listenning, ect.  You @MONTH1 be thinking that anyone makes his/her own decision of what they read, watch, listen.  Well, I do agree with that, but I do believe that some materials aren't rate it properly to age.  I do think that some materials should be banned from people under age, some from kids, and some from everyone.      You @MONTH1 be thinking that all of this is wrong and that there is nothing wrong with books, music, movies, ect., but in my opinion some materials are just offensive and not proper. I seen so many materials that I think are not proper for people under age to been looking at. In some movies, for example, the rating is just not right because some movies have so much violence or language.  You might not think it's that big of a deal, of course, bu