In [None]:
# Uses word level and learning resource level embeddings.
# All embeddings are trained on the fly
# Last hidden layer of word level LSTMs are concatenated with LR embeddings and fed into another LSTM
# Last hidden layer of the second LSTM is used to classify the collection

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re
import util
import random
# import embeddings

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# model = embeddings.load_word2vec()
# text = util.get_alphanumeral(collections[0]["texts"][0])
# embeddings.get_doc_embedding(model, text)

In [4]:
df = util.get_processed_data("./data/collections_math.csv", True)
collections = util.get_collections(df)

In [5]:
training_data, testing_data = util.get_train_test([col["texts"] for col in collections])

In [7]:
# Preparing dictionaries for words and learning resources
word_to_ix = {}
text_to_ix = {}
for col in collections:
    for text in col["texts"]:
        if text not in text_to_ix:
            text_to_ix[text] = len(text_to_ix)
            for word in text.split():
                if word not in word_to_ix:
                        word_to_ix[word] = len(word_to_ix)

In [8]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long, device = device)

In [9]:
SENT_EMBEDDING_DIM = 30
WORD_EMBEDDING_DIM = 100
WORD_HIDDEN_DIM = 30
SENT_HIDDEN_DIM = 30

In [10]:
#Defining the model
class LSTMClassifier(nn.Module):
    
    def __init__(self, word_embedding_dim, sent_embedding_dim, word_hidden_dim, sent_hidden_dim, word_vocab_size, sent_vocab_size):
        super(LSTMClassifier, self).__init__()
        self.word_hidden_dim = word_hidden_dim
        self.sent_hidden_dim = sent_hidden_dim
        self.word_embeddings = nn.Embedding(word_vocab_size, word_embedding_dim)
        self.sent_embeddings = nn.Embedding(sent_vocab_size, sent_embedding_dim)
        self.word_lstm = nn.LSTM(word_embedding_dim, word_hidden_dim)
        self.sent_lstm = nn.LSTM(word_hidden_dim + sent_embedding_dim, sent_hidden_dim)
        
        self.hidden2out = nn.Linear(sent_hidden_dim, 2)
        self.sent_hidden = self.init_sent_hidden()
        
    def init_word_hidden(self):
        return (torch.zeros(1,1,self.word_hidden_dim, device = device), torch.zeros(1,1,self.word_hidden_dim, device = device))
    
    def init_sent_hidden(self):
        return (torch.zeros(1,1,self.sent_hidden_dim, device = device), torch.zeros(1,1,self.sent_hidden_dim, device = device))

    def forward(self, collection, word_ixs):
        for i,sent_ix in enumerate(collection):
#             print("word_ixs: ",word_ixs)
            self.word_hidden = self.init_word_hidden()
            word_embeds = self.word_embeddings(word_ixs[i])
            word_lstm_out, self.word_hidden = self.word_lstm(word_embeds.view(len(word_ixs[i]),1,-1), self.word_hidden)
            
            sent_embeds = self.sent_embeddings(torch.tensor([sent_ix], dtype = torch.long, device = device))
            sent_lstm_input = torch.cat((sent_embeds.view(-1), self.word_hidden[0].view(-1))).view(1,1,-1)
            out, self.sent_hidden = self.sent_lstm(sent_lstm_input, self.sent_hidden)
            
        out = self.hidden2out(self.sent_hidden[0].view(1,-1))
        score = F.log_softmax(out, dim = 1)
        return score

In [13]:
#Training
model = LSTMClassifier(WORD_EMBEDDING_DIM, SENT_EMBEDDING_DIM, WORD_HIDDEN_DIM, SENT_HIDDEN_DIM, len(word_to_ix), len(text_to_ix)).to(device)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr = 0.01, momentum = 0.9)

for epoch in range(3):
    total_loss = 0
    count = 0
    printed_percentages = []
    for collection,label in training_data:
        
        complete_percentage = int(count*100/len(training_data))
        if complete_percentage%5 == 0 and (complete_percentage not in printed_percentages):
            print("epoch ",epoch+1,complete_percentage,"percent complete")
            printed_percentages.append(complete_percentage)
        count+=1
        
        label = torch.tensor([label], dtype = torch.long, device = device)
        sent_ixs = prepare_sequence(collection, text_to_ix)
        word_ixs = [prepare_sequence(text.split(),word_to_ix) for text in collection]
        model.zero_grad()

        model.sent_hidden = model.init_sent_hidden()
#         print(collection_ixs)
        
        score = model(sent_ixs, word_ixs)
        loss = loss_function(score, label)
        
        loss.backward()
        
        total_loss += loss.item()
        
        optimizer.step()
    print("\nepoch "+str(epoch+1)+" loss: "+str(total_loss)+"\n")

epoch  1 0 percent complete
epoch  1 5 percent complete
epoch  1 10 percent complete
epoch  1 15 percent complete
epoch  1 20 percent complete
epoch  1 25 percent complete
epoch  1 30 percent complete
epoch  1 35 percent complete
epoch  1 40 percent complete
epoch  1 45 percent complete
epoch  1 50 percent complete
epoch  1 55 percent complete
epoch  1 60 percent complete
epoch  1 65 percent complete
epoch  1 70 percent complete
epoch  1 75 percent complete
epoch  1 80 percent complete
epoch  1 85 percent complete
epoch  1 90 percent complete
epoch  1 95 percent complete

epoch 1 loss: 3944.1760928034782

epoch  2 0 percent complete
epoch  2 5 percent complete
epoch  2 10 percent complete
epoch  2 15 percent complete
epoch  2 20 percent complete
epoch  2 25 percent complete
epoch  2 30 percent complete
epoch  2 35 percent complete
epoch  2 40 percent complete
epoch  2 45 percent complete
epoch  2 50 percent complete
epoch  2 55 percent complete
epoch  2 60 percent complete
epoch  2 65 

In [14]:
#Testing
with torch.no_grad():
    total_coll = len(testing_data)
    correct_preds = 0
    count = 0
    for collection,label  in testing_data:
        label = torch.tensor(label, dtype = torch.long, device = device)
        sent_ixs = prepare_sequence(collection, text_to_ix)
        word_ixs = [prepare_sequence(text.split(),word_to_ix) for text in collection]
        score = model(sent_ixs, word_ixs)
        _, predicted = torch.max(score,1)
        correct = 1 if (predicted == label) else 0
        correct_preds += correct
        
    print("Count: ",count)
    print("Total collections : "+str(total_coll))
    print("Correct predictions: "+str(correct_preds))
    print ("Accuracy : "+str(correct_preds/total_coll))

Count:  0
Total collections : 2716
Correct predictions: 1359
Accuracy : 0.5003681885125184


In [None]:
# torch.save(model,"./models/word_lstm_collections_csv.pt")