In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re
import util
import random
import embeddings
import nltk

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
EMBEDDING_DIM = 50
HIDDEN_DIM = 100

In [5]:
vocab = embeddings.load_glove()


# language_model = embeddings.load_local_word2vec()
# text = util.get_alphanumeral(collections[0]["texts"][0])
# embeddings.get_doc_embedding(model, text)

In [6]:
df = util.get_processed_data("./data/embeddings_gensim.csv", False)
collections = util.get_collections(df)

In [7]:

for col in collections:
    lrs = []
    for i,text in enumerate(col["texts"]):
        new_text = ""
        for word in text.split():
            if len(word)>1:
                new_text+=word+" "
        if new_text.strip()!="":
            lrs.append(new_text.strip().lower())
    col["texts"] = lrs
        

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

documents = []
doc_to_ix = {}
for col in collections:
    for text in col["texts"]:
        if text not in doc_to_ix:
            doc_to_ix[text] = len(doc_to_ix)
            documents.append(text)
vectorizer = TfidfVectorizer()
tf_matrix = vectorizer.fit_transform(documents)

feature_names = vectorizer.get_feature_names()
word_to_ix = {word:index for index, word in enumerate(feature_names)}

In [10]:
import util
training_data, testing_data = util.get_train_test(collections)



In [11]:
def get_doc_embedding(text, word_embedding_dim):
    global vocab
    global device
    global tf_matrix
    global word_to_ix
    global doc_to_ix
    embeds = []
    text = text.lower()
    for word in text.split():
        if word in vocab:
#             print(word, text)
            tf_idf = tf_matrix[doc_to_ix[text], word_to_ix[word]]
#             if tf_idf == 0:
#                 print("0 tfidf: text", text, "word", word)
            embed = [v * tf_idf for v in vocab[word]]
            embeds.append(torch.tensor(embed, dtype = torch.float, device = device))
        else:
            embeds.append(torch.zeros(word_embedding_dim, device = device, dtype = torch.float))
    embeds = torch.cat(embeds).view(len(text.split()), -1)
    return embeds.sum(dim = 0)/embeds.shape[0]

In [12]:
def text_similarity(text1, text2):
    cos = nn.CosineSimilarity(dim = 0)
    embed1 = get_doc_embedding(text1,EMBEDDING_DIM)
    embed2 = get_doc_embedding(text2,EMBEDDING_DIM)
    
    return cos(embed1, embed2)

In [13]:
def collection_similarity_score(texts):
    count = 0
    similarity_sum = 0.0
    for ind in range(len(texts)-1):
        similarity_sum += text_similarity(texts[ind],texts[ind+1])
        count+=1
    if count == 0:
        return 0
    else:
        return similarity_sum/count

In [14]:
class LSTMClassifier(nn.Module):
    
    def __init__(self, doc_embedding_dim, hidden_dim):
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(doc_embedding_dim, hidden_dim)
        self.hidden2out = nn.Linear(hidden_dim, 2)
        self.hidden = self.init_hidden()
        
    def init_hidden(self):
        return (torch.zeros(1,1,self.hidden_dim, device = device), torch.zeros(1,1,self.hidden_dim, device = device))

    def forward(self, coll_embeds):
        
        lstm_out, self.hidden = self.lstm(coll_embeds.view(len(coll_embeds),1,-1), self.hidden)
        out = self.hidden2out(lstm_out[-1])
        score = F.log_softmax(out, dim = 1)
        return score

In [15]:
from gensim.models.doc2vec import Doc2Vec

doc2vec_model= Doc2Vec.load("d2v.model")

In [16]:
#Training
model = LSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM).to(device)
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr = 0.01)
loss_array = []

for epoch in range(10):
    total_loss = 0
    count = 0
    printed_percentages = []
    corrects = 0
    for collection,label in np.array(training_data):
        
        complete_percentage = int(count*100/len(training_data))
        if complete_percentage%5 == 0 and (complete_percentage not in printed_percentages):
            print("epoch ",epoch+1,complete_percentage,"percent complete")
            print("Current accuracy: ",(corrects+0.0)/(count+1))
            printed_percentages.append(complete_percentage)
        count+=1
        
        label = torch.tensor([label], dtype = torch.long, device = device)
#         collection_ixs = [prepare_sequence(text.split(),word_to_ix) for text in collection]
#         collection_embeds =[get_doc_embedding(text, EMBEDDING_DIM) for text in collection]
#         collection_embeds =[df[df.text==text]['embeddings'][0] for text in collection]
        collection_embeds = [torch.tensor(doc2vec_model.infer_vector(text), dtype = torch.float, device = device) for text in collection]
        collection_embeds = torch.cat(collection_embeds).view(len(collection), -1)
        model.zero_grad()

        model.hidden = model.init_hidden()
#         print(collection_ixs)
        
        score = model(collection_embeds)
        _, predicted = torch.max(score,1)
        correct = 1 if (predicted == label) else 0
        corrects += correct
        
#         print(score.shape,label.shape)
        loss = loss_function(score, label)
        
        loss.backward()
        total_loss += loss.item()
        
        optimizer.step()
        
    loss_array.append(total_loss) 
    print("\nepoch "+str(epoch+1)+" loss: "+str(total_loss)+"\n")
    print(loss_array)

('epoch ', 1, 0, 'percent complete')
('Current accuracy: ', 0.0)
('epoch ', 1, 5, 'percent complete')
('Current accuracy: ', 0.7547169811320755)
('epoch ', 1, 10, 'percent complete')
('Current accuracy: ', 0.8535433070866142)
('epoch ', 1, 15, 'percent complete')
('Current accuracy: ', 0.8885383806519453)
('epoch ', 1, 20, 'percent complete')
('Current accuracy: ', 0.9014195583596214)
('epoch ', 1, 25, 'percent complete')
('Current accuracy: ', 0.910410094637224)
('epoch ', 1, 30, 'percent complete')
('Current accuracy: ', 0.9195160441872698)
('epoch ', 1, 35, 'percent complete')
('Current accuracy: ', 0.9220018034265104)


KeyboardInterrupt: 

In [None]:
#Testing
with torch.no_grad():
    total_coll = len(testing_data)
    correct_preds = 0

    for collection,label  in np.array(testing_data):
        label = torch.tensor(label, dtype = torch.long, device = device)
        collection_embeds =[get_doc_embedding(text, EMBEDDING_DIM) for text in collection]
        collection_embeds = torch.cat(collection_embeds).view(len(collection), -1)
#         collection_ixs = [prepare_sequence(text.split(),word_to_ix) for text in collection]
        score = model(collection_embeds)
        _, predicted = torch.max(score,1)
        correct = 1 if (predicted == label) else 0
        correct_preds += correct
                
    print("Total collections : "+str(total_coll))
    print("Correct predictions: "+str(correct_preds))
    print ("Accuracy : "+str((correct_preds+0.0)/total_coll))

In [17]:
d1 = pd.read_csv('./data/collectionsWithTaxonomy.csv')

In [18]:
d1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125564 entries, 0 to 125563
Data columns (total 7 columns):
collection_id    125564 non-null object
sequence_id      125564 non-null int64
resource_id      125564 non-null object
taxonomy         75474 non-null object
title            125312 non-null object
description      113221 non-null object
is_deleted       125564 non-null object
dtypes: int64(1), object(6)
memory usage: 6.7+ MB


In [None]:
# torch.save(model,"./models/word_lstm_collections_csv.pt")