In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re
import util
import random
import embeddings
import nltk
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk import word_tokenize
import ast
from sklearn.model_selection import train_test_split


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
vocab = embeddings.load_glove()


# language_model = embeddings.load_local_word2vec()
# text = util.get_alphanumeral(collections[0]["texts"][0])
# embeddings.get_doc_embedding(model, text)

In [5]:
df = pd.read_csv('./data/collections_all_science_out-temp_lda.csv')

In [6]:
word2vec = Word2Vec.load('./Word2Vec_100dim_science')

def Word2doc(sentence,dim):
    words=sentence.strip().split()
    emb=np.zeros(dim)
    for word in words:
        if(word in word2vec.wv.vocab):
            emb = np.add(emb,word2vec[word])
    return np.array(emb/len(words))

In [7]:
Doc2vec_model= Doc2Vec.load("./doc2vec_100dim_science.model")
 
def Doc2vec(sentence,dim):
    emb = np.zeros(dim) 
    emb += Doc2vec_model.infer_vector(word_tokenize(sentence.strip().lower()))
    return np.array(emb)

In [8]:
df['avg_word_emb'] = df.Summarization.apply(lambda x: Word2doc(x,100))
df['doc_emb'] = df.Summarization.apply(lambda x: Doc2vec(x,100))

  


In [9]:
df['lda_topics'] = df.lda_topics.apply(lambda x : [b for _ ,b in ast.literal_eval(x) ])
df['input_vec'] = df.apply(lambda x: np.append(x['doc_emb'],x['lda_topics']),axis=1)

In [10]:
col = util.generate_collections(df)
rand_col = util.generate_random_collections(df,df.shape[0],7)

In [11]:
print(len(col),len(rand_col))

(1393, 4943)


In [12]:
all_col = col + rand_col

In [61]:
all_cols = []
for coll in all_col:
    collection = {}
    collection['input_vecs'] = torch.tensor(i['input_vecs'],dtype=torch.float, device = device)
    collection['label'] = torch.tensor(i['label'],dtype=torch.long, device = device)
    all_cols.append(collection)

In [None]:
for i in a_:
    print(torch.tensor(i['input_vecs'],dtype=torch.float))

In [62]:
training_data, testing_data = train_test_split(all_cols,train_size = 0.7, random_state = 42)

In [64]:
a_ = np.array(training_data)

print(a_[0]['input_vecs'].shape,a_[0]['label'].shape)

(torch.Size([3, 120]), torch.Size([1]))


In [None]:
EMBEDDING_DIM = 120
HIDDEN_DIM = 100

In [65]:
class LSTMClassifier(nn.Module):
    
    def __init__(self, doc_embedding_dim, hidden_dim):
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(doc_embedding_dim, hidden_dim)
        self.hidden2out = nn.Linear(hidden_dim, 2)
        self.hidden = self.init_hidden()
        
    def init_hidden(self):
        return (torch.zeros(1,1,self.hidden_dim, device = device), torch.zeros(1,1,self.hidden_dim, device = device))

    def forward(self, coll_embeds):
        
        lstm_out, self.hidden = self.lstm(coll_embeds.view(len(coll_embeds),1,-1), self.hidden)
        out = self.hidden2out(lstm_out[-1])
        score = F.log_softmax(out, dim = 1)
        return score

In [66]:
#Training
model = LSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM).to(device)
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr = 3 * 0.001)
loss_array = []

for epoch in range(10):
    total_loss = 0
    count = 0
    printed_percentages = []
    corrects = 0
    for collection in np.array(training_data):
        
        complete_percentage = int(count*100/len(training_data))
        if complete_percentage%5 == 0 and (complete_percentage not in printed_percentages):
            print("epoch ",epoch+1,complete_percentage,"percent complete")
            print("Current accuracy: ",(corrects+0.0)/(count+1))
            printed_percentages.append(complete_percentage)
        count+=1
        
#         label = torch.tensor([label], dtype = torch.long, device = device)
#         collection_ixs = [prepare_sequence(text.split(),word_to_ix) for text in collection]
#         collection_embeds =[get_doc_embedding(text, EMBEDDING_DIM) for text in collection]
#         collection_embeds =[df[df.text==text]['embeddings'][0] for text in collection]
#         collection_embeds = [torch.tensor(doc2vec_model.infer_vector(text), dtype = torch.float, device = device) for text in collection]
#         collection_embeds = torch.cat(collection_embeds).view(len(collection), -1)
        model.zero_grad()

        model.hidden = model.init_hidden()
#         print(collection_ixs)
        
        score = model(collection['input_vecs'])
        _, predicted = torch.max(score,1)
        correct = 1 if (predicted == collection['label']) else 0
        corrects += correct
        
#         print(score.shape,label.shape)
        loss = loss_function(score, collection['label'])
        
        loss.backward()
        total_loss += loss.item()
        
        optimizer.step()
        
    loss_array.append(total_loss) 
    print("\nepoch "+str(epoch+1)+" loss: "+str(total_loss)+"\n")
    print(loss_array)

('epoch ', 1, 0, 'percent complete')
('Current accuracy: ', 0.0)
('epoch ', 1, 5, 'percent complete')
('Current accuracy: ', 0.9910313901345291)
('epoch ', 1, 10, 'percent complete')
('Current accuracy: ', 0.9955056179775281)
('epoch ', 1, 15, 'percent complete')
('Current accuracy: ', 0.9970014992503748)
('epoch ', 1, 20, 'percent complete')
('Current accuracy: ', 0.9977477477477478)
('epoch ', 1, 25, 'percent complete')
('Current accuracy: ', 0.9981981981981982)
('epoch ', 1, 30, 'percent complete')
('Current accuracy: ', 0.9984984984984985)
('epoch ', 1, 35, 'percent complete')
('Current accuracy: ', 0.9987129987129987)
('epoch ', 1, 40, 'percent complete')
('Current accuracy: ', 0.9988732394366198)
('epoch ', 1, 45, 'percent complete')
('Current accuracy: ', 0.99899849774662)
('epoch ', 1, 50, 'percent complete')
('Current accuracy: ', 0.9990986931050022)
('epoch ', 1, 55, 'percent complete')
('Current accuracy: ', 0.9991806636624334)
('epoch ', 1, 60, 'percent complete')
('Current


epoch 5 loss: 0.0

[1.2713720500469208, 0.0, 0.0, 0.0, 0.0]
('epoch ', 6, 0, 'percent complete')
('Current accuracy: ', 0.0)
('epoch ', 6, 5, 'percent complete')
('Current accuracy: ', 0.9955156950672646)
('epoch ', 6, 10, 'percent complete')
('Current accuracy: ', 0.9977528089887641)
('epoch ', 6, 15, 'percent complete')
('Current accuracy: ', 0.9985007496251874)
('epoch ', 6, 20, 'percent complete')
('Current accuracy: ', 0.9988738738738738)
('epoch ', 6, 25, 'percent complete')
('Current accuracy: ', 0.9990990990990991)
('epoch ', 6, 30, 'percent complete')
('Current accuracy: ', 0.9992492492492493)
('epoch ', 6, 35, 'percent complete')
('Current accuracy: ', 0.9993564993564994)
('epoch ', 6, 40, 'percent complete')
('Current accuracy: ', 0.9994366197183099)
('epoch ', 6, 45, 'percent complete')
('Current accuracy: ', 0.99949924887331)
('epoch ', 6, 50, 'percent complete')
('Current accuracy: ', 0.9995493465525012)
('epoch ', 6, 55, 'percent complete')
('Current accuracy: ', 0.9995

('epoch ', 10, 90, 'percent complete')
('Current accuracy: ', 0.9997495617330328)
('epoch ', 10, 95, 'percent complete')
('Current accuracy: ', 0.9997627520759194)

epoch 10 loss: 0.0

[1.2713720500469208, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [67]:
#Testing
with torch.no_grad():
    total_coll = len(testing_data)
    correct_preds = 0

    for collection in np.array(testing_data):
#         label = torch.tensor(label, dtype = torch.long, device = device)
#         collection_embeds =[get_doc_embedding(text, EMBEDDING_DIM) for text in collection]
#         collection_embeds = torch.cat(collection_embeds).view(len(collection), -1)
#         collection_ixs = [prepare_sequence(text.split(),word_to_ix) for text in collection]
        score = model(collection['input_vecs'])
        _, predicted = torch.max(score,1)
        correct = 1 if (predicted == collection['label']) else 0
        correct_preds += correct
                
    print("Total collections : "+str(total_coll))
    print("Correct predictions: "+str(correct_preds))
    print ("Accuracy : "+str((correct_preds+0.0)/total_coll))

Total collections : 1901
Correct predictions: 1901
Accuracy : 1.0


In [17]:
d1 = pd.read_csv('./data/collectionsWithTaxonomy.csv')

In [18]:
d1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125564 entries, 0 to 125563
Data columns (total 7 columns):
collection_id    125564 non-null object
sequence_id      125564 non-null int64
resource_id      125564 non-null object
taxonomy         75474 non-null object
title            125312 non-null object
description      113221 non-null object
is_deleted       125564 non-null object
dtypes: int64(1), object(6)
memory usage: 6.7+ MB


In [None]:
# torch.save(model,"./models/word_lstm_collections_csv.pt")