In [1]:
import tensorflow as tf
import numpy as np
from tensorflow import keras
import matplotlib.pyplot as plt
import gensim
import os
import collections
import smart_open
import random
import pickle
%matplotlib inline

In [2]:
from gensim.models.doc2vec import Doc2Vec
def save_obj(obj, name ):
    with open('obj/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    with open('obj/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [3]:
model=Doc2Vec.load("Train_Corupus.model")

In [4]:
trainIndexLabelDictionary=load_obj("TrainIndextoLabelDictionary")
testIndexLabelDictionary=load_obj("TestIndextoLabelDictionary")
develIndexLabelDictionary=load_obj("DevelIndextoLabelDictionary")

In [5]:
unique_labels = set()
for val in trainIndexLabelDictionary.values():
    labels=val.split(",")
    for label in labels:
        unique_labels.add(label.rstrip())
n_classes=len(unique_labels)

In [6]:
uniqueLabelsToIndexMapping=dict()
for i,label in enumerate(unique_labels):
    uniqueLabelsToIndexMapping[label]=i

In [7]:
trainIndextoLabelIndexMap=dict()
for key,value in trainIndexLabelDictionary.items():
    dict_value=[uniqueLabelsToIndexMapping[label.rstrip()] for label in value.split(",")]
    trainIndextoLabelIndexMap[key]= dict_value

In [8]:
testIndextoLabelIndexMap=dict()
for key,value in testIndexLabelDictionary.items():
    dict_value=[uniqueLabelsToIndexMapping[label.rstrip()] for label in value.split(",")]
    testIndextoLabelIndexMap[key]= dict_value

In [9]:
develIndextoLabelIndexMap=dict()
for key,value in testIndexLabelDictionary.items():
    dict_value=[uniqueLabelsToIndexMapping[label.rstrip()] for label in value.split(",")]
    develIndextoLabelIndexMap[key]= dict_value

In [10]:
uniqueIndexToLabelsMapping={v:k for k,v in uniqueLabelsToIndexMapping.items()}

In [11]:
del trainIndexLabelDictionary,testIndexLabelDictionary,develIndexLabelDictionary
del unique_labels,uniqueLabelsToIndexMapping

In [12]:
def createTrainData(model,n_classes,trainIndextoLabelIndexMap):
    X_train=model.docvecs[0].reshape(1,-1)
    y_train=np.zeros((1,n_classes),dtype=np.int)
    y_train[:,trainIndextoLabelIndexMap[0][0]]=1
    for i in range(1,model.corpus_count):
        labelIndices=trainIndextoLabelIndexMap[i]
        for j in range(len(labelIndices)):
            X_train=np.r_[X_train,model.docvecs[i].reshape(1,-1)]
            addendum=np.zeros((1,n_classes))
            addendum[:,labelIndices[j]]=1
            y_train=np.r_[y_train,addendum]
    np.save("obj/X_train.npy",X_train)
    np.save("obj/y_train.npy",y_train)
    return True

In [13]:
if createTrainData(model,n_classes,trainIndextoLabelIndexMap):
    print("Training data created successfully\n")
else:
    print("Error!Check the code")

Training data created successfully



In [14]:
X_train=np.load("obj/X_train.npy")
y_train=np.load("obj/y_train.npy")

In [15]:
print("Training data shape:{}".format(X_train.shape))
print("Training labels shape:{}".format(y_train.shape))
print(y_train[1][16])
del X_train,y_train

Training data shape:(14823, 100)
Training labels shape:(14823, 49)
0.0


In [16]:
testCorpus=load_obj("TestCorpus")
print(testCorpus[:2])

[['alfr', 'santel', 'american', 'film', 'director', 'born', 'septemb', 'san', 'francisco', 'california', 'santel', 'direct', 'film', 'begin', 'two', 'reel', 'comedi', 'short', 'subject', 'hal', 'roach', 'product', 'compani', 'take', 'featur', 'film', 'santel', 'work', 'sever', 'major', 'studio', 'left', 'busi', 'die', 'june', 'salina', 'california'], ['niob', 'na', 'bi', 'ny', 'bee', 'larg', 'slowli', 'rotat', 'main', 'belt', 'asteroid', 'discov', 'german', 'astronom', 'robert', 'luther', 'august', 'name', 'niob', 'charact', 'greek', 'mytholog', 'bright', 'asteroid', 'shown', 'vari', 'german', 'astronom', 'friedrich', 'tietjen', 'examin', 'radar', 'use', 'arecibo', 'observatori', 'radio', 'telescop', 'puerto', 'rico', 'supplement', 'optic', 'observ', 'intend', 'build', 'lightcurv', 'result', 'estim', 'rotat', 'period', 'hour', 'earth', 'day', 'supersed', 'earlier', 'estim', 'rotat', 'period', 'hour', 'radar', 'data', 'produc', 'estim', 'maximum', 'equatori', 'diamet', 'km', 'consist', 

In [17]:
def createTestData(model,test_corpus,n_classes,testIndextoLabelIndexMap):
    testCorpus=load_obj(test_corpus)
    m = len(testCorpus)
    X_test=model.infer_vector(testCorpus[0]).reshape(1,-1)
    y_test=np.zeros((1,n_classes))
    y_test[:,testIndextoLabelIndexMap[0][0]]=1
    for i in range(1,m):
        labelIndices=testIndextoLabelIndexMap[i]
        for j in labelIndices:
            X_test=np.r_[X_test,model.infer_vector(testCorpus[i]).reshape(1,-1)]
            addendum=np.zeros((1,n_classes),dtype=np.int)
            addendum[:,j]=1
            y_test=np.r_[y_test,addendum]
    np.save("obj/X_test.npy",X_test)
    np.save("obj/y_test.npy",y_test)
    return True

In [18]:
if createTestData(model,"TestCorpus",n_classes,testIndextoLabelIndexMap):
    print("Test data successfully created")
else:
    print("Check your code!!\n")

Test data successfully created


In [19]:
X_test=np.load("obj/X_test.npy")
y_test=np.load("obj/y_test.npy")

In [20]:
print("Test data shape:{}".format(X_test.shape))
print("Test labels shape:{}".format(y_test.shape))
print(y_test[:2])
del X_test,y_test

Test data shape:(1998, 100)
Test labels shape:(1998, 49)
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
  0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0.]]


In [21]:
def createDevelData(model,test_corpus,n_classes,develIndextoLabelIndexMap):
    develCorpus=load_obj(test_corpus)
    m = len(develCorpus)
    X_test=model.infer_vector(develCorpus[0]).reshape(1,-1)
    y_test=np.zeros((1,n_classes))
    y_test[:,develIndextoLabelIndexMap[0][0]]=1
    for i in range(1,m):
        labelIndices=develIndextoLabelIndexMap[i]
        for j in labelIndices:
            X_test=np.r_[X_test,model.infer_vector(develCorpus[i]).reshape(1,-1)]
            addendum=np.zeros((1,n_classes),dtype=np.int)
            addendum[:,j]=1
            y_test=np.r_[y_test,addendum]
    np.save("obj/X_devel.npy",X_test)
    np.save("obj/y_devel.npy",y_test)
    return True

In [22]:
if createDevelData(model,"TestCorpus",n_classes,testIndextoLabelIndexMap):
    print("Test data successfully created")
else:
    print("Check your code!!\n")

Test data successfully created


In [23]:
X_devel=np.load("obj/X_devel.npy")
y_devel=np.load("obj/y_devel.npy")

In [24]:
print("Dev data shape:{}".format(X_devel.shape))
print("Dev labels shape:{}".format(y_devel.shape))
print(y_devel[:2])
del X_devel,y_devel

Dev data shape:(1998, 100)
Dev labels shape:(1998, 49)
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
  0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0.]]
