In [1]:
import gensim
import pandas as pd
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
import numpy as np
import json


In [2]:
# Load the train/test splits from CSV (paths are relative to the notebook's working directory).
training_dataframe = pd.read_csv("./data/text/training_data_50000.csv")
test_dataframe = pd.read_csv("./data/text/test_data_50000.csv")
# Number of training epochs (presumably for the Doc2Vec training loop -- confirm against that cell).
epochs = 10
# Dimensionality of each document vector; also the column count of the feature matrices built later.
vec_size = 64
# Passed to Doc2Vec as `workers`.
num_cores = 4
# Passed to Doc2Vec as `window`.
pred_to_context_word_dist_thresh = 10

In [3]:
# Pull the three training columns out of the dataframe:
#   "Index"         -> used later as the per-document tag for Doc2Vec
#   "Sentiment"     -> the classification target, kept as a NumPy array
#   "SentimentText" -> the raw document text
train_doc_label = list(training_dataframe["Index"])
train_sentiment_label = np.array(training_dataframe["Sentiment"])
train_sentiment_text = list(training_dataframe["SentimentText"])

In [4]:
class LabeledLineSentence(object):
    """Iterable corpus that pairs each raw document with the tag at the same position.

    Each call to ``__iter__`` starts a fresh generator, so the corpus can be
    traversed repeatedly as long as ``doc_list`` itself can be re-iterated.
    """

    def __init__(self, doc_list, labels_list):
        # Keep references to the caller's sequences; nothing is copied.
        self.doc_list = doc_list
        self.labels_list = labels_list

    def __iter__(self):
        """Yield one ``LabeledSentence`` per document, tokenised on whitespace."""
        for position, text in enumerate(self.doc_list):
            tokens = text.split()
            tag = self.labels_list[position]
            yield LabeledSentence(words=tokens, tags=[tag])

In [8]:
# Re-iterable training corpus: each document's text tagged with its "Index" value.
sentences = LabeledLineSentence(train_sentiment_text, train_doc_label)

In [9]:
# Train Doc2Vec with a manually scheduled learning rate: alpha and min_alpha
# are kept equal, and both are lowered by a fixed step after every epoch.
initial_alpha = 0.025
alpha_step = 0.002

# The schedule must stay positive for every epoch that is actually trained;
# fail early if `epochs` is raised without adjusting the step.
if initial_alpha - alpha_step * (epochs - 1) <= 0:
    raise ValueError("alpha_step is too large for the configured number of epochs")

model = Doc2Vec(size=vec_size,
                window=pred_to_context_word_dist_thresh,
                workers=num_cores,
                alpha=initial_alpha,
                min_alpha=initial_alpha)  # use fixed learning rate
model.build_vocab(sentences)
for epoch in range(epochs):
    model.train(sentences=sentences)
    # Report only after the epoch has actually been trained.
    print("Training done for epoch : ",epoch)
    model.alpha -= alpha_step  # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay

Training done for epoch :  0
Training done for epoch :  1
Training done for epoch :  2
Training done for epoch :  3
Training done for epoch :  4
Training done for epoch :  5
Training done for epoch :  6
Training done for epoch :  7
Training done for epoch :  8
Training done for epoch :  9


In [7]:
# Sanity check: length of one learned document vector (expected to equal vec_size).
len(model.docvecs[999])

64

In [11]:
# Build the training feature matrix: one learned document vector per row,
# in the same row order as train_label.
train_data = np.empty((len(train_sentiment_text),vec_size))
train_label = train_sentiment_label
# Documents were tagged with their "Index" value, so each vector must be
# fetched by that tag. Using the row position instead only works when the
# tags happen to be exactly 0..N-1 in row order; otherwise rows are paired
# with the wrong label or with a vector that was never trained.
for i, doc_tag in enumerate(train_doc_label):
    train_data[i] = model.docvecs[doc_tag]


In [12]:
# Display the training feature matrix (NumPy abbreviates large arrays in the repr).
train_data

array([[  3.97484660e-01,  -1.86675612e-03,   6.48568213e-01, ...,
         -1.10948592e-01,  -1.20801456e-01,  -8.50240067e-02],
       [  2.16693506e-01,   3.87730040e-02,  -4.29794751e-02, ...,
         -2.06749871e-01,  -5.00119887e-02,   1.54048175e-01],
       [  4.93733317e-01,   1.58587798e-01,  -6.35844290e-01, ...,
          1.86823115e-01,  -1.09569892e-01,   1.00702012e+00],
       ..., 
       [ -5.32004535e-01,  -1.73862994e-01,  -7.13054836e-02, ...,
          2.19970331e-01,   6.13714606e-02,   6.52387619e-01],
       [  4.10753012e-01,  -3.67842227e-01,   7.49355972e-01, ...,
         -1.10439241e-01,  -5.42498156e-02,   4.00628626e-01],
       [ -6.55020028e-03,   5.50892996e-03,  -1.85414415e-03, ...,
          4.56646690e-03,  -3.17600934e-04,  -1.49005587e-04]])

In [13]:
# Display the training labels (NumPy abbreviates large arrays in the repr).
train_label

array([0, 0, 0, ..., 1, 1, 1])

In [14]:
# Convert features and labels to plain Python lists so they can be serialised as JSON.
train_data_dict = {
    "data": train_data.tolist(),
    "label": train_label.tolist(),
}

In [15]:
# Persist the training features and labels as a single JSON document.
with open('./data/train_data_50000.json', 'w') as train_json_file:
    json.dump(train_data_dict, train_json_file)

In [16]:
# Pull the target ("Sentiment") and raw text ("SentimentText") columns from the test split.
test_sentiment_label = np.array(test_dataframe["Sentiment"])
test_sentiment_text = list(test_dataframe["SentimentText"])

In [17]:
# Test documents were not part of training, so their vectors are inferred
# from the trained model, one whitespace-tokenised document at a time.
test_label = test_sentiment_label
test_data = np.empty((len(test_sentiment_text), vec_size))
for i, test_text in enumerate(test_sentiment_text):
    test_tokens = test_text.split()
    test_data[i] = model.infer_vector(doc_words=test_tokens)

In [19]:
# Convert features and labels to plain Python lists so they can be serialised as JSON.
test_data_dict = {
    "data": test_data.tolist(),
    "label": test_label.tolist(),
}

In [20]:
# Persist the test features and labels as a single JSON document.
with open('./data/test_data_50000.json', 'w') as test_json_file:
    json.dump(test_data_dict, test_json_file)

{'data': [[0.23864182829856873,
   -0.10125044733285904,
   0.005920238792896271,
   0.17559321224689484,
   -0.16894826292991638,
   0.050493039190769196,
   -0.02559107355773449,
   -0.319383829832077,
   0.10270831733942032,
   -0.3130829930305481,
   -0.012258023023605347,
   0.10471195727586746,
   0.22911576926708221,
   -0.035789817571640015,
   0.30090412497520447,
   0.37908247113227844,
   0.1883271485567093,
   -0.25241291522979736,
   0.20102262496948242,
   0.044989049434661865,
   -0.2482946515083313,
   -0.034897591918706894,
   0.00010186216968577355,
   -0.01984826661646366,
   -0.12774090468883514,
   0.3017483353614807,
   -0.012982189655303955,
   -0.16416795551776886,
   0.034665655344724655,
   -0.06225951761007309,
   0.20736989378929138,
   -0.05790550261735916,
   -0.1323608011007309,
   0.3615015745162964,
   -0.26699185371398926,
   0.3883433938026428,
   0.018620209768414497,
   -0.17891734838485718,
   0.035667676478624344,
   -0.08131451159715652,
   0.461