In [1]:
import gensim
import pandas as pd
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
import numpy as np
import json


In [2]:
training_dataframe = pd.read_csv("./data/text/training_data_50000.csv")
epochs = 10
vec_size = 50
num_cores = 4
pred_to_context_word_dist_thresh = 10

In [3]:
docLabels = list(training_dataframe["Index"])
sentimentLabels = np.array(training_dataframe["Sentiment"])
data = list(training_dataframe["SentimentText"])

In [4]:
class LabeledLineSentence(object):
    def __init__(self, doc_list, labels_list):
        self.labels_list = labels_list
        self.doc_list = doc_list
    def __iter__(self):
        for idx, doc in enumerate(self.doc_list):
            yield LabeledSentence(words=doc.split(),tags=[self.labels_list[idx]])

In [5]:
sentences = LabeledLineSentence(data, docLabels)

In [6]:
model = Doc2Vec(size=vec_size,window=pred_to_context_word_dist_thresh,workers=num_cores,alpha=0.025, min_alpha=0.025) # use fixed learning rate
model.build_vocab(sentences)
for epoch in range(10):
    print("Training done for epoch : ",epoch)
    model.train(sentences=sentences)
    model.alpha -= 0.002 # decrease the learning rate
    model.min_alpha = model.alpha # fix the learning rate, no deca

Training done for epoch :  0
Training done for epoch :  1
Training done for epoch :  2
Training done for epoch :  3
Training done for epoch :  4
Training done for epoch :  5
Training done for epoch :  6
Training done for epoch :  7
Training done for epoch :  8
Training done for epoch :  9


In [7]:
len(model.docvecs[999])

50

In [9]:
train_data = np.empty((len(training_dataframe),vec_size))
train_label = sentimentLabels
for i in range(len(training_dataframe)):
    train_data[i] = model.docvecs[i]


In [10]:
train_data

array([[-0.21240686,  0.04456875, -0.23503773, ..., -0.1962624 ,
        -0.1204211 , -0.07198631],
       [ 0.0300697 , -0.11546917, -0.37795967, ..., -0.13256611,
         0.31415638,  0.19001837],
       [-1.04692352,  0.24685332,  0.27830318, ...,  0.17234065,
        -0.01301328,  0.09401999],
       ..., 
       [-0.55868679, -0.01660823, -0.72503167, ...,  0.72931635,
         0.41534641, -0.531416  ],
       [-0.65339279, -0.14587922, -0.86026508, ..., -0.28859982,
         0.02004016, -0.04188761],
       [ 0.00586572, -0.00149885,  0.00684214, ..., -0.00207298,
        -0.00182077,  0.00462301]])

In [11]:
train_label

array([0, 0, 0, ..., 1, 1, 1])

In [12]:
train_data_dict = dict()
train_data_dict["data"] = train_data.tolist()
train_data_dict["label"] = train_label.tolist()

In [13]:
with open('./data/train_data_50000.json', 'w') as fp:
    json.dump(train_data_dict, fp)