In [1]:
import json

import gensim
import numpy as np
import pandas as pd
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence, TaggedDocument


In [2]:
# --- Hyperparameters ---------------------------------------------------------
epochs = 10
vec_size = 64                          # dimensionality of each document vector
num_cores = 4                          # worker threads handed to Doc2Vec
pred_to_context_word_dist_thresh = 10  # passed to Doc2Vec as the context window

# --- Input data --------------------------------------------------------------
test_dataframe = pd.read_csv("./data/text/test_data_50000.csv")
training_dataframe = pd.read_csv("./data/text/training_data_50000.csv")

In [3]:
# Split the training frame into raw text, sentiment targets and document tags.
train_sentiment_text = list(training_dataframe["SentimentText"])
train_sentiment_label = np.array(training_dataframe["Sentiment"])
train_doc_label = list(training_dataframe["Index"])

In [4]:
class LabeledLineSentence(object):
    """Corpus wrapper that pairs each raw document with its tag for Doc2Vec.

    Parameters
    ----------
    doc_list : iterable of str
        Raw documents; each one is tokenized on whitespace during iteration.
    labels_list : sequence
        Tags aligned by position with ``doc_list``; entry ``i`` tags document ``i``.
    """

    def __init__(self, doc_list, labels_list):
        self.labels_list = labels_list
        self.doc_list = doc_list

    def __iter__(self):
        """Yield one ``TaggedDocument`` per document, carrying a single tag."""
        for idx, doc in enumerate(self.doc_list):
            # TaggedDocument replaces the deprecated LabeledSentence alias; it
            # exposes the same ``words`` / ``tags`` fields.
            yield TaggedDocument(words=doc.split(), tags=[self.labels_list[idx]])

In [5]:
# Wrap the training texts and their tags in an iterable corpus.
sentences = LabeledLineSentence(
    doc_list=train_sentiment_text,
    labels_list=train_doc_label,
)

In [6]:
# Train Doc2Vec with a manually scheduled learning rate: alpha == min_alpha, so
# the rate is constant within a pass, and it is lowered by a fixed step after
# every pass.
model = Doc2Vec(size=vec_size,window=pred_to_context_word_dist_thresh,workers=num_cores,alpha=0.025, min_alpha=0.025) # use fixed learning rate
model.build_vocab(sentences)
# Use the configured `epochs` constant instead of a hard-coded pass count.
# NOTE(review): with the 0.002 step, alpha turns non-positive once epochs > 12.
for epoch in range(epochs):
    model.train(sentences=sentences)
    # Report only after the pass has actually finished.
    print("Training done for epoch : ",epoch)
    model.alpha -= 0.002 # decrease the learning rate
    model.min_alpha = model.alpha # fix the learning rate, no decay

Training done for epoch :  0
Training done for epoch :  1
Training done for epoch :  2
Training done for epoch :  3
Training done for epoch :  4
Training done for epoch :  5
Training done for epoch :  6
Training done for epoch :  7
Training done for epoch :  8
Training done for epoch :  9


In [7]:
# Sanity check: length of the document vector looked up with key 999
# (the model was built with size=vec_size).
len(model.docvecs[999])

64

In [8]:
# Gather the learned document vectors into an (n_docs, vec_size) matrix.
# NOTE(review): vectors are looked up by row position; this presumes the
# "Index" tags used for training line up with 0..n_docs-1 — TODO confirm.
num_train_docs = len(train_sentiment_text)
train_label = train_sentiment_label
train_data = np.empty((num_train_docs, vec_size))
for row in range(num_train_docs):
    train_data[row] = model.docvecs[row]


In [9]:
# JSON-serializable payload: nested lists of features plus their labels.
train_data_dict = {
    "data": train_data.tolist(),
    "label": train_label.tolist(),
}

In [10]:
# Persist the training features and labels as JSON.
with open('./data/train_data_50000.json', 'w') as train_json_file:
    json.dump(train_data_dict, train_json_file)

In [11]:
# Split the test frame into raw text and sentiment targets.
test_sentiment_text = list(test_dataframe["SentimentText"])
test_sentiment_label = np.array(test_dataframe["Sentiment"])

In [12]:
# Test documents were not part of training, so their vectors are inferred
# from the trained model one document at a time.
num_test_docs = len(test_sentiment_text)
test_label = test_sentiment_label
test_data = np.empty((num_test_docs, vec_size))
for row in range(num_test_docs):
    tokens = test_sentiment_text[row].split()
    test_data[row] = model.infer_vector(doc_words=tokens)

In [13]:
# JSON-serializable payload: nested lists of features plus their labels.
test_data_dict = {
    "data": test_data.tolist(),
    "label": test_label.tolist(),
}

In [14]:
# Persist the test features and labels as JSON.
with open('./data/test_data_50000.json', 'w') as test_json_file:
    json.dump(test_data_dict, test_json_file)