In [1]:
import gensim
import pandas as pd
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
import numpy as np
import json


In [2]:
# Dataset sizes (presumably the row counts of the CSV files — confirm against the data).
# Both values are embedded in the input CSV names and the output JSON names
# that are built further down in this notebook.
train_size = 1000
test_size = 1000

In [3]:
# --- Doc2Vec hyperparameters -------------------------------------------------
epochs = 10                              # number of training passes
vec_size = 64                            # dimensionality of each document vector
num_cores = 4                            # worker threads handed to Doc2Vec
pred_to_context_word_dist_thresh = 10    # context window size handed to Doc2Vec

# --- Input data --------------------------------------------------------------
# The file names are derived from the configured dataset sizes.
test_dataframe = pd.read_csv("./data/text/test_data_"+str(test_size)+".csv")
training_dataframe = pd.read_csv("./data/text/training_data_"+str(train_size)+".csv")

In [4]:
# Split the training frame into parallel, per-document sequences.
train_doc_label = list(training_dataframe["Index"])  # used below as the Doc2Vec document tags
train_sentiment_label = np.array(training_dataframe["Sentiment"])  # labels saved next to the vectors
train_sentiment_text = list(training_dataframe["SentimentText"])  # raw text, split on whitespace later

In [5]:
class LabeledLineSentence(object):
    """Re-iterable corpus that pairs each raw document with its tag.

    Every pass over the object yields one ``LabeledSentence`` per entry of
    ``doc_list``: the document is split on whitespace to form the word list,
    and the entry of ``labels_list`` at the same position becomes its single
    tag.
    """

    def __init__(self, doc_list, labels_list):
        """Store the documents and their tags without copying them.

        doc_list    -- iterable of document strings.
        labels_list -- indexable collection of tags, aligned with doc_list.
        """
        self.doc_list = doc_list
        self.labels_list = labels_list

    def __iter__(self):
        """Yield one tagged document per entry of ``doc_list``."""
        for position, text in enumerate(self.doc_list):
            tokens = text.split()
            tag = self.labels_list[position]
            yield LabeledSentence(words=tokens, tags=[tag])

In [6]:
# Corpus wrapper: each iteration yields one tagged document per training text,
# tagged with the matching entry of train_doc_label.
sentences = LabeledLineSentence(train_sentiment_text, train_doc_label)

In [7]:
# Build the model with alpha == min_alpha so the learning rate stays fixed
# inside a single train() call; the decay is applied by hand between passes.
model = Doc2Vec(size=vec_size,window=pred_to_context_word_dist_thresh,workers=num_cores,alpha=0.025, min_alpha=0.025) # use fixed learning rate
model.build_vocab(sentences)

alpha_step = 0.002    # amount removed from the learning rate after every pass
alpha_floor = 0.0001  # keeps the rate positive if `epochs` is raised

# NOTE(review): newer gensim releases expect total_examples/epochs to be passed
# to train(); the call is left as-is to match the API version this notebook was
# run with — confirm against the installed gensim.
for epoch in range(epochs):
    model.train(sentences=sentences)
    # Report only after the pass has actually completed.
    print("Training done for epoch : ",epoch)
    model.alpha = max(model.alpha - alpha_step, alpha_floor) # decrease the learning rate
    model.min_alpha = model.alpha # fix the learning rate, no decay

Training done for epoch :  0
Training done for epoch :  1
Training done for epoch :  2
Training done for epoch :  3
Training done for epoch :  4
Training done for epoch :  5
Training done for epoch :  6
Training done for epoch :  7
Training done for epoch :  8
Training done for epoch :  9


In [8]:
# Sanity check: the vector stored for document 999 should have vec_size components.
len(model.docvecs[999])

64

In [9]:
# Gather the learned training vectors into a (num_documents, vec_size) matrix.
train_data = np.empty((len(train_sentiment_text),vec_size))
train_label = train_sentiment_label
# Look each vector up by the tag the document was trained with (the "Index"
# column) rather than by loop position, so that rows stay aligned with
# train_label even when the tags are not exactly 0..n-1.
for row, doc_tag in enumerate(train_doc_label):
    train_data[row] = model.docvecs[doc_tag]


In [10]:
# Inspect the training matrix (numpy abbreviates large arrays when displayed).
train_data

array([[ 0.02933925,  0.0069539 , -0.00766996, ..., -0.03350095,
        -0.00416064,  0.01841574],
       [ 0.05371504, -0.00780737, -0.03652588, ..., -0.04139413,
         0.00016622,  0.03016355],
       [ 0.0393327 ,  0.01138639, -0.00060792, ..., -0.05038377,
        -0.01572704,  0.0463664 ],
       ..., 
       [ 0.01203489,  0.01999757,  0.01105057, ..., -0.01962524,
        -0.01184632,  0.02960784],
       [ 0.03742968,  0.02216217, -0.01207131, ..., -0.01469093,
        -0.00445058,  0.03710216],
       [ 0.03186279,  0.01340237, -0.02426297, ..., -0.03130142,
        -0.00515216,  0.02908993]])

In [11]:
# JSON-serialisable payload: plain nested lists instead of numpy arrays.
train_data_dict = {
    "data": train_data.tolist(),
    "label": train_label.tolist(),
}

In [12]:
# Persist the training vectors and labels; the file name carries the dataset size.
train_json_path = "./data/number/train_data_"+str(train_size)+".json"
with open(train_json_path, 'w') as out_file:
    json.dump(train_data_dict, out_file)

In [13]:
# Pull the labels and raw text out of the test frame.
test_sentiment_label = np.array(test_dataframe["Sentiment"])  # labels saved next to the inferred vectors
test_sentiment_text = list(test_dataframe["SentimentText"])  # raw text, split on whitespace before inference

In [14]:
# Test documents were not part of training, so their vectors are inferred
# from the trained model one document at a time.
test_label = test_sentiment_label
test_data = np.empty((len(test_sentiment_text),vec_size))
for row, text in enumerate(test_sentiment_text):
    test_data[row] = model.infer_vector(doc_words=text.split())

In [15]:
# JSON-serialisable payload: plain nested lists instead of numpy arrays.
test_data_dict = {
    "data": test_data.tolist(),
    "label": test_label.tolist(),
}

In [16]:
# Persist the inferred test vectors and labels; the file name carries the dataset size.
test_json_path = "./data/number/test_data_"+str(test_size)+".json"
with open(test_json_path, 'w') as out_file:
    json.dump(test_data_dict, out_file)

In [17]:
# Inspect the test labels as a numpy array.
test_sentiment_label

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0,

In [18]:
# Same labels, viewed as the original pandas column they were taken from.
test_dataframe["Sentiment"]

0      0
1      0
2      0
3      0
4      0
5      0
6      0
7      0
8      0
9      0
10     0
11     0
12     0
13     0
14     0
15     0
16     0
17     0
18     0
19     0
20     0
21     0
22     0
23     0
24     0
25     0
26     0
27     0
28     0
29     0
      ..
970    1
971    1
972    1
973    1
974    1
975    1
976    1
977    1
978    1
979    1
980    1
981    1
982    1
983    1
984    1
985    1
986    1
987    1
988    1
989    1
990    1
991    1
992    1
993    1
994    1
995    1
996    1
997    1
998    1
999    1
Name: Sentiment, Length: 1000, dtype: int64