In [1]:
#Import all the dependencies
import gensim
from nltk import RegexpTokenizer
from nltk.corpus import stopwords
from os import listdir
from os.path import isfile, join
import numpy as np

import pandas as pd

MODEL_NAME = "doc2vec.model"
SIZE = 17 * 17

POS_TRAIN_PATH = "aclImdb/train/pos/"
NEG_TRAIN_PATH = "aclImdb/train/neg/"
POS_TEST_PATH = "aclImdb/test/pos/"
NEG_TEST_PATH = "aclImdb/test/neg/"

In [2]:
train_features = []
train_labels = []
test_features = []
test_labels = []

In [3]:
filenames = [f for f in listdir(POS_TRAIN_PATH) if f.endswith('.txt')]
for filename in filenames:
    data = open(POS_TRAIN_PATH + filename).read()
    train_features.append(data)
    train_labels.append(1)
    
filenames = [f for f in listdir(NEG_TRAIN_PATH) if f.endswith('.txt')]
for filename in filenames:
    data = open(NEG_TRAIN_PATH + filename).read()
    train_features.append(data)
    train_labels.append(0)

In [4]:
filenames = [f for f in listdir(POS_TEST_PATH) if f.endswith('.txt')]
for filename in filenames:
    data = open(POS_TEST_PATH + filename).read()
    test_features.append(data)
    test_labels.append(1)
    
filenames = [f for f in listdir(NEG_TEST_PATH) if f.endswith('.txt')]
for filename in filenames:
    data = open(NEG_TEST_PATH + filename).read()
    test_features.append(data)
    test_labels.append(0)

In [5]:
tokenizer = RegexpTokenizer(r'\w+')
stopword_set = set(stopwords.words('english'))

def nlp_clean(data):
   new_data = []
   for d in data:
      new_str = d.lower()
      dlist = tokenizer.tokenize(new_str)
      dlist = list(set(dlist).difference(stopword_set))
      new_data.append(dlist)
        
   return new_data

train_features = nlp_clean(train_features)
test_features = nlp_clean(test_features)

In [6]:
class LabeledLineSentence(object):
    
    def __init__(self, docs):
        self.docs = docs
        
    def __iter__(self):
        for idx, doc in enumerate(self.docs):
              yield gensim.models.doc2vec.LabeledSentence(doc,[str(idx)])

iterator = LabeledLineSentence(train_features)
model = gensim.models.Doc2Vec(size=SIZE, window=5, min_count=5, workers=16,alpha=0.025, min_alpha=0.025, iter=50)
model.build_vocab(iterator)
model.train(iterator, epochs=model.iter, total_examples=model.corpus_count)

model.save(MODEL_NAME)
print(MODEL_NAME + " saved")


  
  del sys.path[0]


doc2vec.model saved


In [7]:
columns = [str(x) for x in range(len(model.docvecs[1]))]
columns.append("Sentiment")

In [8]:
train_data = None

for idx in range(len(train_labels)):
    features = np.array(model.docvecs[str(idx)])
    label = np.array([train_labels[idx]], dtype=np.int16)
    row = np.array([np.concatenate((features, label), axis = 0)])
    
    if train_data is None:
        train_data = row
    else:
        train_data = np.concatenate((train_data, row), axis=0)
    

In [9]:
np.shape(train_data)

(25000, 290)

In [10]:
test_data = None

for idx in range(len(test_labels)):
    features = np.array(model.infer_vector(test_features[idx]))
    label = np.array([test_labels[idx]], dtype=np.int16)
    row = np.array([np.concatenate((features, label), axis = 0)])
    
    if test_data is None:
        test_data = row
    else:
        test_data = np.concatenate((test_data, row), axis=0)
        

In [11]:
np.shape(test_data)

(25000, 290)

In [12]:
train_data = pd.DataFrame(columns=columns, data=train_data)
test_data = pd.DataFrame(columns=columns, data=test_data)

In [13]:
train_data = train_data.sample(frac=1)
test_data = test_data.sample(frac=1)

In [14]:
train_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,280,281,282,283,284,285,286,287,288,Sentiment
4146,-0.190214,-0.775909,0.606918,-0.422152,1.006142,-1.099936,-0.591228,0.110556,-0.685102,0.233484,...,-0.40646,-0.145558,-1.306575,0.230331,0.624839,-1.368549,0.805194,-0.75895,0.145932,1.0
20857,-0.248992,-0.204263,0.585718,-0.340346,0.623218,-0.132182,-0.446606,-0.085173,0.247781,-0.290398,...,-0.172685,0.630882,0.110635,0.223473,0.180395,0.131066,0.163105,0.723476,-0.002338,0.0
2017,0.040464,1.032194,-0.982082,-0.054039,0.172879,0.061846,0.513306,-0.427153,0.800728,-0.842768,...,-0.071184,0.454823,-0.388393,-0.455052,0.816787,-0.742167,0.42305,0.541922,-0.391158,1.0
11935,-0.102443,-0.471759,0.03633,0.192056,1.049041,0.023328,-0.082436,-0.192707,-0.043875,0.224525,...,0.502373,-0.55204,-0.389139,0.310724,-0.114863,0.532303,-0.150409,0.128998,-0.101038,1.0
18574,0.029157,-0.296443,0.683454,0.026448,-0.72984,1.183778,0.836208,0.700741,-0.260888,-0.136369,...,-0.318998,-0.120458,0.013771,-0.661265,-0.215869,-1.508247,0.381819,-0.027416,-0.900369,0.0


In [15]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25000 entries, 4146 to 4137
Columns: 290 entries, 0 to Sentiment
dtypes: float32(290)
memory usage: 27.8 MB


In [16]:
test_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,280,281,282,283,284,285,286,287,288,Sentiment
18311,-0.224514,-0.269337,0.149535,-0.204498,-0.319069,-0.178723,0.249171,0.177335,-0.430412,-0.079033,...,0.47758,0.326845,0.422711,-0.071163,-0.674291,0.361922,-0.018685,-0.334325,0.639806,0.0
629,-0.428177,0.066324,0.035085,-0.386542,-0.237192,0.059233,-0.268529,0.356585,0.09509,-0.36767,...,-0.086699,0.465709,-0.067263,-0.474439,-0.090253,-0.149597,-0.047571,-0.148241,-0.014998,1.0
16525,0.000866,-0.354202,-0.428207,-0.12796,0.188175,-0.415807,0.359235,0.047495,0.167235,-0.148469,...,0.468923,-0.178838,0.158628,0.260055,0.042674,-0.051289,0.167824,-0.244007,-0.143949,0.0
18672,0.094622,-0.01589,0.13104,0.235522,0.34424,0.094459,-0.260801,-0.167188,-0.103057,0.209749,...,-0.128503,-0.116131,-0.02683,-0.217328,0.366705,-0.037544,-0.442792,0.306934,0.205799,0.0
16952,0.41994,0.225918,0.188928,0.090584,-0.36056,-0.180346,0.190001,0.260009,0.066459,0.26829,...,0.014408,0.333524,-0.011516,-0.129948,-0.256695,-0.075641,0.134001,-0.198112,-0.013948,0.0


In [17]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25000 entries, 18311 to 9884
Columns: 290 entries, 0 to Sentiment
dtypes: float32(290)
memory usage: 27.8 MB


In [18]:
train_data.to_csv("train_data.csv", index=False)
test_data.to_csv("test_data.csv", index=False)