In [1]:
#Import all the dependencies
import gensim
from nltk import RegexpTokenizer
from nltk.corpus import stopwords
from os import listdir
from os.path import isfile, join
import numpy as np

import pandas as pd

# File name under which the trained Doc2Vec model is saved.
MODEL_NAME = "doc2vec.model"

# Directories containing the review .txt files, one per split and sentiment class.
# The paths are relative, so they resolve against the notebook's working directory.
POS_TRAIN_PATH = "aclImdb/train/pos/"
NEG_TRAIN_PATH = "aclImdb/train/neg/"
POS_TEST_PATH = "aclImdb/test/pos/"
NEG_TEST_PATH = "aclImdb/test/neg/"

In [2]:
# Accumulators filled by the loading cells: review contents and their 0/1 labels,
# kept as four independent lists.
train_features, train_labels = [], []
test_features, test_labels = [], []

In [3]:
# Read every training review into memory: positive reviews are labelled 1,
# negative reviews 0. Positives are appended first, then negatives.
for review_dir, sentiment in ((POS_TRAIN_PATH, 1), (NEG_TRAIN_PATH, 0)):
    # listdir() returns entries in arbitrary order; sorting keeps the document
    # order (and therefore each review's positional index) stable across runs.
    for filename in sorted(f for f in listdir(review_dir) if f.endswith('.txt')):
        # The context manager closes each handle promptly instead of leaving it to
        # the garbage collector. The explicit encoding avoids depending on the
        # platform default — assumes the review files are UTF-8, TODO confirm.
        with open(join(review_dir, filename), encoding="utf-8") as review_file:
            train_features.append(review_file.read())
        train_labels.append(sentiment)

In [4]:
# Read every test review into memory: positive reviews are labelled 1,
# negative reviews 0. Positives are appended first, then negatives.
for review_dir, sentiment in ((POS_TEST_PATH, 1), (NEG_TEST_PATH, 0)):
    # listdir() returns entries in arbitrary order; sorting keeps the row order
    # of the test set stable across runs.
    for filename in sorted(f for f in listdir(review_dir) if f.endswith('.txt')):
        # The context manager closes each handle promptly instead of leaving it to
        # the garbage collector. The explicit encoding avoids depending on the
        # platform default — assumes the review files are UTF-8, TODO confirm.
        with open(join(review_dir, filename), encoding="utf-8") as review_file:
            test_features.append(review_file.read())
        test_labels.append(sentiment)

In [5]:
# Tokenizer that extracts runs of word characters (\w+), so punctuation is dropped.
tokenizer = RegexpTokenizer(r'\w+')
# NLTK's English stop-word list, held in a set for fast membership tests.
stopword_set = set(stopwords.words('english'))

def nlp_clean(data):
    """Lower-case, tokenize and stop-word-filter each raw document.

    Args:
        data: iterable of raw document strings.

    Returns:
        A list with one token list per input document. Tokens keep their
        original order and repeated tokens are preserved.
    """
    cleaned_docs = []
    for raw_doc in data:
        tokens = tokenizer.tokenize(raw_doc.lower())
        # Filter token by token rather than via a set difference: a set would
        # drop repeated words and scramble word order, and the documents are
        # later consumed as ordered word sequences by a windowed model.
        cleaned_docs.append([tok for tok in tokens if tok not in stopword_set])
    return cleaned_docs

# Replace the raw review strings with their cleaned token lists.
# NOTE: the names are rebound in place, so this cell is not idempotent. nlp_clean
# calls .lower() on each element, so re-running this cell without first re-running
# the loading cells would hand it token lists instead of strings.
train_features = nlp_clean(train_features)
test_features = nlp_clean(test_features)

In [6]:
class LabeledLineSentence(object):
    """Re-iterable corpus that tags each tokenized document with its position.

    Every iteration yields one tagged document per entry of ``docs``; the single
    tag is the entry's index rendered as a string ("0", "1", ...). A fresh
    generator is created on each ``__iter__`` call, so the corpus can be
    traversed multiple times.
    """

    def __init__(self, docs):
        """Store the sequence of token lists to iterate over."""
        self.docs = docs

    def __iter__(self):
        """Yield a ``TaggedDocument(words, [str(index)])`` for each document."""
        for idx, doc in enumerate(self.docs):
            # TaggedDocument is the supported name; LabeledSentence is only a
            # deprecated alias of it and is absent from newer gensim releases.
            yield gensim.models.doc2vec.TaggedDocument(doc, [str(idx)])

# Train a 300-dimensional Doc2Vec model on the tagged training corpus and save it.
iterator = LabeledLineSentence(train_features)
model = gensim.models.Doc2Vec(
    size=300,
    window=5,
    min_count=5,
    workers=11,
    alpha=0.025,
    # NOTE(review): min_alpha equals alpha, so the learning rate presumably stays
    # constant over all epochs instead of decaying — confirm this is intended.
    min_alpha=0.025,
    iter=20,
)
# The corpus is traversed once to build the vocabulary, then again for training.
model.build_vocab(iterator)
model.train(iterator, total_examples=model.corpus_count, epochs=model.iter)

model.save(MODEL_NAME)
print("{} saved".format(MODEL_NAME))


  
  del sys.path[0]


doc2vec.model saved


In [7]:
# Column names for the feature tables: "0", "1", ... for each embedding dimension
# (taken from the length of one trained document vector), then the label column.
embedding_dim = len(model.docvecs[1])
columns = [str(dim) for dim in range(embedding_dim)] + ["Sentiment"]

In [8]:
# Assemble the training matrix: one row per review, holding the trained document
# vector (looked up by the review's string index tag) followed by its label.
train_rows = [
    np.concatenate(
        (np.array(model.docvecs[str(idx)]), np.array([label], dtype=np.int16)),
        axis=0,
    )
    for idx, label in enumerate(train_labels)
]

# Stack all rows once at the end; concatenating onto the growing matrix inside
# the loop would recopy every previous row on each iteration (quadratic cost).
# With no training labels the result stays None.
train_data = np.stack(train_rows, axis=0) if train_rows else None
    

In [9]:
# Sanity check: dimensions of the assembled training matrix (rows, columns).
np.shape(train_data)

(25000, 301)

In [10]:
# Assemble the test matrix: one row per review, holding a vector inferred by the
# trained model from the review's tokens, followed by its label. Vectors are
# inferred in the same sequential order as the labels.
test_rows = [
    np.concatenate(
        (np.array(model.infer_vector(test_features[idx])), np.array([label], dtype=np.int16)),
        axis=0,
    )
    for idx, label in enumerate(test_labels)
]

# Stack all rows once at the end; concatenating onto the growing matrix inside
# the loop would recopy every previous row on each iteration (quadratic cost).
# With no test labels the result stays None.
test_data = np.stack(test_rows, axis=0) if test_rows else None
        

In [11]:
# Sanity check: dimensions of the assembled test matrix (rows, columns).
np.shape(test_data)

(25000, 301)

In [12]:
# Wrap the raw matrices in DataFrames so every embedding dimension and the
# label get a named column.
train_data = pd.DataFrame(train_data, columns=columns)
test_data = pd.DataFrame(test_data, columns=columns)

In [13]:
# Shuffle the rows of both frames. Before this step each frame lists all
# positive reviews first and all negative reviews after them.
# A seeded generator makes the shuffled order (and so the exported CSVs)
# reproducible; sharing one generator gives the two frames different permutations.
SHUFFLE_SEED = 42
shuffle_rng = np.random.RandomState(SHUFFLE_SEED)
train_data = train_data.sample(frac=1, random_state=shuffle_rng)
test_data = test_data.sample(frac=1, random_state=shuffle_rng)

In [14]:
# Preview the first rows of the shuffled training frame.
train_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,Sentiment
16678,0.157035,-0.077773,-0.173457,-0.045342,-0.025111,-0.128597,0.211673,0.033088,0.206322,0.027569,...,-0.230402,-0.016707,0.122809,-0.082858,-0.00775,0.112001,-0.028799,-0.162746,-0.228648,0.0
16879,0.226391,0.033195,0.059582,-0.014985,-0.044724,-0.28326,0.083601,0.16085,-0.125848,0.112303,...,0.008532,-0.293815,0.116829,-0.02888,-0.066116,0.162063,0.177438,0.249703,-0.057617,0.0
332,-0.031416,-0.162592,-0.079213,-0.025957,-0.125881,-0.472597,0.136442,0.731003,0.363835,-0.256579,...,0.038273,-0.433244,-0.071415,-0.4194,0.118262,0.031393,0.023171,0.335506,0.299827,1.0
23766,-0.377231,-0.515283,0.119052,-0.260444,-0.305069,-0.262944,-0.028479,0.34079,-0.200371,0.008493,...,-0.127249,-0.470775,0.063966,0.242453,-0.062366,-0.002597,0.128663,0.424465,0.468293,0.0
3785,0.020357,-0.190749,0.040272,-0.346926,-0.101267,-0.167717,-0.044523,0.366768,0.07376,-0.139833,...,-0.049578,-0.039752,-0.041411,-0.101943,0.285466,0.263511,0.312012,0.180807,-0.017537,1.0


In [15]:
# Summarize the training frame: index range, column count, dtypes and memory usage.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25000 entries, 16678 to 3112
Columns: 301 entries, 0 to Sentiment
dtypes: float32(301)
memory usage: 28.9 MB


In [16]:
# Preview the first rows of the shuffled test frame.
test_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,Sentiment
12337,-0.342081,-0.569023,-0.026324,-0.420646,0.188004,-0.343321,0.220908,0.481408,0.124483,-0.247667,...,-0.077194,-0.382789,0.226403,0.042721,0.126723,0.462204,-0.169879,0.133532,0.317008,1.0
17706,-0.072118,0.151193,0.016482,0.082965,0.076544,0.029721,0.097374,-0.153005,0.056902,-0.049708,...,0.0232,0.092285,0.098781,-0.011472,0.057291,-0.087447,0.017755,0.107222,-0.04047,0.0
22715,-0.230624,-0.753719,-0.182895,-0.420529,0.009941,-0.632932,0.118572,0.179107,0.376835,-0.114257,...,-0.07405,-0.204363,0.161277,0.116236,0.198098,0.133722,-0.185597,0.263306,0.203471,0.0
8815,-0.332862,-0.669216,-0.298723,-0.210285,0.017021,-0.365092,0.326148,0.50861,0.265224,-0.05129,...,-0.244389,-0.479868,0.206616,-0.409209,0.023217,0.162596,0.037006,0.152857,0.388064,1.0
6185,0.204214,0.101525,0.229125,-0.179252,0.113265,-0.014263,0.262248,0.012457,-0.042572,-0.061777,...,0.171438,0.118616,0.027691,-0.046567,0.13243,-0.023816,-0.240989,0.305244,0.379513,1.0


In [17]:
# Summarize the test frame: index range, column count, dtypes and memory usage.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25000 entries, 12337 to 19447
Columns: 301 entries, 0 to Sentiment
dtypes: float32(301)
memory usage: 28.9 MB


In [18]:
# Write both feature tables to CSV in the working directory, omitting the
# (shuffled) row index.
for frame, csv_path in ((train_data, "train_data.csv"), (test_data, "test_data.csv")):
    frame.to_csv(csv_path, index=False)