In [1]:
#Import all the dependencies
import gensim
from nltk import RegexpTokenizer
from nltk.corpus import stopwords
from os import listdir
from os.path import isfile, join
import numpy as np

import pandas as pd

MODEL_NAME = "doc2vec.model"
SIZE = 15
WINDOW_SIZE = 3

POS_TRAIN_PATH = "aclImdb/train/pos/"
NEG_TRAIN_PATH = "aclImdb/train/neg/"
POS_TEST_PATH = "aclImdb/test/pos/"
NEG_TEST_PATH = "aclImdb/test/neg/"

In [2]:
train_features = []
train_labels = []
test_features = []
test_labels = []

In [3]:
filenames = [f for f in listdir(POS_TRAIN_PATH) if f.endswith('.txt')]
for filename in filenames:
    data = open(POS_TRAIN_PATH + filename).read()
    train_features.append(data)
    train_labels.append(1)
    
filenames = [f for f in listdir(NEG_TRAIN_PATH) if f.endswith('.txt')]
for filename in filenames:
    data = open(NEG_TRAIN_PATH + filename).read()
    train_features.append(data)
    train_labels.append(0)

In [4]:
filenames = [f for f in listdir(POS_TEST_PATH) if f.endswith('.txt')]
for filename in filenames:
    data = open(POS_TEST_PATH + filename).read()
    test_features.append(data)
    test_labels.append(1)
    
filenames = [f for f in listdir(NEG_TEST_PATH) if f.endswith('.txt')]
for filename in filenames:
    data = open(NEG_TEST_PATH + filename).read()
    test_features.append(data)
    test_labels.append(0)

In [5]:
tokenizer = RegexpTokenizer(r'\w+')
stopword_set = set(stopwords.words('english'))

def nlp_clean(data):
   new_data = []
   for d in data:
      new_str = d.lower()
      dlist = tokenizer.tokenize(new_str)
      dlist = list(set(dlist).difference(stopword_set))
      new_data.append(dlist)
        
   return new_data

train_features = nlp_clean(train_features)
test_features = nlp_clean(test_features)

In [6]:
class LabeledLineSentence(object):
    
    def __init__(self, docs):
        self.docs = docs
        
    def __iter__(self):
        for idx, doc in enumerate(self.docs):
              yield gensim.models.doc2vec.LabeledSentence(doc,[str(idx)])

iterator = LabeledLineSentence(train_features)
model = gensim.models.Doc2Vec(size=SIZE * SIZE, window=WINDOW_SIZE, min_count=5, workers=16,alpha=0.025, min_alpha=0.025, iter=25)
model.build_vocab(iterator)
model.train(iterator, epochs=model.iter, total_examples=model.corpus_count)

model.save(MODEL_NAME)
print(MODEL_NAME + " saved")


  
  del sys.path[0]


doc2vec.model saved


In [7]:
columns = [str(x) for x in range(len(model.docvecs[1]))]
columns.append("Sentiment")

In [8]:
train_data = None

for idx in range(len(train_labels)):
    features = np.array(model.docvecs[str(idx)])
    label = np.array([train_labels[idx]], dtype=np.int16)
    row = np.array([np.concatenate((features, label), axis = 0)])
    
    if train_data is None:
        train_data = row
    else:
        train_data = np.concatenate((train_data, row), axis=0)
    

In [9]:
np.shape(train_data)

(25000, 226)

In [10]:
test_data = None

for idx in range(len(test_labels)):
    features = np.array(model.infer_vector(test_features[idx]))
    label = np.array([test_labels[idx]], dtype=np.int16)
    row = np.array([np.concatenate((features, label), axis = 0)])
    
    if test_data is None:
        test_data = row
    else:
        test_data = np.concatenate((test_data, row), axis=0)
        

In [11]:
np.shape(test_data)

(25000, 226)

In [12]:
train_data = pd.DataFrame(columns=columns, data=train_data)
test_data = pd.DataFrame(columns=columns, data=test_data)

In [13]:
train_data = train_data.sample(frac=1)
test_data = test_data.sample(frac=1)

In [14]:
train_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,216,217,218,219,220,221,222,223,224,Sentiment
10946,-0.293233,0.075336,-0.265466,-0.356092,0.06643,0.761425,-0.001033,-0.340051,-0.140389,-0.384717,...,-0.304356,0.110543,-0.257856,0.248262,0.304541,-0.454722,0.313093,0.096131,0.022209,1.0
13816,-0.281317,-0.052475,-0.227652,-0.145945,0.129469,0.254899,0.192881,-0.210649,-0.311281,0.397568,...,0.147703,-0.144117,0.197492,0.157018,0.054766,-0.073626,0.093853,0.080051,-0.250832,0.0
23255,0.065311,0.078494,0.018382,-0.024447,-0.185517,-0.06263,0.390812,0.222266,0.223175,-0.43704,...,0.39164,-0.040531,0.092544,-0.397565,0.517327,-0.164458,0.110916,0.456602,-0.073503,0.0
1520,0.123459,0.029362,0.05882,-0.235124,0.156227,0.575867,0.26641,-0.644778,-0.04655,-0.047256,...,0.125449,-0.121503,-0.549419,0.024966,0.150538,-0.30221,0.330751,-0.321547,-0.021141,1.0
5730,-0.417293,0.00713,0.499212,-0.30981,-0.079425,-0.137011,0.22843,-0.066997,0.284044,0.465361,...,0.143851,-0.092237,-0.244069,0.22202,-0.360323,-0.032631,-0.143966,-0.087031,-0.474025,1.0


In [15]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25000 entries, 10946 to 4290
Columns: 226 entries, 0 to Sentiment
dtypes: float32(226)
memory usage: 21.7 MB


In [16]:
test_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,216,217,218,219,220,221,222,223,224,Sentiment
1803,0.109306,0.192315,-0.032794,0.152092,0.108466,0.081162,-0.143557,0.087043,0.225055,-0.240789,...,-0.189616,0.036832,0.027846,-0.331823,-0.037055,0.222175,-0.41043,-0.210313,0.229882,1.0
23376,-0.12446,0.135917,0.431114,0.262919,0.010424,0.204219,-0.222301,0.120114,-0.051119,-0.118828,...,-0.020186,0.1851,0.328118,0.036923,-0.085293,0.012201,-0.198617,0.033985,0.00332,0.0
22891,0.211367,0.194651,-0.231363,-0.311015,0.092923,0.164169,0.003516,-0.143032,0.253581,-0.160928,...,-0.193897,0.304636,-0.146392,-0.102875,-0.131144,-0.182048,-0.040771,-0.037335,0.028673,0.0
18862,0.082641,-0.154188,-0.111012,-0.165968,-0.01769,-0.15164,0.027495,-0.121806,0.32214,0.095643,...,-0.106269,-0.169688,0.20156,0.021454,0.247993,0.13258,0.07167,-0.093814,-0.1905,0.0
10268,-0.022724,0.149019,0.319221,-0.033267,0.012244,-0.029595,0.236312,0.370445,-0.128876,0.155701,...,0.138224,0.084814,0.122257,0.099517,0.144641,0.234967,-0.030566,-0.088829,-0.206466,1.0


In [17]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25000 entries, 1803 to 16625
Columns: 226 entries, 0 to Sentiment
dtypes: float32(226)
memory usage: 21.7 MB


In [18]:
train_data.to_csv("train_data.csv", index=False)
test_data.to_csv("test_data.csv", index=False)