In [1]:
#Import all the dependencies
from nltk import RegexpTokenizer
from nltk.corpus import stopwords
from os import listdir
from os.path import isfile, join
import numpy as np

import pandas as pd

PRUNE = False
POS_TRAIN_PATH = "aclImdb/train/pos/"
NEG_TRAIN_PATH = "aclImdb/train/neg/"
POS_TEST_PATH = "aclImdb/test/pos/"
NEG_TEST_PATH = "aclImdb/test/neg/"

In [2]:
train_features = []
train_labels = []
test_features = []
test_labels = []

In [3]:
filenames = [f for f in listdir(POS_TRAIN_PATH) if f.endswith('.txt')]
for filename in filenames:
    data = open(POS_TRAIN_PATH + filename).read()
    train_features.append(data)
    train_labels.append(1)
    
filenames = [f for f in listdir(NEG_TRAIN_PATH) if f.endswith('.txt')]
for filename in filenames:
    data = open(NEG_TRAIN_PATH + filename).read()
    train_features.append(data)
    train_labels.append(0)

In [4]:
filenames = [f for f in listdir(POS_TEST_PATH) if f.endswith('.txt')]
for filename in filenames:
    data = open(POS_TEST_PATH + filename).read()
    test_features.append(data)
    test_labels.append(1)
    
filenames = [f for f in listdir(NEG_TEST_PATH) if f.endswith('.txt')]
for filename in filenames:
    data = open(NEG_TEST_PATH + filename).read()
    test_features.append(data)
    test_labels.append(0)

In [5]:
tokenizer = RegexpTokenizer(r'\w+')
stopword_set = set(stopwords.words('english'))

def nlp_clean(data):
   new_data = []
   for d in data:
      new_str = d.lower()
      dlist = tokenizer.tokenize(new_str)
      dlist = list(set(dlist).difference(stopword_set))
      new_data.append(dlist)
        
   return new_data

train_features = nlp_clean(train_features)
test_features = nlp_clean(test_features)

In [6]:
mean_len = 0
bow = []
for f in train_features:
    mean_len = mean_len + len(f)
    bow.extend(f)
    
for f in test_features:
    mean_len = mean_len + len(f)
    bow.extend(f)
    
mean_len = np.int32(mean_len / (len(bow)))
bow = np.array(bow)

In [7]:
mean_len = np.int16(np.ceil(mean_len / 100) * 100)

In [8]:
mean_len

100

In [9]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoder.fit(bow)
print(len(encoder.classes_))

101791


In [10]:
x_train = []
x_test = []

counter = 0
for f in train_features:
    r = np.array(f)
    r = encoder.transform(r)
    x_train.append(r)
    counter = counter + 1
    if counter % 1000 == 0:
        print(counter)
        if PRUNE:
            break

counter = 0
for f in test_features:
    r = np.array(f)
    r = encoder.transform(r)
    x_test.append(r)
    counter = counter + 1
    if counter % 1000 == 0:
        print(counter)
        if PRUNE:
            break


1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000


In [11]:
x_train_copy = x_train.copy()
x_test_copy = x_test.copy()

In [12]:
from keras.preprocessing import sequence

x_train = sequence.pad_sequences(x_train_copy, maxlen=mean_len)
x_test = sequence.pad_sequences(x_test_copy, maxlen=mean_len)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


x_train shape: (25000, 100)
x_test shape: (25000, 100)


In [13]:
columns = [str(x) for x in range(mean_len)]
columns.append("Sentiment")

In [14]:
train_data = None

for idx in range(len(x_train)):
    features = x_train[idx]
    label = np.array([train_labels[idx]], dtype=np.int16)
    row = np.array([np.concatenate((features, label), axis = 0)])
    
    if train_data is None:
        train_data = row
    else:
        train_data = np.concatenate((train_data, row), axis=0)
    

In [15]:
np.shape(train_data)

(25000, 101)

In [16]:
test_data = None

for idx in range(len(x_test)):
    features = x_test[idx]
    label = np.array([test_labels[idx]], dtype=np.int16)
    row = np.array([np.concatenate((features, label), axis = 0)])
    
    if test_data is None:
        test_data = row
    else:
        test_data = np.concatenate((test_data, row), axis=0)
        

In [17]:
np.shape(test_data)

(25000, 101)

In [18]:
train_data = pd.DataFrame(columns=columns, data=train_data)
test_data = pd.DataFrame(columns=columns, data=test_data)

In [19]:
train_data = train_data.sample(frac=1)
test_data = test_data.sample(frac=1)

In [20]:
train_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Sentiment
23755,99187,2368,90203,51475,24966,62260,16205,63590,53094,76217,...,68811,14935,55817,79230,42445,61963,68850,76442,6285,0
2821,33203,64713,28182,7070,24333,64043,85544,86493,26421,21093,...,96864,69006,31851,97929,4235,98062,73259,79230,35573,1
15403,0,0,0,0,0,0,0,0,0,0,...,79825,74092,56950,47999,1183,85880,40416,20724,40419,0
13634,48814,76438,11971,13458,90869,16009,29884,35267,26070,81912,...,92543,80160,75957,85808,85880,85125,19165,3730,70162,0
24040,38407,83075,30942,69630,64068,100154,20913,94802,14091,37770,...,8899,79230,73259,30514,20193,61963,18991,45805,97918,0


In [21]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25000 entries, 23755 to 7728
Columns: 101 entries, 0 to Sentiment
dtypes: int32(101)
memory usage: 9.8 MB


In [22]:
test_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Sentiment
23998,0,0,0,0,0,0,0,0,0,0,...,100025,8964,32398,53540,79686,60609,37182,49600,31491,0
5742,0,0,0,0,0,39969,71564,28984,1317,28653,...,31775,17864,79693,37716,96071,33358,5215,56950,98226,1
215,33203,73295,45741,40901,24333,49592,22821,19556,7360,30900,...,18835,1318,992,76239,85880,18790,31024,96036,6940,1
744,33203,57395,100745,53532,85914,25041,92199,18712,63679,78424,...,84150,71781,7972,15360,44303,55006,53540,85880,58,1
23793,98112,22958,14091,60192,82544,42867,62002,68726,30441,66207,...,96071,39503,55006,99839,36245,85880,63518,3239,54646,0


In [23]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25000 entries, 23998 to 17097
Columns: 101 entries, 0 to Sentiment
dtypes: int32(101)
memory usage: 9.8 MB


In [24]:
train_data.to_csv("train_data_lstm.csv", index=False)
test_data.to_csv("test_data_lstm.csv", index=False)