In [241]:
from __future__ import absolute_import, print_function, unicode_literals, division
import os, sys

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from sklearn.utils import shuffle
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline, TransformerMixin
from sklearn.base import BaseEstimator

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
nltk.download('punkt')

import tensorflow
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences

import tensorflow_datasets as tfds

[nltk_data] Downloading package punkt to /Users/vlsnk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# List the dataset root to check which files and folders are present.
os.listdir('aclImdb')

['.DS_Store',
 'imdbEr.txt',
 'test',
 'test.csv',
 'imdb.vocab',
 'README',
 'train',
 'train.csv']

In [239]:
# NOTE(review): no cell visible in this notebook assigns `max_length` at module level
# (it only appears as a local inside VectorizeTransformer.transform), so this cell
# presumably relies on leftover kernel state — confirm it survives Restart & Run All.
max_length

601

## Load the IMDB Data

In [112]:
# Two independent, empty frames sharing the same (text, target) schema.
data_train, data_test = (pd.DataFrame(columns=['text', 'target']) for _ in range(2))

In [113]:
"""
Train data
"""
def _read_reviews(split, sentiment, target, limit=500):
    """Read up to `limit` review files from aclImdb/<split>/<sentiment>.

    Args:
        split: Sub-folder of 'aclImdb' to read from ('train' or 'test').
        sentiment: Sub-folder of the split ('pos' or 'neg').
        target: Label stored alongside every review read from that folder.
        limit: Maximum number of files to read.

    Returns:
        A list of {'text': <file contents>, 'target': target} dicts, one per file.
    """
    folder = os.path.join('aclImdb', split, sentiment)
    records = []
    # os.listdir returns entries in arbitrary order; sorting makes the subset
    # selected by `limit` the same on every run.
    for name in sorted(os.listdir(folder))[:limit]:
        # Explicit encoding so decoding does not depend on the platform default
        # (assumes the review files are UTF-8 — TODO confirm).
        with open(os.path.join(folder, name), 'r', encoding='utf-8') as f:
            records.append({'text': f.read(), 'target': target})
    print('[info] %s \'%s\' data loaded' % (split, sentiment))
    return records


# Build each frame once from a list of records. Row-by-row DataFrame.append copies
# the whole frame on every call (and is not available in pandas >= 2.0); building
# from scratch also keeps this cell idempotent when it is re-run.
data_train = pd.DataFrame(
    _read_reviews('train', 'pos', 1) + _read_reviews('train', 'neg', 0),
    columns=['text', 'target'],
)

# Test data
data_test = pd.DataFrame(
    _read_reviews('test', 'pos', 1) + _read_reviews('test', 'neg', 0),
    columns=['text', 'target'],
)

[info] train 'pos' data loaded
[info] train 'neg' data loaded
[info] test 'pos' data loaded
[info] test 'neg' data loaded


In [114]:
# Shuffle each split (fixed seed so the row order is reproducible) and renumber rows.
data_train = shuffle(data_train, random_state=42).reset_index(drop=True)
data_test = shuffle(data_test, random_state=42).reset_index(drop=True)

# Persist each split to its own file: train.csv gets the training frame and
# test.csv gets the test frame.
data_train.to_csv('aclImdb/train.csv', index=False)
data_test.to_csv('aclImdb/test.csv', index=False)

In [115]:
# Display the shuffled training frame.
data_train

Unnamed: 0,text,target
0,"This is a quirky little movie, and I have to a...",1
1,Those who only remember the late Sir Peter Ust...,1
2,We saw this at one of the local art movie thea...,1
3,THAT'S certainly a strange way to promote a fi...,0
4,"After 'Aakrosh' , this was the second film for...",1
...,...,...
995,"As a long-standing Barbra fan, any posting lik...",1
996,I never understood why some people dislike Bol...,0
997,"Hey,<br /><br />If your going to make a docume...",0
998,Proof that not everything Tarantino touches tu...,0


## Working with Word Embeddings

In [153]:
class TokenizeTransform(BaseEstimator, TransformerMixin):
    """Pipeline step that splits each review in X['text'] into a list of tokens.

    Tokens shorter than 3 characters are dropped. The 'text' column of the frame
    passed to `transform` is overwritten in place and that same frame is returned.
    """

    def fit(self, X, y=None):
        """Nothing is learned by this step; returns self."""
        return self

    def transform(self, X, y=None):
        """Replace every string in X['text'] with its filtered token list.

        Args:
            X: DataFrame with a 'text' column of raw strings.
            y: Unused; accepted for scikit-learn API compatibility.

        Returns:
            The same DataFrame, with 'text' now holding lists of tokens.
        """
        tokenizer = tfds.features.text.Tokenizer()
        # Call the tokenizer created above. The name `encoder` is not defined in this
        # class or in any visible cell, so referencing it raises NameError on a
        # fresh kernel.
        X['text'] = X['text'].map(
            lambda a: [word for word in tokenizer.tokenize(a) if len(word) >= 3]
        )
        return X

In [168]:
class SteemerTransform(BaseEstimator, TransformerMixin):
    """Pipeline step that Porter-stems token lists and joins them back into strings."""

    def fit(self, X, y=None):
        """No fitting is required; returns self unchanged."""
        return self

    def transform(self, X, y=None):
        """Stem every token in X['text'] and store the space-joined result.

        Each entry of X['text'] is iterated token by token. The column is
        overwritten in place and the same frame is returned.
        """
        steemer = PorterStemmer()

        def stem_and_join(tokens):
            # One stemmed word per input token, separated by single spaces.
            return ' '.join(steemer.stem(token) for token in tokens)

        X['text'] = X['text'].map(stem_and_join)
        return X

In [344]:
class VectorizeTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that hashes each text into integer ids and pads to a fixed length.

    `fit` records the hashing vocabulary size and the word count of the longest
    fitted text (`max_length_`). `transform` encodes X['text'] with `one_hot` and
    pads every row to that length, so frames transformed after fitting all get the
    same width. The 'text' column is overwritten in place and the same frame is
    returned.
    """

    def fit(self, X, y=None, vocab_size=100):
        """Store `vocab_size` and learn the padded sequence length from X['text'].

        Args:
            X: DataFrame whose 'text' column holds whitespace-separated strings.
            y: Unused; accepted for scikit-learn API compatibility.
            vocab_size: Size of the hashing space passed to `one_hot`.

        Returns:
            self
        """
        self.vocab_size = vocab_size
        # Learned here, not in transform, so data transformed later is padded to
        # the same width as the data the step was fitted on.
        self.max_length_ = self._longest(X['text'])

        return self

    @staticmethod
    def _longest(texts):
        """Word count (whitespace split) of the longest text; 0 when there are none."""
        return max((len(text.split()) for text in texts), default=0)

    def transform(self, X, y=None):
        """Encode X['text'] as equal-length lists of integer ids.

        Uses the `vocab_size` and `max_length_` stored by `fit`. If the step has
        not been fitted, it falls back to a vocabulary of 100 and to the longest
        text in X.

        Returns:
            The same DataFrame, with 'text' holding padded lists of ints.
        """
        vocab_size = getattr(self, 'vocab_size', 100)
        max_length = getattr(self, 'max_length_', None)
        if max_length is None:
            max_length = self._longest(X['text'])

        X['text'] = X['text'].map(lambda a: one_hot(a, vocab_size))
        # Zero-pad at the end; rows longer than max_length are cut by pad_sequences
        # according to its default truncation setting.
        X['text'] = pad_sequences(X['text'], max_length, padding='post').tolist()

        return X

In [345]:
# Text preprocessing chain: tokenize -> stem -> hash-encode and pad.
pipeline = Pipeline(steps=[
    ('tokenize', TokenizeTransform()),
    ('steem', SteemerTransform()),
    ('vectorize', VectorizeTransformer()),
])

In [359]:
# Work on a copy of the first 20 rows so the pipeline's in-place edits leave data_train intact.
X = data_train.head(20).copy()

In [360]:
# NOTE(review): this rebinds X to the pipeline output, so the cell is not idempotent —
# running it again without first re-creating X would feed the padded integer lists
# back into the tokenizer step.
X = pipeline.fit_transform(X)

In [368]:
# Stack the per-row padded id lists into an int32 array; labels are cast to int32 too.
X_train = np.asarray(X['text'].tolist(), dtype=np.int32)
y_train = np.asarray(X['target'].values).astype(np.int32)

In [370]:
# The sequence length is read from the array the model is trained on instead of a
# `max_length` global that no visible cell defines. The first Embedding argument
# (100) has to cover the ids produced by one_hot(a, 100) in VectorizeTransformer.
model = Sequential([
    layers.Embedding(100, 20, input_length=int(X_train.shape[1])),
    layers.Flatten(),
    layers.Dense(1, activation='sigmoid')
])

In [373]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summary() prints the table itself and returns None; wrapping it in print() only
# appends a stray "None" line to the output.
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 601, 20)           2000      
_________________________________________________________________
flatten_4 (Flatten)          (None, 12020)             0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 12021     
Total params: 14,021
Trainable params: 14,021
Non-trainable params: 0
_________________________________________________________________
None


In [374]:
# Train on X_train / y_train for 100 epochs, then flush any buffered progress output.
model.fit(x=X_train, y=y_train, epochs=100, verbose=1)
sys.stdout.flush()

Train on 20 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
E

In [376]:
# NOTE(review): this scores the model on the same X_train / y_train it was fitted on,
# so the figure is training accuracy, not a held-out estimate.
loss, accuracy = model.evaluate(x=X_train, y=y_train, verbose=0)
print('Accuracy: %f' % (100 * accuracy))

Accuracy: 100.000000
