In [664]:
from __future__ import absolute_import, print_function, unicode_literals, division
import os, sys
import pickle

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from sklearn.utils import shuffle
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.base import BaseEstimator

from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
nltk.download('punkt')

import tensorflow
from tensorflow import keras

from keras import layers
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.initializers import Constant
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences

import tensorflow_datasets as tfds

[nltk_data] Downloading package punkt to /Users/vlsnk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
os.listdir('aclImdb')

['.DS_Store',
 'imdbEr.txt',
 'test',
 'test.csv',
 'imdb.vocab',
 'README',
 'train',
 'train.csv']

In [239]:
max_length

601

## Load the IMBD Data

In [617]:
data_train = pd.DataFrame(columns=['text', 'target'])
data_test = pd.DataFrame(columns=['text', 'target'])

In [618]:
"""
Train data
"""
basic_path = os.path.join(*['aclImdb', 'train', 'pos'])
for file in os.listdir(basic_path)[:500]:
    with open(os.path.join(basic_path, file), 'r') as f:
        data_train = data_train.append({
            'text': f.read(),
            'target': 1
        }, ignore_index=True)
print('[info] train \'pos\' data loaded')
        
basic_path = os.path.join(*['aclImdb', 'train', 'neg'])
for file in os.listdir(basic_path)[:500]:
    with open(os.path.join(basic_path, file), 'r') as f:
        data_train = data_train.append({
            'text': f.read(),
            'target': 0
        }, ignore_index=True)
print('[info] train \'neg\' data loaded')


"""
Test data
"""       
basic_path = os.path.join(*['aclImdb', 'test', 'pos'])
for file in os.listdir(basic_path)[:500]:
    with open(os.path.join(basic_path, file), 'r') as f:
        data_test = data_test.append({
            'text': f.read(),
            'target': 1
        }, ignore_index=True)
print('[info] test \'pos\' data loaded')

basic_path = os.path.join(*['aclImdb', 'test', 'neg'])
for file in os.listdir(basic_path)[:500]:
    with open(os.path.join(basic_path, file), 'r') as f:
        data_test = data_test.append({
            'text': f.read(),
            'target': 0
        }, ignore_index=True)
print('[info] test \'neg\' data loaded')

[info] train 'pos' data loaded
[info] train 'neg' data loaded
[info] test 'pos' data loaded
[info] test 'neg' data loaded


In [619]:
data_train = shuffle(data_train).reset_index(drop=True)
data_test = shuffle(data_test).reset_index(drop=True)

data_train.to_csv('aclImdb/train.csv', index=False)
data_train.to_csv('aclImdb/test.csv', index=False)

In [749]:
data_train

Unnamed: 0,text,target
0,"Okay, I've tried and I've tried, but I STILL D...",0
1,Definitely spoilers in this review! I **adore*...,1
2,"Much like the comedy duo of its title, ""The Su...",1
3,I know that the real story of Little Richard i...,0
4,Hail Bollywood and men Directors !<br /><br />...,0
...,...,...
995,I am stunned to discover the amount of fans th...,0
996,This movie is just not worth your time. Its re...,0
997,Allow me to start this review by saying this: ...,0
998,Nightmare Weekend is proof positive that some ...,0


## Working with Word Embeddings

In [564]:
class TokenizeTransform(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        tokenizer = tfds.features.text.Tokenizer()
        X['text'] = X['text'].map(lambda a: [word for word in encoder.tokenize(a) if len(word) >= 3])
        return X

In [565]:
class SteemerTransform(BaseEstimator, TransformerMixin):
    def __init__(self, steemer=PorterStemmer()):
        self.steemer = steemer
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X['text'] = X['text'].map(lambda a: ' '.join([self.steemer.stem(word) for word in a]))
        return X

In [589]:
class VectorizeTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, vocab_size=100, max_length=None):
        self.vocab_size = vocab_size
        self.max_length = max_length
    
    def fit(self, X, y=None):
        
        if self.max_length is None:
            longest_sentence = lambda a: len(a.split())
            roi = max(X['text'], key=longest_sentence)
            self.max_length = len(roi.split())
        else:
            self.max_length = max_length
        
        return self
    
    def transform(self, X, y=None):
        X['text'] = X['text'].map(lambda a: one_hot(a, 100))
        X['text'] = pad_sequences(X['text'], 1000, padding='post').tolist()
        
        return X

In [604]:
pipeline = Pipeline([
    ('tokenize', TokenizeTransform()),
    ('steem', SteemerTransform(steemer=SnowballStemmer('russian'))),
    ('vectorize', VectorizeTransformer(vocab_size=1000, max_length=300))
])

In [632]:
X = data_train.copy()

In [647]:
embeddings_index = {}
GLOVE_DIR = ''
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, 'f', sep=' ')
        embeddings_index[word] = coefs

In [677]:
texts = X['text'].values

In [678]:
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [680]:
word_index = tokenizer.word_index

In [711]:
num_words = min(1000, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, 100))
for word, i in word_index.items():
    if i >= 1000:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [719]:
data = pad_sequences(sequences, maxlen=200)

In [746]:
data

array([[  0,   0,   0, ..., 201,  15,  19],
       [  1, 103,  44, ...,  91,  30,  29],
       [244,  35, 400, ..., 141,  11,   7],
       ...,
       [902,  69,   7, ...,  11,  13,   1],
       [  5,   1, 375, ..., 155,  37,   8],
       [  0,   0,   0, ...,   3,   1, 366]], dtype=int32)

In [743]:
model = keras.Sequential([
    layers.Embedding(num_words,
                    100,
                    embeddings_initializer=Constant(embedding_matrix),
                    input_length=200,
                    trainable=False),
    layers.Flatten(),
    layers.Dense(1, activation='sigmoid')
])

TypeError: The added layer must be an instance of class Layer. Found: <keras.layers.embeddings.Embedding object at 0x1a5d53c9d0>

In [493]:
data_train = pipeline.fit_transform(data_train)
data_test = pipeline.transform(data_test)

In [505]:
X_train = np.array(data_train['text'].tolist(), dtype=np.int32)
X_test = np.array(data_test['text'].tolist(), dtype=np.int32)

y_train = data_train['target'].values.astype(np.int32)
y_test = data_test['target'].values.astype(np.int32)

X_valid = X_test[500:]
y_valid = y_test[500:]
X_test = X_test[:500]
y_test = y_test[:500]

## Building the model

In [609]:
model = Sequential([
    #layers.Embedding(100, 20, input_length=1000),
    layers.Dense(126, input_shape=[1000], \
                activation='relu'),
    layers.Flatten(),
    layers.Dense(1, activation='sigmoid')
])

In [610]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
print(model.summary())

Model: "sequential_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_17 (Dense)             (None, 126)               126126    
_________________________________________________________________
flatten_14 (Flatten)         (None, 126)               0         
_________________________________________________________________
dense_18 (Dense)             (None, 1)                 127       
Total params: 126,253
Trainable params: 126,253
Non-trainable params: 0
_________________________________________________________________
None


In [611]:
model.fit(X_train, y_train, epochs=10, verbose=1, batch_size=32, validation_data=(X_valid, y_valid))

Train on 300 samples, validate on 100 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1a51a29310>

In [612]:
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 25.000000
