In [664]:
from __future__ import absolute_import, print_function, unicode_literals, division
import os, sys
import pickle

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from sklearn.utils import shuffle
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.base import BaseEstimator

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
nltk.download('punkt')

import tensorflow
from tensorflow import keras

from keras import layers
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.initializers import Constant
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences

import tensorflow_datasets as tfds

[nltk_data] Downloading package punkt to /Users/vlsnk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# List the entries of the 'aclImdb' dataset directory (path is relative to the working directory).
os.listdir('aclImdb')

['.DS_Store',
 'imdbEr.txt',
 'test',
 'test.csv',
 'imdb.vocab',
 'README',
 'train',
 'train.csv']

In [239]:
# NOTE(review): `max_length` is not assigned in any cell visible above this one —
# presumably it comes from leftover kernel state; TODO confirm and define it before this cell.
max_length

601

## Load the IMDB Data

In [617]:
# Two independent, empty frames sharing the same schema: review text and its label.
data_train, data_test = (pd.DataFrame(columns=['text', 'target']) for _ in range(2))

In [618]:
def _load_reviews(split, limit=500):
    """Read up to `limit` positive and `limit` negative reviews of one split.

    Parameters
    ----------
    split : str
        Sub-directory of 'aclImdb' to read from ('train' or 'test').
    limit : int, default=500
        Maximum number of files taken from each of the 'pos' and 'neg' folders.

    Returns
    -------
    pandas.DataFrame
        Columns 'text' (file contents) and 'target' (1 for 'pos', 0 for 'neg'),
        positive rows first.

    Raises
    ------
    OSError
        If a class directory or a review file cannot be read.
    """
    # Collect plain records and build the frame once: appending to a DataFrame
    # row by row copies the whole frame on every iteration, and
    # `DataFrame.append` is no longer available in pandas >= 2.0.
    records = []
    for label_dir, target in (('pos', 1), ('neg', 0)):
        basic_path = os.path.join('aclImdb', split, label_dir)
        # `os.listdir` returns entries in arbitrary order, so sort to make the
        # selected subset stable between runs; skip hidden entries such as
        # '.DS_Store', which are not reviews.
        file_names = sorted(
            name for name in os.listdir(basic_path) if not name.startswith('.')
        )
        for file_name in file_names[:limit]:
            # Explicit encoding so the result does not depend on the platform default.
            with open(os.path.join(basic_path, file_name), 'r', encoding='utf-8') as f:
                records.append({'text': f.read(), 'target': target})
        print('[info] {} \'{}\' data loaded'.format(split, label_dir))
    return pd.DataFrame(records, columns=['text', 'target'])


# Each frame is rebuilt from scratch, so re-running this cell never duplicates rows.
# Train data
data_train = _load_reviews('train')

# Test data
data_test = _load_reviews('test')

[info] train 'pos' data loaded
[info] train 'neg' data loaded
[info] test 'pos' data loaded
[info] test 'neg' data loaded


In [619]:
# Shuffle both splits so positive and negative reviews are interleaved.
# A fixed seed keeps the row order identical across Restart & Run All.
data_train = shuffle(data_train, random_state=42).reset_index(drop=True)
data_test = shuffle(data_test, random_state=42).reset_index(drop=True)

# Persist each split to its own file (the test file must hold the test split,
# not a second copy of the training data).
data_train.to_csv('aclImdb/train.csv', index=False)
data_test.to_csv('aclImdb/test.csv', index=False)

In [749]:
# Inspect the training frame; as the last expression of the cell it is rendered by the notebook.
data_train

Unnamed: 0,text,target
0,"Okay, I've tried and I've tried, but I STILL D...",0
1,Definitely spoilers in this review! I **adore*...,1
2,"Much like the comedy duo of its title, ""The Su...",1
3,I know that the real story of Little Richard i...,0
4,Hail Bollywood and men Directors !<br /><br />...,0
...,...,...
995,I am stunned to discover the amount of fans th...,0
996,This movie is just not worth your time. Its re...,0
997,Allow me to start this review by saying this: ...,0
998,Nightmare Weekend is proof positive that some ...,0


## Working with Word Embeddings

In [564]:
class TokenizeTransform(BaseEstimator, TransformerMixin):
    """Split the text of each row into a list of word tokens.

    `fit` learns nothing. `transform` replaces the 'text' column with one
    list of tokens per row, keeping only tokens of at least `min_length`
    characters.

    Parameters
    ----------
    min_length : int, default=3
        Minimum number of characters a token must have to be kept.
    """

    def __init__(self, min_length=3):
        self.min_length = min_length

    def fit(self, X, y=None):
        """Do nothing and return `self` (required by the transformer interface)."""
        return self

    def transform(self, X, y=None):
        """Return a copy of `X` whose 'text' column holds token lists.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with a 'text' column of strings.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            Copy of `X`; the input frame is left untouched so the step can be
            re-run on the same data.
        """
        # NOTE(review): newer tensorflow_datasets releases may expose this class
        # under `tfds.deprecated.text` — confirm against the installed version.
        tokenizer = tfds.features.text.Tokenizer()
        min_length = self.min_length

        def tokenize(text):
            # Use the tokenizer created above (the original referenced an
            # undefined name `encoder`).
            return [word for word in tokenizer.tokenize(text) if len(word) >= min_length]

        X = X.copy()
        X['text'] = X['text'].map(tokenize)
        return X

In [565]:
class SteemerTransform(BaseEstimator, TransformerMixin):
    """Stem every token of each row and join the stems into one string.

    Parameters
    ----------
    steemer : object with a `stem(word)` method, default=PorterStemmer()
        Stemmer applied to each token.
    """

    def __init__(self, steemer=PorterStemmer()):
        self.steemer = steemer

    def fit(self, X, y=None):
        """Do nothing and return `self` (required by the transformer interface)."""
        return self

    def transform(self, X, y=None):
        """Return a copy of `X` whose 'text' column holds space-joined stems.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with a 'text' column; each cell is iterated item by item, so
            it is expected to be a sequence of tokens rather than a raw string.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            Copy of `X`; the input frame is not modified, so calling the step
            twice on the same data gives the same result.
        """
        stem = self.steemer.stem
        X = X.copy()
        X['text'] = X['text'].map(lambda tokens: ' '.join(stem(word) for word in tokens))
        return X

In [589]:
class VectorizeTransformer(BaseEstimator, TransformerMixin):
    """Encode each text as a fixed-length list of integer word indices.

    Parameters
    ----------
    vocab_size : int, default=100
        Size of the index range passed to `one_hot`.
    max_length : int or None, default=None
        Length every encoded sequence is padded (or truncated) to. When None,
        `fit` sets it to the word count of the longest text it sees.
    """

    def __init__(self, vocab_size=100, max_length=None):
        self.vocab_size = vocab_size
        self.max_length = max_length

    def fit(self, X, y=None):
        """Derive `max_length` from the data when it was not given.

        An explicitly configured `max_length` is kept as is; the original
        re-assigned it from an undefined local name, which either failed or
        silently picked up an unrelated global.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with a 'text' column of whitespace-separated strings.
        y : ignored

        Returns
        -------
        self
        """
        if self.max_length is None:
            self.max_length = max(len(text.split()) for text in X['text'])
        return self

    def transform(self, X, y=None):
        """Return a copy of `X` whose 'text' column holds padded index lists.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with a 'text' column of strings.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            Copy of `X`; every 'text' cell is a list of integers. Shorter
            sequences are zero-padded at the end; longer ones are truncated by
            `pad_sequences` using its default truncation side.
        """
        # Honour the configured parameters instead of hard-coded 100 / 1000.
        encoded = [one_hot(text, self.vocab_size) for text in X['text']]
        padded = pad_sequences(encoded, maxlen=self.max_length, padding='post')

        # Leave the caller's frame untouched so the step can be re-run.
        X = X.copy()
        X['text'] = padded.tolist()
        return X

In [604]:
# Preprocessing chain: tokenize -> stem -> integer-encode and pad.
pipeline = Pipeline([
    ('tokenize', TokenizeTransform()),
    # The reviews are English text, so use the English Snowball stemmer
    # (the Russian one leaves English words effectively unstemmed).
    ('steem', SteemerTransform(steemer=SnowballStemmer('english'))),
    ('vectorize', VectorizeTransformer(vocab_size=1000, max_length=300))
])