In [573]:
from __future__ import absolute_import, print_function, unicode_literals, division
import os, sys
import pickle

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from sklearn.utils import shuffle
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.base import BaseEstimator

from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
nltk.download('punkt')

import tensorflow
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences

import tensorflow_datasets as tfds

[nltk_data] Downloading package punkt to /Users/vlsnk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
os.listdir('aclImdb')

['.DS_Store',
 'imdbEr.txt',
 'test',
 'test.csv',
 'imdb.vocab',
 'README',
 'train',
 'train.csv']

In [239]:
max_length

601

## Load the IMBD Data

In [488]:
data_train = pd.DataFrame(columns=['text', 'target'])
data_test = pd.DataFrame(columns=['text', 'target'])

In [489]:
"""
Train data
"""
basic_path = os.path.join(*['aclImdb', 'train', 'pos'])
for file in os.listdir(basic_path)[:500]:
    with open(os.path.join(basic_path, file), 'r') as f:
        data_train = data_train.append({
            'text': f.read(),
            'target': 1
        }, ignore_index=True)
print('[info] train \'pos\' data loaded')
        
basic_path = os.path.join(*['aclImdb', 'train', 'neg'])
for file in os.listdir(basic_path)[:500]:
    with open(os.path.join(basic_path, file), 'r') as f:
        data_train = data_train.append({
            'text': f.read(),
            'target': 0
        }, ignore_index=True)
print('[info] train \'neg\' data loaded')


"""
Test data
"""       
basic_path = os.path.join(*['aclImdb', 'test', 'pos'])
for file in os.listdir(basic_path)[:500]:
    with open(os.path.join(basic_path, file), 'r') as f:
        data_test = data_test.append({
            'text': f.read(),
            'target': 1
        }, ignore_index=True)
print('[info] test \'pos\' data loaded')

basic_path = os.path.join(*['aclImdb', 'test', 'neg'])
for file in os.listdir(basic_path)[:500]:
    with open(os.path.join(basic_path, file), 'r') as f:
        data_test = data_test.append({
            'text': f.read(),
            'target': 0
        }, ignore_index=True)
print('[info] test \'neg\' data loaded')

[info] train 'pos' data loaded
[info] train 'neg' data loaded
[info] test 'pos' data loaded
[info] test 'neg' data loaded


In [490]:
data_train = shuffle(data_train).reset_index(drop=True)
data_test = shuffle(data_test).reset_index(drop=True)

data_train.to_csv('aclImdb/train.csv', index=False)
data_train.to_csv('aclImdb/test.csv', index=False)

In [477]:
data_train

Unnamed: 0,text,target
0,"Like many Americans, I was first introduced to...",1
1,THE KING MAKER will doubtless be a success in ...,0
2,I work with children from 0  6 years old and ...,1
3,When I was chairman of our college's coffeehou...,1
4,I remember watching this on prime time when I ...,0
...,...,...
995,I feel totally ripped off. Someone needs to re...,0
996,"Having seen ""Triumph of the Will,"" I can only ...",0
997,This was just another marvelous film of the Be...,1
998,I've just visited Russian forum of our TV-chan...,0


## Working with Word Embeddings

In [564]:
class TokenizeTransform(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        tokenizer = tfds.features.text.Tokenizer()
        X['text'] = X['text'].map(lambda a: [word for word in encoder.tokenize(a) if len(word) >= 3])
        return X

In [565]:
class SteemerTransform(BaseEstimator, TransformerMixin):
    def __init__(self, steemer=PorterStemmer()):
        self.steemer = steemer
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X['text'] = X['text'].map(lambda a: ' '.join([self.steemer.stem(word) for word in a]))
        return X

In [589]:
class VectorizeTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, vocab_size=100, max_length=None):
        self.vocab_size = vocab_size
        self.max_length = max_length
    
    def fit(self, X, y=None):
        
        if self.max_length is None:
            longest_sentence = lambda a: len(a.split())
            roi = max(X['text'], key=longest_sentence)
            self.max_length = len(roi.split())
        else:
            self.max_length = max_length
        
        return self
    
    def transform(self, X, y=None):
        X['text'] = X['text'].map(lambda a: one_hot(a, 100))
        X['text'] = pad_sequences(X['text'], 1000, padding='post').tolist()
        
        return X

In [604]:
pipeline = Pipeline([
    ('tokenize', TokenizeTransform()),
    ('steem', SteemerTransform(steemer=SnowballStemmer('russian'))),
    ('vectorize', VectorizeTransformer(vocab_size=1000, max_length=300))
])

In [493]:
data_train = pipeline.fit_transform(data_train)
data_test = pipeline.transform(data_test)

In [505]:
X_train = np.array(data_train['text'].tolist(), dtype=np.int32)
X_test = np.array(data_test['text'].tolist(), dtype=np.int32)

y_train = data_train['target'].values.astype(np.int32)
y_test = data_test['target'].values.astype(np.int32)

X_valid = X_test[500:]
y_valid = y_test[500:]
X_test = X_test[:500]
y_test = y_test[:500]

In [549]:
import pickle

with open('dataset.pkl', 'rb') as f:
    articles = pickle.load(f)

In [553]:
dataset = pd.DataFrame(columns=['text', 'target'])

In [562]:
for k, v in articles['дача'].items():
    dataset = dataset.append({'text': v, 'target': 'дача'}, ignore_index=True)

In [563]:
dataset

Unnamed: 0,text,target
0,Вспоминаем работающие методы наших бабушек!\nТ...,здоровье
1,Пора вставать на каблуки или проверить щитовид...,здоровье
2,"Оказывается, существует специальная диета.\n5 ...",здоровье
3,Как выбрать зубную щетку и в чем разница между...,здоровье
4,Каждый третий в нашей стране страдает депресси...,здоровье
...,...,...
495,Хмель можно использовать не только в пивоварен...,дача
496,"Хмель – это красивое, декоративное растение с ...",дача
497,Пепино называют в народе дынной грушей и груше...,дача
498,"Пак-чой – это разновидность капусты, родом из ...",дача


In [605]:
dataset = shuffle(dataset).reset_index(drop=True)
X = dataset.copy()

In [606]:
X = pipeline.fit_transform(X)

In [607]:
X_train = np.array(X['text'].tolist(), dtype=np.int32)
y_train = X['target'].values

label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train).astype(np.int32)

In [608]:
X_test = X_train[:100]
y_test = y_train[:100]

X_valid = X_train[100:200]
y_valid = y_train[100:200]

X_train = X_train[200:]
y_train = y_train[200:]

In [603]:
X_train.shape

(300, 1000)

In [609]:
model = Sequential([
    #layers.Embedding(100, 20, input_length=1000),
    layers.Dense(126, input_shape=[1000], \
                activation='relu'),
    layers.Flatten(),
    layers.Dense(1, activation='sigmoid')
])

In [610]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
print(model.summary())

Model: "sequential_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_17 (Dense)             (None, 126)               126126    
_________________________________________________________________
flatten_14 (Flatten)         (None, 126)               0         
_________________________________________________________________
dense_18 (Dense)             (None, 1)                 127       
Total params: 126,253
Trainable params: 126,253
Non-trainable params: 0
_________________________________________________________________
None


In [611]:
model.fit(X_train, y_train, epochs=10, verbose=1, batch_size=32, validation_data=(X_valid, y_valid))

Train on 300 samples, validate on 100 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1a51a29310>

In [612]:
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 25.000000
