In [224]:
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import sys

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import tensorflow_datasets as tfds
tfds.disable_progress_bar()

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
%matplotlib inline

### Haven't found a better way to download the encoder

In [3]:
# Load the 'imdb_reviews/subwords8k' dataset config as supervised pairs,
# together with its `info` metadata object.
# NOTE(review): train_data / test_data do not appear to be used in the visible
# cells; the dataset seems to be loaded only so that `info` can supply the
# text encoder. Confirm before removing.
(train_data, test_data), info = tfds.load(
    'imdb_reviews/subwords8k',
    split=(tfds.Split.TRAIN, tfds.Split.TEST),
    with_info=True, as_supervised=True
)



In [4]:
# Encoder attached to the dataset's 'text' feature; reused below to turn
# question strings into lists of integer ids.
encoder = info.features['text'].encoder

In [17]:
# List the entries of the DATA directory, skipping dot-files.
[f for f in os.listdir('DATA') if not f.startswith('.')]

['test_submission.csv',
 'embeddings',
 'test.csv',
 'train.csv',
 'sample_submission.csv']

In [113]:
def fetch_data_train(path='DATA', nrows=10000, test_size=0.2, random_state=None):
    """Read a sample of ``train.csv`` and split it into train / hold-out parts.

    Parameters
    ----------
    path : str
        Directory that contains ``train.csv``.
    nrows : int or None
        Number of rows to read from the top of the file (``None`` reads the
        whole file). Defaults to the 10000 rows used previously.
    test_size : float or int
        Size of the hold-out part, forwarded to ``train_test_split``.
    random_state : int or None
        Seed forwarded to ``train_test_split``. ``None`` (the default) keeps
        the original unseeded behaviour; pass an integer for a reproducible
        split.

    Returns
    -------
    X_train, X_test, y_train, y_test
        ``X_*`` are 2-D arrays holding every column except ``qid`` and
        ``target``; ``y_*`` hold the ``target`` column.
    """
    data = pd.read_csv(os.path.join(path, 'train.csv'), nrows=nrows)

    X = data.drop(['qid', 'target'], axis=1).values
    y = data['target'].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

def fetch_data_test(path='DATA'):
    """Load the full training set and the test-set features.

    Parameters
    ----------
    path : str
        Directory that contains ``train.csv`` and ``test.csv``.

    Returns
    -------
    X_train : ndarray
        Every column of ``train.csv`` except ``qid`` and ``target``.
    X_test : ndarray
        Every column of ``test.csv`` except ``qid``.
    y_train : ndarray
        The ``target`` column of ``train.csv``.
    """
    train = pd.read_csv(os.path.join(path, 'train.csv'))
    test = pd.read_csv(os.path.join(path, 'test.csv'))

    X_train = train.drop(['qid', 'target'], axis=1).values
    y_train = train['target'].values
    # The test features must come from test.csv. Previously they were taken
    # from `train`, which returned training rows with the target still attached.
    X_test = test.drop(['qid'], axis=1).values

    return X_train, X_test, y_train

In [243]:
# Load the training sample and its train / hold-out split using the
# function's defaults.
X_train, X_test, y_train, y_test = fetch_data_train()

In [52]:
# Inspect the training features.
# NOTE(review): X_train is reassigned by the pipeline cell further down, so
# what this shows depends on cell execution order.
X_train

array([['What is the best possible way to utilise 15 minutes of reading time during class 10 board exams?'],
       ['How can life fascinate a person?'],
       ['What is the duration of the longest female orgasm?'],
       ...,
       ['What influenced Warren Sapp to become an athlete?'],
       ['Regarding a convergent (destructive) tectonic plate, why does the forcing of an oceanic plate under a continental plate force magma up into a volcano?'],
       ['Will liberals ever grow up and quit acting like children towards President Trump?']],
      dtype=object)

In [42]:
# Sanity check: push the first training question through the encoder and back.
example_sentence = X_train[0, 0]
encoded_sentence = encoder.encode(example_sentence)
decoded_sentence = encoder.decode(encoded_sentence)

# Decoding the ids must give back exactly the text that was encoded.
assert decoded_sentence == example_sentence

print('Original sentence:', decoded_sentence)
print('Encoded sentence', encoded_sentence)

Original sentence: What are the best ways to warm bread in the oven?
Encoded sentence [274, 29, 1, 175, 1766, 7, 1892, 7961, 2144, 2189, 11, 1, 1928, 413, 7992]


In [238]:
class EmbeddingTransformer(BaseEstimator, TransformerMixin):
    """Encode raw text into lists of integer ids using ``encoder``.

    Parameters
    ----------
    encoder : object
        Any object exposing ``encode(text)`` and a ``vocab_size`` attribute.
    """

    def __init__(self, encoder):
        # Only store constructor arguments here (scikit-learn estimator
        # convention); derived values are exposed as properties instead.
        self.encoder = encoder

    @property
    def vocab_size(self):
        """Vocabulary size reported by the wrapped encoder."""
        return self.encoder.vocab_size

    def fit(self, X, y=None):
        """Nothing to learn; present for Pipeline compatibility."""
        return self

    def transform(self, X, y=None):
        """Encode every text in ``X``.

        Parameters
        ----------
        X : array-like of str
            Texts to encode. Any shape is accepted; it is flattened in row
            order, so an ``(n, 1)`` column of texts yields ``n`` sequences.

        Returns
        -------
        ndarray of shape (n_texts,), dtype=object
            Each element is the list returned by ``encoder.encode`` for the
            corresponding text. Sequences generally differ in length, so the
            result is always a 1-D object array, never a 2-D numeric one.
        """
        texts = np.asarray(X, dtype=object).ravel()

        # Fill a pre-allocated object array element by element. Calling
        # np.array() on ragged lists is deprecated and fails on recent NumPy.
        encoded = np.empty(len(texts), dtype=object)
        for idx, text in enumerate(texts):
            encoded[idx] = self.encoder.encode(text)
        return encoded

In [239]:
class PaddingTransform(BaseEstimator, TransformerMixin):
    """Right-pad (or truncate) integer sequences to one common length.

    The target length is the longest sequence seen during ``fit`` and is
    stored in ``max_len``. Padding uses the value 0.
    """

    def fit(self, X, y=None):
        """Record the length of the longest sequence in ``X``.

        An empty ``X`` gives ``max_len == 0`` instead of raising.
        """
        self.max_len = max((len(line) for line in X), default=0)
        return self

    def transform(self, X, y=None):
        """Return an ``int32`` array of shape ``(len(X), max_len)``.

        Sequences shorter than ``max_len`` are padded with trailing zeros.
        Sequences longer than ``max_len`` (possible for data not seen during
        ``fit``) are truncated; previously they produced rows of unequal
        length and the conversion to a 2-D array failed.
        """
        padded = np.zeros((len(X), self.max_len), dtype=np.int32)

        for row, line in enumerate(X):
            tokens = np.asarray(line, dtype=np.int32)[:self.max_len]
            padded[row, :len(tokens)] = tokens
        return padded

In [244]:
# Two-step preprocessing: encode each text into integer ids, then pad the
# sequences to a common length.
pipeline = Pipeline([
    ('embedding', EmbeddingTransformer(encoder=encoder)),
    ('padding', PaddingTransform())
])

In [245]:
# Fit the preprocessing on the training texts only, then apply it to both sets.
X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)

# Hold out the last 1000 training rows for validation; the rest stay in train.
X_train, X_valid = X_train[:-1000], X_train[-1000:]
y_train, y_valid = y_train[:-1000], y_train[-1000:]

In [223]:
# Report the shape of the training matrix and display it.
print('X shape:', X_train.shape)
X_train

X shape: (8000, 125)


array([[ 274,   29,    1, ...,    0,    0,    0],
       [ 274,   29,   63, ...,    0,    0,    0],
       [ 274,    9,   74, ...,    0,    0,    0],
       ...,
       [ 809,  110, 2640, ...,    0,    0,    0],
       [ 809,   18,    1, ...,    0,    0,    0],
       [ 809,    9,  417, ...,    0,    0,    0]], dtype=int32)

## Creating model

In [225]:
# Size of the vector learned for each vocabulary id.
embedding_dim = 16

# Embedding lookup -> average over the sequence axis -> single sigmoid unit.
# NOTE(review): the Embedding layer is created without mask_zero, so the 0
# padding added by the preprocessing presumably takes part in the average.
# Confirm whether that is intended.
model = keras.Sequential([
    layers.Embedding(encoder.vocab_size, embedding_dim),
    layers.GlobalAveragePooling1D(),
    layers.Dense(1, activation='sigmoid')
])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          130960    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 17        
Total params: 130,977
Trainable params: 130,977
Non-trainable params: 0
_________________________________________________________________


In [250]:
# Binary classification setup: sigmoid output trained with binary
# cross-entropy, tracking accuracy.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train for 10 epochs, evaluating on the held-out validation rows each epoch.
history = model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_valid, y_valid),
)

Train on 7000 samples, validate on 1000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [253]:
# Shape of the first layer's (the Embedding layer's) weight matrix.
model.layers[0].get_weights()[0].shape

(8185, 16)