In [93]:
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import sys
import pickle

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score, recall_score, \
                                precision_score, f1_score

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import tensorflow_datasets as tfds
tfds.disable_progress_bar()

import warnings
warnings.filterwarnings('ignore')

### Haven`t find better way to download encoder

In [3]:
(train_data, test_data), info = tfds.load(
    'imdb_reviews/subwords8k',
    split=(tfds.Split.TRAIN, tfds.Split.TEST),
    with_info=True, as_supervised=True
)



In [4]:
encoder = info.features['text'].encoder

In [267]:
with open(os.path.join('pickles', 'encoder.pickle'), 'wb') as handle:
    pickle.dump(encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [17]:
[f for f in os.listdir('DATA') if not f.startswith('.')]

['test_submission.csv',
 'embeddings',
 'test.csv',
 'train.csv',
 'sample_submission.csv']

In [3]:
def fetch_data_train(path='DATA'):
    data = pd.read_csv(os.path.join(path, 'train.csv'))
    data_positive = data.loc[data['target'] == 1][:80000]
    data_negative = data.loc[data['target'] == 0][:80000]
    
    data = data_positive.append(data_negative, ignore_index=True, sort=False)
    data = data.sample(frac=1).reset_index(drop=True)
    
    X = data.drop(['qid', 'target'], axis=1).values
    y = data['target'].values
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
    return X_train, X_test, y_train, y_test

def fetch_data_test(path='DATA'):
    train = pd.read_csv(os.path.join(path, 'train.csv'))
    test = pd.read_csv(os.path.join(path, 'test.csv'))
    
    X_train = train.drop(['qid', 'target'], axis=1).values
    y_train = train['target'].values
    X_test = train.drop(['qid'], axis=1).values
    
    return X_train, X_test, y_train

In [6]:
X_train, X_test, y_train, y_test = fetch_data_train()

In [274]:
X_train

array([["What do I do to crack IGIDR in three months' time? (MSc economics)"],
       ['What is the best thing you have ever encountered?'],
       ['Which subjects should I choose for fashion technology in class 11th?'],
       ...,
       ['Can you show me pussy photos?'],
       ['Should Donald Trump sue Mexico for the Chicxulub crater that caused the extinction of almost all life on the planet Earth?'],
       ['Did Abraham use the DMT contained in the Acacia tree, to contact Yahweh?']],
      dtype=object)

In [42]:
example_sentence = X_train[0][0]
encoded_sentence = encoder.encode(example_sentence)
decoded_sentence = encoder.decode(encoded_sentence)

assert example_sentence == decoded_sentence

print('Original sentence:', decoded_sentence)
print('Encoded sentence', encoded_sentence)

Original sentence: What are the best ways to warm bread in the oven?
Encoded sentence [274, 29, 1, 175, 1766, 7, 1892, 7961, 2144, 2189, 11, 1, 1928, 413, 7992]


## Custom embedding pipeline

In [300]:
class EmbeddingTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, encoder):
        self.encoder = encoder
        self.vocab_size = self.encoder.vocab_size
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_transformed = list()
        for line in X.reshape(1, -1)[0]:
            X_transformed.append(
                self.encoder.encode(line)
            )
        return np.array(X_transformed)

In [302]:
class PaddingTransform(BaseEstimator, TransformerMixin):
    def __init__(self, max_len=None):
        if max_len is not None:
            self.max_len = max_len
    
    def fit(self, X, y=None):
        if 'max_len' not in self.__dir__():
            self.max_len = len(max(X, key=len))
        return self
    
    def transform(self, X, y=None):
        X_transformed = []
        
        for line in X:
            X_transformed.append(
                np.concatenate((
                    np.array(line),
                    np.array([0] * (self.max_len - len(line)))
                ), axis=0)
            )
        return np.array(X_transformed, dtype=np.int32)

In [303]:
pipeline = Pipeline([
    ('embedding', EmbeddingTransformer(encoder=encoder)),
    ('padding', PaddingTransform())
])

In [305]:
pipeline.fit(np.concatenate((
    X_train, X_test
), axis=0))
X_train = pipeline.transform(X_train)
X_test = pipeline.transform(X_test)

# Use part of data for validation
X_valid = X_train[-1000:]
y_valid = y_train[-1000:]
X_train = X_train[:-1000]
y_train = y_train[:-1000]

In [223]:
print('X shape:', X_train.shape)
X_train

X shape: (8000, 125)


array([[ 274,   29,    1, ...,    0,    0,    0],
       [ 274,   29,   63, ...,    0,    0,    0],
       [ 274,    9,   74, ...,    0,    0,    0],
       ...,
       [ 809,  110, 2640, ...,    0,    0,    0],
       [ 809,   18,    1, ...,    0,    0,    0],
       [ 809,    9,  417, ...,    0,    0,    0]], dtype=int32)

In [311]:
to_save = {
    'X_train': X_train,
    'y_train': y_train,
    'X_valid': X_valid,
    'y_valid': y_valid,
    'X_test': X_test,
    'y_test': y_test,
}

In [312]:
with open(os.path.join('pickles', 'dataset.pickle'), 'wb') as handle:
    pickle.dump(to_save, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Some popular solutions

#### Basic preprocessing

In [4]:
X_train, X_test, y_train, y_test = fetch_data_train()

In [5]:
embed_size = 300
max_features = 50000
maxlen = 100

In [6]:
tokenizer = Tokenizer(num_words=max_features)

In [7]:
tokenizer.fit_on_texts(X_train.reshape(1, -1)[0])

In [8]:
X_train = tokenizer.texts_to_sequences(X_train.reshape(1, -1)[0])
X_test = tokenizer.texts_to_sequences(X_test.reshape(1, -1)[0])

In [9]:
X_train = pad_sequences(X_train, maxlen=maxlen)
X_test = pad_sequences(X_test, maxlen=maxlen)

In [10]:
X_train

array([[    0,     0,     0, ...,     1, 11922, 18557],
       [    0,     0,     0, ...,   167,     2,   110],
       [    0,     0,     0, ...,   415,  1030,  1119],
       ...,
       [    0,     0,     0, ...,  1968,    14,     1],
       [    0,     0,     0, ...,   688,  6050,  3516],
       [    0,     0,     0, ...,   313,  1245,  1692]], dtype=int32)

### Stanford`s Glove

In [36]:
file = os.path.join(*['DATA', 'embeddings', 'glove.840B.300d', 'glove.840B.300d.txt'])
def load_coefs(word, *vect):
    return word, np.array(vect, dtype=np.float32)
embeddings = dict(load_coefs(*w.split(" ")) for w in open(file, 'r'))

In [39]:
print('Number of embeddings:', len(embeddings.keys()))
print('Embedding vector shape:', embeddings['foo'].shape)

Number of embeddings: 2196016
Embedding vector shape: (300,)


In [41]:
all_embs = np.stack(embeddings.values())
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

In [79]:
all_embs

array([[-0.082752,  0.67204 , -0.14987 , ..., -0.1918  , -0.37846 ,
        -0.06589 ],
       [ 0.012001,  0.20751 , -0.12578 , ...,  0.13871 , -0.36049 ,
        -0.035   ],
       [ 0.27204 , -0.06203 , -0.1884  , ...,  0.13015 , -0.18317 ,
         0.1323  ],
       ...,
       [ 0.7344  , -0.33641 ,  0.26918 , ...,  0.63718 , -0.13914 ,
        -0.16472 ],
       [ 0.21215 , -0.99456 ,  1.1782  , ...,  0.93427 , -0.93286 ,
        -0.51479 ],
       [-0.07969 , -0.22905 ,  0.80366 , ..., -0.083561,  0.48532 ,
        -0.7313  ]], dtype=float32)

In [61]:
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))

embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

for word, i in word_index.items():
    if i >= max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        
embedding_matrix = embedding_matrix.astype(np.float32)

In [77]:
embedding_matrix

array([[-0.9871725 ,  0.93450505,  0.19410798, ...,  0.0385003 ,
         0.05576302, -0.67120665],
       [ 0.27204   , -0.06203   , -0.1884    , ...,  0.13015   ,
        -0.18317   ,  0.1323    ],
       [ 0.31924   ,  0.06316   , -0.27858   , ...,  0.082745  ,
         0.097801  ,  0.25045   ],
       ...,
       [ 0.6047153 ,  0.28296432, -0.23499994, ..., -0.40001133,
        -0.24867615, -0.342037  ],
       [ 0.915036  , -0.61523443, -0.3464703 , ...,  0.8471742 ,
        -0.3274441 ,  0.20680809],
       [-0.08819957, -0.3774332 , -0.00939436, ..., -0.65271074,
         1.1646155 ,  0.20724036]], dtype=float32)

In [85]:
embedding_dim = 16

model = keras.Sequential([
    layers.Embedding(nb_words, embed_size),
    layers.GlobalAveragePooling1D(),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 300)         15000000  
_________________________________________________________________
global_average_pooling1d (Gl (None, 300)               0         
_________________________________________________________________
dense (Dense)                (None, 16)                4816      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 15,004,833
Trainable params: 15,004,833
Non-trainable params: 0
_________________________________________________________________


In [89]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

history = model.fit(
    X_train, y_train,
    epochs=1,
    batch_size=32,
)

Train on 128000 samples


In [90]:
y_pred = model.predict(X_test)
y_pred[np.where(y_pred >= 0.5)] = 1
y_pred[np.where(y_pred < 0.5)] = 0

In [96]:
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('F1 Score:', f1_score(y_test, y_pred))

Accuracy: 0.8888125
Recall: 0.8948978954599388
Precision: 0.8842949706880593
F1 Score: 0.8895648395306972
