In [1]:
# !python -m spacy download en_core_web_sm

import glob
import os
import pickle
from random import shuffle

import keras
import numpy as np
import spacy
from tqdm import tqdm_notebook as tqdm

Using TensorFlow backend.


In [2]:
np.random.seed(0)

nlp = spacy.load("en_core_web_sm")

MAX_LEN = 100
x = np.random.random((50, ))
OOV_VECTOR = x / np.linalg.norm(x)
PAD_VECTOR = np.zeros((50, ))

In [4]:
with open('data/oov_vector.p', 'wb') as f:
    pickle.dump(OOV_VECTOR, f)

In [4]:
# Dataset attribution
# https://ai.stanford.edu/~amaas/data/sentiment/
'''
@InProceedings{maas-EtAl:2011:ACL-HLT2011,
  author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
  title     = {Learning Word Vectors for Sentiment Analysis},
  booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
  month     = {June},
  year      = {2011},
  address   = {Portland, Oregon, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {142--150},
  url       = {http://www.aclweb.org/anthology/P11-1015}
}
'''

# Glove attribution
'''
@inproceedings{pennington2014glove,
  author = {Jeffrey Pennington and Richard Socher and Christopher D. Manning},
  booktitle = {Empirical Methods in Natural Language Processing (EMNLP)},
  title = {GloVe: Global Vectors for Word Representation},
  year = {2014},
  pages = {1532--1543},
  url = {http://www.aclweb.org/anthology/D14-1162},
}
'''

'\n@inproceedings{pennington2014glove,\n  author = {Jeffrey Pennington and Richard Socher and Christopher D. Manning},\n  booktitle = {Empirical Methods in Natural Language Processing (EMNLP)},\n  title = {GloVe: Global Vectors for Word Representation},\n  year = {2014},\n  pages = {1532--1543},\n  url = {http://www.aclweb.org/anthology/D14-1162},\n}\n'

In [5]:
def load_imdb_data(dir_path):
    pos_filepath = os.path.join(dir_path, 'pos/*.txt')
    neg_filepath = os.path.join(dir_path, 'neg/*.txt')
    
    examples = []
  
    files = glob.glob(pos_filepath)   
    for file in files:     
        with open(file, 'r') as f:  
            examples.append((f.read(), 1))
        
    files = glob.glob(neg_filepath)   
    for file in files:     
        with open(file, 'r') as f:  
            examples.append((f.read(), 0))
        
    shuffle(examples)
    
    return examples

In [6]:
train = load_imdb_data('aclImdb/train')
test = load_imdb_data('aclImdb/test')  # If you want to use this later as a val or test set

In [7]:
print(f'Number of train examples: {len(train)}')
print(f'Number of test examples: {len(test)}')

Number of train examples: 25000
Number of test examples: 25000


In [8]:
print(keras.__version__)

2.0.0


In [11]:
from gensim.scripts.glove2word2vec import glove2word2vec
glove2word2vec(glove_input_file="data/glove.6B.50d.txt", word2vec_output_file="data/gensim_glove_vectors.txt")

(400000, 50)

In [12]:
from gensim.models.keyedvectors import KeyedVectors
wv_model = KeyedVectors.load_word2vec_format("data/gensim_glove_vectors.txt", binary=False)

In [13]:
def load_vecs(filepath):
    vecs = {}
    with open(filepath) as f:
        for line in f:
            split_line = line.split()
            word = split_line[0]
            embedding = np.array([float(val) for val in split_line[1:]])
            vecs[word] = embedding
    return vecs

In [15]:
wv_model = load_vecs('data/glove.6B.50d.txt')

In [16]:
len(wv_model)

400000

In [17]:
tokenized_sample = nlp('Hey there, tokenize me.'.lower())
print([x for x in tokenized_sample])
print(wv_model[str(tokenized_sample[0])])

[hey, there, ,, tokenize, me, .]
[-0.7001    0.36781   0.34424  -0.42318  -0.046018 -0.66072  -0.33993
  0.18271  -0.92863   0.5684   -0.43819   0.70827  -0.47459  -0.079269
  1.0187    0.2213    0.43073   0.76719   0.18774  -0.49214  -0.53063
  0.56379   0.63571   0.64622   1.2649   -0.82901  -1.3903    0.3749
  0.61316  -1.5994    1.3005    0.64347  -0.58004   1.0372   -0.27156
 -0.43382   0.8554   -0.8967    0.80176  -0.33333  -0.17654  -0.12277
 -0.70508  -0.28412   0.71149  -0.13487   0.049514 -0.8134    0.34293
  1.0381  ]


In [18]:
def preprocess_and_tokenize(line):
    tokens = nlp(line.lower())
    return [x for x in tokens if x]

In [19]:
def vectorize_pad_sample(example, wv_model):
    line, target = example
    vectors = []
    tokens = preprocess_and_tokenize(line)
    for token in tokens:
        try:
            vectors.append(wv_model[str(token)])
        except KeyError:
            vectors.append(OOV_VECTOR)
    pad_len = MAX_LEN - len(vectors)
    if pad_len > 0:
        vectors.extend([PAD_VECTOR] * pad_len)
    return (np.array(vectors[:MAX_LEN]), target)

In [20]:
a = ('python or else ' * 32, 1)
x, y = vectorize_pad_sample(a, wv_model)
print(x.shape)

(100, 50)


In [21]:
X_train = []
y_train = []
for example in tqdm(train):
    x, y = vectorize_pad_sample(example, wv_model)
    X_train.append(x)
    y_train.append(y)

HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))




In [22]:
X_train = np.array(X_train)
y_train = np.array(y_train)

In [23]:
# Simple model

input_shape = X_train[0].shape
_input = keras.layers.Input(input_shape)
x = keras.layers.Conv1D(25, 3, activation='relu')(_input)
x = keras.layers.MaxPooling1D(2)(x)
x = keras.layers.Conv1D(50, 3, activation='relu')(x)
x = keras.layers.MaxPooling1D(2)(x)
x = keras.layers.Conv1D(100, 3, activation='relu')(x)
x = keras.layers.GlobalMaxPooling1D()(x)
x = keras.layers.Dense(60, activation='relu')(x)
x = keras.layers.Dropout(.2)(x)
x = keras.layers.Dense(1, activation='sigmoid', name='final_output')(x)

model = keras.models.Model(_input, x)

W0905 20:15:19.688017 4611966400 deprecation_wrapper.py:119] From /Users/cole/.local/share/virtualenvs/tf_blog-rq3FUUEN/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:47: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0905 20:15:19.716189 4611966400 deprecation_wrapper.py:119] From /Users/cole/.local/share/virtualenvs/tf_blog-rq3FUUEN/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:349: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0905 20:15:19.726828 4611966400 deprecation_wrapper.py:119] From /Users/cole/.local/share/virtualenvs/tf_blog-rq3FUUEN/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3147: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0905 20:15:19.746971 4611966400 deprecation_wrapper.py:119] From /Users/cole/.local/share/virtualenvs/tf_blog-rq3FUUEN/lib/python3.7/site-packages/keras/backend/tensorfl

In [24]:
weights_file = 'tmp_model_1_weights'
callbacks = [
    keras.callbacks.ModelCheckpoint(weights_file, monitor='val_loss', save_weights_only=True),
    keras.callbacks.EarlyStopping(patience=3, monitor='val_loss')
]

In [25]:
model.compile(keras.optimizers.RMSprop(), loss='binary_crossentropy', metrics=['accuracy'])

W0905 20:15:19.822947 4611966400 deprecation_wrapper.py:119] From /Users/cole/.local/share/virtualenvs/tf_blog-rq3FUUEN/lib/python3.7/site-packages/keras/optimizers.py:675: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0905 20:15:19.826447 4611966400 deprecation_wrapper.py:119] From /Users/cole/.local/share/virtualenvs/tf_blog-rq3FUUEN/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:2614: The name tf.log is deprecated. Please use tf.math.log instead.

W0905 20:15:19.829594 4611966400 deprecation.py:323] From /Users/cole/.local/share/virtualenvs/tf_blog-rq3FUUEN/lib/python3.7/site-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [26]:
model.fit(X_train, y_train, validation_split=0.2, batch_size=32, nb_epoch=30, callbacks=callbacks)  # Note old Keras api

W0905 20:15:20.103591 4611966400 deprecation.py:506] From /Users/cole/.local/share/virtualenvs/tf_blog-rq3FUUEN/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:519: calling Constant.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0905 20:15:20.126255 4611966400 deprecation_wrapper.py:119] From /Users/cole/.local/share/virtualenvs/tf_blog-rq3FUUEN/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:762: The name tf.assign is deprecated. Please use tf.compat.v1.assign instead.



Train on 20000 samples, validate on 5000 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30


<keras.callbacks.History at 0x187500438>

In [27]:
model.load_weights(weights_file)
model.save('model_1.h5')

In [28]:
a = ('python is the most wonderful language ' * 100, 1)
x, y = vectorize_pad_sample(a, wv_model)
x.shape

(100, 50)

In [29]:
model.predict(np.array([x]))

array([[0.8178611]], dtype=float32)