In [1]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Bidirectional, Lambda
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import backend as K
from keras.utils.data_utils import get_file

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
import re
import numpy as np
import pickle

In [3]:
# Ordered (pattern, replacement) rewrite rules applied by clean_text().
# Order matters: contractions must be expanded before the bare apostrophe is
# stripped, and the final rule collapses the whitespace the others introduce.
_CLEAN_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # Keep only letters, digits and the listed punctuation. The hyphen is
        # escaped so it is a literal "-" rather than forming the accidental
        # range "+" .. "=" (which also let ";" and "<" through); ":" is listed
        # explicitly because a later rule pads it with spaces.
        (r"[^A-Za-z0-9^,!.\/'+\-=:]", " "),
        # Contractions.
        (r"what's", "what is "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "cannot "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        # Punctuation: drop separators, pad the symbols that are kept.
        (r",", " "),
        (r"\.", " "),
        (r"!", " ! "),
        (r"\/", " "),
        (r"\^", " ^ "),
        (r"\+", " + "),
        (r"\-", " - "),
        (r"\=", " = "),
        (r"'", " "),
        # "10k" -> "10000"; the word boundary keeps "5kg" / "5km" intact.
        (r"(\d+)k\b", r"\g<1>000"),
        (r":", " : "),
        # Abbreviations that the punctuation rules above split apart.
        (r" e g ", " eg "),
        (r" b g ", " bg "),
        (r" u s ", " american "),
        # NOTE(review): "\0" is a NUL escape, and NULs were already replaced by
        # the first rule, so this never matches. The intent was probably
        # "0s" -> "0" (e.g. "1990s"); confirm before enabling that behaviour.
        (r"\0s", "0"),
        # Keep the surrounding spaces so neighbouring words are not glued on.
        (r" 9 11 ", " 911 "),
        # Word boundaries stop these firing inside other words
        # (e.g. "the - mail", "raj kumar").
        (r"\be - mail\b", "email"),
        (r"\bj k\b", "jk"),
        # Collapse runs of whitespace created by the padding above.
        (r"\s{2,}", " "),
    )
)


def clean_text(text):
    """Normalise raw English text into lower-case, space-separated tokens.

    The text is lower-cased and then passed through ``_CLEAN_RULES`` in order:
    unsupported characters become spaces, common contractions are expanded,
    punctuation is dropped or padded with spaces, ``<digits>k`` becomes
    ``<digits>000``, a few split abbreviations are re-joined, and runs of
    whitespace are collapsed to a single space.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned text. Leading/trailing single spaces are not stripped.
    """
    text = text.lower()
    for pattern, replacement in _CLEAN_RULES:
        text = pattern.sub(replacement, text)
    return text


In [4]:
import re
import numpy as np

In [7]:
from keras.datasets import imdb
# Load the IMDB reviews as sequences of integer word ids plus their labels.
# index_from=3 offsets the word ids by 3, matching the `v+3` shift applied to
# imdb.get_word_index() when word_to_id is built; ids 0-2 are given to the
# <PAD>/<START>/<UNK> markers there.
(X_train, y_train), (X_test, y_test) = imdb.load_data(index_from=3)


In [8]:
# Word -> id lookup: every index from the IMDB word index is shifted up by 3
# so that ids 0, 1 and 2 are free for the special markers added below.
word_to_id = {word: index + 3 for word, index in imdb.get_word_index().items()}
word_to_id.update({"<PAD>": 0, "<START>": 1, "<UNK>": 2})

# Reverse lookup (id -> word), used to decode the integer sequences into text.
id_to_word = {index: word for word, index in word_to_id.items()}

In [12]:
# Decode each training review from word ids back to a space-separated string.
X_train_text = [
    ' '.join(id_to_word[token_id] for token_id in review)
    for review in X_train
]

In [13]:
# Turn the decoded reviews into a NumPy array; this rebinds X_train_text,
# which was a plain Python list up to this point.
X_train_text = np.array(X_train_text)

"<START> this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert redford's is an amazing actor and now the same being director norman's father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for retail and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also congratulations to the two little boy's that played the part's of norman and paul they were just brilliant children are often left out of the praising list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and sh

In [5]:
# !pip install gensim
import gensim

In [0]:
# Colab-only cell: authenticate the current user for this runtime.
# NOTE(review): presumably required by the gcloud/gsutil cells that follow —
# confirm; this import is not available outside Google Colab.
from google.colab import auth
auth.authenticate_user()

In [11]:
# Google Cloud project and storage bucket used by the gcloud/gsutil commands.
project_id = 'data4democracy-wwymak-explore'
bucket_name = 'dl-models-wwymak' 

# Point the gcloud CLI at the project ({project_id} is interpolated by IPython).
!gcloud config set project {project_id}
# Bucket creation is left commented out; the recorded output of this cell
# shows the bucket already exists (409).
# Full reference: https://cloud.google.com/storage/docs/gsutil/commands/mb
# !gsutil mb gs://{bucket_name}

Updated property [core/project].
Creating gs://dl-models-wwymak/...
ServiceException: 409 Bucket dl-models-wwymak already exists.


In [13]:
# Copy the GoogleNews word2vec binary from the bucket to /tmp on this machine.
!gsutil cp gs://{bucket_name}/GoogleNews-vectors-negative300.bin /tmp/GoogleNews-vectors-negative300.bin

Copying gs://dl-models-wwymak/GoogleNews-vectors-negative300.bin...
\ [1 files][  3.4 GiB/  3.4 GiB]   89.6 MiB/s                                   
Operation completed over 1 objects/3.4 GiB.                                      


In [14]:
import os

# Location of the pretrained GoogleNews word2vec binary.
# "~" is a shell convention that Python's file APIs do not expand themselves,
# so it is resolved here to keep the path valid for whatever loader consumes it.
W2V_MODEL_PATH = os.path.expanduser('~/code_work/models/GoogleNews-vectors-negative300.bin')
# NOTE(review): the gsutil cell copies the file to /tmp/ (not ~/tmp/); when
# running on Colab this is presumably the path to use instead — confirm:
# W2V_MODEL_PATH = '/tmp/GoogleNews-vectors-negative300.bin'

In [16]:
import gensim

# Load the pretrained binary word2vec file, capped at 100000 vectors.
model = gensim.models.KeyedVectors.load_word2vec_format(
    W2V_MODEL_PATH,
    binary=True,
    limit=100000,
)

# Construct the embedding matrix from word2vec as per
# http://www.jacobsilterra.com/2017/05/03/classifying-text-with-keras-basic-text-processing/#Using_Pre-TrainedVectors
embedding_matrix = model.syn0
# embedding_matrix = model.wv.vectors

# Dictionary mapping from word --> row of embedding matrix
vocab_dict = {word: entry.index for word, entry in model.vocab.items()}

In [17]:
# Rows of the matrix = number of known words, columns = word-vector dimensionality.
n_symbols, vocab_dim = embedding_matrix.shape[:2]

# Frozen embedding layer that will carry the pretrained word2vec weights.
embedding_layer = Embedding(
    input_dim=n_symbols,
    output_dim=vocab_dim,
    trainable=False,
)
# The layer has to be built before weights can be assigned to it.
embedding_layer.build((None,))
embedding_layer.set_weights([embedding_matrix])


In [0]:
# Hyperparameters for the VAE experiment.
# NOTE: batch_size, epochs and latent_dim are assigned again (64 / 100 / 256)
# by a later cell in this notebook, so their values depend on execution order.
batch_size = 500
original_dim = 1024 # max number of words
latent_dim = 100
intermediate_dim = 256
epochs = 50
epsilon_std = 1.0

# Encoder input: one vector of original_dim values per sample.
x = Input(shape=(original_dim,))

# Look the inputs up in the frozen word2vec embedding layer, then map to the
# mean and log-variance of the latent distribution through one hidden layer.
# NOTE(review): the Embedding output presumably carries an extra trailing axis
# (batch, original_dim, vocab_dim), in which case the Dense layers below act
# per position and z_mean / z_log_var are not (batch, latent_dim) — confirm
# whether a pooling/flatten step is missing here.
embedded = embedding_layer(x)
h = Dense(intermediate_dim, activation='relu')(embedded)
z_mean = Dense(latent_dim)(h)
z_log_var = Dense(latent_dim)(h)

In [0]:
def sampling(args):
    """Draw a latent sample with the reparameterisation trick.

    Computes ``z_mean + exp(z_log_var / 2) * epsilon`` where ``epsilon`` is
    Gaussian noise with mean 0 and standard deviation ``epsilon_std`` (read
    from the notebook globals).

    The noise takes its shape from ``z_mean`` itself rather than from the
    global ``latent_dim``: that global is re-assigned by a later cell, and a
    hard-coded ``(batch, latent_dim)`` shape cannot be combined with a
    ``z_mean`` of any other rank or width.

    Parameters
    ----------
    args : sequence of two tensors
        ``(z_mean, z_log_var)``, both of the same shape.

    Returns
    -------
    tensor
        A sample with the same shape as ``z_mean``.
    """
    z_mean, z_log_var = args
    epsilon = K.random_normal(shape=K.shape(z_mean), mean=0.,
                              stddev=epsilon_std)
    return z_mean + K.exp(z_log_var / 2) * epsilon

In [0]:
# Wrap the sampling function in a layer so the latent sample z is part of the
# Keras graph, fed by the encoder's mean and log-variance outputs.
z = Lambda(sampling)([z_mean, z_log_var])

# we instantiate these layers separately so as to reuse them later
decoder_h = Dense(intermediate_dim, activation='relu')
decoder_mean = Dense(original_dim, activation='sigmoid')
# Decode the latent sample back to original_dim sigmoid outputs.
h_decoded = decoder_h(z)
x_decoded_mean = decoder_mean(h_decoded)

# instantiate VAE model
# NOTE(review): no loss or compile step is visible in this part of the
# notebook — confirm where the reconstruction/KL loss is attached.
vae = Model(x, x_decoded_mean)


In [0]:
# Hyperparameters for the LSTM encoder experiment below.
# NOTE: batch_size, epochs and latent_dim were already assigned by an earlier
# cell (500 / 50 / 100); running this cell rebinds those notebook globals.
batch_size = 64  # Batch size for training.
epochs = 100  # Number of epochs to train for.
latent_dim = 256  # Latent dimensionality of the encoding space.
embedding_dim = 300  # NOTE(review): presumably the word2vec vector size — confirm.

In [0]:
# attempt to implement a modified version of 
# https://github.com/unsuthee/VariationalDeepSemanticHashing/blob/master/VDSH.py
# with keras and word2vec vectors
class VDSH:
    """Skeleton for a Keras/word2vec variant of Variational Deep Semantic Hashing.

    Only the configuration is stored so far; the network itself has not been
    written yet.

    Parameters
    ----------
    latent_dim
        Stored on the instance as ``latent_dim``.
    n_feats_hidden
        Stored on the instance as ``n_feats_hidden``.
    """

    def __init__(self, latent_dim, n_feats_hidden):
        self.latent_dim = latent_dim
        self.n_feats_hidden = n_feats_hidden

    def network(self):
        """Build the model graph (not implemented yet).

        Raises
        ------
        NotImplementedError
            Always. The method previously had no body at all, which made the
            whole cell a syntax error; the explicit stub keeps the class
            importable while the network is still to be written.
        """
        raise NotImplementedError("VDSH.network() has not been implemented yet")

In [0]:
def network():
  encoder_inputs = Input(shape=(None, 300))
  encoder = LSTM(latent_dim, return_state=True)
  encoder_outputs, state_h, state_c = encoder(encoder_inputs)
  encoder_states = [state_h, state_c]