In [18]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Bidirectional, Lambda
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import backend as K
from keras.utils.data_utils import get_file
from keras.metrics import binary_crossentropy

In [7]:
import re
import numpy as np
import pickle

In [8]:
# Ordered (pattern, replacement) rewrites applied by clean_text after lower-casing.
# Order matters: contractions are expanded before the bare apostrophe is dropped,
# and punctuation is padded before the multi-token fixes that rely on that padding.
_CLEAN_TEXT_RULES = (
    # Drop everything outside the supported alphabet. '-' is escaped so that
    # '+-=' is not parsed as a character range (which used to let ';' and '<'
    # through); ':' is listed explicitly because a later rule pads it.
    (r"[^A-Za-z0-9^,!.\/'+\-=:]", " "),
    (r"what's", "what is "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"can't", "cannot "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    (r",", " "),
    (r"\.", " "),
    (r"!", " ! "),
    (r"\/", " "),
    (r"\^", " ^ "),
    (r"\+", " + "),
    (r"\-", " - "),
    (r"\=", " = "),
    (r"'", " "),
    # "5k" -> "5000"; the word boundary keeps units such as "5kg" intact.
    (r"(\d+)k\b", r"\g<1>000"),
    (r":", " : "),
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" u s ", " american "),
    # "1990s" -> "1990". The previous pattern r"\0s" was an octal NUL escape
    # and could never match once the filter above had removed NUL characters.
    (r"0s\b", "0"),
    # Keep the surrounding spaces so neighbouring words are not glued to "911".
    (r" 9 11 ", " 911 "),
    # Word boundaries stop these from firing inside longer words
    # ("the - mail", "raj kumar").
    (r"\be - mail\b", "email"),
    (r"\bj k\b", "jk"),
    (r"\s{2,}", " "),
)


def clean_text(text):
    """Normalise raw English text into lower-case, space-separated tokens.

    The text is lower-cased, unsupported characters are replaced by spaces,
    common contractions are expanded, the punctuation marks ``! ^ + - = :``
    are padded with spaces, ``, . / '`` are removed, and a few shorthand
    forms ("5k", "e-mail", "9 11", ...) are rewritten. Runs of whitespace are
    collapsed to a single space; leading/trailing whitespace is not stripped.

    Args:
        text: The input string.

    Returns:
        The cleaned string.
    """
    text = text.lower()
    for pattern, replacement in _CLEAN_TEXT_RULES:
        text = re.sub(pattern, replacement, text)
    return text


In [9]:
import re
import numpy as np

In [10]:
# Load the Keras IMDB dataset; each review arrives as a sequence of integer word ids.
# index_from=3 matches the +3 offset applied to the word index in the next cell,
# which keeps ids 0-2 free for the <PAD>/<START>/<UNK> markers.
from keras.datasets import imdb
(X_train, y_train), (X_test, y_test) = imdb.load_data(index_from=3)


In [11]:
# Word -> id lookup, shifted by 3 so it lines up with imdb.load_data(index_from=3).
word_to_id = {word: rank + 3 for word, rank in imdb.get_word_index().items()}
# Ids 0-2 are reserved for the special markers.
word_to_id.update({"<PAD>": 0, "<START>": 1, "<UNK>": 2})

# Reverse lookup (id -> word), used to turn id sequences back into text.
id_to_word = dict(zip(word_to_id.values(), word_to_id.keys()))

Downloading data from https://s3.amazonaws.com/text-datasets/imdb_word_index.json


In [12]:
# Decode every training review from its id sequence into a space-separated string.
X_train_text = [' '.join(id_to_word[token_id] for token_id in review) for review in X_train]

In [13]:
# Rebind X_train_text from a Python list to a NumPy array holding the same strings.
X_train_text = np.array(X_train_text)

In [5]:
# !pip install gensim
import gensim

In [0]:
# Colab-only: run the interactive Google account authentication flow.
# Presumably required before the gcloud/gsutil cells below can reach the bucket — confirm.
from google.colab import auth
auth.authenticate_user()

In [11]:
# Google Cloud project and storage bucket referenced by the gcloud/gsutil shell cells.
project_id = 'data4democracy-wwymak-explore'
bucket_name = 'dl-models-wwymak' 

# IPython shell escape: point the gcloud CLI at the project ({project_id} is interpolated).
!gcloud config set project {project_id}
# Full reference: https://cloud.google.com/storage/docs/gsutil/commands/mb
# Bucket creation is left disabled; the recorded output shows a 409 "already exists" response.
# !gsutil mb gs://{bucket_name}

Updated property [core/project].
Creating gs://dl-models-wwymak/...
ServiceException: 409 Bucket dl-models-wwymak already exists.


In [13]:
# IPython shell escape: copy the GoogleNews word2vec binary from the bucket to local /tmp.
!gsutil cp gs://{bucket_name}/GoogleNews-vectors-negative300.bin /tmp/GoogleNews-vectors-negative300.bin

Copying gs://dl-models-wwymak/GoogleNews-vectors-negative300.bin...
\ [1 files][  3.4 GiB/  3.4 GiB]   89.6 MiB/s                                   
Operation completed over 1 objects/3.4 GiB.                                      


In [1]:
# Location of the GoogleNews word2vec binary handed to gensim in the next cell.
# NOTE(review): the gsutil cell copies the file to /tmp/, while this points at ~/models/ —
# confirm which location applies in the environment being used.
W2V_MODEL_PATH='~/models/GoogleNews-vectors-negative300.bin'
# Alternative locations left by the author:
# W2V_MODEL_PATH='~/code_work/models/GoogleNews-vectors-negative300.bin'
# W2V_MODEL_PATH='~/tmp/GoogleNews-vectors-negative300.bin'

In [4]:
import gensim
# Read only the first 100000 entries of the binary word2vec file.
model = gensim.models.KeyedVectors.load_word2vec_format(W2V_MODEL_PATH, limit=100000, binary=True)
# Construct the embedding matrix from word2vec as per
# http://www.jacobsilterra.com/2017/05/03/classifying-text-with-keras-basic-text-processing/#Using_Pre-TrainedVectors
embedding_matrix = model.vectors
# embedding_matrix = model.syn0
# Lookup table: word --> row of the embedding matrix.
vocab_dict = {word: entry.index for word, entry in model.vocab.items()}

In [14]:
# Layer dimensions come straight from the word2vec matrix:
# n_symbols rows (one per word) by vocab_dim columns (word-vector dimensionality).
n_symbols, vocab_dim = embedding_matrix.shape
# Frozen embedding layer that carries the pretrained vectors.
embedding_layer = Embedding(input_dim=n_symbols, output_dim=vocab_dim, trainable=False)
# The layer must be built before weights can be assigned to it.
embedding_layer.build((None,))
embedding_layer.set_weights([embedding_matrix])


In [0]:
# Hyper-parameters for the VAE experiment.
# NOTE(review): the next cell redefines most of these (and uses LATENT_DIM rather than
# latent_dim); consider keeping a single configuration cell.
batch_size = 500
ORIGINIAL_DIM = 1024 # max number of words
latent_dim = 100
NUM_FEATS_HIDDEN = 256
epochs = 50
epsilon_std = 1.0 # stddev of the noise drawn in VDSH.sampling



In [0]:

# Not referenced by the visible cells; 300 matches the "negative300" word2vec file name.
embedding_dim = 300

# Defaults consumed by VDSH.__init__ (the names must stay in sync with that signature).
ORIGINIAL_DIM = 1024 # max number of words
LATENT_DIM = 100 # Latent dimensionality of the encoding space.
NUM_FEATS_HIDDEN = 256
epochs = 50
epsilon_std = 1.0 # stddev of the noise drawn in VDSH.sampling

In [0]:
# Attempt to implement a modified version of
# https://github.com/unsuthee/VariationalDeepSemanticHashing/blob/master/VDSH.py
# with Keras and word2vec vectors.
class VDSH:
    """Variational autoencoder over word2vec-embedded token-id sequences.

    Relies on the notebook globals ``embedding_layer`` (frozen word2vec
    Embedding) and ``epsilon_std`` (stddev of the sampling noise).

    Args:
        latent_dim: Size of the latent code z.
        n_feats_hidden: Stored on the instance; not used by ``network``.
        original_dim: Length of the input id sequence and width of the
            decoder output.
    """

    def __init__(self, latent_dim=LATENT_DIM, n_feats_hidden=NUM_FEATS_HIDDEN, original_dim=ORIGINIAL_DIM):
        self.latent_dim = latent_dim
        self.n_feats_hidden = n_feats_hidden
        # Width of every encoder/decoder hidden layer.
        self.hidden_dim = 500
        self.original_dim = original_dim

    def sampling(self, args):
        """Reparameterisation trick: draw z = mean + exp(log_var / 2) * eps.

        Args:
            args: ``[z_mean, z_log_var]`` tensors of shape (batch, latent_dim).

        Returns:
            A sampled latent tensor with the same shape as ``z_mean``.
        """
        z_mean, z_log_var = args
        epsilon = K.random_normal(shape=(K.shape(z_mean)[0], self.latent_dim), mean=0.,
                                  stddev=epsilon_std)
        return z_mean + K.exp(z_log_var / 2) * epsilon

    def network(self):
        """Build the VAE graph and return it with the VAE loss attached.

        Returns:
            An uncompiled Keras ``Model`` mapping the token-id input to the
            decoder output. The loss is registered through ``add_loss``, so
            the caller only needs ``model.compile(optimizer=...)``.
        """
        x = Input(shape=(self.original_dim,))
        # A layer call takes no `name` argument; the Embedding keeps its own name.
        embedded = embedding_layer(x)
        # The embedding output is (batch, words, embedding_dim). Average over the
        # word axis so the encoder sees one vector per document; without this the
        # (batch, latent_dim) noise drawn in sampling() cannot be combined with z_mean.
        # NOTE(review): padded positions are included in the average.
        doc_vector = Lambda(lambda seq: K.mean(seq, axis=1), name='doc_vector')(embedded)

        h1 = Dense(self.hidden_dim, activation='relu', name='enc1')(doc_vector)
        h2 = Dense(self.hidden_dim, activation='relu', name='enc2')(h1)
        h3 = Dropout(0.3, name='enc3')(h2)

        z_mean = Dense(self.latent_dim, activation='linear', name='z_mean')(h3)
        z_log_var = Dense(self.latent_dim, activation='sigmoid', name='z_log_var')(h3)

        z = Lambda(self.sampling, name='z')([z_mean, z_log_var])

        # The decoder mirrors the encoder width (there is no separate intermediate_dim).
        decoder_h = Dense(self.hidden_dim, activation='relu', name='dec1')
        decoder_mean = Dense(self.original_dim, activation='sigmoid', name='x_decoded_mean')
        h_decoded = decoder_h(z)
        x_decoded_mean = decoder_mean(h_decoded)

        # Instantiate the VAE model.
        vae = Model(x, x_decoded_mean)

        # VAE loss = reconstruction term + KL divergence to the unit Gaussian prior.
        # NOTE(review): x holds token ids, not values in [0, 1]; the cross-entropy
        # target probably needs to become a bag-of-words vector — confirm.
        xent_loss = self.original_dim * binary_crossentropy(x, x_decoded_mean)
        kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
        vae.add_loss(K.mean(xent_loss + kl_loss))
        return vae


In [0]:
# Module-level copy of the decoder / loss wiring that also appears in VDSH.network.
# NOTE(review): x, z, z_mean, z_log_var, intermediate_dim, original_dim and metrics are
# not defined at module level in the visible cells, so this cell does not look runnable
# on a fresh kernel — confirm, then either remove it or drive it from the VDSH class.
decoder_h = Dense(intermediate_dim, activation='relu')
decoder_mean = Dense(original_dim, activation='sigmoid')
h_decoded = decoder_h(z)
x_decoded_mean = decoder_mean(h_decoded)

# instantiate VAE model
vae = Model(x, x_decoded_mean)

# Compute VAE loss
xent_loss = original_dim * metrics.binary_crossentropy(x, x_decoded_mean)
kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
vae_loss = K.mean(xent_loss + kl_loss)

# The loss is attached to the model itself, so compile() receives no loss argument.
vae.add_loss(vae_loss)
vae.compile(optimizer='rmsprop')
vae.summary()




def vae_loss(x, x_decoded_mean, z_mean, z_log_var, original_dim):
    """Return the scalar VAE loss: reconstruction term plus KL divergence.

    Args:
        x: Input tensor the decoder is trying to reconstruct.
        x_decoded_mean: Decoder output tensor.
        z_mean: Mean of the approximate posterior.
        z_log_var: Log-variance of the approximate posterior.
        original_dim: Factor applied to the per-sample cross-entropy.

    Returns:
        The mean over the batch of (cross-entropy + KL) as a backend tensor.
    """
    # binary_crossentropy is imported directly from keras.metrics at the top of
    # the notebook; there is no `metrics` module object in scope.
    xent_loss = original_dim * binary_crossentropy(x, x_decoded_mean)
    kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
    return K.mean(xent_loss + kl_loss)


def calc_reconstr_error(self):
    """Negative log-likelihood of the words that are present in the document.

    Reads ``self.p_x_i``, ``self.input_bow`` and ``self.input_bow_idx``.
    NOTE(review): uses the TensorFlow 1.x API through a module named ``tf``,
    which the visible cells never import — confirm before calling.
    """
    # Pick the scores for the visible words only.
    p_x_i_scores0 = tf.gather(self.p_x_i, self.input_bow_idx)
    weight_scores0 = tf.gather(tf.squeeze(self.input_bow), self.input_bow_idx)
    # Clamp at 1e-10 so the log never sees zero.
    return -tf.reduce_sum(tf.log(tf.maximum(p_x_i_scores0 * weight_scores0, 1e-10)))


def calc_KL_loss(self):
    """KL divergence between the approximate posterior and a unit Gaussian.

    Reads ``self.z_mean`` and ``self.z_log_var``; sums over the latent axis
    and then over the batch.
    """
    return -0.5 * tf.reduce_sum(tf.reduce_sum(1 + self.z_log_var - tf.square(self.z_mean)
                                              - tf.exp(self.z_log_var), axis=1))

In [1]:
class MedianHashing(object):
    """Turn real-valued codes into bits by thresholding each column at its median."""

    def __init__(self):
        # Both attributes stay None until fit() has seen data.
        self.threshold = None
        self.latent_dim = None

    def fit(self, X):
        """Store the per-column median of X and its number of columns."""
        self.threshold = np.median(X, axis=0)
        self.latent_dim = X.shape[1]

    def transform(self, X):
        """Return an int array of X's shape: 1 where X >= the fitted median, else 0.

        Raises:
            AssertionError: if X's column count differs from the fitted one.
        """
        assert(X.shape[1] == self.latent_dim)
        # Broadcasting compares every column against its own threshold.
        at_or_above = X >= self.threshold
        return at_or_above.astype(int)

    def fit_transform(self, X):
        """Fit on X, then binarise the same X."""
        self.fit(X)
        return self.transform(X)