In [1]:
import random
import numpy as np  
import pandas as pd
from keras.preprocessing import sequence
from keras.preprocessing import text
from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.utils import Sequence
from keras.layers import Input, LSTM, CuDNNLSTM, Dense, Bidirectional, BatchNormalization, Dropout, Reshape, Concatenate, Add
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from keras import backend as K
from keras import regularizers
from keras import optimizers
from keras.utils import to_categorical
from keras.models import load_model, model_from_json

import time
import datetime
from scipy import spatial


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Assemble dataset

In [2]:
regions = ["albuquerque", "billings", "calgary", "charlotte", "chicago", "cincinnati", "denver", "houston", "kansas city",
       "las vegas", "los angeles", "minneapolis", "montreal", "nashville", "new york", "oklahoma city", "phoenix",
       "pittsburgh", "san francisco", "seattle", "tampa", "toronto", "washington"]
df = pd.read_csv('data/tweets_labelled_balanced.csv', nrows=300000)
df.dropna(inplace=True)
df.region = df.region.astype(int)
df['text'] = df['text'].apply(lambda x:x.lower())
X = df['text'].tolist()
X2 = ["<s> "+x+" <e>" for x in X]
X3 = [x+" <e>" for x in X]
y = df['region'].tolist()

In [3]:
np.unique(df['region'])

array([ 3,  4,  5,  7, 10, 13, 14, 15, 18, 19, 20, 21, 22])

In [3]:
# Set Parameters
training_ratio = .75
training_size = int(len(X)*training_ratio)
num_classes = 23
target_num_words = 10000
H = 500
epochs = 100
batch_size = 64
learning_rate = .001
embedding_vector_length = 200

In [4]:
# Encode strings
t = text.Tokenizer(num_words=target_num_words, lower=True, char_level=False, filters='')

# Convert strings to sequences, pad them to uniform length, and divide up training and test sets
t.fit_on_texts(X2)
word_index = t.word_index
V = target_num_words + 2 #len(word_index)+1
index_word = {v: k for k, v in t.word_index.items()}
X_seq = t.texts_to_sequences(X)
X2_seq = t.texts_to_sequences(X2)
X3_seq = t.texts_to_sequences(X3)
x_length = max(len(x) for x in X2_seq)
X_padded = sequence.pad_sequences(X_seq, maxlen=x_length, padding='post')
X2_padded = sequence.pad_sequences(X2_seq, maxlen=x_length, padding='post')
X3_padded = sequence.pad_sequences(X3_seq, maxlen=x_length, padding='post')

X_train = X_padded[:training_size]
X2_train = X2_padded[:training_size]
X3_train = X3_padded[:training_size]
X_test = X_padded[training_size:]
X2_test = X2_padded[training_size:]
X3_test = X3_padded[training_size:]
y_train = y[:training_size]
y_test = y[training_size:]

# One-hot encode labels
encoded_y_train = to_categorical(y_train, num_classes=num_classes)
encoded_y_test = to_categorical(y_test, num_classes=num_classes)

#X_train_target = to_categorical(X3_train, num_classes=V)

In [86]:
print("There are {} unique words in this dataset, but we're only using the top {}.".format(len(word_index), V))

There are 802015 unique words in this dataset, but we're only using the top 10002.


## Load Classifier Model for validation

In [6]:
# Load the Classifier Model and set its variables

def load_model_weights(model_filename, model_weights_filename):
    with open(model_filename, 'r', encoding='utf8') as f:
        model = model_from_json(f.read())
    model.load_weights(model_weights_filename)
    return model

cls_model = load_model_weights('models/Classifier_full_balanced_1Bi1L.json', 
                               'models/Classifier_full_balanced_1Bi1L_rms_weights.h5')

cls_df = pd.read_csv('data/tweets_labelled_balanced.csv')
cls_df.dropna(inplace=True)
cls_df.region = cls_df.region.astype(int)
cls_X = cls_df['text'].tolist()
cls_t = text.Tokenizer(num_words=10000, lower=True)
cls_t.fit_on_texts(cls_X)


In [7]:
# Test classifier model is loaded and working properly

def classify(tweets, target=False):
    '''
    Given a list of strings, return the predicted region for each string.  If a target is provided,
    return the probability that each string is from the target region.
    '''
    test_sequence = cls_t.texts_to_sequences(tweets)
    test_padded = sequence.pad_sequences(test_sequence, maxlen=50)
    test_prediction_probs = cls_model.predict_on_batch(test_padded)
    if target:
        return test_prediction_probs[:,target]
    else:
        return np.argmax(test_prediction_probs, axis=1)

    
tweet_list = ["if you're looking for work in va, check out this #job: #hiring #careerarc", 
              "i'm at cassell’s burgers in los angeles, ca",
              "go seahawks!",
              "dude i have a startup.  want to invest in electric scooters?"]

classify(tweet_list)

array([22, 10, 19, 18])

## Seq2Seq Autoencoder Model

In [5]:
# Generator to feed batches into the model
class OneHotBatch(Sequence):
  def __init__(self, X_data, X2_data, X3_data, y_data, batch_size, V, num_classes):
    self.X_data = X_data
    self.X2_data = X2_data
    self.X3_data = X3_data
    self.y_data = y_data
    self.batch_size = batch_size
    self.V = V
    self.num_classes = num_classes

  def __len__(self):
     return int(np.ceil(len(self.X_data) / float(self.batch_size)))

  def __getitem__(self, batch_id):
    start = batch_id * self.batch_size
    finish = start + self.batch_size
    X = self.X_data[start:finish]
    X2 = self.X2_data[start:finish]
    X3 = to_categorical(self.X3_data[start:finish], num_classes=self.V)
    y = to_categorical(self.y_data[start:finish], num_classes=self.num_classes)

    return [X, X2], [y, X3]

In [6]:
# Load Glove embeddings
embeddings_index = {}
f = open('data/glove.6B.200d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

embedding_matrix = np.zeros((V, embedding_vector_length))
for word, i in word_index.items():
    if i == V:
        break
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

Found 400000 word vectors.


In [7]:
# Define the model
# 1Bi x 1LSTM
# 9m params with 1000 words

# define training encoder
encoder_inputs = Input(shape=(None, ), name="encoder_input")
encoder = Embedding(V, embedding_vector_length, weights=[embedding_matrix], trainable=True, 
                    name="encoder_embedding")(encoder_inputs)
encoder_outputs, forward_h, forward_c, backward_h, backward_c = Bidirectional(CuDNNLSTM(H, return_state=True), 
                                                                              name="encoder_lstm_1")(encoder)
#encoder_dropout = Dropout(0.2)
encoder_dense = Dense(num_classes, activation='softmax', name="encoder_dense")
#encoder_outputs = encoder_dropout(encoder_outputs)
encoder_outputs = encoder_dense(encoder_outputs)
encoder_fstates = [forward_h, forward_c]
encoder_bstates = [backward_h, backward_c]


# define training decoder
decoder_inputs = Input(shape=(None, ), name="decoder_input")
decoder_embedding = Embedding(V, embedding_vector_length, name="decoder_embedding")
embedded_input = decoder_embedding(decoder_inputs)
decoder_flstm = CuDNNLSTM(H, return_sequences=True, return_state=True, name="decoder_flstm_1")
decoder_blstm = CuDNNLSTM(H, return_sequences=True, return_state=True, name="decoder_blstm_1")
decoder_foutputs, _, _ = decoder_flstm(embedded_input, initial_state=encoder_fstates)
decoder_boutputs, _, _ = decoder_blstm(embedded_input, initial_state=encoder_bstates)
decoder_outputs = Concatenate(name="concatenate_decoder_outputs_1")([decoder_foutputs, decoder_boutputs])
decoder_dense = Dense(V, activation='softmax', name="decoder_dense")
decoder_outputs = decoder_dense(decoder_outputs)

# Combine training inputs into a single training model
model = Model([encoder_inputs, decoder_inputs], [encoder_outputs, decoder_outputs])

# define inference encoder
encoder_model = Model(encoder_inputs, [encoder_outputs] + encoder_fstates + encoder_bstates)

# define inference decoder
decoder_state_input_fh = Input(shape=(H,), name="decoder_state_input_fh")
decoder_state_input_fc = Input(shape=(H,), name="decoder_state_input_fc")
decoder_state_input_bh = Input(shape=(H,), name="decoder_state_input_bh")
decoder_state_input_bc = Input(shape=(H,), name="decoder_state_input_bc")
decoder_fstates_inputs = [decoder_state_input_fh, decoder_state_input_fc]
decoder_bstates_inputs = [decoder_state_input_bh, decoder_state_input_bc]

decoder_input_f2 = decoder_embedding(decoder_inputs)
decoder_input_b2 = decoder_embedding(decoder_inputs)

decoder_foutputs, state_fh, state_fc = decoder_flstm(decoder_input_f2, initial_state=decoder_fstates_inputs)
decoder_boutputs, state_bh, state_bc = decoder_blstm(decoder_input_b2, initial_state=decoder_bstates_inputs)

decoder_fstates = [state_fh, state_fc]
decoder_bstates = [state_bh, state_bc]
decoder_outputs = Concatenate(name="concatenate_decoder_outputs_2")([decoder_foutputs, decoder_boutputs])

decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model([decoder_inputs] + decoder_fstates_inputs + decoder_bstates_inputs, 
                      [decoder_outputs] + decoder_fstates + decoder_bstates)

In [8]:
# Fit the model
start_time = time.time()

# Generators
train_generator = OneHotBatch(X_data=X_train, X2_data=X2_train, X3_data=X3_train, y_data=y_train, 
                              batch_size=batch_size, V=V, num_classes=num_classes)
validation_generator = OneHotBatch(X_data=X_test, X2_data=X2_test, X3_data=X3_test, y_data=y_test, 
                              batch_size=batch_size, V=V, num_classes=num_classes)

# Compile and train the model
opt = optimizers.Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False, clipvalue=.05)

model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['acc'], loss_weights=[1.0,30.0])
model.summary()
callbacks = [EarlyStopping(monitor='val_decoder_dense_loss', patience=3, min_delta=.01, restore_best_weights=True),
             TensorBoard(log_dir='./logs/Twitter_300k10k_1BLx2L_nodrop', histogram_freq=0, batch_size=32, write_graph=False, 
                         write_grads=True, write_images=True, embeddings_freq=0, embeddings_layer_names=None, 
                         embeddings_metadata=None, embeddings_data=None, update_freq='epoch')]

model.fit_generator(generator=train_generator, callbacks=callbacks, epochs=12, validation_data=validation_generator)

# Final evaluation of the model
end_time = time.time()
run_time = datetime.timedelta(seconds=end_time-start_time)
print("Finished in {}".format(run_time))

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
encoder_embedding (Embedding)   (None, None, 200)    2000400     encoder_input[0][0]              
__________________________________________________________________________________________________
decoder_input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
encoder_lstm_1 (Bidirectional)  [(None, 1000), (None 2808000     encoder_embedding[0][0]          
__________________________________________________________________________________________________
decoder_em

## Saving and Loading the Model

In [9]:
# Save the encoder/decoder model and weights to disk
model_name = 'Twitter_300k10k_1BLx2L_nodrop_8e'

model_weights = model_name+"_weights"
encoder_name = model_name+"_encoder"
decoder_name = model_name+"_decoder"
encoder_weights = encoder_name+"_weights"
decoder_weights = decoder_name+"_weights"

with open("models/{}.json".format(model_name), 'w', encoding='utf8') as f:
    f.write(model.to_json())
model.save_weights("models/{}.h5".format(model_weights))

with open("models/{}.json".format(encoder_name), 'w', encoding='utf8') as f:
    f.write(encoder_model.to_json())
encoder_model.save_weights("models/{}.h5".format(encoder_weights))

with open("models/{}.json".format(decoder_name), 'w', encoding='utf8') as f:
    f.write(decoder_model.to_json())
decoder_model.save_weights("models/{}.h5".format(decoder_weights))

  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '


In [None]:
# Load Trained Model
from keras.models import load_model, model_from_json

def load_model_weights(model_filename, model_weights_filename):
    with open(model_filename, 'r', encoding='utf8') as f:
        model = model_from_json(f.read())
    model.load_weights(model_weights_filename)
    return model

#model_name = 'Twitter_Translator_100k5k_1Bix1L_masked'
model_name = 'Twitter_100k5k_1Bix1L_rms'
model = load_model('models/{}.h5'.format(model_name))

encoder_model = load_model_weights('encoder_model.json', 'models/Twitter_100k5k_1Bix1L_rms_encoder_weights.h5')
decoder_model = load_model_weights('decoder_model.json', 'models/Twitter_100k5k_1Bix1L_rms_decoder_weights.h5')

In [12]:
##### Test that the loaded model is working correctly
translate(X[20], 10)

('see our latest #seattle, wa #job and click to apply: software quality engineer -  #delljobs #qa #hiring #careerarc',
 'see our latest #sanfrancisco, #job now: click and to software software to be id - #indianapolis, #hiring #careerarc')

## Helper Functions

In [11]:
def check_fidelity(source):
    return (decode_padded_sequence(source), decode_padded_sequence(predict_sequence(source)[0]))

def cosine_similarity(emb1, emb2):
    return 1 - spatial.distance.cosine(emb1, emb2)

def check_fidelity_batch(source_array, num):
    translator_results = {'Source':[], 'Decoded':[], 'Fidelity': []}
    fidelities = []
    for _ in range(num):
        source_embeddings = get_embeddings(source_array[_])
        translation = predict_sequence(source_array[_], translate=False)
        translation_embeddings = get_embeddings(translation)
        fidelity = cosine_similarity(source_embeddings, translation_embeddings)
        translator_results['Source'].append(decode_padded_sequence(source_array[_]))
        translator_results['Decoded'].append(decode_padded_sequence(translation))
        translator_results['Fidelity'].append(fidelity)
        fidelities.append(fidelity)
    df_fidbatch = pd.DataFrame.from_dict(translator_results)[['Source', 'Decoded', 'Fidelity']]
    fidscore = np.mean(fidelities)
    return (df_fidbatch, fidscore)

def translate(source, target, probs=False):
    return_string = False
    if type(source) == str:
        source_sentence = source
        source_sequence = t.texts_to_sequences([source])
        source_padded = sequence.pad_sequences(source_sequence, maxlen=x_length, padding='post')
        return_string = True
    else:
        source_sentence = decode_padded_sequence(source)
        source_padded = source
    if probs:
        source_sentence = source_sentence
        source_prediction = get_nb_prediction(source_sentence)
        source_probstring = "({}, {:.2f}%)".format(region_dict[source_prediction[0][0]], source_prediction[1][0]*100)
        target_sentence = decode_padded_sequence(predict_sequence(source_padded, translate=target))
        target_prediction = get_nb_prediction(target_sentence)
        target_probstring = "({}, {:.2f}%)".format(region_dict[target_prediction[0][0]], target_prediction[1][0]*100)
        return  (source_sentence, source_probstring, target_sentence, target_probstring)
    else:
        if return_string:
            return (source_sentence, decode_padded_sequence(predict_sequence(source_padded, translate=target)))
        else:
            return (source_padded, predict_sequence(source_padded, translate=target))

def translate_batch(source_array, target, num, probs=False):
    translator_results = {'Source':[], 'Source_Prob': [], 'Translation':[], 'Translation_Prob':[]}
    for _ in range(num):
        translation = translate(source_array[_], target, False)
        translator_results['Source'].append(translation[0])
        translator_results['Translation'].append(translation[1])
    if probs:
        source_probs = get_nb_prediction(translator_results['Source'])
        trans_probs = get_nb_prediction(translator_results['Translation'])
        for x in range(len(translator_results['Source'])):
            translator_results['Source_Prob'].append("{}, {:.2f}%".format(region_dict[source_probs[0][x]], 
                                                                            source_probs[1][x]*100))
            translator_results['Translation_Prob'].append("{}, {:.2f}%".format(region_dict[trans_probs[0][x]], 
                                                                            trans_probs[1][x]*100))       
        return pd.DataFrame.from_dict(translator_results)[['Source', 'Source_Prob', 'Translation', 'Translation_Prob']]
    else:
        return pd.DataFrame.from_dict(translator_results)[['Source', 'Translation']]

def predict_sequence(source, translate=False, beam=False):
    '''
    Given a source array, feed it through the autoencoder to predict a string - either itself in the naive case 
    where translation is turned off, or run gradient ascent to convert the source array to a target category,
    and run that through the autoencoder to get the translated version.
    '''
    source = source.reshape(1, x_length)
    # feed the source into the encoder inference model
    encode = encoder_model.predict(source)
    
    # If set to translate, run gradient ascent to maximize to the target_label
    if translate:
        fstate, bstate = gradient_ascent(source, encoder_model, translate)
        decode_sequence = decode_latent(fstate, bstate, source=source, target_cat=translate, beam=beam)
    else:
        fstate = [encode[1:][0], encode[1:][1]]
        bstate = [encode[1:][2], encode[1:][3]]
        decode_sequence = decode_latent(fstate, bstate)
    
    return decode_sequence

def normalize(x):
    # utility function to normalize a tensor by its L2 norm, used to scale gradient ascent
    # return x / (K.sqrt(K.mean(K.square(x))) + K.epsilon())
    return x / K.max(x)

def gradient_ascent(seq, model, target, show_steps=False, step_rate=.5, target_prob=.9):
    '''
    Run gradient ascent to maximize a sequence to a target category.  Returns final state values.
    '''
    target_probability = target_prob # You want the model to be this certain the string is in the target category
    
    # Identify the target model layers and tensors for input and output
    layer_dict = dict([(layer.name, layer) for layer in model.layers[1:]])
    lstm_input = layer_dict['encoder_lstm_1'].input # Use the layer that accepts the embeddings
    model_input = model.input
    
    loss = K.mean(model.output[0][:, target]) # The loss value for the target category
    states = K.identity(model.output[1:]) # The h, c values for this iteration
    grads = K.gradients(loss, lstm_input)[0] # The gradients for the lstm_input layer w/respect to the loss
    grads = normalize(grads) # Play with this function to scale the speed of the ascent
    
    # Define input/output functions
    get_embeddings = K.function([model_input], [lstm_input])
    run_ascent = K.function([lstm_input], [loss, grads, states])

    # Input sequence to model to initiate the ascent
    embeddings_value = get_embeddings([seq])[0]
    
    # Iterate through the model until loss exceeds target probability
    counter = 0
    while counter < 20:
        loss_value, grads_value, states_value = run_ascent([embeddings_value])
        target_shape = states_value.shape[2]
        fstate = [np.reshape(states_value[0][0], (1,target_shape)), np.reshape(states_value[1][0], (1,target_shape))]
        bstate = [np.reshape(states_value[2][0], (1,target_shape)), np.reshape(states_value[3][0], (1,target_shape))]        
        if show_steps:
            print("{}. ({:.2f}) {}".format(counter, loss_value, decode_padded_sequence(decode_latent(fstate, bstate))))
        return_value = (fstate, bstate)
        if loss_value > target_probability and not show_steps: # Exit the ascent
            return return_value if not show_steps else "Complete"
            break
        elif loss_value <= 0.: # Some inputs can zero out the loss
            break
        else:
            grads_value = grads_value * step_rate
            embeddings_value += grads_value
            counter += 1

def decode_latent(fstate, bstate, source=None, target_cat=None, beam=False):
    '''
    Given a pair of state vectors, run iteratively through the decoder to build an output sequence.
    Returns the padded output sequence of the most likely sentence, unless 'beam_search' is set
    to True, in which case it returns the most probably sequence and an array of the top 5 most
    likely sequences as a tuple.
    '''
    # start of sequence input
    target_seq = np.zeros((1,1))
    target_seq[0, 0] = 1 # Start Character index
    output_seq = list()
    output_array = np.ones((5,1), dtype=int)
    stop = False
    word_counter = 1
    while not stop:
        output_tokens, fh, fc, bh, bc = decoder_model.predict([target_seq] + fstate + bstate)
        output_tokens = output_tokens[0, -1, :]

        # Sample the top n tokens
        top_tokens = np.argpartition(output_tokens, -5)[-5:]
        top_tokens = top_tokens[np.argsort(output_tokens[top_tokens])][::-1]
        top_token = top_tokens[0]

        # Exit if sampled character is end token or we've reached max length
        decoded_char = index_word[top_token] if top_token > 0 else ''
        if (decoded_char == '<e>' or word_counter + 1 == x_length):
            stop = True
            break

        # If we haven't reached the end of the sentence...
        output_seq.append(top_token)
        top_tokens = np.expand_dims(top_tokens, axis=1)
        if word_counter > 1:
            output_array = np.concatenate((output_array, top_tokens), axis=1)
        else:
            output_array = top_tokens

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = top_token

        # Update states and counter
        fstate = [fh, fc]
        bstate = [bh, bc]
        word_counter += 1
    
    if beam:
        beam_array = beam_search(source, output_array, target_cat, tag_weight=0)

    while len(output_seq) < x_length:
        output_seq.append(0)
    return (np.array([output_seq]), output_array) if beam else np.array([output_seq])


def beam_search(source, dec_array, target_cat, tag_weight=0):
    '''
    Given a source sequence and a an array of the top-5 potential words for each position from the decoder,
    optimize the potential words into the best five sentences according to both target region score
    and part-of-speech tag similarity.  'tag_weight' is set between -1 (full weight to regional score) to
    1 (full weight to tag similarity--NOT IMPLEMENTED).
    '''
    target_tags = pos_tagger([decode_padded_sequence(source[0])])
    buffer = dec_array[:,0].reshape(5,1)
    for i in range(1, dec_array.shape[1]):
        candidates = None # build the list of 25 new candidate sentences
        for j in dec_array[:,i]: # iterate through the next top 5 words
            for k in buffer: # for each of the top 5 previous sentences
                candidate = k.copy()
                candidate = np.append(candidate, j)
                if candidates is not None:
                    candidates = np.vstack([candidates, candidate])
                else:
                    candidates = candidate

        # From the 25 candidates, narrow down to the top 5
        sentences = []
        for c in candidates:
            sentences.append(decode_padded_sequence(c))
        tag_score = match_tags(sentences, target_tags[0][:i])
        cat_score = get_nb_prediction(sentences, target_cat)
        sen_score = (tag_score + cat_score) / 2
        top_sequences = np.argpartition(sen_score, -5)[-5:]
        top_sequences = top_sequences[np.argsort(sen_score[top_sequences])][::-1]
        top_5_sequences = candidates[top_sequences]
        buffer = top_5_sequences
    return buffer

def pos_tagger(tweets):
    ''' Fetch the Part-of-Speech tags for this text from the CMU Tweet Tagger.'''
    tags = []
    p = Popen('java -XX:ParallelGCThreads=2 -Xmx500m -jar ~/ark-tweet-nlp/ark-tweet-nlp-0.3.2.jar --no-confidence', 
                   stdin=PIPE, shell=True, stderr=PIPE, stdout=PIPE)
    stdout_data = p.communicate(input='\n'.join(tweets).encode())[0].decode("utf-8")
    stdout_data = stdout_data.split('\n')
    p.kill()
    for response in stdout_data[:-1]:
        tag_string = re.sub(' ', '', response.split('\t')[1])
        tags.append(tag_string)
    return tags

def get_embeddings(seq):
    ''' Given a sequence, get the word embeddings for this model'''
    layer_dict = dict([(layer.name, layer) for layer in encoder_model.layers[1:]])

    model_input = layer_dict['encoder_embedding'].input
    embedding_layer_output = encoder_model.layers[1].output
    lookup_embeddings = K.function([model_input], [embedding_layer_output])
    embeddings = lookup_embeddings([seq])[0].flatten()

    return list(embeddings)

def decode_padded_sequence(seq):
    seq = seq.flatten()
    return " ".join([index_word[x] for x in seq if x > 0])

def get_padded_sequence(sentence):
    # Given a sentence string, return the padded sequence of index numbers according to the tokenizer
    s_seq = [word_index[x] for x in sentence.split()]
    s_padded = sequence.pad_sequences([s_seq], maxlen=x_length, padding='post')
    return s_padded

def similar(a, b):
    return SequenceMatcher(None, a, b).quick_ratio()

def match_tags(taglist, target):
    tags = pos_tagger(taglist)
    return np.array([similar(x, target) for x in tags])

In [51]:
def predict_sequence(infenc, infdec, source, n_steps, translate=False):
    '''
    Given a source array, feed it through the autoencoder to predict a string - either itself in the naive case 
    where translation is turned off, or run gradient ascent to convert the source array to a target category,
    and run that through the autoencoder to get the translated version.
    '''
    region_dict = {0: "albuquerque", 1: "billings", 2: "calgary", 3: "charlotte", 4: "chicago", 5: "cincinnati", 6: "denver", 
               7: "houston", 8: "kansas city", 9: "las vegas", 10: "los angeles", 11: "minneapolis", 12: "montreal", 
               13: "nashville", 14: "new york", 15: "oklahoma city", 16: "phoenix", 17: "pittsburgh", 18: "san francisco", 
               19: "seattle", 20: "tampa", 21: "toronto", 22: "washington"}
    
    source_string = " ".join([index_word[x] for x in source[0] if x > 0])
    # feed the source into the encoder inference model
    encode = infenc.predict(source)
    # make prediction of category for source sequence
    label_prediction_probs = encode[0][0]
    label_prediction = np.argmax(label_prediction_probs)
    source_label_prediction = region_dict[label_prediction]
    source_label_certainty = label_prediction_probs[label_prediction]
    
    # If set to translate, run gradient ascent to maximize to the target_label
    if translate:
        fstate, bstate = gradient_ascent(source, infenc, translate)
    else:
        fstate = [encode[1:][0], encode[1:][1]]
        bstate = [encode[1:][2], encode[1:][3]]

    decode_sequence = decode_latent(fstate, bstate)
    decode_string = decode_padded_sequence(decode_sequence[0])
    
    # make prediction of category for predicted response
    decode_prediction = infenc.predict(decode_sequence)
    label_prediction_probs = decode_prediction[0][0]
    label_prediction = np.argmax(label_prediction_probs)
    decode_label_prediction = region_dict[label_prediction]
    decode_label_certainty = label_prediction_probs[label_prediction]
    
    return (source_string, source_label_prediction, source_label_certainty,
            decode_string, decode_label_prediction, decode_label_certainty, decode_sequence)

def normalize(x):
    # utility function to normalize a tensor by its L2 norm, used to scale gradient ascent
    #return x / (K.sqrt(K.mean(K.square(x))) + K.epsilon())
    return x / K.max(x)
'''
def gradient_ascent(seq, model, target):

    target_probability = .95 # You want the model to be this certain the string is in the target category
    
    # Identify the target model layers and tensors for input and output
    layer_dict = dict([(layer.name, layer) for layer in model.layers[1:]])
    lstm_input = layer_dict['encoder_lstm_1'].input # Use the layer that accepts the embeddings
    model_input = model.input
    
    loss = K.mean(model.output[0][:, target]) # The loss value for the target category
    states = K.identity(model.output[1:]) # The h, c values for this iteration
    grads = K.gradients(loss, lstm_input)[0] # The gradients for the lstm_input layer w/respect to the loss
    grads = normalize(grads) # Play with this function to scale the speed of the ascent
    
    # Define input/output functions
    get_embeddings = K.function([model_input], [lstm_input])
    run_ascent = K.function([lstm_input], [loss, grads, states])

    # Input sequence to model to initiate the ascent
    embeddings_value = get_embeddings([seq])[0]
    
    # Iterate through the model until loss exceeds target probability
    while True:
        loss_value, grads_value, states_value = run_ascent([embeddings_value])
        if loss_value > target_probability: # Exit the ascent
            target_shape = states_value.shape[2]
            f_state = [np.reshape(final_output[0][0], (1,target_shape)), np.reshape(final_output[1][0], (1,target_shape))]
            b_state = [np.reshape(final_output[2][0], (1,target_shape)), np.reshape(final_output[3][0], (1,target_shape))]
            return (f_state, b_state)
            break
        elif loss_value <= 0.: # Some inputs can zero out the loss
            break
        else:
            embeddings_value += grads_value

'''
def gradient_ascent(seq, model, target, show_steps=False, step_rate=.5, target_prob=.9):
    target_probability = target_prob # You want the model to be this certain the string is in the target category
    
    # Identify the target model layers and tensors for input and output
    layer_dict = dict([(layer.name, layer) for layer in model.layers[1:]])
    lstm_input = layer_dict['encoder_lstm_1'].input # Use the layer that accepts the embeddings
    model_input = model.input
    
    loss = K.mean(model.output[0][:, target]) # The loss value for the target category
    states = K.identity(model.output[1:]) # The h, c values for this iteration
    grads = K.gradients(loss, lstm_input)[0] # The gradients for the lstm_input layer w/respect to the loss
    grads = normalize(grads) # Play with this function to scale the speed of the ascent
    
    # Define input/output functions
    get_embeddings = K.function([model_input], [lstm_input])
    run_ascent = K.function([lstm_input], [loss, grads, states])

    # Input sequence to model to initiate the ascent
    embeddings_value = get_embeddings([seq])[0]
    
    # Iterate through the model until loss exceeds target probability
    counter = 0
    while counter < 20:
        loss_value, grads_value, states_value = run_ascent([embeddings_value])
        target_shape = states_value.shape[2]
        fstate = [np.reshape(states_value[0][0], (1,target_shape)), np.reshape(states_value[1][0], (1,target_shape))]
        bstate = [np.reshape(states_value[2][0], (1,target_shape)), np.reshape(states_value[3][0], (1,target_shape))]        
        if show_steps:
            print("{}. ({:.2f}) {}".format(counter, loss_value, decode_padded_sequence(decode_latent(fstate, bstate))))
        if loss_value > target_probability and not show_steps: # Exit the ascent
            return (fstate, bstate) if not show_steps else "Complete"
            break
        elif loss_value <= 0.: # Some inputs can zero out the loss
            break
        else:
            grads_value = grads_value * step_rate
            embeddings_value += grads_value
            counter += 1

            

def decode_latent(fstate, bstate):
    '''
    Given a pair of state vectors, run iteratively through the decoder to build an output sequence.
    Returns the padded output sequence
    '''
    # start of sequence input
    target_seq = np.zeros((1,1))
    target_seq[0, 0] = 1 # Start Character index
    output = list()
    stop = False
    word_counter = 1
    while not stop:
        output_tokens, fh, fc, bh, bc = decoder_model.predict([target_seq] + fstate + bstate)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])

        # Exit if sampled character is end token or we've reached max length
        decoded_char = index_word[sampled_token_index] if sampled_token_index > 0 else ''
        if (decoded_char == '<e>' or word_counter == x_length):
            stop = True
            break

        output.append(sampled_token_index)

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Update states and counter
        fstate = [fh, fc]
        bstate = [bh, bc]
        word_counter += 1

    while len(output) < x_length:
        output.append(0)
    predicted_sequence = np.array([output])
    return predicted_sequence

def get_embeddings(seq):
    # Given a sequence, get the word embeddings for this model
    layer_dict = dict([(layer.name, layer) for layer in encoder_model.layers[1:]])

    model_input = layer_dict['encoder_embedding'].input
    embedding_layer_output = encoder_model.layers[1].output
    lookup_embeddings = K.function([model_input], [embedding_layer_output])
    embeddings = lookup_embeddings([seq])[0].flatten()

    return list(embeddings)

def decode_padded_sequence(seq):
    return " ".join([index_word[x] for x in seq if x > 0])

def get_padded_sequence(sentence):
    # Given a sentence string, return the padded sequence of index numbers according to the tokenizer
    s_seq = [word_index[x] for x in sentence.split()]
    s_padded = sequence.pad_sequences([s_seq], maxlen=x_length, padding='post')
    return s_padded

def translate_sentence(sentence, target):
    # Put in a sentence and a target region, get a result
    source = get_padded_sequence(sentence)
    result = predict_sequence(encoder_model, decoder_model, source, x_length, target)
    return ("Input: {} ({} {:.2f}) \nOutput: {} ({} {:.2f})"
            .format(result[0], result[1], result[2], result[3], result[4], result[5]))

## Translation

In [39]:
region_dict = {0: "albuquerque", 1: "billings", 2: "calgary", 3: "charlotte", 4: "chicago", 5: "cincinnati", 6: "denver", 
               7: "houston", 8: "kansas city", 9: "las vegas", 10: "los angeles", 11: "minneapolis", 12: "montreal", 
               13: "nashville", 14: "new york", 15: "oklahoma city", 16: "phoenix", 17: "pittsburgh", 18: "san francisco", 
               19: "seattle", 20: "tampa", 21: "toronto", 22: "washington"}

seq = X_train[[2]]
target = 20

target_probability = .95 # You want the model to be this certain the string is in the target category
layer_dict = dict([(layer.name, layer) for layer in encoder_model.layers[1:]])
layer_name = 'encoder_lstm_1'
layer_input = layer_dict[layer_name].input
input_txt = encoder_model.input
loss = K.mean(encoder_model.output[0][:, target])
states = K.identity(encoder_model.output[1:])
grads = K.gradients(loss, layer_input)[0]
grads = normalize(grads)
initiate = K.function([input_txt], [layer_input])
iterate = K.function([layer_input], [loss, grads])
terminate = K.function([layer_input], [states])

embedding = initiate([seq])[0]
for i in range(20):
    loss_value, grads_value = iterate([embedding])
    final_output = terminate([embedding])[0]
    target_shape = final_output.shape[2]
    f_state = [np.reshape(final_output[0][0], (1,target_shape)), np.reshape(final_output[1][0], (1,target_shape))]
    b_state = [np.reshape(final_output[2][0], (1,target_shape)), np.reshape(final_output[3][0], (1,target_shape))]
    print("{}: ({:.2f}%) {}".format(i, loss_value*100, decode_latent(f_state, b_state)))

    if loss_value > target_probability:
        final_output = terminate([embedding])[0]
        target_shape = final_output.shape[2]
        print("All Done")
        break
    elif loss_value <= 0.:
        break
    else:
        embedding += grads_value

0: (4.87%) [[  34  207  174   11   73    8 2979  220   42   14  407 1530   67  154
    66    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0]]
1: (22.26%) [[  34  207  174   11   73    8 2979  220   42   14  407 1530   66   58
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0]]
2: (68.14%) [[ 174   42  174   11  174    8   82  220   11   14  407 1530   67  154
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0]]
3: (90.47%) [[

In [94]:
# Show the steps for gradient ascent
gradient_ascent(X_padded[[500]], encoder_model, 14, show_steps=True)

0. (0.06) this season the packers fans have zero room to talk shit about the bears or
1. (0.14) this season the packers fans have zero room to talk shit about the bears or
2. (0.85) this season the #steelers packers have available somewhere to his talk about the bears or
3. (0.98) this season the #steelers #design have zero chicago to talk about his or bears to
4. (1.00) this sunday, the #steelers benefit event. have chicago to attention about to talk the evil
5. (1.00) this throwback the #steelers benefit fl have west him to talk about the black blue
6. (1.00) iowa yet. the matchup sports have fair york to shit talk about the bears or
7. (1.00) this throwback the matchup i've vs. local park, to talk about it the evil or
8. (1.00) iowa was the best. matchup weekend! york have to talk about just the evil
9. (1.00) section yet. the matchup #fall have sympathy minor to talk about just the evil or
10. (1.00) 17th turnout the matchup (@ paradise have known to talk about it the evil or
11. (

In [82]:
# Evaluate success of training to see similarity of input and output sentences

decoder_results = {'Source':[], 'Source Prediction':[], 'Source Certainty':[],
                   'Decoded':[], 'Decoded Prediction':[], 'Decoded Certainty':[], 'Score': []}

score_list = []

for _ in range(100):
    target = predict_sequence(encoder_model, decoder_model, X_train[[_]], x_length, False)
    score = 1 - spatial.distance.cosine(get_embeddings(X_train[[_]]), get_embeddings(target[6]))
    score_list.append(score)
    decoder_results['Source'].append(target[0])
    decoder_results['Source Prediction'].append(target[1])
    decoder_results['Source Certainty'].append(target[2])
    decoder_results['Decoded'].append(target[3])
    decoder_results['Decoded Prediction'].append(target[4])
    decoder_results['Decoded Certainty'].append(target[5])
    decoder_results['Score'].append(score)


pd.DataFrame.from_dict(decoder_results)[['Source', 'Source Prediction', 'Source Certainty', 
                                         'Decoded', 'Decoded Prediction', 'Decoded Certainty',
                                         'Score']]

TypeError: predict_sequence() takes from 1 to 3 positional arguments but 5 were given

In [34]:
np.mean(score_list)

0.6844770941138267

In [33]:
pd.options.display.max_colwidth = 200
pd.DataFrame.from_dict(decoder_results)[['Source', 'Decoded', 'Decoded Prediction', 'Decoded Certainty']]

Unnamed: 0,Source,Decoded,Decoded Prediction,Decoded Certainty
0,we got the earth in the blunt..,we got the in the city,san francisco,0.119070
1,i’m bout to be up doing this hw assignment that was due last week. i didn’t go to class last week cause i didn’t do it. now here we meet again &amp; i still didn’t do the shit but need to hand it ...,i’m going to be to be at this assignment this and i’m so i’m to see i am at the week. i am i’m at the week. i am here in the i am in the i am here in the i am i’m in the life. lol,oklahoma city,0.143612
2,"if you're looking for work in #woodstock, va, check out this #job: #schoolpsychology #hiring #careerarc","if you're looking for work in ca, check out this #job: #manufacturing #job #jobs #hiring",chicago,0.142153
3,i have the craving for peanut butter,i have the craving for peanut butter,oklahoma city,0.155811
4,a weight has been lifted off my shoulder and i feel so great omg 😭❤️,a few has been been been my mom and i was so i’m so 😭❤️,oklahoma city,0.188428
5,i found this screenshot from a few years ago and i’m crying why am i so funny,i was a few and i was a phone and i am so so so i hate,los angeles,0.109619
6,"you had me with your words lost me with your actions💖\r\n#styledbysilvay @ los angeles, california","you you with me with your favorite restaurant &amp; you @ los angeles, california",toronto,0.129498
7,the lyft ride wasn’t any better. i felt in danger and i’m a lyft driver myself 😂,the nail in the l i don’t don’t be a pair and i’m a lyft is. lol,charlotte,0.145585
8,this member was happy to get a #zerowaste flu shot! way to #conserve paper 🌳👏🏽,this is a way to be to be to be a great way to,oklahoma city,0.108641
9,citadel - cit-adel = trends from military words!,(feels: - humidity: 88% - block of words!,houston,0.135278


In [30]:
encode = encoder_model.predict(X_test[[0]])
encode[1:][:1].shape

AttributeError: 'list' object has no attribute 'shape'

In [None]:
# Define the model
# 1Bi x 1LSTM
# 9m params with 1000 words

# define training encoder
encoder_inputs = Input(shape=(None, ), name="encoder_input")
encoder = Embedding(V, embedding_vector_length, weights=[embedding_matrix], trainable=True, 
                    name="encoder_embedding")(encoder_inputs)
encoder_outputs, forward_h, forward_c, backward_h, backward_c = Bidirectional(CuDNNLSTM(H, return_state=True), name="encoder_lstm_1")(encoder)
state_h = Concatenate(name="concatenate_state_h")([forward_h, backward_h])
state_c = Concatenate(name="concatenate_state_c")([forward_c, backward_c])
encoder_dropout = Dropout(0.4)
encoder_dense = Dense(num_classes, activation='softmax', name="encoder_dense")
encoder_outputs = encoder_dropout(encoder_outputs)
encoder_outputs = encoder_dense(encoder_outputs)
encoder_states = [state_h, state_c]

# define training decoder
decoder_inputs = Input(shape=(None, ), name="decoder_input")
decoder_embedding = Embedding(V, embedding_vector_length, name="decoder_embedding")
embedded_input = decoder_embedding(decoder_inputs)
decoder_lstm = CuDNNLSTM(H*2, return_sequences=True, return_state=True, name="decoder_lstm_1")
decoder_outputs, _, _ = decoder_lstm(embedded_input, initial_state=encoder_states)
decoder_dense = Dense(V, activation='softmax', name="decoder_dense")
decoder_outputs = decoder_dense(decoder_outputs)

# Combine training inputs into a single training model
model = Model([encoder_inputs, decoder_inputs], [encoder_outputs, decoder_outputs])

# define inference encoder
encoder_model = Model(encoder_inputs, [encoder_outputs] + encoder_states)

# define inference decoder
decoder_state_input_h = Input(shape=(H*2,), name="decoder_state_input_h")
decoder_state_input_c = Input(shape=(H*2,), name="decoder_state_input_c")
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_input_2 = decoder_embedding(decoder_inputs)
decoder_outputs, state_h, state_c = decoder_lstm(decoder_input_2, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)