In [16]:
import random
import numpy as np  
import pandas as pd
from keras.preprocessing import sequence
from keras.preprocessing import text
from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.utils import Sequence
from keras.layers import Input, LSTM, CuDNNLSTM, Dense, Bidirectional, BatchNormalization, Dropout, Reshape, Concatenate, Add
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from keras import backend as K
from keras import regularizers
from keras import optimizers
from keras.utils import to_categorical
import time
import datetime
from scipy import spatial


## Assemble dataset

In [30]:
regions = ["albuquerque", "billings", "calgary", "charlotte", "chicago", "cincinnati", "denver", "houston", "kansas city",
       "las vegas", "los angeles", "minneapolis", "montreal", "nashville", "new york", "oklahoma city", "phoenix",
       "pittsburgh", "san francisco", "seattle", "tampa", "toronto", "washington"]
df = pd.read_csv('data/tweets_labelled.csv', nrows=100000)
df.dropna(inplace=True)
df.region = df.region.astype(int)
df['text'] = df['text'].apply(lambda x:x.lower())
X = df['text'].tolist()
X2 = ["<s> "+x+" <e>" for x in X]
X3 = [x+" <e>" for x in X]
y = df['region'].tolist()

In [32]:
np.unique(df['region'])

array([ 3,  4,  5,  7, 10, 13, 14, 15, 18, 19, 20, 21, 22])

In [31]:
# Set Parameters
training_ratio = .75
training_size = int(len(X)*training_ratio)
num_classes = 23
target_num_words = 5000
H = 500
epochs = 100
batch_size = 64
learning_rate = .001
embedding_vector_length = 200

In [34]:
# Encode strings
t = text.Tokenizer(num_words=10000, lower=True, char_level=False, filters='')

# Convert strings to sequences, pad them to uniform length, and divide up training and test sets
t.fit_on_texts(X2)
word_index = t.word_index
V = 10002 #len(word_index)+1
index_word = {v: k for k, v in t.word_index.items()}
X_seq = t.texts_to_sequences(X)
X2_seq = t.texts_to_sequences(X2)
X3_seq = t.texts_to_sequences(X3)
x_length = max(len(x) for x in X2_seq)
X_padded = sequence.pad_sequences(X_seq, maxlen=x_length, padding='post')
X2_padded = sequence.pad_sequences(X2_seq, maxlen=x_length, padding='post')
X3_padded = sequence.pad_sequences(X3_seq, maxlen=x_length, padding='post')

X_train = X_padded[:training_size]
X2_train = X2_padded[:training_size]
X3_train = X3_padded[:training_size]
X_test = X_padded[training_size:]
X2_test = X2_padded[training_size:]
X3_test = X3_padded[training_size:]
y_train = y[:training_size]
y_test = y[training_size:]

# One-hot encode labels
encoded_y_train = to_categorical(y_train, num_classes=num_classes)
encoded_y_test = to_categorical(y_test, num_classes=num_classes)

#X_train_target = to_categorical(X3_train, num_classes=V)

In [36]:
print("There are {} unique words in this dataset, but we're only using the top {}.".format(len(word_index), V))

There are 156840 unique words in this dataset, but we're only using the top 10002.


## Seq2Seq Autoencoder Model

In [37]:
# Generator to feed batches into the model
class OneHotBatch(Sequence):
  def __init__(self, X_data, X2_data, X3_data, y_data, batch_size, V, num_classes):
    self.X_data = X_data
    self.X2_data = X2_data
    self.X3_data = X3_data
    self.y_data = y_data
    self.batch_size = batch_size
    self.V = V
    self.num_classes = num_classes

  def __len__(self):
     return int(np.ceil(len(self.X_data) / float(self.batch_size)))

  def __getitem__(self, batch_id):
    start = batch_id * self.batch_size
    finish = start + self.batch_size
    X = self.X_data[start:finish]
    X2 = self.X2_data[start:finish]
    X3 = to_categorical(self.X3_data[start:finish], num_classes=self.V)
    y = to_categorical(self.y_data[start:finish], num_classes=self.num_classes)

    return [X, X2], [y, X3]

In [38]:
# Load Glove embeddings
embeddings_index = {}
f = open('data/glove.6B.200d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

embedding_matrix = np.zeros((V, embedding_vector_length))
for word, i in word_index.items():
    if i == V:
        break
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

Found 400000 word vectors.


In [39]:
# Define the model
# 1Bi x 1LSTM
# 9m params with 1000 words

# define training encoder
encoder_inputs = Input(shape=(None, ), name="encoder_input")
encoder = Embedding(V, embedding_vector_length, weights=[embedding_matrix], trainable=True, 
                    name="encoder_embedding")(encoder_inputs)
encoder_outputs, forward_h, forward_c, backward_h, backward_c = Bidirectional(CuDNNLSTM(H, return_state=True), name="encoder_lstm_1")(encoder)
state_h = Concatenate(name="concatenate_state_h")([forward_h, backward_h])
state_c = Concatenate(name="concatenate_state_c")([forward_c, backward_c])
encoder_dense = Dense(num_classes, activation='softmax', name="encoder_dense")
encoder_outputs = encoder_dense(state_h)
encoder_states = [state_h, state_c]

# define training decoder
decoder_inputs = Input(shape=(None, ), name="decoder_input")
decoder_embedding = Embedding(V, embedding_vector_length, name="decoder_embedding")
embedded_input = decoder_embedding(decoder_inputs)
decoder_lstm = CuDNNLSTM(H*2, return_sequences=True, return_state=True, name="decoder_lstm_1")
decoder_outputs, _, _ = decoder_lstm(embedded_input, initial_state=encoder_states)
decoder_dense = Dense(V, activation='softmax', name="decoder_dense")
decoder_outputs = decoder_dense(decoder_outputs)

# Combine training inputs into a single training model
model = Model([encoder_inputs, decoder_inputs], [encoder_outputs, decoder_outputs])

# define inference encoder
encoder_model = Model(encoder_inputs, [encoder_outputs] + encoder_states)

# define inference decoder
decoder_state_input_h = Input(shape=(H*2,), name="decoder_state_input_h")
decoder_state_input_c = Input(shape=(H*2,), name="decoder_state_input_c")
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_input_2 = decoder_embedding(decoder_inputs)
decoder_outputs, state_h, state_c = decoder_lstm(decoder_input_2, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

In [None]:
# Fit the model
start_time = time.time()

# Generators
train_generator = OneHotBatch(X_data=X_train, X2_data=X2_train, X3_data=X3_train, y_data=y_train, 
                              batch_size=batch_size, V=V, num_classes=num_classes)
validation_generator = OneHotBatch(X_data=X_test, X2_data=X2_test, X3_data=X3_test, y_data=y_test, 
                              batch_size=batch_size, V=V, num_classes=num_classes)

# Compile and train the model
opt = optimizers.Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False, clipvalue=.05)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['acc'])
model.summary()
callbacks = [EarlyStopping(monitor='decoder_dense_loss', patience=3, min_delta=.05, restore_best_weights=True),
             ModelCheckpoint(filepath='models/Twitter_Translator_word1k.h5', 
                             monitor='decoder_dense_loss', save_best_only=True)]
             #TensorBoard(log_dir='./logs/Twitter_Translator_100k5k_1Bix1L_masked', histogram_freq=0, batch_size=32, write_graph=False, 
             #            write_grads=True, write_images=True, embeddings_freq=0, embeddings_layer_names=None, 
             #            embeddings_metadata=None, embeddings_data=None, update_freq='epoch')]

model.fit_generator(generator=train_generator, callbacks=callbacks, epochs=100, validation_data=validation_generator)
                    #max_queue_size=10, workers=5, use_multiprocessing=True)
# Final evaluation of the model
end_time = time.time()
run_time = datetime.timedelta(seconds=end_time-start_time)
print("Finished in {}".format(run_time))

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
encoder_embedding (Embedding)   (None, None, 200)    2000400     encoder_input[0][0]              
__________________________________________________________________________________________________
encoder_lstm_1 (Bidirectional)  [(None, 1000), (None 2808000     encoder_embedding[0][0]          
__________________________________________________________________________________________________
decoder_input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
concatenat

  '. They will not be included '


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100

## Saving and Loading the Model

In [11]:
model_name = 'Twitter_100k5k_1Bix1L'
model.save('models/{}.h5'.format(model_name))

  '. They will not be included '


In [12]:
# Save the encoder/decoder model and weights to disk
with open('encoder_model.json', 'w', encoding='utf8') as f:
    f.write(encoder_model.to_json())
encoder_model.save_weights('models/Twitter_100k5k_1Bix1L_encoder_weights.h5')

with open('decoder_model.json', 'w', encoding='utf8') as f:
    f.write(decoder_model.to_json())
decoder_model.save_weights('models/Twitter_100k5k_1Bix1L_decoder_weights.h5')

  '. They will not be included '


In [None]:
# Load Trained Model
from keras.models import load_model, model_from_json

def load_model_weights(model_filename, model_weights_filename):
    with open(model_filename, 'r', encoding='utf8') as f:
        model = model_from_json(f.read())
    model.load_weights(model_weights_filename)
    return model

#model_name = 'Twitter_Translator_100k5k_1Bix1L_masked'
model_name = 'Twitter_100k5k_1Bix1L'
model = load_model('/models/{}.h5'.format(model_name))

encoder_model = load_model_weights('encoder_model.json', 'models/Twitter_100k5k_1Bix1L_encoder_weights.h5')
decoder_model = load_model_weights('decoder_model.json', 'models/Twitter_100k5k_1Bix1L_decoder_weights.h5')

In [15]:
# Test that the loaded model is working correctly
predict_sequence(encoder_model, decoder_model, X_train[[2]], x_length, translate=False)

("if you're looking for work in check out this #job: #hiring #careerarc",
 'charlotte',
 0.13493766,
 "if you're looking for work in check out this #job: #hiring #careerarc",
 'charlotte',
 0.13493766)

## Helper Functions

In [None]:
def predict_sequence(infenc, infdec, source, n_steps, translate=False):
    '''
    Given a source array, feed it through the autoencoder to predict a string - either itself in the naive case 
    where translation is turned off, or run gradient ascent to convert the source array to a target category,
    and run that through the autoencoder to get the translated version.
    '''
    region_dict = {0: "albuquerque", 1: "billings", 2: "calgary", 3: "charlotte", 4: "chicago", 5: "cincinnati", 6: "denver", 
               7: "houston", 8: "kansas city", 9: "las vegas", 10: "los angeles", 11: "minneapolis", 12: "montreal", 
               13: "nashville", 14: "new york", 15: "oklahoma city", 16: "phoenix", 17: "pittsburgh", 18: "san francisco", 
               19: "seattle", 20: "tampa", 21: "toronto", 22: "washington"}
    
    source_string = " ".join([index_word[x] for x in source[0] if x > 0])
    # feed the source into the encoder inference model
    encode = infenc.predict(source)
    # make prediction of category for source sequence
    label_prediction_probs = encode[0][0]
    label_prediction = np.argmax(label_prediction_probs)
    source_label_prediction = region_dict[label_prediction]
    source_label_certainty = label_prediction_probs[label_prediction]
    
    # If set to translate, run gradient ascent to maximize to the target_label
    if translate:
        state = gradient_ascent(source, infenc, translate)
    else:
        state = encode[1:]

    decode_sequence = decode_latent(state)
    decode_string = decode_padded_sequence(decode_sequence[0])
    
    # make prediction of category for predicted response
    decode_prediction = infenc.predict(decode_sequence)
    label_prediction_probs = decode_prediction[0][0]
    label_prediction = np.argmax(label_prediction_probs)
    decode_label_prediction = region_dict[label_prediction]
    decode_label_certainty = label_prediction_probs[label_prediction]
    
    return (source_string, source_label_prediction, source_label_certainty,
            decode_string, decode_label_prediction, decode_label_certainty, decode_sequence)

def normalize(x):
    # utility function to normalize a tensor by its L2 norm, used to scale gradient ascent
    #return x / (K.sqrt(K.mean(K.square(x))) + K.epsilon())
    return x / K.max(x)

def gradient_ascent(seq, model, target):
    '''
    Run gradient ascent to maximize a sequence to a target category.  Returns final state values.
    '''
    target_probability = .95 # You want the model to be this certain the string is in the target category
    
    # Identify the target model layers and tensors for input and output
    layer_dict = dict([(layer.name, layer) for layer in model.layers[1:]])
    lstm_input = layer_dict['encoder_lstm_1'].input # Use the layer that accepts the embeddings
    model_input = model.input
    
    loss = K.mean(model.output[0][:, target]) # The loss value for the target category
    states = K.identity(model.output[1:]) # The h, c values for this iteration
    grads = K.gradients(loss, lstm_input)[0] # The gradients for the lstm_input layer w/respect to the loss
    grads = normalize(grads) # Play with this function to scale the speed of the ascent
    
    # Define input/output functions
    get_embeddings = K.function([model_input], [lstm_input])
    run_ascent = K.function([lstm_input], [loss, grads, states])

    # Input sequence to model to initiate the ascent
    embeddings_value = get_embeddings([seq])[0]
    
    # Iterate through the model until loss exceeds target probability
    while True:
        loss_value, grads_value, states_value = run_ascent([embeddings_value])
        if loss_value > target_probability: # Exit the ascent
            target_shape = states_value.shape[2]
            return [np.reshape(states_value[0][0], (1,target_shape)), np.reshape(states_value[1][0], (1,target_shape))]
            break
        elif loss_value <= 0.: # Some inputs can zero out the loss
            break
        else:
            embeddings_value += grads_value

def decode_latent(state):
    '''
    Given a pair of state vectors, run iteratively through the decoder to build an output sequence.
    Returns the padded output sequence
    '''
    # start of sequence input
    target_seq = np.zeros((1,1))
    target_seq[0, 0] = 1 # Start Character index
    output = list()
    stop = False
    word_counter = 1
    while not stop:
        output_tokens, h, c = decoder_model.predict([target_seq] + state)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])

        # Exit if sampled character is end token or we've reached max length
        decoded_char = index_word[sampled_token_index] if sampled_token_index > 0 else ''
        if (decoded_char == '<e>' or word_counter == x_length):
            stop = True
            break

        output.append(sampled_token_index)

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Update states and counter
        state = [h, c]
        word_counter += 1

    while len(output) < x_length:
        output.append(0)
    predicted_sequence = np.array([output])
    return predicted_sequence

def get_embeddings(seq):
    # Given a sequence, get the word embeddings for this model
    layer_dict = dict([(layer.name, layer) for layer in encoder_model.layers[1:]])

    model_input = layer_dict['encoder_embedding'].input
    embedding_layer_output = encoder_model.layers[1].output
    lookup_embeddings = K.function([model_input], [embedding_layer_output])
    embeddings = lookup_embeddings([seq])[0].flatten()

    return list(embeddings)

def decode_padded_sequence(seq):
    return " ".join([index_word[x] for x in seq if x > 0])

def get_padded_sequence(sentence):
    # Given a sentence string, return the padded sequence of index numbers according to the tokenizer
    s_seq = [word_index[x] for x in sentence.split()]
    s_padded = sequence.pad_sequences([s_seq], maxlen=x_length, padding='post')
    return s_padded

def translate_sentence(sentence, target):
    # Put in a sentence and a target region, get a result
    source = get_padded_sequence(sentence)
    result = predict_sequence(encoder_model, decoder_model, source, x_length, target)
    return (result[0], result[3])

## Translation

In [63]:
region_dict = {0: "albuquerque", 1: "billings", 2: "calgary", 3: "charlotte", 4: "chicago", 5: "cincinnati", 6: "denver", 
               7: "houston", 8: "kansas city", 9: "las vegas", 10: "los angeles", 11: "minneapolis", 12: "montreal", 
               13: "nashville", 14: "new york", 15: "oklahoma city", 16: "phoenix", 17: "pittsburgh", 18: "san francisco", 
               19: "seattle", 20: "tampa", 21: "toronto", 22: "washington"}

seq = X_train[[2]]
target = 20

target_probability = .95 # You want the model to be this certain the string is in the target category
layer_dict = dict([(layer.name, layer) for layer in encoder_model.layers[1:]])
layer_name = 'encoder_lstm_1'
layer_input = layer_dict[layer_name].input
input_txt = encoder_model.input
loss = K.mean(encoder_model.output[0][:, target])
states = K.identity(encoder_model.output[1:])
grads = K.gradients(loss, layer_input)[0]
grads = normalize(grads)
initiate = K.function([input_txt], [layer_input])
iterate = K.function([layer_input], [loss, grads])
terminate = K.function([layer_input], [states])

embedding = initiate([seq])[0]
for i in range(20):
    loss_value, grads_value = iterate([embedding])
    final_output = terminate([embedding])[0]
    target_shape = final_output.shape[2]
    new_state = [np.reshape(final_output[0][0], (1,target_shape)), np.reshape(final_output[1][0], (1,target_shape))]
    print("{}: ({:.2f}%) {}".format(i, loss_value*100, decode_latent(new_state)))

    if loss_value > target_probability:
        final_output = terminate([embedding])[0]
        target_shape = final_output.shape[2]
        print("All Done")
        break
    elif loss_value <= 0.:
        break
    else:
        embedding += grads_value

0: (4.96%) if you're looking for work in check out this #job: #hiring #careerarc
1: (5.70%) do everyone recommend looking for a time the show it as the but his games -
2: (8.87%) why wants yourself for working at it your second been one of left .
3: (13.15%) don’t care looking for for a country then the time in his has asking that wants a lot that more a of a few fit on @
4: (12.15%) don’t forget yourself for or be every time he the election has been the right about the person the real person that one of them a few some one at some video from work alone games at north
5: (18.67%) what’s twitter looking for work in it for the right time without he could pay his money he knows the real makes it knows about one of a little than some one about a few of a few around them taking work alone work from work in time. nc
6: (3.19%) don’t forget yourself for this - if we was next year for the election about the he needs to the one has a way of the real one person who has a little about some of a l

In [27]:
# Evaluate success of training to see similarity of input and output sentences

decoder_results = {'Source':[], 'Source Prediction':[], 'Source Certainty':[],
                   'Decoded':[], 'Decoded Prediction':[], 'Decoded Certainty':[], 'Score': []}

score_list = []

for _ in range(100):
    target = predict_sequence(encoder_model, decoder_model, X_test[[_]], x_length, 10)
    score = 1 - spatial.distance.cosine(get_embeddings(X_test[[_]]), get_embeddings(target[6]))
    score_list.append(score)
    decoder_results['Source'].append(target[0])
    decoder_results['Source Prediction'].append(target[1])
    decoder_results['Source Certainty'].append(target[2])
    decoder_results['Decoded'].append(target[3])
    decoder_results['Decoded Prediction'].append(target[4])
    decoder_results['Decoded Certainty'].append(target[5])
    decoder_results['Score'].append(score)


pd.DataFrame.from_dict(decoder_results)[['Source', 'Source Prediction', 'Source Certainty', 
                                         'Decoded', 'Decoded Prediction', 'Decoded Certainty',
                                         'Score']]

Unnamed: 0,Source,Source Prediction,Source Certainty,Decoded,Decoded Prediction,Decoded Certainty,Score
0,will never that to the he's not only to trump ...,new york,0.164445,lmao not new never to the album has no new it ...,los angeles,0.232524,0.432241
1,i miss you but i got no time for that,houston,0.160725,"lmao i don’t get yo la no la lmao no lmao , lm...",los angeles,0.778648,0.159367
2,in you needed a late night of awesome and welc...,new york,0.120146,lmao la la being la la &amp; its los the ride ...,los angeles,0.922191,0.279969
3,i’ve seen the of having students years in a ju...,seattle,0.100750,lmao having the la more years is having a of t...,los angeles,0.367551,0.558873
4,can i a for the fact that you use in your it m...,washington,0.105180,cant you feel for the new of my name no times ...,los angeles,0.504507,0.460299
5,you the man,oklahoma city,0.103490,lmao you tonight @ los,los angeles,0.934378,0.198028
6,tonight,los angeles,0.129843,los pm \r\n tryna understand it.,los angeles,0.740494,0.172306
7,and by my forever @,los angeles,0.118423,by &amp; la your baby @ 😭,los angeles,0.471474,0.262803
8,so if and then takes an to in and win out. the...,tampa,0.106776,i’m getting through when and gets the that get...,los angeles,0.555836,0.311067
9,and with a,seattle,0.125073,los new los fine,los angeles,0.938229,0.229712


In [28]:
np.mean(score_list)

0.28574797861278056