In [1]:
import numpy as np
import pandas as pd
import time
from keras import backend as K
from keras.preprocessing import sequence
from keras.preprocessing import text
from keras.utils import to_categorical

Using TensorFlow backend.


In [2]:
# Load Trained Model
from keras.models import load_model
model = load_model('/var/models/twitter_40k_charlevel_lstm_onehot.h5')

In [3]:
def normalize(x):
    # utility function to normalize a tensor by its L2 norm
    #return x / (K.sqrt(K.mean(K.square(x))) + K.epsilon())
    return x / K.max(x)

In [4]:
target_category = 5
input_txt = model.input
layer_dict = dict([(layer.name, layer) for layer in model.layers[1:]])
layer_name = 'dense_1'
layer_output = layer_dict[layer_name].output
loss = K.mean(model.output[:, target_category])
grads = K.gradients(loss, input_txt)[0]
grads = normalize(grads)
iterate = K.function([input_txt], [loss, grads])
step = 1.

In [50]:
input_tweet = ["i'm at cassell's burgers in los angeles, ca"]
num_unique_symbols = 500
x_length = 200

t = text.Tokenizer(
    char_level=True,
    filters=None,
    lower=True,
    num_words=num_unique_symbols-1,
    oov_token='unk'
)

df = pd.read_csv('/var/data/tweets_labelled_40k.csv')
df.dropna(inplace=True)
df.region = df.region.astype(int)
df['text'] = df['text'].apply(lambda x:x.lower())
X = df['text'].tolist()
t.fit_on_texts(X)

test_sequence = t.texts_to_sequences(input_tweet)
test_padded = sequence.pad_sequences(test_sequence, maxlen=x_length)
input_sequence = to_categorical(test_padded, num_classes=num_unique_symbols)

In [52]:
output_sequence = input_sequence.copy()
for i in range(50):
    loss_value, grads_value = iterate([output_sequence])
    output_sequence += grads_value * step
    if i % 3 == 0:
        blank = np.zeros((1, 200, 500))
        np.put_along_axis(blank[0], np.expand_dims(np.argmax(output_sequence[0], axis=1), axis=1), 1, axis=1)
        output_sequence = blank
    probs = model.predict_on_batch(output_sequence)
    cat = np.argmax(probs, axis=1)
    top_prob = probs[0][cat]
                    
    print('Current loss value: {}, predicted category: {}, certainty: {}'
          .format(loss_value, cat, top_prob))
    if loss_value <= 0. or (cat==target_category and top_prob > .9):
        # some filters get stuck to 0, we can skip them
        break

Current loss value: 0.015896862372756004, predicted category: [7], certainty: [0.11233685]
Current loss value: 0.09498601406812668, predicted category: [7], certainty: [0.9396867]
Current loss value: 0.01995289884507656, predicted category: [5], certainty: [0.31038865]
Current loss value: 0.3103886544704437, predicted category: [2], certainty: [0.6970527]
Current loss value: 0.008473051711916924, predicted category: [7], certainty: [0.7314852]
Current loss value: 0.05209602788090706, predicted category: [7], certainty: [0.11122768]
Current loss value: 0.09674761444330215, predicted category: [19], certainty: [0.16448238]
Current loss value: 0.09268669784069061, predicted category: [7], certainty: [0.5014479]
Current loss value: 0.1546216905117035, predicted category: [10], certainty: [0.41589633]
Current loss value: 0.0006312582991085947, predicted category: [7], certainty: [0.24123757]
Current loss value: 0.1267969012260437, predicted category: [7], certainty: [0.40363368]
Current los

In [7]:
def embedding_to_text(tokenizer, embedding):
    index_word = {v: k for k, v in tokenizer.word_index.items()} # map back
    embedding = embedding[0]
    output = []
    for l in range(len(embedding)):
        if np.argmax(embedding[l]) > 0:
            output.append(index_word[np.argmax(embedding[l])])
        else:
            continue
    return ''.join(output)

In [53]:
decode = [embedding_to_text(t, output_sequence)]
decode

["😈😈😈🔗🔗i'm a’’’a’sell'’ 🙄🙄🙄🙄📍📍📍k’’’’’s 🏾n🏾’’’🤔😎🎄”🙂"]

In [304]:
decode_sequence = t.texts_to_sequences(decode)
decode_padded = sequence.pad_sequences(decode_sequence, maxlen=x_length)
decode_onehot = to_categorical(decode_padded, num_classes=num_unique_symbols)
decode_prediction_probs = model.predict_on_batch(decode_onehot)
np.argmax(decode_prediction_probs, axis=1)[0]

8

In [283]:
probs = model.predict_on_batch(output_sequence)
probs[0][np.argmax(probs, axis=1)]

array([0.42322648], dtype=float32)

In [183]:
embedding_to_text(t, decode_onehot)

"🙄🙄🙄’’’’’’’’’’’’’🏾🏾😘😘💖\U0001f929🙌😘😘🙄🙄🏾🏾🙄🇸🦋)💫😈’’🎃😘’🙄by🙄p. ✌😘😤xz,🎃 '’‼$ 🙂"

In [171]:
np.argmax(decode_onehot[0][199])

159

In [236]:
def predict_region(model, x_length, num_symbols, tokenizer, string):
    decode_sequence = tokenizer.texts_to_sequences(string)
    decode_padded = sequence.pad_sequences(decode_sequence, maxlen=x_length)
    decode_onehot = to_categorical(decode_padded, num_classes=num_unique_symbols)
    decode_prediction_probs = model.predict_on_batch(decode_onehot)
    region = np.argmax(decode_prediction_probs, axis=1)[0]
    return (region, decode_prediction_probs[0][region]) 

In [277]:
def identify_regional_substring(string, length=None):
    target_region = predict_region(model, x_length, num_unique_symbols, t, [string])[0]
    best = [0, '']
    text_list = text.split()
    length = len(text_list)-1 if length is None else length+1
    for w in range(1, length):
        for snap in range(len(text_list)-w+1):
            search_string = ' '.join(text_list[snap:snap+w])
            search_response = predict_region(model, x_length, num_unique_symbols, t, [search_string])
            if search_response[0] == target_region and search_response[1] > best[0]:
                best = [search_response[1], search_string]
    return best

In [279]:
predict_text = "the most beautiful belle and our littlest pumpkin had so much fun for halloween! #chocolateoverload #trickortreat #myfirsthalloween #beautyandthebeast @ pembroke pines, florida"
identify_regional_substring(predict_text, 3)

[0.8441567, 'pembroke pines, florida']

In [321]:
predict_text = "🦋🦋🦋=’’'e'e🎄🎄🎄🎄)"
predict_region(model, x_length, num_unique_symbols, t, [predict_text])

(7, 0.13844642)

In [366]:
np.argmax(model.predict(output_sequence))

10

In [331]:
def deprocess_embedding(x):
    x -= x.mean()
    x /= (x.std() + K.epsilon())
    x *= 0.1

    # clip to [0, 1]
    x += 0.5
    x = np.clip(x, 0, 1)
    
    return x

In [40]:
blank = np.zeros((1, 200, 500))
np.put_along_axis(blank[0], np.expand_dims(np.argmax(output_sequence[0], axis=1), axis=1), 1, axis=1)

In [30]:
blank[0].shape

(200, 500)

In [37]:
ai = 
np.put_along_axis(blank[0], np.expand_dims(np.argmax(output_sequence[0], axis=1), axis=1), 1, axis=1)

In [39]:
blank[0][0]

array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.