In [1]:
import mlflow
import keras
import numpy as np
import mlflow.keras
import importlib
import os
import pandas as pd
import tensorflow as tf
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout, Lambda
from keras.layers import Embedding
from keras.layers import LSTM, RepeatVector
from sklearn.model_selection import train_test_split
from collections import Counter
from keras import backend as K
from sklearn.neighbors import NearestNeighbors
from keras.layers import Input
from scipy.optimize import fmin_l_bfgs_b
from keras.optimizers import Adam, Nadam, RMSprop, Adadelta
from keras.activations import softmax, relu, tanh
from keras.losses import categorical_crossentropy, logcosh, binary_crossentropy
from keras.initializers import Constant
from keras.models import Model
from keras.layers import TimeDistributed
from keras import objectives
from keras.layers import Conv1D, MaxPooling1D, UpSampling1D
from keras.layers import Dense, Dropout, Flatten
from scipy.spatial.distance import cdist
from keras.callbacks import ModelCheckpoint
from keras.layers import Cropping1D
from keras.layers import BatchNormalization
from nltk.translate.bleu_score import sentence_bleu,SmoothingFunction

Using TensorFlow backend.


In [2]:
# another way to process the csv data
df = pd.read_csv('../data/tweets_labelled_balanced.csv', nrows=300000)
df.dropna(inplace=True)
df.region = df.region.astype(int)
tweets_text = df.text.tolist()
tweets_regions = df.region.tolist()

In [3]:
# tokenize
# create the tokenizer at word level
t = Tokenizer(lower = True, filters ='')
t.fit_on_texts(tweets_text)

In [4]:
# get the vocab size
vocab = list(t.word_counts.keys())
vocab_size = len(t.word_counts) + 1
vocab_ids = list(t.word_index.values())
word_index = t.word_index

In [5]:
# convert the test tweets to sequence of id's
encoded_tweets = t.texts_to_sequences(tweets_text)#test_tweets

max_len = len(max(encoded_tweets, key=len))

In [6]:
# load trained model
model_dir_path = 'cnn_300k.h5'
model = load_model(model_dir_path)

In [7]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 68)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 68, 100)      33632000    input_1[0][0]                    
__________________________________________________________________________________________________
conv1 (Conv1D)                  (None, 68, 128)      38528       embedding_1[0][0]                
__________________________________________________________________________________________________
batch_normalization_1 (BatchNor (None, 68, 128)      512         conv1[0][0]                      
__________________________________________________________________________________________________
conv2 (Con

In [8]:
# create dictionary of layers
layer_dict = dict([(layer.name, layer) for layer in model.layers])

In [32]:
layer_dict

{'batch_normalization_1': <keras.layers.normalization.BatchNormalization at 0x1a28377390>,
 'batch_normalization_2': <keras.layers.normalization.BatchNormalization at 0x1a283776a0>,
 'batch_normalization_3': <keras.layers.normalization.BatchNormalization at 0x1a28377d30>,
 'batch_normalization_4': <keras.layers.normalization.BatchNormalization at 0x1a20c471d0>,
 'conv1': <keras.layers.convolutional.Conv1D at 0x1a28377400>,
 'conv2': <keras.layers.convolutional.Conv1D at 0x1a28377518>,
 'conv5': <keras.layers.convolutional.Conv1D at 0x1a28377a58>,
 'conv6': <keras.layers.convolutional.Conv1D at 0x1a28377e80>,
 'dense1': <keras.layers.core.Dense at 0x1a283779e8>,
 'dense2': <keras.layers.core.Dense at 0x1a28377d68>,
 'dense3': <keras.layers.core.Dense at 0x1a20c472e8>,
 'dense4': <keras.layers.core.Dense at 0x1a20c47470>,
 'dense5': <keras.layers.core.Dense at 0x1a20c475c0>,
 'dropout_1': <keras.layers.core.Dropout at 0x1a28377898>,
 'dropout_2': <keras.layers.core.Dropout at 0x1a28377ba

In [9]:
# reverse word_index
reverse_word = dict((v,k) for k, v in word_index.items())

In [10]:
# get text from id's
def get_text(data, input_sequence_length):
    inner_string = []
    for i, id in enumerate(data):
        if i < input_sequence_length:
            if id != 0:
                inner_string.append(reverse_word[id])
        else:
            break
    text = ' '.join(inner_string)
    return text

In [11]:
# gradient ascent
def get_gradients(region_label):
    layer_output = layer_dict['dense4'].output
    # the target region to which we want the manupilation
    loss = K.mean(layer_output[:, region_label]) 
    # compute the gradient of the input text wrt to this loss
    grads = K.gradients(loss, model.layers[2].input)[0]
    # normalization trick: we normalize the gradient
    grads /= (K.sqrt(K.mean(K.square(grads))) + 1e-5)
    # this function returns the loss and grads given the input picture
    iterate = K.function([model.layers[2].input], [loss, grads]) # passing embedding layer as input
    
    return iterate

In [31]:
def apply_gradient_ascent(text, region_label):
    step = 1
    gradients_iterator = get_gradients(region_label)
    for i in range(1):
        loss_value, grads_value = gradients_iterator([text])
        #print(loss_value, grads_value)
        if loss_value > 0.95 or loss_value <= 0.: 
            break
        text += grads_value * step
        
    return text

In [13]:
# get the embedding layer output for the input from model
get_embedding_layer_output = K.function([model.layers[0].input],
                                  [model.layers[1].output])

In [14]:
# get final dense layer output with input as the embedding 
get_final_dense_layer_output = K.function([model.layers[2].input],
                                  [layer_dict['dense5'].output])

In [15]:
# get the final dense layer output for the original input
get_dense_output = K.function([model.layers[0].input],
                                  [layer_dict['dense5'].output])

In [16]:
# get region probabilities for a tweet, used for getting class of translated tweet
get_region_probabilities = K.function([model.layers[0].input],
                                  [layer_dict['dense4'].output])

In [17]:
# passing data for translation
def translate_tweet(input_tweet, region_translate_to):
    #pass through the embedding layer, to the output
    emb_input = get_embedding_layer_output([np.array([input_tweet])])[0]
    # apply gradient ascent
    after_gradient_ascent = apply_gradient_ascent(emb_input, region_translate_to)
    # get final dense layer output
    output_text_from_traslator = get_final_dense_layer_output([after_gradient_ascent])[0]
    # get the id's with maximum probability for each word
    translated_output = np.argmax(output_text_from_traslator, axis=-1)
    translated_output =  np.squeeze(translated_output)
    
    return translated_output

In [18]:
# bleu score
def get_bleu_score(original_tweet, retranslated_tweet):
    reference = original_tweet.split()
    candidate = retranslated_tweet.split()
    cc = SmoothingFunction()
    score = sentence_bleu([reference], candidate, smoothing_function = cc.method4)
    return score

In [19]:
def get_tweet_region(text):
    #print(text)
    probabilities = get_region_probabilities([np.array([text])])
    region = np.argmax(probabilities, axis=-1)
    
    return region

In [21]:
# Run the model on the 130 tweets
text = open("test_tweets.txt").read()
test_tweets = eval(text)

In [22]:
# translating the 130 test tweets, we translate from it's original region to the next region in this list
region_ids = [3,  4,  5,  7, 10, 13, 14, 15, 18, 19, 20, 21, 22]
region_dict = {0: "albuquerque", 1: "billings", 2: "calgary", 3: "charlotte", 4: "chicago", 5: "cincinnati", 6: "denver", 
               7: "houston", 8: "kansas city", 9: "las vegas", 10: "los angeles", 11: "minneapolis", 12: "montreal", 
               13: "nashville", 14: "new york", 15: "oklahoma city", 16: "phoenix", 17: "pittsburgh", 18: "san francisco", 
               19: "seattle", 20: "tampa", 21: "toronto", 22: "washington"}

df_scores = pd.DataFrame(columns=['Source', 'Target', 'Decile', 'Original', 'Translation', 'Trans Region', 'Trans BLEU', 
                                  'Retranslation', 'Retrans Region', 'Retrans BLEU'])

trans_score = 0
retrans_score = 0

for i in range(len(region_ids)):
    source = region_ids[i]
    target = region_ids[i+1] if i+1 < len(region_ids) else region_ids[0]
    test_seq = t.texts_to_sequences(test_tweets[source])
    padded_tweets = pad_sequences(test_seq,padding='post',maxlen=max_len)
    print("ith region:",i)
    print("number of tweets:",len(padded_tweets))

    for x in range(len(padded_tweets)):
        print("xth tweet:",x)
        length = len(test_seq[x])
        original = padded_tweets[x]
        translation = translate_tweet(original, target)
        retranslation = translate_tweet(translation, source)
        original_sentence = get_text(original, length)
        translated_sentence = get_text(translation, length)
        retranslated_sentence = get_text(retranslation, length)
        trans_bleu = get_bleu_score(original_sentence, translated_sentence)
        retrans_bleu = get_bleu_score(original_sentence, retranslated_sentence)
        trans_region = int(get_tweet_region(translation)[0][0])
        retrans_region = int(get_tweet_region(retranslation)[0][0])
        trans_score += 1 if trans_region==target else 0
        retrans_score += 1 if retrans_region==source else 0
        
        #print(trans_region)
        print("before dataframe")
        df_scores.loc[10*i + x] = [region_dict[source], region_dict[target], x, original_sentence, translated_sentence, 
                            region_dict[trans_region], trans_bleu, retranslated_sentence, region_dict[retrans_region], 
                            retrans_bleu]

ith region: 0
number of tweets: 10
xth tweet: 0
before dataframe
xth tweet: 1
before dataframe
xth tweet: 2
before dataframe
xth tweet: 3
before dataframe
xth tweet: 4
before dataframe
xth tweet: 5
before dataframe
xth tweet: 6
before dataframe
xth tweet: 7
before dataframe
xth tweet: 8
before dataframe
xth tweet: 9
before dataframe
ith region: 1
number of tweets: 10
xth tweet: 0
before dataframe
xth tweet: 1
before dataframe
xth tweet: 2
before dataframe
xth tweet: 3
before dataframe
xth tweet: 4
before dataframe
xth tweet: 5
before dataframe
xth tweet: 6
before dataframe
xth tweet: 7
before dataframe
xth tweet: 8
before dataframe
xth tweet: 9
before dataframe
ith region: 2
number of tweets: 10
xth tweet: 0
before dataframe
xth tweet: 1
before dataframe
xth tweet: 2
before dataframe
xth tweet: 3
before dataframe
xth tweet: 4
before dataframe
xth tweet: 5
before dataframe
xth tweet: 6
before dataframe
xth tweet: 7
before dataframe
xth tweet: 8
before dataframe
xth tweet: 9
before dataf

In [23]:
trans_score/130

0.16923076923076924

In [24]:
retrans_score/130

0.2

In [25]:
df_scores['Trans BLEU'].mean()

0.042453034848165358

In [26]:
df_scores['Retrans BLEU'].mean()

0.060408287369466497

In [34]:
df_scores

Unnamed: 0,Source,Target,Decile,Original,Translation,Trans Region,Trans BLEU,Retranslation,Retrans Region,Retrans BLEU
0,charlotte,chicago,0,not true. but i hear you,polar up then studios cared va,new york,0,"my my mf people , army",houston,0
1,charlotte,chicago,1,thursday isn’t coming fast enough i’m ready to...,palace going how invaded toast strawberry chee...,new york,0,they are we keep have showcase they my they,houston,0
2,charlotte,chicago,2,awful first quarter again for the still can’t ...,walked they freezing to gets per shoutout hood...,chicago,0.238255,we they never how it they just your your they ...,charlotte,0.168472
3,charlotte,chicago,3,compare trump manufacturing numbers to kindly ...,logan it it screens unnecessarily mailing stra...,new york,0,tx my schedule mf catching against be your you...,houston,0.180685
4,charlotte,chicago,4,why you should train like a if you're a | - if...,"terminal out toronto, routes park terminal wil...",toronto,0,you they they my they aim they willing just pu...,new york,0.203225
5,charlotte,chicago,5,if only we had that espn cam,england india woken rode terminal metro strawb...,new york,0,so never become died wore white buena,houston,0
6,charlotte,chicago,6,the comments on this unc g ucla feed 😂😂😂😂,what it you attack mueller harry terminal va feel,new york,0,bout florida few amen ky shoutout they my gladly,houston,0
7,charlotte,chicago,7,with the sc on our chest,f. ontario never va harry ive,new york,0,they my my it til south,houston,0
8,charlotte,chicago,8,➡️ for the 44 yard tech 7 ii georgia 28,go ontario harry gordon buena toast graffiti n...,new york,0,up while james others snapchat incredibly thro...,new york,0
9,charlotte,chicago,9,we're #hiring! read about our latest #job open...,tour button cringe minor 's worldwide what how...,new york,0,attended honored angels to tasty these from fy...,new york,0


In [35]:
df_scores.loc[df_scores['Target'] == 'houston']

Unnamed: 0,Source,Target,Decile,Original,Translation,Trans Region,Trans BLEU,Retranslation,Retrans Region,Retrans BLEU
20,cincinnati,houston,0,made this for &amp; hope they like it,tx carolina wore ion shoutout cowboys ion love,houston,0.0,jews had are black of due eb load,houston,0.0
21,cincinnati,houston,1,being than you are literally at when it comes ...,invite my gotta all shoutout against anymore i...,houston,0.0,my officially cheers thick latin illegal media...,houston,0.0
22,cincinnati,houston,2,one taught me love taught me one taught me pain,kno my asf hang breathe eating ma carolina the...,houston,0.0,dedicated named bitch hella had bday got incre...,houston,0.0
23,cincinnati,houston,3,i’m so thankful to have parents who are the gr...,my make my da army bday florida carolina ish v...,houston,0.209775,brought fruit potato toy potter cats potter md...,new york,0.176399
24,cincinnati,houston,4,does anyone else’s heart drop at the sound of ...,bout my ordered all bday eating my bout caroli...,houston,0.233115,she elves she have they bday them carolina adv...,houston,0.196026
25,cincinnati,houston,5,if only notre dame could figure out how to a c...,who my my my mf ion my succeed bday can they i...,houston,0.0,mcconnell alert x must are know per nov perfor...,new york,0.0
26,cincinnati,houston,6,may we all experience this joy at the end. is ...,my florida birthday suddenly sick wit cant alu...,houston,0.0,to it how how unhealthy prayed boomerang targe...,new york,0.0
27,cincinnati,houston,7,one could argue that no one brings more to a a...,dm my kindly ish awards couple bout decide bda...,houston,0.0,cheers street\r\nb/w discovered surprise cheer...,new york,0.0
28,cincinnati,houston,8,"interested in a #job in #indianapolis, in? thi...",they ncaa follow my they carolina my houston b...,houston,0.0,fr they just they people pussy shower need bea...,houston,0.0
29,cincinnati,houston,9,big game. big implications. i'm picking #4 mic...,plan my bday my tx my my carolina they never t...,houston,0.0,how can founders shoutout per saudi our busy e...,houston,0.0


In [None]:
# printing the original input and the translated output
for i in range(len(input_text_to_translator)):
    print(get_text(input_text_to_translator)[i], '->', get_text(translated_output)[i])

In [None]:
# printing the output of the final dense layer for the input and the translated output after gradient ascent
# get id's of words with maximum probability
dense_output = get_dense_output([input_text_to_translator])[0]
text_output = np.argmax(dense_output, axis=-1)
for i in range(len(input_text_to_translator)):
    print(get_text(text_output)[i], '->', get_text(translated_output)[i])

In [None]:
for i in range(len(input_text_to_translator)):
    print(get_text(text_output)[i], '->', get_text(retranslated_output)[i])