In [61]:
import os
from collections import Counter

import keras
import mlflow
import mlflow.keras
import numpy as np
from keras import backend as K
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import Input
from keras.layers import LSTM
from keras.models import Sequential
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from scipy.optimize import fmin_l_bfgs_b
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors

import tweets_processor

In [None]:
### TODO section
# 1. Should we use the same padding length for the train and test data?

In [3]:
# Load the data: the tweet texts and their region labels, as returned by the
# project-local `tweets_processor` helper (which reads them from a CSV file).
# NOTE(review): the two results are presumably index-aligned (one region per
# tweet) — confirm in tweets_processor.get_tweets_from_csv.
tweets_text, tweets_regions = tweets_processor.get_tweets_from_csv()

In [4]:
# Run every raw tweet through the project's preprocessor, preserving input order.
processed_tweets = [tweets_processor.preprocessor(raw_tweet) for raw_tweet in tweets_text]

In [5]:
# Convert the region labels to integers with the project's label encoder.
# NOTE(review): the result is later one-hot encoded with num_classes=23, so the
# ids are presumably in the range 0..22 — confirm in tweets_processor.encode_labels.
encoded_labels = tweets_processor.encode_labels(tweets_regions)

In [6]:
# Hold out 33% of the tweets for evaluation; the fixed random_state makes the
# split reproducible across runs.
train_data, test_data, train_labels, test_labels = train_test_split(
    processed_tweets,
    encoded_labels,
    test_size=0.33,
    random_state=0,
)

In [7]:
# Tokenize.
# Character-level tokenizer: the vocabulary is made of single characters.
t = Tokenizer(char_level=True)
# Word-level alternative (disabled):
# t = Tokenizer()
# Build the vocabulary from the training split only.
t.fit_on_texts(train_data)

In [127]:
# Vocabulary seen by the tokenizer during fitting.
vocab = list(t.word_counts)
# One extra slot on top of the distinct tokens: id 0 is not assigned to any
# token and is what pad_sequences uses as filler.
vocab_size = len(vocab) + 1
# Integer ids assigned to the tokens.
vocab_ids = list(t.word_index.values())

In [128]:
# Map every training tweet to its sequence of token ids.
encoded_train_data = t.texts_to_sequences(train_data)

# Right-pad with zeros so that all rows have the same length
# (no maxlen given, so the longest training sequence sets the width).
padded_train_data = pad_sequences(sequences=encoded_train_data, padding='post')
# convert to numpy array
final_train_data = np.array(padded_train_data)

In [10]:
# Convert the test tweets to sequences of ids, using the tokenizer that was
# fitted on the training split.
encoded_test_data = t.texts_to_sequences(test_data)

# Pad to the SAME width as the training matrix. The model's embedding layer
# does not mask zeros, so trailing padding is processed by the LSTMs like any
# other time step; padding the test set to its own maximum length would show
# the model a different amount of padding than it was trained on.
# Test tweets longer than the training width are truncated by pad_sequences.
padded_test_data = pad_sequences(encoded_test_data, padding='post',
                                 maxlen=final_train_data.shape[1])
# convert to numpy array
final_test_data = np.array(padded_test_data)

In [None]:
# Debug helper, not part of the pipeline: decode a few encoded training tweets
# back to text to sanity-check the tokenizer round trip.
# The id -> token table is built once instead of scanning word_index for
# every single character.
index_to_token = {index: token for token, index in t.word_index.items()}
# Only a handful of tweets, to keep the cell output readable.
for doc in encoded_train_data[:3]:
    decoded = ''.join(index_to_token[index] for index in doc)
    print(decoded)

In [13]:
# One-hot encode the integer labels of the train set.
# There are 23 regions, hence 23 classes.
# NOTE(review): despite its `x_` prefix, `x_train` holds the training *labels*
# (it is passed as the targets to model.fit below), not the model inputs.
x_train = keras.utils.to_categorical(train_labels, num_classes=23)
x_train[0]

array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0.], dtype=float32)

In [14]:
# One-hot encode the integer labels of the test set (23 classes, same as the
# train labels).
y_test = keras.utils.to_categorical(test_labels, num_classes=23)

In [30]:
# Character-level region classifier built with Keras:
# embedding -> two stacked LSTMs -> dropout -> softmax over the regions.

# Hyperparameters (all of them are logged to MLflow after evaluation).
embedding_features = 20  # embedding output_dim, picked arbitrarily; hyperparameter to tune
lstm_units = 20          # units per LSTM layer; hyperparameter to tune
dropout = 0.5
num_regions = 23         # our set of regions
batch_size = 32
epochs = 10

model = Sequential([
    # one embedding row per vocabulary id (vocab_size was 59 for the character-level run)
    Embedding(vocab_size, output_dim=embedding_features),
    # the first LSTM returns the full sequence so the second one can consume it
    LSTM(lstm_units, return_sequences=True),
    LSTM(lstm_units),
    Dropout(dropout),
    Dense(num_regions, activation='softmax'),
])

with mlflow.start_run():
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    model.fit(final_train_data, x_train, batch_size=batch_size, epochs=epochs)

    # score is [loss, accuracy] on the held-out test set
    score = model.evaluate(final_test_data, y_test)
    print(score)

    # MLflow logs: parameters first, then the evaluation metrics
    logged_params = [
        ("embedding_features", embedding_features),
        ("lstm_units", lstm_units),
        ("dropout", dropout),
        ("batch_size", batch_size),
        ("epochs", epochs),
        ("char", 'characterlevel with 2lstms'),
    ]
    for param_name, param_value in logged_params:
        mlflow.log_param(param_name, param_value)
    mlflow.log_metric("evaluation_loss", score[0])
    mlflow.log_metric("evaluation_accuracy", score[1])
    # the activation is not logged: the LSTM default (tanh) is used for now

    mlflow.keras.log_model(model, "models")
    

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[2.865085354429303, 0.12878787878787878]


In [None]:
# Inspect which classes the model actually predicts on the test set:
# take the most probable class of every row and count how often each occurs.
predictions = model.predict(final_test_data)
label_preds = [np.argmax(class_probabilities) for class_probabilities in predictions]
Counter(label_preds)

In [None]:
# Number of occurrences of each label.
# Both counters are printed explicitly: a cell only displays its last bare
# expression, so the first Counter used to be computed and silently dropped.
print('all labels: ', Counter(encoded_labels))
print('test labels:', Counter(test_labels))
#print('train labels:', Counter(train_labels))

In [15]:
# Load an existing model saved by an earlier MLflow run and predict on the
# test set.
# NOTE: this rebinds `model`, so every later cell works with the loaded model.
# The location can be overridden through the TWEETS_MODEL_PATH environment
# variable so the notebook runs on other machines; the default is the original
# author's local artifact path.
model_dir_path = os.environ.get(
    'TWEETS_MODEL_PATH',
    '/Users/divyagorantla/Documents/MIDS/w266/final-project/mlruns/0/1c04d58a35024c6e826d43bd438fbeb1/artifacts/models/model.h5',
)
new_data = final_test_data
model = load_model(model_dir_path)
predictions = model.predict(new_data)

In [16]:
# Print the architecture (layers, output shapes, parameter counts) of the loaded model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 20)          1180      
_________________________________________________________________
lstm_4 (LSTM)                (None, None, 20)          3280      
_________________________________________________________________
lstm_5 (LSTM)                (None, 20)                3280      
_________________________________________________________________
dropout_3 (Dropout)          (None, 20)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 23)                483       
Total params: 8,223
Trainable params: 8,223
Non-trainable params: 0
_________________________________________________________________


<h3>Regions-Words</h3>
<h4>Implementing the idea of finding the words for a region via nearest neighbors over the embeddings</h4>

In [17]:
# The model only has a character embedding, so every vocabulary *word* is
# spelled out as a sequence of character ids and embedded that way. The regions
# get the same treatment later, and nearest neighbours are then searched
# between regions and words.

# Word-level tokenizer, used only to collect the distinct words of the train data.
t_w = Tokenizer()
t_w.fit_on_texts(train_data)
vocab_words = list(t_w.word_counts)

# Spell every word as character ids with the character-level tokenizer `t`,
# so it can be passed to the embedding layer.
encoded_vocab_words = t.texts_to_sequences(vocab_words)
# Right-pad with zeros to a fixed width of 140.
padded_vocab_words = pad_sequences(sequences=encoded_vocab_words, maxlen=140, padding='post')
# convert to numpy array
final_vocab_words = np.array(padded_vocab_words)


In [18]:
# Prepare the region names as input for the embedding layer.
regions = list(tweets_processor.regions_mapping.keys())
# Spell every region name as a sequence of character ids.
encoded_regions = t.texts_to_sequences(regions)
# Right-pad with zeros to the same fixed width (140) used for the vocabulary words.
padded_regions_data = pad_sequences(sequences=encoded_regions, maxlen=140, padding='post')
# convert to numpy array
final_regions_data = np.array(padded_regions_data)

In [19]:
# Stack the region rows underneath the vocabulary-word rows, so the regions are
# part of the search space as well.
concatenated_vocab = np.vstack([final_vocab_words, final_regions_data])

In [20]:
# Come up with the words that belong to a region: embed everything, then look
# for each region's nearest neighbours among the vocabulary entries.

# Backend function that runs only the embedding layer
# (character ids in, one embedding vector per character out).
embedding_layer = model.layers[0]
get_embedding_layer_output = K.function([embedding_layer.input],
                                        [embedding_layer.output])

# Embed the combined vocabulary (words + regions) and, separately, the regions.
vocab_embeddings = get_embedding_layer_output([concatenated_vocab])[0]
region_embeddings = get_embedding_layer_output([final_regions_data])[0]

# Sum over the character axis so every entry becomes one vector (2-D matrix).
# NOTE(review): padded positions are part of the sum, so the embedding of id 0
# is added once per pad position — confirm this is intended.
final_vocab_embeddings = vocab_embeddings.sum(axis=1)
final_region_embeddings = region_embeddings.sum(axis=1)

# k nearest neighbours: the 1000 closest vocabulary entries for each region.
nbrs = NearestNeighbors(n_neighbors=1000, algorithm='auto', leaf_size=10)
nbrs.fit(final_vocab_embeddings)
distances, indices = nbrs.kneighbors(final_region_embeddings)

In [21]:
# (number of vocabulary words + regions, padded length)
concatenated_vocab.shape

(14046, 140)

In [22]:
# Vocabulary words followed by the region names — the same row order as the
# matrix the neighbour search was fitted on.
total_vocab = np.concatenate((np.array(vocab_words), np.array(regions)), axis=0)

# Row i of `indices` holds the neighbour positions of regions[i]; translate
# those positions back into the actual words.
regions_word = {
    region: total_vocab[neighbor_rows]
    for region, neighbor_rows in zip(regions, indices)
}
    

In [23]:
# Display the neighbour words found for every region.
# NOTE(review): this dumps 1000 words per region; consider showing a slice.
regions_word

{'albuquerque': array(['albuquerque', 'albuquerque', 'quailcreek', 'superfreq',
        'wearpurple', 'earthquake', 'acceptable', 'republican',
        'appearance', 'acceptance', 'perplexed', 'beachclub',
        'spectacular', 'appreciate', 'evacuated', 'quarterly', 'supremacy',
        'preserve', 'sequence', 'carebears', 'unacceptable', 'relevance',
        'recaptured', 'quarters', 'revamped', 'spectaclepic', 'jerusalem',
        'mealprep', 'peripheral', 'excavated', 'bluewave', 'caricature',
        'crumbles', 'makeupfeed', 'unprepared', 'embrace', 'particular',
        'lifecare', 'valuable', 'vanderpump', 'huckabee', 'bechamel',
        'subjected', 'chesapeake', 'ravenclaw', 'appliance', 'supercopa',
        'replaced', 'versace', 'bracelet', 'republic', 'preserver',
        'reappeared', 'purchases', 'revealed', 'peculiar', 'pleasure',
        'cauliflower', 'locktrumpup', 'exclusive', 'creatures', 'umbrella',
        'cupcakecafe', 'blacksburg', 'avalance', 'esəˈterik',
  

<h3>DeepDream</h3>
<h4>Train the model on our dataset and apply DeepDream to see how it transforms a sentence from a different region into a sentence of the target region</h4>
<h4>The gradient ascent and optimization logic is taken from an implementation found online; we still need to try the approach described at https://blog.keras.io/category/demo.html</h4>

In [27]:
# Index the model's layers by position: {0: first layer, 1: second layer, ...}.
layer_number = len(model.layers)
layer_dict = dict(enumerate(model.layers))
layer_dict

{0: <keras.layers.embeddings.Embedding at 0x110c254a8>,
 1: <keras.layers.recurrent.LSTM at 0x110f1a048>,
 2: <keras.layers.recurrent.LSTM at 0x110f1a780>,
 3: <keras.layers.core.Dropout at 0x110f1d898>,
 4: <keras.layers.core.Dense at 0x112a5afd0>}

In [74]:
# Set up gradient ascent on the activations of the chosen layer.
coeff = 0.01

# The optimization happens in embedding space, i.e. on the tensor that feeds
# the first LSTM (the output of the embedding layer).
lstm_input = model.layers[1].input
# Layer whose activations are driven: the second LSTM.
layer_output = layer_dict[2].output

# The loss is the NEGATED, scaled sum of squared activations, so minimizing it
# makes the activations larger.
activation_energy = K.sum(K.square(layer_output))
loss = - coeff * activation_energy
grads = K.gradients(loss, lstm_input)

# Backend function: embedded input -> [loss, gradient(s) w.r.t. that input].
f_outputs = K.function([lstm_input], [loss] + grads)

Tensor("embedding_3/embedding_lookup:0", shape=(?, ?, 20), dtype=float32)
Tensor("input_9:0", shape=(?, ?, 20), dtype=float32)


In [92]:
# Bring one sample sentence into the format expected by f_outputs.
test_input = ['hello browns, great job']

# characters -> ids, with the tokenizer fitted on the train data
encoded_test_input = t.texts_to_sequences(test_input)
print(encoded_test_input)

# right-pad with zeros to a fixed width of 140 ids
padded_test_input = pad_sequences(sequences=encoded_test_input, maxlen=140, padding='post')
# convert to numpy array
final_test_input = np.array(padded_test_input)

# ids -> embeddings, by running only the embedding layer
emb_test_input = get_embedding_layer_output([final_test_input])[0]
emb_test_input.shape

[[13, 2, 10, 10, 4, 1, 21, 9, 4, 19, 6, 8, 1, 14, 9, 2, 3, 5, 1, 24, 4, 21]]


(1, 140, 20)

In [113]:
#get loss and grads for an input
def eval_loss_and_grads(x, shape=(1, 140, 20)):
    """Evaluate the loss and its gradient for one embedded input.

    Parameters
    ----------
    x : numpy.ndarray
        Embedded input in any layout that can be reshaped to ``shape``
        (the L-BFGS optimizer passes it as a flat vector).
    shape : tuple of int, optional
        Shape handed to ``f_outputs``. Defaults to ``(1, 140, 20)``, the
        shape that used to be hard-coded here (it matches the shape of the
        embedded, 140-padded sample sentence).

    Returns
    -------
    loss_value
        First output of ``f_outputs`` (the loss).
    grad_values : numpy.ndarray
        The gradient output(s), flattened into a single 1-D float64 array,
        which is the layout ``fmin_l_bfgs_b`` works with.
    """
    x = x.reshape(shape)
    # f_outputs returns [loss, grad_1, ..., grad_k]
    outs = f_outputs([x])
    loss_value = outs[0]
    if len(outs[1:]) == 1:
        grad_values = outs[1].flatten().astype('float64')
    else:
        # several gradient tensors: stack them before flattening
        grad_values = np.array(outs[1:]).flatten().astype('float64')
    return loss_value, grad_values
# Smoke test on the embedded sample sentence: show the loss and the length of
# the flattened gradient.
loss_value, grad_values = eval_loss_and_grads(emb_test_input)
loss_value, grad_values.shape

(-0.008224455, (2800,))

In [76]:
class Evaluator(object):
    """Serve loss and gradient through two separate callables from one evaluation.

    ``loss(x)`` runs ``eval_loss_and_grads`` once and caches both results;
    the following ``grads(x)`` call hands back the cached gradient and clears
    the cache. The two methods must therefore be called alternately, loss
    first; the asserts enforce that order.
    """

    def __init__(self):
        # Nothing cached until loss() has been called.
        self.loss_value = self.grad_values = None

    def loss(self, x):
        """Compute loss and gradient for ``x``, cache both, return the loss."""
        assert self.loss_value is None
        self.loss_value, self.grad_values = eval_loss_and_grads(x)
        return self.loss_value

    def grads(self, x):
        """Return a copy of the cached gradient and reset the cache.

        ``x`` is not used; the gradient belongs to the preceding ``loss`` call.
        """
        assert self.loss_value is not None
        cached_gradient = np.copy(self.grad_values)
        self.loss_value = self.grad_values = None
        return cached_gradient

In [114]:
# Optimize the embedded input sentence with L-BFGS against the loss defined on
# the chosen layer (the 2nd LSTM). The loss is the negated activation energy,
# so minimizing it drives the activations up.
# NOTE: `test_input` is rebound here, from the raw sentence list to the
# optimized flat embedding vector.
evaluator = Evaluator()
test_input, min_val, info = fmin_l_bfgs_b(
    evaluator.loss,
    x0=emb_test_input.flatten(),
    fprime=evaluator.grads,
    maxfun=7,
)

In [115]:
# The optimizer returns a flat vector of embedding values; reshape it back to
# (1, 140, 20) to inspect it.
# Open question: how do we convert these embeddings back to characters?
# NOTE(review): one option might be a nearest-neighbour lookup of each row
# against the embedding matrix (model.layers[0].get_weights()[0]) — untested.
test_input.reshape(1,140,20) 

array([[[ 1.08948655e-01, -9.03161839e-02, -9.19215009e-02, ...,
          7.00112358e-02,  8.33693296e-02,  5.57584129e-02],
        [ 3.82137150e-02, -1.10772543e-01, -1.44731537e-01, ...,
          4.91814166e-02,  8.75500962e-02,  1.35502279e-01],
        [ 9.29472744e-02, -2.51047895e-03, -7.24119172e-02, ...,
          4.09625880e-02,  7.05432594e-02,  1.26551315e-01],
        ...,
        [-1.57210751e+00,  3.68202918e-01,  2.67040573e+00, ...,
          2.09120048e-01, -3.04160323e+00,  2.67556183e+00],
        [-2.69522671e+00,  1.33230590e-01,  3.13712547e+00, ...,
          2.74107801e-01, -4.50829865e+00,  3.11517187e+00],
        [-9.12090705e+00, -1.33422417e+00,  5.78534930e-01, ...,
          2.20612721e+00, -3.45095793e+00,  3.84095627e+00]]])

In [None]:
def get_activations(model, layer, X_batch):
    """Return the output of ``model.layers[layer]`` for the batch ``X_batch``.

    The batch is fed in at the input of the model's first layer, and the
    requested layer's output is read back through a Keras backend function.
    """
    first_layer_input = model.layers[0].input
    requested_output = model.layers[layer].output
    run_up_to_layer = K.function([first_layer_input], [requested_output,])
    # the backend function returns a list with one entry per requested output
    return run_up_to_layer([X_batch])[0]

In [None]:
# Get the activations of the second LSTM layer after optimizing the input
# sentence.
# The optimizer works in embedding space, so its result (`test_input`, a flat
# vector) has to be fed in at the first LSTM's input. `get_activations` cannot
# be used for this because it feeds `model.layers[0].input`, i.e. the embedding
# layer, which takes character ids rather than embeddings.
optimized_embeddings = test_input.reshape(emb_test_input.shape)
get_lstm_activations = K.function([model.layers[1].input],
                                  [model.layers[2].output])
activations = get_lstm_activations([optimized_embeddings])[0]

In [116]:
# Architecture recap, as a reference for the weight shapes inspected below.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 20)          1180      
_________________________________________________________________
lstm_4 (LSTM)                (None, None, 20)          3280      
_________________________________________________________________
lstm_5 (LSTM)                (None, 20)                3280      
_________________________________________________________________
dropout_3 (Dropout)          (None, 20)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 23)                483       
Total params: 8,223
Trainable params: 8,223
Non-trainable params: 0
_________________________________________________________________


In [122]:
# Shape of the embedding weight matrix (layer 0).
model.layers[0].get_weights()[0].shape # embedding weights

(59, 20)

In [124]:
# First weight array of the first LSTM layer: the input kernel.
model.layers[1].get_weights()[0].shape # kernel weights of first lstm layer

(20, 80)

In [125]:
# Second weight array of the first LSTM layer: the recurrent kernel.
model.layers[1].get_weights()[1].shape # recurrent kernel weights of first lstm layer

(20, 80)

In [126]:
# Third weight array of the first LSTM layer: the bias vector.
model.layers[1].get_weights()[2].shape # bias of first lstm layer

(80,)