In [30]:
import tweets_processor
import mlflow
import keras
import numpy as np
import talos as ta
import mlflow.keras
import importlib
import os
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import LSTM
from sklearn.model_selection import train_test_split
from collections import Counter
from keras import backend as K
from sklearn.neighbors import NearestNeighbors
from keras.layers import Input
from scipy.optimize import fmin_l_bfgs_b
from keras.optimizers import Adam, Nadam
from keras.activations import softmax, relu, tanh
from keras.losses import categorical_crossentropy, logcosh
from keras.initializers import Constant

In [19]:
importlib.reload(tweets_processor)
# load the data
# get the tweets and the region labels from csv file
tweets_text, tweets_regions = tweets_processor.get_tweets_from_csv()

In [20]:
# preprocess the tweets
processed_tweets = []
for tweet in tweets_text:
    processed_tweet = tweets_processor.preprocessor(tweet)
    processed_tweets.append(processed_tweet)

In [21]:
# tokenize
# create the tokenizer at word level
t = Tokenizer()
t.fit_on_texts(processed_tweets)

In [22]:
# get the vocab size
vocab = list(t.word_counts.keys())
vocab_size = len(t.word_counts) + 1
vocab_ids = list(t.word_index.values())
word_index = t.word_index

In [23]:
# convert the tweets to sequence of id's
encoded_tweets = t.texts_to_sequences(processed_tweets)

# make inputs of same length by using pad_sequences
padded_tweets = pad_sequences(encoded_tweets,padding='post',maxlen=140)

In [24]:
# convert labels to categorical
categorical_labels = keras.utils.to_categorical(tweets_regions, num_classes=23)

In [25]:
# split the data into train and test
train_data, test_data, train_labels, test_labels = train_test_split(padded_tweets, categorical_labels, test_size=0.33, random_state=0)


In [26]:
# first, build index mapping words in the embeddings set
# to their embedding vector

print('Indexing word vectors.')

embeddings_index = {}
with open(os.path.join('./glove.6B', 'glove.6B.100d.txt')) as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs


Indexing word vectors.


In [27]:
# prepare embedding matrix
embedding_matrix = np.zeros((len(word_index) + 1, 100))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector


In [28]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.22115   ,  0.027844  , -0.053894  , ...,  0.52428001,
         0.67877001,  0.35839   ],
       [-0.2687    ,  0.81708002,  0.69896001, ..., -0.40110001,
         0.74656999,  0.31121999],
       ...,
       [-0.1116    ,  0.22662   ,  0.16662   , ..., -0.10404   ,
        -0.071575  ,  0.025429  ],
       [-0.35554001, -0.39142999, -0.22046   , ...,  0.65447998,
        -0.38712001, -0.019705  ],
       [ 0.055871  ,  0.18127   , -0.56800002, ..., -0.28742   ,
        -0.18173   , -0.33645001]])

In [31]:
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(len(word_index) + 1,
                            100,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=140,
                            trainable=False)

In [36]:
# putting the model in a function so we can pass params for hyperparameter tuning
def lstm_model(x_train,y_train,x_val,y_val,params):
    model = Sequential()
    with mlflow.start_run():
        
        model.add(embedding_layer) 
        model.add(LSTM(params['first_lstm_units'],activation=params['first_lstm_activation'],return_sequences=True))
        model.add(LSTM(params['second_lstm_units'],activation=params['second_lstm_activation']))
        model.add(Dropout(0.2))
        num_regions = 23 # our set of regions
        model.add(Dense(num_regions, activation='softmax'))

        model.compile(loss=params['losses'],
                     optimizer='adam',
                     metrics=['accuracy'])

        out = model.fit(x_train, y_train, batch_size=params['batch_size'], epochs=params['epochs'], validation_data=[x_val, y_val])

        #mlflow logs
        mlflow.log_param("embedding_features", params['embedding_features'])
        mlflow.log_param("first_lstm_units", params['first_lstm_units'])
        mlflow.log_param("second_lstm_units", params['second_lstm_units'])
        mlflow.log_param("first_lstm_activation", params['first_lstm_activation'])
        mlflow.log_param("second_lstm_activation", params['second_lstm_activation'])
        mlflow.log_param("dropout", params['dropout'])
        mlflow.log_param("batch_size", params['batch_size'])
        mlflow.log_param("epochs", params['epochs'])
        mlflow.log_param("loss", params['losses'])
        mlflow.log_param("optimizer", params['optimizer'])
        #mlflow.log_param("char", 'characterlevel with 2lstms')
        #mlflow.log_metric("evaluation_loss", score[0])
        #mlflow.log_metric("evaluation_accuracy", score[1])
        #mlflow.log_param("activation", activation) # default for now tanh

        #mlflow.keras.log_model(model, "models_lstm")
        
    return out, model

In [39]:
# hyperparameter tuning params
p = {'first_lstm_units':[32,64],
     'second_lstm_units':[32,64],
     'batch_size': [30,20],
     'epochs': [10,5],
     'optimizer': [Adam],
     'losses': [categorical_crossentropy],
     'first_lstm_activation':[relu],
     'second_lstm_activation':[relu],
     'last_activation': [softmax]}

In [40]:
# talos scan
h = ta.Scan(train_data, train_labels,
          params=p,
          dataset_name='first_test',
          experiment_no='1',
          model=lstm_model,
          grid_downsample=0.1)

  0%|          | 0/1 [00:00<?, ?it/s]

Train on 18760 samples, validate on 8040 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


KeyError: 'embedding_features'