In [13]:
import numpy as np  
import pandas as pd
from collections import Counter
import mlflow
from keras.preprocessing import sequence
from keras.preprocessing import text
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from keras.models import Sequential
from keras.layers import LSTM, Dense, Bidirectional, BatchNormalization, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from keras.layers.embeddings import Embedding
from keras import optimizers
from keras.utils import to_categorical
import time
import datetime
# fix random seed for reproducibility
np.random.seed(7)

In [14]:
df = pd.read_csv('/var/data/tweets_labelled_40k.csv')
df.dropna(inplace=True)
df.region = df.region.astype(int)
df['text'] = df['text'].apply(lambda x:x.lower())
X = df['text'].tolist()
y = df['region'].tolist()
df_counts = df.groupby('region').count()
top_category_num = max(df_counts['text'])
top_category_name = df_counts[df_counts['text']==max(df_counts['text'])].index[0]
categories = df_counts.index.tolist()
df_counts

Unnamed: 0_level_0,text
region,Unnamed: 1_level_1
0,517
1,176
2,44
3,2387
4,2202
5,1726
6,624
7,3760
8,1034
9,699


In [49]:
# Dumb Baseline
print("Baseline accuracy:  If we just guessed '{}' every time we would have accuracy of {:.2f}%"
      .format(top_category_name, (top_category_num/df.shape[0])*100))

Baseline accuracy:  If we just guessed '14' every time we would have accuracy of 13.05%


In [55]:
# Get average length of each tweet
df['text'].apply(lambda x:len(x)).mean()

87.619408

In [15]:
# Set Parameters
x_length = 200
training_ratio = .75
training_size = int(len(X)*training_ratio)
num_classes = 23
embedding_vector_length = 100
num_unique_symbols = 500
num_layers = 3
H = 200
epochs = 100
optimizer = 'rmsprop'
batch_size = 128
learning_rate = .0001
dropout = 0.2

In [5]:
# Set ML Flow parameters and start the run
mlflow.set_experiment('Twitter 40k v2')
mlflow.start_run()
mlflow.log_param('learning_rate', learning_rate)
mlflow.log_param('num_unique_symbols', num_unique_symbols)
mlflow.log_param('number_of_layers', num_layers)
mlflow.log_param('x_length', x_length)
mlflow.log_param('embedding_vector', embedding_vector_length)
mlflow.log_param('H', H)
mlflow.log_param('optimizer', optimizer)
mlflow.log_param('dropout', dropout)
mlflow.log_param('epochs', epochs)
mlflow.log_param('batch_size', batch_size)
mlflow.log_param('train_size', training_size)
mlflow.log_param('test_size', len(y)-training_size)

In [16]:
t = text.Tokenizer(
    char_level=True,
    filters=None,
    lower=True,
    num_words=num_unique_symbols-1,
    oov_token='unk'
)

t.fit_on_texts(X)
X_seq = t.texts_to_sequences(X)
X_padded = sequence.pad_sequences(X_seq, maxlen=x_length)
X_train = X_padded[:training_size]
X_test = X_padded[training_size:]
y_train = y[:training_size]
y_test = y[training_size:]
one_hot_y_train = to_categorical(y_train, num_classes=num_classes)
one_hot_y_test = to_categorical(y_test, num_classes=num_classes)

#one_hot_x_train = to_categorical(X_train, num_classes=num_unique_symbols)

print("Training set has {} examples, test set has {} examples".format(len(X_train), len(X_test)))

Training set has 30000 examples, test set has 10000 examples


In [17]:
from keras.utils import Sequence

class OneHotBatch(Sequence):
  def __init__(self, X_data, y_data, batch_size, num_chars, num_classes):
    self.X_data = X_data
    self.y_data = y_data
    self.batch_size = batch_size
    self.num_chars = num_chars
    self.num_classes = num_classes

  def __len__(self):
     return int(np.ceil(len(self.X_data) / float(self.batch_size)))

  def __getitem__(self, batch_id):
    start = batch_id * self.batch_size
    finish = start + self.batch_size
    X = to_categorical(self.X_data[start:finish], num_classes=self.num_chars)
    y = to_categorical(self.y_data[start:finish], num_classes=self.num_classes)
    return X, y

In [20]:
# Generators
train_generator = OneHotBatch(X_train, y_train, batch_size=batch_size, num_chars=num_unique_symbols, num_classes=num_classes)
validation_generator = OneHotBatch(X_test, y_test, batch_size=batch_size, num_chars=num_unique_symbols, num_classes=num_classes)

# Build and run the model using an Embedding
start_time = time.time()
model = Sequential()
if num_layers > 1:
    model.add(LSTM(H, return_sequences=True, input_shape=(x_length, num_unique_symbols)))
    for m in range(1, num_layers-1):
        model.add(LSTM(H, return_sequences=True))
    model.add(LSTM(H))
else:
    model.add(LSTM(H, input_shape=(x_length, num_unique_symbols)))
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation='softmax'))
model.compile(optimizer=optimizer,
              loss='categorical_crossentropy',
              metrics=['accuracy'])
print(model.summary())
callbacks = [EarlyStopping(monitor='val_loss', patience=2),
             ModelCheckpoint(filepath='/var/models/twitter_40k_charlevel_lstm_onehot_chk.h5', monitor='val_loss', save_best_only=True)]

history = model.fit_generator(generator=train_generator, epochs=epochs, callbacks=callbacks, 
                              validation_data=validation_generator, max_queue_size=10,
                              workers=5, use_multiprocessing=True)
# Final evaluation of the model
end_time = time.time()
run_time = datetime.timedelta(seconds=end_time-start_time)
scores = model.evaluate_generator(generator=validation_generator, verbose=1)
print("Accuracy: %.2f%%" % (scores[1]*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_16 (LSTM)               (None, 200, 200)          560800    
_________________________________________________________________
lstm_17 (LSTM)               (None, 200, 200)          320800    
_________________________________________________________________
lstm_18 (LSTM)               (None, 200)               320800    
_________________________________________________________________
dropout_2 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 23)                4623      
Total params: 1,207,023
Trainable params: 1,207,023
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 1

In [21]:
model_name = 'twitter_40k_charlevel_lstm_onehot_3layer'
mlflow.log_param('model_name', model_name)
mlflow.log_param('notes', 'No multiprocessing or shuffle')
mlflow.log_param('run_time', run_time)
mlflow.log_metric('accuracy', scores[1]*100)
mlflow.end_run()
model.save('/var/models/{}.h5'.format(model_name))

In [37]:
# Load Trained Model
from keras.models import load_model
model = load_model('/var/models/twitter_40k_charlevel_lstm_onehot.h5')

In [35]:
sample_tweet = ["i'm at cassell’s burgers in los angeles, ca"]

test_sequence = t.texts_to_sequences(sample_tweet)
test_padded = sequence.pad_sequences(test_sequence, maxlen=x_length)
test_onehot = to_categorical(test_padded, num_classes=num_unique_symbols)
test_prediction_probs = model.predict_on_batch(test_onehot)
np.argmax(test_prediction_probs, axis=1)

array([10])

In [1]:
test_padded[0]

NameError: name 'test_padded' is not defined

In [32]:
X_test_tweets = X[training_size:]

Xt = X_test
Xt_onehot = to_categorical(Xt, num_classes=num_unique_symbols)
prediction_probs = model.predict_on_batch(Xt_onehot)
predictions = np.argmax(prediction_probs, axis=1)

In [49]:
predicted_regions = np.unique(predictions).tolist()
predicted_regions

[0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]

In [36]:
def sequence_to_text(tokenizer, array):
    index_word = {v: k for k, v in tokenizer.word_index.items()} # map back
    return_tweet = []
    for i in array:
        if i != 0:
            return_tweet.append(index_word[i])
    return ''.join(return_tweet)

print(sequence_to_text(t, test_padded[0]))

i'm at cassell’s burgers in los angeles, ca


In [47]:
y[training_size:training_size+100][4]

14

In [51]:
# For each predicted region, find the tweet that the model is MOST confident belongs
regions = ["albuquerque", "billings", "calgary", "charlotte", "chicago", "cincinnati", "denver", "houston", "kansas city",
           "las vegas", "los angeles", "minneapolis", "montreal", "nashville", "new york", "oklahoma city", "phoenix",
           "pittsburgh", "san francisco", "seattle", "tampa", "toronto", "washington"]
best_tweets = dict()

for region in predicted_regions:
    best_tweets[regions[region]] = {'tweet': '', 'prob': 0, 'index': 0}

for i in range(len(prediction_probs)):
    top_region_int = np.argmax(prediction_probs[i])
    top_region = regions[top_region_int]
    top_score = prediction_probs[i][top_region_int]
    if top_score > best_tweets[top_region]['prob']:
        best_tweets[top_region]['prob'] = round(100*top_score, 2)
        best_tweets[top_region]['tweet'] = sequence_to_text(t, Xt[i])
        best_tweets[top_region]['index'] = i

In [52]:
pd.options.display.max_colwidth = 200
df_toptweets = pd.DataFrame.from_dict(best_tweets).T
df_toptweets

Unnamed: 0,index,prob,tweet
albuquerque,126,87.64,"this #job might be a great fit for you: barista/café server - temporary - #barista #lascruces, nm #hiring #careerarc"
billings,257,52.42,"we're #hiring! click to apply: travel labor and delivery registered nurse - #nursing #polson, mt #job #jobs #careerarc"
charlotte,47,13.23,"i can do better, i gotta stick to the plan 💯"
chicago,0,14.92,i'm sure you'll think this is funny. hilarious. i am still dying here laughing. and no offense to stevie wonder; he would laugh... #mirth #chuckle #smile #hilarious
cincinnati,696,7.4,brockton goes up 2-1 with a 25-21 win in the third set. rocketeers need to win then next set to keep their season alive. #hockomock
denver,40,98.07,"see our latest #fortlupton, co #job and click to apply: contract pharmacist - #pharmaceutical #hiring #careerarc"
houston,1,11.83,coumting down the days till i see my brother
kansas city,41,28.48,"this #job might be a great fit for you: merchandiser intern - #omaha, ne #hiring"
las vegas,23,13.8,students using #teamwork building houses out of cards! #workready2018
los angeles,4,84.47,"i'm at cassell’s burgers in los angeles, ca"


In [None]:
regions_mapping = {
    "albuquerque":0,
    "billings":1,
    "calgary":2,
    "charlotte":3,
    "chicago":4,
    "cincinnati":5,
    "denver":6,
    "houston":7,
    "kansas city":8,
    "las vegas":9,
    "los angeles":10,
    "minneapolis":11,
    "montreal":12,
    "nashville":13,
    "new york":14,
    "oklahoma city":15,
    "phoenix":16,
    "pittsburgh":17,
    "san francisco":18,
    "seattle":19,
    "tampa":20,
    "toronto":21,
    "washington":22
}

regions = ["albuquerque", "billings", "calgary", "charlotte", "chicago", "cincinnati", "denver", "houston", "kansas city",
           "las vegas", "los angeles", "minneapolis", "montreal", "nashville", "new york", "oklahoma city", "phoenix",
           "pittsburgh", "san francisco", "seattle", "tampa", "toronto", "washington"]