In [1]:
# Bidirectional LSTM with Dropout for classifying tweets by region
import numpy as np
import pandas as pd
import tensorflow as tf
import mlflow
from keras.preprocessing import sequence
from keras.preprocessing import text
from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.utils import Sequence
from keras.layers import Input, LSTM, CuDNNLSTM, Dense, Bidirectional, BatchNormalization, Dropout, Reshape, Concatenate, Add
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from keras import backend as K
from keras import regularizers
from keras import optimizers
from keras.utils import to_categorical
import time
import datetime
# Pin NumPy's global random state so repeated runs draw the same numbers
np.random.seed(seed=7)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [22]:
# GPU memory options for the TensorFlow session that Keras runs on.
config = tf.ConfigProto()
config.gpu_options.allocator_type ='BFC'
config.gpu_options.per_process_gpu_memory_fraction = 0.90
# Hand the options to Keras; building `config` alone has no effect because
# nothing else passes it to a session.
# Run this cell BEFORE the model is built: installing a new session after
# training would leave the already-trained variables behind in the old one.
K.set_session(tf.Session(config=config))

In [2]:
# Read the labelled tweets, drop incomplete rows, cast the label to int and
# lower-case the text -- built as one chain so no frame is mutated in place.
df = (
    pd.read_csv('data/tweets_labelled_balanced.csv')
    .dropna()
    .assign(
        region=lambda frame: frame['region'].astype(int),
        text=lambda frame: frame['text'].apply(lambda tweet: tweet.lower()),
    )
)
X, y = df['text'].tolist(), df['region'].tolist()

# Per-region row counts; the largest class and its label are kept for reference.
df_counts = df.groupby('region').count()
tweets_per_region = df_counts['text']
top_category_num = tweets_per_region.max()
top_category_name = tweets_per_region.idxmax()
categories = df_counts.index.tolist()
df_counts

Unnamed: 0_level_0,text
region,Unnamed: 1_level_1
3,134968
4,134968
5,134968
7,134968
10,134968
13,134968
14,134968
15,134968
18,134968
19,134967


In [3]:
# Hyperparameters and split settings

# --- input representation ---
V = 20000                       # vocabulary size: tokenizer num_words and embedding rows
x_length = 50                   # every tweet is padded/truncated to this many tokens
embedding_vector_length = 200   # embedding width (the GloVe file loaded below is 200-d)

# --- train / test split ---
training_ratio = 0.75
training_size = int(training_ratio * len(X))

# --- network ---
num_classes = 23                # width of the one-hot labels and the softmax layer
num_layers = 2                  # NOTE(review): not referenced by the visible cells
H = 200                         # units per LSTM layer
dropout = 0.2

# --- optimisation ---
optimizer = 'rmsprop'           # NOTE(review): the training cell builds its optimizer object directly
learning_rate = 1e-3
batch_size = 32
epochs = 100

In [4]:
# Tokenise tweets into integer id sequences, pad them, then cut train/test
t = text.Tokenizer(num_words=V, lower=True)
t.fit_on_texts(X)
word_index = t.word_index
# Reverse lookup (id -> word), used later to turn sequences back into text
index_word = {token_id: token for token, token_id in word_index.items()}

X_seq = t.texts_to_sequences(X)
X_pad = sequence.pad_sequences(X_seq, maxlen=x_length)

# NOTE(review): the split is purely positional (first `training_size` rows
# train, the rest test) and no shuffle is visible before it -- confirm the
# CSV rows are already in random order.
X_train, X_test = X_pad[:training_size], X_pad[training_size:]
y_train, y_test = y[:training_size], y[training_size:]
one_hot_y_train = to_categorical(y_train, num_classes=num_classes)
one_hot_y_test = to_categorical(y_test, num_classes=num_classes)

print("Training set has {} examples, test set has {} examples".format(len(X_train), len(X_test)))

Training set has 1315937 examples, test set has 438646 examples


In [5]:
# Batch provider (keras Sequence) that feeds the model one batch at a time
class OneHotBatch(Sequence):
  """Serve `(X, y)` batches where `y` is one-hot encoded per batch.

  X_data / y_data are sliced with the same `[start:finish]` window, so they
  must be index-aligned. `V` is stored on the instance but not read by any
  method of this class.
  """

  def __init__(self, X_data, y_data, batch_size, V, num_classes):
    self.X_data, self.y_data = X_data, y_data
    self.batch_size = batch_size
    self.V = V
    self.num_classes = num_classes

  def __len__(self):
    """Number of batches; the last one may hold fewer than `batch_size` rows."""
    n_samples = len(self.X_data)
    return int(np.ceil(n_samples / float(self.batch_size)))

  def __getitem__(self, batch_id):
    """Return the `batch_id`-th slice of inputs with its one-hot labels."""
    window = slice(batch_id * self.batch_size, (batch_id + 1) * self.batch_size)
    batch_inputs = self.X_data[window]
    batch_labels = to_categorical(self.y_data[window], num_classes=self.num_classes)
    return batch_inputs, batch_labels

In [6]:
# Load GloVe embeddings into a {word: vector} dict
embeddings_index = {}
# `with` closes the file even if parsing fails part-way; the encoding is given
# explicitly so decoding does not depend on the platform default.
with open('data/glove.6B.200d.txt', encoding='utf8') as f:
    for line in f:
        values = line.split()
        if not values:
            # tolerate blank lines (e.g. a trailing newline at end of file)
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

# Row i of the matrix holds the pretrained vector of the word with tokenizer
# index i; rows for words missing from GloVe (and row 0) stay all-zeros.
embedding_matrix = np.zeros((V, embedding_vector_length))
for word, i in word_index.items():
    if i >= V:
        # Index falls outside the matrix. Skip it instead of breaking out of
        # the loop, so the result does not rely on `word_index` being iterated
        # in ascending index order.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

Found 400000 word vectors.


In [7]:
# Define Classifier Model
# token ids -> embedding (initialised from embedding_matrix) -> bidirectional
# LSTM returning the full sequence -> LSTM returning its last output ->
# dropout -> softmax over num_classes

classifier_inputs = Input(shape=(None, ))
classifier_embedding = Embedding(V, embedding_vector_length, weights=[embedding_matrix], name="classifier_embedding")
model_input = classifier_embedding(classifier_inputs)
classifier_lvl1 = Bidirectional(CuDNNLSTM(H, return_sequences=True), name="bidirectional_lstm1")
first_level = classifier_lvl1(model_input)
classifier_lvl2 = CuDNNLSTM(H, name="classifier_lstm2")
classifier_outputs = classifier_lvl2(first_level)
# Take the rate from the `dropout` hyperparameter instead of repeating the
# literal, so the parameters cell is the single place to tune it.
classifier_dropout = Dropout(dropout, name="classifier_dropout")
classifier_dense = Dense(num_classes, activation='softmax', name="classifier_dense")
classifier_outputs = classifier_dropout(classifier_outputs)
classifier_outputs = classifier_dense(classifier_outputs)

model = Model(classifier_inputs, classifier_outputs)

In [8]:
# Fit the model
start_time = time.time()

# Batch providers for the training split and the held-out split
train_generator = OneHotBatch(X_data=X_train, y_data=y_train, batch_size=batch_size, V=V, num_classes=num_classes)
validation_generator = OneHotBatch(X_data=X_test, y_data=y_test, batch_size=batch_size, V=V, num_classes=num_classes)

# Compile and train the model
# Alternative optimizer, kept for reference:
#opt = optimizers.Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False, clipvalue=.05)
opt = optimizers.RMSprop(lr=learning_rate, rho=0.9, epsilon=None, decay=0.0)

model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['acc'])
model.summary()
# Callbacks:
#  - EarlyStopping: stop after 3 epochs without a val_acc gain above min_delta,
#    then restore the best weights seen.
#  - ModelCheckpoint: keep only the best model (by val_acc) on disk.
#  - TensorBoard: per-epoch logging. NOTE(review): per the Keras docs,
#    write_grads needs histogram_freq > 0, so it is inert with histogram_freq=0.
callbacks = [EarlyStopping(monitor='val_acc', patience=3, min_delta=.03, restore_best_weights=True),
             ModelCheckpoint(filepath='models/Twitter_Classifier_FULL.h5', 
                             monitor='val_acc', save_best_only=True),
             TensorBoard(log_dir='./logs/Twitter_Classifier_FULL', histogram_freq=0, batch_size=batch_size, write_graph=False, 
                         write_grads=True, write_images=True, embeddings_freq=0, embeddings_layer_names=None, 
                         embeddings_metadata=None, embeddings_data=None, update_freq='epoch')]

# `epochs` and `batch_size` come from the parameters cell rather than being
# repeated as literals here.
model.fit_generator(generator=train_generator, callbacks=callbacks, epochs=epochs, validation_data=validation_generator)
                    #max_queue_size=10, workers=5, use_multiprocessing=True)
# Report wall-clock time for generator setup, compile and training
end_time = time.time()
run_time = datetime.timedelta(seconds=end_time-start_time)
print("Finished in {}".format(run_time))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, None)              0         
_________________________________________________________________
classifier_embedding (Embedd (None, None, 200)         4000000   
_________________________________________________________________
bidirectional_lstm1 (Bidirec (None, None, 400)         643200    
_________________________________________________________________
classifier_lstm2 (CuDNNLSTM) (None, 200)               481600    
_________________________________________________________________
classifier_dropout (Dropout) (None, 200)               0         
_________________________________________________________________
classifier_dense (Dense)     (None, 23)                4623      
Total params: 5,129,423
Trainable params: 5,129,423
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Persist the network as two files: architecture as JSON, weights as HDF5.
# (Single-file alternative: model.save('models/<model_name>.h5').)
with open('models/Classifier_full_balanced_1Bi1L_20k.json', mode='w', encoding='utf8') as f:
    architecture_json = model.to_json()
    f.write(architecture_json)

model.save_weights('models/Classifier_full_balanced_1Bi1L_rms_weights.h5')

In [17]:
# Smoke test: run two hand-written tweets through the same tokenise -> pad
# pipeline as the training data and show the most probable class id for each.
tweets = [
    "if you're looking for work in va, check out this #job: #hiring #careerarc",
    "i'm at cassell’s burgers in los angeles, ca",
]
test_sequence = t.texts_to_sequences(tweets)
test_padded = sequence.pad_sequences(test_sequence, maxlen=x_length)
test_prediction_probs = model.predict_on_batch(test_padded)
test_prediction_probs.argmax(axis=1)

array([22, 10])

In [12]:
# Raw tweet strings of the held-out split (same positional cut as X_test)
X_test_tweets = X[training_size:]
Xt = X_test

# Class probabilities for every held-out tweet, then the winning class id
validation_generator = OneHotBatch(X_data=X_test, y_data=y_test, batch_size=batch_size, V=V, num_classes=num_classes)
prediction_probs = model.predict_generator(validation_generator)
predictions = prediction_probs.argmax(axis=1)

In [13]:
# Distinct class ids that the model predicted at least once, in ascending order
predicted_regions = sorted(set(predictions.tolist()))
predicted_regions

[0, 3, 4, 5, 7, 10, 13, 14, 15, 18, 19, 20, 21, 22]

In [14]:
def sequence_to_text(tokenizer, array):
    """Turn a padded sequence of token ids back into a space-joined string.

    Ids that are not greater than 0 (the padding) are dropped. The lookup goes
    through the module-level `index_word` dict; the `tokenizer` argument is
    accepted but not used by the body.
    """
    words = (index_word[token_id] for token_id in array if token_id > 0)
    return " ".join(words)

print(sequence_to_text(t, X_train[0]))

we got the earth in the blunt


In [15]:
# For every region the model predicted, keep the single held-out tweet that
# received the highest winning probability for that region.
regions = ["albuquerque", "billings", "calgary", "charlotte", "chicago", "cincinnati", "denver", "houston", "kansas city",
           "las vegas", "los angeles", "minneapolis", "montreal", "nashville", "new york", "oklahoma city", "phoenix",
           "pittsburgh", "san francisco", "seattle", "tampa", "toronto", "washington"]

# One empty record per predicted region, keyed by region name
best_tweets = {
    regions[region_id]: {'tweet': '', 'prob': 0, 'index': 0}
    for region_id in predicted_regions
}

for row_idx, row_probs in enumerate(prediction_probs):
    winner_id = np.argmax(row_probs)
    winner_score = row_probs[winner_id]
    record = best_tweets[regions[winner_id]]
    # strict '>' keeps the earliest tweet when two scores tie
    if winner_score > record['prob']:
        record.update(prob=winner_score,
                      tweet=sequence_to_text(t, Xt[row_idx]),
                      index=row_idx)

In [16]:
# Widen text columns so long tweets are not cut off, then show one row per region
pd.set_option('display.max_colwidth', 200)
df_toptweets = pd.DataFrame(best_tweets).transpose()
df_toptweets

Unnamed: 0,index,prob,tweet
albuquerque,72883,0.155828,ulta beauty is hiring our newest professional in carsoncity nv we would love to connect with you if interested click here to learn more beauty advisor seasonal hiring ultabeauty cosmetology job jo...
charlotte,154751,0.997735,your talent drives your legacy at construction apply today conway sc hiring construction construction charleston sc job jobs careerarc
chicago,337882,0.99812,can you recommend anyone for this job software support engineer engineering chicago il veterans hiring careerarc
cincinnati,252895,0.997226,connect with us to help change the world if you’re ready to make an impact we’d love to hear from you please submit your for our newest open role in cincinnati oh site controller cincinnati oh fin...
houston,159792,0.994945,want to work at performance food group we're hiring in shreveport la click for details driver cdl transportation job jobs
los angeles,204404,0.99131,i love adore ed thank you the honor last night sir xo ace hotel downtown los angeles
nashville,227855,0.996379,at regions our mission is to make life better for our associates our communities and our customers come join our mission as our newest financial relationship consultant greater memphis tn area tod...
new york,192478,0.998271,downtown brooklyn tonight \r newyorkcity nyc brooklyn downtown brooklyn
oklahoma city,30615,0.992344,see our latest wichita ks job and click to apply hr assistant wichita ks hr hiring careerarc
san francisco,64656,0.993493,want to work at waste management we're hiring in reno nv click for details hr job jobs careerarc
