In [1]:
import tensorflow as tf
from keras.models import load_model
import csv
import pickle
from sklearn.metrics import (brier_score_loss, precision_score, recall_score,
                             f1_score)
from sklearn.metrics import confusion_matrix
import numpy as np
# custom functions
%load_ext autoreload
%autoreload 2
import util_helpers

Using TensorFlow backend.


## Read tokenizer and  Models

In [56]:
with open("keras_tokenizer.pickle", "rb") as f:
    tokenizer = pickle.load(f)

cnn_model = load_model('cnn_model.h5')
lstm_att_model = load_model('lstm_model.h5')

## Read File

In [3]:
with open('../labeling_tweets/Data/batch1_to_12.csv', 'rb') as csvfile:
    twitter_flu_reader = csv.reader(csvfile)
    twitter_flu_list = list(twitter_flu_reader) 
tweets= [row[1] for row in twitter_flu_list]
labels = [row[2] for row in twitter_flu_list]


In [4]:
n_labels = np.where(np.array(labels) =='Y', 1,0)

In [5]:
#labels[0:100]

## Preprocess Tweets

In [6]:
MAX_SEQ_LENGTH = 35
tweets_data = util_helpers.preprocess_tweets(tweets, tokenizer,MAX_SEQ_LENGTH)

## CNN Predict

In [36]:
pred_probabilities = cnn_model.predict(tweets_data, batch_size=50).flatten()
with open("cnn_pred_probabilities.pickle", "w") as f:
    pickle.dump(pred_probabilities, f)

In [37]:
util_helpers.show_scores(n_labels, pred_probabilities)

F1 scores
threshold   |  tn   |  fp  |   fn   |  tp   | f1 score                  | recall  |precision|
-----------------------------------------------------------------------------------------------
0.050:       3548    2640      19     768    ( 0.727 ,  0.366 ) = 0.619   0.976     0.225  
-----------------------------------------------------------------------------------------------
0.100:       4711    1477      30     757    ( 0.862 ,  0.501 ) = 0.784   0.962     0.339  
-----------------------------------------------------------------------------------------------
0.150:       5054    1134      33     754    ( 0.896 ,  0.564 ) = 0.833   0.958     0.399  
-----------------------------------------------------------------------------------------------
0.200:       5235     953      36     751    ( 0.914 ,  0.603 ) = 0.858   0.954     0.441  
-----------------------------------------------------------------------------------------------
0.250:       5353     835      37     750    ( 0

In [38]:
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(n_labels, n_pred_labels, pos_label = 1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc

0.92382621116083996

### Predict one tweet samples

In [39]:
threshold = 0.8

In [47]:
def predict_one_tweet(tweet_text,MAX_SEQ_LENGTH, model, threshold):
    tweets_data = util_helpers.preprocess_tweets([tweet_text], tokenizer,MAX_SEQ_LENGTH)
    pred_probabilities = model.predict(tweets_data, batch_size=50).flatten()
    pred_labels = np.where(np.array(pred_probabilities) > threshold, 'Y', 'N')
    return pred_labels[0]    

In [48]:
# true label = Y
tweet = "you can get Guillain barre syndrome from the flu shot (aka you're paralyzed for months  including respiratory)"
predict_one_tweet(tweet,MAX_SEQ_LENGTH,cnn_model, threshold)

'Y'

In [49]:
# true label = N
tweet = "Pam made me go to Kroger and get my flu shot for fuel points..."
predict_one_tweet(tweet,MAX_SEQ_LENGTH,cnn_model, threshold)

'N'

## LSTM Predict

In [57]:
pred_probabilities = lstm_att_model.predict(tweets_data, batch_size=50).flatten()
with open("lst_att_pred_probabilities.pickle", "w") as f:
    pickle.dump(pred_probabilities, f)

In [58]:
util_helpers.show_scores(n_labels, pred_probabilities)

F1 scores
threshold   |  tn   |  fp  |   fn   |  tp   | f1 score                  | recall  |precision|
-----------------------------------------------------------------------------------------------
0.050:        946    5242       0     787    ( 0.265 ,  0.231 ) = 0.248   1.000     0.131  
-----------------------------------------------------------------------------------------------
0.100:       4182    2006      20     767    ( 0.805 ,  0.431 ) = 0.710   0.975     0.277  
-----------------------------------------------------------------------------------------------
0.150:       4800    1388      31     756    ( 0.871 ,  0.516 ) = 0.797   0.961     0.353  
-----------------------------------------------------------------------------------------------
0.200:       5069    1119      36     751    ( 0.898 ,  0.565 ) = 0.834   0.954     0.402  
-----------------------------------------------------------------------------------------------
0.250:       5241     947      39     748    ( 0

In [59]:
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(n_labels, n_pred_labels, pos_label = 1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc

0.92382621116083996

In [60]:
# true label = Y
tweet = "you can get Guillain barre syndrome from the flu shot (aka you're paralyzed for months  including respiratory)"
predict_one_tweet(tweet,MAX_SEQ_LENGTH,lstm_att_model, threshold)

'Y'

In [61]:
# true label = N
tweet = "Pam made me go to Kroger and get my flu shot for fuel points..."
predict_one_tweet(tweet,MAX_SEQ_LENGTH,lstm_att_model, threshold)

'N'