# NLP
When you see NLP, you should think of Text and Speech\

Text -> turn into numbers -> build a model -> train the model to find patterns -> use patterns (make predictions)\



In [78]:
# Get data: extract the downloaded archive into the current working directory
import zipfile

# Unzip the downloaded file. The context manager closes the archive even if
# extraction raises, so the file handle is never leaked.
with zipfile.ZipFile("nlp-getting-started.zip", "r") as zip_ref:
    zip_ref.extractall()

In [79]:
# Load the .csv files into pandas DataFrames and preview the training data
import pandas as pd
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [80]:
# Shuffle training dataframe
# frac=1 samples every row, i.e. returns the whole frame in random order;
# the fixed random_state makes the shuffle reproducible.
train_df_shuffled = train_df.sample(frac=1, random_state=42) # shuffle with random_state=42
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [81]:
# test data doesn't have a target ( we want to try to predict this)
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [82]:
# How many examples of each class?
train_df.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [83]:
# How many samples total?
print(f"Total training samples: {len(train_df)}")
print(f"Total test samples: {len(test_df)}")
# Combined size of both splits, labelled as such instead of repeating "training"
print(f"Total samples: {len(train_df) + len(test_df)}")

Total training samples: 7613
Total test samples: 3263
Total training samples: 10876


In [84]:
# Show five consecutive rows of the shuffled training frame, starting at a random offset
import random
random_index = random.randint(0, len(train_df)-5)  # latest start that still leaves 5 rows
for _, text, target in train_df_shuffled[["text", "target"]][random_index:random_index+5].itertuples():
    label_note = "(real disaster)" if target > 0 else "(not real disaster)"
    print(f"Target: {target}", label_note)
    print(f"Text:\n{text}\n")
    print("---\n")
    

Target: 0 (not real disaster)
Text:
@EMILY4EVEREVER haha it's alright..but more than twice is just stupid ;) he's traumatised ????

---

Target: 0 (not real disaster)
Text:
'Crash Test' Trailer: Paul Scheer &amp; Rob Huebel's Comedy Special Recorded on a ... http://t.co/flSa8mlDSn

---

Target: 1 (real disaster)
Text:
A Pyrotechnic Artwork by Cai Guo-Qiang Explodes into a Blossom on the Steps of the Philadelphia... http://t.co/orOvZFsKU2

---

Target: 0 (not real disaster)
Text:
'The day you learn the importance of emergency exits is the day your heartbeat stops sounding familiar.'

---

Target: 1 (real disaster)
Text:
Pak Army Helicopter crashed in Mansehra.

---



### Split data into training and validation sets

In [85]:
from sklearn.model_selection import train_test_split

# Hold out part of the shuffled training data as a validation set
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    train_df_shuffled["text"].to_numpy(),
    train_df_shuffled["target"].to_numpy(),
    test_size=0.1,  # dedicate 10% of samples to the validation set
    random_state=42,  # fixed seed so the split is reproducible
)

In [86]:
# check the lengths
len(train_sentences), len(train_labels), len(val_sentences), len(val_labels)

(6851, 6851, 762, 762)

In [87]:
# View the first 10 training sentences and their labels
train_sentences[:10], train_labels[:10]

(array(['@mogacola @zamtriossu i screamed after hitting tweet',
        'Imagine getting flattened by Kurt Zouma',
        '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
        "@shakjn @C7 @Magnums im shaking in fear he's gonna hack the planet",
        'Somehow find you and I collide http://t.co/Ee8RpOahPk',
        '@EvaHanderek @MarleyKnysh great times until the bus driver held us hostage in the mall parking lot lmfao',
        'destroy the free fandom honestly',
        'Weapons stolen from National Guard Armory in New Albany still missing #Gunsense http://t.co/lKNU8902JE',
        '@wfaaweather Pete when will the heat wave pass? Is it really going to be mid month? Frisco Boy Scouts have a canoe trip in Okla.',
        'Patient-reported outcomes in long-term survivors of metastatic colorectal cancer - British Journal of Surgery http://t.co/5Yl4DC1Tqt'],
       dtype=object),
 array([0,

## Converting text into numbers

In NLP there are two concepts for turning text into numbers\
<b>Tokenization</b>\
A straight mapping from word or sentence to a numerical value\
<b>Embeddings</b>\
Representation of natural language which can be learned. Representation comes in the form of a feature vector.

In [88]:
import tensorflow as tf

# TextVectorization layer with its main parameters spelled out explicitly
text_vectorizer = tf.keras.layers.TextVectorization(max_tokens=None, # how many words in the vocabulary (None = no cap)
standardize="lower_and_strip_punctuation", # how to process text
split="whitespace", # how to split tokens
ngrams=None, # create groups of n-words
output_mode="int", # how to map tokens to numbers
output_sequence_length=None) # how long should the output sequence of tokens be?

In [89]:
# Average number of whitespace-separated tokens per training tweet, rounded to an integer
round(sum(len(sentence.split()) for sentence in train_sentences) / len(train_sentences))

15

In [90]:
# Setup another text vectorization with custom variables
# (rebinds text_vectorizer, replacing the layer created earlier)
max_vocab_length = 10000 # max number of words to have in our vocabulary
max_length = 15 # max length our sequences will be (the rounded average tweet length found above)

text_vectorizer =  tf.keras.layers.TextVectorization(max_tokens=max_vocab_length,
output_mode="int",
output_sequence_length=max_length)

In [91]:
# Fit the text vectorizer to the training text
text_vectorizer.adapt(train_sentences)

In [92]:
# Create sample sentence and tokenize it
sample_sentence = "do you know this is sample sentence?"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[  68,   12,  106,   19,    9, 8839,    1,    0,    0,    0,    0,
           0,    0,    0,    0]], dtype=int64)>

Notice that `output_sequence_length` is 15, so no matter how long the sequence we pass to `text_vectorizer` is, it always returns a sequence of length 15 (shorter inputs are padded with 0, longer inputs are truncated).

In [93]:
# Choose a random training sentence and show its vectorized form
random_sentence = random.choice(train_sentences)
# Single-line f-string: a backslash continuation inside the literal would embed
# the next source line's indentation in the printed text.
print(f"Original text:\n{random_sentence}\n\nVectorized version:")
text_vectorizer([random_sentence])

Original text:
11-Year-Old Boy Charged With Manslaughter of Toddler: Report: An 11-year-old boy has been charged with manslaughter over the fatal sh...    

Vectorized version:


<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[752, 291, 333,  14, 702,   6, 803, 329,  39, 752, 291,  41,  59,
        333,  14]], dtype=int64)>

In [94]:
# Inspect the vocabulary held by the text vectorizer
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words, bottom_5_words = words_in_vocab[:5], words_in_vocab[-5:]  # most common / least common tokens
print(f"Number of words in vocab: {len(words_in_vocab)}")
print(f"Top 5 most common words: {top_5_words}")
print(f"Bottom 5 least common words: {bottom_5_words}")

Number of words in vocab: 10000
Top 5 most common words: ['', '[UNK]', 'the', 'a', 'in']
Bottom 5 least common words: ['pages', 'paeds', 'pads', 'padres', 'paddytomlinson1']


### Creating an Embedding using an Embedding Layer
The powerful thing about an embedding is that it can be learned during training, whereas tokenization is static and cannot be learned.\
<b>Input_dim</b> - The size of the vocabulary\
<b>Output_dim</b> - The size of the output embedding vector\
<b>Embeddings_initializer </b>- How to initialize the embedding matrix\
<b>Input_length</b> - Length of sequences being passed to embedding layer\

In [95]:
tf.random.set_seed(42)
from tensorflow.keras import layers

embedding = layers.Embedding(input_dim=max_vocab_length, # size of the vocabulary
                             output_dim=128, # set size of embedding vector
                             embeddings_initializer="uniform", # initialize randomly
                             input_length=max_length, # how long is each input
                             name="embedding_1")
embedding

<keras.layers.embeddings.Embedding at 0x112e5d6a6d0>

We just created a TensorFlow embedding layer. We can use it as part of a model, meaning its parameters (the word representations) can be updated and improved as the model learns.

In [96]:
# Get a random sentence from training set
random_sentence = random.choice(train_sentences)
# Single-line f-string: a backslash continuation inside the literal would embed
# the next source line's indentation in the printed text.
print(f"Original text:\n{random_sentence}\n\nEmbedded version:")

# Embed the random sentence (turn it into numerical representation)
sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

Original text:
@Glosblue66 no idea what this means. Look at our violent crime rate without weapons. Ban guns we become like Mexico not Australia    

Embedded version:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[ 0.03977952, -0.03782602, -0.03646283, ...,  0.00236253,
          0.03332629,  0.02803668],
        [-0.04811442,  0.04142534,  0.00805675, ..., -0.04650271,
         -0.03631166, -0.00753704],
        [-0.0328971 , -0.0237336 , -0.02128513, ..., -0.0004343 ,
         -0.01683635,  0.04333824],
        ...,
        [-0.04504747, -0.00579112, -0.03567293, ...,  0.04295422,
          0.00457615,  0.02769175],
        [-0.0078036 , -0.00991828,  0.03339667, ..., -0.02256166,
         -0.04323708,  0.01996011],
        [-0.02993214,  0.01396969,  0.03798832, ...,  0.0009527 ,
          0.00238551,  0.02462022]]], dtype=float32)>

In [97]:
# Check out a single token's embedding
sample_embed[0][0]

<tf.Tensor: shape=(128,), dtype=float32, numpy=
array([ 0.03977952, -0.03782602, -0.03646283, -0.02449075, -0.00015752,
        0.02220254,  0.00162981,  0.00603487,  0.0085157 , -0.02620113,
        0.04101599,  0.03715892,  0.02397566,  0.00281113, -0.02704906,
       -0.04870148,  0.01457943,  0.0059551 , -0.02334484,  0.03581132,
        0.04377897,  0.04186075,  0.03245703, -0.045092  ,  0.04260418,
        0.03398135, -0.01812425, -0.03539513,  0.02954218,  0.02556742,
       -0.03345481,  0.04272738, -0.00798845, -0.0406163 , -0.00644834,
        0.00232404,  0.01703629,  0.03645121, -0.02622857,  0.03498118,
       -0.03059715,  0.02576998, -0.04221511,  0.02654583, -0.02192564,
       -0.0346157 ,  0.00075326,  0.01427345,  0.01027539, -0.04311384,
       -0.03973336, -0.00966626,  0.01032177, -0.04011822, -0.018892  ,
       -0.01233201,  0.02721632, -0.01232889, -0.02504088, -0.04715574,
        0.00558523, -0.00801403,  0.03058865, -0.01923352, -0.04175536,
       -0.036542

This is what our computer sees each word as. When our model looks for patterns in different samples, these values will be updated as necessary.

## Modeling a text dataset

We will try out a list of models and see which one performs best:\
Naive Bayes\
Feed-Forward neural network\
LSTM Model\
GRU model\
Bidirectional-LSTM Model\
1D Convolutional Neural Network\
TensorFlow Hub Pretrained Feature Extractor\


In [98]:
# Baseline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Baseline: TF-IDF features fed into a multinomial Naive Bayes classifier
model_0 = Pipeline(steps=[
    ("tfidf", TfidfVectorizer()),  # convert words to numbers using tfidf
    ("clf", MultinomialNB()),      # model the text
])

# Train the whole pipeline on the training sentences
model_0.fit(train_sentences, train_labels)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

In [99]:
baseline_score = model_0.score(val_sentences, val_labels)
print(f"Our baseline model achieves an accuracy of: {baseline_score*100:.2f}%")

Our baseline model achieves an accuracy of: 79.27%


In [100]:
# Make predictions
baseline_preds = model_0.predict(val_sentences)
baseline_preds[:20]

array([1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1],
      dtype=int64)

### Creating an evaluation function for our model experiments

Let's create a helper function which takes an array of predictions and ground truth labels and computes the following:\
Accuracy\
Precision\
Recall\
F1-Score\

In [101]:
# Function to evaluate: accuracy, precision, recall, f1_score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
    """
    Compute accuracy, precision, recall and F1 score for a binary classification model.

    Accuracy is scaled by 100 (a percentage); precision, recall and F1 are
    computed with the "weighted" average and are returned unscaled.

    Args:
        y_true: ground truth labels.
        y_pred: predicted labels.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    accuracy_pct = accuracy_score(y_true, y_pred) * 100
    # The fourth value returned by sklearn (support) is not needed here
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
    return {'accuracy': accuracy_pct,
            "precision": precision,
            "recall": recall,
            "f1": f1}

In [102]:
# Get baseline results
baseline_results = calculate_results(y_true=val_labels,
                                     y_pred=baseline_preds)
baseline_results

{'accuracy': 79.26509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1': 0.7862189758049549}

## Model 1: A simple dense model Neural Net

In [103]:
# Directory name for TensorBoard logs (only the name is assigned here; nothing is created)
# NOTE(review): no visible cell references SAVE_DIR — confirm it is still needed
SAVE_DIR = "model_logs"

# Build model with the Functional API
from tensorflow.keras import layers
inputs = layers.Input(shape=(1,), dtype="string") # Inputs are 1-dimensional strings
x = text_vectorizer(inputs) # turn the input text into numbers
x = embedding(x) # create an embedding of the token indices
x = layers.GlobalAveragePooling1D()(x) # Lower the dimensionality of the embedding: (None, 15, 128) -> (None, 128)
outputs = layers.Dense(1, activation='sigmoid')(x) # create the output layer, want binary outputs
model_1 = tf.keras.Model(inputs, outputs, name="model_1_dense") # construct the model


In [104]:
# Compile model
model_1.compile(loss='binary_crossentropy',
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

In [105]:
# Get summary of the model
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         [(None, 1)]               0         
_________________________________________________________________
text_vectorization_3 (TextVe (None, 15)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 15, 128)           1280000   
_________________________________________________________________
global_average_pooling1d_1 ( (None, 128)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 129       
Total params: 1,280,129
Trainable params: 1,280,129
Non-trainable params: 0
_________________________________________________________________


In [106]:
# Fit the model
model_1_history = model_1.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [107]:
# check the results
model_1.evaluate(val_sentences, val_labels)




[0.4766845405101776, 0.787401556968689]

In [108]:
# Make predictions (these come back in the form of probabilities)
model_1_pred_probs = model_1.predict(val_sentences)
model_1_pred_probs[:10] # only print out the first 10 prediction probabilities

array([[0.40490273],
       [0.74437743],
       [0.997895  ],
       [0.10890284],
       [0.11143022],
       [0.93556195],
       [0.91346425],
       [0.9925349 ],
       [0.97157186],
       [0.26568958]], dtype=float32)

In [109]:
# Turn prediction probabilities into a single-dimension tensor of floats
model_1_preds = tf.squeeze(tf.round(model_1_pred_probs)) # squeeze removes single dimensions
model_1_preds[:20]

<tf.Tensor: shape=(20,), dtype=float32, numpy=
array([0., 1., 1., 0., 0., 1., 1., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 1.], dtype=float32)>

## Recurrent Neural Networks
Simply put: Use information from the past to help you with the future. Take an input and compute an output based on all previous inputs\

<b>One to one:</b> One input, one output, such as image classification\
<b>One to many:</b> One input, many outputs, such as image captioning\
<b>Many to one:</b> Many inputs, one output, such as text classification\
<b>Many to many:</b> Many inputs, many outputs, such as machine translation\


In [110]:
# Model 2: LSTM
# Seed TensorFlow and give this model its own embedding layer
tf.random.set_seed(42)
from tensorflow.keras import layers
model_2_embedding = layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    embeddings_initializer="uniform",
    input_length=max_length,
    name='embedding_2',
)

# Functional API: raw strings -> token ids -> embeddings -> LSTM -> sigmoid output
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = model_2_embedding(x)
print(x.shape)  # shape after the embedding layer
x = layers.LSTM(64)(x)  # return vector for whole sequence
print(x.shape)  # shape after the LSTM layer
outputs = layers.Dense(1, activation="sigmoid")(x)
model_2 = tf.keras.Model(inputs, outputs, name='model_2_LSTM')


(None, 15, 128)
(None, 64)


In [111]:
# Compile model
model_2.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

In [112]:
model_2.summary()

Model: "model_2_LSTM"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         [(None, 1)]               0         
_________________________________________________________________
text_vectorization_3 (TextVe (None, 15)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 15, 128)           1280000   
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 65        
Total params: 1,329,473
Trainable params: 1,329,473
Non-trainable params: 0
_________________________________________________________________


In [113]:
# Fit model
model_2_history = model_2.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [114]:
# Make predictions on the validation dataset
model_2_pred_probs = model_2.predict(val_sentences)
model_2_pred_probs.shape, model_2_pred_probs[:10] # view the first 10

((762, 1),
 array([[0.00711285],
        [0.78727347],
        [0.9996369 ],
        [0.0566547 ],
        [0.00258011],
        [0.99962366],
        [0.92162055],
        [0.9997991 ],
        [0.99949515],
        [0.6656076 ]], dtype=float32))

In [115]:
# Round out predictions and reduce to 1-dimensional array
model_2_preds = tf.squeeze(tf.round(model_2_pred_probs))
model_2_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 1.], dtype=float32)>

In [116]:
# Calculate LSTM model results
model_2_results = calculate_results(y_true=val_labels,
                                    y_pred=model_2_preds)
model_2_results                           

{'accuracy': 75.06561679790026,
 'precision': 0.7510077975908164,
 'recall': 0.7506561679790026,
 'f1': 0.7489268622514025}

### Model 3: GRU
A GRU cell has similar features to an LSTM cell but has fewer parameters.

In [117]:
# Seed TensorFlow and give this model its own embedding layer
tf.random.set_seed(42)
from tensorflow.keras import layers
model_3_embedding = layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    embeddings_initializer="uniform",
    input_length=max_length,
    name="embedding_3",
)

# Functional API: raw strings -> token ids -> embeddings -> GRU -> sigmoid output
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = model_3_embedding(x)
x = layers.GRU(64)(x)

outputs = layers.Dense(1, activation="sigmoid")(x)
model_3 = tf.keras.Model(inputs, outputs, name="model_3_GRU")


In [118]:
# Compile GRU model
model_3.compile(loss='binary_crossentropy',
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

In [119]:
# Get a summary of the GRU model
model_3.summary()

Model: "model_3_GRU"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         [(None, 1)]               0         
_________________________________________________________________
text_vectorization_3 (TextVe (None, 15)                0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 15, 128)           1280000   
_________________________________________________________________
gru_1 (GRU)                  (None, 64)                37248     
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 65        
Total params: 1,317,313
Trainable params: 1,317,313
Non-trainable params: 0
_________________________________________________________________


Notice that the LSTM model has more trainable parameters than the GRU model.

In [120]:
# Fit model
model_3_history = model_3.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [121]:
# Make predictions on the validation data
model_3_pred_probs = model_3.predict(val_sentences)
model_3_pred_probs.shape, model_3_pred_probs[:10]

((762, 1),
 array([[0.33335665],
        [0.8773997 ],
        [0.9980247 ],
        [0.11561046],
        [0.01235882],
        [0.9925684 ],
        [0.62149954],
        [0.99813336],
        [0.9982376 ],
        [0.5020565 ]], dtype=float32))

In [122]:
# Convert prediction probabilities to prediction classes
model_3_preds = tf.squeeze(tf.round(model_3_pred_probs))
model_3_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 1.], dtype=float32)>

In [123]:
# Calculate model_3 results
model_3_results = calculate_results(y_true=val_labels,
                                    y_pred=model_3_preds)
model_3_results

{'accuracy': 76.77165354330708,
 'precision': 0.7675450859410361,
 'recall': 0.7677165354330708,
 'f1': 0.7667932666650168}

## Model 4: Bidirectional RNN model
A standard RNN processes a sequence from left to right, whereas a bidirectional RNN processes the sequence from left to right and then again from right to left.


In [124]:
# Seed TensorFlow and give this model its own embedding layer
tf.random.set_seed(42)
from tensorflow.keras import layers
model_4_embedding = layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    embeddings_initializer="uniform",
    input_length=max_length,
    name="embedding_4",
)

# Build a Bidirectional RNN in TensorFlow:
# raw strings -> token ids -> embeddings -> bidirectional LSTM -> sigmoid output
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = model_4_embedding(x)

# The wrapped LSTM runs in both directions, which doubles the recurrent parameters to train
x = layers.Bidirectional(layers.LSTM(64))(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model_4 = tf.keras.Model(inputs, outputs, name="model_4_Bidrectional")


In [125]:
model_4.compile(loss='binary_crossentropy',
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

In [126]:
# Get a summary
model_4.summary()

Model: "model_4_Bidrectional"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         [(None, 1)]               0         
_________________________________________________________________
text_vectorization_3 (TextVe (None, 15)                0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 15, 128)           1280000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 129       
Total params: 1,378,945
Trainable params: 1,378,945
Non-trainable params: 0
_________________________________________________________________


In [127]:
# Fit the model (takes longer because of the bidirectional layers)
model_4_history = model_4.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [128]:
# Make predictions with bidirectional RNN on the validation data
model_4_pred_probs = model_4.predict(val_sentences)
model_4_pred_probs[:10]

array([[0.03988695],
       [0.8280696 ],
       [0.99842036],
       [0.13511778],
       [0.00310867],
       [0.9921976 ],
       [0.9554615 ],
       [0.9994561 ],
       [0.99898237],
       [0.28109872]], dtype=float32)

In [129]:
# Convert prediction probabilities to labels
model_4_preds = tf.squeeze(tf.round(model_4_pred_probs))
model_4_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 0.], dtype=float32)>

In [130]:
# Calculate bidirectional RNN model results
model_4_results = calculate_results(val_labels, model_4_preds)
model_4_results

{'accuracy': 76.64041994750657,
 'precision': 0.7665895370389821,
 'recall': 0.7664041994750657,
 'f1': 0.7651213533864446}

## Pretrained Embeddings (Transfer learning for NLP)

In [131]:
sample_sentence = ["There's a flood in my street!"]

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[264,   3, 232,   4,  13, 698,   0,   0,   0,   0,   0,   0,   0,
          0,   0]], dtype=int64)>

In [133]:
import tensorflow_hub as hub
# NOTE(review): `embed` is not referenced by any later visible cell (the model below
# loads the same URL through hub.KerasLayer) — confirm this load is still needed
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4") # load Universal Sentence Encoder

In [134]:
# We can use this encoding layer in place of our text_vectorizer and embedding layer
sentence_encoder_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                                        input_shape=[], # shape of inputs coming to our model 
                                        dtype=tf.string, # data type of inputs coming to the USE layer
                                        trainable=False, # keep the pretrained weights (we'll create a feature extractor)
                                        name="USE") 

In [135]:
# Create model using the Sequential API
model_6 = tf.keras.Sequential([
  sentence_encoder_layer, # take in sentences and then encode them into an embedding
  layers.Dense(64, activation="relu"),
  layers.Dense(1, activation="sigmoid")
], name="model_6_USE")

# Compile model
model_6.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

model_6.summary()

Model: "model_6_USE"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
USE (KerasLayer)             (None, 512)               256797824 
_________________________________________________________________
dense_10 (Dense)             (None, 64)                32832     
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 65        
Total params: 256,830,721
Trainable params: 32,897
Non-trainable params: 256,797,824
_________________________________________________________________


In [136]:
# Train a classifier on top of pretrained embeddings
model_6_history = model_6.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [137]:
# Make predictions with USE TF Hub model
model_6_pred_probs = model_6.predict(val_sentences)
model_6_pred_probs[:10]

array([[0.16247827],
       [0.75555104],
       [0.9883377 ],
       [0.20471899],
       [0.73026645],
       [0.6568301 ],
       [0.98056567],
       [0.9767353 ],
       [0.9273411 ],
       [0.08647768]], dtype=float32)

In [138]:
# Convert prediction probabilities to labels
model_6_preds = tf.squeeze(tf.round(model_6_pred_probs))
model_6_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 1., 1., 1., 1., 1., 0.], dtype=float32)>

In [139]:
# Calculate model 6 performance metrics
model_6_results = calculate_results(val_labels, model_6_preds)
model_6_results

{'accuracy': 81.75853018372703,
 'precision': 0.8202616926815424,
 'recall': 0.8175853018372703,
 'f1': 0.8159820561172786}

## Making predictions on the test dataset



In [140]:
# Predict on 10 randomly chosen test tweets and print each prediction with its text
test_sentences = test_df['text'].to_list()
test_samples = random.sample(test_sentences, 10)
for test_sample in test_samples:
    pred_prob = tf.squeeze(model_6.predict([test_sample])) # has to be a list; squeeze drops the size-1 dimensions
    pred = tf.round(pred_prob) # rounded probability is used as the predicted label
    print(f"Pred: {int(pred)}, Prob: {pred_prob}")
    print(f"Text:\n{test_sample}\n")
    print("----\n")


Pred: 0, Prob: 0.17522850632667542
Text:
@JackiSheaffer I have the same battle!

----

Pred: 0, Prob: 0.13471625745296478
Text:
@misschaela_ not yet.  Everywhere else except us and like a few other shops like Panda are evacuated but they haven't come for us yet.

----

Pred: 0, Prob: 0.3568779528141022
Text:
Displaced

----

Pred: 1, Prob: 0.7234946489334106
Text:
Ahead of Print: A New Paradigm of Injuries From Terrorist Explosions as a Function of Explosion Setting Type.:... http://t.co/tqQc3yxBoR

----

Pred: 1, Prob: 0.8074379563331604
Text:
Christian Attacked by Muslims at the Temple Mount after Waving Israeli Flag via Pamela Geller - ... http://t.co/zb8vVBiSY4

----

Pred: 0, Prob: 0.09625240415334702
Text:
This headache will be the death of me ??

----

Pred: 1, Prob: 0.602957010269165
Text:
@JacobHoggard @therealmattleaf  it's so sad out there in BC all the wild fires. Hope u are safe.

----

Pred: 0, Prob: 0.10517110675573349
Text:
@datshemmings_ moi c'est plutot disconnected t

# Predicting on Tweets from the wild


In [141]:
# Turn Tweet into string
daniels_tweet = "Life like an ensemble: take the best choices from others and make your own"

In [142]:
def predict_on_sentence(model, sentence):
    """
    Use `model` to make a prediction on a single sentence and print the outcome.

    Prints the rounded prediction label, a human-readable class tag, the raw
    prediction probability and the sentence itself. Nothing is returned.

    Args:
        model: object whose `predict` accepts a one-element list holding the sentence.
        sentence: the text to classify.
    """
    pred_prob = model.predict([sentence])
    pred_label = tf.squeeze(tf.round(pred_prob)).numpy()
    class_tag = "(real_disaster)" if pred_label > 0 else "(not real disaster)"
    print(f"Pred: {pred_label}", class_tag, f"Prob: {pred_prob[0][0]}")
    print(f"Text:\n{sentence}")

# Make a prediction on a Tweet from the wild, using the USE model
predict_on_sentence(model=model_6, sentence=daniels_tweet)

Pred: 0.0 (not real disaster) Prob: 0.04935939237475395
Text:
Life like an ensemble: take the best choices from others and make your own


In [143]:
# Source - https://twitter.com/BeirutCityGuide/status/1290696551376007168
beirut_tweet_1 = "Reports that the smoke in Beirut sky contains nitric acid, which is toxic. Please share and refrain from stepping outside unless urgent. #Lebanon"

# Source - https://twitter.com/BeirutCityGuide/status/1290773498743476224
beirut_tweet_2 = "#Beirut declared a “devastated city”, two-week state of emergency officially declared. #Lebanon"

In [144]:
# Predict on disaster Tweet 1
predict_on_sentence(model=model_6, 
                    sentence=beirut_tweet_1)

Pred: 1.0 (real_disaster) Prob: 0.9659295082092285
Text:
Reports that the smoke in Beirut sky contains nitric acid, which is toxic. Please share and refrain from stepping outside unless urgent. #Lebanon


In [145]:
# Predict on disaster Tweet 2
predict_on_sentence(model=model_6, 
                    sentence=beirut_tweet_2)

Pred: 1.0 (real_disaster) Prob: 0.9756180644035339
Text:
#Beirut declared a “devastated city”, two-week state of emergency officially declared. #Lebanon
