# Introduction to NLP Fundamentals in TensorFlow

## Check for GPU

In [1]:
!nvidia-smi -L


/bin/bash: line 1: nvidia-smi: command not found


## Get helper functions

In [2]:
!wget https://raw.githubusercontent.com/yacin-hamdi/deep_learning/master/tensorflow/helper_functions.py

--2024-02-17 17:43:03--  https://raw.githubusercontent.com/yacin-hamdi/deep_learning/master/tensorflow/helper_functions.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10246 (10K) [text/plain]
Saving to: ‘helper_functions.py’


2024-02-17 17:43:03 (34.7 MB/s) - ‘helper_functions.py’ saved [10246/10246]



In [3]:
from helper_functions import *

## Get a text dataset

In [4]:
!wget https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip

--2024-02-17 17:43:08--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.111.207, 142.251.16.207, 172.253.62.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.111.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: ‘nlp_getting_started.zip’


2024-02-17 17:43:08 (103 MB/s) - ‘nlp_getting_started.zip’ saved [607343/607343]



In [5]:
unzip_data("nlp_getting_started.zip")

## Visualizing a text dataset

In [6]:
import pandas as pd
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [7]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [8]:
# shuffle training dataframe
train_df_shuffled = train_df.sample(frac=1, random_state=42)
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [9]:
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [10]:
train_df.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [11]:
# total number of samples
len(train_df), len(test_df)

(7613, 3263)

In [12]:
# visualize some random training examples
import random
random_index = random.randint(0, len(train_df)-5)
for row in train_df_shuffled[["text", "target"]][random_index:random_index+5].itertuples():
  _, text, target = row
  print(f'target:{target}', "(real disaster)" if target > 0 else "(not real disater)")
  print(f'test:\n{text}')
  print("----\n")

target:0 (not real disater)
test:
If you have an opinion and you don't put it on thh internet you will furst into flames.
----

target:0 (not real disater)
test:
Hollywood Movie About Trapped Miners Released in Chile http://t.co/Fk1vyh5QLk #newsdict #news  #Chile
----

target:1 (real disaster)
test:
Inciweb OR Update:  Rogue River-Siskiyou National Forest Fires  8/5/15 12:00 PM (Rogue River-Siskiyou NF AreaÛ_ http://t.co/LkwxU8QV7n
----

target:0 (not real disater)
test:
16 Stylishly Unique Houses That Might Help You Survive the Zombie Apocalypse | http://t.co/AU3DBCI7nf http://t.co/BOvJRF62T7
----

target:1 (real disaster)
test:
Smoke detectors not required in all buildings: An office building on Shevlin-Hixon Drive was on fire. There we... http://t.co/z6Ee1jVhNi
----



### Split data into training and validation sets

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_df_shuffled['text'].to_numpy(),
                                                                            train_df_shuffled['target'].to_numpy(),
                                                                            test_size=0.1,
                                                                            random_state=32)

In [15]:
len(train_sentences), len(train_labels), len(val_sentences), len(val_labels)

(6851, 6851, 762, 762)

In [16]:
# Chekc the first 10 samples
train_sentences[:10], train_labels[:10]

(array(['#LOL Plymouth (\x89Û÷Let\x89Ûªs Obliterate Litter\x89Ûª) http://t.co/GDrssjbH8q',
        'AND MY FAM HAD TO EVACUATE BC WE NEED POWER',
        'MH370: Aircraft debris found on La Reunion is from missing Malaysia Airlines ... - ABC Onlin... http://t.co/N3lNdJKYo3 G #Malaysia #News',
        '\x89Û÷Good Samaritans\x89Ûª shot in horror hijacking http://t.co/V5yUUALoqw #263Chat #Twimbos ZimpapersViews',
        "#FOXDebateQuestions:  To what degree has Obama's efforts to institute Sharia Law exacerbated the California wild fires?",
        "Looks like a war zone outside. What's going on?",
        '#hot  Funtenna: hijacking computers to send data as sound waves [Black Hat 2015] http://t.co/8JcYXhq1AZ #prebreak #best',
        "Doing Giveaway Music Kit Dren Death's Head Demolition: http://t.co/fHKhCqPl7j",
        'BBC News - India rail crash: Trains derail in Madhya Pradesh flash flood http://t.co/fU1Btuq1Et',
        "'Gunman who opened fire at Tennessee movie theater killed by

## Converting text into number

 ### Text Vectorization

In [17]:
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [18]:
text_vectorizer = TextVectorization(max_tokens=1000,
                                    standardize="lower_and_strip_punctuation",
                                    split="whitespace",
                                    ngrams=None,
                                    output_mode='int',
                                    output_sequence_length=None,
                                    pad_to_max_tokens=True)

In [19]:
round(sum([len(i.split()) for i in train_sentences])/len(train_sentences))

15

In [20]:
max_vocab_length = 10000
max_length = 15

text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode='int',
                                    output_sequence_length=max_length)

In [21]:
# Fit the text vectorizer to the training text
text_vectorizer.adapt(train_sentences)

In [22]:
# Create a sample sentence and tokenize it
sample_sentence = "there's a flood in my street!"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[266,   3, 208,   4,  13, 698,   0,   0,   0,   0,   0,   0,   0,
          0,   0]])>

In [23]:
# Choose a random sentence from the training dataset and tokenize it
random_sentence = random.choice(train_sentences)
print(f'original text:\n{random_sentence}\
        \n\nVectorized version:')
text_vectorizer([random_sentence])

original text:
I want some tsunami take out        

Vectorized version:


<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[  8, 133,  83, 483, 183,  36,   0,   0,   0,   0,   0,   0,   0,
          0,   0]])>

In [24]:
# Get the unique words in the vocabulary
words_in_vocab = text_vectorizer.get_vocabulary()

In [25]:
top_5_words = words_in_vocab[:5]
bottom_5_words = words_in_vocab[-5:]
top_5_words, bottom_5_words

(['', '[UNK]', 'the', 'a', 'in'],
 ['pantofel', 'panties', 'panther', 'pantalonesfuego', 'panoramic'])

### Creating an Embedding using an Embedding Layer

In [26]:
import tensorflow as tf

embedding = tf.keras.layers.Embedding(input_dim=max_vocab_length,
                             output_dim=128,
                             input_length=max_length)

embedding

<keras.src.layers.core.embedding.Embedding at 0x7be42106c670>

In [27]:
random_sentence = random.choice(train_sentences)
print(f'original text:\n {random_sentence}\
      \n\nEmbedding version:')
sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

original text:
 Sometimes logic gets drowned out in emotion but it's gotta surface at some point.      

Embedding version:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[ 0.04756946,  0.00889618, -0.0495582 , ..., -0.02375631,
         -0.03505193,  0.03877765],
        [-0.0334907 , -0.02273035,  0.04433774, ..., -0.04007398,
          0.00385095,  0.03322254],
        [-0.00963692, -0.02354585,  0.00017092, ..., -0.03850409,
          0.03504327,  0.04997769],
        ...,
        [ 0.04199969,  0.01753208,  0.04155041, ..., -0.03166531,
          0.03035804, -0.03858173],
        [ 0.00384118, -0.03881621,  0.0260849 , ..., -0.03506324,
         -0.02190324, -0.03784435],
        [ 0.0403582 ,  0.02406477, -0.00510454, ..., -0.0435176 ,
          0.04464206,  0.03290974]]], dtype=float32)>

In [28]:
sample_embed[0][0], sample_embed[0][0].shape, random_sentence

(<tf.Tensor: shape=(128,), dtype=float32, numpy=
 array([ 0.04756946,  0.00889618, -0.0495582 , -0.0093552 , -0.01736857,
        -0.03353924, -0.03980967,  0.00411702, -0.04526687, -0.02100604,
        -0.02772763,  0.01105158,  0.01527769,  0.02813829, -0.02821037,
        -0.02503822,  0.0204209 , -0.03033196, -0.01076543, -0.04282738,
        -0.01992391,  0.04446273,  0.01388728,  0.04236187,  0.01588358,
        -0.02518317,  0.02039846,  0.005635  , -0.03404757,  0.01011669,
        -0.02777681, -0.00407139, -0.04578124, -0.01277552,  0.00064689,
        -0.01267131,  0.00594921, -0.01863968,  0.02945882,  0.04713846,
         0.02349489, -0.04132074,  0.04047814, -0.03030041,  0.01739601,
         0.00513893,  0.00215002,  0.00574304,  0.03851153, -0.02232273,
        -0.02680757,  0.00828497, -0.04130302, -0.01856421, -0.01493721,
         0.0075691 , -0.02851733,  0.01472973,  0.00453693,  0.01924611,
         0.03763447,  0.03273996, -0.0023127 ,  0.03616229, -0.04212528,
  

## Modelling a text dataset

* Model 0: Naive Bayes (baseline)
* Model 1: feed-forward neural network (dense model)
* Model 2: LSTM model(RNN)
* Model 3: GRU model(RNN)
* Model 4: bidirectional-LSTM model(RNN)
* Model 5: 1D Convolutional Neural Network (CNN)
* Model 6: Tensorflow Hub Pretrained Feature Extrator
* Model 7: Same as model 6 with 10% of training data

### Model 0: baseline

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

model_0 = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ('clf', MultinomialNB())
])

# Fit the pipline to the training data
model_0.fit(train_sentences, train_labels)

In [30]:
# Evaluate our baseline model
baseline_score = model_0.score(val_sentences, val_labels)
print(f'our baseline model achieves an accuracy of:{baseline_score*100:.2f}%')

our baseline model achieves an accuracy of:79.53%


In [31]:
train_df.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [32]:
# Make predictions
baseline_preds = model_0.predict(val_sentences)
baseline_preds[:20]

array([0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1])

### evaluation function

In [33]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
def evaluations(y_true, y_pred):
  eval = {}
  eval['accuracy'] = accuracy_score(y_true, y_pred) * 100
  eval['precision'] = precision_score(y_true, y_pred, average='weighted')
  eval['recall'] = recall_score(y_true, y_pred, average='weighted')
  eval['f1'] = f1_score(y_true, y_pred, average='weighted')

  return eval



In [34]:
baseline_results = evaluations(val_labels, baseline_preds)
baseline_results

{'accuracy': 79.52755905511812,
 'precision': 0.8123798624937205,
 'recall': 0.7952755905511811,
 'f1': 0.7894130279169649}

### Model 1: simple dense model

In [35]:
# Build model with the functional API
inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.GlobalAveragePooling1D()(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)

model_1 = tf.keras.Model(inputs, outputs, name='model_1_dense')

In [36]:
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (Text  (None, 15)                0         
 Vectorization)                                                  
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d (  (None, 128)               0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1280129 (4.88 MB)
Trainable params: 128

In [37]:
model_1.compile(loss='binary_crossentropy',
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

In [38]:
history_1 = model_1.fit(x=train_sentences,
                        y=train_labels,
                        epochs=5,
                        validation_data=(val_sentences, val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [39]:
model_1.evaluate(val_sentences, val_labels)



[0.4707689583301544, 0.7900262475013733]

In [40]:
model_1_predictions = model_1.predict(val_sentences)



In [41]:
model_1_predictions.shape

(762, 1)

In [42]:
model_1_predictions[:10]

array([[0.3047781 ],
       [0.14309128],
       [0.02609946],
       [0.9194758 ],
       [0.00351581],
       [0.75608873],
       [0.00841101],
       [0.46008536],
       [0.19537902],
       [0.35677952]], dtype=float32)

In [43]:
model_1_preds = tf.squeeze(tf.round(model_1_predictions))

In [44]:
model_1_results = evaluations(val_labels, model_1_preds)

In [45]:
model_1_results

{'accuracy': 79.00262467191601,
 'precision': 0.792836966480124,
 'recall': 0.7900262467191601,
 'f1': 0.7879178091069413}

## Visualizing learned embeddings

In [46]:
# Get the vocabulary from the text vectorization layer
words_in_vocab = text_vectorizer.get_vocabulary()
len(words_in_vocab), words_in_vocab[:10]

(10000, ['', '[UNK]', 'the', 'a', 'in', 'to', 'of', 'and', 'i', 'is'])

In [47]:
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (Text  (None, 15)                0         
 Vectorization)                                                  
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d (  (None, 128)               0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1280129 (4.88 MB)
Trainable params: 128

In [48]:
# Get the weight matrix of embeding layer
embed_weights = model_1.get_layer("embedding").get_weights()[0]
embed_weights.shape

(10000, 128)

In [49]:
# Create embedding files
import io
out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')

for index, word in enumerate(words_in_vocab):
  if index == 0:
    continue  # skip 0, it's padding.
  vec = embed_weights[index]
  out_v.write('\t'.join([str(x) for x in vec]) + "\n")
  out_m.write(word + "\n")
out_v.close()
out_m.close()

In [50]:
# Download files from colab projector.tensorflow.org
# try:
#   from google.colab import files
#   files.download('vectors.tsv')
#   files.download('metadata.tsv')
# except Exception:
#   pass

## Recurrent Neural Networks (RNN)

## Model 2: LSTM

In [51]:
inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
# x = tf.keras.layers.LSTM(units=64, return_sequences=True)(x)
x = tf.keras.layers.LSTM(64)(x)
# x = tf.keras.layers.Dense(64, activation='relu')(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model_2 = tf.keras.Model(inputs, outputs, name="model_2_LSTM")


In [52]:
model_2.summary()

Model: "model_2_LSTM"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (Text  (None, 15)                0         
 Vectorization)                                                  
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 lstm (LSTM)                 (None, 64)                49408     
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1329473 (5.07 MB)
Trainable params: 1329473 (5.07 MB)
Non-trainable params: 0 (0.00 Byte)
________________

In [53]:
model_2.compile(loss='binary_crossentropy',
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

In [54]:
history_2 = model_2.fit(train_sentences,
                        train_labels,
                        epochs=5,
                        validation_data=(val_sentences, val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [55]:
model_2_pred_probs = model_2.predict(val_sentences)




In [56]:
model_2_preds = tf.squeeze(tf.round(model_2_pred_probs))

In [57]:
model_2_results = evaluations(val_labels, model_2_preds)

In [58]:
model_2_results

{'accuracy': 76.37795275590551,
 'precision': 0.7646345611672198,
 'recall': 0.7637795275590551,
 'f1': 0.7620550121520299}

In [59]:
inputs = tf.keras.layers.Input(shape=(1, ), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.LSTM(64, return_sequences=True)(x)
x = tf.keras.layers.LSTM(64)(x)
x = tf.keras.layers.Dense(64, activation='relu')(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model_2_1 = tf.keras.Model(inputs, outputs)
model_2_1.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (Text  (None, 15)                0         
 Vectorization)                                                  
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 lstm_1 (LSTM)               (None, 15, 64)            49408     
                                                                 
 lstm_2 (LSTM)               (None, 64)                33024     
                                                                 
 dense_2 (Dense)             (None, 64)                4160      
                                                             

In [60]:
model_2_1.compile(loss='binary_crossentropy',
                  optimizer=tf.keras.optimizers.Adam(),
                  metrics=['accuracy'])

In [61]:
history_2_1 = model_2_1.fit(train_sentences,
                            train_labels,
                            epochs=5,
                            validation_data=(val_sentences, val_labels))

Epoch 1/5
Epoch 2/5

KeyboardInterrupt: 

### Model 3: GRU

In [None]:
inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.GRU(64)(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model_3 = tf.keras.Model(inputs, outputs)
model_3.summary()

In [None]:
model_3.compile(loss='binary_crossentropy',
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

In [None]:
history_3 = model_3.fit(train_sentences,
                        train_labels,
                        epochs=5,
                        validation_data=(val_sentences, val_labels))

In [None]:
model_3_pred_probs = model_3.predict(val_sentences)


In [None]:
model_3_preds = tf.squeeze(tf.round(model_3_pred_probs))

In [None]:
model_3_results = evaluations(val_labels, model_3_preds)
model_3_results

### Model 4: bidirectional lstm

In [None]:
# build a bidirectional RNN in tensorflow
inputs = tf.keras.layers.Input(shape=(1, ), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
# x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True))(x)
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model_4 = tf.keras.Model(inputs, outputs)
model_4.summary()

In [None]:
model_4.compile(loss='binary_crossentropy',
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

In [None]:
history_4 = model_4.fit(train_sentences,
                        train_labels,
                        epochs=5,
                        validation_data=(val_sentences, val_labels))

In [None]:
model_4_pred_probs = model_4.predict(val_sentences)

In [None]:
model_4_preds = tf.squeeze(tf.round(model_4_pred_probs))

In [None]:
model_4_results = evaluations(val_labels, model_4_preds)

In [None]:
model_4_results

### Model 5: convolution 1d

In [None]:
inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.Conv1D(64, 5, activation='relu')(x)
x = tf.keras.layers.GlobalMaxPooling1D()(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model_5 = tf.keras.Model(inputs, outputs)
model_5.summary()

In [None]:
model_5.compile(loss='binary_crossentropy',
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

In [None]:
history_5 = model_5.fit(train_sentences,
                        train_labels,
                        epochs=5,
                        validation_data=(val_sentences,val_labels))

In [None]:
 model_5_pred_probs = model_5.predict(val_sentences)
 model_5_preds = tf.squeeze(tf.round(model_5_pred_probs))
 model_5_results = evaluations(val_labels, model_5_preds)

In [None]:
model_5_results

## Model 6: Tensorflow hub use* feature extractor

In [None]:
import tensorflow_hub as hub
embed = hub.load("https://www.kaggle.com/models/google/universal-sentence-encoder/frameworks/TensorFlow2/variations/universal-sentence-encoder/versions/2")
embed_samples = embed([sample_sentence,
                        "when you can the universal sentence encoder on a sentence, it turns it into numbers."])

In [None]:
print(embed_samples[0][:50])

In [None]:
embed_samples[0].shape

In [None]:
sentence_encoder_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                                        input_shape=[],
                                        dtype=tf.string,
                                        trainable=False,
                                        name='USE')


In [None]:
model_6 = tf.keras.Sequential([
    sentence_encoder_layer, # take in sentences and then encode them into an embedding
  tf.keras.layers.Dense(64, activation="relu"),
  tf.keras.layers.Dense(1, activation="sigmoid")
], name='model_6_USE')

In [None]:
model_6.compile(loss='binary_crossentropy',
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])


In [None]:
model_6.summary()

In [None]:
history_6 = model_6.fit(train_sentences,
                        train_labels,
                        epochs=10,
                        validation_data=(val_sentences, val_labels),
                        verbose=1)

In [None]:
# Make predictions
model_6_pred_probs = model_6.predict(val_sentences)
model_6_preds = tf.squeeze(tf.round(model_6_pred_probs))
model_6_results = evaluations(val_labels, model_6_preds)

In [None]:
model_6_results

In [None]:
plot_loss_curves(history_6)

## Model 7: TF Hub 10% of training data

In [None]:
#data leakage
# Create subsets of 10% of the training data
# train_10_percent = train_df_shuffled[['text', 'target']].sample(frac=0.1, random_state=42)
# len(train_10_percent)

train_10_percent_split = int(0.1 * len(train_sentences))
train_sentences_10_percent = train_sentences[:train_10_percent_split]
train_labels_10_percent = train_labels[:train_10_percent_split]
len(train_sentences_10_percent), len(train_labels_10_percent)

In [None]:
pd.Series(np.array(train_labels_10_percent)).value_counts()

In [None]:
# train_sentences_10_percent = train_10_percent['text'].to_list()
# train_labels_10_percent = train_10_percent['target'].to_list()
# len(train_sentences_10_percent), len(train_labels_10_percent)

In [None]:
# Check the nubmer of targets in our subset of data
# train_10_percent['target'].value_counts()

In [None]:
train_df_shuffled['target'].value_counts()

In [None]:
embed = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                       input_shape=[],
                       dtype=tf.string)

model_7 = tf.keras.Sequential([
    embed,
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model_7.summary()



In [None]:
model_7.compile(loss='binary_crossentropy',
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

In [None]:
history_7 = model_7.fit(train_sentences_10_percent,
                        train_labels_10_percent,
                        epochs=5,
                        validation_data=(val_sentences, val_labels))

In [None]:
model_7_pred_probs = model_7.predict(val_sentences)
model_7_preds = tf.squeeze(tf.round(model_7_pred_probs))
model_7_results = evaluations(val_labels, model_7_preds)
model_7_results

## Comparing the performance of each of our models

In [None]:
# Combine model results into a DataFrame
all_model_results = pd.DataFrame({"0_baseline":baseline_results,
                                  "1_simple_dense": model_1_results,
                                  "2_lstm": model_2_results,
                                  "3_gru": model_3_results,
                                  "4_bidirectional":model_4_results,
                                  "5_conv1d":model_5_results,
                                  "6_tf_hub_use_encoder":model_6_results,
                                  "7_tf_hub_10_percent":model_7_results})
all_model_results = all_model_results.transpose()
all_model_results

In [None]:
# reduce the accuracy to the same scale as other metrics
all_model_results['accuracy'] = all_model_results['accuracy']/100
all_model_results

In [None]:
# plot_and compare all model results
all_model_results.plot(kind='bar', figsize=(10, 7)).legend(bbox_to_anchor=(1.0, 1.0))

In [None]:
# Sort model results by f1-score
all_model_results.sort_values("f1", ascending=False)['f1'].plot(kind='bar', figsize=(10, 7))

In [None]:
# Save the best model
model_6.save("best_model.h5")

In [None]:
loaded_best_model = tf.keras.models.load_model('best_model.h5',
                                               custom_objects={'KerasLayer':hub.KerasLayer})

In [None]:
loaded_model_pred_probs = loaded_best_model.predict(val_sentences)
loaded_model_preds = tf.squeeze(tf.round(loaded_model_pred_probs))
loaded_model_results = evaluations(val_labels, loaded_model_preds)
loaded_model_results

In [None]:
model_6_results

 ## Finding the most wrong examples

 * active learning

In [62]:
# Download a pretrain model
!wget https://storage.googleapis.com/ztm_tf_course/08_model_6_USE_feature_extractor.zip

--2024-02-17 17:46:04--  https://storage.googleapis.com/ztm_tf_course/08_model_6_USE_feature_extractor.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.16.207, 172.253.62.207, 142.251.167.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.16.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 960779165 (916M) [application/zip]
Saving to: ‘08_model_6_USE_feature_extractor.zip’


2024-02-17 17:46:17 (72.9 MB/s) - ‘08_model_6_USE_feature_extractor.zip’ saved [960779165/960779165]



In [63]:
!unzip 08_model_6_USE_feature_extractor.zip

Archive:  08_model_6_USE_feature_extractor.zip
   creating: 08_model_6_USE_feature_extractor/
   creating: 08_model_6_USE_feature_extractor/assets/
   creating: 08_model_6_USE_feature_extractor/variables/
  inflating: 08_model_6_USE_feature_extractor/variables/variables.data-00000-of-00001  
  inflating: 08_model_6_USE_feature_extractor/variables/variables.index  
  inflating: 08_model_6_USE_feature_extractor/saved_model.pb  


In [64]:
model_6_pretrained = tf.keras.models.load_model("08_model_6_USE_feature_extractor/")
model_6_pretrained.evaluate(val_sentences, val_labels)





[0.36397093534469604, 0.8293963074684143]

In [65]:
model_6_pretrained_pred_probs = model_6_pretrained.predict(val_sentences)
model_6_pretrained_preds = tf.squeeze(tf.round(model_6_pretrained_pred_probs))
model_6_pretrained_preds[:10]



<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 0., 0., 1., 0., 0., 0., 0., 0., 1.], dtype=float32)>

In [66]:
val_df = pd.DataFrame({'text':val_sentences,
                       'target':val_labels,
                       'pred':model_6_pretrained_preds,
                       'pred_prob':tf.squeeze(model_6_pretrained_pred_probs)})
val_df.head()

Unnamed: 0,text,target,pred,pred_prob
0,Is he about to crash?\nDid the Queen die?\nDid...,1,0.0,0.137729
1,Praise God that we have ministry that tells it...,0,0.0,0.121592
2,Last Chance Animal Rescue has 3 new posts. htt...,0,0.0,0.038939
3,'Among other main factors behind pedestrian fa...,1,1.0,0.849845
4,@iamHorsefly hide your kids hide your wife. He...,0,0.0,0.036543


In [67]:
# Find the wrong predictions
most_wrong = val_df[val_df['target'] != val_df['pred']].sort_values('pred_prob', ascending=False)
most_wrong[:10] # false positive

Unnamed: 0,text,target,pred,pred_prob
325,DISASTER AVERTED: Police kill gunman with Û÷h...,0,1.0,0.984139
582,Emergency Dispatchers in Boone County in the h...,0,1.0,0.876249
83,Tales of the #trees #deep water loving #Lake T...,0,1.0,0.869775
660,New doco tonight at 9pm Setanta Sports Ireland...,0,1.0,0.806903
57,@madonnamking RSPCA site multiple 7 story high...,0,1.0,0.803122
51,'Dangerous' property in downtown Phoenix demol...,0,1.0,0.792731
77,China detains seven Christians trying to prote...,0,1.0,0.791277
609,Bloor/Ossington arsonist also burned a mattres...,0,1.0,0.781458
655,#stlouis #caraccidentlawyer Speeding Among Top...,0,1.0,0.780523
632,Ah yes the gays are totally destroying America...,0,1.0,0.778259


In [68]:
most_wrong.tail() # false negative

Unnamed: 0,text,target,pred,pred_prob
431,Rand Paul's Debate Strategy 'demolish Some oth...,1,0.0,0.098877
134,Jack Wilshere has poor injury recordand his of...,1,0.0,0.097724
522,?? New Ladies Shoulder Tote #Handbag Faux Leat...,1,0.0,0.091532
575,The Dress Memes Have Officially Exploded On Th...,1,0.0,0.078386
339,Feel like I've got no control of anything that...,1,0.0,0.067387


In [69]:
for row in most_wrong[:10].itertuples():
  _, text, target, pred, pred_prob = row
  print(f'target:{target}, pred:{pred}, prob:{pred_prob}')
  print(f'Text:\n{text}\n')

target:0, pred:1.0, prob:0.9841393828392029
Text:
DISASTER AVERTED: Police kill gunman with Û÷hoax deviceÛª atåÊcinema http://t.co/5NG0FzpVdS

target:0, pred:1.0, prob:0.876249372959137
Text:
Emergency Dispatchers in Boone County in the hot seat http://t.co/5fHkxtrhYU

target:0, pred:1.0, prob:0.8697754144668579
Text:
Tales of the #trees #deep water loving #Lake Tahoe. And no #forest fires https://t.co/xuhMJ098Lq

target:0, pred:1.0, prob:0.8069027662277222
Text:
New doco tonight at 9pm Setanta Sports Ireland freeview. The largest police presence at a soccer game in Ireland stop prevent the rioting

target:0, pred:1.0, prob:0.80312180519104
Text:
@madonnamking RSPCA site multiple 7 story high rise buildings next to low density character residential in an area that floods

target:0, pred:1.0, prob:0.7927311658859253
Text:
'Dangerous' property in downtown Phoenix demolished  http://t.co/hiBDw7d7ja

target:0, pred:1.0, prob:0.7912769913673401
Text:
China detains seven Christians trying 

In [70]:
for row in most_wrong[-10:].itertuples():
  _, text, target, pred, pred_prob = row
  print(f'target:{target}, pred:{pred}, prob:{pred_prob}')
  print(f'Text:\n{text}\n')

target:1, pred:0.0, prob:0.11474896967411041
Text:
#Tweet4Taiji is a dolphin worship group based on superstitions! Just take a look at their tweets!

target:1, pred:0.0, prob:0.11070328950881958
Text:
@OllyMursAus I do feel sorry for him! He is not a piece of meat! He is a nice guy... People don't need to rush him and screams in his face!

target:1, pred:0.0, prob:0.11026783287525177
Text:
I liked a @YouTube video from @itsjustinstuart http://t.co/oDV3RqS8JU GUN RANGE MAYHEM!

target:1, pred:0.0, prob:0.10790392756462097
Text:
Watch how bad that fool get burned in coverage this year. Dat dude is all-pro practice squad material

target:1, pred:0.0, prob:0.1032678559422493
Text:
I came up with an idea of a fragrance concept for a bath bomb called The Blood of my Enemies. So you can say that's what you bathe in.

target:1, pred:0.0, prob:0.09887669235467911
Text:
Rand Paul's Debate Strategy 'demolish Some other bad ideas out there or point out maybe that there are some em... http://t.co/q

Making predictions on the test dataset

In [71]:
test_sentences = test_df['text'].to_list()
model_6_pretrained_pred_probs_test = model_6_pretrained.predict(test_sentences)
model_6_pretrained_preds_test = tf.squeeze(tf.round(model_6_pretrained_pred_probs_test))
model_6_pretrained_preds_test[:10]



<tf.Tensor: shape=(10,), dtype=float32, numpy=array([1., 1., 1., 1., 1., 1., 0., 0., 0., 0.], dtype=float32)>

In [72]:
test_preds = pd.DataFrame({'text':test_sentences,
                           'pred':model_6_pretrained_preds_test,
                           'pred_probs':tf.squeeze(model_6_pretrained_pred_probs_test)})
test_preds.head()

Unnamed: 0,text,pred,pred_probs
0,Just happened a terrible car crash,1.0,0.51997
1,"Heard about #earthquake is different cities, s...",1.0,0.875251
2,"there is a forest fire at spot pond, geese are...",1.0,0.827833
3,Apocalypse lighting. #Spokane #wildfires,1.0,0.933507
4,Typhoon Soudelor kills 28 in China and Taiwan,1.0,0.967958


In [73]:
for row in test_preds[-10:].itertuples():
  _, text, pred, pred_prob = row
  print(f'pred:{pred}, prob:{pred_prob}')
  print(f'Text:\n{text}\n')

pred:1.0, prob:0.9871798753738403
Text:
Malaysian PM confirms debris is from missing flight MH370 http://t.co/pfAvW5QyqE

pred:1.0, prob:0.901348352432251
Text:
Officials: Alabama home quarantined over possible Ebola case - Washington Times

pred:1.0, prob:0.9779296517372131
Text:
See the 16yr old PKK suicide bomber who detonated bomb in Turkey Army trench released: Harun Ìàekdar ... http://t.co/hKuT5mSdtP @MsOreo_

pred:0.0, prob:0.43356144428253174
Text:
To conference attendees! The blue line from the airport has DERAILED - please look into taking a taxi to the hotel! See you soon!

pred:1.0, prob:0.9909708499908447
Text:
The death toll in a #IS-suicide car bombing on a #YPG position in the Village of Rajman in the eastern province of Hasaka has risen to 9

pred:1.0, prob:0.8005606532096863
Text:
EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTENERS XrWn

pred:1.0, prob:0.9678621888160706
Text:
Storm in RI worse than last hurricane. My city&amp;3others hardest hit. My yard looks like it

## speed/score tradeoff

In [76]:
# function to measure the time of prediction
import time
def pred_timer(model, samples):
  start_time = time.perf_counter()
  model_predict = model.predict(samples)
  end_time = time.perf_counter()
  total_time = end_time - start_time
  time_per_pred = total_time/len(samples)
  return total_time, time_per_pred

In [79]:
model_6_total_pred_time, model_6_time_per_pred = pred_timer(model_6_pretrained,
                                                            samples=val_sentences)
model_6_total_pred_time, model_6_time_per_pred



(0.731785957999989, 0.0009603490262467048)

In [80]:
# baseline model times per pred
baseline_total_pred_time, baseline_time_per_pred = pred_timer(model_0,
                                                              val_sentences)
baseline_total_pred_time, baseline_time_per_pred

(0.019388770000034583, 2.5444580052538823e-05)