# Introduction to NLP Fundamentals in TensorFlow

## Check for GPU

In [1]:
!nvidia-smi -L


GPU 0: Tesla T4 (UUID: GPU-5101b9fe-eaf4-a736-2fe0-86072bf34cf3)


## Get helper functions

In [2]:
!wget https://raw.githubusercontent.com/yacin-hamdi/deep_learning/master/tensorflow/helper_functions.py

--2024-02-15 16:33:36--  https://raw.githubusercontent.com/yacin-hamdi/deep_learning/master/tensorflow/helper_functions.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10246 (10K) [text/plain]
Saving to: ‘helper_functions.py’


2024-02-15 16:33:37 (104 MB/s) - ‘helper_functions.py’ saved [10246/10246]



In [3]:
from helper_functions import *

## Get a text dataset

In [4]:
!wget https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip

--2024-02-15 16:33:42--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.2.207, 74.125.137.207, 2607:f8b0:4023:c0d::cf
Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.2.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: ‘nlp_getting_started.zip’


2024-02-15 16:33:42 (93.2 MB/s) - ‘nlp_getting_started.zip’ saved [607343/607343]



In [5]:
unzip_data("nlp_getting_started.zip")

## Visualizing a text dataset

In [6]:
import pandas as pd
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [7]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [8]:
# shuffle training dataframe
train_df_shuffled = train_df.sample(frac=1, random_state=42)
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [9]:
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [10]:
train_df.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [11]:
# total number of samples
len(train_df), len(test_df)

(7613, 3263)

In [12]:
# visualize some random training examples
import random
random_index = random.randint(0, len(train_df)-5)
for row in train_df_shuffled[["text", "target"]][random_index:random_index+5].itertuples():
  _, text, target = row
  print(f'target:{target}', "(real disaster)" if target > 0 else "(not real disater)")
  print(f'test:\n{text}')
  print("----\n")

target:1 (real disaster)
test:
My hand is burning
----

target:1 (real disaster)
test:
A Time-Lapse Map of Every Nuclear Explosion Since 1945 - by Isao Hashimoto #War #atomicbomb 
https://t.co/V0t8H4Iecc
----

target:0 (not real disater)
test:
I'm feeling so attacked https://t.co/CvkQiGr1AZ
----

target:0 (not real disater)
test:
#fun #instagramers http://t.co/M3NJvvtYgN

Jeb Bush said earlier this week that not only does he want to obliterate Planned Parenthood buÛ_
----

target:0 (not real disater)
test:
ÛÏ@LeoBlakeCarter: This dog thinks he's an ambulance ?????? http://t.co/MG1lpGr0RMÛ@natasha_rideout
----



### Split data into training and validation sets

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_df_shuffled['text'].to_numpy(),
                                                                            train_df_shuffled['target'].to_numpy(),
                                                                            test_size=0.1,
                                                                            random_state=32)

In [15]:
len(train_sentences), len(train_labels), len(val_sentences), len(val_labels)

(6851, 6851, 762, 762)

In [16]:
# Chekc the first 10 samples
train_sentences[:10], train_labels[:10]

(array(['#LOL Plymouth (\x89Û÷Let\x89Ûªs Obliterate Litter\x89Ûª) http://t.co/GDrssjbH8q',
        'AND MY FAM HAD TO EVACUATE BC WE NEED POWER',
        'MH370: Aircraft debris found on La Reunion is from missing Malaysia Airlines ... - ABC Onlin... http://t.co/N3lNdJKYo3 G #Malaysia #News',
        '\x89Û÷Good Samaritans\x89Ûª shot in horror hijacking http://t.co/V5yUUALoqw #263Chat #Twimbos ZimpapersViews',
        "#FOXDebateQuestions:  To what degree has Obama's efforts to institute Sharia Law exacerbated the California wild fires?",
        "Looks like a war zone outside. What's going on?",
        '#hot  Funtenna: hijacking computers to send data as sound waves [Black Hat 2015] http://t.co/8JcYXhq1AZ #prebreak #best',
        "Doing Giveaway Music Kit Dren Death's Head Demolition: http://t.co/fHKhCqPl7j",
        'BBC News - India rail crash: Trains derail in Madhya Pradesh flash flood http://t.co/fU1Btuq1Et',
        "'Gunman who opened fire at Tennessee movie theater killed by

## Converting text into number

 ### Text Vectorization

In [17]:
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [18]:
text_vectorizer = TextVectorization(max_tokens=1000,
                                    standardize="lower_and_strip_punctuation",
                                    split="whitespace",
                                    ngrams=None,
                                    output_mode='int',
                                    output_sequence_length=None,
                                    pad_to_max_tokens=True)

In [19]:
round(sum([len(i.split()) for i in train_sentences])/len(train_sentences))

15

In [20]:
max_vocab_length = 10000
max_length = 15

text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode='int',
                                    output_sequence_length=max_length)

In [21]:
# Fit the text vectorizer to the training text
text_vectorizer.adapt(train_sentences)

In [22]:
# Create a sample sentence and tokenize it
sample_sentence = "there's a flood in my street!"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[266,   3, 208,   4,  13, 698,   0,   0,   0,   0,   0,   0,   0,
          0,   0]])>

In [23]:
# Choose a random sentence from the training dataset and tokenize it
random_sentence = random.choice(train_sentences)
print(f'original text:\n{random_sentence}\
        \n\nVectorized version:')
text_vectorizer([random_sentence])

original text:
'Planted 390 years ago' it was moved to U-S. This Bonsai Survived Hiroshima But Its Story Was Nearly Lost http://t.co/jID4RO34gb via @NatGeo        

Vectorized version:


<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[3517,    1,  141,  653,   15,   24, 1811,    5,   66,   19,    1,
         416,  129,   30,   38]])>

In [24]:
# Get the unique words in the vocabulary
words_in_vocab = text_vectorizer.get_vocabulary()

In [25]:
top_5_words = words_in_vocab[:5]
bottom_5_words = words_in_vocab[-5:]
top_5_words, bottom_5_words

(['', '[UNK]', 'the', 'a', 'in'],
 ['pantofel', 'panties', 'panther', 'pantalonesfuego', 'panoramic'])

### Creating an Embedding using an Embedding Layer

In [26]:
import tensorflow as tf

embedding = tf.keras.layers.Embedding(input_dim=max_vocab_length,
                             output_dim=128,
                             input_length=max_length)

embedding

<keras.src.layers.core.embedding.Embedding at 0x7ef43aca2d40>

In [27]:
random_sentence = random.choice(train_sentences)
print(f'original text:\n {random_sentence}\
      \n\nEmbedding version:')
sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

original text:
 we was all just laughing and talking junk now everyone in panic mode      

Embedding version:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[-0.01198484, -0.03231835,  0.03148196, ..., -0.01350764,
         -0.00019313,  0.01541648],
        [-0.02872924,  0.0217544 ,  0.03403253, ..., -0.00895811,
         -0.03609512,  0.04775922],
        [ 0.03162989,  0.04041822,  0.02409604, ..., -0.00631745,
          0.02522177,  0.02891171],
        ...,
        [ 0.03422249, -0.0157621 ,  0.03184894, ...,  0.03394742,
         -0.00019938, -0.00743318],
        [-0.00109933, -0.03742896,  0.01139531, ..., -0.04211677,
          0.04116995,  0.01139696],
        [-0.00109933, -0.03742896,  0.01139531, ..., -0.04211677,
          0.04116995,  0.01139696]]], dtype=float32)>

In [28]:
sample_embed[0][0], sample_embed[0][0].shape, random_sentence

(<tf.Tensor: shape=(128,), dtype=float32, numpy=
 array([-0.01198484, -0.03231835,  0.03148196,  0.02984964, -0.00354288,
         0.0156762 , -0.04036856,  0.02349034,  0.03818576,  0.00106641,
        -0.01211343, -0.04959725,  0.02225497, -0.02941315, -0.01433076,
        -0.04022082,  0.01349074, -0.04018861,  0.00484593, -0.04873386,
         0.04400057,  0.01270458, -0.01414464, -0.00284057, -0.02329973,
        -0.01010455,  0.02557026, -0.00689165,  0.04493431, -0.0284004 ,
         0.03679771,  0.04341346, -0.0322461 , -0.02868393,  0.00753237,
         0.01632119, -0.00221781, -0.00784175,  0.01868803,  0.02374746,
        -0.03109303,  0.00397886, -0.00698862, -0.00771539, -0.00990558,
        -0.00270619,  0.04654957, -0.03669442,  0.02021421, -0.04423087,
         0.04130444,  0.00993227, -0.03711117, -0.02495762,  0.00775573,
         0.03217978,  0.04538499, -0.04472976,  0.00804416, -0.02467178,
        -0.0324347 , -0.00504997,  0.03246455, -0.02389323, -0.03635646,
  

## Modelling a text dataset

* Model 0: Naive Bayes (baseline)
* Model 1: feed-forward neural network (dense model)
* Model 2: LSTM model(RNN)
* Model 3: GRU model(RNN)
* Model 4: bidirectional-LSTM model(RNN)
* Model 5: 1D Convolutional Neural Network (CNN)
* Model 6: Tensorflow Hub Pretrained Feature Extrator
* Model 7: Same as model 6 with 10% of training data

### Model 0: baseline

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

model_0 = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ('clf', MultinomialNB())
])

# Fit the pipline to the training data
model_0.fit(train_sentences, train_labels)

In [30]:
# Evaluate our baseline model
baseline_score = model_0.score(val_sentences, val_labels)
print(f'our baseline model achieves an accuracy of:{baseline_score*100:.2f}%')

our baseline model achieves an accuracy of:79.53%


In [31]:
train_df.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [32]:
# Make predictions
baseline_preds = model_0.predict(val_sentences)
baseline_preds[:20]

array([0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1])

### evaluation function

In [33]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
def evaluations(y_true, y_pred):
  eval = {}
  eval['accuracy'] = accuracy_score(y_true, y_pred) * 100
  eval['precision'] = precision_score(y_true, y_pred, average='weighted')
  eval['recall'] = recall_score(y_true, y_pred, average='weighted')
  eval['f1'] = f1_score(y_true, y_pred, average='weighted')

  return eval



In [34]:
baseline_results = evaluations(val_labels, baseline_preds)
baseline_results

{'accuracy': 79.52755905511812,
 'precision': 0.8123798624937205,
 'recall': 0.7952755905511811,
 'f1': 0.7894130279169649}

### Model 1: simple dense model

In [35]:
# Build model with the functional API
inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.GlobalAveragePooling1D()(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)

model_1 = tf.keras.Model(inputs, outputs, name='model_1_dense')

In [36]:
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (Text  (None, 15)                0         
 Vectorization)                                                  
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d (  (None, 128)               0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1280129 (4.88 MB)
Trainable params: 128

In [37]:
model_1.compile(loss='binary_crossentropy',
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

In [38]:
history_1 = model_1.fit(x=train_sentences,
                        y=train_labels,
                        epochs=5,
                        validation_data=(val_sentences, val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [39]:
model_1.evaluate(val_sentences, val_labels)



[0.4709113836288452, 0.7939632534980774]

In [40]:
model_1_predictions = model_1.predict(val_sentences)



In [41]:
model_1_predictions.shape

(762, 1)

In [42]:
model_1_predictions[:10]

array([[0.2533791 ],
       [0.1503093 ],
       [0.02130394],
       [0.89704484],
       [0.00363819],
       [0.75702626],
       [0.00983639],
       [0.30602443],
       [0.21606572],
       [0.31883526]], dtype=float32)

In [43]:
model_1_preds = tf.squeeze(tf.round(model_1_predictions))

In [44]:
model_1_results = evaluations(val_labels, model_1_preds)

In [45]:
model_1_results

{'accuracy': 79.39632545931758,
 'precision': 0.7981841493071892,
 'recall': 0.7939632545931758,
 'f1': 0.7914526045911954}

## Visualizing learned embeddings

In [46]:
# Get the vocabulary from the text vectorization layer
words_in_vocab = text_vectorizer.get_vocabulary()
len(words_in_vocab), words_in_vocab[:10]

(10000, ['', '[UNK]', 'the', 'a', 'in', 'to', 'of', 'and', 'i', 'is'])

In [47]:
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (Text  (None, 15)                0         
 Vectorization)                                                  
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d (  (None, 128)               0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1280129 (4.88 MB)
Trainable params: 128

In [48]:
# Get the weight matrix of embeding layer
embed_weights = model_1.get_layer("embedding").get_weights()[0]
embed_weights.shape

(10000, 128)

In [49]:
# Create embedding files
import io
out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')

for index, word in enumerate(words_in_vocab):
  if index == 0:
    continue  # skip 0, it's padding.
  vec = embed_weights[index]
  out_v.write('\t'.join([str(x) for x in vec]) + "\n")
  out_m.write(word + "\n")
out_v.close()
out_m.close()

In [50]:
# Download files from colab projector.tensorflow.org
# try:
#   from google.colab import files
#   files.download('vectors.tsv')
#   files.download('metadata.tsv')
# except Exception:
#   pass

## Recurrent Neural Networks (RNN)

## Model 2: LSTM

In [51]:
inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
# x = tf.keras.layers.LSTM(units=64, return_sequences=True)(x)
x = tf.keras.layers.LSTM(64)(x)
# x = tf.keras.layers.Dense(64, activation='relu')(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model_2 = tf.keras.Model(inputs, outputs, name="model_2_LSTM")


In [52]:
model_2.summary()

Model: "model_2_LSTM"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (Text  (None, 15)                0         
 Vectorization)                                                  
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 lstm (LSTM)                 (None, 64)                49408     
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1329473 (5.07 MB)
Trainable params: 1329473 (5.07 MB)
Non-trainable params: 0 (0.00 Byte)
________________

In [53]:
model_2.compile(loss='binary_crossentropy',
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

In [54]:
history_2 = model_2.fit(train_sentences,
                        train_labels,
                        epochs=5,
                        validation_data=(val_sentences, val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [55]:
model_2_pred_probs = model_2.predict(val_sentences)




In [56]:
model_2_preds = tf.squeeze(tf.round(model_2_pred_probs))

In [57]:
model_2_results = evaluations(val_labels, model_2_preds)

In [58]:
model_2_results

{'accuracy': 76.50918635170603,
 'precision': 0.7697430229804574,
 'recall': 0.7650918635170604,
 'f1': 0.7615994978303363}

In [59]:
inputs = tf.keras.layers.Input(shape=(1, ), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.LSTM(64, return_sequences=True)(x)
x = tf.keras.layers.LSTM(64)(x)
x = tf.keras.layers.Dense(64, activation='relu')(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model_2_1 = tf.keras.Model(inputs, outputs)
model_2_1.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (Text  (None, 15)                0         
 Vectorization)                                                  
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 lstm_1 (LSTM)               (None, 15, 64)            49408     
                                                                 
 lstm_2 (LSTM)               (None, 64)                33024     
                                                                 
 dense_2 (Dense)             (None, 64)                4160      
                                                             

In [60]:
model_2_1.compile(loss='binary_crossentropy',
                  optimizer=tf.keras.optimizers.Adam(),
                  metrics=['accuracy'])

In [61]:
history_2_1 = model_2_1.fit(train_sentences,
                            train_labels,
                            epochs=5,
                            validation_data=(val_sentences, val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Model 3: GRU

In [62]:
inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.GRU(64)(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model_3 = tf.keras.Model(inputs, outputs)
model_3.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (Text  (None, 15)                0         
 Vectorization)                                                  
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 gru (GRU)                   (None, 64)                37248     
                                                                 
 dense_4 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1317313 (5.03 MB)
Trainable params: 1317313 (5.03 MB)
Non-trainable params: 0 (0.00 Byte)
_____________________

In [63]:
model_3.compile(loss='binary_crossentropy',
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

In [64]:
history_3 = model_3.fit(train_sentences,
                        train_labels,
                        epochs=5,
                        validation_data=(val_sentences, val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [65]:
model_3_pred_probs = model_3.predict(val_sentences)




In [66]:
model_3_preds = tf.squeeze(tf.round(model_3_pred_probs))

In [67]:
model_3_results = evaluations(val_labels, model_3_preds)
model_3_results

{'accuracy': 73.22834645669292,
 'precision': 0.7364823609732651,
 'recall': 0.7322834645669292,
 'f1': 0.7327488497766634}

### Model 4: bidirectional lstm

In [68]:
# build a bidirectional RNN in tensorflow
inputs = tf.keras.layers.Input(shape=(1, ), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
# x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True))(x)
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model_4 = tf.keras.Model(inputs, outputs)
model_4.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (Text  (None, 15)                0         
 Vectorization)                                                  
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 bidirectional (Bidirection  (None, 128)               98816     
 al)                                                             
                                                                 
 dense_5 (Dense)             (None, 1)                 129       
                                                                 
Total params: 1378945 (5.26 MB)
Trainable params: 1378945 (

In [69]:
model_4.compile(loss='binary_crossentropy',
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

In [70]:
history_4 = model_4.fit(train_sentences,
                        train_labels,
                        epochs=5,
                        validation_data=(val_sentences, val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [71]:
model_4_pred_probs = model_4.predict(val_sentences)



In [72]:
model_4_preds = tf.squeeze(tf.round(model_4_pred_probs))

In [73]:
model_4_results = evaluations(val_labels, model_4_preds)

In [74]:
model_4_results

{'accuracy': 73.75328083989501,
 'precision': 0.7374227582685459,
 'recall': 0.7375328083989501,
 'f1': 0.7374726897268776}

### Model 5: convolution 1d

In [75]:
inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.Conv1D(64, 5, activation='relu')(x)
x = tf.keras.layers.GlobalMaxPooling1D()(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model_5 = tf.keras.Model(inputs, outputs)
model_5.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_6 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (Text  (None, 15)                0         
 Vectorization)                                                  
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 conv1d (Conv1D)             (None, 11, 64)            41024     
                                                                 
 global_max_pooling1d (Glob  (None, 64)                0         
 alMaxPooling1D)                                                 
                                                                 
 dense_6 (Dense)             (None, 1)                 65  

In [76]:
model_5.compile(loss='binary_crossentropy',
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

In [77]:
history_5 = model_5.fit(train_sentences,
                        train_labels,
                        epochs=5,
                        validation_data=(val_sentences,val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [78]:
 model_5_pred_probs = model_5.predict(val_sentences)
 model_5_preds = tf.squeeze(tf.round(model_5_pred_probs))
 model_5_results = evaluations(val_labels, model_5_preds)



In [79]:
model_5_results

{'accuracy': 73.49081364829397,
 'precision': 0.7351691467422742,
 'recall': 0.7349081364829396,
 'f1': 0.7329728469706113}

## Model 6: Tensorflow hub use* feature extractor

In [82]:
import tensorflow_hub as hub
embed = hub.load("https://www.kaggle.com/models/google/universal-sentence-encoder/frameworks/TensorFlow2/variations/universal-sentence-encoder/versions/2")
embed_samples = embed([sample_sentence,
                        "when you can the universal sentence encoder on a sentence, it turns it into numbers."])

In [84]:
print(embed_samples[0][:50])

tf.Tensor(
[-0.01157028  0.0248591   0.02878048 -0.012715    0.03971538  0.0882776
  0.02680984  0.05589836 -0.0106873  -0.00597291  0.00639323 -0.01819518
  0.00030813  0.09105888  0.05874644 -0.03180628  0.01512474 -0.05162929
  0.00991367 -0.06865347 -0.04209306  0.02678981  0.03011006  0.00321069
 -0.00337973 -0.04787357  0.0226672  -0.00985925 -0.04063613 -0.01292092
 -0.04666384  0.05630299 -0.03949255  0.00517686  0.02495829 -0.0701444
  0.02871508  0.04947684 -0.00633979 -0.08960192  0.02807118 -0.00808364
 -0.01360602  0.0599865  -0.10361787 -0.05195374  0.00232954 -0.02332531
 -0.03758105  0.03327728], shape=(50,), dtype=float32)


In [85]:
embed_samples[0].shape

TensorShape([512])

In [89]:
sentence_encoder_layer = hub.KerasLayer("https://www.kaggle.com/models/google/universal-sentence-encoder/frameworks/TensorFlow2/variations/universal-sentence-encoder/versions/2",
                                        input_shape=[],
                                        dtype=tf.string,
                                        trainable=False,
                                        name='USE')


In [91]:
model_6 = tf.keras.Sequential([
    sentence_encoder_layer,
    tf.keras.layers.Dense(1, activation='relu')
], name='model_6_USE')

In [92]:
model_6.compile(loss='binary_crossentropy',
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuray'])

In [93]:
model_6.summary()

Model: "model_6_USE"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 USE (KerasLayer)            (None, 512)               256797824 
                                                                 
 dense_7 (Dense)             (None, 1)                 513       
                                                                 
Total params: 256798337 (979.61 MB)
Trainable params: 513 (2.00 KB)
Non-trainable params: 256797824 (979.61 MB)
_________________________________________________________________
