# Introduction to NLP Fundamentals in TensorFlow

NLP has the goal of deriving information out of natural language (could be sequences text or speech)

Another common term for NLP problems is sequence to sequence problems (seq2seq)

In [3]:
!pip3 install tensorflow[and-cuda]==2.16.1
!pip3 install keras==3.3.3
!pip3 install tensorboard==2.16.2



## Check for GPU

In [2]:
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-946be51f-909f-a42c-6f2a-d14b84c44f4f)


## Get helper functions

In [3]:
!wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py

--2024-05-10 06:22:33--  https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10246 (10K) [text/plain]
Saving to: ‘helper_functions.py’


2024-05-10 06:22:33 (105 MB/s) - ‘helper_functions.py’ saved [10246/10246]



In [4]:
from helper_functions import unzip_data, create_tensorboard_callback, plot_loss_curves, compare_historys

## Get a text dataset

The dataset we're going to be using is Kaggle's introduction to NLP dataset (text samples of Tweets labelled as disaster or not).

In [5]:
!wget https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip

--2024-05-10 06:22:39--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.12.207, 172.217.194.207, 172.253.118.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.12.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: ‘nlp_getting_started.zip’


2024-05-10 06:22:40 (718 KB/s) - ‘nlp_getting_started.zip’ saved [607343/607343]



In [5]:
unzip_data("nlp_getting_started.zip")

## Visualizing a text dataset

To visualize our text samples, we first have to read them in, one way to do so would be to use Python

But it also can be pandas..

In [6]:
import pandas as pd

train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [8]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [9]:
train_df['text'][1]

'Forest fire near La Ronge Sask. Canada'

In [10]:
# Shuffle training df
train_df =train_df.sample(frac=1, random_state=42)
train_df.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [11]:
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [None]:
train_df.target.value_counts()

In [None]:
len(train_df), len(test_df)

In [None]:
# Let's visualize some random train examples
import random

random_index = random.randint(0, len(train_df)-5)
for row in train_df[["text", "target"]][random_index:random_index+5].itertuples():
  _, text, target = row
  print(f"Target: {target}", "(real disaster)" if target else "(not real disaster)")
  print(f"Text:\n{text}\n")
  print("---\n")

### Split train_df into training and validation datasets

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_df["text"].to_numpy(),
                                                                            train_df["target"].to_numpy(),
                                                                            test_size=0.1, # 10% of the train_df for validation_split
                                                                            random_state=42)

In [None]:
len(train_sentences), len(train_labels), len(val_sentences), len(val_labels)

In [None]:
train_sentences[:10], train_labels[:10]

## Converting text into numbers

When dealing with a text problem, one of the first things you'll have to do before you can build a model is to convert text to numbers.

There are a few ways to do this, namely:
* Tokenization - direct mapping of token (could be word or char) to number;
* Embedding - create a matrix of featue vector for each token (the size of the feature vector can be defined and this embedding can be learned)

### Text vectorization (tokenization)

In [None]:
train_sentences[:5]

In [9]:
import tensorflow as tf
from keras.layers import TextVectorization

In [10]:
  # use default TextVectorization parameters
  text_vectorizer = TextVectorization(
      max_tokens=10000, # How many words are in vocab
      standardize="lower_and_strip_punctuation",
      split="whitespace",
      ngrams=None, # Create groups of n-words
      output_mode="int", # How to map words to num
      output_sequence_length=50, # how long does sequences will be
      pad_to_max_tokens=True
  )

In [11]:
# Find the avg number of tokens in the training tweets
round(sum([len(i.split()) for i in train_sentences])) / len(train_sentences)

14.87447088016348

In [12]:
# Setup text vectorization variables
max_vocab_length = 10000 # max num of words to have in our vocab
max_length = 15

text_vectorizer = TextVectorization(
    max_vocab_length,
    output_mode="int", # How to map words to num
  output_sequence_length=max_length, # how long does sequences will be
)

In [13]:
 text_vectorizer.adapt(train_sentences)

In [14]:
# Create a sample sentence and tokenize it
sample_sentence = "There is a flood in my street!"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[ 75,   9,   3, 206,   4,  13, 674,   0,   0,   0,   0,   0,   0,
          0,   0]])>

In [15]:
# Choose a random sentence from the ds and tokenize it
random_sentence = random.choice(train_sentences)
print(f"Original text: {random_sentence}")
print(f"Vectorized version: {text_vectorizer([random_sentence]).numpy()}")

NameError: name 'random' is not defined

In [16]:
words_in_vocab = text_vectorizer.get_vocabulary()

In [17]:
len(words_in_vocab), words_in_vocab[:5], words_in_vocab[-5:]

(10000,
 ['', '[UNK]', 'the', 'a', 'in'],
 ['pakthey', 'pakistan\x89Ûªs', 'pakistans', 'pajamas', 'paints'])

### Creating an Embedding using an Embedding Layer

To make our embedding, we're going to use TF embedding layer

The parameters we most care about:
* `input_dim` = the size of our vocab
* `output_dim` = the size of the output embedding vector, for example, a value of 100 would mean each token gets represented by a vector 100 long
* `input_length` = length of the sequences being passed to the embedding layer

In [18]:
from keras import layers

embedding = layers.Embedding(
    input_dim=max_vocab_length,  # set input shape
    output_dim=16 * 8, # 128 = 16 * 8
    input_length=max_length

)



In [None]:
embedding

In [None]:
random_sentence = random.choice(train_sentences)
print(f"Original text: {random_sentence}")

sample_embed = embedding(text_vectorizer([random_sentence]))
print(f"Embed:")
sample_embed

In [None]:
sample_embed[0][0]

## Modelling a text dataset

Now we've got a way to turn our text sequences into numbers. it's time to start building a series of modelling experiments

* Model 0: Naive Bayes (baseline)
* Model 1: Feed-forward neural network (dense model
* Model 2: LSTN (RNN)
* Model 3: GRU (RNN)
* Model 4:  Bidirectional-LSTM (RNN()
* Model 5: 1D CNN
* Model 6: TF Hub Pretrained Feature Extractor
* Model 7: the same as model 6 with 10% of train_data

https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

### Model 0: baseline

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Create tokenization and modelling pipeline
model_0 = Pipeline([
    ("tfidf", TfidfVectorizer()), # convert word to numbers
    ("clf", MultinomialNB()), # model the text
])

# Fit the pipeline to the train_data
model_0.fit(train_sentences, train_labels)

In [20]:
# Evaluate our baseline model
baseline_score = model_0.score(val_sentences, val_labels)
print(f"Our baseline model achieves an accuracy of: {baseline_score*100:.2f}%")

Our baseline model achieves an accuracy of: 77.82%


In [21]:
# Make predictions
baseline_preds = model_0.predict(val_sentences)
baseline_preds[:20]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0])

In [22]:
from sklearn import metrics

def evaluate_model(y_true, y_pred):
  "Evaluate a model on Accuracy, Precision, Recall and F1-score"

  accuracy_score = metrics.accuracy_score(y_true, y_pred)
  precision, recall, f1_score, _ = metrics.precision_recall_fscore_support(y_true, y_pred, average="weighted")

  return {"accuracy_score": accuracy_score,
          "precision_score": precision,
          "recall_score": recall,
           "f1_score": f1_score}

In [23]:
baseline_results = evaluate_model(val_labels, baseline_preds)

### Model 1: dense model

In [24]:
SAVE_DIR = "model_logs"

In [26]:
import keras
tf.keras.__version__ == keras.__version__

True

In [None]:
inputs = layers.Input(shape=(1, ), dtype=tf.string)
x = text_vectorizer(inputs) # turn into numbers
x = embedding(x)
x = layers.GlobalMaxPooling1D()(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

model_1 = tf.keras.Model(inputs, outputs, name="model_1_dense")

In [None]:
model_1.summary()

In [None]:
model_1.compile(
    loss=keras.losses.binary_crossentropy,
    optimizer=keras.optimizers.Adam(),
    metrics=["accuracy"]
)

In [None]:
model_1_history = model_1.fit(
     train_sentences,
     train_labels,
     epochs=5,
     validation_data=(val_sentences, val_labels),
     callbacks=[create_tensorboard_callback(dir_name=SAVE_DIR, experiment_name='model_1_dense')]
)

In [None]:
model_1.evaluate(val_sentences, val_labels)

In [None]:
model_1.evaluate(val_sentences, val_labels)

In [None]:
 model_1_pred_probs = model_1.predict(val_sentences)
 model_1_pred_probs.shape

In [None]:
model_1_pred_probs[0]

In [None]:
# Convert pred probs to [1, 0]
model_1_preds = tf.squeeze(tf.round(model_1_pred_probs))
model_1_preds

In [None]:
model_1_results = evaluate_model(
    val_labels,
    model_1_preds
)

In [None]:
model_1_results

In [None]:
model_1_results

In [None]:
baseline_results

In [None]:
import numpy as np
np.array(list(model_1_results.values())) > np.array(list(baseline_results.values()))

## Visualizing learned embeddings

In [None]:
# Get the vocab from the text vectorization
words_in_vocab = text_vectorizer.get_vocabulary()

In [None]:
len(words_in_vocab), words_in_vocab[:10]

In [None]:
model_1.summary()

In [None]:
# Get the weight matrix of embedding layer
embed_weights = model_1.get_layer("embedding").get_weights()[0]
embed_weights, embed_weights.shape

Now we've got the embedding matrix our model has learned to prepresent our tokens, let's see how we can visulize it.

To do so, TensorFlow has a handy tool called projector: https://projector.tensorflow.org/

And Tensorflow also has an incredible guide word embeddings: https://www.tensorflow.org/text/guide/word_embeddings

In [None]:
# Saving embeddings
import io
out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')

for index, word in enumerate(words_in_vocab):
  if index == 0:
    continue  # skip 0, it's padding.
  vec = embed_weights[index]
  out_v.write('\t'.join([str(x) for x in vec]) + "\n")
  out_m.write(word + "\n")
out_v.close()
out_m.close()

In [None]:
# Download to import on projector
try:
  from google.colab import files
  files.download('vectors.tsv')
  files.download('metadata.tsv')
except Exception:
  pass

## Recurrent Neural Networks (RNN's)

RNN's are useful for sequence data

The premise of a RNN is to use the representation of a previous input to aid the representation of a later input.

http://introtodeeplearning.com/

### Model 2: LSTM

LSTM = long short term memory (one of the most popular LSTM cells)

Our structer (RNN) looks like this:
```
Input (text) -> Tokenize -> Embedding -> Layers (RNNs/dense) -> Output (label probs)

In [None]:
# Create LSTM model
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = embedding(x)
print(x.shape)
x = layers.LSTM(64)(x)#, return_sequences=True)(x) # when you're stacking RNN cells together, you need to return seq
# print(x.shape)
# x = layers.LSTM(64)(x)
x = layers.Dense(64, activation="relu")(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

model_2 = tf.keras.Model(inputs, outputs, name="model_2_LSTM")

In [None]:
model_2.summary()

In [None]:
model_2.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss="binary_crossentropy",
    metrics=[tf.keras.metrics.Accuracy]
)

In [None]:
# Fit the model
model_2_history = model_2.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(SAVE_DIR,
                                                                     "model_2_LSTM")])

In [None]:
model_2_pred_probs = model_2.predict(val_sentences)
model_2_pred_probs[:10]

In [None]:
# Convert model_2 pred pros to labels
model_2_preds = tf.squeeze(tf.round(model_2_pred_probs))
model_2_preds[:10]

In [None]:
model_2_results = evaluate_model(val_labels, model_2_preds)

In [None]:
model_2_results

In [None]:
np.array(list(model_2_results.values())) > np.array(list(baseline_results.values()))

### Tasks:
* Replicate tanh
* Build GRU model

In [None]:
def tanh(x):
  return (tf.exp(x) - tf.exp(-x)) / (tf.exp(x) + tf.exp(-x))

In [None]:
inputs = layers.Input((1,), dtype="string")
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.GRU(64, activation=tanh)(x)
x = layers.Dense(64, activation="relu")(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

model_3 = tf.keras.Model(inputs, outputs, name='model_3_GRU')

In [None]:
model_3.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.BinaryCrossentropy,
    metrics=["accuracy"]
)

In [None]:
model_3_history = model_3.fit(
    train_sentences,
    train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[create_tensorboard_callback(SAVE_DIR,
                                           "model_3_GRU")]
)

In [None]:
model_3_pred_probs = model_3.predict(val_sentences)

In [None]:
model_3_pred_probs[:10]

In [None]:
model_3_preds = tf.squeeze(tf.round(model_3_pred_probs))

In [None]:
model_3_preds

In [None]:
model_3_results = evaluate_model(val_labels, model_3_preds)

In [None]:
model_3_results

In [None]:
np.array(list(model_3_results.values())) > np.array(list(baseline_results.values()))

### Model 4: Bidirectional RNN

Bidirectional RNN combines representations from right to left and left to right (in senquences)

In [None]:
inputs = layers.Input((1,), dtype="string")
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.Bidirectional(layers.LSTM(128, return_sequences=True, activation=tanh))(x)
x = layers.Bidirectional(layers.GRU(128, return_sequences=True, activation=tanh))(x)
x = layers.Bidirectional(layers.LSTM(128, activation=tanh))(x)
x = layers.Dense(128, activation="relu")(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

model_4 = tf.keras.Model(inputs, outputs)

In [None]:
model_4.summary()

In [None]:
model_4.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.BinaryCrossentropy,
    metrics=["accuracy"]
)

In [None]:
model_4_history = model_4.fit(
    train_sentences,
    train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[create_tensorboard_callback(SAVE_DIR, "model_2_bidirectional")]
)

In [None]:
model_4_pred_probs = model_4.predict(val_sentences)

In [None]:
model_4_pred_probs[:10]

In [None]:
model_4_preds = tf.squeeze(tf.round(model_4_pred_probs))

In [None]:
model_4_preds[:10]

In [None]:
model_4_results = evaluate_model(val_labels, model_4_preds)

In [None]:
np.array(list(model_4_results.values())) > np.array(list(baseline_results.values()))

In [None]:
np.array(list(model_4_results.values())) > np.array(list(model_3_results.values()))

In [None]:
model_4.summary()

### Model 5: 1D CNN

In [None]:
# Test out our embedding, Conv1D and maxpool layers
embedding_text = embedding(text_vectorizer(["This is a plan to win the world"]))
conv_1d = layers.Conv1D(
    filters=64,
    kernel_size=5, # looks at 5 words at a time then new word +1, the oldest -1
    strides=1,
    activation="relu",
    padding="same" # valid
)
conv_1d_output = conv_1d(embedding_text)
max_pool_output = layers.GlobalMaxPool1D()(conv_1d_output)

embedding_text.shape, conv_1d_output.shape, max_pool_output.shape

In [25]:
from tensorflow.keras import layers

inputs = layers.Input((1,), dtype=tf.string)

x = text_vectorizer(inputs)
x = embedding(x)

x = layers.Conv1D(256, 5, activation="relu")(x)
x = layers.MaxPooling1D()(x)

x = layers.Conv1D(1024, 5, activation="relu")(x)
x = layers.MaxPooling1D()(x)

x = layers.GlobalAveragePooling1D()(x)

outputs = layers.Dense(1, activation="sigmoid")(x)

model_5 = tf.keras.Model(inputs, outputs, name="model_5_Conv1D")

In [26]:
model_5.summary()

In [27]:
# Compile the model
model_5.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

In [None]:
model_5_history = model_5.fit(
    train_sentences,
    train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[create_tensorboard_callback(SAVE_DIR, "Conv1D")]
)

Saving TensorBoard log files to: model_logs/Conv1D/20240510-062755
Epoch 1/5


In [None]:
model_5_pred_probs = model_5.predict(val_sentences)

In [30]:
model_5_preds = tf.squeeze(tf.round(model_5_pred_probs))
model_5_preds[:10]

NameError: name 'model_5_pred_probs' is not defined

In [31]:
# Evaluate model 5 preds
model_5_results = evaluate_model(val_labels, model_5_preds)
model_5_results

NameError: name 'model_5_preds' is not defined

In [None]:
baseline_results

## Model 6: TF Hub Pretrained embedding