# Introduction to NLP Fundamentals in TensorFlow

Derive information from text or speech

## Helper Functions

In [1]:
import helper_functions
from helper_functions import unzip_data, create_tensorboard_callback, plot_loss_curves, compare_historys

## Evaluation Function

In [2]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate(y_true, preds):
    """Score predictions against ground-truth labels.

    Args:
        y_true: Ground-truth labels.
        preds: Predicted labels, aligned element-wise with ``y_true``.

    Returns:
        dict: Accuracy, precision, recall and F1, each multiplied by 100,
        under the keys "Accuracy Score", "Precision Score", "Recall Score"
        and "F1-Score".
    """
    scorers = (
        ("Accuracy Score", accuracy_score),
        ("Precision Score", precision_score),
        ("Recall Score", recall_score),
        ("F1-Score", f1_score),
    )
    # Every scorer is called with its library defaults; results are scaled to percentages.
    return {label: (scorer(y_true, preds) * 100) for label, scorer in scorers}

## Get Text Data Set

In [3]:
# Kaggle's intro to NLP dataset - target [1 - disaster, 0 - not disaster]
import pandas as pd

# Load the training and test CSV files into DataFrames.
train_data, test_data = map(
    pd.read_csv,
    ("data/nlp-getting-started/train.csv", "data/nlp-getting-started/test.csv"),
)

## Visualizing Text Dataset

In [4]:
# Shuffle the rows: frac=1 samples the whole frame, and the fixed random_state
# makes the shuffled order repeatable across runs.
train_data_shuffled = train_data.sample(frac = 1, random_state=42)
# Preview the first three shuffled rows.
train_data_shuffled.head(3)

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1


In [5]:
# How many examples does each class have?
train_data_shuffled["target"].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [6]:
# Visualize a window of consecutive rows from the shuffled training data,
# starting at a random position.
import random

N_EXAMPLES = 5  # number of rows to print

# Bound the start index by the frame that is actually sliced, and clamp the
# upper bound at 0 so random.randint never receives an empty range (it raises
# ValueError) when the frame holds fewer than N_EXAMPLES rows.
random_index = random.randint(0, max(0, len(train_data_shuffled) - N_EXAMPLES))
sample_rows = train_data_shuffled[["text", "target"]].iloc[random_index:random_index + N_EXAMPLES]

# index=False yields (text, target) tuples directly, so the row index is never
# bound to `_` (which IPython uses for the last cell output).
for text, target in sample_rows.itertuples(index=False):
    print(f"Target: {target}", "(real disaster)" if target > 0 else "(not real disaster)")
    print(f"Text:\n{text}\n")

Target: 1 (real disaster)
Text:
#DebateQuestionsWeWantToHear Why does #Saudi arabia and #Israel get away with mass murder?
#Wahhabism #Zionism

Target: 0 (not real disaster)
Text:
The Five Fatal Flaws in the Iran Deal https://t.co/ztfEAd8GId via @YouTube

Target: 1 (real disaster)
Text:
#Sismo DETECTADO #JapÌ_n [Report 3] 01:02:17 Okinawa Island region M3.8 Depth 10km Maximum seismic intensity 3 JST #??

Target: 0 (not real disaster)
Text:
for some reason im listening to curfew overtime and stuck in a kodak over and over again

Target: 1 (real disaster)
Text:
TV program I saw said US air plane flew to uranium mine in Fukushima and attacked by machine gun when student army were digging it.



### Split data into training and validation splits

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Hold out 10% of the shuffled training data as a validation set
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    train_data_shuffled["text"].to_numpy(),    # inputs: the raw text column
    train_data_shuffled["target"].to_numpy(),  # labels: the target column
    test_size=0.1,
    random_state=42,
)
print(f"Train: {len(train_sentences)}, Validation: {len(val_sentences)}")

Train: 6851, Validation: 762


## Text Vectorization (Tokenization)

In [9]:
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [10]:
# Average number of whitespace-separated tokens per training sentence,
# rounded to the nearest integer.
# NOTE: despite its name, `max_lenght` holds the *mean* token count, not the maximum.
max_lenght = round(
    sum(len(sentence.split()) for sentence in train_sentences) / len(train_sentences)
)
max_lenght

15

In [11]:
# Setup the vectorization variables
MAX_VOCAB_LENGTH = 10_000  # how many words in the vocab
MAX_LENGTH = max_lenght    # how long each output sequence should be
OUTPUT_MODE = "int"        # emit integer token indices

# `pad_to_max_tokens` is deliberately not passed: Keras documents it as valid
# only for the "multi_hot", "count" and "tf_idf" output modes, so with
# OUTPUT_MODE = "int" it has no effect and only misleads the reader.
text_vectorizer = TextVectorization(
    max_tokens=MAX_VOCAB_LENGTH,
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    ngrams=None,
    output_mode=OUTPUT_MODE,
    output_sequence_length=MAX_LENGTH,
)

2022-05-27 01:19:23.676599: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [12]:
# Fit the text vectorizer to the training text
# (only the training sentences are used here, not the validation split)
text_vectorizer.adapt(train_sentences)

In [13]:
# Create a sample sentence and tokenize it.
# The vectorizer is called on a list, i.e. a batch holding a single string.
sample_sentence = "There is a flood in my street"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[ 74,   9,   3, 232,   4,  13, 698,   0,   0,   0,   0,   0,   0,
          0,   0]])>

## Embedding Layers

Parameters to care about:
* `input_dim`
* `output_dim`
* `input_length`

In [14]:
from tensorflow.keras import layers

OUTPUT_DIM = 128  # size of each embedding vector

embedding = layers.Embedding(
    input_dim=MAX_VOCAB_LENGTH,  # vocabulary size used by the vectorizer
    output_dim=OUTPUT_DIM,
    input_length=MAX_LENGTH,     # length of each input sequence
)

In [15]:
# Get a random sentence from the training set
random_sentence = random.choice(train_sentences)
# The message is kept on one line: a backslash continuation inside the string
# literal would pull the next line's indentation into the printed text.
print(f"Original Text:\n {random_sentence}\n\nEmbedded Sentence:")

# Tokenize the sentence (as a batch of one) and pass it through the embedding layer
sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

Original Text:
 Time collapse is such a cool video technique.  https://t.co/upLFSqMr0C    

Embedded Sentence:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[ 0.02616053, -0.03138056,  0.01228428, ..., -0.02613548,
         -0.0083745 ,  0.00446911],
        [ 0.032688  , -0.01101407,  0.01527759, ...,  0.00511795,
         -0.01518694,  0.04057634],
        [-0.03250631,  0.01465285, -0.01969502, ..., -0.02715397,
          0.01426859, -0.00975158],
        ...,
        [-0.00662961,  0.03503707,  0.04418975, ..., -0.00381732,
         -0.00194566,  0.00450187],
        [-0.00662961,  0.03503707,  0.04418975, ..., -0.00381732,
         -0.00194566,  0.00450187],
        [-0.00662961,  0.03503707,  0.04418975, ..., -0.00381732,
         -0.00194566,  0.00450187]]], dtype=float32)>

## Modelling Text Data - Running Multiple Experiments

* Model 0: Naive Bayes (baseline)
* Model 1: Feed Forward Neural Network (dense network)
* Model 2: LSTM Model (RNN)
* Model 3: GRU Model (RNN)
* Model 4: Bidirectional-LSTM Model (RNN)
* Model 5: 1-D Convolutional Network (CNN)
* Model 6: Transfer Learning (Tensorflow Hub)
* Model 7: Same as Model 6, trained on only 10% of the data

### Model 0: Naive Bayes - Baseline Model
* `Multinomial Naive Bayes`

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Baseline: TF-IDF features fed into a multinomial Naive Bayes classifier
model_0 = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),  # convert text to numbers
        ("clf", MultinomialNB()),      # model the text
    ]
)

# Fit the whole pipeline on the training sentences and labels
model_0.fit(train_sentences, train_labels)



In [17]:
# Evaluate the baseline model on the validation split.
# The value returned by the pipeline's score() is printed below as an accuracy percentage.

baseline_score = model_0.score(val_sentences, val_labels)
print(f"Baseline model achieves an accuracy of: {baseline_score * 100:.2f}")

Baseline model achieves an accuracy of: 79.27


In [18]:
# Predict on the validation sentences and score the predictions with evaluate()
baseline_preds = model_0.predict(val_sentences)
baseline_results = evaluate(val_labels, baseline_preds)
baseline_results

{'Accuracy Score': 79.26509186351706,
 'Precision Score': 88.6178861788618,
 'Recall Score': 62.643678160919535,
 'F1-Score': 73.4006734006734}

### Model 1: Feed-Forward Neural Net (Dense Network)

In [19]:
# Create Tensorboard Callback 
from helper_functions import create_tensorboard_callback

# Name of the directory handed to the TensorBoard callback helper as `dir_name`.
# This assignment only defines the name; it does not create anything on disk.
SAVE_DIR = "model_logs"

In [20]:
# Build a model with the Functional API
from tensorflow.keras import layers

# Inputs are raw strings, one string per example
inputs = layers.Input(shape = (1, ), dtype = tf.string)
# Turn each string into a fixed-length sequence of integer token ids
x = text_vectorizer(inputs)
# Look up an embedding vector for every token id.
# NOTE(review): this is the module-level `embedding` layer object, so any other
# model built on it would presumably share (and keep training) the same
# weights - confirm this is intended.
x = embedding(x)
# Average the token embeddings over the sequence axis into one vector per example
x = layers.GlobalAveragePooling1D()(x)
# Single sigmoid unit: one value between 0 and 1 per example
outputs = layers.Dense(1, activation = "sigmoid")(x)
model_1 = tf.keras.Model(inputs, outputs, name = "model_1_dense")

In [21]:
# Print the layer-by-layer summary (output shapes and parameter counts)
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 15)               0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1,280,129
Trainable params: 1,280,129
N

In [22]:
# Compile the model.
# Binary cross-entropy pairs with the model's single sigmoid output unit;
# Adam is used with its default settings and accuracy is tracked as a metric.
model_1.compile(loss = "binary_crossentropy",
                optimizer = tf.keras.optimizers.Adam(),
                metrics = ["accuracy"])

In [23]:
# Fit the model for 5 epochs on the training split, evaluating on the
# validation split passed as validation_data. The callback built by the helper
# receives SAVE_DIR and the experiment name "model_1_dense".

model_1_history = model_1.fit(
    x = train_sentences,
    y = train_labels,
    epochs  = 5,
    validation_data = (val_sentences, val_labels),
    callbacks = [create_tensorboard_callback(dir_name = SAVE_DIR,
                                            experiment_name = "model_1_dense")]
)

Saving TensorBoard log files to: model_logs/model_1_dense/20220527-011926
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [27]:
# Predict on the validation sentences; the model's single sigmoid unit gives
# one value per example, so the result has a trailing axis of size 1.
model_1_preds_probs = model_1.predict(val_sentences)
model_1_preds_probs.shape




(762, 1)

In [28]:
# Convert prediction probabilities to label format:
# round each value to the nearest integer (0. or 1.), then squeeze away the
# size-1 axis so the predictions form a flat vector.
model_1_preds = tf.squeeze(tf.round(model_1_preds_probs))
model_1_preds[:5]

<tf.Tensor: shape=(5,), dtype=float32, numpy=array([0., 1., 1., 0., 0.], dtype=float32)>

In [34]:
# Evaluate model_1 on the validation labels and print its metrics next to the
# baseline's for a side-by-side comparison
model_1_results = evaluate(val_labels, model_1_preds)
print(f"Model_0 Results:\n {baseline_results}")
print(f"Model_1 Results:\n {model_1_results}")

Model_0 Results:
 {'Accuracy Score': 79.26509186351706, 'Precision Score': 88.6178861788618, 'Recall Score': 62.643678160919535, 'F1-Score': 73.4006734006734}
Model_1 Results:
 {'Accuracy Score': 79.13385826771653, 'Precision Score': 84.36363636363636, 'Recall Score': 66.66666666666666, 'F1-Score': 74.47833065810595}
