<a href="https://colab.research.google.com/github/terrysu64/Machine-Learning-and-Data-Science-Projects/blob/main/introduction_to_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLP Fundamentals

In [None]:
!wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py

--2022-08-25 15:00:57--  https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10246 (10K) [text/plain]
Saving to: ‘helper_functions.py.1’


2022-08-25 15:00:57 (58.4 MB/s) - ‘helper_functions.py.1’ saved [10246/10246]



In [None]:
from helper_functions import create_tensorboard_callback, plot_loss_curves, unzip_data, walk_through_dir

### import dataset
kaggle: https://www.kaggle.com/competitions/nlp-getting-started

In [None]:
!wget https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip

--2022-08-25 15:00:58--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.20.128, 108.177.98.128, 74.125.197.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.20.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: ‘nlp_getting_started.zip.1’


2022-08-25 15:00:58 (127 MB/s) - ‘nlp_getting_started.zip.1’ saved [607343/607343]



In [None]:
# Extract the downloaded archive with the helper imported from helper_functions.
# The next cell reads train.csv / test.csv from the working directory, so the files
# presumably land there -- TODO confirm against helper_functions.unzip_data.
unzip_data('nlp_getting_started.zip')

### visualize text dataset

In [None]:
import pandas as pd

# Read both CSV files, then shuffle every row: frac=1 samples 100% of the rows and the
# fixed random_state makes the shuffled order reproducible.
train_df = pd.read_csv('train.csv')
train_df = train_df.sample(frac=1, random_state=42)

test_df = pd.read_csv('test.csv')
test_df = test_df.sample(frac=1, random_state=42)

# Peek at the first shuffled training rows
train_df.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [None]:
# Class balance check: number of training tweets per target label
train_df["target"].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [None]:
# Row counts of the training and test frames, as a (train, test) tuple
train_df.shape[0], test_df.shape[0]

(7613, 3263)

In [None]:
from random import randint

# Print three randomly chosen training tweets together with their label.
# The row position is drawn from the actual frame length rather than a hard-coded 7612
# (which could never pick the first row and breaks if the dataset size changes).
# .iloc is used because train_df was shuffled, so index labels no longer equal row positions.
for _ in range(3):
  row = train_df.iloc[randint(0, len(train_df) - 1)]
  print(f"{'disaster' if row['target'] else 'not disaster'}: {row['text']}")

not disaster: @TheSewphist whoever holds the address 'fuckface@wineisdumb.com' is going to be deluged in spam meant for me
not disaster: @ACarewornHeart Have a good un fella sorry I won't be there to get annihilated with you :(
not disaster: niggas love hating.


### split data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the labelled tweets for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    train_df["text"].to_numpy(),    # features: raw tweet strings
    train_df["target"].to_numpy(),  # labels: the target column
    test_size=0.1,
    random_state=42,
)

In [None]:
# Demo of DataFrame.to_numpy(): the first rows of train_df become one NumPy array (one row per tweet)
train_df.head().to_numpy() # converts DataFrame => NumPy array

array([[3796, 'destruction', nan,
        'So you have a new weapon that can cause un-imaginable destruction.',
        1],
       [3185, 'deluge', nan,
        'The f$&amp;@ing things I do for #GISHWHES Just got soaked in a deluge going for pads and tampons. Thx @mishacollins @/@',
        0],
       [7769, 'police', 'UK',
        'DT @georgegalloway: RT @Galloway4Mayor: \x89ÛÏThe CoL police can catch a pickpocket in Liverpool Stree... http://t.co/vXIn1gOq4Q',
        1],
       [191, 'aftershock', nan,
        'Aftershock back to school kick off was great. I want to thank everyone for making it possible. What a great night.',
        0],
       [9810, 'trauma', 'Montgomery County, MD',
        'in response to trauma Children of Addicts develop a defensive self - one that decreases vulnerability. (3',
        0]], dtype=object)

### converting text => numbers

In [None]:
# tokenization: maps each word to a specific integer id
# embedding: a dense vector per token that represents relationships between tokens
#            (its size can be limited and its values can be learned)

import tensorflow as tf

# TextVectorization is exposed directly from tensorflow.keras.layers in current releases;
# the `layers.experimental.preprocessing` path is deprecated, so it is kept only as a
# fallback for older TensorFlow installs.
try:
    from tensorflow.keras.layers import TextVectorization
except ImportError:
    from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

MAX_VOCAB=10000  # upper bound on vocabulary size
MAX_LENGTH=15    # every tweet is truncated / zero-padded to this many tokens
vectorizer = TextVectorization(max_tokens=MAX_VOCAB, # limit on how many words are kept in the vocab
                               standardize="lower_and_strip_punctuation",
                               split="whitespace",
                               ngrams=None, # optionally create groups of n words
                               output_mode="int", # emit one integer id per token
                               output_sequence_length=MAX_LENGTH, # fixed output length per tweet
                               # Per the Keras docs this flag only matters for the
                               # "multi_hot"/"count"/"tf_idf" output modes. It does NOT map unknown
                               # words to 0: out-of-vocabulary words become [UNK] (id 1) and 0 is
                               # the padding id.
                               pad_to_max_tokens=True
)

In [None]:
# Fit the vectorizer on the training tweets only (the validation split is not passed in)
vectorizer.adapt(X_train)

In [None]:
# Sanity check on one sentence: the result has MAX_LENGTH ids per sentence,
# with trailing 0s as padding and 1 standing for a word outside the vocabulary ([UNK])
vectorizer(['hi there buddy'])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[1663,   74,    1,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0]])>

In [None]:
# Vocabulary held by the adapted vectorizer
vocab = vectorizer.get_vocabulary()

# First five entries: '' (padding) and '[UNK]' come first, then actual words
vocab[0:5]

['', '[UNK]', 'the', 'a', 'in']

### creating embedding layer

In [None]:
from tensorflow.keras import layers

# Trainable lookup table that turns each token id into a 128-dimensional vector
embedding = layers.Embedding(
    input_dim=MAX_VOCAB,      # number of distinct token ids the table covers
    output_dim=128,           # vector size (author's note: sizes divisible by 8 tend to work well)
    input_length=MAX_LENGTH,  # number of token ids per vectorized tweet
)

# Quick check: vectorize one sentence, then embed it
embedding(
    vectorizer("hi there buddy")
)

<tf.Tensor: shape=(15, 128), dtype=float32, numpy=
array([[-0.04017498, -0.03317352, -0.01811362, ...,  0.00358735,
        -0.02809337, -0.03491491],
       [-0.00638739,  0.03134339,  0.01986064, ..., -0.00869809,
        -0.04893377, -0.04181733],
       [ 0.03532271, -0.025186  ,  0.00998145, ..., -0.01108205,
         0.00590509, -0.04034468],
       ...,
       [-0.02443194,  0.02620092,  0.03965751, ..., -0.00576602,
         0.03838165, -0.01329004],
       [-0.02443194,  0.02620092,  0.03965751, ..., -0.00576602,
         0.03838165, -0.01329004],
       [-0.02443194,  0.02620092,  0.03965751, ..., -0.00576602,
         0.03838165, -0.01329004]], dtype=float32)>

## building models for the dataset
### multinomial naive Bayes using TF-IDF features

note: it is common to start with a non-deep-learning baseline before trying deep learning models

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Baseline: TF-IDF features fed into a multinomial naive Bayes classifier,
# chained in one Pipeline so raw tweet strings can be passed straight to fit/predict.
model = Pipeline(steps=[("tfidf", TfidfVectorizer()),
                        ("clf", MultinomialNB())])

# Train on the training split (the fitted pipeline is displayed as the cell output)
model.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

In [None]:
# Score the baseline pipeline on the held-out validation split and keep it for later comparison
baseline_score = model.score(X=X_val, y=y_val)
baseline_score

0.7926509186351706

In [None]:
# Side-by-side look at the first five validation predictions and the first five true labels
model.predict(X_val)[:5], y_val[:5]

(array([1, 1, 1, 0, 0]), array([0, 0, 1, 1, 1]))

In [None]:
from sklearn.metrics import classification_report

# classification_report expects (y_true, y_pred) in that order. Passing the predictions
# first swaps the roles of truth and prediction, so precision and recall come out exchanged.
report = classification_report(y_val,
                               model.predict(X_val))

# print() renders the table with real line breaks; a bare `report` shows an escaped one-line string
print(report)

'              precision    recall  f1-score   support\n\n           0       0.93      0.75      0.83       516\n           1       0.63      0.89      0.73       246\n\n    accuracy                           0.79       762\n   macro avg       0.78      0.82      0.78       762\nweighted avg       0.83      0.79      0.80       762\n'

In [None]:
#               precision    recall  f1-score   support

#            0       0.93      0.75      0.83       516
#            1       0.63      0.89      0.73       246

#     accuracy                           0.79       762
#    macro avg       0.78      0.82      0.78       762
# weighted avg       0.83      0.79      0.80       762


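### model 1: RNN (LSTM)
In a recurrent neural network, connections between nodes form a directed or undirected graph along a temporal sequence, which lets the model use earlier tokens as context for later ones.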

In [None]:
from tensorflow.keras import layers

# Model 1: two stacked LSTM layers on top of a trainable embedding.
# A fresh Embedding layer is created for this model instead of reusing the global `embedding`
# object; sharing one layer instance across models (or across re-runs of this cell) would let
# weights trained by one model leak into the next and skew the comparison.
lstm_embedding = layers.Embedding(input_dim=MAX_VOCAB,
                                  output_dim=128)

inputs = layers.Input(shape=(1,), dtype="string")  # one raw tweet string per example
x = vectorizer(inputs)                             # string -> MAX_LENGTH token ids
x = lstm_embedding(x)                              # token ids -> 128-d vectors
x = layers.LSTM(64, return_sequences=True)(x)      # keep the full sequence for the next LSTM
x = layers.LSTM(64, return_sequences=False)(x)     # collapse the sequence to one vector
print(x.shape)
x = layers.Dense(64, activation="relu")(x)
outputs = layers.Dense(1, activation="sigmoid")(x) # single probability for the binary target
model = tf.keras.Model(inputs,outputs,name="model_LSTM")

(None, 64)


In [None]:
# Print the layer-by-layer overview (output shapes and parameter counts) of the model built above
model.summary()

Model: "model_LSTM"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding_1 (Embedding)     (None, 15, 128)           1280000   
                                                                 
 lstm_4 (LSTM)               (None, 15, 64)            49408     
                                                                 
 lstm_5 (LSTM)               (None, 64)                33024     
                                                                 
 dense_9 (Dense)             (None, 64)                4160      
                                                        

In [None]:
# Binary classification setup: Adam optimizer, binary cross-entropy loss, accuracy as the metric
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Train for 5 epochs, evaluating on the validation split after each one;
# the returned History object is kept in `history`.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### model 2: GRU

In [None]:
# Model 2: two stacked GRU layers on top of a trainable embedding.
# Fixes relative to the previous version of this cell:
#   * the second recurrent layer was an LSTM (copy-paste from model 1), so the "GRU" model was
#     really a GRU+LSTM hybrid; both recurrent layers are GRUs now.
#   * a fresh Embedding layer is created instead of reusing the global `embedding` object, so
#     this model does not start from weights already trained by another model.
gru_embedding = layers.Embedding(input_dim=MAX_VOCAB,
                                 output_dim=128)

inputs = layers.Input(shape=(1,), dtype="string")  # one raw tweet string per example
x = vectorizer(inputs)                             # string -> MAX_LENGTH token ids
x = gru_embedding(x)                               # token ids -> 128-d vectors
x = layers.GRU(64, return_sequences=True)(x)       # keep the full sequence for the next GRU
x = layers.GRU(64, return_sequences=False)(x)      # collapse the sequence to one vector
print(x.shape)
x = layers.Dense(64, activation="relu")(x)
outputs = layers.Dense(1, activation="sigmoid")(x) # single probability for the binary target
model = tf.keras.Model(inputs,outputs,name="model_GRU")

(None, 64)


In [None]:
# Same training configuration as the other models so results stay comparable:
# Adam optimizer, binary cross-entropy loss, accuracy metric.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Fit the GRU model: 5 epochs, validation split evaluated after every epoch.
# Note that `history` is overwritten by each model's training run.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### model 3: Bidirectional-LSTM
builds a representation of the text by reading it in both directions (left to right and right to left)

In [None]:
# Model 3: two stacked bidirectional LSTM layers on top of a trainable embedding.
# Fixes relative to the previous version of this cell:
#   * the first recurrent layer wrapped a GRU (copy-paste from model 2) although the section
#     and the model name both say Bidirectional-LSTM; both wrapped layers are LSTMs now.
#   * a fresh Embedding layer is created instead of reusing the global `embedding` object, so
#     this model does not start from weights already trained by another model.
bidirectional_embedding = layers.Embedding(input_dim=MAX_VOCAB,
                                           output_dim=128)

inputs = layers.Input(shape=(1,), dtype="string")  # one raw tweet string per example
x = vectorizer(inputs)                             # string -> MAX_LENGTH token ids
x = bidirectional_embedding(x)                     # token ids -> 128-d vectors
# Each Bidirectional wrapper runs its LSTM forwards and backwards over the sequence
x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(64, return_sequences=False))(x)
x = layers.Dense(128, activation="relu")(x)
x = layers.Dense(64, activation="relu")(x)
outputs = layers.Dense(1, activation="sigmoid")(x) # single probability for the binary target
model = tf.keras.Model(inputs,outputs,name="model_LSTM_bidirectional")

In [None]:
# Print the layer-by-layer overview (output shapes and parameter counts) of the bidirectional model
model.summary()

Model: "model_LSTM_bidirectional"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_6 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding_1 (Embedding)     (None, 15, 128)           1280000   
                                                                 
 bidirectional_2 (Bidirectio  (None, 15, 128)          74496     
 nal)                                                            
                                                                 
 bidirectional_3 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                          

In [None]:
# Compile the bidirectional model with the shared setup:
# Adam optimizer, binary cross-entropy loss, accuracy metric.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Fit the bidirectional model: 5 epochs, validation split evaluated after every epoch
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### model 4: USE feature extractor

In [None]:
import tensorflow_hub as hub

# Load the pretrained Universal Sentence Encoder (version 4) from TensorFlow Hub
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# Encode two sample sentences; the encoder returns one vector per sentence
sample_sentences = ["The quick brown fox jumps over the lazy dog.",
                    "I am a sentence for which I would like to get its embedding"]
embeddings = embed(sample_sentences)

print(embeddings)

# Reference output from the model page (512 dimensions per sentence):
#   "The quick brown fox jumps over the lazy dog."
#       -> [-0.03133016 -0.06338634 -0.01607501, ...]
#   "I am a sentence for which I would like to get its embedding."
#       -> [0.05080863 -0.0165243   0.01573782, ...]


tf.Tensor(
[[-0.03133019 -0.06338634 -0.01607501 ... -0.03242778 -0.04575738
   0.05370455]
 [ 0.0508086  -0.01652432  0.01573777 ...  0.00976659  0.0317012
   0.01788118]], shape=(2, 512), dtype=float32)


In [None]:
# Wrap the Universal Sentence Encoder as a Keras layer so it can sit at the front of a model
use_layer = hub.KerasLayer(
    "https://tfhub.dev/google/universal-sentence-encoder/4",
    name="USE",
    dtype=tf.string,   # the layer consumes raw strings
    input_shape=[],    # each example is a single scalar string
    trainable=False,   # pretrained weights stay frozen; used purely as a feature extractor
)

In [None]:
# Model 4: frozen USE sentence embeddings followed by a small dense classification head
model = tf.keras.Sequential()
model.add(use_layer)                                         # sentence -> embedding vector
model.add(tf.keras.layers.Dense(64, activation="relu"))      # hidden layer
model.add(tf.keras.layers.Dense(1, activation="sigmoid"))    # single probability for the binary target

In [None]:
# Compile the USE model with the shared setup:
# Adam optimizer, binary cross-entropy loss, accuracy metric.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Fit the USE model: 5 epochs, validation split evaluated after every epoch
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
