# 🛠 Exercises

## 0. Prerequisites

In [1]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

from sklearn.model_selection import train_test_split

In [2]:
# get helper functions
!wget https://raw.githubusercontent.com/yhs2773/TensorFlow-for-Deep-Learning/main/helper_functions.py

from helper_functions import unzip_data

--2023-12-20 12:05:20--  https://raw.githubusercontent.com/yhs2773/TensorFlow-for-Deep-Learning/main/helper_functions.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10872 (11K) [text/plain]
Saving to: ‘helper_functions.py’


2023-12-20 12:05:20 (108 MB/s) - ‘helper_functions.py’ saved [10872/10872]



In [3]:
# download and unzip data
!wget "https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip"

unzip_data("nlp_getting_started.zip")

--2023-12-20 12:05:20--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 172.253.117.207, 142.250.99.207, 173.194.202.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.253.117.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: ‘nlp_getting_started.zip’


2023-12-20 12:05:21 (110 MB/s) - ‘nlp_getting_started.zip’ saved [607343/607343]



In [4]:
# load data
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [5]:
train_df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [6]:
# split train data into train and validation
train_sen, val_sen, train_label, val_label = train_test_split(train_df['text'],
                                                              train_df['target'],
                                                              test_size=0.1,
                                                              random_state=42)

## 1. Rebuild, compile, and train `model_1`, `model_2`, and `model_5` using the [Keras Sequential API](https://www.tensorflow.org/api_docs/python/tf/keras/Sequential) instead of the Functional API.

In [7]:
# text vectorizer
text_vectorizer = tf.keras.layers.TextVectorization(max_tokens = 10000, output_sequence_length = 15)

# fit text vectorizer with the train text
text_vectorizer.adapt(train_sen)

In [8]:
# model 1 with new embedding
model_1_embedding = tf.keras.layers.Embedding(input_dim=10000,
                                              output_dim=128,
                                              input_length=15)

# build
model_1 = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(1,), dtype='string'),
    text_vectorizer,    # with Sequential API, just use variables
    model_1_embedding,  # with Sequential API, just use variables
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(1, activation = 'sigmoid')
])

# compile
model_1.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])

# train
history_1 = model_1.fit(train_sen,
                        train_label,
                        epochs=5,
                        validation_data=(val_sen, val_label))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [9]:
# model 2 with new embedding
model_2_embedding = tf.keras.layers.Embedding(input_dim=10000,
                                              output_dim=128,
                                              input_length=15)

# build model
model_2 = tf.keras.Sequential([
    tf.keras.Input(shape=(1,), dtype='string'),
    text_vectorizer,
    model_2_embedding,
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# compile
model_2.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])

# train
history_2 = model_2.fit(train_sen,
                        train_label,
                        epochs=5,
                        validation_data=(val_sen, val_label))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [10]:
# model 5 with embedding
model_5_embedding = tf.keras.layers.Embedding(input_dim=10000,
                                              output_dim=128,
                                              input_length=15)

# build
model_5 = tf.keras.Sequential([
    tf.keras.Input(shape=(1,), dtype='string'),
    text_vectorizer,
    model_5_embedding,
    tf.keras.layers.Conv1D(filters=32, kernel_size=5, activation='relu'),
    tf.keras.layers.MaxPool1D(),
    tf.keras.layers.GlobalMaxPool1D(),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# compile
model_5.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])

# train
history_5 = model_5.fit(train_sen,
                        train_label,
                        epochs=5,
                        validation_data=(val_sen, val_label))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## 2. Retrain the baseline model with 10% of the training data. How does perform compared to the Universal Sentence Encoder model with 10% of the training data?

In [11]:
# import libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [12]:
# split train data into 90% and 10%
train_90_sen, train_10_sen, train_90_label, train_10_label = train_test_split(train_sen,
                                                                              train_label,
                                                                              test_size=0.1,
                                                                              random_state=42)

In [13]:
# baseline model
model_0 = Pipeline([
    ['tfidf', TfidfVectorizer()],
    ['clf', MultinomialNB()]
])

# train
model_0.fit(train_10_sen, train_10_label)

In [14]:
# predict
base_pred = model_0.predict(val_sen)

In [15]:
# function to get metrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    results = {"accuracy": accuracy,
               "precision": precision,
               "recall": recall,
               "f1": f1}
    return results

In [16]:
calculate_results(y_true=val_label,
                  y_pred=base_pred)

{'accuracy': 0.7165354330708661,
 'precision': 0.751881230363446,
 'recall': 0.7165354330708661,
 'f1': 0.6941582093397392}

`model_7` (USE model with 10% train data) scored 0.78 on accuracy, precision, and recall and 0.77 on f1.

So overall `model_7` scored higher than `model_0` when trained on 10% train data.

## 3. Try fine-tuning the TF Hub Universal Sentence Encoder model by setting `training=True` when instantiating it as a Keras layer.
```
We can use this encoding layer in place of our text_vectorizer and embedding layer

sentence_encoder_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                                        input_shape=[],
                                        dtype=tf.string,
                                        trainable=True) # turn training on to fine-tune the TensorFlow Hub model
```

In [17]:
# USE layer
import tensorflow_hub as hub

sentence_encoder_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                                        input_shape=[],
                                        dtype=tf.string,
                                        trainable=True)

In [18]:
# build
model_6_fine_tune = tf.keras.Sequential([
    sentence_encoder_layer,
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# compile
model_6_fine_tune.compile(optimizer='adam',
                          loss='binary_crossentropy',
                          metrics=['accuracy'])

# train
history_6_fine_tune = model_6_fine_tune.fit(train_sen,
                                            train_label,
                                            epochs=5,
                                            validation_data=(val_sen, val_label))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [19]:
# get prediction probabilities
model_6_pred_prob = model_6_fine_tune.predict(val_sen)



In [20]:
# get predictions
model_6_pred = tf.squeeze(tf.round(model_6_pred_prob))

In [21]:
# get metrics
calculate_results(val_label, model_6_pred)

{'accuracy': 0.7913385826771654,
 'precision': 0.7908066898847257,
 'recall': 0.7913385826771654,
 'f1': 0.7907616325889775}

Metrics were 0.82 for accuracy, precision, and recall and 0.81 for f1 without fine-tuning USE (`model_6` from chapter 8).

Seems like the best model is the pre-trained USE model.

## 4. Retrain the best model you've got so far on the whole training set (no validation split). Then use this trained model to make predictions on the test dataset and format the predictions into the same format as the `sample_submission.csv` file from Kaggle (see the Files tab in Colab for what the `sample_submission.csv` file looks like). Once you've done this, [submit it to the Kaggle competition](https://www.kaggle.com/c/nlp-getting-started/data), how did your model perform?

In [22]:
# pre-trained USE
sentence_encoder_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                                        input_shape=[],
                                        dtype=tf.string,
                                        trainable=False)

# build
final_model = tf.keras.Sequential([
    sentence_encoder_layer,
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [23]:
y_pred_prob = final_model.predict(test_df['text'])
y_pred = tf.squeeze(tf.round(y_pred_prob))



In [24]:
submission = pd.read_csv("sample_submission.csv")
submission

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0
...,...,...
3258,10861,0
3259,10865,0
3260,10868,0
3261,10874,0


In [30]:
submission['target'] = y_pred.numpy()

In [34]:
# need int type instead of float type for the target column
submission['target'] = submission['target'].astype(int)
submission

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [35]:
submission.to_csv('submission.csv', index=False)

Scored 0.4266 in Kaggle

## 5. Combine the ensemble predictions using the majority vote (mode), how does this perform compared to averaging the prediction probabilities of each model?

In [37]:
# predict with model 2
model_2_pred_prob = model_2.predict(val_sen)
model_2_pred = tf.squeeze(tf.round(model_2_pred_prob))



In [60]:
# predict with model 6 (pre-trained USE)
model_6_pred_prob = final_model.predict(val_sen)
model_6_pred = tf.squeeze(tf.round(model_6_pred_prob))



In [74]:
model_6_pred_prob.shape

(762, 1)

In [67]:
# combine predictions
all_predictions = [base_pred, model_2_pred.numpy().astype(int), model_6_pred.numpy().astype(int)]
all_predictions = np.array(all_predictions)
all_predictions.shape

(3, 762)

In [69]:
# getting the mode of the predictions
from scipy.stats import mode

hard_vote_preds, _ = mode(all_predictions, axis=0)
hard_vote_preds, len(hard_vote_preds)

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1,
        1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
        0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,
        1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0,
        1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
        0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0,
        0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
        1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0,
        1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 

In [70]:
calculate_results(val_label, hard_vote_preds)

{'accuracy': 0.7335958005249343,
 'precision': 0.7335677379950147,
 'recall': 0.7335958005249343,
 'f1': 0.7301795398258063}

Hard voting ensemble result is lower than the pre-trained USE model.

In [77]:
# If meant to be put into a function
def hard_voting(predictions):
    """
    Perform hard voting ensembling.

    Parameters:
    - predictions: List of arrays, each array containing the predictions of a model.

    Returns:
    - ensembled_predictions: Array containing the ensembled predictions.
    """
    # Convert the predictions to numpy arrays for easy manipulation
    predictions = np.array(predictions)

    # Take the mode along the axis 1 to get the majority vote for each sample
    ensembled_predictions, _ = mode(predictions, axis=0)

    return ensembled_predictions

# Example usage:
model1_predictions = np.array([0, 0, 1, 1, 0])
model2_predictions = np.array([1, 0, 1, 1, 1])
model3_predictions = np.array([0, 1, 0, 1, 0])

all_predictions = [model1_predictions, model2_predictions, model3_predictions]

# Perform hard voting ensembling
ensembled_predictions = hard_voting(all_predictions)

# Now, 'ensembled_predictions' contains the final ensembled predictions
print(ensembled_predictions)

[0 0 1 1 0]


## 6. Make a confusion matrix with the best-performing model's predictions on the validation set and the validation ground truth labels.

In [78]:
from sklearn.metrics import confusion_matrix

confusion_matrix(val_label, model_6_pred)

array([[ 74, 352],
       [ 87, 249]])

In [84]:
from helper_functions import make_confusion_matrix

make_confusion_matrix(val_label, model_6_pred)