### **NLP using Tensorflow**

In [1]:
# DL needs
import tensorflow as tf
import keras as kr

# Data needs
import pandas as pd
from sklearn.model_selection import train_test_split

# Numerical computation needs
import numpy as np

# plotting needs
import matplotlib.pyplot as plt
import matplotlib_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# ensuring reproducibility
random_seed=42
tf.random.set_seed(random_seed)
import sys

sys.path.append('/home/rudraksha14/Desktop/RAY_RISE_ABOVE_YOURSELF/Programming/tensorflow')
import important_functionalities as impf

2025-03-24 09:54:40.998433: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
baseline_results = {'accuracy': 79.26509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1_score': 0.7862189758049549}

model_1_results = {'accuracy': 80.4461942257218,
 'precision': 0.8065100939758145,
 'recall': 0.8044619422572179,
 'f1_score': 0.8028505735911119}

In [3]:
train_df=pd.read_csv('train.csv')
test_df=pd.read_csv('test.csv')
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# shuffle training dataframe
train_df_shuffled=train_df.sample(frac=1,random_state=random_seed) # frac: percentage of  data to be shuffled
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [5]:
from sklearn.model_selection import train_test_split

train_sentences,val_sentences,train_labels,val_labels=train_test_split(train_df_shuffled['text'].to_numpy(),train_df_shuffled['target'].to_numpy(),test_size=0.1,random_state=random_seed)

In [6]:
# finding the average number of words/tokens in the training tweets
avg_len=round(sum([len(sentence.split()) for sentence in train_sentences])/len(train_sentences))
print(avg_len)
# setup text vectorization variables
max_vocab_length = 10000 # get the most common 10k words to have in our vocab 
max_len = avg_len # max length of our sequence (eg. how many words from the tweet does our model see?)

# creating the text_vectorizer
text_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=max_vocab_length,  # how many words in vocabulary, if None there is no cap on the vocab, it automatically adds <OOV> (Out Of Vocabulary) / Unknown
    output_mode='int',  # how to map tokens/words to numbers
    output_sequence_length=max_len,  # None --> sets each sequence to longest sequence
)

15


In [7]:
# mapping the text vectorization layer to text data and turning it into numbers
text_vectorizer.adapt(train_sentences)

In [8]:
# creating an embedding
embedding = tf.keras.layers.Embedding(input_dim = max_vocab_length,
                                      output_dim = 128,
                                      input_length = max_len,
                                      # embedding_initializer = 'uniform' # default:uniform random nos
                                    )
embedding



<Embedding name=embedding, built=False>

**10. Recurrent Neural Networks (RNNs)**
* Useful for sequence data
* The premise of RNN is to use the premise of previous input to aid the representation of a later input.

REF RNNs: https://www.youtube.com/watch?v=GvezxUdLrEk&list=PLtBw6njQRU-rwp5__7C0oIVt26ZgjG9NI&index=3<br>
REF RNNs: https://karpathy.github.io/2015/05/21/rnn-effectiveness/<br>
REF LSTM: https://colah.github.io/posts/2015-08-Understanding-LSTMs/<br>

**11. Model 2: [LSTM (RNN)]**

**Note:**
* When you are stacking, many parameters of LSTM layer are set to great defaults, but for the first one, you need to set return sequences to `True`, this is necessary because the LSTM layer expects 3D input [batch, timesteps/sequences, feature] {each sequence is treated as a time step}
* Default RNN layer for sequence problems

In [57]:
# create the LSTM model

inputs=tf.keras.layers.Input(shape=(1,),dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
print(x.shape)
x = tf.keras.layers.LSTM(units=64, return_sequences=True)(x) # returns representation of sequence in form of a sequence
print(x.shape)
x = tf.keras.layers.LSTM(units=64)(x)
print(x.shape)
x = tf.keras.layers.Dense(units=64,activation='relu')(x)
print(x.shape)
outputs=tf.keras.layers.Dense(units=1,activation='sigmoid')(x)

model_2 = tf.keras.models.Model(inputs,outputs,name='model_2')

(None, 15, 128)
(None, 15, 64)
(None, 64)
(None, 64)


In [58]:
# get a summary
model_2.summary()

In [59]:
# compile the model
model_2.compile(loss='binary_crossentropy', 
                metrics=['accuracy'], 
                optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001))

In [60]:
# create a tensorboard callback (need to create a new one for each model)
SAVE_DIR = 'model_logs'

# fit the model
history_model_2 = model_2.fit(train_sentences,train_labels,
                              epochs=5,
                              validation_data=(val_sentences,val_labels),
                              callbacks=[impf.create_tensorboard_callback(dir_name=SAVE_DIR,experiment_name='model_2_LSTM')]
                              )

Saving TensorBoard log files to : model_logs/model_2_LSTM/20250324-103200
Epoch 1/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 25ms/step - accuracy: 0.8437 - loss: 0.5010 - val_accuracy: 0.7507 - val_loss: 0.8067
Epoch 2/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 23ms/step - accuracy: 0.9791 - loss: 0.0650 - val_accuracy: 0.7428 - val_loss: 0.9796
Epoch 3/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 24ms/step - accuracy: 0.9817 - loss: 0.0530 - val_accuracy: 0.7349 - val_loss: 1.1188
Epoch 4/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 23ms/step - accuracy: 0.9810 - loss: 0.0476 - val_accuracy: 0.7467 - val_loss: 1.1639
Epoch 5/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 23ms/step - accuracy: 0.9835 - loss: 0.0410 - val_accuracy: 0.7480 - val_loss: 1.2702


In [61]:
# making predictions:
model_2_preds=model_2.predict(val_sentences)
thresh=0.5
model_2_preds_threshed=list(map(lambda x: 1 if x>thresh else 0,model_2_preds))

# calculating results:
model_2_results=impf.calculate_results(val_labels,model_2_preds_threshed)
model_2_results

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step


{'accuracy': 74.80314960629921,
 'precision': 0.7475572440372034,
 'recall': 0.7480314960629921,
 'f1_score': 0.7475400591051667}

In [62]:
# comparing model 2 with baseline
np.array(list(model_2_results.values()))>np.array(list(baseline_results.values()))

array([False, False, False, False])

In [63]:
# comparing model 2 with model 1
np.array(list(model_2_results.values()))>np.array(list(model_1_results.values()))

array([False, False, False, False])

**Conclusion:**
* Baseline model and Model 1 outperform Model 2.
* It is observed that Model 2 is overfitting

***-- CONTD IN NEXT NOTEBOOK --***