### **NLP using Tensorflow**

In [1]:
# DL needs
import tensorflow as tf
import keras as kr

# Data needs
import pandas as pd
from sklearn.model_selection import train_test_split

# Numerical computation needs
import numpy as np

# plotting needs
import matplotlib.pyplot as plt
import matplotlib_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# ensuring reproducibility
random_seed=42
tf.random.set_seed(random_seed)
import sys

sys.path.append('/home/rudraksha14/Desktop/RAY_RISE_ABOVE_YOURSELF/Programming/tensorflow')
import important_functionalities as impf

2025-03-24 13:02:55.720548: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
baseline_results = {'accuracy': 79.26509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1_score': 0.7862189758049549}

model_1_results = {'accuracy': 80.4461942257218,
 'precision': 0.8065100939758145,
 'recall': 0.8044619422572179,
 'f1_score': 0.8028505735911119}

model_2_results = {'accuracy': 74.80314960629921,
 'precision': 0.7475572440372034,
 'recall': 0.7480314960629921,
 'f1_score': 0.7475400591051667}

model_3_results = {'accuracy': 75.45931758530183,
 'precision': 0.7546642240189775,
 'recall': 0.7545931758530183,
 'f1_score': 0.7531886844350836}

model_4_results = {'accuracy': 74.93438320209974,
 'precision': 0.7500996927165142,
 'recall': 0.7493438320209974,
 'f1_score': 0.747278252053036}

In [4]:
train_df=pd.read_csv('train.csv')
test_df=pd.read_csv('test.csv')
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
# shuffle training dataframe
train_df_shuffled=train_df.sample(frac=1,random_state=random_seed) # frac: percentage of  data to be shuffled
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [6]:
from sklearn.model_selection import train_test_split

train_sentences,val_sentences,train_labels,val_labels=train_test_split(train_df_shuffled['text'].to_numpy(),train_df_shuffled['target'].to_numpy(),test_size=0.1,random_state=random_seed)

In [7]:
# finding the average number of words/tokens in the training tweets
avg_len=round(sum([len(sentence.split()) for sentence in train_sentences])/len(train_sentences))
print(avg_len)
# setup text vectorization variables
max_vocab_length = 10000 # get the most common 10k words to have in our vocab 
max_len = avg_len # max length of our sequence (eg. how many words from the tweet does our model see?)

# creating the text_vectorizer
text_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=max_vocab_length,  # how many words in vocabulary, if None there is no cap on the vocab, it automatically adds <OOV> (Out Of Vocabulary) / Unknown
    output_mode='int',  # how to map tokens/words to numbers
    output_sequence_length=max_len,  # None --> sets each sequence to longest sequence
)

15


In [8]:
# mapping the text vectorization layer to text data and turning it into numbers
text_vectorizer.adapt(train_sentences)

In [9]:
# creating an embedding
embedding = tf.keras.layers.Embedding(input_dim = max_vocab_length,
                                      output_dim = 128,
                                      input_length = max_len,
                                      # embedding_initializer = 'uniform' # default:uniform random nos
                                    )
embedding



<Embedding name=embedding, built=False>

**14. Intuition behind 1D CNN**

In [None]:
tf.keras.layers.Conv1D

In [17]:
embedding_test = embedding(text_vectorizer(["This is a sentence"]))

conv_1d = tf.keras.layers.Conv1D(filters = 32,
                                 kernel_size = 5, # this is refered to as an ngram of 5 (meaning it looks at 5 words at a time)
                                 activation = 'relu',
                                 padding= 'same',# default:'valid'
                                 strides=1 # default:1
                                 )

conv_1d_output = conv_1d(embedding_test)

max_pool_layer = tf.keras.layers.GlobalMaxPool1D()
max_pool_output = max_pool_layer(conv_1d_output)

embedding_test.shape,conv_1d_output.shape,max_pool_output.shape

(TensorShape([1, 15, 128]), TensorShape([1, 15, 32]), TensorShape([1, 32]))

In [34]:
inputs=tf.keras.layers.Input(shape=(1,),dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.Conv1D(filters = 32, kernel_size = 5, padding = 'same',activation='relu')(x)
x = tf.keras.layers.GlobalMaxPool1D()(x)
outputs=tf.keras.layers.Dense(units=1,activation='sigmoid')(x)
model_5 = tf.keras.models.Model(inputs,outputs,name='model_5')

In [35]:
model_5.summary()

**15. Model 5: [1D CNNs]**

In [36]:
# compile the model
model_5.compile(loss='binary_crossentropy', 
                metrics=['accuracy'], 
                optimizer=tf.keras.optimizers.Adam(learning_rate=0.001))

In [37]:
# create a tensorboard callback (need to create a new one for each model)
SAVE_DIR = 'model_logs'

# fit the model
history_model_5 = model_5.fit(train_sentences,train_labels,
                              epochs=5,
                              validation_data=(val_sentences,val_labels),
                              callbacks=[impf.create_tensorboard_callback(dir_name=SAVE_DIR,experiment_name='model_5_Conv1D')]
                              )

Saving TensorBoard log files to : model_logs/model_5_Conv1D/20250324-132510
Epoch 1/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 16ms/step - accuracy: 0.9119 - loss: 0.3557 - val_accuracy: 0.7822 - val_loss: 0.5979
Epoch 2/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.9655 - loss: 0.1090 - val_accuracy: 0.7756 - val_loss: 0.7019
Epoch 3/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.9769 - loss: 0.0714 - val_accuracy: 0.7730 - val_loss: 0.7599
Epoch 4/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.9785 - loss: 0.0603 - val_accuracy: 0.7717 - val_loss: 0.7952
Epoch 5/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.9801 - loss: 0.0525 - val_accuracy: 0.7717 - val_loss: 0.8227


In [38]:
# making predictions:
model_5_preds=model_5.predict(val_sentences)
thresh=0.5
model_5_preds_threshed=list(map(lambda x: 1 if x>thresh else 0,model_5_preds))

# calculating results:
model_5_results=impf.calculate_results(val_labels,model_5_preds_threshed)
model_5_results

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step


{'accuracy': 77.16535433070865,
 'precision': 0.7722289521502119,
 'recall': 0.7716535433070866,
 'f1_score': 0.7701831305177762}

In [39]:
# comparing model 5 with baseline
np.array(list(model_5_results.values()))>np.array(list(baseline_results.values()))

array([False, False, False, False])

In [40]:
# comparing model 5 with model 1
np.array(list(model_5_results.values()))>np.array(list(model_1_results.values()))

array([False, False, False, False])

In [41]:
# comparing model 5 with model 2
np.array(list(model_5_results.values()))>np.array(list(model_2_results.values()))

array([ True,  True,  True,  True])

In [42]:
# comparing model 5 with model 3
np.array(list(model_5_results.values()))>np.array(list(model_3_results.values()))

array([ True,  True,  True,  True])

In [43]:
# comparing model 5 with model 4
np.array(list(model_5_results.values()))>np.array(list(model_4_results.values()))

array([ True,  True,  True,  True])

**Conclusion:**
* Baseline Model, Model 1, outperform Model 5
* Model 5 outperforms Model 2, Model 3, Model 4

***-- CONTD IN NEXT NOTEBOOK --***