### **NLP using TensorFlow**

In [1]:
# DL needs
import tensorflow as tf
import keras as kr

# Data needs
import pandas as pd
from sklearn.model_selection import train_test_split

# Numerical computation needs
import numpy as np

# plotting needs
import matplotlib.pyplot as plt
import matplotlib_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# ensuring reproducibility: the same seed is reused for TensorFlow, the dataframe
# shuffle and the train/validation split further down
random_seed=42
tf.random.set_seed(random_seed)

# Make the local helper module `important_functionalities` importable.
# The directory can be overridden through the TF_HELPERS_DIR environment variable so the
# notebook also runs on machines where the default absolute path does not exist.
import os
import sys

helpers_dir = os.environ.get(
    'TF_HELPERS_DIR',
    '/home/rudraksha14/Desktop/RAY_RISE_ABOVE_YOURSELF/Programming/tensorflow',
)
if helpers_dir not in sys.path:  # re-running the cell must not pile up duplicate entries
    sys.path.append(helpers_dir)
# used below for impf.create_tensorboard_callback and impf.calculate_results
import important_functionalities as impf

2025-03-21 17:31:10.416922: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [25]:
# Reference scores that later models are compared against
# (presumably produced by a baseline model in an earlier notebook -- confirm).
# Note: 'accuracy' is on a 0-100 scale while the other three metrics are on a 0-1 scale.
baseline_results = dict(
    accuracy=79.26509186351706,
    precision=0.8111390004213173,
    recall=0.7926509186351706,
    f1_score=0.7862189758049549,
)

In [2]:
# Load the training and test CSV files from the working directory,
# then preview the first rows of the training frame.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
# Shuffle the training rows: sampling a fraction of 1 (i.e. every row) returns them in
# random order; the fixed random_state keeps that order identical between runs.
train_df_shuffled = train_df.sample(
    frac=1,
    random_state=random_seed,
)
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [4]:
from sklearn.model_selection import train_test_split

# Pull the tweet texts and their 0/1 targets out of the shuffled frame as NumPy arrays,
# then hold out 10% of them as a validation set (seeded for repeatability).
tweet_texts = train_df_shuffled['text'].to_numpy()
tweet_targets = train_df_shuffled['target'].to_numpy()
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    tweet_texts,
    tweet_targets,
    test_size=0.1,
    random_state=random_seed,
)

In [5]:
# Average tweet length in whitespace-separated tokens, rounded to an integer
token_counts = [len(tweet.split()) for tweet in train_sentences]
avg_len = round(sum(token_counts) / len(train_sentences))
print(avg_len)

# Text vectorization settings
max_vocab_length = 10000  # cap the vocabulary at the 10k most common tokens
max_len = avg_len  # number of tokens of each tweet the model gets to see

# Layer that turns raw strings into fixed-length sequences of integer token ids
text_vectorizer = tf.keras.layers.TextVectorization(
    output_mode='int',  # map every token to an integer index
    max_tokens=max_vocab_length,  # None would leave the vocabulary uncapped
    output_sequence_length=max_len,  # None would use the length of the longest sequence instead
)

15


In [8]:
# Fit the text vectorization layer on the training tweets only (validation sentences are
# not passed in), so that it can map text to integer token ids afterwards.
text_vectorizer.adapt(train_sentences)

In [7]:
# creating an embedding: one trainable 128-dimensional vector per vocabulary index
# * `input_length` is no longer passed: Keras 3 deprecates it and only emits a warning.
# * the layer is named explicitly because a later cell fetches it with
#   model_1.get_layer('embedding'); relying on the auto-generated name would break as soon
#   as this cell is re-run (the next auto-generated name would be different).
embedding = tf.keras.layers.Embedding(input_dim = max_vocab_length,
                                      output_dim = 128,
                                      name = 'embedding',
                                      # embeddings_initializer = 'uniform' # default: uniform random nos
                                    )
embedding



<Embedding name=embedding, built=False>

**8. Model 1: [Feed forward Neural Network (dense model)]: creation and evaluation**

In [32]:
# Root directory for TensorBoard logs (a fresh callback is created per model at fit time)
SAVE_DIR = 'model_logs'

# Functional-API graph:
#   raw string -> integer token ids -> token embeddings -> max-pool over tokens -> sigmoid
inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)  # one string per example
token_ids = text_vectorizer(inputs)
token_embeddings = embedding(token_ids)
# Collapse the per-token feature vectors into a single vector per tweet.
# (tf.keras.layers.GlobalAveragePooling1D() is the alternative that was tried here.)
pooled = tf.keras.layers.GlobalMaxPooling1D()(token_embeddings)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(pooled)
model_1 = tf.keras.Model(inputs=inputs, outputs=outputs, name='model_1_dense')
model_1.summary()

In [33]:
# Configure training: binary cross-entropy loss for the single sigmoid output,
# Adam with its default settings, and accuracy as the tracked metric.
model_1.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [34]:
# TensorBoard callback for this run, created through the local helper module
model_1_tb_callback = impf.create_tensorboard_callback(
    dir_name=SAVE_DIR,
    experiment_name='model_1_dense',
)

# Train for 5 epochs, evaluating on the held-out validation split after each one
model_1_history = model_1.fit(
    x=train_sentences,
    y=train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[model_1_tb_callback],
)

Saving TensorBoard log files to : model_logs/model_1_dense/20250321-184042
Epoch 1/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 16ms/step - accuracy: 0.8533 - loss: 0.5436 - val_accuracy: 0.7848 - val_loss: 0.4860
Epoch 2/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.8789 - loss: 0.3308 - val_accuracy: 0.7900 - val_loss: 0.4803
Epoch 3/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.8984 - loss: 0.2590 - val_accuracy: 0.7913 - val_loss: 0.4961
Epoch 4/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.9202 - loss: 0.2131 - val_accuracy: 0.7953 - val_loss: 0.5163
Epoch 5/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.9345 - loss: 0.1778 - val_accuracy: 0.8045 - val_loss: 0.5381


In [35]:
# Predicted values (sigmoid outputs) for every validation tweet
model_1_preds = model_1.predict(val_sentences)

# Turn them into hard 0/1 labels: strictly above the threshold -> 1, otherwise 0
thresh = 0.5
model_1_preds_threshed = [1 if prob > thresh else 0 for prob in model_1_preds]

# Score the hard labels against the true validation labels with the local helper
model_1_results = impf.calculate_results(val_labels, model_1_preds_threshed)
model_1_results

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step


{'accuracy': 80.4461942257218,
 'precision': 0.8065100939758145,
 'recall': 0.8044619422572179,
 'f1_score': 0.8028505735911119}

In [36]:
# comparing model 1 with baseline, metric by metric (True = model 1 scored higher).
# Values are looked up by metric name rather than compared positionally, so the result
# stays correct even if the two dicts list their keys in a different order; a metric
# missing from the baseline raises a KeyError instead of being silently mis-compared.
np.array([model_1_results[metric] > baseline_results[metric] for metric in model_1_results])

array([ True, False,  True,  True])

**9. Visualizing our model's learned word embeddings**
* create two files: `vectors.tsv` (the learned embedding weights) and `metadata.tsv` (the matching vocabulary words)
* upload them to https://projector.tensorflow.org/ and visualize

In [40]:
# get the vocabulary from text vectorization layer (a list of tokens; per the output below,
# the first entry is the empty padding token '' and the second is '[UNK]')
words_in_vocab = text_vectorizer.get_vocabulary()
# show the vocabulary size together with its first ten entries
len(words_in_vocab),words_in_vocab[:10]

(10000, ['', '[UNK]', 'the', 'a', 'in', 'to', 'of', 'and', 'i', 'is'])

In [41]:
# model 1 summary (shown again so the layer names are at hand for the get_layer lookup below)
model_1.summary()

In [45]:
# Fetch the embedding layer from the trained model by name and take its first weight array:
# the numerical representation of each token, as learned during the 5 training epochs.
embedding_layer = model_1.get_layer('embedding')
embed_weights = embedding_layer.get_weights()[0]
print(embed_weights, embed_weights.shape)

[[-0.09109654 -0.0626293  -0.2163469  ... -0.08737138 -0.06183725
  -0.24922632]
 [-0.07200091 -0.06092988 -0.18160868 ... -0.06998814 -0.04988552
  -0.2113291 ]
 [-0.0678921  -0.0410133  -0.07798287 ... -0.065994   -0.02786303
  -0.09383145]
 ...
 [-0.02269508 -0.02359728 -0.02325134 ...  0.04924147  0.00737179
  -0.032533  ]
 [ 0.01177475  0.05736914 -0.05841202 ...  0.05546805  0.03036312
  -0.02103549]
 [ 0.08127932  0.00836206 -0.04864586 ...  0.12692735  0.1422633
  -0.10321134]] (10000, 128)


In [None]:
# Export the learned embeddings for the TensorFlow Embedding Projector:
#   vectors.tsv  -> one line per token, its embedding values separated by tabs
#   metadata.tsv -> one line per token, the token itself (same order as vectors.tsv)
# The `with` block guarantees both files are flushed and closed even if the loop raises.
with open('vectors.tsv', 'w', encoding='utf-8') as out_v, \
     open('metadata.tsv', 'w', encoding='utf-8') as out_m:
  for index, word in enumerate(words_in_vocab):
    if index == 0:
      continue  # skip 0, it's padding.
    vec = embed_weights[index]  # row `index` of the weight matrix belongs to token `index`
    out_v.write('\t'.join(str(x) for x in vec) + "\n")
    out_m.write(word + "\n")

***-- CONTD IN NEXT NOTEBOOK --***