<a href="https://colab.research.google.com/github/varad0207/Keras-NLP/blob/main/disaster_tweets_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --upgrade tensorflow

In [None]:
!pip install --upgrade keras

In [1]:
import keras
print(keras.__version__)

3.1.1


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf

from keras import ops
from tensorflow.keras.layers import LSTM, Embedding, Dense, Dropout, GlobalMaxPool1D, BatchNormalization, Bidirectional, Conv1D, GlobalMaxPooling1D, MaxPooling1D, MultiHeadAttention, LayerNormalization, Layer, Input, GlobalAveragePooling1D
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, Callback
from tensorflow.keras import regularizers

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Load preprocessed data

In [3]:
train_df = pd.read_csv('train_preprocessed.csv')
test_df = pd.read_csv('test_preprocessed.csv')

In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 23 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Unnamed: 0                    7613 non-null   int64  
 1   id                            7613 non-null   int64  
 2   keyword                       7613 non-null   object 
 3   location                      7613 non-null   object 
 4   text                          7613 non-null   object 
 5   target                        7613 non-null   int64  
 6   word_count                    7613 non-null   int64  
 7   unique_word_count             7613 non-null   int64  
 8   stop_word_count               7613 non-null   int64  
 9   url_count                     7613 non-null   int64  
 10  mean_word_length              7613 non-null   float64
 11  char_count                    7613 non-null   int64  
 12  punctuation_count             7613 non-null   int64  
 13  has

In [5]:
X = train_df['clean_text']
y = train_df['target']
y = np.array(y)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [7]:
X

0       our deeds are the reason of this earthquake ma...
1                   forest fire near la ronge sask canada
2       all residents asked to shelter in place are be...
3       13000 people receive wildfires evacuation orde...
4       just got sent this photo from ruby alaska as s...
                              ...                        
7608    two giant cranes holding a bridge collapse int...
7609    ariaahrary thetawniest the out of control wild...
7610                m194 0104 utc5km s of volcano hawaii 
7611    police investigating after an ebike collided w...
7612    the latest more homes razed by northern califo...
Name: clean_text, Length: 7613, dtype: object

In [8]:
y

array([1, 1, 1, ..., 1, 1, 1])

In [9]:
VOCAB_SIZE = 15000
SENTENCE_LENGTH = 15
VECTOR_FEAT = 32

# Support Vector Classifier

In [62]:
tfidf = TfidfVectorizer()
classifier = LinearSVC()

In [63]:
X_train_tf = tfidf.fit_transform(X_train)
X_test_tf = tfidf.transform(X_test)

In [64]:
classifier.fit(X_train_tf, y_train)

In [65]:
y_pred_svc = classifier.predict(X_test_tf)

acc_svc = accuracy_score(y_test, y_pred_svc)
f1_svc = f1_score(y_test, y_pred_svc)

In [66]:
print('SVC with tf-idf')
print(f'Accuracy: {acc_svc:.4f}')
print(f'F1 score: {f1_svc:.4f}')

SVC with tf-idf
Accuracy: 0.7968
F1 score: 0.7532


# Bidirectional LSTM

In [16]:
onehot_vec_train = [one_hot(words, VOCAB_SIZE) for words in X_train]
onehot_vec_test = [one_hot(words, VOCAB_SIZE) for words in X_test]

word_len = []
for i in onehot_vec_train:
    word_len.append(len(i))

print(f'Maximum word length: {max(word_len)}')
print(f'Minimum word length: {min(word_len)}')

embedded_docs_train = pad_sequences(onehot_vec_train, padding='post', maxlen=SENTENCE_LENGTH)
embedded_docs_test = pad_sequences(onehot_vec_test, padding='post', maxlen=SENTENCE_LENGTH)

Maximum word length: 31
Minimum word length: 1


In [17]:
def model():
    lstm_model = Sequential()
    lstm_model.add(Input(shape=(SENTENCE_LENGTH,)))
    lstm_model.add(Embedding(VOCAB_SIZE, VECTOR_FEAT))
    lstm_model.add(Bidirectional(LSTM(100, return_sequences=True)))
    lstm_model.add(GlobalMaxPool1D())
    lstm_model.add(BatchNormalization())
    lstm_model.add(Dropout(0.5))
    lstm_model.add(Dense(10, activation='relu'))
    lstm_model.add(Dropout(0.25))
    lstm_model.add(Dense(1, activation='sigmoid'))

    return lstm_model

In [18]:
lstm_model = model()
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model.summary()

In [19]:
history = lstm_model.fit(embedded_docs_train, y_train, epochs=10, batch_size=32, validation_split=0.2, callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])

Epoch 1/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 11ms/step - accuracy: 0.6156 - loss: 0.6503 - val_accuracy: 0.5844 - val_loss: 0.6504
Epoch 2/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.8525 - loss: 0.3709 - val_accuracy: 0.6567 - val_loss: 0.6203
Epoch 3/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9173 - loss: 0.2342 - val_accuracy: 0.7655 - val_loss: 0.5112
Epoch 4/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.9491 - loss: 0.1566 - val_accuracy: 0.7233 - val_loss: 0.5580


In [20]:
y_pred_lstm = (lstm_model.predict(embedded_docs_test) > 0.5).astype("int32")

acc_lstm = accuracy_score(y_test, y_pred_lstm)
f1_lstm = f1_score(y_test, y_pred_lstm)

[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step


In [21]:
print('LSTM')
print(f'Accuracy: {acc_lstm:.4f}')
print(f'F1 score: {f1_lstm:.4f}')

LSTM
Accuracy: 0.7246
F1 score: 0.7040


# Naive Bayes Classifier with Countvectorizer and TF-IDF

In [22]:
count_vectorizer = CountVectorizer()

X_train_counts = count_vectorizer.fit_transform(X_train)
X_test_counts = count_vectorizer.transform(X_test)

In [23]:
tfidf_vectorizer = TfidfVectorizer()

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [24]:
nb_clf = MultinomialNB()
nb_clf.fit(X_train_counts, y_train)

In [25]:
y_pred_count_vectorizer = nb_clf.predict(X_test_counts)

acc_nb_count = accuracy_score(y_test, y_pred_count_vectorizer)
f1_nb_count = f1_score(y_test, y_pred_count_vectorizer)

In [26]:
print('NB with count vectorizer')
print(f'Accuracy: {acc_nb_count:.4f}')
print(f'F1 score: {f1_nb_count:.4f}')

NB with count vectorizer
Accuracy: 0.8087
F1 score: 0.7576


In [27]:
nb_clf.fit(X_train_tfidf, y_train)

In [28]:
y_pred_tfidf = nb_clf.predict(X_test_tfidf)

acc_nb_tfidf = accuracy_score(y_test, y_pred_tfidf)
f1_nb_tfidf = f1_score(y_test, y_pred_tfidf)

In [29]:
print('NB with TF-IDF')
print(f'Accuracy: {acc_nb_tfidf:.4f}')
print(f'F1 score: {f1_nb_tfidf:.4f}')

NB with TF-IDF
Accuracy: 0.8069
F1 score: 0.7361


# Random Forest Classifier

In [30]:
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train_counts, y_train)

In [31]:
y_pred_rf = rf_clf.predict(X_test_counts)

acc_rf = accuracy_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)

In [32]:
print('Random Forest')
print(f'Accuracy: {acc_rf:.4f}')
print(f'F1 score: {f1_rf:.4f}')

Random Forest
Accuracy: 0.7938
F1 score: 0.7175


# CNN

In [33]:
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)

word_ind = tokenizer.word_index
print(f'Found {len(word_ind)} unique tokens')

Found 17777 unique tokens


In [34]:
data = pad_sequences(sequences, SENTENCE_LENGTH)
labels = y

test_split = 0.3
ind = np.arange(data.shape[0])
np.random.shuffle(ind)
data = data[ind]
labels = labels[ind]
num_test_samples = int(test_split * data.shape[0])

X_train_pad = data[num_test_samples:]
X_test_pad = data[:num_test_samples]
y_train_pad = labels[num_test_samples:]
y_test_pad = labels[:num_test_samples]

In [35]:
def model1():
    cnn_model = Sequential()
    cnn_model.add(Input(shape=(SENTENCE_LENGTH,)))
    cnn_model.add(Embedding(SENTENCE_LENGTH, 100))
    cnn_model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
    cnn_model.add(MaxPooling1D(pool_size=2))
    cnn_model.add(Conv1D(filters=256, kernel_size=5, activation='relu'))
    cnn_model.add(GlobalMaxPooling1D())
    cnn_model.add(Dense(64, activation='relu'))
    cnn_model.add(Dropout(0.5))
    cnn_model.add(Dense(1, activation='sigmoid'))

    return cnn_model

In [36]:
cnn_model = model1()
cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
cnn_model.summary()

In [37]:
history = cnn_model.fit(X_train_pad, y_train_pad, epochs=10, batch_size=32, validation_split=0.2, callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])

Epoch 1/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 34ms/step - accuracy: 0.5774 - loss: 0.6733 - val_accuracy: 0.5591 - val_loss: 0.6811
Epoch 2/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6375 - loss: 0.6299 - val_accuracy: 0.6304 - val_loss: 0.6513
Epoch 3/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6430 - loss: 0.6204 - val_accuracy: 0.6238 - val_loss: 0.6708


In [38]:
y_pred_cnn = (cnn_model.predict(X_test_pad) > 0.5).astype("int32")

acc_cnn = accuracy_score(y_test_pad, y_pred_cnn)
f1_cnn = f1_score(y_test_pad, y_pred_cnn)

[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step


In [39]:
print('CNN')
print(f'Accuracy: {acc_cnn:.4f}')
print(f'F1 score: {f1_cnn:.4f}')

CNN
Accuracy: 0.6338
F1 score: 0.4522


# Transformer

In [40]:
class TransformerBlock(Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = Sequential(
            [Dense(ff_dim, activation="relu"), Dense(embed_dim),]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.layernorm2(out1 + ffn_output)

class TokenAndPositionEmbedding(Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = ops.shape(x)[-1]
        positions = ops.arange(start=0, stop=maxlen, step=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

In [41]:
embed_dim = 32 # embedding size for each dim
num_heads = 2 # number of attention heads
ff_dim = 32 # hidden layer size

In [42]:
def model2():
    transformer_model = Sequential()
    transformer_model.add(Input(shape=(SENTENCE_LENGTH,)))
    transformer_model.add(TokenAndPositionEmbedding(SENTENCE_LENGTH, VOCAB_SIZE, embed_dim))
    transformer_model.add(TransformerBlock(embed_dim, num_heads, ff_dim))
    transformer_model.add(GlobalAveragePooling1D())
    transformer_model.add(Dropout(0.25))
    transformer_model.add(Dense(32, activation='relu'))
    transformer_model.add(Dropout(0.25))
    transformer_model.add(Dense(1, activation='sigmoid'))

    return transformer_model

In [43]:
transformer_model = model2()
transformer_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
transformer_model.summary()

In [44]:
history = transformer_model.fit(X_train_pad, y_train_pad, epochs=10, batch_size=32, validation_split=0.2, callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])

Epoch 1/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 76ms/step - accuracy: 0.5928 - loss: 0.6681 - val_accuracy: 0.7608 - val_loss: 0.5196
Epoch 2/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8391 - loss: 0.3939 - val_accuracy: 0.7758 - val_loss: 0.4921
Epoch 3/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9288 - loss: 0.2166 - val_accuracy: 0.7411 - val_loss: 0.6508


In [45]:
y_pred_transformer = (transformer_model.predict(X_test_pad) > 0.5).astype("int32")

acc_t = accuracy_score(y_test_pad, y_pred_transformer)
f1_t = f1_score(y_test_pad, y_pred_transformer)

[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step


In [46]:
print('Transformer')
print(f'Accuracy: {acc_t:.4f}')
print(f'F1 score: {f1_t:.4f}')

Transformer
Accuracy: 0.7604
F1 score: 0.7173


# Comparing performance of all models

In [67]:
compare_models = dict()
compare_models = {
    'Models' : ['Support Vecort Classifier', 'Naive Bayes (Count Vectorizer)', 'Naive Bayes (TF-IDF)', 'Bidirectional LSTM', 'Random Forest Classifier', 'CNN', 'Transformer'],
    'Accuracy' : [round(acc_svc, 4), round(acc_nb_count, 4), round(acc_nb_tfidf, 4), round(acc_lstm, 4), round(acc_rf, 4), round(acc_cnn, 4), round(acc_t, 4)],
    'F1 Score' : [round(f1_svc, 4), round(f1_nb_count, 4), round(f1_nb_tfidf, 4), round(f1_lstm, 4), round(f1_rf, 4), round(f1_cnn, 4), round(f1_t, 4)]
}

comp_df = pd.DataFrame(compare_models)
print(comp_df)

                           Models  Accuracy  F1 Score
0       Support Vecort Classifier    0.7968    0.7532
1  Naive Bayes (Count Vectorizer)    0.8087    0.7576
2            Naive Bayes (TF-IDF)    0.8069    0.7361
3              Bidirectional LSTM    0.7246    0.7040
4        Random Forest Classifier    0.7938    0.7175
5                             CNN    0.6338    0.4522
6                     Transformer    0.7604    0.7173


Out of all the models used, Naive Bayes gives the best accuracy

# Evaluating models on test samples

In [109]:
sample_tweets = dict()
sample_tweets = {
    'tweet' : ['Breaking: Massive earthquake hits the west coast, causing widespread destruction.',
               'Urgent: Wildfires raging out of control, residents urged to evacuate immediately!',
               'Join us for a fun-filled community picnic this Saturday at the park!',
               'Devastating tsunami strikes coastal town, leaving a trail of destruction in its wake.',
               'Train derails, spilling hazardous chemicals into nearby river, prompting evacuation orders.',
               'Huge explosion reported at chemical plant, emergency crews responding to the scene.',
               'Exciting news! Our new product launch is happening tomorrow, don\'t miss out!',
               'The weather is perfect for a day at the beach, grab your sunscreen and head out!',
               'Tornado touches down in residential area, homes destroyed, residents trapped.',
               'Congratulations to our team for winning the championship! Go team!',
               'Massive landslide blocks major highway, motorists stranded, rescue efforts underway.',
               'Gas leak reported in downtown area, authorities advise residents to shelter in place.',
               'Powerful storm causes widespread flooding, roads impassable, rescue teams deployed.',
               'Building collapse in city center, multiple casualties reported, urgent need for medical assistance.',
               'Join us for a charity fundraising event to support local families in need.',
               'Earthquake aftershocks continue, residents urged to stay vigilant.',
               'Wildfire spreads rapidly, threatening nearby communities, evacuation centers set up.',
               'Flood warning issued for low-lying areas, residents advised to seek higher ground.',
               'Looking for volunteers to help clean up the local park this weekend!',
               'Come and enjoy live music at the downtown festival tonight, fun for the whole family!'
               ],
    'target' : [1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0]
}

sample_tweet_df = pd.DataFrame(sample_tweets)

In [110]:
X_sample = sample_tweet_df['tweet']
y_sample = sample_tweet_df['target']
y_sample = np.array(y_sample)

In [111]:
X_sample_tf = tfidf_vectorizer.transform(X_sample)
X_sample_count = count_vectorizer.transform(X_sample)

onehot_vec_X_sample = [one_hot(words, VOCAB_SIZE) for words in X_sample]
embedded_docs_X_sample = pad_sequences(onehot_vec_X_sample, padding='post', maxlen=SENTENCE_LENGTH)

tokenizer.fit_on_texts(X_sample)
sequences_X_sample = tokenizer.texts_to_sequences(X_sample)
X_sample_pad = pad_sequences(sequences_X_sample, SENTENCE_LENGTH)

In [112]:
nb_preds = nb_clf.predict(X_sample_tf)
svc_preds = classifier.predict(X_sample_tf)
rf_preds = rf_clf.predict(X_sample_count)
lstm_preds = (lstm_model.predict(embedded_docs_X_sample) > 0.5).astype('int32')
cnn_preds = (cnn_model.predict(X_sample_pad) > 0.5).astype('int32')
transformer_preds = (transformer_model.predict(X_sample_pad) > 0.5).astype('int32')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step


In [113]:
evaluate = dict()
evaluate = {
    'NB predictions' : np.array(nb_preds),
    'SVC predictions' : np.array(svc_preds),
    'RF predictions' : np.array(rf_preds),
    'LSTM predictions' : np.array(lstm_preds.ravel()),
    'CNN predictions' : np.array(cnn_preds.ravel()),
    'Transformer predictions' : np.array(transformer_preds.ravel()),
    'Actual Target' : y_sample
}

evaluate_df = pd.DataFrame(evaluate)

In [114]:
evaluate_df

Unnamed: 0,NB predictions,SVC predictions,RF predictions,LSTM predictions,CNN predictions,Transformer predictions,Actual Target
0,1,1,1,1,0,0,1
1,1,1,0,1,1,0,1
2,0,0,0,0,0,1,0
3,0,1,0,1,1,0,1
4,1,1,1,1,0,1,1
5,1,1,1,1,0,0,1
6,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0
8,1,1,1,1,1,1,1
9,0,0,0,0,0,1,0


References used

https://keras.io/examples/nlp/text_classification_with_transformer/