In [8]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
import numpy as np
import pandas as pd  # To load the dataset
import os

# Verify and load dataset
file_path = r"C:\Users\dare2\OneDrive\Desktop\Pre-processed files\Hurricane_Irma.csv"  # Replace with the correct file path
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}")

data = pd.read_csv(file_path)  # Adjust based on the file format (e.g., .csv, .txt)

# Check for missing or invalid data in 'processed_data'
data['processed_data'] = data['processed_data'].fillna('')  # Replace NaNs with empty string
data['processed_data'] = data['processed_data'].astype(str)  # Ensure all data is string

# Check for class distribution
print(data['binary_class'].value_counts())

# Convert labels to binary: 'Non-Damage' -> 0, 'Damage' -> 1
data['binary_class'] = data['binary_class'].map({'Non-Damage': 0, 'Damage': 1})

# Tokenize sentences
sentences = [sentence.split() for sentence in data['processed_data']]  # Tokenize sentences
labels = data['binary_class'].values  # Extract labels

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(sentences, labels, test_size=0.2, random_state=42)

# Train Word2Vec model using CBOW technique
word2vec_model = Word2Vec(sentences, vector_size=128, window=5, min_count=1, sg=0)  # CBOW (sg=0)

# Create embedding matrix
vocab_size = len(word2vec_model.wv.index_to_key)
embedding_dim = word2vec_model.vector_size
embedding_matrix = np.zeros((vocab_size + 1, embedding_dim))

word_index = {word: idx + 1 for idx, word in enumerate(word2vec_model.wv.index_to_key)}
for word, idx in word_index.items():
    embedding_matrix[idx] = word2vec_model.wv[word]

# Map words in X_train and X_test to their indices
X_train = [[word_index.get(word, 0) for word in sentence] for sentence in X_train]
X_test = [[word_index.get(word, 0) for word in sentence] for sentence in X_test]
X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, maxlen=100)
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, maxlen=100)

# Calculate class weights to handle class imbalance
class_weights = {0: 1., 1: len(y_train) / sum(y_train == 1)}

# Model architecture with regularization
model = Sequential([
    Embedding(input_dim=vocab_size + 1, output_dim=embedding_dim, weights=[embedding_matrix], input_length=X_train.shape[1], trainable=False),
    Conv1D(128, 5, activation='relu'),
    MaxPooling1D(pool_size=2),
    Dropout(0.5),  # Added dropout for regularization
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),  # Added dropout for regularization
    Dense(1, activation='sigmoid')  # Binary classification
])

# Compile model with adjusted learning rate
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train model with class weights to address imbalance
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, class_weight=class_weights)

# Evaluate model
y_pred = (model.predict(X_test) > 0.5).astype("int32")

# Print evaluation metrics with zero_division handling
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, zero_division=1))
print("Recall:", recall_score(y_test, y_pred, zero_division=1))
print("F1 Score:", f1_score(y_test, y_pred, zero_division=1))


binary_class
Non-Damage    3494
Damage         523
Name: count, dtype: int64
Epoch 1/10




[1m81/81[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 59ms/step - accuracy: 0.4093 - loss: 1.3131 - val_accuracy: 0.2193 - val_loss: 0.8112
Epoch 2/10
[1m81/81[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 52ms/step - accuracy: 0.4103 - loss: 1.2762 - val_accuracy: 0.4121 - val_loss: 0.7138
Epoch 3/10
[1m81/81[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 51ms/step - accuracy: 0.3788 - loss: 1.2706 - val_accuracy: 0.2551 - val_loss: 0.7840
Epoch 4/10
[1m81/81[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 51ms/step - accuracy: 0.2980 - loss: 1.2729 - val_accuracy: 0.4448 - val_loss: 0.7228
Epoch 5/10
[1m81/81[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 51ms/step - accuracy: 0.5011 - loss: 1.2534 - val_accuracy: 0.3219 - val_loss: 0.7560
Epoch 6/10
[1m81/81[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 51ms/step - accuracy: 0.3671 - loss: 1.2312 - val_accuracy: 0.2473 - val_loss: 0.7778
Epoch 7/10
[1m81/81[0m [32m━━━━━━━━━━━━━━━

In [18]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
import numpy as np
import pandas as pd
import os

# Verify and load dataset
file_path = r"C:\Users\dare2\OneDrive\Desktop\Pre-processed files\Hurricane_Irma.csv"
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}")

data = pd.read_csv(file_path)

# Preprocess data
data['processed_data'] = data['processed_data'].fillna('').astype(str)

# Check column names and structure
print("Dataset columns:", data.columns)
print(data.head())

# Ensure the column 'Binary_Class' exists
if 'Binary_Class' not in data.columns:
    raise KeyError("The column 'Binary_Class' does not exist in the dataset. Verify the dataset structure.")

# Map labels to binary values and check class distribution
data['Binary_Class'] = data['Binary_Class'].map({'Non-Damage': 0, 'Damage': 1})
print(data['Binary_Class'].value_counts())

# Tokenize sentences
sentences = [sentence.split() for sentence in data['processed_data']]
labels = data['Binary_Class'].values

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(sentences, labels, test_size=0.2, random_state=42)

# Train Word2Vec model
word2vec_model = Word2Vec(sentences, vector_size=300, window=5, min_count=1, sg=0)

# Create embedding matrix
vocab_size = len(word2vec_model.wv.index_to_key)
embedding_dim = word2vec_model.vector_size
embedding_matrix = np.zeros((vocab_size + 1, embedding_dim))

word_index = {word: idx + 1 for idx, word in enumerate(word2vec_model.wv.index_to_key)}
for word, idx in word_index.items():
    embedding_matrix[idx] = word2vec_model.wv[word]

X_train = [[word_index.get(word, 0) for word in sentence] for sentence in X_train]
X_test = [[word_index.get(word, 0) for word in sentence] for sentence in X_test]
X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, maxlen=100)
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, maxlen=100)

# Calculate class weights
class_weights = {0: 1., 1: len(y_train) / sum(y_train == 1)}

# Build optimized CNN model
model = Sequential([
    Embedding(input_dim=vocab_size + 1, output_dim=embedding_dim, weights=[embedding_matrix], input_length=X_train.shape[1], trainable=False),
    Conv1D(filters=128, kernel_size=3, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    MaxPooling1D(pool_size=2),
    BatchNormalization(),
    Dropout(0.4),
    Conv1D(filters=128, kernel_size=4, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    MaxPooling1D(pool_size=2),
    BatchNormalization(),
    Dropout(0.4),
    Conv1D(filters=256, kernel_size=5, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    MaxPooling1D(pool_size=2),
    BatchNormalization(),
    Flatten(),
    Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Early stopping
early_stopping = EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True)

# Train model
model.fit(X_train, y_train, epochs=30, batch_size=64, validation_split=0.2, class_weight=class_weights, callbacks=[early_stopping])

# Evaluate model
y_pred = (model.predict(X_test) > 0.5).astype("int32")

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, zero_division=1))
print("Recall:", recall_score(y_test, y_pred, zero_division=1))
print("F1 Score:", f1_score(y_test, y_pred, zero_division=1))

Dataset columns: Index(['tweet_text', 'text_human', 'Multiclass', 'Binary_Class',
       'processed_data'],
      dtype='object')
                                          tweet_text  \
0  RT @MSN: Island of Barbuda 'literally under wa...   
1  RT @Reuters: Hurricane Irma threatens luxury T...   
2  RT @TheAnonJournal: BREAKING NEWS: Hurricane I...   
3  JUST IN: 11PM #Hurricane #Irma update. @ABC7Ne...   
4  RT @cnnbrk: Hurricane Irma destroys "upwards o...   

                          text_human      Multiclass Binary_Class  \
0  infrastructure_and_utility_damage  Infrastructure       Damage   
1         other_relevant_information      Non-Damage   Non-Damage   
2         other_relevant_information      Non-Damage   Non-Damage   
3         other_relevant_information      Non-Damage   Non-Damage   
4  infrastructure_and_utility_damage  Infrastructure       Damage   

                                      processed_data  
0      island barbara literally water hurricane irma  
1   hurr



Epoch 1/30
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 118ms/step - accuracy: 0.4763 - loss: 10.8451 - val_accuracy: 0.8600 - val_loss: 8.2168
Epoch 2/30
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 98ms/step - accuracy: 0.5231 - loss: 8.9857 - val_accuracy: 0.7652 - val_loss: 7.0242
Epoch 3/30
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 92ms/step - accuracy: 0.4099 - loss: 7.4577 - val_accuracy: 0.8600 - val_loss: 5.9660
Epoch 4/30
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 96ms/step - accuracy: 0.5559 - loss: 6.4209 - val_accuracy: 0.8600 - val_loss: 5.1092
Epoch 5/30
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 93ms/step - accuracy: 0.4129 - loss: 5.6363 - val_accuracy: 0.8600 - val_loss: 4.3428
Epoch 6/30
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 98ms/step - accuracy: 0.4409 - loss: 4.9213 - val_accuracy: 0.8600 - val_loss: 3.7083
Epoch 7/30
[1m41/41[0m [32m━

In [19]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score

# Oversample the minority class
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Build and compile the model (architecture unchanged)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), 
              loss='binary_crossentropy', 
              metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])

# Train the model with early stopping
model.fit(X_train_res, y_train_res, epochs=30, batch_size=64, validation_split=0.2, callbacks=[early_stopping])

# Evaluate with AUC and threshold tuning
y_pred_proba = model.predict(X_test).flatten()
optimal_threshold = 0.5  # Adjust based on AUC analysis if necessary
y_pred = (y_pred_proba > optimal_threshold).astype(int)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, zero_division=1))
print("Recall:", recall_score(y_test, y_pred, zero_division=1))
print("F1 Score:", f1_score(y_test, y_pred, zero_division=1))
print("AUC-ROC:", roc_auc_score(y_test, y_pred_proba))

Epoch 1/30
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 106ms/step - accuracy: 0.5637 - auc: 0.5159 - loss: 6.4812 - val_accuracy: 0.0018 - val_auc: 0.0000e+00 - val_loss: 3.1336
Epoch 2/30
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 96ms/step - accuracy: 0.6068 - auc: 0.5799 - loss: 2.5691 - val_accuracy: 0.0000e+00 - val_auc: 0.0000e+00 - val_loss: 2.0773
Epoch 3/30
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 95ms/step - accuracy: 0.6031 - auc: 0.5713 - loss: 1.5361 - val_accuracy: 0.0000e+00 - val_auc: 0.0000e+00 - val_loss: 1.3988
Epoch 4/30
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 96ms/step - accuracy: 0.6075 - auc: 0.6021 - loss: 1.1382 - val_accuracy: 0.0000e+00 - val_auc: 0.0000e+00 - val_loss: 1.2596
Epoch 5/30
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 96ms/step - accuracy: 0.6234 - auc: 0.6048 - loss: 0.9939 - val_accuracy: 0.0000e+00 - val_auc: 0.0000e+00 - val_loss: 1.17