In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from tensorflow import keras
from keras import layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import *
from keras.utils import plot_model
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical


## Loading data

In [None]:
# Read the labelled training set and the unlabelled test set from data/.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ("data/train.csv", "data/test.csv")
)

In [None]:
# Hold out 20% of the labelled rows for validation. The split is stratified on
# `target` and uses a fixed seed.
# NOTE: this rebinds `train_data`, so re-running the cell splits the already
# reduced frame again; reload the CSV first.
train_data, valid_data = train_test_split(
    train_data,
    test_size=0.2,
    random_state=45,
    stratify=train_data["target"],
)

## Analyzing data

In [None]:
# Preview the first three rows of the training split.
train_data.head(3)

In [None]:
# Three most frequent values in the `keyword` column of the training split.
train_data["keyword"].value_counts().head(3)

In [None]:
# Three most frequent values in the `location` column of the training split.
train_data["location"].value_counts().head(3)

In [None]:
def clean_data(data: pd.DataFrame) -> None:
    """Drop unused columns and blank out missing values, modifying ``data`` in place.

    The ``location`` and ``id`` columns are removed and every remaining
    missing value is replaced by the empty string.

    Args:
        data: Frame to clean. It is mutated; nothing is returned.
    """
    # errors="ignore" keeps the call idempotent: re-running the cell on a frame
    # whose columns were already dropped no longer raises KeyError.
    data.drop(columns=["location", "id"], inplace=True, errors="ignore")
    data.fillna("", inplace=True)

# Apply the same in-place cleaning to every split, in the original order.
for frame in (train_data, test_data, valid_data):
    clean_data(frame)

In [None]:
# Separate the label column from the features. pop() removes "target" from the
# frame in place and returns it, so this cell cannot be re-run without first
# re-creating train_data / valid_data.
train_labels = train_data.pop("target")
valid_labels = valid_data.pop("target")

In [None]:
# Preview the training features after cleaning and after the label column was removed.
train_data.head(3)

In [None]:
# NOTE(review): this cell repeats the previous preview of train_data unchanged;
# it looks like a leftover duplicate and could be removed.
train_data.head(3)

In [None]:
# Show one sample tweet per class: the row at position 1 among the rows
# labelled 0 and among the rows labelled 1.
non_disaster_texts = train_data.loc[train_labels == 0, "text"].values
disaster_texts = train_data.loc[train_labels == 1, "text"].values
print(f'No disaster: {non_disaster_texts[1]}')
print(f'Disaster: {disaster_texts[1]}')

In [None]:
# Report the (rows, columns) shape of the cleaned training and test frames.
print("Train shape:",train_data.shape)
print("Test shape:",test_data.shape)

In [None]:
# Number of training rows per class label.
train_labels.value_counts()

In [None]:
# Count rows per class. value_counts() orders its result by frequency, so the
# counts are reindexed to the fixed label order [0, 1]. That way the tick
# labels below always line up with the right class (0 = not a disaster,
# 1 = disaster), even if class 1 is the more frequent one or a class is missing.
target_counts = train_labels.value_counts().reindex([0, 1], fill_value=0)

# Create a bar plot
plt.bar(target_counts.index, target_counts.values)
plt.xlabel('Target')
plt.ylabel('Count')
plt.xticks(target_counts.index, ['Not Disaster', 'Disaster'])
plt.title('Distribution of Target')
plt.show()

In [None]:
import re

# Compiled once for the whole notebook: matches an http:// or https:// URL and
# keeps consuming characters while they belong to one of the listed character
# classes (letters, digits, common URL punctuation, %XX escapes).
pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')


def substitute_links(text):
    """Return ``text`` with every URL matched by ``pattern`` replaced.

    Each match is replaced by the fixed placeholder ``'website: domain_name'``;
    text that contains no match is returned unchanged.
    """
    return pattern.sub('website: domain_name', text)

# Example usage: the sentence contains two URLs, one https and one http, which
# substitute_links replaces with its placeholder text.
text = "Check out this website: https://example.com and also visit http://openai.com"

substituted_text = substitute_links(text)
print(substituted_text)

In [None]:
# Longest whitespace-separated word count found in each column of the
# training split (0 when the column is empty).
longest_text = max((len(entry.split()) for entry in train_data['text']), default=0)
longest_keyword = max((len(entry.split()) for entry in train_data['keyword']), default=0)

# The text padding length is the maximum over both columns; the keyword
# padding length looks at the keyword column only.
max_sequence_length = max(longest_text, longest_keyword)
max_sequence_length_keyword = longest_keyword

print("Maximum Sequence Length:", max_sequence_length)
print("Maximum Sequence Length Keyword:", max_sequence_length_keyword)

In [None]:

# Step 1: Fit one tokenizer per column, using the training split only so the
# vocabulary is not influenced by validation or test rows.
# The training sequences themselves are not built here: process_data(train_data)
# later in this cell assigns train_text_sequences / train_keyword_sequences,
# so tokenizing here as well would only be overwritten.
text_tokenizer = Tokenizer()
text_tokenizer.fit_on_texts(train_data['text'])

keyword_tokenizer = Tokenizer()
keyword_tokenizer.fit_on_texts(train_data['keyword'])

def process_data(data:pd.DataFrame):
    """Turn the ``text`` and ``keyword`` columns into padded integer sequences.

    Uses the notebook-level ``text_tokenizer`` / ``keyword_tokenizer`` and the
    padding lengths ``max_sequence_length`` / ``max_sequence_length_keyword``.

    Args:
        data: Frame with a ``text`` and a ``keyword`` column.

    Returns:
        Tuple ``(text_sequences, keyword_sequences)`` as returned by
        ``pad_sequences`` for the respective column.
    """
    # Tokenize each column with its own tokenizer. The keyword sequences must
    # come from the "keyword" column; they were previously built from "text".
    text_sequences = text_tokenizer.texts_to_sequences(data["text"])
    keyword_sequences = keyword_tokenizer.texts_to_sequences(data["keyword"])

    # Bring every sequence to the fixed length expected by the model inputs.
    text_sequences = pad_sequences(text_sequences, maxlen=max_sequence_length)
    keyword_sequences = pad_sequences(keyword_sequences, maxlen=max_sequence_length_keyword)
    return text_sequences, keyword_sequences

# Step 2: Tokenize and pad the training split via process_data.
train_text_sequences, train_keyword_sequences = process_data(train_data)

# Step 3: Create the dataset from the two padded inputs and the integer
# labels, shuffled with a 1000-element buffer and grouped into batches of 32.
labels = train_labels
dataset = (
    tf.data.Dataset
    .from_tensor_slices(((train_text_sequences, train_keyword_sequences), labels))
    .shuffle(1000)
    .batch(32)
)


In [None]:
# Inspect how one word of the first training tweet is tokenized.
w_index = 1
# .iloc[0] selects the first row by position. After train_test_split the frame
# keeps its original row labels, so label 0 is not guaranteed to be present.
first_text = train_data["text"].iloc[0]
word = first_text.split()[w_index]
print(word)
# texts_to_sequences expects a list of texts; a bare string would be iterated
# character by character, producing one sequence per character.
print(text_tokenizer.texts_to_sequences([word]))

In [None]:
# Compare the whitespace split of the first training tweet with the token ids
# produced by the fitted tokenizer.
# .iloc[0] selects by position; label 0 may have gone to the validation split.
first_text = train_data["text"].iloc[0]
# Wrap the tweet in a list so it is tokenized as one text rather than
# character by character, then take the single resulting sequence.
first_sequence = text_tokenizer.texts_to_sequences([first_text])[0]

print(first_text)
print(first_text.split())
print(len(first_text.split()))
print(len(first_sequence))
first_sequence

In [None]:
# Define the inputs: one fixed-length integer sequence per sample for the
# tweet text and one for the keyword (lengths computed from the training data).
input_text = Input(shape=(max_sequence_length,), name='input_text')
input_keyword = Input(shape=(max_sequence_length_keyword,), name='input_keyword')

# Text part: embed every token id into 16 dimensions, then run a bidirectional
# LSTM that returns one output per time step (return_sequences=True).
# input_dim is the vocabulary size + 1 — presumably because index 0 is used
# for padding; confirm against the Tokenizer / pad_sequences behaviour.
embedding = Embedding(name="embedding",input_dim=len(text_tokenizer.word_index) + 1, output_dim=16, input_length=max_sequence_length)(input_text)
lstm_text = Bidirectional(LSTM(16, return_sequences=True))(embedding)

# Keyword part: a Dense layer is applied directly to the padded keyword token
# ids (no embedding). NOTE(review): the ids are treated as numeric features
# here — confirm this is intended.
dense_keyword = Dense(8, activation='relu')(input_keyword)
# Repeat the keyword vector once per text time step so it can be joined with
# the per-step LSTM outputs.
dense_keyword_repeated = RepeatVector(max_sequence_length)(dense_keyword)

# Concatenate the outputs along the last (feature) axis
concatenated = Concatenate(axis=-1)([lstm_text, dense_keyword_repeated])

# Other layers: flatten all time steps into one vector, then two small dense
# layers with dropout and a 2-unit softmax output (one unit per class).
flatten = Flatten()(concatenated)
dense = Dense(16, activation='relu')(flatten)
dropout = Dropout(0.15)(dense)
dense2 = Dense(8, activation='relu')(dropout)
dropout2 = Dropout(0.25)(dense2)
output = Dense(2, activation='softmax')(dropout2)

# Create the model with the two named inputs and the single softmax output
model = tf.keras.Model(inputs=[input_text, input_keyword], outputs=output)

In [None]:
# One-hot encode the integer class labels to match the 2-unit softmax output.
# `labels` is always rebuilt from `train_labels`, and `valid_labels` is only
# converted while it is still one-dimensional, so re-running this cell does
# not encode already one-hot labels a second time.
labels = to_categorical(train_labels, num_classes=2)
if np.ndim(valid_labels) == 1:
    valid_labels = to_categorical(valid_labels, num_classes=2)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Render the model graph including tensor shapes.
# NOTE(review): plot_model typically needs pydot and Graphviz installed — confirm for this environment.
plot_model(model, show_shapes=True)

In [None]:
# Tokenize and pad the validation split with the tokenizers fitted on the training data.
valid_text, valid_keyword = process_data(valid_data)

In [None]:
# Compile the model
# Adam with learning rate 0.001.
# Original author note next to this line: "75" — presumably an accuracy figure; unverified.
opt = tf.keras.optimizers.Adam(0.001)
# Alternative optimizer that was tried: tf.keras.optimizers.Adagrad(learning_rate=0.0001)

# The output layer is a 2-unit softmax and the labels are one-hot encoded, so
# categorical cross-entropy is the loss that matches this setup (previously
# 'binary_crossentropy', which is meant for a single sigmoid output).
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])


# Train the model; inputs are passed by the names given to the Input layers.
h = model.fit(
    {'input_text': train_text_sequences, 'input_keyword': train_keyword_sequences}, 
    labels, 
    validation_data=(
        {'input_text': valid_text, 'input_keyword': valid_keyword},
        valid_labels
        ),
    epochs=10)

In [None]:
# Keep the per-epoch metrics recorded by model.fit and list which metrics are
# available (the plotting cell reads 'loss', 'val_loss', 'accuracy', 'val_accuracy').
history = h.history
print(history.keys())

In [None]:

# One x value per completed epoch, starting at 1.
epoch_range = range(1, len(history['loss'])+1)

# Two side-by-side panels: loss on the left, accuracy on the right.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=[14, 4])

loss_ax.plot(epoch_range, history['loss'], label='Training')
loss_ax.plot(epoch_range, history['val_loss'], label='Validation')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Loss')
loss_ax.legend()

acc_ax.plot(epoch_range, history['accuracy'], label='Training')
acc_ax.plot(epoch_range, history['val_accuracy'], label='Validation')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.set_title('Accuracy')
acc_ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# Number of distinct words learned by the text tokenizer; the embedding layer
# uses this value + 1 as its input_dim.
len(text_tokenizer.word_index)