# Natural Language Processing with Disaster Tweets.

## 1. Data cleaning, integration and formatting.

In [63]:
import re
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf

from nltk.corpus import wordnet
from sklearn.metrics import f1_score
from spellchecker import SpellChecker
from keras.callbacks import EarlyStopping
from sklearn.model_selection import KFold
from tensorflow.keras.models import Sequential
from sklearn.decomposition import TruncatedSVD
from tensorflow.keras.layers import SimpleRNN, Dense
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
# Reading the data.

train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

In [11]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [12]:
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [13]:
def correct_spellings(text: str) -> str:
    """
        Function that corrects the spelling of a given text.
        
        Parameters:
        text - A string containing the text to correct.

        Returns:
        text - The corrected input text.
    """
    
    # Define a SpellChekcer object and use it to identify misspelled words
    spell = SpellChecker()
    corrected_words = []
    misspelled_words = spell.unknown(text.split())

    # Correct and appénd misspelled words
    for word in text.split():
        if word in misspelled_words:
            corrected_words.append(spell.correction(word))
        else:
            corrected_words.append(word)

    return " ".join(corrected_words)

In [14]:
def convert_to_antonym(sentence: str) -> str:
    """
        Function that corrects the spelling of a given text.
        
        Parameters:
        text - A string containing the text to correct.

        Returns:
        text - The corrected input text.
    """

    # Tokenize the input and define a list with the corrected words
    words = nltk.word_tokenize(sentence)
    new_words = []
    temp_word = ''

    for word in words:
        antonyms = []
        
        if word == 'not':
            # Set temporary word flag if current word is not
            temp_word = 'not_'
        elif temp_word == 'not_':
            # Find antonyms for the current word using WordNet
            for syn in wordnet.synsets(word):
                for s in syn.lemmas():
                    for a in s.antonyms():
                        antonyms.append(a.name())
            
            # Replace current word with its antonym if found, else append not
            if len(antonyms) >= 1:
                word = antonyms[0]
            else:
                word = temp_word + word # when antonym is not found, it will
                                    # remain not_happy
            temp_word = ''
        
        if word != 'not':
            # Add current word tom the list of corrected words if it is not 'not'
            new_words.append(word)

    return ' '.join(new_words)

In [29]:
def clean_text(text: str) -> str:
    """
        Function that cleans a given text.
        
        Parameters:
        text - A string containing the text to clean.

        Returns:
        text - The cleaned input text.
    """

    text = text.lower() # Lowercase text
    text= re.sub(r'[^\w\s]',' ',text) # Removing everything other than space and words
    text  = re.sub(r"https?://\S+|www\.\S+", "", text ) # Removing URLs
    text= re.sub(r'[0-9]',' ',text) # Removing digits
    #text = correct_spellings(text) # Correcting the spelling of misspelled tweets
    text = convert_to_antonym(text) # Change negative words to its antonym i.e. not happy -> unhapy
    text = re.sub(' +', ' ', text) # Correcting double space text   
     
    return text

In [27]:
# Clean train and test text
train_text = [clean_text(text) for text in train_df['text']]
test_text = [clean_text(text) for text in test_df['text']]

In [30]:
train_text[0:11]

['our deeds are the reason of this earthquake may allah forgive us all',
 'forest fire near la ronge sask canada',
 'all residents asked to shelter in place are being notified by officers no other evacuation or shelter in place orders are expected',
 'people receive wildfires evacuation orders in california',
 'just got sent this photo from ruby alaska as smoke from wildfires pours into a school',
 'rockyfire update california hwy closed in both directions due to lake county fire cafire wildfires',
 'flood disaster heavy rain causes flash flooding of streets in manitou colorado springs areas',
 'i m on top of the hill and i can see a fire in the woods',
 'there s an emergency evacuation happening now in the building across the street',
 'i m afraid that the tornado is coming to our area',
 'three people died from the heat wave so far']

In [31]:
test_text[0:11]

['just happened a terrible car crash',
 'heard about earthquake is different cities stay safe everyone',
 'there is a forest fire at spot pond geese are fleeing across the street i can not_save them all',
 'apocalypse lighting spokane wildfires',
 'typhoon soudelor kills in china and taiwan',
 'we re shaking it s an earthquake',
 'they d probably still show more life than arsenal did yesterday eh eh',
 'hey how are you',
 'what a nice hat',
 'fuck off',
 'no i don t like cold']

In [33]:
# Generate corpus
corpus = pd.DataFrame(columns=['text'])
corpus['text'] = pd.concat([pd.Series(train_text), pd.Series(test_text)])
corpus.head()

Unnamed: 0,text
0,our deeds are the reason of this earthquake ma...
1,forest fire near la ronge sask canada
2,all residents asked to shelter in place are be...
3,people receive wildfires evacuation orders in ...
4,just got sent this photo from ruby alaska as s...


In [36]:
# Generate TFIDF vectorizer with just unigrams and train it over the corpus
tfidf_vectorizer_unigram = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=(1, 1))
tfidf_vectorizer_unigram.fit(corpus['text'])

# Generate TFIDF vectorizer with just unigrams and bigrams and train it over the corpus
tfidf_vectorizer_bigram = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=(1, 2))
tfidf_vectorizer_bigram.fit(corpus['text'])

# Generate TFIDF vectorizer with just unigrams, bigrams and trigrams and train it over the corpus
tfidf_vectorizer_trigram = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=(1, 3))
tfidf_vectorizer_trigram.fit(corpus['text'])

In [37]:
# Convert train text to vector using TFIDF vectorizers
train_tfidfvectors_unigram = tfidf_vectorizer_unigram.transform(train_df['text'])
train_tfidfvectors_bigram = tfidf_vectorizer_bigram.transform(train_df['text'])
train_tfidfvectors_trigram = tfidf_vectorizer_trigram.transform(train_df['text'])

# Same for test text
test_tfidfvectors_unigram = tfidf_vectorizer_unigram.transform(test_df['text'])
test_tfidfvectors_bigram = tfidf_vectorizer_bigram.transform(test_df['text'])
test_tfidfvectors_trigram = tfidf_vectorizer_trigram.transform(test_df['text'])

In [38]:
print('------ Shapes ------')
print(train_tfidfvectors_unigram.shape)
print(train_tfidfvectors_bigram.shape)
print(train_tfidfvectors_trigram.shape)
print()
print(test_tfidfvectors_unigram.shape)
print(test_tfidfvectors_bigram.shape)
print(test_tfidfvectors_trigram.shape)

------ Shapes ------
(7613, 28562)
(7613, 102837)
(7613, 175856)

(3263, 28562)
(3263, 102837)
(3263, 175856)


In [44]:
# Use Singular Value Decomposition to reduce dimensionality
tsv = TruncatedSVD(n_components=500)

train_tfidfvectors_unigram_svd = tsv.fit_transform(train_tfidfvectors_unigram)
train_tfidfvectors_bigram_svd = tsv.fit_transform(train_tfidfvectors_bigram)
train_tfidfvectors_trigram_svd = tsv.fit_transform(train_tfidfvectors_trigram)

In [45]:
print('------ Shapes ------')
print(train_tfidfvectors_unigram_svd.shape)
print(train_tfidfvectors_bigram_svd.shape)
print(train_tfidfvectors_trigram_svd.shape)

------ Shapes ------
(7613, 500)
(7613, 500)
(7613, 500)


## 2. Modeling and validation.

In [None]:
# Define K-fold cross-validation
kf = KFold(n_splits=5, shuffle=True)

data = [train_tfidfvectors_unigram, 
        train_tfidfvectors_bigram, 
        train_tfidfvectors_trigram,
        train_tfidfvectors_unigram_svd,
        train_tfidfvectors_bigram_svd,
        train_tfidfvectors_trigram_svd]

# Iterate over each vectorized dataset
for index, X in enumerate(data):
    # Initialize an empty list to store F1 scores for each fold
    f1_scores = []

    # Iterate over each fold 
    for train_index, val_index in kf.split(X):
        X_train, X_val = X[train_index].toarray(), X[val_index].toarray()
        y_train, y_val = train_df['target'][train_index], train_df['target'][val_index]


        # Reshape input data to treat each row as a single timestep
        X_train = X_train[:, :, np.newaxis]
        X_val = X_val[:, :, np.newaxis]
        
        # Build the RNN model
        model = Sequential([
            SimpleRNN(32, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True),  # RNN layer with 32 units
            Dense(1, activation='sigmoid')  # Dense layer with sigmoid activation for binary classification
        ])

        # Compile the model
        model.compile(optimizer='adam', loss='binary_crossentropy')
        early_stopping = EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True)

        # Train the model
        model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=25, batch_size=32, callbacks=[early_stopping], verbose=1)
        
        # Predict on the validation data
        y_pred = model.predict(X_val)
        y_pred_binary = (y_pred > 0.5).astype(int)  # Convert predicted probabilities to binary predictions
        
        # Calculate F1 score and append to the list
        f1 = f1_score(y_val.flatten(), y_pred_binary.flatten())
        f1_scores.append(f1)


    ('Scores for dataset', index)
    # Print F1 scores for each fold:
    print('Individual F1 Scores:', f1_scores)
    
    # Calculate and print the average F1 score across all folds
    avg_f1_score = np.mean(f1_scores)
    print("Average F1 Score:", avg_f1_score)

## 3. Visualization.

## 4. Discussion and conclusions.

## 5. References.