In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import time

In [2]:
df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
print((df.target == 1).sum()) # Disaster
print((df.target == 0).sum()) # No Disaster

3271
4342


# Preprocessing

In [4]:
# Preprocessing
import re
import string

def remove_URL(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r"", text)

# https://stackoverflow.com/questions/34293875/how-to-remove-punctuation-marks-from-a-string-in-python-3-x-using-translate/34294022
def remove_punct(text):
    translator = str.maketrans("", "", string.punctuation)
    return text.translate(translator)

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [5]:
pattern = re.compile(r"https?://(\S+|www)\.\S+")
for t in df.text:
    matches = pattern.findall(t)
    for match in matches:
        print(t)
        print(match)
        print(pattern.sub(r"", t))
    if len(matches) > 0:
        break

@bbcmtd Wholesale Markets ablaze http://t.co/lHYXEOHY6C
t
@bbcmtd Wholesale Markets ablaze 


In [6]:
df["text"] = df.text.map(remove_URL) # map(lambda x: remove_URL(x))
df["text"] = df.text.map(remove_punct)

In [7]:
# remove stopwords
#!pip install -q nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Stop Words: A stop word is a commonly used word (such as “the”, “a”, “an”, “in”) that a search engine
# has been programmed to ignore, both when indexing entries for searching and when retrieving them 
# as the result of a search query.
stop = set(stopwords.words("english"))

# https://stackoverflow.com/questions/5486337/how-to-remove-stop-words-using-nltk-or-python
def remove_stopwords(text):
    filtered_words = [word.lower() for word in text.split() if word.lower() not in stop]
    return " ".join(filtered_words)
len(stop)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vaasu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


179

In [8]:
df["text"] = df.text.map(remove_stopwords)

### Forming Vocabulary

In [9]:
# Counting number of unique words
from collections import Counter
# Count unique words
def counter_word(text_col):
    count = Counter()
    for text in text_col.values:
        for word in text.split():
            count[word] += 1
    return count

counter = counter_word(df.text)

num_unique_words = len(counter)
print(num_unique_words)

17971


### Train and Validation Split

In [10]:
# Split dataset into training and validation set
train_size = int(df.shape[0] * 0.9)

train_df = df[:train_size]
val_df = df[train_size:]

# split text and labels
train_sentences = train_df.text.to_numpy()
train_labels = train_df.target.to_numpy()
val_sentences = val_df.text.to_numpy()
val_labels = val_df.target.to_numpy()

In [11]:
train_sentences.shape, val_sentences.shape

((6851,), (762,))

### Tokenization

In [12]:
# Tokenize
from tensorflow.keras.preprocessing.text import Tokenizer

# vectorize a text corpus by turning each text into a sequence of integers
tokenizer = Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(train_sentences) # fit only to training

# each word has unique index
word_index = tokenizer.word_index # dict- each word as key and value is unique indices 

In [13]:
#word_index = {k:(v+3) for k,v in word_index.items()}
word_index["<PAD>"] = 0
#word_index["<START>"] = 1
#word_index["<UNK>"] = 2
#word_index["<UNUSED>"] = 3

#### Forming Sequence

In [14]:
train_sequences = tokenizer.texts_to_sequences(train_sentences)
val_sequences = tokenizer.texts_to_sequences(val_sentences)

In [15]:
print(train_sentences[10:15])
print(train_sequences[10:15])

['three people died heat wave far'
 'haha south tampa getting flooded hah wait second live south tampa gonna gonna fvck flooding'
 'raining flooding florida tampabay tampa 18 19 days ive lost count'
 'flood bago myanmar arrived bago'
 'damage school bus 80 multi car crash breaking']
[[463, 8, 437, 168, 358, 486], [750, 511, 2481, 131, 2482, 3090, 554, 529, 112, 511, 2481, 204, 204, 6151, 137], [2483, 137, 2076, 6152, 2481, 1315, 1605, 530, 179, 629, 3091], [114, 4064, 707, 1606, 4064], [125, 94, 334, 4065, 4066, 53, 18, 335]]


### Padding

In [16]:
max_length = 25

In [17]:
train_padded = tf.keras.preprocessing.sequence.pad_sequences(sequences = train_sequences,value=word_index["<PAD>"],padding="post",maxlen=max_length,truncating='post')
val_padded = tf.keras.preprocessing.sequence.pad_sequences(sequences = val_sequences,value=word_index["<PAD>"],padding="post",maxlen=max_length,truncating='post')

# Model Building

In [18]:
# Create RNN model
from tensorflow.keras import layers

# Embedding: https://www.tensorflow.org/tutorials/text/word_embeddings
# Turns positive integers (indexes) into dense vectors of fixed size. (other approach could be one-hot-encoding)

# Word embeddings give us a way to use an efficient, dense representation in which similar words have 
# a similar encoding. Importantly, you do not have to specify this encoding by hand. An embedding is a 
# dense vector of floating point values (the length of the vector is a parameter you specify).

model = keras.models.Sequential()
model.add(layers.Embedding(num_unique_words, 16, input_length=max_length))

# The layer will take as input an integer matrix of size (batch, input_length),
# and the largest integer (i.e. word index) in the input should be no larger than num_words (vocabulary size).
# Now model.output_shape is (None, input_length, 16), where `None` is the batch dimension.


model.add(layers.SimpleRNN(32, dropout=0.9))
model.add(layers.Dense(1, activation="sigmoid"))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 25, 16)            287536    
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 32)                1568      
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 289,137
Trainable params: 289,137
Non-trainable params: 0
_________________________________________________________________


In [19]:
loss = keras.losses.BinaryCrossentropy(from_logits=False)
optim = keras.optimizers.Adam(lr=0.001)
metrics = ["accuracy"]

model.compile(loss=loss, optimizer=optim, metrics=metrics)



In [20]:
model.fit(train_padded, train_labels, epochs=5, validation_data=(val_padded, val_labels), verbose=2)

Epoch 1/5
215/215 - 3s - loss: 0.6859 - accuracy: 0.5604 - val_loss: 0.6986 - val_accuracy: 0.5341
Epoch 2/5
215/215 - 1s - loss: 0.6659 - accuracy: 0.5980 - val_loss: 0.6384 - val_accuracy: 0.6457
Epoch 3/5
215/215 - 1s - loss: 0.6207 - accuracy: 0.6622 - val_loss: 0.5976 - val_accuracy: 0.7034
Epoch 4/5
215/215 - 2s - loss: 0.5522 - accuracy: 0.7244 - val_loss: 0.5313 - val_accuracy: 0.7717
Epoch 5/5
215/215 - 2s - loss: 0.5179 - accuracy: 0.7551 - val_loss: 0.4939 - val_accuracy: 0.7782


<keras.callbacks.History at 0x17cdfc9d970>

# Prediction

In [21]:
predictions = model.predict(train_padded)
predictions = [1 if p > 0.5 else 0 for p in predictions]

### Decoding sequences

In [22]:
# Check reversing the indices

# flip (key, value)
reverse_word_index = dict([(idx, word) for (word, idx) in word_index.items()])

In [23]:
def decode(sequence):
    return " ".join([reverse_word_index.get(idx, "?") for idx in sequence])

In [24]:
decoded_text = decode(train_sequences[10])

print(train_sequences[10])
print(decoded_text)

[463, 8, 437, 168, 358, 486]
three people died heat wave far


In [25]:
print(train_sentences[10:20])

print(train_labels[10:20])
print(predictions[10:20])

['three people died heat wave far'
 'haha south tampa getting flooded hah wait second live south tampa gonna gonna fvck flooding'
 'raining flooding florida tampabay tampa 18 19 days ive lost count'
 'flood bago myanmar arrived bago'
 'damage school bus 80 multi car crash breaking' 'whats man' 'love fruits'
 'summer lovely' 'car fast' 'goooooooaaaaaal']
[1 1 1 1 1 0 0 0 0 0]
[1, 0, 0, 1, 1, 0, 0, 0, 0, 0]
