In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [2]:
import tensorflow as tf
from tensorflow import keras
import os
import matplotlib.pyplot as plt
import time

In [3]:
df = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
df.shape

(7613, 5)

In [4]:
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
print((df.target==1).sum()) #Disaster
print((df.target==0).sum()) #No Disaster

3271
4342


In [6]:
#Preprocessing
import re 
import string

In [7]:
def remove_URL(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r"", text)

In [8]:
def remove_punct(text):
    translator = str.maketrans("", "", string.punctuation)
    return text.translate(translator)

In [9]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:
pattern = re.compile(r"https?://\S+|www\.\S+")
for t in df.text:
    matches = pattern.findall(t)
    for match in matches:
        print(t)
        print(match)
        print(pattern.sub(r"", t))
    if len(matches) > 0:
        break

@bbcmtd Wholesale Markets ablaze http://t.co/lHYXEOHY6C
http://t.co/lHYXEOHY6C
@bbcmtd Wholesale Markets ablaze 


In [11]:
df["text"] = df.text.map(remove_URL) #map(lambda x: remove_URL(x))
df["text"] = df.text.map(remove_punct)

In [12]:
#remove stopwords

In [13]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
stop = set(stopwords.words('english'))

In [15]:
def remove_stopwords(text):
    filtered_words = [word.lower() for word in text.split() if word.lower() not in stop]
    return ' '.join(filtered_words)

In [16]:
print(stop)

{'before', 'you', 'being', 'are', 'ma', 'below', 'out', 'both', 'our', 'and', 'each', 're', 'these', "don't", 'under', 'is', 'be', 'll', 'because', 'its', 'again', 'does', 'of', 'there', 'more', 'now', "aren't", 'she', 'they', 'all', 'yours', 'hers', 'few', "doesn't", "hadn't", 'will', 'i', 'hasn', "haven't", 'at', "she's", 'it', 'ourselves', 'were', 'over', 'then', "isn't", 'he', 'from', 'can', 'aren', 'ain', "you've", 'mightn', 'didn', 'about', 'here', 'in', 'don', 'me', 'doing', 'o', 'was', 'once', 'just', 'wasn', 'some', 'having', 'y', 'hadn', 'wouldn', "hasn't", "needn't", 'after', 'for', 'between', 'on', 'doesn', "wasn't", 've', 'been', 'her', 'couldn', "mightn't", 'him', 'theirs', 'than', 'against', 'up', "didn't", 'those', 'should', 'too', 'have', 'during', "wouldn't", 'myself', 'so', 'or', 'above', 'with', 'did', 'but', 'as', "that'll", "couldn't", 'that', 'why', 'only', "mustn't", 't', 'most', "weren't", 'himself', 'do', 'until', 'this', 'them', 'how', 'not', 'shouldn', 'ours

In [17]:
df["text"] = df.text.map(remove_stopwords)

In [18]:
df.text

0            deeds reason earthquake may allah forgive us
1                   forest fire near la ronge sask canada
2       residents asked shelter place notified officer...
3       13000 people receive wildfires evacuation orde...
4       got sent photo ruby alaska smoke wildfires pou...
                              ...                        
7608    two giant cranes holding bridge collapse nearb...
7609    ariaahrary thetawniest control wild fires cali...
7610                      m194 0104 utc5km volcano hawaii
7611    police investigating ebike collided car little...
7612    latest homes razed northern california wildfir...
Name: text, Length: 7613, dtype: object

In [19]:
from collections import Counter
#Count unique words
def counter_word(text_col):
    count = Counter()
    for text in text_col.values:
        for word in text.split():
            count[word] += 1
    return count

In [20]:
counter = counter_word(df.text)

In [21]:
num_unique_words = len(counter)

In [22]:
len(counter)

17971

In [23]:
counter.most_common(5)

[('like', 345), ('im', 299), ('amp', 298), ('fire', 250), ('get', 229)]

In [24]:
#Split data set into training and validiation 
train_size = int(df.shape[0] * 0.8)
train_df = df[:train_size]
val_df = df[train_size:]

In [25]:
#split text and labels
train_sentences = train_df.text.to_numpy()
train_labels = train_df.target.to_numpy()
val_sentences = val_df.text.to_numpy()
val_labels = val_df.target.to_numpy()

In [26]:
train_sentences.shape, val_sentences.shape

((6090,), (1523,))

In [27]:
from tensorflow.keras.preprocessing.text import Tokenizer

#vectorize a text corpus by turning each text into a sequence of integer
tokenizer = Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(train_sentences) #fit only to training

In [28]:
#each word has unique index
word_index = tokenizer.word_index
#word_index

In [29]:
train_sequences = tokenizer.texts_to_sequences(train_sentences)
val_sequences = tokenizer.texts_to_sequences(val_sentences)

In [30]:
print(train_sentences[10:15])
print(train_sequences[10:15])

['three people died heat wave far'
 'haha south tampa getting flooded hah wait second live south tampa gonna gonna fvck flooding'
 'raining flooding florida tampabay tampa 18 19 days ive lost count'
 'flood bago myanmar arrived bago'
 'damage school bus 80 multi car crash breaking']
[[520, 8, 395, 156, 297, 411], [749, 470, 2248, 138, 2249, 2813, 521, 611, 188, 470, 2248, 189, 189, 5679, 117], [2814, 117, 1884, 5680, 2248, 1285, 1450, 522, 256, 644, 2815], [99, 3742, 612, 1451, 3742], [111, 91, 336, 3743, 3744, 52, 22, 312]]


In [31]:
#Pad the sequences to have the same length
from tensorflow.keras.preprocessing.sequence import pad_sequences

#Max numbers of words in a sequence
max_length = 20

train_padded = pad_sequences(train_sequences, maxlen=max_length, 
                             padding='post', truncating='post')
val_padded = pad_sequences(val_sequences, maxlen=max_length, 
                             padding='post', truncating='post')
train_padded.shape, val_padded.shape

((6090, 20), (1523, 20))

In [32]:
print(train_sentences[10])
print(train_sequences[10])
print(train_padded[10])

three people died heat wave far
[520, 8, 395, 156, 297, 411]
[520   8 395 156 297 411   0   0   0   0   0   0   0   0   0   0   0   0
   0   0]


In [33]:
#Check reversing the indices

#flip(key, value)
reverse_word_index = dict([(idx, word) for (word, idx) in word_index.items()])

In [34]:
# reverse_word_index

In [35]:
def decode(sequence):
    return " ".join([reverse_word_index.get(idx, "?") for idx in sequence])

In [36]:
decoded_text = decode(train_sequences[10])

In [37]:
print(train_sequences[10])
print(decoded_text)

[520, 8, 395, 156, 297, 411]
three people died heat wave far


In [38]:
#Create LSTM model
from tensorflow.keras import layers
model = keras.models.Sequential()
#Embedding turns positive integers(indexes) into dense vectors of fix size.
#(othe approach could be one-hot-encoding)

#Word embedding gives us a way to use an efficient, dense representation 
#in which similar words have a similar encoding. Importantly, you do not have to
#specify this encoding by hand. An embedding is a dense vector of floating point values 
#(the length of the vector is a parameter you specify).
model.add(layers.Embedding(num_unique_words, 32, input_length=max_length))
model.add(layers.LSTM(64, dropout=0.1))
model.add(layers.Dense(1, activation='sigmoid'))

model.summary()

2022-09-22 08:16:14.468805: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 32)            575072    
_________________________________________________________________
lstm (LSTM)                  (None, 64)                24832     
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 599,969
Trainable params: 599,969
Non-trainable params: 0
_________________________________________________________________


In [39]:
loss = keras.losses.BinaryCrossentropy(from_logits=False)
optim = keras.optimizers.Adam(learning_rate=0.001)
metrics = ['accuracy']

model.compile(loss=loss,optimizer=optim,metrics=metrics)

In [40]:
model.fit(train_padded, train_labels, epochs=20, 
                    validation_data=(val_padded, val_labels), verbose=2)

Epoch 1/20


2022-09-22 08:16:14.946115: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


191/191 - 7s - loss: 0.5524 - accuracy: 0.7105 - val_loss: 0.4771 - val_accuracy: 0.7748
Epoch 2/20
191/191 - 4s - loss: 0.2949 - accuracy: 0.8869 - val_loss: 0.5182 - val_accuracy: 0.7479
Epoch 3/20
191/191 - 4s - loss: 0.1593 - accuracy: 0.9476 - val_loss: 0.6982 - val_accuracy: 0.7439
Epoch 4/20
191/191 - 5s - loss: 0.1112 - accuracy: 0.9645 - val_loss: 0.7629 - val_accuracy: 0.7518
Epoch 5/20
191/191 - 4s - loss: 0.0902 - accuracy: 0.9724 - val_loss: 0.8508 - val_accuracy: 0.7321
Epoch 6/20
191/191 - 4s - loss: 0.0763 - accuracy: 0.9759 - val_loss: 0.8623 - val_accuracy: 0.7387
Epoch 7/20
191/191 - 4s - loss: 0.0657 - accuracy: 0.9775 - val_loss: 0.9650 - val_accuracy: 0.7374
Epoch 8/20
191/191 - 4s - loss: 0.0529 - accuracy: 0.9787 - val_loss: 0.9659 - val_accuracy: 0.7341
Epoch 9/20
191/191 - 4s - loss: 0.0458 - accuracy: 0.9816 - val_loss: 1.1264 - val_accuracy: 0.7374
Epoch 10/20
191/191 - 4s - loss: 0.0424 - accuracy: 0.9808 - val_loss: 1.0333 - val_accuracy: 0.7301
Epoch 11/2

<keras.callbacks.History at 0x7f894c3ba910>

In [41]:
predictions = model.predict(train_padded)
predictions = [1 if p > 0.5 else 0 for p in predictions]

In [42]:
print(train_sentences[10:20])
print(train_labels[10:20])
print(predictions[10:20])

['three people died heat wave far'
 'haha south tampa getting flooded hah wait second live south tampa gonna gonna fvck flooding'
 'raining flooding florida tampabay tampa 18 19 days ive lost count'
 'flood bago myanmar arrived bago'
 'damage school bus 80 multi car crash breaking' 'whats man' 'love fruits'
 'summer lovely' 'car fast' 'goooooooaaaaaal']
[1 1 1 1 1 0 0 0 0 0]
[1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
