<a href="https://colab.research.google.com/github/tolgagonen/google-collab-10.03/blob/main/LSTM_Model_for_disaster_0_71val.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import time

In [3]:
df = pd.read_csv("train.csv")

In [4]:
df.shape

(7613, 5)

In [5]:
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
print((df.target == 1).sum()) # disaster
print((df.target == 0).sum()) # no disaster

3271
4342


In [7]:
# preprocessing 
import re
import string

def remove_URL(text):
    url =re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r"", text)

def remove_punct(text):
    translator = str.maketrans("","", string.punctuation)
    return text.translate(translator)

string.punctuation


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [8]:
pattern = re.compile(r"https?://(S+|www)\.\S+")
for t in df.text:
  matches =pattern.findall(t)
  for match in matches:
    print(t)
    print(match)
    print(pattern.sub(r"" , t))
  if len(matches) > 0:
    break



Dozens Die As two Trains Derail Into A River In Indiahttp://www.informationng.com/?p=309943
www
Dozens Die As two Trains Derail Into A River In India


In [9]:
df["text"] = df.text.map(remove_URL)
df["text"] = df.text.map(remove_punct)

In [10]:
#remove stopwords
#pip install nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
#stop words A stop words is a commonly used word (such as "the" , "a")

stop= set(stopwords.words("english"))
stop2= set(stopwords.words("turkish"))

def remove_stopwords(text):
  filtered_words = [word.lower() for word in text.split() if word.lower() not in stop]
  return " ".join(filtered_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [11]:
df["text"] = df.text.map(remove_stopwords)

In [12]:
df.text

0            deeds reason earthquake may allah forgive us
1                   forest fire near la ronge sask canada
2       residents asked shelter place notified officer...
3       13000 people receive wildfires evacuation orde...
4       got sent photo ruby alaska smoke wildfires pou...
                              ...                        
7608    two giant cranes holding bridge collapse nearb...
7609    ariaahrary thetawniest control wild fires cali...
7610                      m194 0104 utc5km volcano hawaii
7611    police investigating ebike collided car little...
7612    latest homes razed northern california wildfir...
Name: text, Length: 7613, dtype: object

In [13]:
from collections import Counter

#count unique words

def counter_word(text_col):
  count = Counter()
  for text in text_col.values:
    for word in text.split():
      count[word] += 1
  return count

counter = counter_word(df.text)

In [14]:
len(counter)

17971

In [15]:
counter.most_common(5)

[('like', 345), ('im', 299), ('amp', 298), ('fire', 250), ('get', 229)]

In [32]:
num_unique_words = len(counter)

In [17]:
train_size = int(df.shape[0] * 0.8) # tüm datnaıan yüzde 80

train_df = df[:train_size]
val_df = df[train_size:]

train_sentences = train_df.text.to_numpy()
train_labels = train_df.target.to_numpy()
val_sentences = val_df.text.to_numpy()
val_labels = val_df.target.to_numpy()


In [18]:
train_sentences.shape , val_sentences.shape

((6090,), (1523,))

In [33]:
#tokenize

from tensorflow.keras.preprocessing.text import Tokenizer

#vectorize a text corpus by turning each text into a sequance of integers
tokenizer = Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(train_sentences)


In [20]:
#each word has unique index
word_index = tokenizer.word_index

In [21]:
word_index

{'like': 1,
 'amp': 2,
 'fire': 3,
 'im': 4,
 'get': 5,
 'via': 6,
 'new': 7,
 'people': 8,
 'news': 9,
 'dont': 10,
 'emergency': 11,
 'one': 12,
 '2': 13,
 'us': 14,
 'video': 15,
 'disaster': 16,
 'burning': 17,
 'body': 18,
 'would': 19,
 'buildings': 20,
 'police': 21,
 'crash': 22,
 'first': 23,
 'california': 24,
 'still': 25,
 'man': 26,
 'got': 27,
 'know': 28,
 'day': 29,
 'back': 30,
 'going': 31,
 'two': 32,
 'time': 33,
 'full': 34,
 'accident': 35,
 'see': 36,
 'world': 37,
 'attack': 38,
 'nuclear': 39,
 'youtube': 40,
 'may': 41,
 'love': 42,
 'go': 43,
 'rt': 44,
 'many': 45,
 'cant': 46,
 '3': 47,
 'watch': 48,
 'collapse': 49,
 'dead': 50,
 'today': 51,
 'car': 52,
 'mass': 53,
 'want': 54,
 'years': 55,
 'work': 56,
 'train': 57,
 'last': 58,
 'good': 59,
 'think': 60,
 'families': 61,
 'hiroshima': 62,
 'life': 63,
 'fires': 64,
 'best': 65,
 'could': 66,
 'say': 67,
 'u': 68,
 'death': 69,
 'hot': 70,
 'forest': 71,
 'way': 72,
 'killed': 73,
 'need': 74,
 'legion

In [22]:
train_sequences = tokenizer.texts_to_sequences(train_sentences)
val_sequences = tokenizer.texts_to_sequences(val_sentences)

In [23]:
print(train_sentences[10:15])
print(train_sequences[10:15])

['three people died heat wave far'
 'haha south tampa getting flooded hah wait second live south tampa gonna gonna fvck flooding'
 'raining flooding florida tampabay tampa 18 19 days ive lost count'
 'flood bago myanmar arrived bago'
 'damage school bus 80 multi car crash breaking']
[[520, 8, 395, 156, 297, 411], [749, 470, 2248, 138, 2249, 2813, 521, 611, 188, 470, 2248, 189, 189, 5679, 117], [2814, 117, 1884, 5680, 2248, 1285, 1450, 522, 256, 644, 2815], [99, 3742, 612, 1451, 3742], [111, 91, 336, 3743, 3744, 52, 22, 312]]


In [40]:
 #pad the sequences to have the same lenght
from tensorflow.keras.preprocessing.sequence import pad_sequences

#max number of words in a sqeunce
max_length=20
 
train_padded= pad_sequences(train_sequences , maxlen=max_lenght , padding="post",truncating="post")
val_padded= pad_sequences(val_sequences, maxlen=max_lenght, padding="post" , truncating="post")
train_padded.shape , val_padded.shape

((6090, 20), (1523, 20))

In [25]:
train_padded[10]

array([520,   8, 395, 156, 297, 411,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0], dtype=int32)

In [26]:
print(train_sentences[10])
print(train_sequences[10])
print(train_padded[10])

three people died heat wave far
[520, 8, 395, 156, 297, 411]
[520   8 395 156 297 411   0   0   0   0   0   0   0   0   0   0   0   0
   0   0]


In [27]:
# check reversing the indices
#flip (key,value)
reverse_word_index= dict([(idx , word) for  (word,idx) in word_index.items()])

In [28]:
def decode(sequence):
  return " ".join([reverse_word_index.get(idx , "?") for idx in sequence])

In [29]:
decoded_text = decode(train_sequences[10])
print(train_sequences[10])
print(decoded_text)

[520, 8, 395, 156, 297, 411]
three people died heat wave far


In [43]:
# create LSTM Model
from tensorflow.keras import layers

#embeddings: tensorflow/tutorials/text/word_embeddings
#turns positive integers(indexes) into dense vectors of fixed size

model = keras.models.Sequential()
model.add(layers.Embedding(num_unique_words, 32 , input_length=max_length))

#The layer will take as input as integer matrix of size (batch , input_lenght)
#and the largest integer (i.e. word index) in the input should be no larger than num_words(vocabulary size)
# Now model.output_shape is (None, input_lenght, 32), where 'None' is the batch dimension.

model.add(layers.LSTM(64, dropout=0.1))
model.add(layers.Dense(1, activation="sigmoid"))

model.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 20, 32)            575072    
                                                                 
 lstm_1 (LSTM)               (None, 64)                24832     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 599,969
Trainable params: 599,969
Non-trainable params: 0
_________________________________________________________________


In [45]:
loss = keras.losses.BinaryCrossentropy(from_logits=False)
optim= keras.optimizers.Adam(lr=0.001)
metrics= ["accuracy"]

model.compile(loss=loss, optimizer=optim, metrics=metrics)



In [46]:
model.fit(train_padded, train_labels, epochs=20, validation_data=(val_padded, val_labels), verbose=2)
#training kısmını burada yaptık doğruluk %71

Epoch 1/20
191/191 - 5s - loss: 0.5502 - accuracy: 0.7149 - val_loss: 0.4647 - val_accuracy: 0.7748 - 5s/epoch - 28ms/step
Epoch 2/20
191/191 - 3s - loss: 0.2930 - accuracy: 0.8842 - val_loss: 0.5094 - val_accuracy: 0.7617 - 3s/epoch - 14ms/step
Epoch 3/20
191/191 - 3s - loss: 0.1519 - accuracy: 0.9501 - val_loss: 0.6151 - val_accuracy: 0.7603 - 3s/epoch - 15ms/step
Epoch 4/20
191/191 - 3s - loss: 0.1089 - accuracy: 0.9667 - val_loss: 0.5673 - val_accuracy: 0.7551 - 3s/epoch - 15ms/step
Epoch 5/20
191/191 - 3s - loss: 0.0888 - accuracy: 0.9719 - val_loss: 0.6753 - val_accuracy: 0.7452 - 3s/epoch - 14ms/step
Epoch 6/20
191/191 - 3s - loss: 0.0748 - accuracy: 0.9767 - val_loss: 0.8229 - val_accuracy: 0.7518 - 3s/epoch - 14ms/step
Epoch 7/20
191/191 - 3s - loss: 0.0621 - accuracy: 0.9787 - val_loss: 0.8470 - val_accuracy: 0.7452 - 3s/epoch - 15ms/step
Epoch 8/20
191/191 - 3s - loss: 0.0494 - accuracy: 0.9795 - val_loss: 1.0396 - val_accuracy: 0.7387 - 3s/epoch - 16ms/step
Epoch 9/20
191/1

<keras.callbacks.History at 0x7fe2e44b1bb0>

In [47]:
predictions = model.predict(train_padded)
predictions = [1 if p > 0.5 else 0 for p in predictions]



In [48]:
print(train_sentences[10:20])

print(train_labels[10:20])
print(predictions[10:20])

['three people died heat wave far'
 'haha south tampa getting flooded hah wait second live south tampa gonna gonna fvck flooding'
 'raining flooding florida tampabay tampa 18 19 days ive lost count'
 'flood bago myanmar arrived bago'
 'damage school bus 80 multi car crash breaking' 'whats man' 'love fruits'
 'summer lovely' 'car fast' 'goooooooaaaaaal']
[1 1 1 1 1 0 0 0 0 0]
[1, 1, 1, 1, 1, 0, 0, 0, 0, 0]


In [50]:
print(train_sentences[1:11])

print(train_labels[1:11])
print(predictions[1:11])

['forest fire near la ronge sask canada'
 'residents asked shelter place notified officers evacuation shelter place orders expected'
 '13000 people receive wildfires evacuation orders california'
 'got sent photo ruby alaska smoke wildfires pours school'
 'rockyfire update california hwy 20 closed directions due lake county fire cafire wildfires'
 'flood disaster heavy rain causes flash flooding streets manitou colorado springs areas'
 'im top hill see fire woods'
 'theres emergency evacuation happening building across street'
 'im afraid tornado coming area' 'three people died heat wave far']
[1 1 1 1 1 1 1 1 1 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
