In [None]:
# dataset - https://www.kaggle.com/c/nlp-getting-started
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

In [None]:
train = pd.read_csv('/content/drive/MyDrive/Natural Language Processing with Disaster Tweets/train.csv')
test = pd.read_csv('/content/drive/MyDrive/Natural Language Processing with Disaster Tweets/test.csv')


In [None]:
#test.head().T
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [None]:
# preprocessing
import re

def remove_URL(text):
  url = re.compile(r"https?://(\s+|www)\.\s+")
  return url.sub(r"", text)
  
def remove_html(text):
  html = re.compile(r"<.*?>")
  return html.sub(r"", text)

In [None]:
# remove punctuation
import string

def remove_punct(text):
  table = str.maketrans("","", string.punctuation)
  return text.translate(table)

In [None]:
train["text"] = train.text.map(lambda x: remove_URL(x))
train["text"] = train.text.map(lambda x: remove_html(x))
train["text"] = train.text.map(lambda x: remove_punct(x))

In [None]:
# remove stopword

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop = set(stopwords.words("english"))

def remove_stopwords(text):
  text = [word.lower() for word in text.split() if word.lower() not in stop]
  return " ".join(text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
train["text"] = train["text"].map(remove_stopwords)

In [None]:
train.text

0            deeds reason earthquake may allah forgive us
1                   forest fire near la ronge sask canada
2       residents asked shelter place notified officer...
3       13000 people receive wildfires evacuation orde...
4       got sent photo ruby alaska smoke wildfires pou...
                              ...                        
7608    two giant cranes holding bridge collapse nearb...
7609    ariaahrary thetawniest control wild fires cali...
7610    m194 0104 utc5km volcano hawaii httptcozdtoyd8ebj
7611    police investigating ebike collided car little...
7612    latest homes razed northern california wildfir...
Name: text, Length: 7613, dtype: object

# Basic NLP

In [None]:
from collections import Counter

# count unique words
def counter_word(text):
  count = Counter()
  for i in text.values:
    for word in i.split():
      count[word] += 1
    return count


In [None]:
text = train.text
counter = counter_word(text)

In [None]:
len(counter)

7

In [None]:
counter

Counter({'allah': 1,
         'deeds': 1,
         'earthquake': 1,
         'forgive': 1,
         'may': 1,
         'reason': 1,
         'us': 1})

In [None]:
num_words = len(counter)

# max number of words in a sequence
max_length = 20

#Train/test split

In [None]:
train_size = int(train.shape[0]*0.8)

train_sentences = train.text[:train_size]
train_labels = train.target[:train_size]

test_sentences = train.text[train_size:]
test_labels = train.target[train_size:]

In [None]:
from keras.preprocessing.text import Tokenizer

#vocab_size = 1500
tokenizer = Tokenizer(num_words = num_words)
tokenizer.fit_on_texts(train_sentences)

In [None]:
word_index = tokenizer.word_index

In [None]:
word_index

{'like': 1,
 'amp': 2,
 'fire': 3,
 'im': 4,
 'get': 5,
 'via': 6,
 'new': 7,
 'people': 8,
 'news': 9,
 'dont': 10,
 'emergency': 11,
 'one': 12,
 '2': 13,
 'us': 14,
 'video': 15,
 'disaster': 16,
 'burning': 17,
 'body': 18,
 'would': 19,
 'buildings': 20,
 'police': 21,
 'crash': 22,
 'first': 23,
 'california': 24,
 'still': 25,
 'man': 26,
 'got': 27,
 'know': 28,
 'back': 29,
 'day': 30,
 'going': 31,
 'two': 32,
 'time': 33,
 'full': 34,
 'accident': 35,
 'see': 36,
 'world': 37,
 'attack': 38,
 'nuclear': 39,
 'youtube': 40,
 'may': 41,
 'love': 42,
 'go': 43,
 'rt': 44,
 'many': 45,
 'cant': 46,
 '3': 47,
 'watch': 48,
 'collapse': 49,
 'dead': 50,
 'today': 51,
 'car': 52,
 'mass': 53,
 'want': 54,
 'years': 55,
 'work': 56,
 'train': 57,
 'last': 58,
 'good': 59,
 'think': 60,
 'families': 61,
 'hiroshima': 62,
 'life': 63,
 'fires': 64,
 'best': 65,
 'could': 66,
 'say': 67,
 'u': 68,
 'death': 69,
 'hot': 70,
 'forest': 71,
 'way': 72,
 'killed': 73,
 'need': 74,
 'legion

In [None]:
train_sequences = tokenizer.texts_to_sequences(train_sentences)

In [None]:
train_sequences[0]

[]

In [None]:
from keras.preprocessing.sequence import pad_sequences

train_padded = pad_sequences(
    train_sequences, maxlen = max_length, padding = 'post', truncating = 'post'
)

In [None]:
train_padded[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int32)

In [None]:
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, maxlen = max_length, padding='post', truncating='post')

In [None]:
print(train.text[0])
print(train_sequences[0])

deeds reason earthquake may allah forgive us
[]


#check inverse

In [None]:
reverse_word_index = dict([(value,key) for (key,value) in word_index.items()])

In [None]:
def decode(text):
  return " ". join([reverse_word_index.get(i, "?") for i in text])

In [None]:
decode(train_sequences[0])

''

In [None]:
print(f"Shape of train {train_padded.shape}")
print(f"Shape of train {test_padded.shape}")


Shape of train (6090, 20)
Shape of train (1523, 20)


In [None]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.initializers import Constant
from keras.optimizers import Adam

model = Sequential()

model.add(Embedding(num_words,32, input_length=max_length))
model.add(LSTM(64, dropout=0.1))
model.add(Dense(1,activation='sigmoid'))

optimizer = Adam(learning_rate = 3e-4)
model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics = ["accuracy"])

In [None]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 32)            224       
_________________________________________________________________
lstm (LSTM)                  (None, 64)                24832     
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 25,121
Trainable params: 25,121
Non-trainable params: 0
_________________________________________________________________


In [None]:
history = model.fit(train_padded, train_labels, epochs = 20, validation_data = (test_padded, test_labels),
                    )

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
