# Project - Real or Not? NLP with Disaster Tweets

## Project description 

* Competition - https://www.kaggle.com/c/nlp-getting-started

In [1]:
# Standard library
import os
import sys
import warnings

# Ground base related import
import pandas as pd 
import numpy as np
import spacy

# Import Tensorflow & Pathlib librairies
import tensorflow as tf 
import tensorflow_hub as hub
import keras.backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, SimpleRNN, GRU, Dense, Embedding, Dropout
from tensorflow.keras.layers import GlobalAveragePooling1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.callbacks import TensorBoard

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

from IPython.display import display

warnings.filterwarnings('ignore')

In [14]:
####
# Load & Explore
# ######
# pandas is already imported in the notebook's import cell, so it is not
# re-imported here.
# Read the competition training set; train.csv is resolved relative to the
# kernel's working directory.
data = pd.read_csv('train.csv')

# Summary statistics over every column (include='all' covers non-numeric ones too).
print("Basics statistics: ")
data_desc = data.describe(include='all')
display(data_desc)

# Share of null entries per column, in percent of the row count.
print("Percentage of missing values: ")
display(100*data.isnull().sum()/data.shape[0])

display(data.head())

Basics statistics: 


Unnamed: 0,id,keyword,location,text,target
count,7613.0,7552,5080,7613,7613.0
unique,,221,3341,7503,
top,,fatalities,USA,11-Year-Old Boy Charged With Manslaughter of T...,
freq,,45,104,10,
mean,5441.934848,,,,0.42966
std,3137.11609,,,,0.49506
min,1.0,,,,0.0
25%,2734.0,,,,0.0
50%,5408.0,,,,0.0
75%,8146.0,,,,1.0


Percentage of missing values: 


id           0.000000
keyword      0.801261
location    33.272035
text         0.000000
target       0.000000
dtype: float64

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [15]:
####
# English pipeline optimized for CPU. 
# Components: tok2vec, tagger, parser, senter, ner, attribute_ruler, lemmatizer.
###########
!python -m spacy download en_core_web_md -q

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [16]:
# English stop-word set shipped with spaCy
from spacy.lang.en.stop_words import STOP_WORDS

# Load the en_core_web_md English pipeline into `nlp`
nlp = spacy.load("en_core_web_md")

In [17]:
###
# Keep only the tweet text and the label
##############
# .copy() makes `data` an independent frame: without it, the column selection
# is a slice of the loaded frame and adding columns to it later raises
# pandas' SettingWithCopyWarning.
data = data[['text', 'target']].copy()

# Baseline: share of each class, in percent of all rows
(data['target'].value_counts()/data.shape[0])*100

0    57.034021
1    42.965979
Name: target, dtype: float64

In [18]:
####
## Clean up the text
###########

def _normalize_text(text):
    """Return `text` lower-cased, alphanumerics only, words single-spaced."""
    # Any whitespace character (tab, newline, ...) becomes a plain space so the
    # words it separated are not glued together; every other non-alphanumeric
    # character is dropped.
    kept = "".join(
        ch if ch.isalnum() else " " if ch.isspace() else ""
        for ch in text
    )
    # str.replace(" +", " ") only matches the literal two characters " +", so it
    # never collapsed repeated spaces. split()/join() squeezes every run of
    # spaces to one and trims both ends.
    return " ".join(kept.split()).lower()


def _lemmatize_without_stop_words(doc):
    """Join the lemmas of `doc`, skipping tokens whose text or lemma is a stop word."""
    # stop words: "a", "the", "is", "are", ...; lemmatization: played, playing --> play
    return " ".join(
        token.lemma_
        for token in doc
        if token.lemma_ not in STOP_WORDS and token.text not in STOP_WORDS
    )


normalized_text = data["text"].apply(_normalize_text)

# nlp.pipe streams the texts through the pipeline and yields one Doc per input,
# in input order, instead of calling nlp() once per row.
lemmatized_text = [_lemmatize_without_stop_words(doc) for doc in nlp.pipe(normalized_text)]

# assign() returns a new frame, so this works without a SettingWithCopyWarning
# even when `data` is a slice of another frame.
data = data.assign(text_clean=lemmatized_text)

data.head()

Unnamed: 0,text,target,text_clean
0,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquake allah forgive
1,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,All residents asked to 'shelter in place' are ...,1,resident ask shelter place notify officer evac...
3,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfire evacuation order...
4,Just got sent this photo from Ruby #Alaska as ...,1,got send photo ruby alaska smoke wildfire pour...


In [19]:
###
## Fit a Keras tokenizer on the cleaned text, then encode every tweet
###########
# Tokenizer vocabulary is capped through num_words=15000
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=15000)
tokenizer.fit_on_texts(data["text_clean"])

# One list of integer word ids per tweet, plus the length of that list
data["text_encoded"] = tokenizer.texts_to_sequences(data["text_clean"])
data["text_len"] = data["text_encoded"].map(len)

# Discard tweets whose encoded sequence is empty
has_tokens = data["text_len"] != 0
data = data[has_tokens]
data.head(10)

Unnamed: 0,text,target,text_clean,text_encoded,text_len
0,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquake allah forgive,"[3657, 414, 169, 1380, 1937]",5
1,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,"[119, 3, 158, 511, 5544, 5545, 1087]",7
2,All residents asked to 'shelter in place' are ...,1,resident ask shelter place notify officer evac...,"[1381, 444, 1703, 324, 5546, 290, 183, 1703, 3...",11
3,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfire evacuation order...,"[2308, 7, 2309, 69, 183, 311, 36]",7
4,Just got sent this photo from Ruby #Alaska as ...,1,got send photo ruby alaska smoke wildfire pour...,"[199, 175, 122, 5547, 1704, 184, 69, 2310, 110]",9
5,#RockyFire Update => California Hwy. 20 closed...,1,rockyfire update california hwy 20 close dir...,"[2311, 166, 36, 1260, 445, 312, 841, 789, 280,...",12
6,#flood #disaster Heavy rain causes flash flood...,1,flood disaster heavy rain cause flash flooding...,"[24, 17, 633, 155, 49, 634, 490, 381, 5548, 79...",12
7,I'm on top of the hill and I can see a fire in...,1,m hill fire wood,"[2, 1088, 3, 1705]",4
8,There's an emergency evacuation happening now ...,1,s emergency evacuation happen building street,"[5, 16, 183, 176, 32, 381]",6
9,I'm afraid that the tornado is coming to our a...,1,m afraid tornado come area,"[2, 1938, 281, 13, 205]",5


In [20]:
# The encoded tweets are lists of different lengths, so they are first packed into
# a single rectangular array before the TensorFlow dataset is created.
# `tf.keras.preprocessing.sequence.pad_sequences` adds zero padding at the beginning
# (`padding="pre"`) or at the end (`padding="post"`) so all sequences have equal length.
text_pad = tf.keras.preprocessing.sequence.pad_sequences(data.text_encoded, padding="post")
full_ds = tf.data.Dataset.from_tensor_slices((text_pad, data.target.values))

# Train / validation split
n_samples = data.shape[0]
TAKE_SIZE = int(0.7 * n_samples) # (split 70%/30%)
BATCH_SIZE = 128
SPLIT_SEED = 42  # fixed so the same tweets land in the same split on every run

# One full-buffer shuffle with a fixed seed and reshuffle_each_iteration=False:
# the order is frozen, so take() and skip() carve out disjoint, reproducible parts.
shuffled_ds = full_ds.shuffle(n_samples, seed=SPLIT_SEED, reshuffle_each_iteration=False)
# The training part is re-shuffled (per-epoch by default) before batching.
train_ds = shuffled_ds.take(TAKE_SIZE).shuffle(TAKE_SIZE).batch(BATCH_SIZE)
# Validation metrics do not depend on example order, so no extra shuffle here.
val_ds = shuffled_ds.skip(TAKE_SIZE).batch(BATCH_SIZE)


In [24]:
#################
## Set up the embedding model
###################

# use a 64 values vector to represent a word
embedding_dim = 64 
vocab_size = 15000 # len(tokenizer.word_counts) + 1
# Padded sequence length (variable name kept as-is; it is a typo for "input_shape")
imput_shape = text_pad.shape[1]
# TensorBoard log root under the current user's home directory instead of a
# machine-specific absolute path. The trailing separator is kept on purpose:
# run names are appended to this string with `+`.
log_dir_base = os.path.join(os.path.expanduser("~"), ".keras", "logs", "")

# Learning rate schedule: decrease the learning rate as training progresses.
initial_learning_rate = 0.001

lr_schedule = ExponentialDecay(
    initial_learning_rate,
    decay_steps=1000,
    decay_rate=0.96,
    staircase=True)

# Stop on the validation loss: validation data is passed to fit(), and the
# training loss alone keeps improving while the model overfits.
earlystop_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

tensorboard_callback = TensorBoard(log_dir=log_dir_base+'embedding')

DROPOUT = 0.40 # 40%

### Simple model built on one embedding layer
model_emb = Sequential([
    Embedding(vocab_size, embedding_dim, input_shape=[imput_shape,], name="embedding"), # the embedding layer
    GlobalAveragePooling1D(), # averages the word vectors over the sequence axis
    Dense(16, activation='relu', kernel_regularizer='l1'), # a dense layer
    Dense(8, activation='relu'), # a dense layer
    Dropout(DROPOUT),
    Dense(1, activation="sigmoid") # the prediction layer
])

model_emb.compile(
    optimizer=Adam(learning_rate = lr_schedule), 
    loss='binary_crossentropy', 
    metrics=['accuracy'])

model_emb.fit(train_ds, epochs=10, validation_data=val_ds, callbacks=[tensorboard_callback, earlystop_callback])

2022-01-27 10:51:39.699326: I tensorflow/core/profiler/lib/profiler_session.cc:110] Profiler session initializing.
2022-01-27 10:51:39.699341: I tensorflow/core/profiler/lib/profiler_session.cc:125] Profiler session started.
2022-01-27 10:51:39.699577: I tensorflow/core/profiler/lib/profiler_session.cc:143] Profiler session tear down.


Epoch 1/10


2022-01-27 10:51:40.024673: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


 4/42 [=>............................] - ETA: 1s - loss: 2.0981 - accuracy: 0.5078

2022-01-27 10:51:40.348950: I tensorflow/core/profiler/lib/profiler_session.cc:110] Profiler session initializing.
2022-01-27 10:51:40.348966: I tensorflow/core/profiler/lib/profiler_session.cc:125] Profiler session started.
2022-01-27 10:51:40.410828: I tensorflow/core/profiler/lib/profiler_session.cc:67] Profiler session collecting data.
2022-01-27 10:51:40.411505: I tensorflow/core/profiler/lib/profiler_session.cc:143] Profiler session tear down.
2022-01-27 10:51:40.412180: I tensorflow/core/profiler/rpc/client/save_profile.cc:136] Creating directory: /Users/ycammarata/.keras/logs/embedding/train/plugins/profile/2022_01_27_10_51_40

2022-01-27 10:51:40.412813: I tensorflow/core/profiler/rpc/client/save_profile.cc:142] Dumped gzipped tool data for trace.json.gz to /Users/ycammarata/.keras/logs/embedding/train/plugins/profile/2022_01_27_10_51_40/AFYves-2.local.trace.json.gz
2022-01-27 10:51:40.413372: I tensorflow/core/profiler/rpc/client/save_profile.cc:136] Creating directory: /User



2022-01-27 10:51:41.434038: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x3cd69a220>

In [22]:
######
# Sentence-level model built on the Universal Sentence Encoder from TensorFlow Hub
#################
model_name = 'model_use'

tensorboard_callback = TensorBoard(log_dir=log_dir_base+model_name)

# Stratified 70/30 split on the cleaned text, with a fixed random_state
X_train, X_val, y_train, y_val = train_test_split(data['text_clean'], data['target'], test_size=0.3, random_state=42, stratify = data['target'])

# Create a Keras layer from the USE module on TensorFlow Hub
MODEL_URL="https://tfhub.dev/google/universal-sentence-encoder/4"
sentence_encoder_layer=hub.KerasLayer(
  MODEL_URL,
  input_shape=[],
  dtype=tf.string,
  trainable=False,  # encoder weights stay frozen; only the dense head is trained
  name="USE")

# Build the model with the Sequential API
model_use=Sequential([
  sentence_encoder_layer,
  Dense(32,activation="relu"),
  Dense(16,activation="relu"),
  Dropout(0.4),
  Dense(1,activation="sigmoid",name="output_layer"),
  ],name="Model_USE"
)

# Compile the model
model_use.compile(
  loss="binary_crossentropy",
  optimizer= 'adam',
  metrics=["accuracy"]
)

# Train the classifier head on top of the USE layer.
# use_multiprocessing is not passed: per the Keras fit() documentation it only
# applies to generator / keras.utils.Sequence inputs, not to the in-memory
# data used here, and newer Keras releases no longer accept it in fit().
model_history=model_use.fit(
  X_train,
  y_train,
  epochs=10,
  batch_size = 128,
  validation_data=(X_val, y_val),
  callbacks=[tensorboard_callback]
)

2022-01-27 10:49:40.562200: I tensorflow/core/profiler/lib/profiler_session.cc:110] Profiler session initializing.
2022-01-27 10:49:40.562214: I tensorflow/core/profiler/lib/profiler_session.cc:125] Profiler session started.
2022-01-27 10:49:40.562383: I tensorflow/core/profiler/lib/profiler_session.cc:143] Profiler session tear down.
2022-01-27 10:49:43.325322: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 1/10


2022-01-27 10:49:45.013184: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


 2/42 [>.............................] - ETA: 4s - loss: 0.6893 - accuracy: 0.5430 

2022-01-27 10:49:45.436317: I tensorflow/core/profiler/lib/profiler_session.cc:110] Profiler session initializing.
2022-01-27 10:49:45.436333: I tensorflow/core/profiler/lib/profiler_session.cc:125] Profiler session started.


 5/42 [==>...........................] - ETA: 6s - loss: 0.6882 - accuracy: 0.5766

2022-01-27 10:49:45.917622: I tensorflow/core/profiler/lib/profiler_session.cc:67] Profiler session collecting data.
2022-01-27 10:49:45.919043: I tensorflow/core/profiler/lib/profiler_session.cc:143] Profiler session tear down.
2022-01-27 10:49:45.922005: I tensorflow/core/profiler/rpc/client/save_profile.cc:136] Creating directory: /Users/ycammarata/.keras/logs/model_use/train/plugins/profile/2022_01_27_10_49_45

2022-01-27 10:49:45.923236: I tensorflow/core/profiler/rpc/client/save_profile.cc:142] Dumped gzipped tool data for trace.json.gz to /Users/ycammarata/.keras/logs/model_use/train/plugins/profile/2022_01_27_10_49_45/AFYves-2.local.trace.json.gz
2022-01-27 10:49:45.924816: I tensorflow/core/profiler/rpc/client/save_profile.cc:136] Creating directory: /Users/ycammarata/.keras/logs/model_use/train/plugins/profile/2022_01_27_10_49_45

2022-01-27 10:49:45.924969: I tensorflow/core/profiler/rpc/client/save_profile.cc:142] Dumped gzipped tool data for memory_profile.json.gz to /User



2022-01-27 10:49:48.048450: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
