# NLP with Disaster Tweets

```
Step 1. Library Import & Data Load
Step 2. Data Preprocessing
     2-a. Drop Columns
     2-b. Tokenizer
     2-c. Pad Sequences
     2-d. Match Data type to numpy.ndarray
Step 3. Modeling
Step 4. Model Compile
Step 5. Callbacks
Step 6. Model Fit
Step 7. Model Evaluate & Save
Step 8. Reload Model
Step 9. Predict Test Data
```


## Step 1. Library Import & Data Load

In [4]:
import pandas as pd 
import numpy as np 

In [5]:
train_df = pd.read_csv('./data/train_data.csv')
test_df = pd.read_csv('./data/test_data.csv')

In [6]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this # earthquake...,1
1,4,,,Forest fire near La Ronge Sask . Canada,1
2,5,,,All residents asked to ' shelter in place ' ...,1
3,6,,,"13,000 people receive # wildfires evacuation ...",1
4,7,,,Just got sent this photo from Ruby # Alaska a...,1


In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [8]:
train_df.nunique()

id          7613
keyword      221
location    3341
text        6981
target         2
dtype: int64

In [9]:
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about # earthquake is different cities,..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting . # Spokane # wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [10]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [11]:
test_df.nunique()

id          3263
keyword      221
location    1602
text        3109
dtype: int64

## Step 2. Data Preprocessing

### 2-a. Drop Columns

In [12]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this # earthquake...,1
1,4,,,Forest fire near La Ronge Sask . Canada,1
2,5,,,All residents asked to ' shelter in place ' ...,1
3,6,,,"13,000 people receive # wildfires evacuation ...",1
4,7,,,Just got sent this photo from Ruby # Alaska a...,1


In [13]:
train_df.drop(columns=['id','keyword','location'], axis=1, inplace=True)

In [14]:
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about # earthquake is different cities,..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting . # Spokane # wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [15]:
test_df.drop(columns=['keyword','location'],axis=1, inplace=True)

In [16]:
print(train_df.shape, test_df.shape)

(7613, 2) (3263, 2)


### 2-b. Tokenizer

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X_train, X_valid, y_train, y_valid = train_test_split(train_df['text'],train_df['target'], test_size=0.2, random_state=111)

In [19]:
print(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape)

(6090,) (6090,) (1523,) (1523,)


In [20]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [21]:
vocab_size = 1000
oov_token = "<OOV>"
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)

In [22]:
tokenizer.fit_on_texts(X_train)

In [23]:
X_train = tokenizer.texts_to_sequences(X_train)
X_valid = tokenizer.texts_to_sequences(X_valid)

In [24]:
for i in range(10):
    print(len(X_train[i]))

11
16
10
28
16
20
15
20
10
8


In [25]:
X_train[0]

[132, 1, 6, 253, 10, 15, 685, 12, 618, 157, 1]

In [26]:
for i in range(10):
    print(len(X_valid[i]))

11
26
27
21
12
7
12
7
7
11


In [27]:
X_valid[0]

[32, 11, 230, 494, 18, 758, 402, 923, 178, 2, 1]

### 2-c. Pad Sequences

In [28]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [29]:
max_length = 120
trunc_type = 'post'
pad_type = 'post'

In [30]:
X_train_padded = pad_sequences(X_train, maxlen=max_length, truncating=trunc_type, padding=pad_type)
X_valid_padded = pad_sequences(X_valid, maxlen=max_length, truncating=trunc_type, padding=pad_type)

In [31]:
X_train_padded[:2]

array([[132,   1,   6, 253,  10,  15, 685,  12, 618, 157,   1,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0],
       [821,   1, 138,  26,   1,   5,   3,   1,  25, 125,   1,  70, 174,
         13,   1,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,

In [32]:
X_valid_padded[:2]

array([[ 32,  11, 230, 494,  18, 758, 402, 923, 178,   2,   1,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0],
       [ 17,   1,  50, 558,  57,   5, 834,   3,   1,   1,   9,  80,  54,
        437,  36,  20,   3,   1,   9,  54,   1,  12,   1,  33,   1,   1,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,

In [33]:
print(X_train_padded.shape, X_valid_padded.shape)

(6090, 120) (1523, 120)


### 2-d. Match Data type to numpy.ndarray

In [34]:
print(type(X_train_padded), type(X_valid_padded))
print(type(y_train), type(y_valid))

<class 'numpy.ndarray'> <class 'numpy.ndarray'>
<class 'pandas.core.series.Series'> <class 'pandas.core.series.Series'>


In [35]:
y_train = np.array(y_train)
y_valid = np.array(y_valid)

In [36]:
print(type(X_train_padded), type(X_valid_padded))
print(type(y_train), type(y_valid))

<class 'numpy.ndarray'> <class 'numpy.ndarray'>
<class 'numpy.ndarray'> <class 'numpy.ndarray'>


## Step 3. Modeling

In [37]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Flatten

In [38]:
embedding_dim = 16
# vocab_size = 1000
# max_length = 120

In [39]:
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    Bidirectional(LSTM(64, return_sequences=True)),
    Bidirectional(LSTM(64, return_sequences=True)),
    Bidirectional(LSTM(64, dropout=0.5)),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

In [40]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 16)           16000     
_________________________________________________________________
bidirectional (Bidirectional (None, 120, 128)          41472     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 120, 128)          98816     
_________________________________________________________________
bidirectional_2 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense (Dense)                (None, 32)                4128      
_________________________________________________________________
dense_1 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 1

## Step 4. Model Compile

In [41]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

## Step 5. Callbacks

In [42]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [43]:
filepath = 'my_checkpoint.ckpt'
cp = ModelCheckpoint(
    filepath=filepath,
    save_weights_only=True,
    save_best_only=True,
    monitor='val_loss',
    verbose=1
)

In [44]:
ep = EarlyStopping(
    monitor='val_loss', 
    patience=5,
)

## Step 6. Model Fit

In [45]:
epochs=30
model.fit(
    X_train_padded, y_train,
    validation_data = (X_valid_padded, y_valid),
    callbacks=[cp,ep],
    epochs=epochs
)

Epoch 1/30
Epoch 00001: val_loss improved from inf to 0.47684, saving model to my_checkpoint.ckpt
Epoch 2/30
Epoch 00002: val_loss improved from 0.47684 to 0.44872, saving model to my_checkpoint.ckpt
Epoch 3/30
Epoch 00003: val_loss did not improve from 0.44872
Epoch 4/30
Epoch 00004: val_loss did not improve from 0.44872
Epoch 5/30
Epoch 00005: val_loss improved from 0.44872 to 0.44845, saving model to my_checkpoint.ckpt
Epoch 6/30
Epoch 00006: val_loss did not improve from 0.44845
Epoch 7/30
Epoch 00007: val_loss did not improve from 0.44845
Epoch 8/30
Epoch 00008: val_loss did not improve from 0.44845
Epoch 9/30
Epoch 00009: val_loss did not improve from 0.44845
Epoch 10/30
Epoch 00010: val_loss did not improve from 0.44845


<tensorflow.python.keras.callbacks.History at 0x7f912a097590>

## Step 7. Model Evaluate & Save

In [46]:
model.load_weights(filepath)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f912a575590>

In [47]:
model.evaluate(X_valid_padded, y_valid)



[0.4484510123729706, 0.7964543700218201]

In [48]:
X_valid[0]

[32, 11, 230, 494, 18, 758, 402, 923, 178, 2, 1]

In [63]:
model.save('./model/basic_val_loss_0.4484.h5')

## Step 8. Reload Model

In [50]:
import tensorflow as tf

In [64]:
mymodel = tf.keras.models.load_model('./model/basic_val_loss_0.4484.h5')

In [65]:
mymodel.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 16)           16000     
_________________________________________________________________
bidirectional (Bidirectional (None, 120, 128)          41472     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 120, 128)          98816     
_________________________________________________________________
bidirectional_2 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense (Dense)                (None, 32)                4128      
_________________________________________________________________
dense_1 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 1

## Step 9. Predict Test Data 

In [66]:
X_test = tokenizer.texts_to_sequences(test_df['text'])

In [67]:
X_test_padded = pad_sequences(X_test, maxlen=max_length, truncating=trunc_type, padding=pad_type)

In [68]:
y_test_raw = model.predict(X_test_padded)

In [69]:
y_test_raw

array([[0.8736457 ],
       [0.6497478 ],
       [0.8904932 ],
       ...,
       [0.9764315 ],
       [0.84007984],
       [0.99754155]], dtype=float32)

In [70]:
y_test = list(map(lambda x : 1 if x > 0.5 else 0, y_test_raw))

In [71]:
set(y_test)

{0, 1}

In [72]:
y_test[:5]

[1, 1, 1, 0, 1]

In [73]:
test_df['predict'] = y_test

In [74]:
test_df

Unnamed: 0,id,text,predict
0,0,Just happened a terrible car crash,1
1,2,"Heard about # earthquake is different cities,...",1
2,3,"there is a forest fire at spot pond, geese are...",1
3,9,Apocalypse lighting . # Spokane # wildfires,0
4,11,Typhoon Soudelor kills 28 in China and Taiwan,1
...,...,...,...
3258,10861,EARTHQUAKE SAFETY LOS ANGELES SAFETY FASTENER...,1
3259,10865,Storm in RI worse than last hurricane . My ci...,1
3260,10868,Green Line derailment in Chicago,1
3261,10874,MEG issues Hazardous Weather Outlook ( HWO ),1


In [75]:
test_df[test_df['predict']==1]

Unnamed: 0,id,text,predict
0,0,Just happened a terrible car crash,1
1,2,"Heard about # earthquake is different cities,...",1
2,3,"there is a forest fire at spot pond, geese are...",1
4,11,Typhoon Soudelor kills 28 in China and Taiwan,1
5,12,We are shaking . It is an earthquake,1
...,...,...,...
3258,10861,EARTHQUAKE SAFETY LOS ANGELES SAFETY FASTENER...,1
3259,10865,Storm in RI worse than last hurricane . My ci...,1
3260,10868,Green Line derailment in Chicago,1
3261,10874,MEG issues Hazardous Weather Outlook ( HWO ),1


In [77]:
submission = test_df[['id','predict']]

In [78]:
submission

Unnamed: 0,id,predict
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [79]:
submission.columns = ['id', 'target']

In [80]:
submission

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [81]:
submission.to_csv('./sample_submission.csv', index=False)