In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from keras.utils import to_categorical
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam

Using TensorFlow backend.


## Prepare datasets

In [2]:
data = pd.read_csv('../data/train.csv', index_col=0)

In [3]:
data.head()

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# Check the target variable distribution
print(data['target'].value_counts())

0    4342
1    3271
Name: target, dtype: int64


In [5]:
# Split into X and y
X, y = data['text'], data['target']

In [6]:
# Reshape the values to numpy array
X = np.reshape(X.values, (X.size,))
y =np.reshape(y.values, (y.size,))

In [7]:
# Split into training/testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)

In [8]:
X_train.shape

(6090,)

In [9]:
# Print first 10 sentences from X_train and corresponding labels from y_train
for idx in range(10):
    print(f'Input sentence: "{X_train[idx]}",', f'Target variable -> {y_train[idx]}.\n')

Input sentence: "73rd GOODE Water Ski National Championships will go on as planned next week  http://t.co/PgKBT3MBAp. (Event w/ damage from a tornado on Mon)", Target variable -> 1.

Input sentence: "The tragedy of life is not that it ends so soon but that we wait so long to begin it. ~ W.M. Lewis #quotes", Target variable -> 0.

Input sentence: "wowo--=== 12000 Nigerian refugees repatriated from Cameroon", Target variable -> 0.

Input sentence: "2pcs 18W CREE Led Work Light  Offroad Lamp Car Truck Boat Mining 4WD FLOOD BEAM - Full reaÛ_ http://t.co/VDeFmulx43 http://t.co/yqpAIjSa5g", Target variable -> 0.

Input sentence: "@Rubi_ How many stacks of burning did it apply?", Target variable -> 0.

Input sentence: "Grow Calgary avoids worst of city's wicked weather * ~ 16 http://t.co/HLyHDfWsQB http://t.co/GwSNBMmcqF", Target variable -> 1.

Input sentence: "I liked a @YouTube video http://t.co/N95IGskd3p Minecraft: Episode 2 'Blaze Farm Beginnings!'", Target variable -> 0.

Input senten

## Keras model

### Create an embedding layer in Keras

In [10]:
# Define helper function to read glove vectors
def read_glove_vecs(glove_file):
    with open(glove_file, 'r',encoding='UTF-8') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
        
        i = 1
        words_to_index = {}
        index_to_words = {}
        for w in sorted(words):
            words_to_index[w] = i
            index_to_words[i] = w
            i = i + 1
    return words_to_index, index_to_words, word_to_vec_map

In [11]:
# Load pre-trained 50-dimensional GloVe embeddings
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('../models/glove.twitter.27B.50d.txt')

print(len(word_to_vec_map))
print(word_to_vec_map["happy"].shape)
print(len(word_to_index))

1193514
(50,)
1193514


In [12]:
# Creates a Keras Embedding() layer and loads in pre-trained GloVe 50-dimensional vectors
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    vocab_len = len(word_to_index) + 1 # add 1 row for unknown words
    emb_dim = word_to_vec_map["happy"].shape[0]
    
    # Initialize the embedding matrix as a numpy array of zeros
    emb_matrix = np.zeros((vocab_len, emb_dim))
    
    # Set each row "idx" of the embedding matrix to be the word vector representation of the idx'th word of the vocabulary
    for word, idx in word_to_index.items():
        if word_to_vec_map[word].shape != (50,): 
#             print(word) #'0.45973' embedding has shape of (49,)
            continue 
        emb_matrix[idx, :] = word_to_vec_map[word]

    # Define Keras embedding layer with the correct input and output sizes; make it non-trainable
    embedding_layer = Embedding(input_dim=vocab_len, output_dim=emb_dim, trainable=False)

    # Build the embedding layer, it is required before setting the weights of the embedding layer
    embedding_layer.build((None,)) 
    
    # Set the weights of the embedding layer to the embedding matrix
    embedding_layer.set_weights([emb_matrix])
    
    return embedding_layer

In [13]:
embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
print("weights[0][1][:] =", embedding_layer.get_weights()[0][1][:]) #(batch size, max input length, embedding vector size)

weights[0][1][:] = [ 9.0566e-01 -7.1792e-01 -1.9574e-01 -8.0743e-01 -2.4903e-02  3.1071e-01
  8.9485e-01  6.3035e-01 -3.3863e-01  7.0584e-01  1.2707e-01  3.7673e-01
 -2.7810e+00  2.5292e-01  5.3043e-02  3.0618e-01 -4.2217e-01 -8.5150e-03
 -1.1452e+00 -5.1643e-01 -2.3699e-01 -3.1577e-01  2.4883e-01  1.0689e+00
  5.5007e-01 -1.2806e+00 -2.4169e-02 -3.1108e-01  1.3964e+00 -9.0377e-01
 -9.1328e-01  3.4808e-01 -7.5944e-01  9.9209e-01  9.5123e-01  1.0886e-01
 -1.8141e-01 -4.6055e-01 -8.2691e-01  1.4846e-01 -1.3769e+00 -2.9166e-01
  1.0895e-01  6.1422e-01  1.8414e-01  1.5971e-01  7.1934e-02  1.1230e-03
  2.8188e-02  3.0385e-01]


### Convert sentences to array of word indices

In [14]:
# Convert an array of sentences (strings) into an array of indices corresponding to words in the sentences; the output shape should be such that it can be given to Embedding()
def sentences_to_indices(X, word_to_index, max_len):
    m = X.shape[0] #number of training examples
    
    # Initialize X_indices as a numpy matrix of zeros and the correct shape 
    X_indices = np.zeros((m, max_len))
    
    for i in range(m): 
        
        # Convert the ith training sentence in lower case and split is into words; should get a list of words
        sentence_words = X[i,].lower().split()
        
        j = 0
        # Loop over the words of sentence_words, until hits max_len
        for w in sentence_words:
            if w in word_to_index.keys():
                X_indices[i, j] = word_to_index[w]
            else:
                X_indices[i, j] = 0 # to handle unknown words
            
            j += 1
            # if j is exceeding max length, then not adding more word index to the array; generates less sparse data
            if j == max_len: break 

    return X_indices

In [15]:
max_len_word = max(np.array(X_train), key=len)
max_len = len(max_len_word)
print(max_len)

152


In [16]:
X1 = X_train[:2]
defined_max_len = 25
X1_indices = sentences_to_indices(X1, word_to_index, max_len = defined_max_len)
print("X1 =", X1)
print("X1_indices =\n", X1_indices)

X1 = ['73rd GOODE Water Ski National Championships will go on as planned next week  http://t.co/PgKBT3MBAp. (Event w/ damage from a tornado on Mon)'
 'The tragedy of life is not that it ends so soon but that we wait so long to begin it. ~ W.M. Lewis #quotes']
X1_indices =
 [[     0. 234495. 649253. 559866. 401074. 103545. 654140. 232849. 451194.
   37724. 482073. 406959. 650502.      0.      0.      0. 136107. 215710.
    2115. 610661. 451194.      0.      0.      0.      0.]
 [601627. 612407. 446383. 341139. 283380. 424732. 601405. 284816. 183147.
  563886. 567137.  88334. 601405. 649864. 647422. 563886. 346613. 607687.
   60942. 284828. 675654. 646643. 339752.      0.      0.]]


### Feed embedding layer's output to an LSTM network

In [17]:
def lstm_model(input_shape, word_to_vec_map, word_to_index):
    """
    Function creating the LSTM network's model graph
    
    Arguments:
    input_shape -- shape of the input, usually (max_len,)
    word_to_vec_map -- dictionary mapping every word in a vocabulary into its 50-dimensional vector representation
    word_to_index -- dictionary mapping from words to their indices in the vocabulary

    Returns:
    model -- a model instance in Keras
    """
    
    # Define sentence_indices as the input of the graph; dtype 'int32' (as it contains indices, which are integers)
    sentence_indices = Input(shape=input_shape, dtype='int32')
    
    # Create the embedding layer pretrained with GloVe Vectors
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    
    # Propagate sentence_indices through your embedding layer
    embeddings = embedding_layer(sentence_indices)   
    
    # Propagate the embeddings through an LSTM layer with 64-dimensional hidden state; return a batch of sequences
    X = LSTM(units=64, return_sequences=True)(embeddings)
    # Add dropout with a probability of 0.2
    X = Dropout(rate=0.2)(X)
    # Propagate X trough another LSTM layer with 64-dimensional hidden state; return a single hidden state, not a batch of sequences
    X = LSTM(units=64, return_sequences=False)(X)
    # Add dropout with a probability of 0.2
    X = Dropout(rate=0.2)(X)
    # Propagate X through a Dense layer with 2 units (target variable classes)
    X = Dense(units=2)(X)
    # Add a sigmoid activation
    X = Activation(activation='sigmoid')(X)
    
    # Create Model instance which converts sentence_indices into X
    model = Model(inputs=sentence_indices, outputs=X)
        
    return model

In [18]:
model = lstm_model((defined_max_len,), word_to_vec_map, word_to_index)
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 25)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 25, 50)            59675750  
_________________________________________________________________
lstm_1 (LSTM)                (None, 25, 64)            29440     
_________________________________________________________________
dropout_1 (Dropout)          (None, 25, 64)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 130 

In [19]:
# Compile the model with defined loss function, optimizer and evaluation meterics
opt = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

In [20]:
# Convert target to one hot vector
y_oh_train = to_categorical(y_train)
y_oh_test = to_categorical(y_test)

In [21]:
# Convert from sentences to word indecies
X_train_indices = sentences_to_indices(X_train, word_to_index, defined_max_len)
X_test_indices = sentences_to_indices(X_test, word_to_index, max_len=defined_max_len)

In [22]:
model.fit(X_train_indices, y_oh_train, epochs = 50, batch_size = 30, shuffle=True)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.callbacks.History at 0x146665550>

In [23]:
# Evaluate model performance
loss, acc = model.evaluate(X_test_indices, y_oh_test)
print("Test loss = ", loss)
print("Test accuracy = ", acc)

Test loss =  1.158551921988504
Test accuracy =  0.7931713461875916


In [24]:
# Make prediction on test data
pred = model.predict(X_test_indices)

y_pred = [np.argmax(p) for p in pred]

In [25]:
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.793827971109652
              precision    recall  f1-score   support

           0       0.78      0.88      0.82       841
           1       0.82      0.69      0.75       682

    accuracy                           0.79      1523
   macro avg       0.80      0.78      0.79      1523
weighted avg       0.80      0.79      0.79      1523



In [26]:
# Error analysis, for first 10 errors
counter = 0
for i in range(len(X_test)):
    x = X_test_indices
    if y_pred[i] != y_test[i] and counter < 10:
        print(f'Input sentence: "{X_test[i]}"')
        print(f'Input word indices: {x[i]},') # to check if unknown words are too much; or too sparse vectors
        print(f'Expected -> {y_test[i]}, Predicted -> {y_pred[i]}.\n')
        counter += 1

Input sentence: "Whereas Jez will obliterate the national debt - and give lots of new benefits - by simply printing money! Genius! https://t.co/ReffbkVG9R"
Input word indices: [652676. 294693. 654140. 444770. 601627. 401074. 139581.   1743.  26338.
 231380. 347738. 446383. 406520.  63204.   1743.  89093. 557140. 493167.
      0.      0.      0.      0.      0.      0.      0.],
Expected -> 1, Predicted -> 0.

Input sentence: "I think bombing Iran would be kinder... https://t.co/GVm70U2bPm"
Input word indices: [266801. 602685.  77716. 282695. 657158.  59105.      0.      0.      0.
      0.      0.      0.      0.      0.      0.      0.      0.      0.
      0.      0.      0.      0.      0.      0.      0.],
Expected -> 0, Predicted -> 1.

Input sentence: "Our thoughts are with these local residents! Time for some heavy rain!!! http://t.co/x3g2OX6K8R"
Input word indices: [456923. 603259.  34878. 655002. 602272. 345525.      0. 605075. 211804.
 566117. 253634.      0.      0.      0. 