In [23]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
import matplotlib as plt

In [24]:
# Read the train and test tweet tables from the ./preprocessing directory.
tweets = pd.read_csv(filepath_or_buffer='./preprocessing/train.csv')
test_df = pd.read_csv(filepath_or_buffer='./preprocessing/test.csv')

In [25]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled tweets for validation. The split is stratified on
# `target` and uses a fixed random_state so it is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    tweets.drop(columns=['id', 'keyword', 'location', 'target']),
    tweets[['target']],
    test_size=0.2,
    stratify=tweets[['target']],
    random_state=0,
)
# Raw tweet strings; later cells rebind these names to token-id sequences.
X_train_text, X_val_text = X_train['text'], X_val['text']

print('X_train shape: ', X_train.shape)
print('X_val shape: ', X_val.shape)
print('y_train shape: ', y_train.shape)
print('y_val shape: ', y_val.shape)

X_train shape:  (6088, 1)
X_val shape:  (1523, 1)
y_train shape:  (6088, 1)
y_val shape:  (1523, 1)


In [26]:
# Tokenizer fitted on the training tweets only, so validation/test words it has
# not seen are encoded as the '<UNK>' out-of-vocabulary token.
tokenizer_1 = Tokenizer(oov_token='<UNK>', num_words=5000)
tokenizer_1.fit_on_texts(X_train_text)

In [27]:
# Encode both splits as lists of integer word ids.
# NOTE: this rebinds X_train_text / X_val_text from raw strings to id lists, so
# the cell is not idempotent; re-run the split cell before running it again.
X_train_text, X_val_text = (
    tokenizer_1.texts_to_sequences(texts) for texts in (X_train_text, X_val_text)
)
print(X_train_text[:10])
print('')
print(X_val_text[:10])

[[53, 14, 368, 6, 1295, 316, 1581], [265, 524, 127, 128, 220, 145, 302, 66, 111], [34, 818, 25, 81, 203, 4838], [4839, 1407, 819, 4, 4840, 394, 1582, 4841, 5, 4842, 1791, 35, 936, 116], [3277, 4843, 4844, 369, 671, 4845, 1088, 7, 204, 95, 76, 2096, 624, 112], [44, 1006, 2, 43, 107, 195, 132], [714, 273, 253, 764, 162, 873], [82, 4846, 2, 1179, 650, 237, 62, 4847, 163, 1792, 1180], [196, 2097, 3, 442], [146, 874, 303, 133, 1408, 266]]

[[17, 2377, 1038, 1547, 1, 11, 67, 537, 1177, 1, 1323], [189, 5, 10, 121, 559, 222, 189, 3527, 1663, 2210, 3528, 2718, 3529], [1, 1, 798, 639, 514, 1914, 1], [729, 30, 792, 737, 738, 109, 547, 608, 12, 900, 970], [301, 125, 179, 9, 169, 42, 1058, 59, 359, 414, 372, 52, 801], [230, 417, 189, 11, 222, 703, 8, 436, 110, 480, 1284, 464, 121], [2778, 15, 55, 112, 1, 389, 587, 1, 42, 680, 56, 100, 1], [1495, 735, 343, 3163, 632, 179, 546, 343, 763, 20, 3163, 1, 1, 160], [60, 58, 82, 1143, 747, 686, 321, 952, 741, 817, 17, 82], [714, 273, 253, 764, 162, 873]]


In [28]:
# Sanity check: decode the second encoded training tweet back into words.
tokenizer_1.sequences_to_texts([X_train_text[1]])

['wreckage conclusively confirm mh370 malaysia pm investigators families be']

In [29]:
# Longest encoded training tweet, printed as a reference for the padding length.
print('Train Set Max Length:', max(len(text) for text in X_train_text))
# Fixed sequence length used by every model and by the test-set padding below.
maxlen = 50

# padding='post' appends the padding after the tokens, giving 2-D arrays of
# shape (n_samples, maxlen).
X_train_text = pad_sequences(X_train_text, padding='post', maxlen=maxlen)
X_val_text = pad_sequences(X_val_text, padding='post', maxlen=maxlen)

print('X_train shape:', X_train_text.shape)
# Label fixed: this line reports the validation array, not the training one.
print('X_val shape:', X_val_text.shape)

Train Set Max Length: 25
X_train shape: (6088, 50)
X_train shape: (1523, 50)


In [30]:
# Adding 1 because of reserved 0 index
vocab_size = len(tokenizer_1.word_index) + 1

# load the whole embedding into memory as {word: float32 vector}
embeddings_index = {}
# The context manager closes the file even if a line fails to parse.
with open('./dataset/glove.twitter.27B.200d.txt', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 1193514 word vectors.


In [31]:
# Build the (vocab_size, 200) weight table for the Embedding layers: row i holds
# the pre-trained vector of the word whose tokenizer index is i.
embedding_matrix = np.zeros((vocab_size, 200))

for token, row in tokenizer_1.word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        # No pre-trained vector for this word; its row stays all zeros.
        continue
    embedding_matrix[row] = vector

print('Embedding Matrix Shape:', embedding_matrix.shape)

Embedding Matrix Shape: (11927, 200)


In [32]:
## Hyperparameters
lr=0.0005
batch_size=128
num_epochs=15
dropout=0.2
recurrent_dropout=0.2
# Each class is weighted by the share of the *other* class in the training
# labels, so the less frequent class gets the larger weight.
class_weight = {
    label: y_train['target'].value_counts()[1 - label] / len(y_train)
    for label in (0, 1)
}

In [33]:
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, GlobalMaxPooling1D, LSTM
from keras.layers import Embedding
from keras import optimizers
from keras.callbacks import ModelCheckpoint

# Baseline classifier: frozen pre-trained embeddings -> two stacked LSTMs ->
# a single sigmoid unit producing the probability of target == 1.
lstm_model = Sequential()
embedding_layer = Embedding(vocab_size, 200, weights=[embedding_matrix], input_length=maxlen, trainable=False)
lstm_model.add(embedding_layer)
lstm_model.add(LSTM(128, return_sequences=True, dropout=dropout, recurrent_dropout=recurrent_dropout)) # try adding dropout later
lstm_model.add(LSTM(128))

#model.add(Flatten())
lstm_model.add(Dense(1, activation='sigmoid'))

# `learning_rate` is the supported keyword; `lr` is deprecated and triggers a
# warning when the optimizer is built.
adam = optimizers.Adam(learning_rate=lr)
lstm_model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['acc'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" to the output.
lstm_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 50, 200)           2385400   
                                                                 
 lstm_4 (LSTM)               (None, 50, 128)           168448    
                                                                 
 lstm_5 (LSTM)               (None, 128)               131584    
                                                                 
 dense_2 (Dense)             (None, 1)                 129       
                                                                 
Total params: 2,685,561
Trainable params: 300,161
Non-trainable params: 2,385,400
_________________________________________________________________
None


  super().__init__(name, **kwargs)


In [34]:
def plot_model_performance(history):
    """Plot per-epoch training/validation accuracy and loss in one figure.

    Accuracy curves are drawn first; loss curves are drawn on a twin y-axis
    created with ``twinx()``. The epoch with the highest validation accuracy
    and the epoch with the lowest validation loss are marked.

    Args:
        history: object whose ``history`` attribute is a dict holding the
            per-epoch sequences 'acc', 'val_acc', 'loss' and 'val_loss'
            (e.g. the return value of ``model.fit`` for a model compiled with
            ``metrics=['acc']`` and trained with validation data).

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # The notebook's import cell binds `plt` to the top-level `matplotlib`
    # package, which does not provide figure()/plot()/show(); bind the pyplot
    # API locally so this function works regardless of that global.
    import matplotlib.pyplot as plt

    metrics = history.history
    # Use the number of epochs actually recorded rather than the global
    # num_epochs, so shorter runs (e.g. stopped early) still plot correctly.
    n_epochs = len(metrics['val_acc'])
    epochs = range(n_epochs)

    plt.figure(figsize=(15,5))
    plt.plot(epochs, metrics['acc'],'-o',
             label='Train ACC',color='#ff7f0e')
    plt.plot(epochs, metrics['val_acc'],'-o',
             label='Val ACC',color='#1f77b4')
    # Mark the epoch with the best validation accuracy.
    x = np.argmax(metrics['val_acc'])
    y = np.max(metrics['val_acc'])
    # Axis extents are used to offset the text labels relative to the markers.
    xdist = plt.xlim()[1] - plt.xlim()[0]
    ydist = plt.ylim()[1] - plt.ylim()[0]
    plt.scatter(x,y,s=200,color='#1f77b4')
    plt.text(x-0.03*xdist,y-0.13*ydist,'max acc\n%.2f'%y,size=14)
    plt.ylabel('Accuracy',size=14)
    plt.xlabel('Epoch',size=14)
    plt.legend(loc=(0.01,0.75))

    # Second y-axis for the loss curves.
    # NOTE(review): the pyplot calls below presumably act on this twin axes
    # (it should become the current axes once created) - confirm.
    plt2 = plt.gca().twinx()
    plt2.plot(epochs, metrics['loss'],'-o',
              label='Train Loss',color='#2ca02c')
    plt2.plot(epochs, metrics['val_loss'],'-o',
              label='Val Loss',color='#d62728')
    # Mark the epoch with the lowest validation loss.
    x = np.argmin(metrics['val_loss'])
    y = np.min(metrics['val_loss'])
    ydist = plt.ylim()[1] - plt.ylim()[0]
    plt.scatter(x,y,s=200,color='#d62728')
    plt.text(x-0.03*xdist,y+0.05*ydist,'min loss',size=14)
    # plt.ylim([-0.2, 2])
    plt.ylabel('Loss',size=14)
    # Tick positions are 0-based epoch indices; labels show 1-based epoch numbers.
    plt.xticks(ticks=list(epochs),labels=list(range(1, n_epochs+1)))
    plt.legend(loc='lower left', bbox_to_anchor=(0.01, 0.1))
    plt.show()

In [35]:
# Save to 'lstm_model.h5' only when validation accuracy improves, so the file
# always holds the best epoch seen so far.
checkpoint = ModelCheckpoint(filepath='lstm_model.h5', monitor='val_acc', save_best_only=True)
history = lstm_model.fit(
    x=X_train_text,
    y=y_train,
    validation_data=(X_val_text, y_val),
    epochs=num_epochs,
    batch_size=batch_size,
    class_weight=class_weight,
    callbacks=[checkpoint],
    verbose=1,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [36]:
# plot_model_performance(history)

In [37]:
# tokenize the test tweets with the tokenizer fitted on the training split
test_text = test_df['text']
test_text = tokenizer_1.texts_to_sequences(test_text)

# padding: reuse `maxlen` instead of repeating the literal 50, so the test
# inputs always match the sequence length the models were built with
test_text = pad_sequences(test_text, padding='post', maxlen=maxlen)

print('X_test shape:', test_text.shape)

X_test shape: (3263, 50)


In [38]:
# lstm prediction
# Restore the checkpoint written during training, then store the predicted
# probabilities next to the test ids.
lstm_model.load_weights('lstm_model.h5')
submission = test_df[['id']].copy()
submission['target'] = lstm_model.predict(test_text)
def classes(x):
    """Convert a predicted probability into a hard 0/1 label.

    Values strictly below 0.5 map to 0; everything else (including exactly
    0.5) maps to 1. The model output is already a sigmoid activation, so no
    further transformation is applied here.
    """
    return 0 if x < 0.5 else 1

# Threshold the probabilities, write the submission file and preview it.
submission['target'] = submission['target'].apply(classes)
submission.to_csv('submission.csv', index=False)
display(submission.head())



Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [39]:
from sklearn.metrics import accuracy_score

# Accuracy of the thresholded predictions against the labels in ./dataset/ans.csv.
ans = pd.read_csv('./dataset/ans.csv')['target'].to_numpy()
accuracy_score(ans, submission['target'])

0.7965062825620595

In [40]:
from keras.layers import Layer
import keras.backend as K

class attention(Layer):
    """Attention pooling that collapses axis 1 of its input into a weighted sum.

    Each position along axis 1 gets a scalar score ``tanh(x . W + b)``; the
    scores are softmax-normalised and used to weight the input before summing
    over axis 1. Assumes the input is (batch, timesteps, features), as produced
    by a recurrent layer with ``return_sequences=True``.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the scoring weight (features, 1) and per-step bias (timesteps, 1)."""
        n_steps, n_features = input_shape[1], input_shape[-1]
        self.W = self.add_weight(name="att_weight", shape=(n_features, 1), initializer="normal")
        self.b = self.add_weight(name="att_bias", shape=(n_steps, 1), initializer="zeros")
        super().build(input_shape)

    def call(self, x):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        # One scalar score per position; drop the trailing singleton axis.
        scores = K.squeeze(K.tanh(K.dot(x, self.W) + self.b), axis=-1)
        # Normalise the scores, then restore the trailing axis so the weights
        # broadcast against the feature dimension of x.
        weights = K.expand_dims(K.softmax(scores), axis=-1)
        return K.sum(x * weights, axis=1)

    def compute_output_shape(self, input_shape):
        """Output keeps the first and last input dimensions."""
        first_dim, last_dim = input_shape[0], input_shape[-1]
        return (first_dim, last_dim)

    def get_config(self):
        """Return the base layer config; this layer adds no constructor arguments."""
        return super().get_config()

In [41]:
### Attention

## Hyperparameters
# These rebind the globals used for the plain LSTM model; only the two dropout
# rates differ (0.2 -> 0.3).
lr=0.0005
batch_size=128
num_epochs=15
dropout=0.3
recurrent_dropout=0.3

import tensorflow as tf
from keras.models import Sequential
from keras import Model
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, Input, Layer, GlobalMaxPooling1D, LSTM, Bidirectional, Concatenate
from keras.layers import Embedding
from keras import optimizers

## Embedding Layer
# Frozen pre-trained embeddings, as in the plain LSTM model.
sequence_input = Input(shape=(maxlen,))
embedded_sequences = Embedding(vocab_size, 200, weights=[embedding_matrix], trainable=False)(sequence_input)

## RNN Layer
lstm = Bidirectional(LSTM(128, return_sequences = True, dropout=dropout, recurrent_dropout=recurrent_dropout))(embedded_sequences)
# Getting our LSTM outputs
# return_state=True also returns the final hidden/cell states of both
# directions; only the per-step sequence output `lstm` is consumed below.
(lstm, forward_h, forward_c, backward_h, backward_c) = Bidirectional(LSTM(128, return_sequences=True, return_state=True))(lstm)

## Attention Layer
# Pool the per-step outputs into one vector per tweet, then classify.
att_out=attention()(lstm)
outputs=Dense(1,activation='sigmoid')(att_out)
model_attn = Model(sequence_input, outputs)

# `learning_rate` is the supported keyword; `lr` is deprecated and triggers a
# warning when the optimizer is built.
adam = optimizers.Adam(learning_rate=lr)
model_attn.compile(optimizer=adam, loss='binary_crossentropy', metrics=['acc'])

# summary() prints the table itself and returns None, so it is not wrapped in
# print().
model_attn.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 50)]              0         
                                                                 
 embedding_3 (Embedding)     (None, 50, 200)           2385400   
                                                                 
 bidirectional_2 (Bidirectio  (None, 50, 256)          336896    
 nal)                                                            
                                                                 
 bidirectional_3 (Bidirectio  [(None, 50, 256),        394240    
 nal)                         (None, 128),                       
                              (None, 128),                       
                              (None, 128),                       
                              (None, 128)]                       
                                                           

  super().__init__(name, **kwargs)


In [42]:
# Save to 'attn_model.h5' only when validation accuracy improves.
checkpoint = ModelCheckpoint(filepath='attn_model.h5', monitor='val_acc', save_best_only=True)
# NOTE: history_attn is only bound if fit() runs to completion.
history_attn = model_attn.fit(
    x=X_train_text,
    y=y_train,
    validation_data=(X_val_text, y_val),
    epochs=num_epochs,
    batch_size=batch_size,
    class_weight=class_weight,
    callbacks=[checkpoint],
    verbose=1,
)
# plot_model_performance(history_attn)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15

KeyboardInterrupt: 

In [43]:
# tokenize the test tweets with the tokenizer fitted on the training split
test_text = test_df['text']
test_text = tokenizer_1.texts_to_sequences(test_text)

# padding: reuse `maxlen` instead of repeating the literal 50, so the test
# inputs always match the Input(shape=(maxlen,)) the model was built with
test_text = pad_sequences(test_text, padding='post', maxlen=maxlen)

print('X_test shape:', test_text.shape)

# Restore the best checkpoint written by the attention model's training run.
model_attn.load_weights('attn_model.h5')

submission = test_df.copy()[['id']]
# Predicted probabilities; thresholded into 0/1 labels further down.
submission['target'] = model_attn.predict(test_text)

def classes(x):
    """Convert a predicted probability into a hard 0/1 label.

    Values strictly below 0.5 map to 0; everything else (including exactly
    0.5) maps to 1. Same rule as the helper in the LSTM prediction cell,
    redefined here so this cell can run on its own.
    """
    return 0 if x < 0.5 else 1

# Threshold the attention model's probabilities into 0/1 labels.
submission['target'] = submission['target'].apply(classes)
# Writing/previewing is disabled here.
# NOTE(review): presumably so the LSTM model's submission.csv is not overwritten - confirm.
# submission.to_csv('submission.csv', index=False)
# display(submission.head())

X_test shape: (3263, 50)


In [48]:
# Accuracy of the attention model's thresholded predictions against ./dataset/ans.csv.
ans = pd.read_csv('./dataset/ans.csv')['target'].to_numpy()
accuracy_score(ans, submission['target'])

0.7971192154459087