In [1]:
# Note: you may need to update your version of future
# sudo pip install -U future

import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import re
import string

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, GlobalMaxPooling1D, GlobalAveragePooling1D, CuDNNLSTM, CuDNNGRU, Conv1D, Activation
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, Flatten, Bidirectional, GlobalMaxPool1D,SimpleRNN

from keras.models import Model, Sequential
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam

from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, roc_curve, accuracy_score
from sklearn.model_selection import GridSearchCV,train_test_split, KFold


import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer

from capstone_utils import *

# Inline plotting
%matplotlib inline
# Console display: let long question texts print in full (up to 2000 characters
# per column) and allow output up to 200 characters wide.
pd.set_option('max_colwidth', 2000)
pd.set_option('display.width', 200)

Using TensorFlow backend.


[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Config
# Seed NumPy and TensorFlow before any model code runs so that repeated runs
# start from the same random state.
from numpy.random import seed
from tensorflow import set_random_seed

seed(1)
set_random_seed(2)

# Text / embedding hyper-parameters shared by the cells below.
MAX_SEQUENCE_LENGTH = 150  # length every question is padded/truncated to
MAX_VOCAB_SIZE = 30000     # vocabulary cap given to the Tokenizer and the Embedding layer
EMBEDDING_DIM = 300        # width of the pre-trained word vectors (glove.840B.300d)

In [3]:
# Read the train and test sets plus the text corpus from ../data.
# NOTE(review): clean=False / lower_stop=False presumably switch off text
# cleaning and lower-casing/stop-word removal inside load_data -- confirm there.
train, test, corpus = load_data(
    '../data',
    clean=False,
    lower_stop=False,
)

Loading questions...
Done loading train - Loading test
Done loading test


In [4]:
# Load the pre-trained GloVe 840B / 300-dimensional word vectors.
# NOTE(review): the loader logs "could not convert string to float" for some
# entries (see the cell output); presumably those lines are skipped -- confirm
# in load_embeddings.
word2vec = load_embeddings(
    path='../data/glove.840B.300d/glove.840B.300d.txt',
)

Loading word vectors...
Word: . - could not convert string to float: '.'
Word: at - could not convert string to float: 'name@domain.com'
Word: . - could not convert string to float: '.'
Word: to - could not convert string to float: 'name@domain.com'
Word: . - could not convert string to float: '.'
Word: . - could not convert string to float: '.'
Word: email - could not convert string to float: 'name@domain.com'
Word: or - could not convert string to float: 'name@domain.com'
Word: contact - could not convert string to float: 'name@domain.com'
Word: Email - could not convert string to float: 'name@domain.com'
Word: on - could not convert string to float: 'name@domain.com'
Word: At - could not convert string to float: 'Killerseats.com'
Word: by - could not convert string to float: 'name@domain.com'
Word: in - could not convert string to float: 'mylot.com'
Word: emailing - could not convert string to float: 'name@domain.com'
Word: Contact - could not convert string to float: 'name@domain.c

In [5]:
# Convert the questions (strings) into sequences of integer word ids.
# The vocabulary is fitted on `corpus`; the same fitted tokenizer then encodes
# the train questions first and the test questions second.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(corpus)
sequences, test_sequences = (
    tokenizer.texts_to_sequences(frame['question_text'])
    for frame in (train, test)
)

# Labels for the training questions, as a plain array.
targets = train['target'].values

In [6]:
# Word -> integer id lookup learned by the tokenizer. It lists every token seen
# while fitting, so it can be much larger than MAX_VOCAB_SIZE.
word2idx = tokenizer.word_index
print('Found %s unique tokens.' % (len(word2idx),))

Found 227538 unique tokens.


In [7]:
# Build the weight matrix that initialises the Embedding layer from the
# pre-trained vectors.
# NOTE(review): presumably rows are indexed by tokenizer id and only ids below
# MAX_VOCAB_SIZE are filled -- confirm in load_embedding_matrix.
embedding_matrix = load_embedding_matrix(
    word2idx,
    word2vec,
    MAX_VOCAB_SIZE,
    EMBEDDING_DIM,
)

Filling pre-trained embeddings...


In [8]:
# Pad/truncate every training sequence to MAX_SEQUENCE_LENGTH so the result is
# a rectangular N x T matrix (N questions, T = MAX_SEQUENCE_LENGTH).
data = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
)
print('Shape of data tensor:', data.shape)

Shape of data tensor: (1306122, 150)


In [9]:
# Same padding for the test questions, so they match the model's input length.
test_data = pad_sequences(
    test_sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
)
print('Shape of testdata tensor:', test_data.shape)

Shape of testdata tensor: (56370, 150)


In [10]:
# Hold out 30% of the labelled data for validation (fixed random_state for a
# repeatable split). Despite the names, X_test / y_test are this validation
# split -- the unlabelled test set lives in `test_data`.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    targets,
    test_size=0.3,
    random_state=42,
)

In [11]:
# Embedding layer initialised from the pre-trained matrix and kept frozen
# (trainable=False), so the word vectors are not updated during training.
embedding_layer = Embedding(
    input_dim=MAX_VOCAB_SIZE,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

In [12]:
# Training schedule: mini-batch size and number of passes over the training split.
BATCH_SIZE, EPOCHS = 512, 4

In [13]:

# Filename template for checkpoints; the {epoch} / {val_loss} fields are filled
# in by ModelCheckpoint when a file is written.
model_cp_filepath = 'simplernn.ep-{epoch:02d}-loss-{val_loss:.2f}.hdf5'

optimizer = Adam(lr=0.001, decay=0.0001)

# Baseline classifier: frozen pre-trained embeddings -> SimpleRNN -> small
# dense head -> sigmoid probability for the binary target.
input_layer = Input(shape=(MAX_SEQUENCE_LENGTH,))
x = embedding_layer(input_layer)
x = SimpleRNN(64, return_sequences=False)(x)  # only the final RNN output is kept
x = Dense(64, activation="relu")(x)
x = Dropout(0.2)(x)
x = Dense(32, activation="relu")(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=input_layer, outputs=x)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['acc'])

# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 150)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 150, 300)          9000000   
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 64)                23360     
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33        
Total para

In [14]:
# Save a full-model checkpoint whenever val_loss improves on the best seen so far.
# NOTE(review): the cells below predict with the in-memory `model` (weights
# after the last epoch), not with the best checkpoint written here -- confirm
# that this is intended.
model_cp = ModelCheckpoint(
    model_cp_filepath,
    monitor='val_loss',
    verbose=0,
    save_best_only=True,
    save_weights_only=False,
    mode='auto',
    period=1,
)

print('Training model for {} epochs'.format(EPOCHS))

# `r` keeps the value returned by fit() for the run.
r = model.fit(
    X_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_test, y_test),
    callbacks=[model_cp],
)

Training model for 4 epochs
Train on 914285 samples, validate on 391837 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [15]:
# Score the held-out validation split, then search for the decision threshold
# between 0.1 and 0.9 (threshold_search logs the F1 score per candidate).
# predict() falls back to batches of 32 when batch_size is omitted; reusing the
# training batch size cuts the number of forward passes over this large split
# without changing the predictions.
val_predictions = model.predict(X_test, batch_size=BATCH_SIZE)
best_threshold = threshold_search(y_test, val_predictions, min_threshold=0.1, max_threshold=0.9)

F1 score at threshold 0.1 is 0.5399354201691886
F1 score at threshold 0.11 is 0.5493385917344508
F1 score at threshold 0.12 is 0.557013118062563
F1 score at threshold 0.13 is 0.5633012017130498
F1 score at threshold 0.14 is 0.5693460510363564
F1 score at threshold 0.15 is 0.5752449593471722
F1 score at threshold 0.16 is 0.5804103543969792
F1 score at threshold 0.17 is 0.5845522434117139
F1 score at threshold 0.18 is 0.587947422906798
F1 score at threshold 0.19 is 0.5910220466420288
F1 score at threshold 0.2 is 0.5943849100518838
F1 score at threshold 0.21 is 0.5974153604245491
F1 score at threshold 0.22 is 0.5999867215509228
F1 score at threshold 0.23 is 0.6026650514839491
F1 score at threshold 0.24 is 0.6043413071628161
F1 score at threshold 0.25 is 0.6066772435676347
F1 score at threshold 0.26 is 0.6081765178228538
F1 score at threshold 0.27 is 0.6098935061724033
F1 score at threshold 0.28 is 0.6116048278945017
F1 score at threshold 0.29 is 0.6129008904655034
F1 score at threshold 0.

In [16]:
# Predict probabilities for the unlabelled test questions and binarise them
# with the threshold chosen on the validation split.
# batch_size is passed explicitly because predict() otherwise uses batches of
# 32, which is needlessly slow here; the predictions themselves are unaffected.
submission_predictions = model.predict(test_data, batch_size=BATCH_SIZE)
submission_best_predictions = (submission_predictions > best_threshold.get('threshold')).astype(int)

In [17]:
# Assemble the submission table (question id + 0/1 prediction) and write it to
# submission.csv without the index column.
submission = pd.DataFrame({"qid": test["qid"].values}).assign(
    prediction=submission_best_predictions
)
submission.to_csv("submission.csv", index=False)

In [18]:
# Development helper: re-import capstone_utils after editing it on disk, then
# refresh the star-imported names so this kernel sees the new definitions.
import importlib

import capstone_utils

capstone_utils = importlib.reload(capstone_utils)
from capstone_utils import *

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
# For clearing memory
# Releases the trained model between experiments: drop the Python reference,
# run the garbage collector, then clear the Keras backend session.
# NOTE(review): layers built before clear_session() (e.g. embedding_layer)
# presumably have to be re-created before building another model -- confirm.
import gc
from keras import backend as K
del model  # `model` is undefined after this line; re-run the model cell before reusing it
gc.collect()
K.clear_session()

In [20]:
# If the above does not work
# Fallback: select GPU 0 through numba and close it to release GPU memory.
# NOTE(review): presumably the kernel must be restarted before the GPU can be
# used again from this process -- confirm before running this mid-notebook.
from numba import cuda
cuda.select_device(0)
cuda.close()