In [1]:
# Note: you may need to update your version of future
# sudo pip install -U future

import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import re
import string

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, GlobalMaxPooling1D, GlobalAveragePooling1D, CuDNNLSTM, CuDNNGRU, Conv1D, Activation
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, Flatten, Bidirectional, GlobalMaxPool1D

from keras.models import Model, Sequential
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, roc_curve, accuracy_score
from sklearn.model_selection import GridSearchCV,train_test_split, KFold
from keras.callbacks import ModelCheckpoint

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer

from capstone_utils import *

# Inline plotting
%matplotlib inline
pd.set_option('display.width', 200)
pd.set_option('max_colwidth', 2000)

Using TensorFlow backend.


[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
# Config
# Pin the NumPy and TensorFlow global random seeds.
from numpy.random import seed
from tensorflow import set_random_seed

seed(1)
set_random_seed(2)

# Text / embedding hyperparameters shared by the cells below.
EMBEDDING_DIM = 300         # output_dim of the Embedding layer
MAX_VOCAB_SIZE = 30000      # Tokenizer(num_words=...) and Embedding input_dim
MAX_SEQUENCE_LENGTH = 150   # pad_sequences(maxlen=...) / Embedding input_length

In [3]:
# Load the train set, the test set and the text corpus from ../data, with both
# optional preprocessing flags (clean, lower_stop) switched off.
train, test, corpus = load_data(
    '../data',
    clean=False,
    lower_stop=False,
)

Loading questions...
Done loading train - Loading test
Done loading test


In [4]:
# Read the pre-trained GloVe 840B / 300d vectors from disk.
# load_embeddings is presumably provided by the capstone_utils star import.
word2vec = load_embeddings(
    path='../data/glove.840B.300d/glove.840B.300d.txt',
)

Loading word vectors...
Word: . - could not convert string to float: '.'
Word: at - could not convert string to float: 'name@domain.com'
Word: . - could not convert string to float: '.'
Word: to - could not convert string to float: 'name@domain.com'
Word: . - could not convert string to float: '.'
Word: . - could not convert string to float: '.'
Word: email - could not convert string to float: 'name@domain.com'
Word: or - could not convert string to float: 'name@domain.com'
Word: contact - could not convert string to float: 'name@domain.com'
Word: Email - could not convert string to float: 'name@domain.com'
Word: on - could not convert string to float: 'name@domain.com'
Word: At - could not convert string to float: 'Killerseats.com'
Word: by - could not convert string to float: 'name@domain.com'
Word: in - could not convert string to float: 'mylot.com'
Word: emailing - could not convert string to float: 'name@domain.com'
Word: Contact - could not convert string to float: 'name@domain.c

In [5]:
# Turn the question strings into lists of integer word ids.
# The tokenizer is fitted on `corpus` and capped at MAX_VOCAB_SIZE words.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(corpus)

sequences = tokenizer.texts_to_sequences(train['question_text'])
test_sequences = tokenizer.texts_to_sequences(test['question_text'])

# Labels that go with the training questions.
targets = train['target'].values

In [6]:
# Word -> integer id lookup learned by the tokenizer.
# The recorded output shows 227538 entries, i.e. more than MAX_VOCAB_SIZE, so
# this mapping is not truncated to the num_words cap.
word2idx = tokenizer.word_index
print('Found %s unique tokens.' % (len(word2idx),))

Found 227538 unique tokens.


In [7]:
# Build the weight matrix later handed to the Embedding layer from the
# tokenizer's word ids and the pre-trained vectors.
embedding_matrix = load_embedding_matrix(
    word2idx,
    word2vec,
    MAX_VOCAB_SIZE,
    EMBEDDING_DIM,
)

Filling pre-trained embeddings...


In [21]:
# Bring every training sequence to MAX_SEQUENCE_LENGTH so the result is a
# single N x T integer matrix.
data = pad_sequences(sequences=sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', data.shape)

Shape of data tensor: (1306122, 150)


In [22]:
# Same padding for the test questions, so both matrices share MAX_SEQUENCE_LENGTH columns.
test_data = pad_sequences(sequences=test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of testdata tensor:', test_data.shape)

Shape of testdata tensor: (56370, 150)


In [23]:
# Hold out 30% of the labelled data with a fixed random_state.
# X_test / y_test are used below as the validation set during training and
# for the threshold search; the unlabelled test set lives in `test_data`.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    targets,
    test_size=0.3,
    random_state=42,
)

In [24]:
# Embedding layer initialised from the pre-trained matrix and frozen
# (trainable=False) so the vectors are not updated during training.
embedding_layer = Embedding(
    input_dim=MAX_VOCAB_SIZE,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

In [25]:
# Training hyperparameters: mini-batch size and number of passes over X_train.
BATCH_SIZE, EPOCHS = 512, 10

In [26]:

# Filename template for the checkpoint callback; the {epoch} and {val_loss}
# fields are filled in when a checkpoint is written.
model_cp_filepath = 'ffnn_embeddings.ep-{epoch:02d}-loss-{val_loss:.2f}.hdf5'

# Feed-forward classifier on top of the frozen embeddings: max-pool over the
# sequence axis, three ReLU dense layers (dropout after the first two), and a
# single sigmoid output unit.
model = Sequential([
    embedding_layer,
    GlobalMaxPooling1D(),
    Dense(256, activation='relu'),
    Dropout(rate=0.2),
    Dense(128, activation='relu'),
    Dropout(rate=0.2),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 150, 300)          9000000   
_________________________________________________________________
global_max_pooling1d_5 (Glob (None, 300)               0         
_________________________________________________________________
dense_15 (Dense)             (None, 256)               77056     
_________________________________________________________________
dropout_6 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_16 (Dense)             (None, 128)               32896     
_________________________________________________________________
dropout_7 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_17 (Dense)             (None, 64)                8256      
__________

In [27]:
print('Training model for {} epochs'.format(EPOCHS))

# Write the full model to disk whenever val_loss improves.
# NOTE(review): the callback only saves files; after fit() the in-memory
# `model` presumably still holds the last-epoch weights, not the best ones.
model_cp = ModelCheckpoint(
    model_cp_filepath,
    monitor='val_loss',
    verbose=0,
    save_best_only=True,
    save_weights_only=False,
    mode='auto',
    period=1,
)

# `r` keeps the object returned by fit() for later inspection.
r = model.fit(
    X_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_test, y_test),
    callbacks=[model_cp],
)

Training model for 10 epochs
Train on 914285 samples, validate on 391837 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [28]:
# Score the held-out split and scan decision thresholds between 0.1 and 0.9.
# predict() is given the training batch size explicitly: Keras' default of 32
# would need ~16x more steps over the ~392k validation rows for the same
# predictions.
val_predictions = model.predict(X_test, batch_size=BATCH_SIZE)

# threshold_search prints the F1 score per threshold (see the recorded output);
# its result is later read with .get('threshold').
best_threshold = threshold_search(y_test, val_predictions, min_threshold=0.1, max_threshold=0.9)

F1 score at threshold 0.1 is 0.49348509809794816
F1 score at threshold 0.11 is 0.5070646299696578
F1 score at threshold 0.12 is 0.5169442227302858
F1 score at threshold 0.13 is 0.5271973558904263
F1 score at threshold 0.14 is 0.536399021835072
F1 score at threshold 0.15 is 0.5428311645724336
F1 score at threshold 0.16 is 0.5490635210381801
F1 score at threshold 0.17 is 0.5553189811529059
F1 score at threshold 0.18 is 0.5602955234037038
F1 score at threshold 0.19 is 0.5653917181027168
F1 score at threshold 0.2 is 0.569245234635439
F1 score at threshold 0.21 is 0.5725433678401322
F1 score at threshold 0.22 is 0.5744827087252207
F1 score at threshold 0.23 is 0.5752611365702518
F1 score at threshold 0.24 is 0.5752481649108128
F1 score at threshold 0.25 is 0.5755709720165251
F1 score at threshold 0.26 is 0.5742601954163086
F1 score at threshold 0.27 is 0.5722757295053473
F1 score at threshold 0.28 is 0.5695906432748539
F1 score at threshold 0.29 is 0.5662414131501472
F1 score at threshold 0

In [15]:
# Predict on the padded test questions. The batch size is set explicitly to
# the training value instead of Keras' default of 32, which only slows
# inference down without changing the predictions.
submission_predictions = model.predict(test_data, batch_size=BATCH_SIZE)

# Binarise the sigmoid outputs with the threshold chosen on the held-out split.
submission_best_predictions = (submission_predictions > best_threshold.get('threshold')).astype(int)

In [16]:
# Pair each test question id with its binary prediction and write the result
# to submission.csv without the index column.
submission = pd.DataFrame({"qid": test["qid"].values}).assign(
    prediction=submission_best_predictions,
)
submission.to_csv("submission.csv", index=False)

In [17]:
# Development helper: pick up edits made to capstone_utils.py without
# restarting the kernel. Statement order matters here.
import importlib
import capstone_utils
# Re-execute the already-imported module object in place.
importlib.reload(capstone_utils)
# Re-run the star import so the notebook's global names are rebound to the
# reloaded definitions (names imported earlier would otherwise stay stale).
from capstone_utils import *

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
