In [2]:
# Note: you may need to update your version of future
# sudo pip install -U future

import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import re
import string

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, GlobalMaxPooling1D, GlobalAveragePooling1D, CuDNNLSTM, CuDNNGRU, Conv1D, Activation
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, Flatten, Bidirectional, GlobalMaxPool1D

from keras.models import Model, Sequential
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, roc_curve, accuracy_score
from sklearn.model_selection import GridSearchCV,train_test_split, KFold
from keras.callbacks import ModelCheckpoint

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer

from capstone_utils import *

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Pandas display options: overall text width of 200 characters, and up to 2000
# characters per column before a cell's contents are truncated.
pd.set_option('display.width', 200)
pd.set_option('max_colwidth', 2000)

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Error loading WordNet: Package 'WordNet' not found in
[nltk_data]     index


In [19]:
# Config
# Seed both NumPy's and TensorFlow's random number generators up front.
from numpy.random import seed
from tensorflow import set_random_seed

seed(1)
set_random_seed(2)

# Text / embedding dimensions
MAX_SEQUENCE_LENGTH = 300  # `maxlen` passed to pad_sequences and the Embedding input length
MAX_VOCAB_SIZE = 30000     # `num_words` for the Tokenizer and the Embedding input size
EMBEDDING_DIM = 300        # Embedding output size (the loaded vectors are glove.840B.300d)

# Training schedule
BATCH_SIZE = 128           # `batch_size` passed to model.fit
EPOCHS = 4                 # `epochs` passed to model.fit

In [5]:
# List the contents of ../input/embeddings.
# NOTE(review): the GloVe vectors are loaded from ../data/glove.840B.300d/ further
# down, not from this directory — confirm which location is current.
!ls ../input/embeddings

GoogleNews-vectors-negative300	paragram_300_sl999
glove.840B.300d			wiki-news-300d-1M


In [11]:
# Load the data from ../data with cleaning and lower-casing/stop-word handling
# switched off. `load_data` is presumably provided by the capstone_utils star
# import — TODO confirm.
train, test, corpus = load_data(
    '../data',
    clean=False,
    lower_stop=False,
)

Loading questions...
Done loading train - Loading test
Done loading test


In [12]:
# Read the pre-trained GloVe vectors. Per the recorded output, lines whose
# values cannot be parsed as floats are reported by the loader.
word2vec = load_embeddings(
    path='../data/glove.840B.300d/glove.840B.300d.txt',
)

Loading word vectors...
Word: . - could not convert string to float: '.'
Word: at - could not convert string to float: 'name@domain.com'
Word: . - could not convert string to float: '.'
Word: to - could not convert string to float: 'name@domain.com'
Word: . - could not convert string to float: '.'
Word: . - could not convert string to float: '.'
Word: email - could not convert string to float: 'name@domain.com'
Word: or - could not convert string to float: 'name@domain.com'
Word: contact - could not convert string to float: 'name@domain.com'
Word: Email - could not convert string to float: 'name@domain.com'
Word: on - could not convert string to float: 'name@domain.com'
Word: At - could not convert string to float: 'Killerseats.com'
Word: by - could not convert string to float: 'name@domain.com'
Word: in - could not convert string to float: 'mylot.com'
Word: emailing - could not convert string to float: 'name@domain.com'
Word: Contact - could not convert string to float: 'name@domain.c

In [14]:
# Convert the question strings into integer sequences.
# The tokenizer is fitted on the training questions only and then applied to
# both the training and the test questions.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(train["question_text"])

sequences = tokenizer.texts_to_sequences(train["question_text"])
test_sequences = tokenizer.texts_to_sequences(test['question_text'])

# Labels for the training questions, as an array.
targets = train['target'].values

In [17]:
# Word -> integer index mapping learned by the tokenizer.
word2idx = tokenizer.word_index
n_unique_tokens = len(word2idx)
print('Found %s unique tokens.' % n_unique_tokens)

Found 222161 unique tokens.


In [22]:
# Build the matrix later handed to the Embedding layer as its initial weights.
embedding_matrix = load_embedding_matrix(
    word2idx,
    word2vec,
    MAX_VOCAB_SIZE,
    EMBEDDING_DIM,
)

Filling pre-trained embeddings...


In [23]:
# Bring every training sequence to length MAX_SEQUENCE_LENGTH so the result is
# a single N x T matrix (pad_sequences is used with its default settings).
data = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
)
print('Shape of data tensor:', data.shape)

Shape of data tensor: (1306122, 300)


In [24]:
# Same padding for the test questions, so they match the model's input length.
test_data = pad_sequences(
    test_sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
)
print('Shape of testdata tensor:', test_data.shape)

Shape of testdata tensor: (56370, 300)


In [26]:
# Hold out 30% of the padded training questions. The X_test / y_test pair is used
# as validation data during fitting and for the threshold search; it is distinct
# from `test_data`, which is only used for the submission predictions.
# NOTE(review): the split is not stratified on `targets` — consider `stratify=targets`
# if the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    targets,
    test_size=0.3,
    random_state=42,
)

In [27]:
# Embedding layer initialised from the pre-trained matrix and kept frozen
# (trainable=False), so its weights are not updated during training.
embedding_layer = Embedding(
    input_dim=MAX_VOCAB_SIZE,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

In [28]:
print('Building model...')

# 1-D CNN classifier: frozen embeddings -> three Conv1D(128, kernel 3) stages with
# max pooling between them -> global average pooling -> dense head ending in a
# single sigmoid unit.
model = Sequential([
    embedding_layer,
    Conv1D(128, 3, activation='relu'),
    MaxPooling1D(3),
    Conv1D(128, 3, activation='relu'),
    MaxPooling1D(3),
    Conv1D(128, 3, activation='relu'),
    GlobalAveragePooling1D(),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.summary()

Building model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 300, 300)          9000000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 298, 128)          115328    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 99, 128)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 97, 128)           49280     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 32, 128)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 30, 128)           49280     
_________________________________________________________________
global_average_pooling1d_1 ( (None, 128)               0  

In [23]:
# Fit on the training split, evaluating on the held-out split after each epoch.
# NOTE(review): the recorded output of this cell shows 5 epochs while the config
# cell sets EPOCHS = 4 — the output predates the current config; re-run to refresh.
print('Training model for {} epochs'.format(EPOCHS))
r = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
)

Training model for 5 epochs
Train on 914285 samples, validate on 391837 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [25]:
# Score the held-out split, then search for a decision threshold. Per the recorded
# output, threshold_search reports the F1 score at each candidate threshold; its
# result is later read with .get('threshold').
val_predictions = model.predict(x=X_test)
best_threshold = threshold_search(
    y_test,
    val_predictions,
)

F1 score at threshold 0.1 is 0.46143153331448494
F1 score at threshold 0.11 is 0.47325908701505326
F1 score at threshold 0.12 is 0.4822246043256196
F1 score at threshold 0.13 is 0.4902811411403633
F1 score at threshold 0.14 is 0.4973613726166168
F1 score at threshold 0.15 is 0.5027133094255363
F1 score at threshold 0.16 is 0.5077271221989182
F1 score at threshold 0.17 is 0.5131908663456937
F1 score at threshold 0.18 is 0.5203693801754411
F1 score at threshold 0.19 is 0.531051448061454
F1 score at threshold 0.2 is 0.5352112676056339
F1 score at threshold 0.21 is 0.5389111426300285
F1 score at threshold 0.22 is 0.5422367026744902
F1 score at threshold 0.23 is 0.5471798595981603
F1 score at threshold 0.24 is 0.55050736497545
F1 score at threshold 0.25 is 0.5533735819014131
F1 score at threshold 0.26 is 0.5576590330788805
F1 score at threshold 0.27 is 0.5597154688063778
F1 score at threshold 0.28 is 0.5619319566807117
F1 score at threshold 0.29 is 0.5641902010183047
F1 score at threshold 0

In [27]:
# Score the padded test questions and turn the model outputs into 0/1 labels using
# the threshold found on the held-out split.
submission_predictions = model.predict(test_data)
# Fix: this line previously compared `cnn_submission_predictions`, a name that is not
# defined in any visible cell (NameError on a fresh kernel). The predictions computed
# on the line above are the intended operand.
submission_best_predictions = (submission_predictions > best_threshold.get('threshold')).astype(int)

In [28]:
# Assemble the submission: one row per test question id with its 0/1 prediction.
submission = pd.DataFrame({"qid":test["qid"].values})
# NOTE(review): the predictions presumably have shape (N, 1) because the model ends in
# a single-unit layer — verify before moving this column into the constructor above.
submission['prediction'] = submission_best_predictions
# Written to the current working directory, without the DataFrame index column.
submission.to_csv("submission.csv", index=False)

In [21]:
# Development helper: reload capstone_utils so edits to the module take effect
# without restarting the kernel, then re-run the star import to refresh the names
# it exposes. NOTE(review): this cell sits at the end of the notebook and is not
# needed on a clean top-to-bottom run.
import importlib
import capstone_utils
importlib.reload(capstone_utils)
from capstone_utils import *

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
