## import packages

In [1]:
import numpy as np
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn import metrics

from tensorflow import keras
from tensorflow.keras import layers

from keras.layers import Input, Embedding, Bidirectional, CuDNNLSTM, GlobalMaxPool1D, Dense, Dropout, CuDNNGRU
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


## read data

In [2]:
# Directory holding the competition CSV files (trailing slash is required
# because the file names are appended by plain string concatenation).
dir_file = '/glade/scratch/wmingch/ML_project/Quora/'
train_path = dir_file + "train.csv"
test_path = dir_file + "test.csv"

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Quick sanity check on the (rows, columns) of each frame.
print("Train shape : ", train_df.shape)
print("Test shape : ", test_df.shape)

Train shape :  (1306122, 3)
Test shape :  (375806, 2)


In [3]:
# Show the first five training rows as the cell's rich output.
train_df.head(n=5)

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


## split train into train and validation 

In [4]:
# Hold out 10% of the labelled rows for validation. Passing `stratify` makes
# the split actually stratified on `target`, so both parts keep the same class
# proportions; without it train_test_split draws a plain random split.
# NOTE: this rebinds train_df, so re-running the cell without reloading the
# CSV would split the already-reduced training frame a second time.
train_df, val_df = train_test_split(
    train_df,
    test_size=0.1,
    random_state=2018,
    stratify=train_df['target'],
)

In [5]:
# Raw question strings for every split, pulled out as arrays via `.values`.
train_X, val_X, test_X = (
    frame['question_text'].values for frame in (train_df, val_df, test_df)
)

# Target column for the two labelled splits (kept as pandas Series).
train_y, val_y = train_df['target'], val_df['target']

## tokenization

In [6]:
# Vocabulary size: number of distinct words kept (rows of the embedding matrix).
max_features = 50_000
# Sequence length: number of word positions used per question.
maxlen = 100
# Dimensionality of each word vector.
embed_size = 300

In [7]:
# Build the word index from the training questions only; validation and test
# text is later encoded with this same fitted tokenizer.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(texts=train_X)

In [8]:
%%time

# Replace the question strings of every split with their integer sequences.
# The tuple of inputs is captured before any name is rebound, so each split
# is encoded from its own original text.
train_X, val_X, test_X = (
    tokenizer.texts_to_sequences(texts) for texts in (train_X, val_X, test_X)
)

CPU times: user 30.5 s, sys: 310 ms, total: 30.8 s
Wall time: 30.8 s


In [9]:
%%time

# Bring every encoded question to a common length of `maxlen` so the splits
# can be fed to the fixed-width model input.
train_X, val_X, test_X = (
    pad_sequences(seqs, maxlen=maxlen) for seqs in (train_X, val_X, test_X)
)

CPU times: user 8.22 s, sys: 211 ms, total: 8.43 s
Wall time: 8.49 s


## GRU model, without pretrained Embeddings

In [10]:
# Binary text classifier whose embedding layer is trained from scratch
# (no pretrained word vectors are loaded).
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size)(inp)
# return_sequences=True keeps one output per position so the pooling layer
# below can take the maximum across the sequence.
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
# Single sigmoid unit, matching the binary cross-entropy loss used below.
x = Dense(1, activation='sigmoid')(x)

model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the layer table itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 300)          15000000  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 128)          140544    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                2064      
_________________________________________________________________
drop

In [11]:
%%time
# Train for two epochs, evaluating on the held-out validation split after each.
model.fit(
    x=train_X,
    y=train_y,
    batch_size=512,
    epochs=2,
    validation_data=(val_X, val_y),
)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 1175509 samples, validate on 130613 samples
Epoch 1/2
Epoch 2/2
CPU times: user 1min 35s, sys: 25.4 s, total: 2min 1s
Wall time: 2min 22s


<keras.callbacks.History at 0x2b19542a6dd8>

In [12]:
%%time
# Model outputs for the validation questions, used for threshold selection.
pred_noemb_val_y = model.predict(x=val_X, batch_size=1024, verbose=1)

CPU times: user 1.21 s, sys: 327 ms, total: 1.53 s
Wall time: 1.59 s


In [13]:
# Sweep decision thresholds from 0.10 to 0.50 in steps of 0.01, report the
# validation F1 at each one, and keep the earliest threshold with the top score.
scored_thresholds = []
for thresh in np.round(np.arange(0.1, 0.501, 0.01), 2):
    val_labels = (pred_noemb_val_y > thresh).astype('int')
    f1_score = metrics.f1_score(val_y, val_labels)
    print("F1 score at threshold {} is {}".format(thresh, f1_score))
    scored_thresholds.append((f1_score, thresh))

# max() returns the first maximal entry, i.e. the lowest threshold on ties.
f1_score_best, thresh_best = max(scored_thresholds, key=lambda pair: pair[0])
print("Best F1 score at threshold {} is {}".format(thresh_best, f1_score_best))

F1 score at threshold 0.1 is 0.550544794188862
F1 score at threshold 0.11 is 0.5606201550387597
F1 score at threshold 0.12 is 0.5687485157919734
F1 score at threshold 0.13 is 0.5769758617905869
F1 score at threshold 0.14 is 0.5837860082304526
F1 score at threshold 0.15 is 0.5899949723479136
F1 score at threshold 0.16 is 0.5962939297124601
F1 score at threshold 0.17 is 0.6027005972474682
F1 score at threshold 0.18 is 0.6075726961258017
F1 score at threshold 0.19 is 0.6129995099567871
F1 score at threshold 0.2 is 0.6171010434075613
F1 score at threshold 0.21 is 0.621504096672312
F1 score at threshold 0.22 is 0.6257828083685114
F1 score at threshold 0.23 is 0.6286305103863146
F1 score at threshold 0.24 is 0.6323858496405276
F1 score at threshold 0.25 is 0.6345412490362375
F1 score at threshold 0.26 is 0.6374676876554651
F1 score at threshold 0.27 is 0.6406003159557662
F1 score at threshold 0.28 is 0.6438698083067094
F1 score at threshold 0.29 is 0.6451515304321516
F1 score at threshold 0.

In [14]:
# Score the test questions and binarise with the threshold chosen on validation.
pred_noemb_test_y = model.predict(x=test_X, batch_size=1024, verbose=1)
test_labels = (pred_noemb_test_y > thresh_best).astype('int')

# One row per test question: its id plus the 0/1 prediction.
out_df = pd.DataFrame({'qid': test_df['qid'].values})
out_df['prediction'] = test_labels
out_df.to_csv('submission.csv', index=False)

# Preview the first rows of the submission as the cell output.
out_df.head()



Unnamed: 0,qid,prediction
0,0000163e3ea7c7a74cd7,1
1,00002bd4fb5d505b9161,0
2,00007756b4a147d2b0b3,0
3,000086e4b7e1c7146103,0
4,0000c4c3fbe8785a3090,0
