In [2]:
#import libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#plot figures in the notebook wihtout the need to call plt.show()
%matplotlib inline 
plt.style.use("seaborn-ticks") 

import time
import os
print(os.listdir("../input")) #Print directories/folders in the directory: current_working_directory/input/embeddings

['test.csv', 'sample_submission.csv', 'train.csv', 'embeddings.zip']


In [3]:
#load data

train=pd.read_csv("../input/train.csv")
test=pd.read_csv("../input/test.csv")

train.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [4]:
test.head()

Unnamed: 0,qid,question_text
0,0000163e3ea7c7a74cd7,Why do so many women become so rude and arroga...
1,00002bd4fb5d505b9161,When should I apply for RV college of engineer...
2,00007756b4a147d2b0b3,What is it really like to be a nurse practitio...
3,000086e4b7e1c7146103,Who are entrepreneurs?
4,0000c4c3fbe8785a3090,Is education really making good people nowadays?


In [25]:
from tensorflow.keras.preprocessing.text import Tokenizer


In [7]:
#turn text into vector

num_possible_tokens=10000 #choosen random

tokenizer=Tokenizer(num_words=num_possible_tokens)
tokenizer.fit_on_texts(train.question_text) 
sequences_train=tokenizer.texts_to_sequences(train.question_text) 
sequences_test=tokenizer.texts_to_sequences(test.question_text) 

In [9]:
#see longest vector, we had to build same length vectors

max_len=np.max([len(i) for i in sequences_train]+[len(i) for i in sequences_test])
print(max_len)

221


In [26]:
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [27]:
X=pad_sequences(sequences_train,maxlen=max_len) #Pad the training data
X_test=pad_sequences(sequences_test,maxlen=max_len) #Pad the test data

y=train.target.values 
print(X[0:20,:]) 
print(X.shape)

[[   0    0    0 ...    6    1 8333]
 [   0    0    0 ...   10   44 1846]
 [   0    0    0 ...  374  451 5546]
 ...
 [   0    0    0 ...  623    7    4]
 [   0    0    0 ...  253  141 2211]
 [   0    0    0 ...  746    4 4164]]
(1306122, 221)


In [12]:
from sklearn.model_selection import train_test_split

#split into training and validation data

X_train,X_val,y_train,y_val=train_test_split(X,y, test_size=0.2, random_state=42)

In [28]:
import tensorflow.keras as keras
from tensorflow.keras import layers



In [29]:
embedding_dimension=30 # Arbitraily choose an embedding dimension
model=keras.models.Sequential()

model.add(layers.Embedding(num_possible_tokens+1,embedding_dimension,input_length=max_len)) # Creat embedding layer as described above
model.add(layers.Flatten())  
model.add(layers.Dense(32, activation='relu')) 
model.add(layers.Dense(1,activation='sigmoid')) 
model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['accuracy'])  
model.summary() 

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 221, 30)           300030    
_________________________________________________________________
flatten_1 (Flatten)          (None, 6630)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                212192    
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33        
Total params: 512,255
Trainable params: 512,255
Non-trainable params: 0
_________________________________________________________________


In [30]:
batch_size=1000
epochs=3 #Choose number of epochs to train

history=model.fit(X_train,y_train,epochs=epochs,batch_size=batch_size,validation_data=[X_val,y_val])

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1044897 samples, validate on 261225 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [16]:
from sklearn.metrics import f1_score

val_pred=model.predict(X_val,batch_size=batch_size).ravel() 

f1_score(y_val,val_pred>0.5)

0.5340213331279603

In [24]:
val_pred[0:50]

array([1.14108807e-05, 1.91738014e-04, 7.49048719e-04, 6.63008495e-06,
       1.59340980e-05, 1.65216290e-04, 3.49356647e-04, 6.20187609e-04,
       2.17802197e-04, 5.77530009e-04, 1.87929757e-02, 4.90615750e-03,
       3.45543463e-04, 2.35823039e-02, 3.35563682e-02, 2.64665228e-04,
       1.01024304e-02, 9.16616682e-06, 2.22983537e-03, 1.74042769e-03,
       1.08064241e-05, 2.73406086e-03, 5.14894903e-01, 1.13727066e-04,
       2.66896814e-01, 3.89706739e-03, 1.84121460e-03, 3.02312016e-01,
       1.55087779e-04, 4.01350576e-03, 1.10858819e-03, 3.64085659e-03,
       3.79229710e-03, 6.83338614e-04, 3.48131813e-04, 1.37706287e-03,
       2.24568998e-03, 1.59355113e-03, 6.16387241e-02, 3.61944325e-02,
       6.28875205e-05, 1.01308839e-03, 9.41174012e-03, 3.34805972e-03,
       6.76100581e-06, 1.79109380e-01, 6.55492485e-01, 1.75425157e-01,
       9.03492910e-05, 8.54135081e-02, 6.09888099e-02, 3.52201913e-03,
       3.45850065e-02, 4.69778895e-01, 1.44435763e-02, 9.02284955e-05,
      

In [17]:
Threshold=[] 
f1=[] 

for i in np.arange(0.1, 0.501, 0.01):
    Threshold.append(i)
    temp_val_pred=val_pred>i 
    temp_val_pred=temp_val_pred.astype(int) 
    score=f1_score(y_val,temp_val_pred) 
    f1.append(score) #store f1 score
    print("Threshold: {} \t F1 Score: {}".format(np.round(i,2),score))

Threshold: 0.1 	 F1 Score: 0.5465583618902567
Threshold: 0.11 	 F1 Score: 0.555284269271015
Threshold: 0.12 	 F1 Score: 0.5631137617147214
Threshold: 0.13 	 F1 Score: 0.5702859244276914
Threshold: 0.14 	 F1 Score: 0.5760169292264284
Threshold: 0.15 	 F1 Score: 0.5811653149142019
Threshold: 0.16 	 F1 Score: 0.5836751737859547
Threshold: 0.17 	 F1 Score: 0.5876193100683919
Threshold: 0.18 	 F1 Score: 0.5910970442595334
Threshold: 0.19 	 F1 Score: 0.5938499129214212
Threshold: 0.2 	 F1 Score: 0.5966733764169934
Threshold: 0.21 	 F1 Score: 0.5998222030656503
Threshold: 0.22 	 F1 Score: 0.6015930802288342
Threshold: 0.23 	 F1 Score: 0.603126912591109
Threshold: 0.24 	 F1 Score: 0.6035786859336194
Threshold: 0.25 	 F1 Score: 0.6040926286827926
Threshold: 0.26 	 F1 Score: 0.6043458052521496
Threshold: 0.27 	 F1 Score: 0.6031596321622258
Threshold: 0.28 	 F1 Score: 0.6034153331741103
Threshold: 0.29 	 F1 Score: 0.6023499562026158
Threshold: 0.3 	 F1 Score: 0.6021294823155061
Threshold: 0.31 	 

In [18]:
best_threshold=Threshold[np.argmax(f1)] #Get threshold at index of largest f1 score.
best_threshold

0.2599999999999999

In [19]:
test_pred=model.predict(X_test,batch_size=4096).ravel() #Predict test data

df=pd.DataFrame({'qid':test.qid.values,'prediction':test_pred}) #Create dataframe of unique id's and predicted target 
df.prediction=(df.prediction>best_threshold).astype(int) #Convert target to binary 
df.head()

Unnamed: 0,qid,prediction
0,0000163e3ea7c7a74cd7,1
1,00002bd4fb5d505b9161,0
2,00007756b4a147d2b0b3,0
3,000086e4b7e1c7146103,0
4,0000c4c3fbe8785a3090,0
