In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/quora-insincere-questions-classification/sample_submission.csv
/kaggle/input/quora-insincere-questions-classification/embeddings.zip
/kaggle/input/quora-insincere-questions-classification/train.csv
/kaggle/input/quora-insincere-questions-classification/test.csv


In [2]:
# Unpack the pretrained-embedding archive; with no target directory given, the
# folders (e.g. ./glove.840B.300d/) are created in the current working directory.
!unzip ../input/quora-insincere-questions-classification/embeddings.zip

Archive:  ../input/quora-insincere-questions-classification/embeddings.zip
   creating: GoogleNews-vectors-negative300/
   creating: glove.840B.300d/
   creating: paragram_300_sl999/
   creating: wiki-news-300d-1M/
  inflating: glove.840B.300d/glove.840B.300d.txt  
  inflating: GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin  
  inflating: wiki-news-300d-1M/wiki-news-300d-1M.vec  
  inflating: paragram_300_sl999/README.txt  
  inflating: paragram_300_sl999/paragram_300_sl999.txt  


In [3]:
import os
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

In [4]:
# Directory holding the competition CSV files.
dir_file = '../input/quora-insincere-questions-classification/'

# Read the training and test frames from that directory.
train_df, test_df = (
    pd.read_csv(dir_file + csv_name) for csv_name in ("train.csv", "test.csv")
)

# Report (rows, columns) for each frame.
for label, frame in (("Train shape : ", train_df), ("Test shape : ", test_df)):
    print(label, frame.shape)

Train shape :  (1306122, 3)
Test shape :  (375806, 2)


In [5]:
# Preview the first rows of the training frame (qid, question_text, target).
train_df.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [6]:
# Hyper-parameters shared by the tokenizer, the padding step and the embedding layer.
embed_size = 300      # dimensionality of each word vector
max_features = 50000  # number of unique words to use (rows of the embedding matrix)
maxlen = 100          # maximum number of words kept per question

# Hold out 10% of the labelled data for validation (fixed seed for a stable split).
# NOTE: this rebinds train_df to the 90% portion, so re-running the cell splits again.
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=2018)

# Raw question text for each split, as NumPy arrays.
train_X, val_X, test_X = (
    frame['question_text'].values for frame in (train_df, val_df, test_df)
)

# Binary targets for the labelled splits.
train_y, val_y = train_df['target'], val_df['target']

In [7]:
%%time

## Tokenize 
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_X)

CPU times: user 21.4 s, sys: 209 ms, total: 21.6 s
Wall time: 22.1 s


In [8]:
%%time

train_X = tokenizer.texts_to_sequences(train_X)
val_X = tokenizer.texts_to_sequences(val_X)
test_X = tokenizer.texts_to_sequences(test_X)

CPU times: user 27.2 s, sys: 236 ms, total: 27.4 s
Wall time: 27.5 s


In [9]:
%%time

## Pad the sentences
train_X = pad_sequences(train_X, maxlen=maxlen)
val_X = pad_sequences(val_X, maxlen=maxlen)
test_X = pad_sequences(test_X, maxlen=maxlen)

CPU times: user 12.4 s, sys: 593 ms, total: 13 s
Wall time: 13 s


## GloVe embeddings

In [10]:
%%time

EMBEDDING_FILE = './glove.840B.300d/glove.840B.300d.txt'

def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE))

CPU times: user 3min 53s, sys: 5.05 s, total: 3min 58s
Wall time: 4min


In [11]:
%%time

all_embs = np.stack(embeddings_index.values())
print(all_embs.shape)
emb_mean, emb_std = all_embs.mean(), all_embs.std() #cal mean and std over flattened array
embed_size = all_embs.shape[1]
print(emb_mean)
print(emb_std)
print(embed_size)

  call = lambda f, *a, **k: f(*a, **k)


(2196016, 300)
-0.005838499
0.48782197
300
CPU times: user 5.45 s, sys: 4.14 s, total: 9.59 s
Wall time: 9.9 s


In [12]:
# word -> integer index mapping learned by the tokenizer
word_index = tokenizer.word_index
print(len(word_index))
# Keras word indices start at 1 (0 is reserved for padding), so a vocabulary of
# V words needs V + 1 rows; cap at max_features.
nb_words = min(max_features, len(word_index) + 1)
print(nb_words)

209286
50000


In [13]:
%%time

"""
here may be incorrect, go over the word_index.items, including the stopwords, could try later 
https://stackoverflow.com/questions/46202519/keras-tokenizer-num-words-doesnt-seem-to-work

useful link: https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
"""
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
print(embedding_matrix.shape)
for word, i in word_index.items():
    if i >= max_features:
        continue 
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

(50000, 300)
CPU times: user 670 ms, sys: 21 ms, total: 691 ms
Wall time: 692 ms


In [14]:

# Bidirectional GRU classifier on top of frozen pretrained embeddings.
inp = Input(shape=(maxlen,))
# Take input_dim from the weight matrix itself so the layer and its weights
# cannot disagree on the number of rows.
x = Embedding(embedding_matrix.shape[0], embed_size, weights=[embedding_matrix], trainable=False)(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)  # max over the time axis
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation='sigmoid')(x)  # single probability output

model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None; wrapping it in print()
# only adds a stray "None" line.
model.summary()


2021-11-14 00:26:52.031397: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-14 00:26:52.131381: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-14 00:26:52.132060: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-14 00:26:52.133737: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 100, 300)          15000000  
_________________________________________________________________
bidirectional (Bidirectional (None, 100, 128)          140544    
_________________________________________________________________
global_max_pooling1d (Global (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 16)                2064      
_________________________________________________________________
dropout (Dropout)            (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17    

In [15]:
%%time 

model.fit(train_X, train_y, batch_size=512, epochs=4, validation_data=(val_X, val_y))

2021-11-14 00:26:54.572908: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 470203600 exceeds 10% of free system memory.
2021-11-14 00:26:55.013088: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/4


2021-11-14 00:26:57.072895: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


Epoch 2/4
Epoch 3/4
Epoch 4/4
CPU times: user 3min 25s, sys: 7.65 s, total: 3min 32s
Wall time: 4min 23s


<keras.callbacks.History at 0x7f3870004a10>

In [16]:
%%time
pred_noemb_val_y = model.predict(val_X, batch_size=1024, verbose=1)

CPU times: user 1.46 s, sys: 76.2 ms, total: 1.54 s
Wall time: 2.83 s


In [17]:
# Sweep the decision threshold over 0.10, 0.11, ..., 0.50 and keep the first
# threshold that reaches the highest validation F1.
thresh_best, f1_score_best = None, float('-inf')
for thresh in np.round(np.arange(0.1, 0.501, 0.01), 2):
    val_labels = (pred_noemb_val_y > thresh).astype('int')
    f1_score = metrics.f1_score(val_y, val_labels)
    print("F1 score at threshold {} is {}".format(thresh, f1_score))
    # Strict '>' keeps the earliest threshold on ties.
    if f1_score > f1_score_best:
        thresh_best, f1_score_best = thresh, f1_score
print("Best F1 score at threshold {} is {}".format(thresh_best, f1_score_best))

F1 score at threshold 0.1 is 0.577059240028322
F1 score at threshold 0.11 is 0.5856109324758842
F1 score at threshold 0.12 is 0.593370528388579
F1 score at threshold 0.13 is 0.6006763244687513
F1 score at threshold 0.14 is 0.6076701170880705
F1 score at threshold 0.15 is 0.6144500452371721
F1 score at threshold 0.16 is 0.6196013289036545
F1 score at threshold 0.17 is 0.6248116969428446
F1 score at threshold 0.18 is 0.6285252380097
F1 score at threshold 0.19 is 0.6329044869754967
F1 score at threshold 0.2 is 0.6374810109100953
F1 score at threshold 0.21 is 0.6409086677218137
F1 score at threshold 0.22 is 0.6451369927502119
F1 score at threshold 0.23 is 0.6481137909709338
F1 score at threshold 0.24 is 0.650463518901004
F1 score at threshold 0.25 is 0.6534730306558013
F1 score at threshold 0.26 is 0.6565849361327265
F1 score at threshold 0.27 is 0.6581129095933675
F1 score at threshold 0.28 is 0.6594051526907391
F1 score at threshold 0.29 is 0.662086258776329
F1 score at threshold 0.3 is 

In [18]:
# Score the test questions and binarise with the threshold chosen on validation.
pred_noemb_test_y = model.predict(test_X, batch_size=1024, verbose=1)
test_labels = (pred_noemb_test_y > thresh_best).astype('int')

# Assemble the submission: one row per test qid with its 0/1 prediction.
out_df = pd.DataFrame({'qid': test_df['qid'].values})
out_df['prediction'] = test_labels
out_df.to_csv('submission.csv', index=False)

out_df.head()



Unnamed: 0,qid,prediction
0,0000163e3ea7c7a74cd7,1
1,00002bd4fb5d505b9161,0
2,00007756b4a147d2b0b3,0
3,000086e4b7e1c7146103,0
4,0000c4c3fbe8785a3090,0
