In [28]:
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from tensorflow.python.keras.layers import *
from tensorflow.python.keras.models import Model
import numpy as np 
import pandas as pd 
import re
import nltk
from preprocess import *
from models import *

In [29]:
df = pd.read_csv("questions.csv")
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [30]:
# question_1, question_2 = df['question1'].to_list(), df['question2'].to_list()
# is_duplicate = df['is_duplicate'].to_list()
# preprocess_neural(question_1, question_2, is_duplicate)

In [31]:
df = df.sample(n=50000, random_state=1)
df.shape

(50000, 6)

In [32]:
q1_preprocessed, q2_preprocessed, is_duplicate = df['question1'].to_list(), df['question2'].to_list(), df['is_duplicate'].to_list()

Acquired Test data

In [33]:
MAX_WORDS_VOCAB = 200000
tokenizer = Tokenizer(num_words = MAX_WORDS_VOCAB, lower=False, split=" ")
tokenizer.fit_on_texts(list(df['question1'].values.astype(str))+list(df['question2'].values.astype(str)))

In [34]:
print("Number of words in vocabulary: ", len(tokenizer.word_index))

Number of words in vocabulary:  42981


In [35]:

q1_sequence = tokenizer.texts_to_sequences(df['question1'].values.astype(str))
q1_sequence = pad_sequences(q1_sequence, maxlen = 128)

q2_sequence = tokenizer.texts_to_sequences(df['question2'].values.astype(str))
q2_sequence = pad_sequences(q2_sequence, maxlen = 128)

In [36]:
windex = tokenizer.word_index

In [37]:
embedding_index = {}
with open('glove.6B.300d.txt','r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vectors = np.asarray(values[1:], 'float32')
        embedding_index[word] = vectors
    f.close()

In [38]:
embedding_matrix = np.random.random((len(windex)+1, 300))

for word, i in windex.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

print(embedding_matrix.shape)

(42982, 300)


In [39]:
print(len(windex)+1)

42982


In [40]:
#split the data into 70-20-10 train-validation-test with random state 42
from sklearn.model_selection import train_test_split
q1_train, q1_test, q2_train, q2_test, y_train, y_test = train_test_split(q1_sequence, q2_sequence, is_duplicate, test_size=0.1, random_state=42)
q1_train, q1_val, q2_train, q2_val, y_train, y_val = train_test_split(q1_train, q2_train, y_train, test_size=0.2, random_state=42)

In [41]:
import tensorflow as tf
y_train = tf.keras.utils.to_categorical(y_train, num_classes=2)
y_val = tf.keras.utils.to_categorical(y_val, num_classes=2)
y_test = tf.keras.utils.to_categorical(y_test, num_classes=2)

In [42]:
is_duplicate = np.array(is_duplicate)
print(is_duplicate.shape)

(50000,)


In [43]:
y_train, y_val, y_test = np.array(y_train), np.array(y_val), np.array(y_test)
print("Train: ", sum(y_train)/len(y_train))
print("Validation: ", sum(y_val)/len(y_val))
print("Test: ", sum(y_test)/len(y_test))

Train:  [0.6346111 0.3653889]
Validation:  [0.6388889 0.3611111]
Test:  [0.6274 0.3726]


In [45]:
model = BiLSTM(emb_mat = embedding_matrix, vocab_size = len(windex)+1, loss="categorical_crossentropy")

In [46]:
model.train_model()

In [47]:
model.train_model()

In [48]:
model.get_model_summary()

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_11 (InputLayer)          [(None, 128)]        0           []                               
                                                                                                  
 input_12 (InputLayer)          [(None, 128)]        0           []                               
                                                                                                  
 embedding_9 (Embedding)        (None, 128, 300)     12894600    ['input_11[0][0]']               
                                                                                                  
 embedding_10 (Embedding)       (None, 128, 300)     12894600    ['input_12[0][0]']               
                                                                                            

In [49]:
print(q1_val.shape)

(9000, 128)


In [50]:
model.model.fit([q1_train, q2_train], y_train, epochs = 4, validation_data = ([q1_val, q2_val], y_val), batch_size = 8, validation_batch_size=4, verbose = 1)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [None]:
y_pred = model.predict([q1_test, q2_test])
y_pred1d, y_actual1d = [], []
for i in range(len(y_test)):
    if(y_test[i][0] == 1):
        y_actual1d.append(0)
    else:
        y_actual1d.append(1)

for i in range(len(y_pred)):
    if(y_pred[i][0] > y_pred[i][1]):
        y_pred1d.append(0)
    else:
        y_pred1d.append(1)

from sklearn.metrics import accuracy_score, f1_score
print("Accuracy: ", accuracy_score(y_actual1d, y_pred1d))
print("F1 Score: ", f1_score(y_actual1d, y_pred1d))

Accuracy:  0.790922939954496
F1 Score:  0.7637004912961728
