# Semantics Similarity

**Installing the Libraries**

In [None]:
!pip install -U -q numpy
!pip install -U -q keras
!pip install -U -q scikit-learn
!pip install -U -q matplotlib
!pip install -U -q nltk
!pip install -U -q PyDrive 
!pip install -U -q pandas
!pip install -U -q https://download.pytorch.org/whl/cu100/torch-1.0.1-cp36-cp36m-win_amd64.whl
!pip install -U -q torchvision
!pip install --quiet tensorflow-hub
!pip install --quiet seaborn
!pip install --quiet "tensorflow>=1.7"

**Getting data from Google Drive**

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import json

In [None]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
file_ids = ["16-aKOfyeLQpBJlUHCJUGxWp4UsY2rvb3", "1oec77bHzg5a2oGshDuBe99jxMi80NlUo"]
file_names = ["train_translated.csv", "test_translated.csv"]

In [None]:
for each_id, each_name in zip(file_ids, file_names):
    download = drive.CreateFile({'id':each_id})
    download.GetContentFile(each_name)

# Dataset

In [1]:
import numpy as np
import pandas as pd
import multiprocessing

In [2]:
questions = pd.read_csv("E:\Datasets\quora\questions.csv")
questions.dropna(inplace=True)

In [3]:
questions1 = questions.iloc[:, 3].values
questions2 = questions.iloc[:, 4].values
is_duplicate_questions = questions.iloc[:, 5].values

In [4]:
questions.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [5]:
cores = multiprocessing.cpu_count()

In [6]:
length = is_duplicate_questions.shape[0]

# Natural Language Processing

**Text Preprocessing**

In [7]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.utils import simple_preprocess

In [8]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vsriv\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vsriv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vsriv\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
lemma = WordNetLemmatizer()
stopword = stopwords.words("english")

In [10]:
sentences_1 = []
for i in questions1:
    tempx = re.sub(r"[^A-Za-z]", " ", str(i))
    tempx = tempx.lower().split()
    tempx = [word for word in tempx if word not in stopword]
    tempx = [lemma.lemmatize(word, pos="a") for word in tempx]
    tempx = [lemma.lemmatize(word, pos="r") for word in tempx]
    tempx = [lemma.lemmatize(word, pos="n") for word in tempx]
    tempx = [lemma.lemmatize(word, pos="v") for word in tempx]
    sentences_1.append(" ".join(tempx))
sentences_1 = np.asarray(sentences_1)

In [11]:
sentences_2 = []
for i in questions2:
    tempx = re.sub(r"[^A-Za-z]", " ", str(i))
    tempx = tempx.lower().split()
    tempx = [word for word in tempx if word not in stopword]
    tempx = [lemma.lemmatize(word, pos="a") for word in tempx]
    tempx = [lemma.lemmatize(word, pos="r") for word in tempx]
    tempx = [lemma.lemmatize(word, pos="n") for word in tempx]
    tempx = [lemma.lemmatize(word, pos="v") for word in tempx]
    sentences_2.append(" ".join(tempx))
sentences_2 = np.asarray(sentences_2)

In [12]:
dataset_no = 5
print(sentences_1[dataset_no])
print(sentences_2[dataset_no])
print(is_duplicate_questions[dataset_no])

astrology capricorn sun cap moon cap rise say
triple capricorn sun moon ascendant capricorn say
1


# Text Comparisions

**Count Vectorizor**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
count_vectorizer = CountVectorizer()

In [None]:
count_vectorizer.fit(np.append(sentences_1, sentences_2, axis=0))

In [None]:
count_vectorizer_1 = count_vectorizer.transform(sentences_1)

In [None]:
count_vectorizer_2 = count_vectorizer.transform(sentences_2)

In [18]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances

In [None]:
distance_and_similarity_scores = []

In [None]:
for i in range(0, 20):
    temp = {}
    temp['cosine_similarity'] = cosine_similarity(count_vectorizer_1[i], count_vectorizer_2[i])[0][0]
    temp['manhattan_distance'] = euclidean_distances(count_vectorizer_1[i], count_vectorizer_2[i])[0][0]
    temp['euclidean_distance'] = manhattan_distances(count_vectorizer_1[i], count_vectorizer_2[i])[0][0]
    distance_and_similarity_scores.append(temp)
#try to give this to log reg to find the similarity between these data

In [None]:
for i in range(0, 20):
    print(is_duplicate_questions[i], distance_and_similarity_scores[i])

**Tfidf Vectorizor**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tfidf_vectorizer = TfidfVectorizer()

In [None]:
tfidf_vectorizer.fit(np.append(sentences_1, sentences_2, axis=0))

In [None]:
tfid_vectorizer_1 = tfidf_vectorizer.transform(sentences_1)

In [None]:
tfid_vectorizer_2 = tfidf_vectorizer.transform(sentences_2)

In [None]:
distance_and_similarity_scores_2 = []

In [None]:
for i in range(0, 20):
    temp = {}
    temp['cosine_similarity'] = cosine_similarity(tfid_vectorizer_1[i], tfid_vectorizer_2[i])[0][0]
    temp['manhattan_distance'] = euclidean_distances(tfid_vectorizer_1[i], tfid_vectorizer_2[i])[0][0]
    temp['euclidean_distance'] = manhattan_distances(tfid_vectorizer_1[i], tfid_vectorizer_2[i])[0][0]
    distance_and_similarity_scores_2.append(temp)

In [None]:
for i in range(0, 20):
    print(is_duplicate_questions[i], distance_and_similarity_scores_2[i])

**LSA Method**

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=True)

In [None]:
svd_model = TruncatedSVD(n_components=300,
                         algorithm='randomized',
                         n_iter=10, random_state=42)

In [None]:
lsa_model1 = Pipeline([('tfidf', vectorizer), 
                            ('svd', svd_model)])

In [None]:
lsa_model2 = Pipeline([('tfidf', vectorizer), 
                            ('svd', svd_model)])

In [None]:
lsa_1 = lsa_model1.fit_transform(sentences_1)
lsa_2 = lsa_model2.fit_transform(sentences_2)

In [None]:
distance_and_similarity_scores_3 = []

In [None]:
for i in range(0, 20):
    temp = {}
    temp['cosine_similarity'] = cosine_similarity(np.asarray([lsa_1[i]]), np.asarray([lsa_2[i]]))[0][0]
    temp['manhattan_distance'] = euclidean_distances(np.asarray([lsa_1[i]]), np.asarray([lsa_2[i]]))[0][0]
    temp['euclidean_distance'] = manhattan_distances(np.asarray([lsa_1[i]]), np.asarray([lsa_2[i]]))[0][0]
    distance_and_similarity_scores_3.append(temp)

In [None]:
for i in range(0, 20):
    print(is_duplicate_questions[i], distance_and_similarity_scores_3[i])

**Word2Vec model(Using Mean to get the sentence vectors)**

In [None]:
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors

In [None]:
google_model = KeyedVectors.load_word2vec_format("E:\Models\pre_trained\word2vec\google\google.300d.bin", binary=True)
#wiki_model = KeyedVectors.load_word2vec_format("models/pretrained/glove/wiki/wiki.300d.txt", binary=False)
#common_crawl_model = KeyedVectors.load_word2vec_format("models/pretrained/glove/common_crawl/common_crawl.300d.txt", binary=False)

In [None]:
def sentence_vectorizer(model, sentence):
    vectors =[]
    num = 0
    for i in sentence.split():
        try:
            if num == 0:
                vectors = model[i]
            else:
                vectors = np.add(vectors, model[i])
            num += 1
        except:
            pass
    return np.array(vectors) / num

In [None]:
sent_vec1 = []
for each in sentences_1:
    temp = sentence_vectorizer(google_model, each)
    if temp.shape[0] != 0:
        sent_vec1.append(temp)
    else:
        sent_vec1.append(np.zeros((300,)))
sent_vec1 = np.asarray(sent_vec1)

In [None]:
sent_vec2 = []
for each in sentences_2:
    temp = sentence_vectorizer(google_model, each)
    if temp.shape[0] != 0:
        sent_vec2.append(temp)
    else:
        sent_vec2.append(np.zeros((300,)))
sent_vec2 = np.asarray(sent_vec2)

In [None]:
distance_and_similarity_scores_4 = []

In [None]:
for i in range(0, 20):
    temp = {}
    temp['cosine_similarity'] = cosine_similarity(np.asarray([sent_vec1[i]]), np.asarray([sent_vec2[i]]))[0][0]
    temp['manhattan_distance'] = euclidean_distances(np.asarray([sent_vec1[i]]), np.asarray([sent_vec2[i]]))[0][0]
    temp['euclidean_distance'] = manhattan_distances(np.asarray([sent_vec1[i]]), np.asarray([sent_vec2[i]]))[0][0]
    distance_and_similarity_scores_4.append(temp)

In [None]:
for i in range(0, 20):
    print(is_duplicate_questions[i], distance_and_similarity_scores_4[i])

**Doc2Vec Model**

Abisek Update this and use quora questions dataset

**InferText model**

https://github.com/facebookresearch/InferSent

In [13]:
import torch
import numpy as np
from random import randint

Load Model

In [14]:
from models.infersent.models import InferSent
model_version = 1
MODEL_PATH = "models\infersent\infersent%s.pickle" % model_version
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
model = InferSent(params_model)
model.load_state_dict(torch.load(MODEL_PATH))

In [15]:
use_cuda = False
model = model.cuda() if use_cuda else model # Keep it on CPU or put it on GPU

In [16]:
W2V_PATH = 'E:\Models\pre_trained\glove\commoncrawl\common_crawl.300d.txt'
model.set_w2v_path(W2V_PATH)

In [17]:
model.build_vocab_k_words(K=100000)

Vocab size : 100000


Encode Sentences

In [28]:
embeddings_1.shape

(20, 4096)

In [20]:
embeddings_1 = model.encode(sentences_1[:20], bsize=128, tokenize=False, verbose=True)

Nb words kept : 139/152 (91.4%)
Speed : 25.0 sentences/s (cpu mode, bsize=128)


In [21]:
embeddings_2 = model.encode(sentences_2[:20], bsize=128, tokenize=False, verbose=True)

Nb words kept : 140/150 (93.3%)
Speed : 35.7 sentences/s (cpu mode, bsize=128)


In [22]:
distance_and_similarity_scores_6 = []

In [23]:
for i in range(0, 20):
    temp = {}
    temp['cosine_similarity'] = cosine_similarity(np.asarray([embeddings_1[i]]), np.asarray([embeddings_2[i]]))[0][0]
    temp['manhattan_distance'] = euclidean_distances(np.asarray([embeddings_1[i]]), np.asarray([embeddings_2[i]]))[0][0]
    temp['euclidean_distance'] = manhattan_distances(np.asarray([embeddings_1[i]]), np.asarray([embeddings_2[i]]))[0][0]
    distance_and_similarity_scores_6.append(temp)

In [25]:
for i in range(0, 20):
    print(is_duplicate_questions[i], distance_and_similarity_scores_6[i])

0 {'cosine_similarity': 0.9458276, 'manhattan_distance': 1.2538241, 'euclidean_distance': 32.184387501109086}
0 {'cosine_similarity': 0.6067528, 'manhattan_distance': 2.9255688, 'euclidean_distance': 112.79561321428446}
0 {'cosine_similarity': 0.8873254, 'manhattan_distance': 1.8598092, 'euclidean_distance': 72.58798747658739}
0 {'cosine_similarity': 0.5393939, 'manhattan_distance': 3.1896555, 'euclidean_distance': 120.62491884012707}
0 {'cosine_similarity': 0.75635266, 'manhattan_distance': 3.1802003, 'euclidean_distance': 131.53287864351518}
1 {'cosine_similarity': 0.8199016, 'manhattan_distance': 2.2952108, 'euclidean_distance': 86.52014614462496}
0 {'cosine_similarity': 0.3553719, 'manhattan_distance': 3.3347626, 'euclidean_distance': 130.02840969695353}
1 {'cosine_similarity': 0.96099097, 'manhattan_distance': 0.75167567, 'euclidean_distance': 19.646674617991266}
0 {'cosine_similarity': 1.0, 'manhattan_distance': 0.0, 'euclidean_distance': 1.2594475265359506e-05}
0 {'cosine_simila

**Sentence Encoder V2**

https://tfhub.dev/google/universal-sentence-encoder/2

In [34]:
!pip3 install --quiet tensorflow-hub

You are using pip version 18.1, however version 19.0.3 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [35]:
import tensorflow as tf
import tensorflow_hub as hub

W0310 12:42:12.050889  7768 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


In [36]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder/2" #@param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"]

In [37]:
embed = hub.Module(module_url)

In [38]:
with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    sentences_embeddings_1 = session.run(embed(sentences_1[:20]))

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0310 13:10:26.869572  7768 tf_logging.py:115] Saver not created because there are no variables in the graph to restore


In [39]:
with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    sentences_embeddings_2 = session.run(embed(sentences_2[:20]))

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0310 13:10:56.016236  7768 tf_logging.py:115] Saver not created because there are no variables in the graph to restore


In [40]:
distance_and_similarity_scores_8 = []

In [41]:
for i in range(0, 20):
    temp = {}
    temp['cosine_similarity'] = cosine_similarity(np.asarray([sentences_embeddings_1[i]]), np.asarray([sentences_embeddings_2[i]]))[0][0]
    temp['manhattan_distance'] = euclidean_distances(np.asarray([sentences_embeddings_1[i]]), np.asarray([sentences_embeddings_2[i]]))[0][0]
    temp['euclidean_distance'] = manhattan_distances(np.asarray([sentences_embeddings_1[i]]), np.asarray([sentences_embeddings_2[i]]))[0][0]
    distance_and_similarity_scores_8.append(temp)

In [42]:
for i in range(0, 20):
    print(is_duplicate_questions[i], distance_and_similarity_scores_8[i])

0 {'cosine_similarity': 0.9486568, 'manhattan_distance': 0.3204472, 'euclidean_distance': 5.245485807812656}
0 {'cosine_similarity': 0.6202224, 'manhattan_distance': 0.8715247, 'euclidean_distance': 14.994461547388255}
0 {'cosine_similarity': 0.8335015, 'manhattan_distance': 0.57705903, 'euclidean_distance': 9.815707650494005}
0 {'cosine_similarity': 0.2900136, 'manhattan_distance': 1.1916263, 'euclidean_distance': 21.74811126565328}
0 {'cosine_similarity': 0.49464017, 'manhattan_distance': 1.0053457, 'euclidean_distance': 17.698465278358526}
1 {'cosine_similarity': 0.8778694, 'manhattan_distance': 0.49422783, 'euclidean_distance': 7.790173144967412}
0 {'cosine_similarity': 0.16489124, 'manhattan_distance': 1.2923691, 'euclidean_distance': 23.196336368258926}
1 {'cosine_similarity': 0.9448141, 'manhattan_distance': 0.33222255, 'euclidean_distance': 5.783593213651329}
0 {'cosine_similarity': 1.0, 'manhattan_distance': 0.0, 'euclidean_distance': 0.0}
0 {'cosine_similarity': 0.83476865, '

# CNN and TimeDistributed

https://www.kaggle.com/zhihang/an-ensemble-approach-cnn-and-timedistributed

In [14]:
import numpy as np
import pandas as pd

In [15]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [16]:
import re
import matplotlib.pyplot as plt
import datetime, time, json
from string import punctuation

In [17]:
from collections import defaultdict
from sklearn.metrics import accuracy_score

In [20]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [19]:
from keras import initializers
from keras import backend as K
from keras.optimizers import SGD
from keras.regularizers import l2
from keras.models import Sequential
from keras.layers import Embedding, Dense, Dropout, Reshape, BatchNormalization, TimeDistributed, Lambda, Activation, LSTM, Flatten, Convolution1D, GRU, MaxPooling1D
from keras.callbacks import Callback, ModelCheckpoint, EarlyStopping

In [21]:
all_questions = dataset_p_l_rms_l_1.tolist() + dataset_p_l_rms_l_2.tolist()

In [22]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_questions)

In [23]:
question1_word_sequences = tokenizer.texts_to_sequences(sentences_1.tolist())
question2_word_sequences = tokenizer.texts_to_sequences(sentences_2.tolist())

In [24]:
word_index = tokenizer.word_index

In [25]:
max_question_len = 0
for each in range(length):
    max_question_len = max(max_question_len, len(question1_word_sequences[each]), len(question2_word_sequences[each]))
print(max_question_len)

97


In [26]:
train_q1 = pad_sequences(question1_word_sequences,
                              maxlen = max_question_len)

In [27]:
train_q2 = pad_sequences(question2_word_sequences,
                              maxlen = max_question_len)

In [28]:
embeddings_index = {}
with open('E:/Models/pre_trained/glove/wiki/wiki.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split(' ')
        word = values[0]
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embedding
print('Word embeddings:', len(embeddings_index))

Word embeddings: 400001


In [29]:
embedding_dim = 300

In [30]:
nb_words = len(word_index)
word_embedding_matrix = np.zeros((nb_words + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        word_embedding_matrix[i] = embedding_vector

print('Null word embeddings: %d' % np.sum(np.sum(word_embedding_matrix, axis=1) == 0)) #75,334

Null word embeddings: 20431


In [31]:
units = 128 # Number of nodes in the Dense layers
dropout = 0.25 # Percentage of nodes to drop
nb_filter = 32 # Number of filters to use in Convolution1D
filter_length = 3 # Length of filter for Convolution1D

In [32]:
weights = initializers.TruncatedNormal(mean=0.0, stddev=0.05, seed=2)
bias = bias_initializer='zeros'

In [50]:
from keras.layers import Input

In [51]:
model_1_input = Input(shape = (max_question_len,), dtype = 'int32', name = 'model_1_input')
model_1_embedding = Embedding(nb_words + 1,
                     embedding_dim,
                     weights = [word_embedding_matrix], 
                     input_length = max_question_len,
                     trainable = False)(model_1_input)
model_1_conv_a = Convolution1D(filters = nb_filter, 
                         kernel_size = filter_length, 
                         padding = 'same')(model_1_embedding)
model_1_batch_a = BatchNormalization()(model_1_conv_a)
model_1_act = Activation('relu')(model_1_batch_a)
model_1_drop_a = Dropout(dropout)(model_1_act)
model_1_conv_b = Convolution1D(filters = nb_filter, 
                         kernel_size = filter_length, 
                         padding = 'same')(model_1_drop_a)
model_1_batch_b = BatchNormalization()(model_1_conv_b)
model_1_act_b = Activation('relu')(model_1_batch_b)
model_1_drop_b = Dropout(dropout)(model_1_act_b)
model_1_flat = Flatten()(model_1_drop_b)

In [52]:
model_2_input = Input(shape = (max_question_len,), dtype = 'int32', name = 'model_2_input')
model_2_embedding = Embedding(nb_words + 1,
                     embedding_dim,
                     weights = [word_embedding_matrix], 
                     input_length = max_question_len,
                     trainable = False)(model_2_input)
model_2_conv_a = Convolution1D(filters = nb_filter, 
                         kernel_size = filter_length, 
                         padding = 'same')(model_2_embedding)
model_2_batch_a = BatchNormalization()(model_2_conv_a)
model_2_act = Activation('relu')(model_2_batch_a)
model_2_drop_a = Dropout(dropout)(model_2_act)
model_2_conv_b = Convolution1D(filters = nb_filter, 
                         kernel_size = filter_length, 
                         padding = 'same')(model_2_drop_a)
model_2_batch_b = BatchNormalization()(model_2_conv_b)
model_2_act_b = Activation('relu')(model_2_batch_b)
model_2_drop_b = Dropout(dropout)(model_2_act_b)
model_2_flat = Flatten()(model_2_drop_b)

In [53]:
model_3_input = Input(shape = (max_question_len,), dtype = 'int32', name = 'model_3_input')
model_3_embedding = Embedding(nb_words + 1,
                     embedding_dim,
                     weights = [word_embedding_matrix],
                     input_length = max_question_len,
                     trainable = False)(model_3_input)
model_3_time_distributed = TimeDistributed(Dense(embedding_dim))(model_3_embedding)
model_3_batch = BatchNormalization()(model_3_time_distributed)
model_3_act = Activation('relu')(model_3_batch)
model_3_drop = Dropout(dropout)(model_3_act)
model_3_lambda = Lambda(lambda x: K.max(x, axis=1), output_shape=(embedding_dim, ))(model_3_drop)

In [54]:
model_4_input = Input(shape = (max_question_len,), dtype = 'int32', name = 'model_4_input')
model_4_embedding = Embedding(nb_words + 1,
                     embedding_dim,
                     weights = [word_embedding_matrix],
                     input_length = max_question_len,
                     trainable = False)(model_4_input)
model_4_time_distributed = TimeDistributed(Dense(embedding_dim))(model_4_embedding)
model_4_batch = BatchNormalization()(model_4_time_distributed)
model_4_act = Activation('relu')(model_4_batch)
model_4_drop = Dropout(dropout)(model_4_act)
model_4_lambda = Lambda(lambda x: K.max(x, axis=1), output_shape=(embedding_dim, ))(model_4_drop)

In [55]:
from keras.layers import concatenate

In [56]:
merge_layer = concatenate([model_1_flat, model_2_flat, model_3_lambda, model_4_lambda], name = 'merge_layer')

In [57]:
t = Dense(200, activation = 'relu', name = 'dense1')(merge_layer)
t = Dropout(0.3)(t)
t = BatchNormalization()(t)

In [58]:
t = Dense(200, activation = 'relu', name  ='dense2')(t)
t = Dropout(0.3)(t)
t = BatchNormalization()(t)

In [59]:
t = Dense(100, activation= 'relu',name = 'dense3')(t)
t = Dropout(0.3)(t)
t = BatchNormalization()(t)

In [60]:
final_output = Dense(1, activation = 'sigmoid')(t)

In [61]:
from keras.models import Model

In [62]:
model = Model(inputs = [model_1_input, model_2_input, model_3_input, model_4_input], outputs = final_output)

In [63]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [64]:
save_best_weights = 'question_pairs_weights.h5'

In [65]:
t0 = time.time()
callbacks = [ModelCheckpoint(save_best_weights, monitor='val_loss', save_best_only=True),
             EarlyStopping(monitor='val_loss', patience=5, verbose=1, mode='auto')]
history = model.fit([train_q1, train_q2, train_q1, train_q2],
                    questions.is_duplicate,
                    batch_size=256,
                    epochs=1, #Use 100, I reduce it for Kaggle,
                    validation_split=0.15,
                    verbose=True,
                    shuffle=True,
                    callbacks=callbacks)
t1 = time.time()
print("Minutes elapsed: %f" % ((t1 - t0) / 60.))

Train on 343695 samples, validate on 60653 samples
Epoch 1/1
  7680/343695 [..............................] - ETA: 58:10 - loss: 0.7693 - acc: 0.5598

KeyboardInterrupt: 

In [None]:
summary_stats = pd.DataFrame({'epoch': [ i + 1 for i in history.epoch ],
                              'train_acc': history.history['acc'],
                              'valid_acc': history.history['val_acc'],
                              'train_loss': history.history['loss'],
                              'valid_loss': history.history['val_loss']})

In [None]:
summary_stats

In [None]:
plt.plot(summary_stats.train_loss) # blue
plt.plot(summary_stats.valid_loss) # green
plt.show()

In [None]:
min_loss, idx = min((loss, idx) for (idx, loss) in enumerate(history.history['val_loss']))
print('Minimum loss at epoch', '{:d}'.format(idx+1), '=', '{:.4f}'.format(min_loss))
min_loss = round(min_loss, 4)

In [None]:
model.load_weights(save_best_weights)
predictions = model.predict([test_q1, test_q2, test_q1, test_q2], verbose = True)

**Siamese Neural Networks(Using LSTM and GRU)**

https://medium.com/mlreview/implementing-malstm-on-kaggles-quora-question-pairs-competition-8b31b0b16a07

In [None]:
from keras.models import Model
import keras.backend as backend
from keras.layers import Input, Embedding, LSTM, Lambda, GRU, Dropout

In [None]:
vocabulary = dict()
inverse_vocabulary = ['<unk>']

In [None]:
q2n_left = []
for sentence in sentences_1.tolist():
    temp_sentence = []
    for word in sentence.split():
        if word not in vocabulary:
            vocabulary[word] = len(inverse_vocabulary)
            temp_sentence.append(len(inverse_vocabulary))
            inverse_vocabulary.append(word)
        else:
            temp_sentence.append(vocabulary[word])
    q2n_left.append(temp_sentence)

In [None]:
q2n_right = []
for sentence in sentences_2.tolist():
    temp_sentence = []
    for word in sentence.split():
        if word not in vocabulary:
            vocabulary[word] = len(inverse_vocabulary)
            temp_sentence.append(len(inverse_vocabulary))
            inverse_vocabulary.append(word)
        else:
            temp_sentence.append(vocabulary[word])
    q2n_right.append(temp_sentence)

In [None]:
embedding_dim = 300
embeddings = np.zeros((len(vocabulary) + 1, embedding_dim))
embeddings[0] = 0

In [None]:
for word, index in vocabulary.items():
    if word in google_model.vocab:
        embeddings[index] = google_model.word_vec(word)

In [None]:
del google_model

In [None]:
from keras.preprocessing.sequence import pad_sequences

In [None]:
dataset_left = q2n_left

In [None]:
dataset_right = q2n_right

In [None]:
max_seq_length = 0
for each in range(length):
    max_seq_length = max(max_seq_length, len(q2n_left[each]), len(q2n_right[each]))
print(max_seq_length)

In [None]:
dataset_left = pad_sequences(q2n_left, maxlen=max_seq_length)
dataset_right = pad_sequences(q2n_right, maxlen=max_seq_length)

In [None]:
dataset_left.shape == dataset_right.shape

In [None]:
n_hidden1 = 512
n_hidden2 = 384
n_hidden3 = 256
n_hidden4 = 128

In [None]:
left_input = Input(shape=(max_seq_length, ), dtype='int32')
right_input = Input(shape=(max_seq_length, ), dtype='int32')

In [None]:
embedding_layer = Embedding(len(embeddings), embedding_dim, weights=[embeddings], 
                            input_length=max_seq_length, trainable=False)

In [None]:
encoded_left = embedding_layer(left_input)
encoded_right = embedding_layer(right_input)

In [None]:
shared_lstm1 = LSTM(n_hidden1, return_sequences=True)
shared_dropout1 = Dropout(0.3)
shared_gru1 = GRU(n_hidden2, return_sequences=True)
shared_dropout2 = Dropout(0.4)
shared_gru2 = GRU(n_hidden3, return_sequences=True)
shared_dropout3 = Dropout(0.3)
shared_lstm2 = LSTM(n_hidden4, return_sequences=False)

In [None]:
left_lstm1 = shared_lstm1(encoded_left)
left_dropout1 = shared_dropout1(left_lstm1)
left_gru1 = shared_gru1(left_dropout1)
left_dropout2 = shared_dropout2(left_gru1)
left_gru2 = shared_gru2(left_dropout2)
left_dropout3 = shared_dropout3(left_gru2)
left_lstm2 = shared_lstm2(left_dropout3)

In [None]:
right_lstm1 = shared_lstm1(encoded_right)
right_dropout1 = shared_dropout1(right_lstm1)
right_gru1 = shared_gru1(right_dropout1)
right_dropout2 = shared_dropout2(right_gru1)
right_gru2 = shared_gru2(right_dropout2)
right_dropout3 = shared_dropout3(right_gru2)
right_lstm2 = shared_lstm2(right_dropout3)

In [None]:
manhattan_distance_for_lstm = Lambda(function=lambda x: backend.exp(-backend.sum(backend.abs(x[0]-x[1]), axis=1, keepdims=True)),
                                     output_shape=lambda x: (x[0][0], 1))([left_lstm2, right_lstm2])

**Training and Validation**

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
stratkfold = StratifiedKFold(n_splits=2, random_state=None, shuffle=True)

In [None]:
for train_index, test_index in stratkfold.split(dataset_left, is_duplicate_questions):
    siamese_network = Model([left_input, right_input], manhattan_distance_for_lstm)
    siamese_network.compile(loss='mean_squared_error', optimizer="adam", metrics=['accuracy'])
    siamese_network.fit([dataset_left[train_index], dataset_right[train_index]], is_duplicate_questions[train_index], batch_size=128, 
                        epochs=128, validation_data=([dataset_left[test_index], dataset_right[test_index]], is_duplicate_questions[test_index]))

# Spell Corrector

**Word Corrector**

In [None]:
import re
from collections import Counter

In [None]:
def words(text): 
    return re.findall(r'\w+', text.lower())

In [None]:
WORDS = Counter(words(open('data/big.txt').read()))

In [None]:
def probability(word, n=sum(WORDS.values())): 
    return WORDS[word] / n

In [None]:
def correction(word): 
    return max(candidates(word), key=probability)

In [None]:
def candidates(word): 
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])

In [None]:
def known(words):
    return set(w for w in words if w in WORDS)

In [None]:
def edits1(word):
    letters    = 'abcdefghijklmnopqrstuvwxyz'
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    inserts    = [L + c + R               for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

In [None]:
def edits2(word):
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))

**Sentence Corrector**

In [None]:
import json

In [None]:
with open("data/words_dictionary.json") as words_dictionary_file:
    word_dict = json.load(words_dictionary_file)

In [None]:
def correct_sentences(sentence):
    sentence = sentence.lower().split()
    combination_sentences = []
    combination_probabilities = []
    meta_data = {}
    for each in sentence:
        if not word_dict.get(each, None):
            possible_words = candidates(each)
            probabilities = []
            for each_word in possible_words:
                probabilities.append(probability(each_word))
            meta_data[each] = [list(possible_words), list(probabilities)]
    for i in range(len(sentence)):
        if meta_data.get(sentence[i], None):
            for each in meta_data[sentence[i]][0]:
                combination_sentences.append(" ".join(sentence[:i]) + " " + each + " ".join(sentence[i+1:]))
    return combination_sentences

In [None]:
correct_sentences("Every thing comes with a pricee")