**Themes**

In [1]:
!pip install jupyterthemes
!jt -t monokai



You are using pip version 18.1, however version 19.0.3 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


# Semantics Similarity

**Installing the Libraries**

In [None]:
!pip install -U -q numpy
!pip install -U -q keras
!pip install -U -q scikit-learn
!pip install -U -q matplotlib
!pip install -U -q nltk
!pip install -U -q PyDrive 
!pip install -U -q pandas
!pip3 install --quiet tensorflow-hub
!pip3 install --quiet seaborn
!pip3 install --quiet "tensorflow>=1.7"

**Getting data from Google Drive**

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import json

In [None]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
file_ids = ["16-aKOfyeLQpBJlUHCJUGxWp4UsY2rvb3", "1oec77bHzg5a2oGshDuBe99jxMi80NlUo"]
file_names = ["train_translated.csv", "test_translated.csv"]

In [None]:
for each_id, each_name in zip(file_ids, file_names):
    download = drive.CreateFile({'id':each_id})
    download.GetContentFile(each_name)

# Dataset

In [2]:
import numpy as np
import pandas as pd
import multiprocessing

In [3]:
test = pd.read_csv("datasets/msr-para-train.tsv", sep='\t', error_bad_lines=False, skip_blank_lines=True, keep_default_na=False)
test.dropna(inplace=True)

b'Skipping line 102: expected 5 fields, saw 6\nSkipping line 656: expected 5 fields, saw 6\nSkipping line 867: expected 5 fields, saw 6\nSkipping line 880: expected 5 fields, saw 6\nSkipping line 980: expected 5 fields, saw 6\nSkipping line 1439: expected 5 fields, saw 6\nSkipping line 1473: expected 5 fields, saw 6\nSkipping line 1822: expected 5 fields, saw 6\nSkipping line 1952: expected 5 fields, saw 6\nSkipping line 2009: expected 5 fields, saw 6\nSkipping line 2230: expected 5 fields, saw 6\nSkipping line 2506: expected 5 fields, saw 6\nSkipping line 2523: expected 5 fields, saw 6\nSkipping line 2809: expected 5 fields, saw 6\nSkipping line 2887: expected 5 fields, saw 6\nSkipping line 2920: expected 5 fields, saw 6\nSkipping line 2944: expected 5 fields, saw 6\nSkipping line 3241: expected 5 fields, saw 6\nSkipping line 3358: expected 5 fields, saw 6\nSkipping line 3459: expected 5 fields, saw 6\n'


In [4]:
test1 = test.iloc[:, 3].values
test2 = test.iloc[:, 4].values
res = test.iloc[:, 0].values

In [5]:
questions = pd.read_csv("datasets/questions.csv")

In [10]:
questions1 = questions.iloc[:, 3].values
questions2 = questions.iloc[:, 4].values
is_duplicate_questions = questions.iloc[:, 5].values

In [8]:
questions.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
cores = multiprocessing.cpu_count()

In [5]:
length = res.shape[0]

# Natural Language Processing

**Text Preprocessing**

In [11]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.utils import simple_preprocess

In [12]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vsriv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vsriv\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [13]:
lemma = WordNetLemmatizer()
stopword = stopwords.words("english")

In [14]:
dataset_p_l_rms_l_1 = []
dataset_p_l_rms_1 = []
for i in questions1:
    tempx = re.sub(r"[^A-Za-z]", " ", str(i))
    tempx = tempx.lower().split()
    tempx = [word for word in tempx if word not in stopword]
    dataset_p_l_rms_1.append(" ".join(tempx))
    tempx = [lemma.lemmatize(word, pos="a") for word in tempx]
    tempx = [lemma.lemmatize(word, pos="r") for word in tempx]
    tempx = [lemma.lemmatize(word, pos="n") for word in tempx]
    tempx = [lemma.lemmatize(word, pos="v") for word in tempx]
    dataset_p_l_rms_l_1.append(" ".join(tempx))
dataset_p_l_rms_l_1 = np.asarray(dataset_p_l_rms_l1)
dataset_p_l_rms_1 = np.asarray(dataset_p_l_rms_1)

In [15]:
dataset_p_l_rms_l_2 = []
dataset_p_l_rms_2 = []
for i in questions2:
    tempx = re.sub(r"[^A-Za-z]", " ", str(i))
    tempx = tempx.lower().split()
    tempx = [word for word in tempx if word not in stopword]
    dataset_p_l_rms_2.append(" ".join(tempx))
    tempx = [lemma.lemmatize(word, pos="a") for word in tempx]
    tempx = [lemma.lemmatize(word, pos="r") for word in tempx]
    tempx = [lemma.lemmatize(word, pos="n") for word in tempx]
    tempx = [lemma.lemmatize(word, pos="v") for word in tempx]
    dataset_p_l_rms_l_2.append(" ".join(tempx))
dataset_p_l_rms_l_2 = np.asarray(dataset_p_l_rms_l_2)
dataset_p_l_rms_2 = np.asarray(dataset_p_l_rms_2)

In [16]:
max_words = 0
for each in dataset_p_l_rms_l_1:
    max_words = max(len(each.split()), max_words)

for each in dataset_p_l_rms_l_2:
    max_words = max(len(each.split()), max_words)
print(max_words)

97


# Text Comparisions

**Count Vectorizor**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
count_vectorizer = CountVectorizer()

In [None]:
count_vectorizer.fit(np.append(train, test, axis=0))

In [None]:
vectorized_train = count_vectorizer.transform(train)

In [None]:
vectorized_test = count_vectorizer.transform(test)

**Tfidf Vectorizor**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tfidf_vectorizer = TfidfVectorizer()

In [None]:
tfidf_vectorizer.fit(np.append(train, test, axis=0))

In [None]:
vectorized_train = tfidf_vectorizer.transform(train)

In [None]:
vectorized_test = tfidf_vectorizer.transform(test)

**LSA Method**

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=True)

In [19]:
svd_model = TruncatedSVD(n_components=300,
                         algorithm='randomized',
                         n_iter=10, random_state=42)

In [20]:
lsa_model1 = Pipeline([('tfidf', vectorizer), 
                            ('svd', svd_model)])

In [21]:
lsa_model2 = Pipeline([('tfidf', vectorizer), 
                            ('svd', svd_model)])

In [25]:
lsa_test1 = lsa_model1.fit_transform(dataset_p_l_rms_l_1)
lsa_test2 = lsa_model2.fit_transform(dataset_p_l_rms_l_2)

In [26]:
from scipy.spatial.distance import cosine

In [27]:
from math import isnan

In [29]:
founded_lsa = []
for i in range(lsa_test1.shape[0]):
    temp = cosine(lsa_test1[i], lsa_test2[i])
    if not isnan(temp):
        founded_lsa.append(1 - int(temp))
    else:
        founded_lsa.append(0)
founded_lsa = np.asarray(founded_lsa)

  dist = 1.0 - uv / np.sqrt(uu * vv)


**Word2Vec model(Using Mean to get the sentence vectors)**

In [None]:
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors

In [None]:
wiki_model = KeyedVectors.load_word2vec_format("models/pretrained/glove/wiki/wiki.300d.txt", binary=False)
#google_model = KeyedVectors.load_word2vec_format("models/pretrained/word2vec/google/google.300d.bin", binary=True)
#common_crawl_model = KeyedVectors.load_word2vec_format("models/pretrained/glove/common_crawl/common_crawl.300d.txt", binary=False)

In [None]:
def sentence_vectorizer(model, sentence):
    vectors =[]
    num = 0
    for i in sentence.split():
        try:
            if num == 0:
                vectors = model[i]
            else:
                vectors = np.add(vectors, model[i])
            num += 1
        except:
            pass
    return np.array(vectors) / num

In [None]:
sent_vec1 = []
for each in dataset_p_l_rms_1:
    temp = sentence_vectorizer(google_model, each)
    if temp.shape[0] != 0:
        sent_vec1.append(temp)
    else:
        sent_vec1.append(np.zeros((300,)))
sent_vec1 = np.asarray(sent_vec1)

In [None]:
sent_vec2 = []
for each in dataset_p_l_rms_2:
    temp = sentence_vectorizer(google_model, each)
    if temp.shape[0] != 0:
        sent_vec2.append(temp)
    else:
        sent_vec2.append(np.zeros((300,)))
sent_vec2 = np.asarray(sent_vec2)

In [None]:
founded_sent2vec = []
for i in range(length):
    temp = cosine(sent_vec1[i], sent_vec2[i])
    if not isnan(temp):
        founded_sent2vec.append(1 - int(temp))
    else:
        founded_sent2vec.append(0)
founded_sent2vec = np.asarray(founded_sent2vec)

**Sent2Vec model(Fast.ai)**

https://github.com/epfml/sent2vec

**Doc2Vec Model**

**Sentence Encoder V2**

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns

In [None]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder/2" #@param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"]

In [None]:
embed = hub.Module(module_url)

In [None]:
word = "Elephant"
sentence = "I am a sentence for which I would like to get its embedding."
paragraph = (
    "Universal Sentence Encoder embeddings also support short paragraphs. "
    "There is no hard limit on how long the paragraph is. Roughly, the longer "
    "the more 'diluted' the embedding will be.")
messages = [word, sentence, paragraph]

In [None]:
tf.logging.set_verbosity(tf.logging.ERROR)

In [None]:
with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    message_embeddings = session.run(embed(messages))
    for i, message_embedding in enumerate(np.array(message_embeddings).tolist()):
        print("Message: {}".format(messages[i]))
        print("Embedding size: {}".format(len(message_embedding)))
        message_embedding_snippet = ", ".join(
            (str(x) for x in message_embedding[:3]))
        print("Embedding: [{}, ...]\n".format(message_embedding_snippet))

In [None]:
def plot_similarity(labels, features, rotation):
    corr = np.inner(features, features)
    sns.set(font_scale=1.2)
    g = sns.heatmap(
        corr,
        xticklabels=labels,
        yticklabels=labels,
        vmin=0,
        vmax=1,
        cmap="YlOrRd")
    g.set_xticklabels(labels, rotation=rotation)
    g.set_title("Semantic Textual Similarity")

In [None]:
def run_and_plot(session_, input_tensor_, messages_, encoding_tensor):
    message_embeddings_ = session_.run(
        encoding_tensor, feed_dict={input_tensor_: messages_})
    plot_similarity(messages_, message_embeddings_, 90)

In [None]:
messages = [
    "I am an Indian",
    "I am from India",
    "I am not from India",
    "I play cricket",
    "I watch television",
]

In [None]:
similarity_input_placeholder = tf.placeholder(tf.string, shape=(None))
similarity_message_encodings = embed(similarity_input_placeholder)

In [None]:
with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())
    run_and_plot(session, similarity_input_placeholder, messages,
         similarity_message_encodings)

**Siamese Neural Networks(Using LSTM and GRU)**

https://medium.com/mlreview/implementing-malstm-on-kaggles-quora-question-pairs-competition-8b31b0b16a07

In [None]:
from keras.models import Model
import keras.backend as backend
from keras.layers import Input, Embedding, LSTM, Lambda, GRU, Dropout

In [None]:
embedding_dim = 300
embeddings = 1 * np.random.randn(len(test1) + 1, embedding_dim)

In [None]:
n_hidden1 = 512
n_hidden2 = 384
n_hidden3 = 256
n_hidden4 = 128

In [None]:
left_input = Input(shape=(max_words, ), dtype='int32')
right_input = Input(shape=(max_words, ), dtype='int32')

In [None]:
embedding_layer = Embedding(len(embeddings), embedding_dim, weights=[embeddings], 
                            input_length=max_words, trainable=False)

In [None]:
encoded_left = embedding_layer(left_input)
encoded_right = embedding_layer(right_input)

In [None]:
shared_lstm1 = LSTM(n_hidden1, return_sequences=True)
shared_dropout1 = Dropout(0.3)
shared_gru1 = GRU(n_hidden2, return_sequences=True)
shared_dropout2 = Dropout(0.4)
shared_gru2 = GRU(n_hidden3, return_sequences=True)
shared_dropout3 = Dropout(0.3)
shared_lstm2 = LSTM(n_hidden4, return_sequences=False)

In [None]:
left_lstm1 = shared_lstm1(encoded_left)
left_dropout1 = shared_dropout1(left_lstm1)
left_gru1 = shared_gru1(left_dropout1)
left_dropout2 = shared_dropout2(left_gru1)
left_gru2 = shared_gru2(left_dropout2)
left_dropout3 = shared_dropout3(left_gru2)
left_lstm2 = shared_lstm2(left_dropout3)

In [None]:
right_lstm1 = shared_lstm1(encoded_right)
right_dropout1 = shared_dropout1(right_lstm1)
right_gru1 = shared_gru1(right_dropout1)
right_dropout2 = shared_dropout2(right_gru1)
right_gru2 = shared_gru2(right_dropout2)
right_dropout3 = shared_dropout3(right_gru2)
right_lstm2 = shared_lstm2(right_dropout3)

In [None]:
manhattan_distance_for_lstm = Lambda(function=lambda x: backend.exp(-backend.sum(backend.abs(x[0]-x[1]), axis=1, keepdims=True)),
                                     output_shape=lambda x: (x[0][0], 1))([left_lstm2, right_lstm2])

**Training and Validation**

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
stratkfold = StratifiedKFold(n_splits=2, random_state=None, shuffle=True)

In [None]:
for train_index, test_index in stratkfold.split(test1, res):
    siamese_network = Model([left_input, right_input], manhattan_distance_for_lstm)
    siamese_network.compile(loss='mean_squared_error', optimizer="adam", metrics=['accuracy'])
    siamese_network.fit([test1[train_index], test2[train_index]], res[train_index], batch_size=128, 
                        epochs=128, validation_data=([test1[test_index], test2[test_index]], res[test_index]))

**Testing**

In [None]:
siamese_network = Model([left_input, right_input], manhattan_distance_for_lstm)

In [None]:
siamese_network.compile(loss='mean_squared_error', optimizer="adam", metrics=['accuracy'])

In [None]:
siamese_network.fit([test1[train_index], test2[train_index]], res[train_index], batch_size=128, epochs=128)

# Scores

In [30]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

**LSA Score**

In [32]:
print("Accuracy Score :", accuracy_score(is_duplicate_questions, founded_lsa))

Accuracy Score : 0.4665402088779303


In [33]:
print("Precision :", precision_score(is_duplicate_questions, founded_lsa))

Precision : 0.39018605805011164


In [34]:
print("Recall Score :", recall_score(is_duplicate_questions, founded_lsa))

Recall Score : 0.7900754155894606


In [35]:
print("F1 Score :", f1_score(is_duplicate_questions, founded_lsa))

F1 Score : 0.5223866386496941


**Sentence2Vec(Modified word2vec) Score**

In [None]:
print("Accuracy Score :", accuracy_score(res, founded_sent2vec))

In [None]:
print("Precision :", precision_score(res, founded_sent2vec))

In [None]:
print("Recall :", recall_score(res, founded_sent2vec))

In [None]:
print("F1 Score :", f1_score(res, founded_sent2vec))