# Semantic Similarity

**Installing the Libraries**

In [None]:
!pip install -U -q numpy
!pip install -U -q keras
!pip install -U -q scikit-learn
!pip install -U -q matplotlib
!pip install -U -q nltk
!pip install -U -q PyDrive 
!pip install -U -q pandas
!pip3 install --quiet tensorflow-hub
!pip3 install --quiet seaborn
!pip3 install --quiet "tensorflow>=1.7"

**Getting data from Google Drive**

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import json

In [None]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
file_ids = ["16-aKOfyeLQpBJlUHCJUGxWp4UsY2rvb3", "1oec77bHzg5a2oGshDuBe99jxMi80NlUo"]
file_names = ["train_translated.csv", "test_translated.csv"]

In [None]:
for each_id, each_name in zip(file_ids, file_names):
    download = drive.CreateFile({'id':each_id})
    download.GetContentFile(each_name)

# Dataset

In [1]:
import numpy as np
import pandas as pd
import multiprocessing

In [2]:
# Raw string keeps the Windows backslashes literal instead of relying on
# "\D" and "\q" being unrecognised (and therefore deprecated) escape sequences.
questions = pd.read_csv(r"E:\Datasets\quora\questions.csv")
# Drop every row that contains a missing value.
questions.dropna(inplace=True)

In [4]:
questions1 = questions.iloc[:, 3].values
questions2 = questions.iloc[:, 4].values
is_duplicate_questions = questions.iloc[:, 5].values

In [5]:
questions.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [6]:
cores = multiprocessing.cpu_count()

In [7]:
length = is_duplicate_questions.shape[0]

In [8]:
dataset_p_l_rms_l_1 = questions1
dataset_p_l_rms_l_2 = questions2

# Natural Language Processing

**Text Preprocessing**

In [9]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.utils import simple_preprocess

In [10]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vsriv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vsriv\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
lemma = WordNetLemmatizer()
stopword = stopwords.words("english")

In [12]:
# Clean question1: keep letters only, lowercase, drop stopwords, then lemmatize.
dataset_p_l_rms_l_1 = []  # stopword-free and lemmatized text
dataset_p_l_rms_1 = []    # stopword-free text, not lemmatized
# Set membership is constant-time; testing against the stopword list scans it
# linearly for every token.
stopword_lookup = set(stopword)
for i in questions1:
    tempx = re.sub(r"[^A-Za-z]", " ", str(i))
    tempx = tempx.lower().split()
    tempx = [word for word in tempx if word not in stopword_lookup]
    dataset_p_l_rms_1.append(" ".join(tempx))
    # Lemmatize under each part of speech in turn: adjective, adverb, noun, verb.
    for pos_tag in ("a", "r", "n", "v"):
        tempx = [lemma.lemmatize(word, pos=pos_tag) for word in tempx]
    dataset_p_l_rms_l_1.append(" ".join(tempx))
dataset_p_l_rms_l_1 = np.asarray(dataset_p_l_rms_l_1)
dataset_p_l_rms_1 = np.asarray(dataset_p_l_rms_1)

In [13]:
# Clean question2: keep letters only, lowercase, drop stopwords, then lemmatize.
dataset_p_l_rms_l_2 = []  # stopword-free and lemmatized text
dataset_p_l_rms_2 = []    # stopword-free text, not lemmatized
# Set membership is constant-time; testing against the stopword list scans it
# linearly for every token.
stopword_lookup = set(stopword)
for i in questions2:
    tempx = re.sub(r"[^A-Za-z]", " ", str(i))
    tempx = tempx.lower().split()
    tempx = [word for word in tempx if word not in stopword_lookup]
    dataset_p_l_rms_2.append(" ".join(tempx))
    # Lemmatize under each part of speech in turn: adjective, adverb, noun, verb.
    for pos_tag in ("a", "r", "n", "v"):
        tempx = [lemma.lemmatize(word, pos=pos_tag) for word in tempx]
    dataset_p_l_rms_l_2.append(" ".join(tempx))
dataset_p_l_rms_l_2 = np.asarray(dataset_p_l_rms_l_2)
dataset_p_l_rms_2 = np.asarray(dataset_p_l_rms_2)

In [14]:
dataset_no = 5
print(dataset_p_l_rms_1[dataset_no])
print(dataset_p_l_rms_2[dataset_no])
print(is_duplicate_questions[dataset_no])

astrology capricorn sun cap moon cap rising say
triple capricorn sun moon ascendant capricorn say
1


# Text Comparisons

**Count Vectorizer**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
count_vectorizer = CountVectorizer()

In [None]:
count_vectorizer.fit(np.append(dataset_p_l_rms_l_1, dataset_p_l_rms_l_2, axis=0))

In [None]:
v_dataset_p_l_rms_l_1 = count_vectorizer.transform(dataset_p_l_rms_l_1)

In [None]:
v_dataset_p_l_rms_l_2 = count_vectorizer.transform(dataset_p_l_rms_l_2)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances

In [None]:
distance_and_similarity_scores = []

In [None]:
# Score the first 20 question pairs on their count-vector representations.
for i in range(0, 20):
    temp = {}
    temp['cosine_similarity'] = cosine_similarity(v_dataset_p_l_rms_l_1[i], v_dataset_p_l_rms_l_2[i])[0][0]
    # Each key stores the metric it names; the two distance calls used to be swapped.
    temp['manhattan_distance'] = manhattan_distances(v_dataset_p_l_rms_l_1[i], v_dataset_p_l_rms_l_2[i])[0][0]
    temp['euclidean_distance'] = euclidean_distances(v_dataset_p_l_rms_l_1[i], v_dataset_p_l_rms_l_2[i])[0][0]
    distance_and_similarity_scores.append(temp)
# TODO: try feeding these scores to logistic regression to learn the similarity between the question pairs

In [None]:
for i in range(0, 20):
    print(is_duplicate_questions[i], distance_and_similarity_scores[i])

**TF-IDF Vectorizer**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tfidf_vectorizer = TfidfVectorizer()

In [None]:
tfidf_vectorizer.fit(np.append(dataset_p_l_rms_l_1, dataset_p_l_rms_l_2, axis=0))

In [None]:
V_dataset_p_l_rms_l_1 = tfidf_vectorizer.transform(dataset_p_l_rms_l_1)

In [None]:
V_dataset_p_l_rms_l_2 = tfidf_vectorizer.transform(dataset_p_l_rms_l_2)

In [None]:
distance_and_similarity_scores_2 = []

In [None]:
# Score the first 20 question pairs on their TF-IDF representations.
for i in range(0, 20):
    temp = {}
    temp['cosine_similarity'] = cosine_similarity(V_dataset_p_l_rms_l_1[i], V_dataset_p_l_rms_l_2[i])[0][0]
    # Each key stores the metric it names; the two distance calls used to be swapped.
    temp['manhattan_distance'] = manhattan_distances(V_dataset_p_l_rms_l_1[i], V_dataset_p_l_rms_l_2[i])[0][0]
    temp['euclidean_distance'] = euclidean_distances(V_dataset_p_l_rms_l_1[i], V_dataset_p_l_rms_l_2[i])[0][0]
    distance_and_similarity_scores_2.append(temp)

In [None]:
for i in range(0, 20):
    print(is_duplicate_questions[i], distance_and_similarity_scores_2[i])

**LSA Method**

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=True)

In [None]:
svd_model = TruncatedSVD(n_components=300,
                         algorithm='randomized',
                         n_iter=10, random_state=42)

In [None]:
lsa_model1 = Pipeline([('tfidf', vectorizer), 
                            ('svd', svd_model)])

In [None]:
lsa_model2 = Pipeline([('tfidf', vectorizer), 
                            ('svd', svd_model)])

In [None]:
# Fit a single LSA space on both question columns, then project each column
# into it. lsa_model1 and lsa_model2 wrap the same vectorizer and svd_model
# objects, so calling fit_transform once per column would refit those objects
# on different corpora and leave the two outputs in unrelated vector spaces,
# making any distance between them meaningless.
lsa_model1.fit(np.append(dataset_p_l_rms_l_1, dataset_p_l_rms_l_2, axis=0))
lsa_test1 = lsa_model1.transform(dataset_p_l_rms_l_1)
lsa_test2 = lsa_model1.transform(dataset_p_l_rms_l_2)

In [None]:
lsa_dataset_p_l_rms_l_1 = lsa_test1
lsa_dataset_p_l_rms_l_2 = lsa_test2

In [None]:
distance_and_similarity_scores_3 = []

In [None]:
# Score the first 20 question pairs on their LSA vectors.
for i in range(0, 20):
    temp = {}
    temp['cosine_similarity'] = cosine_similarity(np.asarray([lsa_dataset_p_l_rms_l_1[i]]), np.asarray([lsa_dataset_p_l_rms_l_2[i]]))[0][0]
    # Each key stores the metric it names; the two distance calls used to be swapped.
    temp['manhattan_distance'] = manhattan_distances(np.asarray([lsa_dataset_p_l_rms_l_1[i]]), np.asarray([lsa_dataset_p_l_rms_l_2[i]]))[0][0]
    temp['euclidean_distance'] = euclidean_distances(np.asarray([lsa_dataset_p_l_rms_l_1[i]]), np.asarray([lsa_dataset_p_l_rms_l_2[i]]))[0][0]
    distance_and_similarity_scores_3.append(temp)

In [None]:
for i in range(0, 20):
    print(is_duplicate_questions[i], distance_and_similarity_scores_3[i])

**Word2Vec model (using the mean of word vectors as the sentence vector)**

In [16]:
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors

In [17]:
#wiki_model = KeyedVectors.load_word2vec_format("models/pretrained/glove/wiki/wiki.300d.txt", binary=False)
# Raw string keeps the Windows backslashes literal instead of relying on
# "\M", "\p", "\w" and "\g" being unrecognised (deprecated) escape sequences.
google_model = KeyedVectors.load_word2vec_format(r"E:\Models\pre_trained\word2vec\google\google.300d.bin", binary=True)
#common_crawl_model = KeyedVectors.load_word2vec_format("models/pretrained/glove/common_crawl/common_crawl.300d.txt", binary=False)

In [None]:
def sentence_vectorizer(model, sentence):
    """Average the word vectors of a whitespace-tokenized sentence.

    Args:
        model: Object indexed by word (``model[word]``). Unknown words are
            expected to raise ``KeyError`` and are skipped.
        sentence: Text whose whitespace-separated tokens are looked up.

    Returns:
        The element-wise mean of the vectors of every token found in
        ``model``. When no token is found, an empty array of shape ``(0,)``
        is returned, so callers can keep testing ``result.shape[0] != 0``.
    """
    total = None
    count = 0
    for token in sentence.split():
        try:
            vector = model[token]
        except KeyError:
            # Out-of-vocabulary token: skip it. Any other error is a real
            # problem and is allowed to propagate instead of being hidden.
            continue
        total = vector if total is None else np.add(total, vector)
        count += 1
    if count == 0:
        # Explicit empty result rather than relying on dividing by zero.
        return np.array([])
    return np.array(total) / count

In [None]:
sent_vec1 = []
for each in dataset_p_l_rms_1:
    temp = sentence_vectorizer(google_model, each)
    if temp.shape[0] != 0:
        sent_vec1.append(temp)
    else:
        sent_vec1.append(np.zeros((300,)))
sent_vec1 = np.asarray(sent_vec1)

In [None]:
sent_vec2 = []
for each in dataset_p_l_rms_2:
    temp = sentence_vectorizer(google_model, each)
    if temp.shape[0] != 0:
        sent_vec2.append(temp)
    else:
        sent_vec2.append(np.zeros((300,)))
sent_vec2 = np.asarray(sent_vec2)

In [None]:
distance_and_similarity_scores_4 = []

In [None]:
# Score the first 20 question pairs on their averaged word2vec sentence vectors.
for i in range(0, 20):
    temp = {}
    temp['cosine_similarity'] = cosine_similarity(np.asarray([sent_vec1[i]]), np.asarray([sent_vec2[i]]))[0][0]
    # Each key stores the metric it names; the two distance calls used to be swapped.
    temp['manhattan_distance'] = manhattan_distances(np.asarray([sent_vec1[i]]), np.asarray([sent_vec2[i]]))[0][0]
    temp['euclidean_distance'] = euclidean_distances(np.asarray([sent_vec1[i]]), np.asarray([sent_vec2[i]]))[0][0]
    distance_and_similarity_scores_4.append(temp)

In [None]:
for i in range(0, 20):
    print(is_duplicate_questions[i], distance_and_similarity_scores_4[i])

**Sent2Vec model(Fast.ai)**

https://github.com/epfml/sent2vec

**Doc2Vec Model**

**Sentence Encoder V2**

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns

In [None]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder/2" #@param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"]

In [None]:
embed = hub.Module(module_url)

In [None]:
word = "Elephant"
sentence = "I am a sentence for which I would like to get its embedding."
paragraph = (
    "Universal Sentence Encoder embeddings also support short paragraphs. "
    "There is no hard limit on how long the paragraph is. Roughly, the longer "
    "the more 'diluted' the embedding will be.")
messages = [word, sentence, paragraph]

In [None]:
tf.logging.set_verbosity(tf.logging.ERROR)

In [None]:
with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    message_embeddings = session.run(embed(messages))
    for i, message_embedding in enumerate(np.array(message_embeddings).tolist()):
        print("Message: {}".format(messages[i]))
        print("Embedding size: {}".format(len(message_embedding)))
        message_embedding_snippet = ", ".join(
            (str(x) for x in message_embedding[:3]))
        print("Embedding: [{}, ...]\n".format(message_embedding_snippet))

In [None]:
def plot_similarity(labels, features, rotation):
    """Draw a heatmap of the pairwise inner products of ``features``.

    Args:
        labels: Tick labels used on both axes.
        features: Array-like of feature vectors, one per label.
        rotation: Rotation applied to the x-axis tick labels.
    """
    similarity = np.inner(features, features)
    sns.set(font_scale=1.2)
    heatmap_options = dict(
        xticklabels=labels,
        yticklabels=labels,
        vmin=0,
        vmax=1,
        cmap="YlOrRd",
    )
    axes = sns.heatmap(similarity, **heatmap_options)
    axes.set_xticklabels(labels, rotation=rotation)
    axes.set_title("Semantic Textual Similarity")

In [None]:
def run_and_plot(session_, input_tensor_, messages_, encoding_tensor):
    """Evaluate ``encoding_tensor`` for ``messages_`` and plot their similarity.

    ``messages_`` is fed through ``input_tensor_`` in ``session_``; the
    resulting embeddings are passed to ``plot_similarity`` with the x-axis
    labels rotated by 90.
    """
    feeds = {input_tensor_: messages_}
    message_embeddings_ = session_.run(encoding_tensor, feed_dict=feeds)
    plot_similarity(messages_, message_embeddings_, 90)

In [None]:
messages = [
    "I am an Indian",
    "I am from India",
    "I am not from India",
    "I play cricket",
    "I watch television",
]

In [None]:
similarity_input_placeholder = tf.placeholder(tf.string, shape=(None))
similarity_message_encodings = embed(similarity_input_placeholder)

In [None]:
with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())
    run_and_plot(session, similarity_input_placeholder, messages,
         similarity_message_encodings)

**Siamese Neural Networks (using LSTM and GRU)**

https://medium.com/mlreview/implementing-malstm-on-kaggles-quora-question-pairs-competition-8b31b0b16a07

In [18]:
from keras.models import Model
import keras.backend as backend
from keras.layers import Input, Embedding, LSTM, Lambda, GRU, Dropout

Using TensorFlow backend.


In [19]:
vocabulary = dict()
inverse_vocabulary = ['<unk>']

In [36]:
# Encode every question1 text as a list of vocabulary indices, registering
# unseen words in vocabulary / inverse_vocabulary as they are met.
q2n_left = []
for question_text in dataset_p_l_rms_l_1.tolist():
    token_ids = []
    for token in question_text.split():
        token_id = vocabulary.get(token)
        if token_id is None:
            # New word: its index is the next free slot in inverse_vocabulary.
            token_id = len(inverse_vocabulary)
            vocabulary[token] = token_id
            inverse_vocabulary.append(token)
        token_ids.append(token_id)
    q2n_left.append(token_ids)

In [37]:
# Encode every question2 text as a list of vocabulary indices, registering
# unseen words in vocabulary / inverse_vocabulary as they are met.
q2n_right = []
for question_text in dataset_p_l_rms_l_2.tolist():
    token_ids = []
    for token in question_text.split():
        token_id = vocabulary.get(token)
        if token_id is None:
            # New word: its index is the next free slot in inverse_vocabulary.
            token_id = len(inverse_vocabulary)
            vocabulary[token] = token_id
            inverse_vocabulary.append(token)
        token_ids.append(token_id)
    q2n_right.append(token_ids)

In [22]:
embedding_dim = 300
embeddings = np.zeros((len(vocabulary) + 1, embedding_dim))
embeddings[0] = 0

In [23]:
# Copy the pretrained vector of every known word into its row of the embedding
# matrix; words missing from the pretrained model keep their all-zero row.
# Membership and indexing on the KeyedVectors object work on both old and new
# gensim releases, unlike the `.vocab` attribute and `.word_vec()` helper.
for word, index in vocabulary.items():
    if word in google_model:
        embeddings[index] = google_model[word]

In [24]:
del google_model

In [25]:
from keras.preprocessing.sequence import pad_sequences

In [39]:
dataset_left = q2n_left

In [40]:
dataset_right = q2n_right

In [43]:
max_seq_length = 0
for each in range(length):
    max_seq_length = max(max_seq_length, len(q2n_left[each]), len(q2n_right[each]))
print(max_seq_length)

97


In [44]:
dataset_left = pad_sequences(q2n_left, maxlen=max_seq_length)
dataset_right = pad_sequences(q2n_right, maxlen=max_seq_length)

In [45]:
dataset_left.shape == dataset_right.shape

True

In [51]:
n_hidden1 = 512
n_hidden2 = 384
n_hidden3 = 256
n_hidden4 = 128

In [47]:
left_input = Input(shape=(max_seq_length, ), dtype='int32')
right_input = Input(shape=(max_seq_length, ), dtype='int32')

In [48]:
embedding_layer = Embedding(len(embeddings), embedding_dim, weights=[embeddings], 
                            input_length=max_seq_length, trainable=False)

In [49]:
encoded_left = embedding_layer(left_input)
encoded_right = embedding_layer(right_input)

In [52]:
shared_lstm1 = LSTM(n_hidden1, return_sequences=True)
shared_dropout1 = Dropout(0.3)
shared_gru1 = GRU(n_hidden2, return_sequences=True)
shared_dropout2 = Dropout(0.4)
shared_gru2 = GRU(n_hidden3, return_sequences=True)
shared_dropout3 = Dropout(0.3)
shared_lstm2 = LSTM(n_hidden4, return_sequences=False)

In [53]:
left_lstm1 = shared_lstm1(encoded_left)
left_dropout1 = shared_dropout1(left_lstm1)
left_gru1 = shared_gru1(left_dropout1)
left_dropout2 = shared_dropout2(left_gru1)
left_gru2 = shared_gru2(left_dropout2)
left_dropout3 = shared_dropout3(left_gru2)
left_lstm2 = shared_lstm2(left_dropout3)

In [54]:
right_lstm1 = shared_lstm1(encoded_right)
right_dropout1 = shared_dropout1(right_lstm1)
right_gru1 = shared_gru1(right_dropout1)
right_dropout2 = shared_dropout2(right_gru1)
right_gru2 = shared_gru2(right_dropout2)
right_dropout3 = shared_dropout3(right_gru2)
right_lstm2 = shared_lstm2(right_dropout3)

In [55]:
manhattan_distance_for_lstm = Lambda(function=lambda x: backend.exp(-backend.sum(backend.abs(x[0]-x[1]), axis=1, keepdims=True)),
                                     output_shape=lambda x: (x[0][0], 1))([left_lstm2, right_lstm2])

**Training and Validation**

In [56]:
from sklearn.model_selection import StratifiedKFold

In [57]:
stratkfold = StratifiedKFold(n_splits=2, random_state=None, shuffle=True)

In [59]:
# Train and validate the siamese network once per stratified fold.
# NOTE(review): every fold wraps the same left_input/right_input tensors and
# the same shared LSTM/GRU layer objects, so weights learned in one fold appear
# to carry over into the next fold (which then validates on rows it has already
# trained on). Confirm, and rebuild the layers per fold if independent folds
# are intended.
# NOTE(review): 'accuracy' is reported alongside a mean-squared-error loss on a
# continuous exp(-distance) output; verify that this metric is meaningful here.
for train_index, test_index in stratkfold.split(dataset_left, is_duplicate_questions):
    siamese_network = Model([left_input, right_input], manhattan_distance_for_lstm)
    siamese_network.compile(loss='mean_squared_error', optimizer="adam", metrics=['accuracy'])
    siamese_network.fit([dataset_left[train_index], dataset_right[train_index]], is_duplicate_questions[train_index], batch_size=128, 
                        epochs=1, validation_data=([dataset_left[test_index], dataset_right[test_index]], is_duplicate_questions[test_index]))

Train on 202174 samples, validate on 202174 samples
Epoch 1/1
  2048/202174 [..............................] - ETA: 9:04:12 - loss: 0.2159 - acc: 0.6372

KeyboardInterrupt: 

# Spell Corrector

**Word Corrector**

In [1]:
import re
from collections import Counter

In [4]:
def words(text):
    """Return every run of word characters in ``text``, lowercased, in order."""
    lowered = text.lower()
    return re.findall(r'\w+', lowered)

In [5]:
# Word-frequency table built from the corpus file. The context manager closes
# the file handle deterministically instead of leaving it open.
with open('data/big.txt') as corpus_file:
    WORDS = Counter(words(corpus_file.read()))

In [6]:
def probability(word, n=sum(WORDS.values())):
    """Return the relative frequency of ``word`` in WORDS.

    ``n`` defaults to the total of all counts in WORDS, evaluated once when
    this definition is executed.
    """
    occurrences = WORDS[word]
    return occurrences / n

In [7]:
def correction(word):
    """Return the candidate spelling of ``word`` with the highest probability."""
    options = candidates(word)
    return max(options, key=probability)

In [8]:
def candidates(word):
    """Return possible spellings for ``word``, preferring the closest match.

    Tried in order: the word itself if known, known words one edit away,
    known words two edits away, and finally the unchanged word in a list.
    """
    exact = known([word])
    if exact:
        return exact
    one_edit_away = known(edits1(word))
    if one_edit_away:
        return one_edit_away
    two_edits_away = known(edits2(word))
    if two_edits_away:
        return two_edits_away
    return [word]

In [9]:
def known(words):
    """Return the set of items from ``words`` that are present in WORDS."""
    recognised = set()
    for candidate in words:
        if candidate in WORDS:
            recognised.add(candidate)
    return recognised

In [10]:
def edits1(word):
    """Return the set of strings exactly one simple edit away from ``word``.

    An edit is a single-character deletion, a transposition of two adjacent
    characters, a replacement by a lowercase ASCII letter, or an insertion of
    a lowercase ASCII letter.
    """
    letters = 'abcdefghijklmnopqrstuvwxyz'
    results = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        if tail:
            results.add(head + tail[1:])  # deletion
            if len(tail) > 1:
                results.add(head + tail[1] + tail[0] + tail[2:])  # transposition
        for letter in letters:
            if tail:
                results.add(head + letter + tail[1:])  # replacement
            results.add(head + letter + tail)  # insertion
    return results

In [11]:
def edits2(word):
    """Return a generator over the strings two simple edits away from ``word``.

    Duplicates are not removed.
    """
    first_round = edits1(word)
    return (twice for once in first_round for twice in edits1(once))

**Sentence Corrector**

In [12]:
import json

In [13]:
with open("data/words_dictionary.json") as words_dictionary_file:
    word_dict = json.load(words_dictionary_file)

In [17]:
def correct_sentences(sentence):
    """Return candidate corrections of ``sentence``, one word replaced at a time.

    The sentence is lowercased and split on whitespace. Every token that has
    no truthy entry in ``word_dict`` is treated as misspelled, and for each of
    its ``candidates`` one sentence is produced with that single token
    replaced.

    Args:
        sentence: The text to correct.

    Returns:
        A list of space-joined sentences, ordered by token position. It is
        empty when every token is found in ``word_dict``.
    """
    tokens = sentence.lower().split()
    # Candidate replacements for each token missing from the dictionary.
    suggestions = {}
    for token in tokens:
        if not word_dict.get(token, None):
            suggestions[token] = list(candidates(token))
    combination_sentences = []
    for i, token in enumerate(tokens):
        for replacement in suggestions.get(token, ()):
            # Join the whole token list so the replacement is separated by a
            # single space from both neighbours. Concatenating the pieces by
            # hand glued the replacement onto the following word and left a
            # leading space when the first word was replaced.
            combination_sentences.append(" ".join(tokens[:i] + [replacement] + tokens[i+1:]))
    return combination_sentences

In [19]:
correct_sentences("Every thing comes with a pricee")

['every thing comes with a price', 'every thing comes with a prices']