This code is based on/inspired by a tutorial from Packt: https://hub.packtpub.com/use-tensorflow-and-nlp-to-detect-duplicate-quora-questions-tutorial/
The data in "quora_duplicate_questions.tsv" is released for non-commercial use only
More info can be found on: https://www.quora.com/about/tos

**This code expands the Quora data with useful features to train a machine learning model**

In [20]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from scipy import sparse
from copy import deepcopy

**ASSUMPTIONS**
1. Two questions that mean the same thing often share many words, while two different questions rarely do
2. Two questions that mean the same thing often have a small edit distance, while two different questions rarely do

In [2]:
# Read in the data and remove unnecessary columns
data = pd.read_csv("quora_duplicate_questions.tsv", sep="\t") \
         .drop(["id", "qid1", "qid2"], axis=1)
# Split the data set into a training (data) and testing (test) data set.
# random_state is pinned so the shuffle -- and therefore every feature
# computed from `data` below -- is the same after a kernel restart.
data, test = train_test_split(data, test_size=0.2, random_state=42)

**LENGTH BASED FEATURES**

In [3]:
# Total character count of every question (values are stringified first,
# so non-string entries are measured by their string form)
data["len_q1"] = data["question1"].apply(lambda question: len(str(question)))
data["len_q2"] = data["question2"].apply(lambda question: len(str(question)))
# Signed length gap within each pair: question1 length minus question2 length
data["dif_len"] = data["len_q1"] - data["len_q2"]

In [4]:
# Count the characters of each question that are not a space character
data["len_char_q1"] = data["question1"].apply(
    lambda question: sum(char != " " for char in str(question))
)
data["len_char_q2"] = data["question2"].apply(
    lambda question: sum(char != " " for char in str(question))
)

In [5]:
# Number of whitespace-separated tokens in each question
data["len_word_q1"] = data["question1"].apply(
    lambda question: len(str(question).split())
)
data["len_word_q2"] = data["question2"].apply(
    lambda question: len(str(question).split())
)

In [6]:
# Size of the overlap between the two questions' lower-cased,
# whitespace-split word sets (each distinct word counts once)
data["common_words"] = data.apply(
    lambda row: len(
        set(str(row["question1"]).lower().split())
        & set(str(row["question2"]).lower().split())
    ),
    axis=1,
)

In [7]:
# The length-based feature set for future reference.
# Every entry must match a column created in the cells above; the length
# difference column is stored as "dif_len" (not "diff_len"), so that exact
# spelling is used here to keep data[fs_1] from raising a KeyError.
fs_1 = ['len_q1', 'len_q2', 'dif_len', 'len_char_q1',
        'len_char_q2', 'len_word_q1', 'len_word_q2',
        'common_words']

**DISTANCE BASED FEATURES**

In [8]:
# fuzzywuzzy QRatio score of each question pair (both sides stringified)
data["fuzz_QRatio"] = data.apply(
    lambda row: fuzz.QRatio(str(row["question1"]), str(row["question2"])),
    axis=1,
)
# fuzzywuzzy WRatio score of each question pair
data["fuzz_WRatio"] = data.apply(
    lambda row: fuzz.WRatio(str(row["question1"]), str(row["question2"])),
    axis=1,
)
# fuzzywuzzy partial ratio of each question pair
data["fuzz_partial_ratio"] = data.apply(
    lambda row: fuzz.partial_ratio(str(row["question1"]), str(row["question2"])),
    axis=1,
)

In [10]:
# fuzzywuzzy partial token set ratio of each question pair
data["fuzz_partial_token_set_ratio"] = data.apply(
    lambda row: fuzz.partial_token_set_ratio(
        str(row["question1"]), str(row["question2"])
    ),
    axis=1,
)
# fuzzywuzzy partial token sort ratio of each question pair
data["fuzz_partial_token_sort_ratio"] = data.apply(
    lambda row: fuzz.partial_token_sort_ratio(
        str(row["question1"]), str(row["question2"])
    ),
    axis=1,
)

In [11]:
# fuzzywuzzy token set ratio of each question pair
data["fuzz_token_set_ratio"] = data.apply(
    lambda row: fuzz.token_set_ratio(
        str(row["question1"]), str(row["question2"])
    ),
    axis=1,
)
# fuzzywuzzy token sort ratio of each question pair
data["fuzz_token_sort_ratio"] = data.apply(
    lambda row: fuzz.token_sort_ratio(
        str(row["question1"]), str(row["question2"])
    ),
    axis=1,
)

In [9]:
# Column names of the fuzzywuzzy (distance-based) features, kept together
# so the whole group can be selected at once later on
fs_2 = [
    'fuzz_QRatio',
    'fuzz_WRatio',
    'fuzz_partial_ratio',
    'fuzz_partial_token_set_ratio',
    'fuzz_partial_token_sort_ratio',
    'fuzz_token_set_ratio',
    'fuzz_token_sort_ratio',
]

**TF-IDF & LSA BASED FEATURES**

In [22]:
# Create term frequency-inverse document frequency vectorizers.
# use_idf / smooth_idf / sublinear_tf are boolean switches; they are passed
# as real booleans (not the integer 1) because newer scikit-learn releases
# validate parameter types and reject integers for these options.
tfv = TfidfVectorizer(min_df=3,
                      max_features=None,
                      strip_accents='unicode',
                      analyzer='word',
                      token_pattern=r"\w{1,}",
                      ngram_range=(1, 2),
                      use_idf=True,
                      smooth_idf=True,
                      sublinear_tf=True,
                      stop_words="english")
# Independent copies of the unfitted template, one per question column,
# so each can be fitted separately without touching `tfv` itself
tfv_q1 = deepcopy(tfv)
tfv_q2 = deepcopy(tfv)

In [23]:
# Fit one vectorizer per question column and keep the resulting tf-idf
# matrices; missing questions are replaced by the empty string beforehand
q1_tfidf = tfv_q1.fit_transform(data["question1"].fillna(""))
q2_tfidf = tfv_q2.fit_transform(data["question2"].fillna(""))

In [29]:
# Create truncated SVD decompositions (fast but approximate), each with
# 180 components. TruncatedSVD's default solver is randomized, so
# random_state is pinned to keep the reduced features reproducible.
svd = TruncatedSVD(n_components=180, random_state=42)
svd_q1 = TruncatedSVD(n_components=180, random_state=42)
svd_q2 = TruncatedSVD(n_components=180, random_state=42)

In [30]:
# Calculate the SVD features based on the tf-idf matrices.
# Each question column has its own decomposition: svd_q1 is fitted only on
# q1_tfidf and svd_q2 only on q2_tfidf, so the two sets of components are
# learned independently of each other.
question1_vectors = svd_q1.fit_transform(q1_tfidf)
question2_vectors = svd_q2.fit_transform(q2_tfidf)

In [32]:
# Third feature set, variant 1: the tf-idf matrix of question1 and the
# tf-idf matrix of question2 placed side by side (columns concatenated)
fs3_1 = sparse.hstack([q1_tfidf, q2_tfidf])

In [33]:
# Third feature set, variant 2: join both questions of a pair into one
# string (separated by a single space, missing values treated as empty)
# and fit the template vectorizer `tfv` on the joined text
q1q2 = (
    data["question1"].fillna("")
    + " "
    + data["question2"].fillna("")
)
fs3_2 = tfv.fit_transform(q1q2)

In [34]:
# Third feature set, variant 3: the per-question SVD matrices stacked
# together horizontally
fs3_3 = np.hstack([question1_vectors, question2_vectors])

In [35]:
# First stack the tf-idf matrices together and then calculate the SVD features.
# This fits the shared `svd` object on the stacked matrix fs3_1.
# NOTE(review): `svd` is fitted again on fs3_2 in the following cell, so the
# components learned here are not retained on `svd` afterwards -- confirm that
# nothing later needs to transform new data with this particular fit.
fs3_4 = svd.fit_transform(fs3_1)

In [36]:
# First combine the questions and then calculate the SVD features.
# NOTE(review): this refits the same `svd` object that produced fs3_4,
# replacing the components learned from fs3_1 with ones learned from fs3_2.
fs3_5 = svd.fit_transform(fs3_2)

**WORD2VEC BASED FEATURES**