In [1]:
import re
import pandas as pd
from nltk.corpus import stopwords
from fuzzywuzzy import fuzz
import distance



In [2]:

STOP_WORDS = ['ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there', 'about', 'once', 'during', 'out', 'very', 'having', 'with', 'they', 'own', 'an', 'be', 'some', 'for', 'do', 'its', 'yours', 'such', 'into', 'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the', 'themselves', 'until', 'below', 'are', 'we', 'these', 'your', 'his', 'through', 'don', 'nor', 'me', 'were', 'her', 'more', 'himself', 'this', 'down', 'should', 'our', 'their', 'while', 'above', 'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'no', 'when', 'at', 'any', 'before', 'them', 'same', 'and', 'been', 'have', 'in', 'will', 'on', 'does', 'yourselves', 'then', 'that', 'because', 'what', 'over', 'why', 'so', 'can', 'did', 'not', 'now', 'under', 'he', 'you', 'herself', 'has', 'just', 'where', 'too', 'only', 'myself', 'which', 'those', 'i', 'after', 'few', 'whom', 't', 'being', 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how', 'further', 'was', 'here', 'than']

In [3]:
SAFE_DIV = 0.0001

In [4]:
def preprocess(x):
    x = str(x).lower()
    x = x.replace(",000,000", "m").replace(",000", "k").replace("′", "'").replace("’", "'")\
                           .replace("won't", "will not").replace("cannot", "can not").replace("can't", "can not")\
                           .replace("n't", " not").replace("what's", "what is").replace("it's", "it is")\
                           .replace("'ve", " have").replace("i'm", "i am").replace("'re", " are")\
                           .replace("he's", "he is").replace("she's", "she is").replace("'s", " own")\
                           .replace("%", " percent ").replace("₹", " rupee ").replace("$", " dollar ")\
                           .replace("€", " euro ").replace("'ll", " will")
    x = re.sub(r"([0-9]+)000000", r"\1m", x)
    x = re.sub(r"([0-9]+)000", r"\1k", x)
    return x


In [5]:
def get_token_features(q1, q2):
    token_features = [0.0]*10

    q1_tokens = q1.split()
    q2_tokens = q2.split()

    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
        return token_features

    q1_words = set([word for word in q1_tokens if word not in STOP_WORDS])
    q2_words = set([word for word in q2_tokens if word not in STOP_WORDS])

    q1_stops = set([word for word in q1_tokens if word in STOP_WORDS])
    q2_stops = set([word for word in q2_tokens if word in STOP_WORDS])

    common_word_count = len(q1_words.intersection(q2_words))
    common_stop_count = len(q1_stops.intersection(q2_stops))
    common_token_count = len(set(q1_tokens).intersection(set(q2_tokens)))

    token_features[0] = common_word_count / (min(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[1] = common_word_count / (max(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[2] = common_stop_count / (min(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[3] = common_stop_count / (max(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    token_features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])
    token_features[7] = int(q1_tokens[0] == q2_tokens[0])
    token_features[8] = abs(len(q1_tokens) - len(q2_tokens))
    token_features[9] = (len(q1_tokens) + len(q2_tokens))/2
    return token_features

In [6]:
def get_longest_substr_ratio(a, b):
    strs = list(distance.lcsubstrings(a, b))
    if len(strs) == 0:
        return 0
    else:
        return len(strs[0]) / (min(len(a), len(b)) + 1)

In [7]:
def extract_features(df):
    df["question1"] = df["question1"].fillna("").apply(preprocess)
    df["question2"] = df["question2"].fillna("").apply(preprocess)
    token_features = df.apply(lambda x: get_token_features(x["question1"], x["question2"]), axis=1)
    print (token_features)
    df["cwc_min"]       = list(map(lambda x: x[0], token_features))
    df["cwc_max"]       = list(map(lambda x: x[1], token_features))
    df["csc_min"]       = list(map(lambda x: x[2], token_features))
    df["csc_max"]       = list(map(lambda x: x[3], token_features))
    df["ctc_min"]       = list(map(lambda x: x[4], token_features))
    df["ctc_max"]       = list(map(lambda x: x[5], token_features))
    df["last_word_eq"]  = list(map(lambda x: x[6], token_features))
    df["first_word_eq"] = list(map(lambda x: x[7], token_features))
    df["abs_len_diff"]  = list(map(lambda x: x[8], token_features))
    df["mean_len"]      = list(map(lambda x: x[9], token_features))

    print("fuzzy features..")
    df["token_set_ratio"]       = df.apply(lambda x: fuzz.token_set_ratio(x["question1"], x["question2"]), axis=1)
    print("token_set_ratio..")
    df["token_sort_ratio"]      = df.apply(lambda x: fuzz.token_sort_ratio(x["question1"], x["question2"]), axis=1)
    print("token_sort_ratio..")
    df["fuzz_ratio"]            = df.apply(lambda x: fuzz.QRatio(x["question1"], x["question2"]), axis=1)
    print("fuzz_ratio..")
    df["fuzz_partial_ratio"]    = df.apply(lambda x: fuzz.partial_ratio(x["question1"], x["question2"]), axis=1)
    print("fuzz_partial_ratio..")
    df["longest_substr_ratio"]  = df.apply(lambda x: get_longest_substr_ratio(x["question1"], x["question2"]), axis=1)
    print("similarity features..")
    df["jaccard"] =       df.apply(lambda x: distance.jaccard(x["question1"], x["question2"]), axis=1)
    df["levenshtein"] =   df.apply(lambda x: distance.levenshtein(x["question1"], x["question2"]), axis=1)
    return df

In [8]:
print("Extracting features for train:")
train_df = pd.read_csv("data/train.csv")

Extracting features for train:


In [9]:
train_df.head(3)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0


In [10]:
train_df = extract_features(train_df)

0         [0.7999840003199936, 0.6666555557407376, 0.999...
1         [0.49998750031249223, 0.2222197531138543, 0.66...
2         [0.3999920001599968, 0.3333277778703688, 0.399...
3             [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0, 2, 10.0]
4         [0.0, 0.0, 0.9999500024998749, 0.6666444451851...
5         [0.499993750078124, 0.499993750078124, 0.85713...
6              [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0, 7, 7.5]
7         [0.49997500124993743, 0.49997500124993743, 0.5...
8         [0.49998750031249223, 0.49998750031249223, 0.9...
9         [0.3999920001599968, 0.3333277778703688, 0.333...
10        [0.0, 0.0, 0.49997500124993743, 0.111109876556...
11        [0.6666444451851604, 0.49998750031249223, 0.59...
12        [0.9999750006249845, 0.9999750006249845, 0.666...
13        [0.6666444451851604, 0.49998750031249223, 0.99...
14        [0.818174380232907, 0.818174380232907, 0.99999...
15        [0.33332962967078145, 0.2999970000299997, 0.16...
16        [0.49997500124993743, 0.499975

In [11]:
train_df.head(3)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,cwc_min,cwc_max,csc_min,csc_max,...,first_word_eq,abs_len_diff,mean_len,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio,jaccard,levenshtein
0,0,1,2,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0,0.799984,0.666656,0.999983,0.999983,...,1.0,2.0,13.0,100,93,93,98,0.965517,0.0,9
1,1,3,4,what is the story of kohinoor (koh-i-noor) dia...,what would happen if the indian government sto...,0,0.499988,0.22222,0.666644,0.499988,...,1.0,5.0,10.5,86,63,66,75,0.576923,0.296296,42
2,2,5,6,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,0,0.399992,0.333328,0.399992,0.249997,...,1.0,4.0,12.0,63,63,43,47,0.166667,0.25,40


In [12]:
len(train_df)

404290

In [13]:
train_df.drop(["id", "qid1", "qid2", "question1", "question2"], axis=1, inplace=True)

In [14]:
train_split = train_df[:363861]
test_split = train_df[363861:]

In [15]:
train_split.tail(3)

Unnamed: 0,is_duplicate,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio,jaccard,levenshtein
363858,1,0.666644,0.499988,0.666644,0.666644,0.666656,0.57142,0.0,1.0,1.0,6.5,68,51,60,59,0.428571,0.4,18
363859,0,0.28571,0.28571,0.666644,0.399992,0.399996,0.333331,0.0,1.0,2.0,11.0,66,64,48,49,0.148148,0.090909,43
363860,1,0.857131,0.857131,0.999983,0.999983,0.857137,0.857137,0.0,1.0,0.0,14.0,97,91,93,94,0.926829,0.0,8


In [16]:
len(train_split)

363861

In [17]:
len(test_split)

40429

In [18]:
test_split.head(3)

Unnamed: 0,is_duplicate,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio,jaccard,levenshtein
363861,0,0.499975,0.333322,0.749981,0.499992,0.666656,0.399996,0.0,1.0,4.0,8.0,92,70,68,75,0.413793,0.117647,21
363862,0,0.666644,0.399992,0.599988,0.428565,0.624992,0.416663,1.0,1.0,4.0,10.0,79,69,60,53,0.27027,0.15,25
363863,1,0.749981,0.749981,0.666644,0.499988,0.714276,0.624992,1.0,1.0,1.0,7.5,85,72,72,67,0.432432,0.157895,18


In [19]:
train_split.to_csv("data/nlp_features(fuzzy)_train.csv", index=False)

In [20]:
test_split.to_csv("data/nlp_features(fuzzy)_test.csv", index=False)