In [40]:
import pandas as pd
from fuzzywuzzy import fuzz
import swifter
from sklearn.model_selection import train_test_split
from nltk.stem.wordnet import WordNetLemmatizer
import re
import string
from nltk.corpus import stopwords
from collections import Counter
import networkx as nx
from collections import Counter
import numpy as np
import statistics


In [41]:
train_df = pd.read_csv('../data/train.csv.zip').dropna()
train_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [42]:
DATASET_NAME = 'dataset_v2'
STOP_WORDS = set(stopwords.words('english'))
EMBEDDINGS_FILE = ''

In [43]:
lemmatizer = WordNetLemmatizer()

def clean(uncleaned_string):
    cleaned_string = uncleaned_string.lower()
    cleaned_string = cleaned_string.translate(str.maketrans(dict.fromkeys(string.punctuation, ' ')))

    return cleaned_string
    

def preprocess(question_text):
    processed_string = ' '.join([lemmatizer.lemmatize(lemmatizer.lemmatize(word, 'n'), 'v') for word in question_text.split(' ')])
    
    processed_string = clean(processed_string)
    
    words = processed_string.split(' ')
    
    processed_string = " ".join(word for word in words if word not in STOP_WORDS)
    
    return processed_string

train_df['question1_lemma'] = train_df['question1'].swifter.apply(preprocess)
train_df['question2_lemma'] = train_df['question2'].swifter.apply(preprocess)

train_df.head()

Pandas Apply: 100%|██████████| 404287/404287 [00:18<00:00, 21757.94it/s]
Pandas Apply: 100%|██████████| 404287/404287 [00:18<00:00, 21577.57it/s]


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,question1_lemma,question2_lemma
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,step step guide invest share market india,step step guide invest share market
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,story kohinoor koh noor diamond,would happen indian government steal kohinoor ...
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,increase speed internet connection use vpn,internet speed increase hack dns
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,mentally lonely solve,find remainder math 23 24 math divide 24 23
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,one dissolve water quikly sugar salt methane...,fish would survive salt water


In [44]:
def add_fuzzywuzzy_features(df):
    df['simple_ratio'] = df.swifter.apply(lambda x: fuzz.ratio(x['question1_lemma'], x['question2']), axis=1)
    df['partial_ratio'] = df.swifter.apply(lambda x: fuzz.partial_ratio(x['question1_lemma'], x['question2_lemma']), axis=1)
    df['token_sort_ratio'] = df.swifter.apply(lambda x: fuzz.token_sort_ratio(x['question1_lemma'], x['question2_lemma']), axis=1)
    df['token_set_ratio'] = df.swifter.apply(lambda x: fuzz.token_set_ratio(x['question1_lemma'], x['question2_lemma']), axis=1)
    return df

train_df = add_fuzzywuzzy_features(train_df.copy(deep=True))

train_df.head()

Dask Apply: 100%|██████████| 32/32 [00:02<00:00, 15.59it/s]
Dask Apply: 100%|██████████| 32/32 [00:03<00:00,  9.22it/s]
Dask Apply: 100%|██████████| 32/32 [00:02<00:00, 11.92it/s]
Dask Apply: 100%|██████████| 32/32 [00:03<00:00, 10.23it/s]


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,question1_lemma,question2_lemma,simple_ratio,partial_ratio,token_sort_ratio,token_set_ratio
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,step step guide invest share market india,step step guide invest share market,71,100,92,100
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,story kohinoor koh noor diamond,would happen indian government steal kohinoor ...,43,91,59,89
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,increase speed internet connection use vpn,internet speed increase hack dns,41,55,70,84
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,mentally lonely solve,find remainder math 23 24 math divide 24 23,25,30,22,23
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,one dissolve water quikly sugar salt methane...,fish would survive salt water,41,53,40,51


In [45]:
def get_question_type(question_text):
    """Look for the question type by looking for certain keywords, if multiple
    keywords are found take the leftmost one.
    """
    
    question_text = clean(question_text)
    
    question_type_map = {
        'why': 1,
        'what': 2,
        'when': 3,
        'where': 4,
        'who': 5,
        'how': 6
    }
    
    found_keywords = []
    for question_type in question_type_map:
        if question_type in question_text:
            found_keywords.append((question_type, question_text.index(question_type)))
           
    found_keywords.sort(key=lambda x: x[1])
            
    return question_type_map[found_keywords[0][0]] if found_keywords else -1
    

def add_question_type(df):
    """
        Checks the question type, the type can be any of the 5W 1H why, what, when, where, who and how or n/a. The types are
        stored as integers from 1 to 6 and -1 for n/a.
            -1 -> n/a
            1 -> why
            2-> what
            3 -> when
            4 -> where
            5 -> who
            6 -> how
    """
    
    df['question1_type'] = df.swifter.apply(lambda r: get_question_type(r['question1']), axis=1)
    df['question2_type'] = df.swifter.apply(lambda r: get_question_type(r['question2']), axis=1)
    
    return df

train_df = add_question_type(train_df.copy(deep=True))

train_df.head()

Dask Apply: 100%|██████████| 32/32 [00:01<00:00, 17.12it/s]
Dask Apply: 100%|██████████| 32/32 [00:01<00:00, 17.16it/s]


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,question1_lemma,question2_lemma,simple_ratio,partial_ratio,token_sort_ratio,token_set_ratio,question1_type,question2_type
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,step step guide invest share market india,step step guide invest share market,71,100,92,100,2,2
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,story kohinoor koh noor diamond,would happen indian government steal kohinoor ...,43,91,59,89,2,2
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,increase speed internet connection use vpn,internet speed increase hack dns,41,55,70,84,6,6
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,mentally lonely solve,find remainder math 23 24 math divide 24 23,25,30,22,23,1,3
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,one dissolve water quikly sugar salt methane...,fish would survive salt water,41,53,40,51,-1,-1


In [46]:
def punctuation_count(question):
    counts = Counter(question)
    return sum(counts[p] for p in string.punctuation)


def add_misc_features(df):
    df['question1_length'] = df.swifter.apply(lambda r: len(r['question1']), axis=1)
    df['question2_length'] = df.swifter.apply(lambda r: len(r['question2']), axis=1)

    df['question1_punctuation_count'] = df.swifter.apply(lambda r: punctuation_count(r['question1']), axis=1)
    df['question2_punctuation_count'] = df.swifter.apply(lambda r: punctuation_count(r['question2']), axis=1)

    return df


train_df = add_misc_features(train_df.copy(deep=True))

train_df.head()

Dask Apply: 100%|██████████| 32/32 [00:01<00:00, 20.17it/s]
Dask Apply: 100%|██████████| 32/32 [00:01<00:00, 21.22it/s]
Dask Apply: 100%|██████████| 32/32 [00:02<00:00, 15.34it/s]
Dask Apply: 100%|██████████| 32/32 [00:02<00:00, 14.56it/s]


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,question1_lemma,question2_lemma,simple_ratio,partial_ratio,token_sort_ratio,token_set_ratio,question1_type,question2_type,question1_length,question2_length,question1_punctuation_count,question2_punctuation_count
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,step step guide invest share market india,step step guide invest share market,71,100,92,100,2,2,66,57,1,1
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,story kohinoor koh noor diamond,would happen indian government steal kohinoor ...,43,91,59,89,2,2,51,88,5,5
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,increase speed internet connection use vpn,internet speed increase hack dns,41,55,70,84,6,6,73,59,1,1
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,mentally lonely solve,find remainder math 23 24 math divide 24 23,25,30,22,23,1,3,50,65,2,10
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,one dissolve water quikly sugar salt methane...,fish would survive salt water,41,53,40,51,-1,-1,76,39,3,1


In [47]:
def create_question_hash(question):
    return question

train_df['question1_hash'] = train_df['question1'].swifter.apply(create_question_hash, axis=1)
train_df['question2_hash'] = train_df['question2'].swifter.apply(create_question_hash, axis=1)

g = nx.Graph()
g.add_nodes_from(train_df.question1_hash)
g.add_nodes_from(train_df.question2_hash)

edges = list(train_df[['question1_hash', 'question2_hash']].to_records(index=False))
g.add_edges_from(edges)

In [49]:
g_avg_degree = sum(dict(g.degree()).values())/float(len(g))
g_stdev = statistics.stdev(dict(g.degree()).values())

def get_degree(question_hash):
    degree = g.degree[question_hash]
    return degree

def get_degree_deviation(question_hash):
    degree = g.degree[question_hash]
    return (degree - g_avg_degree)/g_stdev

def get_second_order_total_degree(question_hash):
    neighbors = g[question_hash]
    return sum([g.degree[neighbor] for neighbor in neighbors])

def get_kcore_max():
    df_output = pd.DataFrame(data=g.nodes(), columns=["qid"])

    NB_CORES = 20

    for k in range(2, NB_CORES + 1):

        fieldname = "kcore{}".format(k)

        print("fieldname = ", fieldname)

        ck = nx.k_core(g, k=k).nodes()

        print("len(ck) = ", len(ck))

        df_output[fieldname] = 0

        df_output.loc[df_output.qid.isin(ck), fieldname] = k
        
    df_output.index.names = ["qid"]

    df_output['max_kcore'] = df_output.apply(lambda row: max(row), axis=1)

def get_kcore():
    NB_CORES = 20

    q1_kcore_cols = []
    q2_kcore_cols = []

    for k in range(2, NB_CORES + 1):
        fieldname = "kcore{}".format(k)
        
        q1_kcore_cols.append(fieldname+'_qid1')
        q2_kcore_cols.append(fieldname+'_qid2')

        ck = nx.k_core(g, k=k).nodes()

        train_df[fieldname+'_qid1'] = 0
        train_df[fieldname+'_qid2'] = 0

        train_df.loc[train_df.qid1.isin(ck), fieldname+'_qid1'] = k
        train_df.loc[train_df.qid2.isin(ck), fieldname+'_qid2'] = k
    
    return q1_kcore_cols, q2_kcore_cols

def get_max_kcore(row, cols):
    return max(row[cols])

def get_graph_features(df, q1_kcore_cols, q2_kcore_cols):
    df['question1_degree'] = df.swifter.apply(lambda r: get_degree(r['question1_hash']), axis=1)
    df['question2_degree'] = df.swifter.apply(lambda r: get_degree(r['question2_hash']), axis=1)
    
    df['question1_degree_deviation'] = df.swifter.apply(lambda r: get_degree_deviation(r['question1_hash']), axis=1)
    df['question2_degree_deviation'] = df.swifter.apply(lambda r: get_degree_deviation(r['question2_hash']), axis=1)
    
    df['question1_second_order_total_degree'] = df.swifter.apply(lambda r: get_second_order_total_degree(r['question1_hash']), axis=1)
    df['question2_second_order_total_degree'] = df.swifter.apply(lambda r: get_second_order_total_degree(r['question2_hash']), axis=1)
    
    df['question1_max_kcore'] = df.swifter.apply(lambda r: get_max_kcore(r, q1_kcore_cols), axis=1)
    df['question2_max_kcore'] = df.swifter.apply(lambda r: get_max_kcore(r, q2_kcore_cols), axis=1)
        
    return df

q1_kcore_cols, q2_kcore_cols = get_kcore()
train_df = get_graph_features(train_df.copy(deep=True), q1_kcore_cols, q2_kcore_cols)
train_df.head()

Dask Apply: 100%|██████████| 32/32 [00:39<00:00,  1.23s/it]
Dask Apply: 100%|██████████| 32/32 [00:38<00:00,  1.20s/it]
Dask Apply: 100%|██████████| 32/32 [00:40<00:00,  1.27s/it]
Dask Apply: 100%|██████████| 32/32 [00:38<00:00,  1.19s/it]
Dask Apply: 100%|██████████| 32/32 [00:39<00:00,  1.22s/it]
Dask Apply: 100%|██████████| 32/32 [00:39<00:00,  1.25s/it]
Dask Apply: 100%|██████████| 32/32 [00:17<00:00,  1.83it/s]
Dask Apply: 100%|██████████| 32/32 [00:17<00:00,  1.83it/s]


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,question1_lemma,question2_lemma,simple_ratio,partial_ratio,...,kcore20_qid1,kcore20_qid2,question1_degree,question2_degree,question1_degree_deviation,question2_degree_deviation,question1_second_order_total_degree,question2_second_order_total_degree,question1_max_kcore,question2_max_kcore
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,step step guide invest share market india,step step guide invest share market,71,100,...,0,0,1,1,-0.264051,-0.264051,1,1,0,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,story kohinoor koh noor diamond,would happen indian government steal kohinoor ...,43,91,...,0,0,4,2,1.305445,0.259114,5,7,0,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,increase speed internet connection use vpn,internet speed increase hack dns,41,55,...,0,0,1,1,-0.264051,-0.264051,1,1,0,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,mentally lonely solve,find remainder math 23 24 math divide 24 23,25,30,...,0,0,1,1,-0.264051,-0.264051,1,1,0,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,one dissolve water quikly sugar salt methane...,fish would survive salt water,41,53,...,0,0,3,1,0.78228,-0.264051,4,3,0,0


In [50]:
def match_share(row):
    q1words = {}
    q2words = {}
    for word in str(row['question1']).lower().split():
        if word not in STOP_WORDS:
            q1words[word] = 1
    for word in str(row['question2']).lower().split():
        if word not in STOP_WORDS:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        return 0
    shared_words_in_q1 = [w for w in q1words.keys() if w in q2words]
    shared_words_in_q2 = [w for w in q2words.keys() if w in q1words]
    R = (len(shared_words_in_q1) + len(shared_words_in_q2))/(len(q1words) + len(q2words))
    return R

train_df['match_share'] = train_df.swifter.apply(match_share, axis=1)

train_df.head()

Dask Apply: 100%|██████████| 32/32 [00:02<00:00, 10.95it/s]


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,question1_lemma,question2_lemma,simple_ratio,partial_ratio,...,kcore20_qid2,question1_degree,question2_degree,question1_degree_deviation,question2_degree_deviation,question1_second_order_total_degree,question2_second_order_total_degree,question1_max_kcore,question2_max_kcore,match_share
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,step step guide invest share market india,step step guide invest share market,71,100,...,0,1,1,-0.264051,-0.264051,1,1,0,0,0.727273
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,story kohinoor koh noor diamond,would happen indian government steal kohinoor ...,43,91,...,0,4,2,1.305445,0.259114,5,7,0,0,0.307692
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,increase speed internet connection use vpn,internet speed increase hack dns,41,55,...,0,1,1,-0.264051,-0.264051,1,1,0,0,0.363636
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,mentally lonely solve,find remainder math 23 24 math divide 24 23,25,30,...,0,1,1,-0.264051,-0.264051,1,1,0,0,0.0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,one dissolve water quikly sugar salt methane...,fish would survive salt water,41,53,...,0,3,1,0.78228,-0.264051,4,3,0,0,0.0


In [51]:
def get_weight(count, eps=10000, min_count=2):
    if count < min_count:
        return 0
    else:
        return 1 / (count + eps)

eps = 5000 
train_qs = pd.Series(train_df['question1'].tolist() + train_df['question2'].tolist()).astype(str)
words = (" ".join(train_qs)).lower().split()
counts = Counter(words)
weights = {word: get_weight(count) for word, count in counts.items()}

In [52]:
def tfidf_word_match_share(row):
    q1words = {}
    q2words = {}
    for word in str(row['question1']).lower().split():
        if word not in STOP_WORDS:
            q1words[word] = 1
    for word in str(row['question2']).lower().split():
        if word not in STOP_WORDS:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        return 0
    
    shared_weights = [weights.get(w, 0) for w in q1words.keys() if w in q2words] + [weights.get(w, 0) for w in q2words.keys() if w in q1words]
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]
    
    R = np.sum(shared_weights) / np.sum(total_weights)
    return R

train_df['tfidf_word_match_share'] = train_df.swifter.apply(tfidf_word_match_share, axis=1)

train_df.head()

Dask Apply: 100%|██████████| 32/32 [00:05<00:00,  5.61it/s]


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,question1_lemma,question2_lemma,simple_ratio,partial_ratio,...,question1_degree,question2_degree,question1_degree_deviation,question2_degree_deviation,question1_second_order_total_degree,question2_second_order_total_degree,question1_max_kcore,question2_max_kcore,match_share,tfidf_word_match_share
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,step step guide invest share market india,step step guide invest share market,71,100,...,1,1,-0.264051,-0.264051,1,1,0,0,0.727273,0.772164
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,story kohinoor koh noor diamond,would happen indian government steal kohinoor ...,43,91,...,4,2,1.305445,0.259114,5,7,0,0,0.307692,0.361758
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,increase speed internet connection use vpn,internet speed increase hack dns,41,55,...,1,1,-0.264051,-0.264051,1,1,0,0,0.363636,0.355191
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,mentally lonely solve,find remainder math 23 24 math divide 24 23,25,30,...,1,1,-0.264051,-0.264051,1,1,0,0,0.0,0.0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,one dissolve water quikly sugar salt methane...,fish would survive salt water,41,53,...,3,1,0.78228,-0.264051,4,3,0,0,0.0,0.0


In [53]:
train_df.to_csv(f'../output/train_{DATASET_NAME}.csv', index=False)