In [1]:
import pandas as pd
from fuzzywuzzy import fuzz
import swifter
from sklearn.model_selection import train_test_split
from nltk.stem.wordnet import WordNetLemmatizer
import re
import string
from nltk.corpus import stopwords
from collections import Counter
import networkx as nx


In [2]:
train = pd.read_csv('../data/train.csv.zip').dropna()
train_df, valid_df = train_test_split(train, test_size=0.2)
train_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
355842,355842,485085,485086,What makes a person happy and wise?,What is there that makes a person happy?,0
227595,227595,36744,336492,Is it possible to get pregnant if you touch a ...,Is it possible to get pregnant a day after my ...,0
268605,268605,386200,386201,Shirting fabric in kg Mumbai?,What is the highest used application of multif...,0
173603,173603,267774,267775,"What is the difference between a concept, an i...","What lies beyond thoughts, forms and concepts?",0
295,295,590,591,What will be the effect of banning 500 and 100...,What will be the effect of the ban of the 1000...,1


In [3]:
DATASET_NAME = 'dataset_v2'
STOP_WORDS = set(stopwords.words('english'))
EMBEDDINGS_FILE = ''

In [4]:
lemmatizer = WordNetLemmatizer()

def clean(uncleaned_string):
    cleaned_string = uncleaned_string.lower()
    cleaned_string = cleaned_string.translate(str.maketrans(dict.fromkeys(string.punctuation, ' ')))

    return cleaned_string
    

def preprocess(question_text):
    processed_string = ' '.join([lemmatizer.lemmatize(lemmatizer.lemmatize(word, 'n'), 'v') for word in question_text.split(' ')])
    
    processed_string = clean(processed_string)
    
    words = processed_string.split(' ')
    
    processed_string = " ".join(word for word in words if word not in STOP_WORDS)
    
    return processed_string

train_df['question1_lemma'] = train_df['question1'].swifter.apply(preprocess)
train_df['question2_lemma'] = train_df['question2'].swifter.apply(preprocess)

valid_df['question1_lemma'] = valid_df['question1'].swifter.apply(preprocess)
valid_df['question2_lemma'] = valid_df['question2'].swifter.apply(preprocess)

train_df.head()

Pandas Apply:  10%|█         | 33638/323429 [00:01<00:13, 21333.07it/s]


KeyboardInterrupt: 

In [None]:
def add_fuzzywuzzy_features(df):
    df['simple_ratio'] = df.swifter.apply(lambda x: fuzz.ratio(x['question1_lemma'], x['question2']), axis=1)
    df['partial_ratio'] = df.swifter.apply(lambda x: fuzz.partial_ratio(x['question1_lemma'], x['question2_lemma']), axis=1)
    df['token_sort_ratio'] = df.swifter.apply(lambda x: fuzz.token_sort_ratio(x['question1_lemma'], x['question2_lemma']), axis=1)
    df['token_set_ratio'] = df.swifter.apply(lambda x: fuzz.token_set_ratio(x['question1_lemma'], x['question2_lemma']), axis=1)
    return df

train_df = add_fuzzywuzzy_features(train_df.copy(deep=True))
valid_df = add_fuzzywuzzy_features(valid_df.copy(deep=True))

train_df.head()

Pandas Apply: 100%|██████████| 323429/323429 [00:04<00:00, 75033.08it/s]
Pandas Apply: 100%|██████████| 323429/323429 [00:10<00:00, 31505.84it/s]
Pandas Apply: 100%|██████████| 323429/323429 [00:06<00:00, 47755.56it/s]
Pandas Apply: 100%|██████████| 323429/323429 [00:08<00:00, 36894.95it/s]
Pandas Apply: 100%|██████████| 80858/80858 [00:00<00:00, 83539.50it/s]
Pandas Apply: 100%|██████████| 80858/80858 [00:02<00:00, 32427.24it/s]
Pandas Apply: 100%|██████████| 80858/80858 [00:01<00:00, 42469.52it/s]
Pandas Apply: 100%|██████████| 80858/80858 [00:02<00:00, 37407.70it/s]


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,question1_lemma,question2_lemma,simple_ratio,partial_ratio,token_sort_ratio,token_set_ratio
224553,224553,371,27778,What was the significance of the battle of Som...,What was the significance of the battle of Som...,1,wa significance battle somme battle compare c...,wa significance battle somme battle compare c...,64,91,91,93
31197,31197,57541,8255,How do I get meth out of my system in 2 days?,How can I get meth out of my system ASAP?,1,get meth system 2 days,get meth system asap,50,81,86,86
310228,310228,23108,29504,Why is salt water taffy candy imported in France?,Why is saltwater taffy candy imported in Brazil?,1,salt water taffy candy import france,saltwater taffy candy import brazil,73,86,65,87
15827,15827,30203,30204,What is the best way to take a picture with a ...,How do I take good pictures with my phone?,1,best way take picture phone,take good picture phone,57,79,76,88
128555,128555,206708,206709,Who is the most beautiful actress in China?,Who is the most beautiful actress in Europe?,0,beautiful actress china,beautiful actress europe,62,75,77,85


In [None]:
""" TODO: Add the following features
 2. topic classification to categorize the question and check if the categories/topics are the same for both questions (perhaps a pretrained topic classifier)
 3. break down question into parts e.g. sentences, separated by comma, logical separators (like, and, or, etc.), etc.
 4. Research features that have 
 5. Borrow ideas from that other guys solution but make sure the reasoning is supported so it doesnt look like im copying.
"""

' TODO: Add the following features\n 2. topic classification to categorize the question and check if the categories/topics are the same for both questions (perhaps a pretrained topic classifier)\n 3. break down question into parts e.g. sentences, separated by comma, logical separators (like, and, or, etc.), etc.\n 4. Research features that have \n 5. Borrow ideas from that other guys solution but make sure the reasoning is supported so it doesnt look like im copying.\n'

In [None]:
def get_question_type(question_text):
    """Look for the question type by looking for certain keywords, if multiple
    keywords are found take the leftmost one.
    """
    
    question_text = clean(question_text)
    
    question_type_map = {
        'why': 1,
        'what': 2,
        'when': 3,
        'where': 4,
        'who': 5,
        'how': 6
    }
    
    found_keywords = []
    for question_type in question_type_map:
        if question_type in question_text:
            found_keywords.append((question_type, question_text.index(question_type)))
           
    found_keywords.sort(key=lambda x: x[1])
            
    return question_type_map[found_keywords[0][0]] if found_keywords else -1
    

def add_question_type(df):
    """
        Checks the question type, the type can be any of the 5W 1H why, what, when, where, who and how or n/a. The types are
        stored as integers from 1 to 6 and -1 for n/a.
            -1 -> n/a
            1 -> why
            2-> what
            3 -> when
            4 -> where
            5 -> who
            6 -> how
    """
    
    df['question1_type'] = df.swifter.apply(lambda r: get_question_type(r['question1']), axis=1)
    df['question2_type'] = df.swifter.apply(lambda r: get_question_type(r['question2']), axis=1)
    
    return df

train_df = add_question_type(train_df.copy(deep=True))
valid_df = add_question_type(valid_df.copy(deep=True))

train_df.head()

Pandas Apply: 100%|██████████| 323429/323429 [00:03<00:00, 89455.56it/s]
Pandas Apply: 100%|██████████| 323429/323429 [00:03<00:00, 90113.75it/s]
Pandas Apply: 100%|██████████| 80858/80858 [00:00<00:00, 98612.23it/s] 
Pandas Apply: 100%|██████████| 80858/80858 [00:00<00:00, 96952.03it/s] 


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,question1_lemma,question2_lemma,simple_ratio,partial_ratio,token_sort_ratio,token_set_ratio,question1_type,question2_type
224553,224553,371,27778,What was the significance of the battle of Som...,What was the significance of the battle of Som...,1,wa significance battle somme battle compare c...,wa significance battle somme battle compare c...,64,91,91,93,2,2
31197,31197,57541,8255,How do I get meth out of my system in 2 days?,How can I get meth out of my system ASAP?,1,get meth system 2 days,get meth system asap,50,81,86,86,6,6
310228,310228,23108,29504,Why is salt water taffy candy imported in France?,Why is saltwater taffy candy imported in Brazil?,1,salt water taffy candy import france,saltwater taffy candy import brazil,73,86,65,87,1,1
15827,15827,30203,30204,What is the best way to take a picture with a ...,How do I take good pictures with my phone?,1,best way take picture phone,take good picture phone,57,79,76,88,2,6
128555,128555,206708,206709,Who is the most beautiful actress in China?,Who is the most beautiful actress in Europe?,0,beautiful actress china,beautiful actress europe,62,75,77,85,5,5


In [None]:
def punctuation_count(question):
    counts = Counter(question)
    return sum(counts[p] for p in string.punctuation)


def add_misc_features(df):
    df['question1_length'] = df.swifter.apply(lambda r: len(r['question1']), axis=1)
    df['question2_length'] = df.swifter.apply(lambda r: len(r['question2']), axis=1)

    df['question1_punctuation_count'] = df.swifter.apply(lambda r: punctuation_count(r['question1']), axis=1)
    df['question2_punctuation_count'] = df.swifter.apply(lambda r: punctuation_count(r['question2']), axis=1)

    return df


train_df = add_misc_features(train_df.copy(deep=True))
valid_df = add_misc_features(valid_df.copy(deep=True))

train_df.head()

Pandas Apply: 100%|██████████| 323429/323429 [00:02<00:00, 148712.66it/s]
Pandas Apply: 100%|██████████| 323429/323429 [00:01<00:00, 197637.11it/s]
Pandas Apply: 100%|██████████| 323429/323429 [00:04<00:00, 68282.67it/s]
Pandas Apply: 100%|██████████| 323429/323429 [00:04<00:00, 70739.55it/s]
Pandas Apply: 100%|██████████| 80858/80858 [00:00<00:00, 197465.15it/s]
Pandas Apply: 100%|██████████| 80858/80858 [00:00<00:00, 193868.69it/s]
Pandas Apply: 100%|██████████| 80858/80858 [00:01<00:00, 76518.08it/s]
Pandas Apply: 100%|██████████| 80858/80858 [00:01<00:00, 74945.19it/s]


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,question1_lemma,question2_lemma,simple_ratio,partial_ratio,token_sort_ratio,token_set_ratio,question1_type,question2_type,question1_length,question2_length,question1_punctuation_count,question2_punctuation_count
224553,224553,371,27778,What was the significance of the battle of Som...,What was the significance of the battle of Som...,1,wa significance battle somme battle compare c...,wa significance battle somme battle compare c...,64,91,91,93,2,2,119,123,2,2
31197,31197,57541,8255,How do I get meth out of my system in 2 days?,How can I get meth out of my system ASAP?,1,get meth system 2 days,get meth system asap,50,81,86,86,6,6,45,41,1,1
310228,310228,23108,29504,Why is salt water taffy candy imported in France?,Why is saltwater taffy candy imported in Brazil?,1,salt water taffy candy import france,saltwater taffy candy import brazil,73,86,65,87,1,1,49,48,1,1
15827,15827,30203,30204,What is the best way to take a picture with a ...,How do I take good pictures with my phone?,1,best way take picture phone,take good picture phone,57,79,76,88,2,6,52,42,1,1
128555,128555,206708,206709,Who is the most beautiful actress in China?,Who is the most beautiful actress in Europe?,0,beautiful actress china,beautiful actress europe,62,75,77,85,5,5,43,44,1,1


In [None]:
def create_question_hash(question):
    # TODO: create a proper hash
    return question

train_df['question1_hash'] = train_df['question1'].swifter.apply(create_question_hash, axis=1)
train_df['question2_hash'] = train_df['question2'].swifter.apply(create_question_hash, axis=1)
valid_df['question1_hash'] = valid_df['question1'].swifter.apply(create_question_hash, axis=1)
valid_df['question2_hash'] = valid_df['question2'].swifter.apply(create_question_hash, axis=1)

g = nx.Graph()
g.add_nodes_from(train_df.question1_hash)
g.add_nodes_from(train_df.question2_hash)

g.add_nodes_from(valid_df.question2_hash)
g.add_nodes_from(valid_df.question2_hash)

edges = list(train_df[['question1_hash', 'question2_hash']].to_records(index=False))
g.add_edges_from(edges)

edges = list(valid_df[['question1_hash', 'question2_hash']].to_records(index=False))
g.add_edges_from(edges)

In [None]:
g_degree = sum(dict(g.degree()).values())/float(len(g))

def get_degree(question_hash):
    degree = g.degree[question_hash]
    return degree

def get_degree_deviation(question_hash):
    degree = g.degree[question_hash]
    return degree - g_degree

def get_graph_features(df):
    df['question1_degree'] = df.swifter.apply(lambda r: get_degree(r['question1_hash']), axis=1)
    df['question2_degree'] = df.swifter.apply(lambda r: get_degree(r['question2_hash']), axis=1)
    
    df['question1_degree_deviation'] = df.swifter.apply(lambda r: get_degree_deviation(r['question1_hash']), axis=1)
    df['question2_degree_deviation'] = df.swifter.apply(lambda r: get_degree_deviation(r['question2_hash']), axis=1)
    return df

train_df = get_graph_features(train_df.copy(deep=True))
valid_df = get_graph_features(valid_df.copy(deep=True))
train_df.head()

Pandas Apply: 100%|██████████| 323429/323429 [00:02<00:00, 120940.66it/s]
Pandas Apply: 100%|██████████| 323429/323429 [00:02<00:00, 133318.25it/s]
Pandas Apply: 100%|██████████| 323429/323429 [00:02<00:00, 150078.58it/s]
Pandas Apply: 100%|██████████| 323429/323429 [00:02<00:00, 120903.47it/s]
Pandas Apply: 100%|██████████| 80858/80858 [00:00<00:00, 145535.77it/s]
Pandas Apply: 100%|██████████| 80858/80858 [00:00<00:00, 148842.58it/s]
Pandas Apply: 100%|██████████| 80858/80858 [00:00<00:00, 150399.22it/s]
Pandas Apply: 100%|██████████| 80858/80858 [00:00<00:00, 148908.33it/s]


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,question1_lemma,question2_lemma,simple_ratio,partial_ratio,...,question1_length,question2_length,question1_punctuation_count,question2_punctuation_count,question1_hash,question2_hash,question1_degree,question2_degree,question1_degree_deviation,question2_degree_deviation
224553,224553,371,27778,What was the significance of the battle of Som...,What was the significance of the battle of Som...,1,wa significance battle somme battle compare c...,wa significance battle somme battle compare c...,64,91,...,119,123,2,2,What was the significance of the battle of Som...,What was the significance of the battle of Som...,20,20,18.495282,18.495282
31197,31197,57541,8255,How do I get meth out of my system in 2 days?,How can I get meth out of my system ASAP?,1,get meth system 2 days,get meth system asap,50,81,...,45,41,1,1,How do I get meth out of my system in 2 days?,How can I get meth out of my system ASAP?,17,17,15.495282,15.495282
310228,310228,23108,29504,Why is salt water taffy candy imported in France?,Why is saltwater taffy candy imported in Brazil?,1,salt water taffy candy import france,saltwater taffy candy import brazil,73,86,...,49,48,1,1,Why is salt water taffy candy imported in France?,Why is saltwater taffy candy imported in Brazil?,26,27,24.495282,25.495282
15827,15827,30203,30204,What is the best way to take a picture with a ...,How do I take good pictures with my phone?,1,best way take picture phone,take good picture phone,57,79,...,52,42,1,1,What is the best way to take a picture with a ...,How do I take good pictures with my phone?,1,1,-0.504718,-0.504718
128555,128555,206708,206709,Who is the most beautiful actress in China?,Who is the most beautiful actress in Europe?,0,beautiful actress china,beautiful actress europe,62,75,...,43,44,1,1,Who is the most beautiful actress in China?,Who is the most beautiful actress in Europe?,2,2,0.495282,0.495282


In [None]:
train_df.to_csv(f'../output/train_{DATASET_NAME}.csv', index=False)
valid_df.to_csv(f'../output/valid_{DATASET_NAME}.csv', index=False)