In [39]:
import pandas as pd
from fuzzywuzzy import fuzz
import swifter
from sklearn.model_selection import train_test_split
from nltk.stem.wordnet import WordNetLemmatizer
import re
import string
from nltk.corpus import stopwords

In [40]:
train = pd.read_csv('../data/train.csv.zip').dropna()
train_df, valid_df = train_test_split(train, test_size=0.2)
train_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
227692,227692,336605,336606,What are the negative consequences of polyamory?,Could Russia have prevented WWI?,0
150484,150484,236840,236841,Is Zeus still powerful without his thunderbolts?,Is Zeus powerful without his thunderbolts and ...,1
183632,183632,280761,280762,How do we buy a house?,How do you buy a house?,1
342846,342846,470884,106122,How do I bring back dead one?,Will science ever make it possible to bring ba...,0
403822,403822,537442,372925,How will rising sea levels and subsequent tida...,How would I calculate the visible surface area...,0


In [41]:
DATASET_NAME = 'dataset_v2'
STOP_WORDS = set(stopwords.words('english'))
EMBEDDINGS_FILE = ''

In [42]:
lemmatizer = WordNetLemmatizer()

def clean(uncleaned_string):
    cleaned_string = uncleaned_string.lower()
    cleaned_string = cleaned_string.translate(str.maketrans(dict.fromkeys(string.punctuation, ' ')))

    return cleaned_string
    

def preprocess(question_text):
    processed_string = ' '.join([lemmatizer.lemmatize(lemmatizer.lemmatize(word, 'n'), 'v') for word in question_text.split(' ')])
    
    processed_string = clean(processed_string)
    
    words = processed_string.split(' ')
    
    processed_string = " ".join(word for word in words if word not in STOP_WORDS)
    
    return processed_string

train_df['question1_lemma'] = train_df['question1'].swifter.apply(preprocess)
train_df['question2_lemma'] = train_df['question2'].swifter.apply(preprocess)

valid_df['question1_lemma'] = valid_df['question1'].swifter.apply(preprocess)
valid_df['question2_lemma'] = valid_df['question2'].swifter.apply(preprocess)

train_df.head()

Pandas Apply: 100%|██████████| 323429/323429 [00:14<00:00, 22267.11it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['question1_lemma'] = train_df['question1'].swifter.apply(preprocess)
Pandas Apply: 100%|██████████| 323429/323429 [00:14<00:00, 22221.72it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['question2_lemma'] = train_df['question2'].swifter.apply(preprocess)
Pandas Apply: 100%|██████████| 80858/80858 [00:03<00:00, 22490.96it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,question1_lemma,question2_lemma
227692,227692,336605,336606,What are the negative consequences of polyamory?,Could Russia have prevented WWI?,0,negative consequence polyamory,could russia prevent wwi
150484,150484,236840,236841,Is Zeus still powerful without his thunderbolts?,Is Zeus powerful without his thunderbolts and ...,1,zeus still powerful without thunderbolts,zeus powerful without thunderbolt
183632,183632,280761,280762,How do we buy a house?,How do you buy a house?,1,buy house,buy house
342846,342846,470884,106122,How do I bring back dead one?,Will science ever make it possible to bring ba...,0,bring back dead one,science ever make possible bring back dead
403822,403822,537442,372925,How will rising sea levels and subsequent tida...,How would I calculate the visible surface area...,0,rise sea level subsequent tidal change affect ...,would calculate visible surface area moon caus...


In [43]:
def add_fuzzywuzzy_features(df):
    df['simple_ratio'] = df.swifter.apply(lambda x: fuzz.ratio(x['question1_lemma'], x['question2']), axis=1)
    df['partial_ratio'] = df.swifter.apply(lambda x: fuzz.partial_ratio(x['question1_lemma'], x['question2_lemma']), axis=1)
    df['token_sort_ratio'] = df.swifter.apply(lambda x: fuzz.token_sort_ratio(x['question1_lemma'], x['question2_lemma']), axis=1)
    df['token_set_ratio'] = df.swifter.apply(lambda x: fuzz.token_set_ratio(x['question1_lemma'], x['question2_lemma']), axis=1)
    return df

train_df = add_fuzzywuzzy_features(train_df.copy(deep=True))
valid_df = add_fuzzywuzzy_features(valid_df.copy(deep=True))

train_df.head()

Pandas Apply: 100%|██████████| 323429/323429 [00:04<00:00, 76138.87it/s]
Pandas Apply: 100%|██████████| 323429/323429 [00:10<00:00, 31414.44it/s]
Pandas Apply: 100%|██████████| 323429/323429 [00:07<00:00, 45408.96it/s]
Pandas Apply: 100%|██████████| 323429/323429 [00:08<00:00, 36648.88it/s]
Pandas Apply: 100%|██████████| 80858/80858 [00:00<00:00, 83141.55it/s]
Pandas Apply: 100%|██████████| 80858/80858 [00:02<00:00, 31212.66it/s]
Pandas Apply: 100%|██████████| 80858/80858 [00:01<00:00, 49639.36it/s]
Pandas Apply: 100%|██████████| 80858/80858 [00:02<00:00, 35625.20it/s]


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,question1_lemma,question2_lemma,simple_ratio,partial_ratio,token_sort_ratio,token_set_ratio
227692,227692,336605,336606,What are the negative consequences of polyamory?,Could Russia have prevented WWI?,0,negative consequence polyamory,could russia prevent wwi,29,32,33,33
150484,150484,236840,236841,Is Zeus still powerful without his thunderbolts?,Is Zeus powerful without his thunderbolts and ...,1,zeus still powerful without thunderbolts,zeus powerful without thunderbolt,75,88,90,90
183632,183632,280761,280762,How do we buy a house?,How do you buy a house?,1,buy house,buy house,55,100,100,100
342846,342846,470884,106122,How do I bring back dead one?,Will science ever make it possible to bring ba...,0,bring back dead one,science ever make possible bring back dead,38,89,62,88
403822,403822,537442,372925,How will rising sea levels and subsequent tida...,How would I calculate the visible surface area...,0,rise sea level subsequent tidal change affect ...,would calculate visible surface area moon caus...,44,48,48,51


In [44]:
""" TODO: Add the following features
 1. Check the type of question create a feature for each 5W1H and give them a value of 0 or 1 for both questions
 2. topic classification to categorize the question and check if the categories/topics are the same for both questions (perhaps a pretrained topic classifier)
 3. break down question into parts e.g. sentences, separated by comma, logical separators (like, and, or, etc.), etc.
 4. Research features that have 
 5. Borrow ideas from that other guys solution but make sure the reasoning is supported so it doesnt look like im copying.
"""

' TODO: Add the following features\n 1. Check the type of question create a feature for each 5W1H and give them a value of 0 or 1 for both questions\n 2. topic classification to categorize the question and check if the categories/topics are the same for both questions (perhaps a pretrained topic classifier)\n 3. break down question into parts e.g. sentences, separated by comma, logical separators (like, and, or, etc.), etc.\n 4. Research features that have \n 5. Borrow ideas from that other guys solution but make sure the reasoning is supported so it doesnt look like im copying.\n'

In [45]:
def get_question_type(question_text):
    """Look for the question type by looking for certain keywords, if multiple
    keywords are found take the leftmost one.
    """
    
    question_text = clean(question_text)
    
    question_type_map = {
        'why': 1,
        'what': 2,
        'when': 3,
        'where': 4,
        'who': 5,
        'how': 6
    }
    
    found_keywords = []
    for question_type in question_type_map:
        if question_type in question_text:
            found_keywords.append((question_type, question_text.index(question_type)))
           
    found_keywords.sort(key=lambda x: x[1])
            
    return question_type_map[found_keywords[0][0]] if found_keywords else -1
    

def add_question_type(df):
    """
        Checks the question type, the type can be any of the 5W 1H why, what, when, where, who and how or n/a. The types are
        stored as integers from 1 to 6 and -1 for n/a.
            -1 -> n/a
            1 -> why
            2-> what
            3 -> when
            4 -> where
            5 -> who
            6 -> how
    """
    
    df['question1_type'] = df.swifter.apply(lambda r: get_question_type(r['question1']), axis=1)
    df['question2_type'] = df.swifter.apply(lambda r: get_question_type(r['question2']), axis=1)
    
    return df

train_df = add_question_type(train_df.copy(deep=True))
valid_df = add_question_type(valid_df.copy(deep=True))

train_df.head()

Pandas Apply: 100%|██████████| 323429/323429 [00:03<00:00, 89408.81it/s]
Pandas Apply: 100%|██████████| 323429/323429 [00:03<00:00, 83890.52it/s]
Pandas Apply: 100%|██████████| 80858/80858 [00:00<00:00, 93960.48it/s]
Pandas Apply: 100%|██████████| 80858/80858 [00:00<00:00, 98102.80it/s] 


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,question1_lemma,question2_lemma,simple_ratio,partial_ratio,token_sort_ratio,token_set_ratio,question1_type,question2_type
227692,227692,336605,336606,What are the negative consequences of polyamory?,Could Russia have prevented WWI?,0,negative consequence polyamory,could russia prevent wwi,29,32,33,33,2,-1
150484,150484,236840,236841,Is Zeus still powerful without his thunderbolts?,Is Zeus powerful without his thunderbolts and ...,1,zeus still powerful without thunderbolts,zeus powerful without thunderbolt,75,88,90,90,-1,1
183632,183632,280761,280762,How do we buy a house?,How do you buy a house?,1,buy house,buy house,55,100,100,100,6,6
342846,342846,470884,106122,How do I bring back dead one?,Will science ever make it possible to bring ba...,0,bring back dead one,science ever make possible bring back dead,38,89,62,88,6,-1
403822,403822,537442,372925,How will rising sea levels and subsequent tida...,How would I calculate the visible surface area...,0,rise sea level subsequent tidal change affect ...,would calculate visible surface area moon caus...,44,48,48,51,6,6


In [46]:
train_df.to_csv(f'../output/train_{DATASET_NAME}.csv', index=False)
valid_df.to_csv(f'../output/valid_{DATASET_NAME}.csv', index=False)