In [21]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from __future__ import division
import re, time, os, gc, datetime
import sys
import string

import numpy as np
import pandas as pd
import scipy
import config
from utils import dist_utils, ngram_utils, nlp_utils, np_utils
from utils import logging_utils, time_utils, pkl_utils
import argparse
import functools
from collections import defaultdict
from nltk.corpus import stopwords
from collections import Counter

from multiprocessing import Pool, cpu_count
num_partitions = cpu_count() #number of partitions to split dataframe
num_cores = cpu_count() #number of cores on your machine
print cpu_count()

def parallelize_dataframe(df, func):
    df_split = np.array_split(df, num_partitions)
    pool = Pool(num_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df


# df =  pd.read_csv(config.RAW_PATH+'train.csv',nrows=200000)
# df['question1'] = df['question1'].astype(str)
# df['question2'] = df['question2'].astype(str)
# train = df.reset_index()

28


In [22]:
train_orig =  pd.read_csv(config.RAW_PATH+'train.csv', header=0)#.sample(n=1000)
test_orig =  pd.read_csv(config.RAW_PATH+'test.csv', header=0)#.sample(n=1000)
test_orig['is_duplicate'] = -1

train = pd.concat([train_orig[['question1', 'question2','is_duplicate']], \
        test_orig[['question1', 'question2','is_duplicate']]], axis=0).reset_index(drop=True)
train['question1'] = train['question1'].astype(str)
train['question2'] = train['question2'].astype(str)
del train_orig, test_orig
gc.collect()
train.head()

165

Unnamed: 0,question1,question2,is_duplicate
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [23]:
def word_match_share(row, stops=None):
    q1words = {}
    q2words = {}
    for word in row['question1']:
        if word not in stops:
            q1words[word] = 1
    for word in row['question2']:
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    shared_words_in_q1 = [w for w in q1words.keys() if w in q2words]
    shared_words_in_q2 = [w for w in q2words.keys() if w in q1words]
    R = (len(shared_words_in_q1) + len(shared_words_in_q2))/(len(q1words) + len(q2words))
    return R

def jaccard(row):
    wic = set(row['question1']).intersection(set(row['question2']))
    uw = set(row['question1']).union(row['question2'])
    if len(uw) == 0:
        uw = [1]
    return (len(wic) / len(uw))

def common_words(row):
    return len(set(row['question1']).intersection(set(row['question2'])))

def total_unique_words(row):
    return len(set(row['question1']).union(row['question2']))

def total_unq_words_stop(row, stops):
    return len([x for x in set(row['question1']).union(row['question2']) if x not in stops])

def wc_diff(row):
    return abs(len(row['question1']) - len(row['question2']))

def wc_ratio(row):
    l1 = len(row['question1'])*1.0 
    l2 = len(row['question2'])
    if l2 == 0:
        return np.nan
    if l1 / l2:
        return l2 / l1
    else:
        return l1 / l2

def wc_diff_unique(row):
    return abs(len(set(row['question1'])) - len(set(row['question2'])))

def wc_ratio_unique(row):
    l1 = len(set(row['question1'])) * 1.0
    l2 = len(set(row['question2']))
    if l2 == 0:
        return np.nan
    if l1 / l2:
        return l2 / l1
    else:
        return l1 / l2

def wc_diff_unique_stop(row, stops=None):
    return abs(len([x for x in set(row['question1']) if x not in stops]) - len([x for x in set(row['question2']) if x not in stops]))

def wc_ratio_unique_stop(row, stops=None):
    l1 = len([x for x in set(row['question1']) if x not in stops])*1.0 
    l2 = len([x for x in set(row['question2']) if x not in stops])
    if l2 == 0:
        return np.nan
    if l1 / l2:
        return l2 / l1
    else:
        return l1 / l2

def same_start_word(row):
    if not row['question1'] or not row['question2']:
        return np.nan
    return int(row['question1'][0] == row['question2'][0])

def char_diff(row):
    return abs(len(''.join(row['question1'])) - len(''.join(row['question2'])))

def char_ratio(row):
    l1 = len(''.join(row['question1'])) 
    l2 = len(''.join(row['question2']))
    if l2 == 0:
        return np.nan
    if l1 / l2:
        return l2 / l1
    else:
        return l1 / l2

def char_diff_unique_stop(row, stops=None):
    return abs(len(''.join([x for x in set(row['question1']) if x not in stops])) - len(''.join([x for x in set(row['question2']) if x not in stops])))


def get_weight(count, eps=10000, min_count=2):
    if count < min_count:
        return 0
    else:
        return 1 / (count + eps)
    
def tfidf_word_match_share_stops(row, stops=None, weights=None):
    q1words = {}
    q2words = {}
    for word in row['question1']:
        if word not in stops:
            q1words[word] = 1
    for word in row['question2']:
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    shared_weights = [weights.get(w,0) for w in q1words.keys() if w in q2words] +\
                 [weights.get(w,0) for w in q2words.keys() if w in q1words]
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]
    
    R = np.sum(shared_weights) / np.sum(total_weights)
    return R

def tfidf_word_match_share(row, weights=None):
    q1words = {}
    q2words = {}
    for word in row['question1']:
        q1words[word] = 1
    for word in row['question2']:
        q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    
    shared_weights = [weights.get(w, 0) for w in q1words.keys() if w in q2words] + [weights.get(w, 0) for w in q2words.keys() if w in q1words]
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]
    
    R = np.sum(shared_weights) / np.sum(total_weights)
    return R


In [24]:
def build_features(train):
    f = functools.partial(word_match_share, stops=stops)
    train['word_match'] = train.apply(f, axis=1, raw=True) #1

    f = functools.partial(tfidf_word_match_share, weights=weights)
    train['tfidf_wm'] = train.apply(f, axis=1, raw=True) #2

    f = functools.partial(tfidf_word_match_share_stops, stops=stops, weights=weights)
    train['tfidf_wm_stops'] = train.apply(f, axis=1, raw=True) #3

    train['jaccard'] = train.apply(jaccard, axis=1, raw=True) #4
    train['wc_diff'] = train.apply(wc_diff, axis=1, raw=True) #5
    train['wc_ratio'] = train.apply(wc_ratio, axis=1, raw=True) #6
    train['wc_diff_unique'] = train.apply(wc_diff_unique, axis=1, raw=True) #7
    train['wc_ratio_unique'] = train.apply(wc_ratio_unique, axis=1, raw=True) #8

    f = functools.partial(wc_diff_unique_stop, stops=stops)    
    train['wc_diff_unq_stop'] = train.apply(f, axis=1, raw=True) #9
    f = functools.partial(wc_ratio_unique_stop, stops=stops)    
    train['wc_ratio_unique_stop'] = train.apply(f, axis=1, raw=True) #10

    train['same_start'] = train.apply(same_start_word, axis=1, raw=True) #11
    train['char_diff'] = train.apply(char_diff, axis=1, raw=True) #12

    f = functools.partial(char_diff_unique_stop, stops=stops) 
    train['char_diff_unq_stop'] = train.apply(f, axis=1, raw=True) #13
#     X['common_words'] = data.apply(common_words, axis=1, raw=True)  #14
    train['total_unique_words'] = train.apply(total_unique_words, axis=1, raw=True)  #15

    f = functools.partial(total_unq_words_stop, stops=stops)
    train['total_unq_words_stop'] = train.apply(f, axis=1, raw=True)  #16
    train['char_ratio'] = train.apply(char_ratio, axis=1, raw=True) #17    

    return train

In [25]:
from text_clean import *
train['question1'] = train['question1'].astype(str).apply(lambda x:text_to_wordlist(x))
train['question2'] = train['question2'].astype(str).apply(lambda x:text_to_wordlist(x))
train['question1'] = train['question1'].astype(str).apply(lambda x:substitute_thousands(x))
train['question2'] = train['question2'].astype(str).apply(lambda x:substitute_thousands(x))
# train = abbr_clean(train)
train['question1'] = train['question1'].map(lambda x: str(x).lower().split())
train['question2'] = train['question2'].map(lambda x: str(x).lower().split())


stops = set(["http","www","img","border","home","body","a","about","above","after","again","against","all","am","an",
"and","any","are","aren't","as","at","be","because","been","before","being","below","between","both","but","by","can't",
"cannot","could","couldn't","did","didn't","do","does","doesn't","doing","don't","down","during","each","few","for","from",
"further","had","hadn't","has","hasn't","have","haven't","having","he","he'd","he'll","he's","her","here","here's","hers",
"herself","him","himself","his","how","how's","i","i'd","i'll","i'm","i've","if","in","into","is","isn't","it","it's","its",
"itself","let's","me","more","most","mustn't","my","myself","no","nor","not","of","off","on","once","only","or","other","ought",
"our","ours","ourselves","out","over","own","same","shan't","she","she'd","she'll","she's","should","shouldn't","so","some","such",
"than","that","that's","the","their","theirs","them","themselves","then","there","there's","these","they","they'd","they'll","they're",
"they've","this","those","through","to","too","under","until","up","very","was","wasn't","we","we'd","we'll","we're","we've","were",
"weren't","what","what's","when","when's""where","where's","which","while","who","who's","whom","why","why's","with","won't","would",
"wouldn't","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves" ])
stops = set(['the','a','an','and','but','if','or','because','as','what','which','this','that','these','those','then',
              'just','so','than','such','both','through','about','for','is','of','while','during','to','what','which',
              'is','if','while','this'])
stops = set(stopwords.words("english"))

train_qs = pd.Series(train['question1'].tolist() + train['question2'].tolist())
words = [x for y in train_qs for x in y]
counts = Counter(words)
weights = {word: get_weight(count) for word, count in counts.items()}

train.head()

Unnamed: 0,question1,question2,is_duplicate
0,"[what, is, the, step, by, step, guide, to, inv...","[what, is, the, step, by, step, guide, to, inv...",0
1,"[what, is, the, story, of, kohinoor, (koh-i-no...","[what, would, happen, if, the, indian, governm...",0
2,"[how, can, i, increase, the, speed, of, my, in...","[how, can, internet, speed, be, increased, by,...",0
3,"[why, am, i, mentally, very, lonely?, how, can...","[find, the, remainder, when, [math]23^{24}[/ma...",0
4,"[which, one, dissolve, in, water, quickly, sug...","[which, fish, would, survive, in, salt, water?]",0


In [26]:
print('Building Features')
# train = build_features(train)
train = parallelize_dataframe(train, build_features)

Building Features


In [27]:
train.to_csv(config.FEAT_PATH+'feat_158_stpf_clean.csv',index=False)

In [7]:
train[train['is_duplicate']!=-1].corr()

Unnamed: 0,is_duplicate,word_match,tfidf_wm,tfidf_wm_stops,jaccard,wc_diff,wc_ratio,wc_diff_unique,wc_ratio_unique,wc_diff_unq_stop,wc_ratio_unique_stop,same_start,char_diff,char_diff_unq_stop,total_unique_words,total_unq_words_stop,char_ratio
is_duplicate,1.0,0.45674,0.42672,0.424473,0.344256,-0.201759,-0.072018,-0.20767,-0.065854,-0.214178,-0.066107,0.203637,-0.211784,-0.222334,-0.289463,-0.302564,-0.044462
word_match,0.45674,1.0,0.933074,0.929834,0.84531,-0.393913,-0.138327,-0.417275,-0.130146,-0.426188,-0.140378,0.365201,-0.402031,-0.42316,-0.506505,-0.517122,-0.098176
tfidf_wm,0.42672,0.933074,1.0,0.997003,0.77926,-0.368303,-0.126823,-0.389549,-0.119108,-0.392971,-0.125137,0.312027,-0.377275,-0.397022,-0.481682,-0.493831,-0.090756
tfidf_wm_stops,0.424473,0.929834,0.997003,1.0,0.755014,-0.360898,-0.124247,-0.380887,-0.116534,-0.386744,-0.123681,0.295058,-0.370341,-0.3917,-0.476174,-0.491886,-0.089168
jaccard,0.344256,0.84531,0.77926,0.755014,1.0,-0.398219,-0.137928,-0.431076,-0.131276,-0.41219,-0.12756,0.568942,-0.394437,-0.397736,-0.484933,-0.4467,-0.094975
wc_diff,-0.201759,-0.393913,-0.368303,-0.360898,-0.398219,1.0,0.382017,0.96198,0.33462,0.853699,0.316312,-0.257883,0.922882,0.779298,0.642812,0.611834,0.281046
wc_ratio,-0.072018,-0.138327,-0.126823,-0.124247,-0.137928,0.382017,1.0,0.378106,0.985102,0.321502,0.823967,-0.089547,0.335791,0.276684,0.182496,0.173595,0.814486
wc_diff_unique,-0.20767,-0.417275,-0.389549,-0.380887,-0.431076,0.96198,0.378106,1.0,0.35031,0.870158,0.325153,-0.273255,0.903732,0.795976,0.627535,0.59526,0.280352
wc_ratio_unique,-0.065854,-0.130146,-0.119108,-0.116534,-0.131276,0.33462,0.985102,0.35031,1.0,0.293189,0.831735,-0.083595,0.295596,0.251192,0.160382,0.152093,0.804711
wc_diff_unq_stop,-0.214178,-0.426188,-0.392971,-0.386744,-0.41219,0.853699,0.321502,0.870158,0.293189,1.0,0.35188,-0.260475,0.874144,0.899551,0.620451,0.634554,0.255916


In [10]:
train

Unnamed: 0,question1,question2,is_duplicate,word_match,tfidf_wm,tfidf_wm_stops,jaccard,wc_diff,wc_ratio,wc_diff_unique,wc_ratio_unique,wc_diff_unq_stop,wc_ratio_unique_stop,same_start,char_diff,char_diff_unq_stop,total_unique_words,total_unq_words_stop,char_ratio
0,"[what, is, the, step, by, step, guide, to, inv...","[what, is, the, step, by, step, guide, to, inv...",0,0.727273,0.821476,0.815193,0.769231,2,0.857143,1,0.916667,1,0.833333,1,7,5,13,7,0.867925
1,"[what, is, the, story, of, kohinoor, (koh-i-no...","[what, would, happen, if, the, indian, governm...",0,0.307692,0.444512,0.445868,0.250000,5,1.625000,4,1.500000,5,2.250000,1,32,31,16,11,1.727273
2,"[how, can, i, increase, the, speed, of, my, in...","[how, can, internet, speed, be, increased, by,...",0,0.363636,0.245211,0.263744,0.200000,4,0.714286,4,0.714286,1,0.833333,1,10,7,20,9,0.833333
3,"[why, am, i, mentally, very, lonely?, how, can...","[find, the, remainder, when, [math]23^{24}[/ma...",0,0.000000,0.000000,0.000000,0.000000,2,0.818182,1,0.900000,1,1.250000,0,17,23,19,9,1.425000
4,"[which, one, dissolve, in, water, quickly, sug...","[which, fish, would, survive, in, salt, water?]",0,0.000000,0.006965,0.000000,0.111111,6,0.538462,6,0.538462,5,0.500000,1,32,29,18,15,0.507692
5,"[astrology:, i, am, a, capricorn, sun, cap, mo...","[i, am, a, triple, capricorn, (sun,, moon, and...",1,0.500000,0.454088,0.440662,0.454545,1,1.062500,2,1.133333,0,1.000000,0,4,1,22,12,1.056338
6,"[should, i, buy, tiago?]","[what, keeps, childern, active, and, far, from...",0,0.000000,0.000000,0.000000,0.000000,7,2.750000,6,2.500000,5,3.500000,0,36,29,14,9,3.250000
7,"[how, can, i, be, a, good, geologist?]","[what, should, i, do, to, be, a, great, geolog...",1,0.500000,0.801124,0.818731,0.333333,2,1.285714,2,1.285714,0,1.000000,0,9,1,12,3,1.375000
8,"[when, do, you, use, シ, instead, of, し?]","[when, do, you, use, ""&"", instead, of, ""and""?]",0,0.500000,0.243085,0.215713,0.600000,0,1.000000,0,1.000000,0,1.000000,1,2,2,10,6,1.071429
9,"[motorola, (company):, can, i, hack, my, chart...","[how, do, i, hack, motorola, dcx3400, for, fre...",0,0.363636,0.495962,0.500431,0.200000,0,1.000000,0,1.000000,1,0.833333,0,11,14,15,9,0.788462


In [19]:
df_train = pd.read_csv(config.RAW_PATH+'train.csv')
df_train = df_train.fillna(' ')

df_test = pd.read_csv(config.RAW_PATH+'test.csv')
df_test['is_duplicate'] = -1
ques = pd.concat([df_train[['question1', 'question2','is_duplicate']], \
    df_test[['question1', 'question2','is_duplicate']]], axis=0).reset_index(drop=True)
q_dict = defaultdict(set)
for i in range(ques.shape[0]):
        q_dict[ques.question1[i]].add(ques.question2[i])
        q_dict[ques.question2[i]].add(ques.question1[i])

def q1_freq(row):
    return(len(q_dict[row['question1']]))

def q2_freq(row):
    return(len(q_dict[row['question2']]))

def q1_q2_intersect(row):
    return(len(set(q_dict[row['question1']]).intersection(set(q_dict[row['question2']]))))

df_train['q1_q2_intersect'] = df_train.apply(q1_q2_intersect, axis=1, raw=True)
df_train['q1_freq'] = df_train.apply(q1_freq, axis=1, raw=True)
df_train['q2_freq'] = df_train.apply(q2_freq, axis=1, raw=True)

df_test['q1_q2_intersect'] = df_test.apply(q1_q2_intersect, axis=1, raw=True)
df_test['q1_freq'] = df_test.apply(q1_freq, axis=1, raw=True)
df_test['q2_freq'] = df_test.apply(q2_freq, axis=1, raw=True)

test_leaky = df_test.loc[:, ['is_duplicate','q1_q2_intersect','q1_freq','q2_freq']]
del df_test
train_leaky = df_train.loc[:, ['is_duplicate','q1_q2_intersect','q1_freq','q2_freq']]
leaky = pd.concat([train_leaky, test_leaky], axis=0).reset_index(drop=True)
leaky.to_csv(config.FEAT_PATH+'magic_feature_1.csv',index=False)

In [20]:
leaky

Unnamed: 0,is_duplicate,q1_q2_intersect,q1_freq,q2_freq
0,0,0,1,2
1,0,0,8,3
2,0,0,2,1
3,0,0,1,1
4,0,0,3,1
5,1,0,1,1
6,0,0,1,1
7,1,0,1,1
8,0,1,2,3
9,0,0,1,1
