In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from __future__ import division
import re, time, os, gc
import sys
import string

import numpy as np
import pandas as pd
import scipy
import config
from utils import dist_utils, ngram_utils, nlp_utils, np_utils
from utils import logging_utils, time_utils, pkl_utils

df =  pd.read_csv(config.RAW_PATH+'train.csv')
df['question1'] = df['question1'].astype(str)
df['question2'] = df['question2'].astype(str)
train = df.sample(n=100)[['question1', 'question2']]

In [3]:
train_orig =  pd.read_csv(config.RAW_PATH+'train.csv', header=0)
test_orig =  pd.read_csv(config.RAW_PATH+'test.csv', header=0)

train = pd.concat([train_orig[['question1', 'question2']], \
        test_orig[['question1', 'question2']]], axis=0).reset_index(drop='index')
train['question1'] = train['question1'].astype(str)
train['question2'] = train['question2'].astype(str)

In [2]:
def jaccard_ngram(obs, target, ngram=1, token_pattern=" "):
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    target_tokens = nlp_utils._tokenize(target, token_pattern)
    obs_ngrams = ngram_utils._ngrams(obs_tokens, ngram)
    target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
    return dist_utils._jaccard_coef(obs_ngrams, target_ngrams)

def dicedistence_ngram(obs, target, ngram=1, token_pattern=" "):
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    target_tokens = nlp_utils._tokenize(target, token_pattern)
    obs_ngrams = ngram_utils._ngrams(obs_tokens, ngram)
    target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
    return dist_utils._dice_dist(obs_ngrams, target_ngrams)

def compression_dist(obs, target):
    return dist_utils._compression_dist(obs, target)

def edit_dist(obs, target):
    return dist_utils._edit_dist(obs, target)

def compression_dist_ngram(obs, target, ngram=2, token_pattern=" "):
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    target_tokens = nlp_utils._tokenize(target, token_pattern)
    obs_ngrams = ngram_utils._ngrams(obs_tokens, ngram)
    target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
    val_list = []
#     for w1 in obs_ngrams:
#         _val_list = []
#         for w2 in target_ngrams:
#             s = dist_utils._compression_dist(w1, w2)
#             _val_list.append(s)
#         if len(_val_list) == 0:
#             _val_list = [ -1 ]
#         val_list.append( _val_list )
#     if len(val_list) == 0:
#         val_list = [ [-1] ]
    for w1 in obs_ngrams:
        _val_list = []
        for w2 in target_ngrams:
            s = dist_utils._compression_dist(w1, w2)
            _val_list.append(s)
        if len(_val_list) == 0:
            _val_list = [-1]
        val_list.append( max(_val_list) )
    if len(val_list) == 0:
        val_list = [-1]
    return min(val_list)

def edit_dist_ngram(obs, target, ngram=2, token_pattern=" ", agg=[np.min, np.max]):
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    target_tokens = nlp_utils._tokenize(target, token_pattern)
    obs_ngrams = ngram_utils._ngrams(obs_tokens, ngram)
    target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
    val_list = []

    for w1 in obs_ngrams:
        _val_list = []
        for w2 in target_ngrams:
            s = dist_utils._edit_dist(w1, w2)
            _val_list.append(s)
        if len(_val_list) == 0:
            _val_list = [-1]
        val_list.append( agg[0](_val_list) )
    if len(val_list) == 0:
        val_list = [-1]
    return float(agg[1](val_list))


In [3]:
for NGRAMS in [1,2,3]:
    train['jaccard_n%s'%NGRAMS] = train.apply(lambda x: jaccard_ngram(x['question1'],x['question2'],ngram=NGRAMS), axis=1)
    train['dicedistence_n%s'%NGRAMS] = train.apply(lambda x: dicedistence_ngram(x['question1'],x['question2'],ngram=NGRAMS), axis=1)

train['compression_dist'] = train.apply(lambda x: compression_dist(x['question1'],x['question2']), axis=1)
train['edit_dist'] = train.apply(lambda x: edit_dist(x['question1'],x['question2']), axis=1)

np_dict = { 'mean':np.mean, 'min':np.mean, 'max':np.max, 'median':np.median, 'std':np.std }

for AGG_NGRAMS in [2,3]:
    for agg1 in np_dict.keys():
        for agg2 in np_dict.keys():
            AGG_BY = agg1 + '_' + agg2
            AGG_FUNC = [np_dict[agg1],np_dict[agg2]]
            # train['compression_dist_agg_n%s'%AGG_NGRAMS] = train.apply(lambda x: compression_dist_ngram(x['question1'],x['question2'],ngram=AGG_NGRAMS), axis=1)
            train['edit_dist_agg_n%s_%s'%(AGG_NGRAMS,AGG_BY)] = train.apply(lambda x: 
                edit_dist_ngram(x['question1'],x['question2'], ngram=AGG_NGRAMS, agg=AGG_FUNC), axis=1)

train.corr()

Unnamed: 0,jaccard_n1,dicedistence_n1,jaccard_n2,dicedistence_n2,jaccard_n3,dicedistence_n3,compression_dist,edit_dist,edit_dist_agg_n2_std_std,edit_dist_agg_n2_std_max,...,edit_dist_agg_n3_median_std,edit_dist_agg_n3_median_max,edit_dist_agg_n3_median_min,edit_dist_agg_n3_median_median,edit_dist_agg_n3_median_mean,edit_dist_agg_n3_mean_std,edit_dist_agg_n3_mean_max,edit_dist_agg_n3_mean_min,edit_dist_agg_n3_mean_median,edit_dist_agg_n3_mean_mean
jaccard_n1,1.0,0.982943,0.934118,0.942921,0.868539,0.886974,-0.877246,-0.890037,0.138353,0.556003,...,0.088286,-0.246377,-0.197416,-0.190264,-0.197416,0.114764,-0.561728,-0.627769,-0.648024,-0.627769
dicedistence_n1,0.982943,1.0,0.883537,0.90973,0.804859,0.830277,-0.878147,-0.875469,0.22713,0.634191,...,0.12585,-0.258933,-0.240955,-0.227452,-0.240955,0.155383,-0.563821,-0.655982,-0.666951,-0.655982
jaccard_n2,0.934118,0.883537,1.0,0.990633,0.972191,0.977776,-0.813558,-0.851613,0.021721,0.429524,...,0.028798,-0.20778,-0.119122,-0.118729,-0.119122,0.096264,-0.503711,-0.568843,-0.587434,-0.568843
dicedistence_n2,0.942921,0.90973,0.990633,1.0,0.942598,0.960532,-0.829129,-0.863995,0.107304,0.501708,...,0.051561,-0.210232,-0.136617,-0.126057,-0.136617,0.136016,-0.501605,-0.592744,-0.601548,-0.592744
jaccard_n3,0.868539,0.804859,0.972191,0.942598,1.0,0.993172,-0.744493,-0.784451,-0.102441,0.3068,...,-0.024181,-0.208826,-0.078909,-0.089606,-0.078909,0.027464,-0.488843,-0.514491,-0.551257,-0.514491
dicedistence_n3,0.886974,0.830277,0.977776,0.960532,0.993172,1.0,-0.75921,-0.800785,-0.061266,0.343137,...,-0.020811,-0.202706,-0.075709,-0.084389,-0.075709,0.042157,-0.485523,-0.526487,-0.564213,-0.526487
compression_dist,-0.877246,-0.878147,-0.813558,-0.829129,-0.744493,-0.75921,1.0,0.90904,-0.315299,-0.667567,...,-0.24871,0.396686,0.408671,0.382662,0.408671,-0.278608,0.649737,0.792047,0.78638,0.792047
edit_dist,-0.890037,-0.875469,-0.851613,-0.863995,-0.784451,-0.800785,0.90904,1.0,-0.266988,-0.632327,...,-0.202496,0.348619,0.342492,0.347726,0.342492,-0.217338,0.625375,0.745811,0.741253,0.745811
edit_dist_agg_n2_std_std,0.138353,0.22713,0.021721,0.107304,-0.102441,-0.061266,-0.315299,-0.266988,1.0,0.781036,...,0.555425,-0.106513,-0.483549,-0.351782,-0.483549,0.720441,0.008608,-0.47448,-0.357057,-0.47448
edit_dist_agg_n2_std_max,0.556003,0.634191,0.429524,0.501708,0.3068,0.343137,-0.667567,-0.632327,0.781036,1.0,...,0.475221,-0.251333,-0.516681,-0.44804,-0.516681,0.53292,-0.417932,-0.741256,-0.666954,-0.741256


In [266]:
import datetime
print datetime.datetime.now()

2017-05-25 01:43:56.646323


In [267]:
train.to_csv(config.RAW_PATH+'train_1111111.csv',index=False)

In [269]:
train_orig =  pd.read_csv(config.RAW_PATH+'train.csv', header=0)
test_orig =  pd.read_csv(config.RAW_PATH+'test.csv', header=0)
test_orig['is_duplicate'] = -1

train1 = pd.concat([train_orig[['question1', 'question2','is_duplicate']], \
        test_orig[['question1', 'question2','is_duplicate']]], axis=0).reset_index(drop='index')
train['is_duplicate'] = train1['is_duplicate']

In [270]:
train[train['is_duplicate']>=0].corr()

Unnamed: 0,jaccard_n1,dicedistence_n1,jaccard_n2,dicedistence_n2,jaccard_n3,dicedistence_n3,compression_dist,edit_dist,edit_dist_agg_n2_std_std,edit_dist_agg_n2_std_max,...,edit_dist_agg_n3_median_max,edit_dist_agg_n3_median_min,edit_dist_agg_n3_median_median,edit_dist_agg_n3_median_mean,edit_dist_agg_n3_mean_std,edit_dist_agg_n3_mean_max,edit_dist_agg_n3_mean_min,edit_dist_agg_n3_mean_median,edit_dist_agg_n3_mean_mean,is_duplicate
jaccard_n1,1.0,0.983648,0.905742,0.923042,0.796528,0.823201,-0.87341,-0.87746,0.218352,0.557366,...,-0.218653,-0.278028,-0.270836,-0.278028,0.113985,-0.478705,-0.570892,-0.574023,-0.570892,0.322136
dicedistence_n1,0.983648,1.0,0.851457,0.889893,0.724861,0.76304,-0.883301,-0.871926,0.300094,0.630752,...,-0.233918,-0.302649,-0.292346,-0.302649,0.155684,-0.487522,-0.599021,-0.599014,-0.599021,0.367482
jaccard_n2,0.905742,0.851457,1.0,0.985303,0.952456,0.961399,-0.778677,-0.80995,0.124674,0.42418,...,-0.140668,-0.182755,-0.180052,-0.182755,0.092353,-0.377393,-0.468938,-0.478263,-0.468938,0.191811
dicedistence_n2,0.923042,0.889893,0.985303,1.0,0.902431,0.933682,-0.80952,-0.833151,0.211759,0.505759,...,-0.163152,-0.215242,-0.20978,-0.215242,0.134406,-0.405831,-0.515965,-0.522163,-0.515965,0.240642
jaccard_n3,0.796528,0.724861,0.952456,0.902431,1.0,0.986737,-0.664738,-0.70566,-0.006118,0.261021,...,-0.055785,-0.080148,-0.081498,-0.080148,0.043047,-0.265128,-0.340203,-0.355774,-0.340203,0.109883
dicedistence_n3,0.823201,0.76304,0.961399,0.933682,0.986737,1.0,-0.694339,-0.733957,0.047581,0.309062,...,-0.06447,-0.096115,-0.095873,-0.096115,0.073591,-0.286078,-0.37593,-0.39038,-0.37593,0.145447
compression_dist,-0.87341,-0.883301,-0.778677,-0.80952,-0.664738,-0.694339,1.0,0.912464,-0.357816,-0.673521,...,0.328422,0.415863,0.397073,0.415863,-0.240151,0.571655,0.707521,0.700699,0.707521,-0.375731
edit_dist,-0.87746,-0.871926,-0.80995,-0.833151,-0.70566,-0.733957,0.912464,1.0,-0.261785,-0.594266,...,0.30568,0.37889,0.368472,0.37889,-0.169119,0.565274,0.673387,0.672552,0.673387,-0.337489
edit_dist_agg_n2_std_std,0.218352,0.300094,0.124674,0.211759,-0.006118,0.047581,-0.357816,-0.261785,1.0,0.788365,...,-0.112065,-0.342641,-0.287802,-0.342641,0.680104,-0.072229,-0.407506,-0.366728,-0.407506,0.203267
edit_dist_agg_n2_std_max,0.557366,0.630752,0.42418,0.505759,0.261021,0.309062,-0.673521,-0.594266,0.788365,1.0,...,-0.289265,-0.491662,-0.445111,-0.491662,0.522129,-0.432572,-0.683978,-0.641201,-0.683978,0.330128


In [4]:
def get_position_list(obs, target, ngram=1, token_pattern=" "):
    """
        Get the list of positions of obs in target
    """
    obs_tokens = nlp_utils._tokenize(str(obs), token_pattern)
    target_tokens = nlp_utils._tokenize(str(target), token_pattern)
    obs = ngram_utils._ngrams(obs_tokens, ngram)
    target = ngram_utils._ngrams(target_tokens, ngram)
    
    pos_of_obs_in_target = [0]
    if len(obs) != 0:
        pos_of_obs_in_target = [j for j,w in enumerate(obs, start=1) if w in target]
        if len(pos_of_obs_in_target) == 0:
            pos_of_obs_in_target = [0]
    return pos_of_obs_in_target, len(obs)


for ngram in [1,2]:
    for target_name in ['question1','question2']:
        for obs_name in ['question1','question2']:
            if target_name != obs_name:
                position = train[['question1','question2']].apply(lambda x: get_position_list(obs=x[obs_name],target=x[target_name],ngram=ngram), axis=1)
                pos = [i[0] for i in position]
                obs_len = [i[1] for i in position]
                ## stats feat on pos
                train["pos_of_%s_n%s_in_%s_min" % (obs_name, ngram, target_name)] = map(np.min, pos)
                train["pos_of_%s_n%s_in_%s_mean" % (obs_name, ngram, target_name)] = map(np.mean, pos)
                train["pos_of_%s_n%s_in_%s_median" % (obs_name, ngram, target_name)] = map(np.median, pos)
                train["pos_of_%s_n%s_in_%s_max" % (obs_name, ngram, target_name)] = map(np.max, pos)
                train["pos_of_%s_n%s_in_%s_std" % (obs_name, ngram, target_name)] = map(np.std, pos)
                # stats feat on normalized_pos
                train["norm_pos_of_%s_n%s_in_%s_min" % (obs_name, ngram, target_name)] = map(np_utils._try_divide, train["pos_of_%s_n%s_in_%s_min" % (obs_name, ngram, target_name)], obs_len)
                train["norm_pos_of_%s_n%s_in_%s_mean" % (obs_name, ngram, target_name)] = map(np_utils._try_divide, train["pos_of_%s_n%s_in_%s_mean" % (obs_name, ngram, target_name)], obs_len)
                train["norm_pos_of_%s_n%s_in_%s_median" % (obs_name, ngram, target_name)] = map(np_utils._try_divide, train["pos_of_%s_n%s_in_%s_median" % (obs_name, ngram, target_name)], obs_len)
                train["norm_pos_of_%s_n%s_in_%s_max" % (obs_name, ngram, target_name)] = map(np_utils._try_divide, train["pos_of_%s_n%s_in_%s_max" % (obs_name, ngram, target_name)], obs_len)
                train["norm_pos_of_%s_n%s_in_%s_std" % (obs_name, ngram, target_name)] = map(np_utils._try_divide, train["pos_of_%s_n%s_in_%s_std" % (obs_name, ngram, target_name)] ,obs_len)

train.corr()[train.corr().index.str.contains('pos_of')]

Unnamed: 0,id,qid1,qid2,is_duplicate,jaccard_n123,dicedistence_n123,compression_dist,edit_dist,edit_dist_agg_n2,pos_of_question2_n1_in_question1_min,...,pos_of_question1_n2_in_question2_min,pos_of_question1_n2_in_question2_mean,pos_of_question1_n2_in_question2_median,pos_of_question1_n2_in_question2_max,pos_of_question1_n2_in_question2_std,norm_pos_of_question1_n2_in_question2_min,norm_pos_of_question1_n2_in_question2_mean,norm_pos_of_question1_n2_in_question2_median,norm_pos_of_question1_n2_in_question2_max,norm_pos_of_question1_n2_in_question2_std
pos_of_question2_n1_in_question1_min,0.091188,0.091188,0.091188,-0.070505,-0.270079,-0.285221,0.299071,0.314354,0.09537,1.0,...,0.136549,-0.01418,-0.007304,-0.102428,-0.220998,0.126898,-0.015142,-0.010791,-0.117006,-0.270792
pos_of_question2_n1_in_question1_mean,0.081232,0.081232,0.081232,0.036182,0.073449,0.06855,0.066148,0.00548,0.223616,0.562078,...,0.330662,0.436351,0.434052,0.409361,0.252429,0.20956,0.252194,0.251629,0.236988,0.09309
pos_of_question2_n1_in_question1_median,0.077059,0.077059,0.077059,0.028635,0.058116,0.053006,0.068766,0.006716,0.212374,0.525533,...,0.371946,0.456286,0.460653,0.406857,0.22397,0.247964,0.283484,0.286801,0.25387,0.082694
pos_of_question2_n1_in_question1_max,0.05625,0.05625,0.05625,0.082929,0.21669,0.217676,-0.054016,-0.12835,0.225545,0.253399,...,0.292477,0.491941,0.480088,0.516095,0.401354,0.15154,0.266443,0.260082,0.30577,0.228553
pos_of_question2_n1_in_question1_std,0.024478,0.024478,0.024478,0.09039,0.249456,0.261758,-0.115304,-0.199164,0.193695,-0.147556,...,0.234796,0.443623,0.427193,0.484413,0.413629,0.107738,0.242712,0.233664,0.301975,0.27515
norm_pos_of_question2_n1_in_question1_min,0.083675,0.083675,0.083675,-0.022449,-0.232674,-0.232562,0.124182,0.20185,-0.150352,0.84205,...,0.086237,-0.088551,-0.082982,-0.174595,-0.266676,0.142736,-0.005863,-0.003723,-0.111243,-0.266057
norm_pos_of_question2_n1_in_question1_mean,0.117718,0.117718,0.117718,0.232939,0.226316,0.273982,-0.320109,-0.305905,-0.121146,0.551932,...,0.336494,0.355985,0.358512,0.294229,0.139657,0.382569,0.434812,0.429571,0.403077,0.188142
norm_pos_of_question2_n1_in_question1_median,0.112239,0.112239,0.112239,0.216539,0.202683,0.245646,-0.292469,-0.289202,-0.097456,0.526367,...,0.406495,0.412683,0.423171,0.324405,0.127978,0.441299,0.486974,0.487168,0.435851,0.176461
norm_pos_of_question2_n1_in_question1_max,0.101182,0.101182,0.101182,0.336523,0.470675,0.536,-0.519161,-0.541313,-0.077635,0.189331,...,0.312481,0.462734,0.455256,0.464052,0.359845,0.335077,0.507346,0.494531,0.556283,0.426362
norm_pos_of_question2_n1_in_question1_std,0.047338,0.047338,0.047338,0.320685,0.488246,0.557265,-0.508239,-0.573647,-0.003227,-0.322718,...,0.242315,0.427838,0.415603,0.457938,0.407233,0.248703,0.449633,0.43507,0.529076,0.477085


In [5]:
# class Count_Ngram_BaseEstimator:
#     def __init__(self, idx=-1, ngram=1, aggr="", threshold=config.STR_MATCH_THRESHOLD):
#         self.idx = idx
#         self.ngram = ngram
#         self.threshold = threshold

#     def _get_match_count(self, obs, target):
#         cnt = 0
#         if (len(obs) != 0) and (len(target) != 0):
#             if self.idx == -1:
#                 for obs_word in obs:
#                     for word in target:
#                         if dist_utils._is_str_match(word, obs_word, self.threshold):
#                             cnt += 1
#             else:
#                 for word in target:
#                     if dist_utils._is_str_match(word, obs[self.idx], self.threshold):
#                         cnt += 1
#         return cnt

#     def count_close_ngram(self, obs, target, token_pattern=" "):
#         obs_tokens = nlp_utils._tokenize(obs, token_pattern)
#         target_tokens = nlp_utils._tokenize(target, token_pattern)
#         obs_ngrams = ngram_utils._ngrams(obs_tokens, self.ngram)
#         target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram)
#         return self._get_match_count(obs_ngrams, target_ngrams)
    
    
def count_close_ngram(obs, target, idx=-1, ratio='count', ngram=123, aggr="", token_pattern=" ", threshold=config.STR_MATCH_THRESHOLD):
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    target_tokens = nlp_utils._tokenize(target, token_pattern)
    obs = ngram_utils._ngrams(obs_tokens, ngram)
    target = ngram_utils._ngrams(target_tokens, ngram)
    cnt = 0
    if (len(obs) != 0) and (len(target) != 0):
        if idx == -1:
            for obs_word in obs:
                for word in target:
                    if dist_utils._is_str_match(word, obs_word, threshold):
                        cnt += 1
        else:
            for word in target:
                if dist_utils._is_str_match(word, obs[idx], threshold):
                    cnt += 1
    if ratio == 'count': 
        return cnt
    else: return np_utils._try_divide(cnt, (len(obs)+len(target))/2.0)
    
# count1 = Count_Ngram_BaseEstimator(threshold=0.65)
# train['intersect_count'] = train[['question1','question2']].apply(lambda x: 
#                     count1.count_close_ngram(x[0],x[1]), axis=1)
NGRAMS=[1,2,3]
RATIO=['count','ratio']
for ngram in NGRAMS:
    for ratio in RATIO:
        train['intersect_close_%s_n%s'%(ratio,ngram)] = train[['question1','question2']].apply(lambda x: 
                    count_close_ngram(x[0],x[1],threshold=0.7,ngram=ngram,ratio=ratio), axis=1)
train.corr()[train.corr().index.str.contains('intersect_close')]

Unnamed: 0,id,qid1,qid2,is_duplicate,jaccard_n123,dicedistence_n123,compression_dist,edit_dist,edit_dist_agg_n2,pos_of_question2_n1_in_question1_min,...,norm_pos_of_question1_n2_in_question2_mean,norm_pos_of_question1_n2_in_question2_median,norm_pos_of_question1_n2_in_question2_max,norm_pos_of_question1_n2_in_question2_std,intersect_close_count_n1,intersect_close_ratio_n1,intersect_close_count_n2,intersect_close_ratio_n2,intersect_close_count_n3,intersect_close_ratio_n3
intersect_close_count_n1,-0.013122,-0.013122,-0.013122,0.131526,0.561268,0.534027,-0.388786,-0.427734,0.03722,-0.112805,...,0.268151,0.261078,0.419248,0.46586,1.0,0.75633,0.909778,0.530302,0.850685,0.457997
intersect_close_ratio_n1,0.027262,0.027262,0.027262,0.313315,0.840758,0.865684,-0.81631,-0.806912,-0.088754,-0.249321,...,0.489601,0.486149,0.659656,0.681442,0.75633,1.0,0.784211,0.859992,0.763991,0.780021
intersect_close_count_n2,-0.027767,-0.027767,-0.027767,0.179281,0.728372,0.695477,-0.583789,-0.598518,0.014591,-0.169004,...,0.339158,0.331018,0.523922,0.588936,0.909778,0.784211,1.0,0.745167,0.954822,0.654655
intersect_close_ratio_n2,0.003594,0.003594,0.003594,0.30914,0.891032,0.908138,-0.893406,-0.884361,-0.09462,-0.259414,...,0.49309,0.488313,0.674455,0.71827,0.530302,0.859992,0.745167,1.0,0.732753,0.908437
intersect_close_count_n3,-0.024523,-0.024523,-0.024523,0.172974,0.765048,0.730787,-0.609192,-0.634673,0.027904,-0.182026,...,0.369754,0.363618,0.557435,0.61185,0.850685,0.763991,0.954822,0.732753,1.0,0.754092
intersect_close_ratio_n3,-0.02482,-0.02482,-0.02482,0.269428,0.87867,0.891131,-0.842762,-0.86713,-0.049912,-0.252568,...,0.512276,0.511296,0.685461,0.710932,0.457997,0.780021,0.654655,0.908437,0.754092,1.0


In [6]:
def cooccurrence_ngram(obs, target, ngram=1, threshold=0.8, ratio='ratio', token_pattern=" "):
    """
        Get the count cooccurrence_ngram in obs and target
    """
    obs_tokens = nlp_utils._tokenize(str(obs), token_pattern)
    target_tokens = nlp_utils._tokenize(str(target), token_pattern)
    obs_ngrams = ngram_utils._ngrams(obs_tokens, ngram)
    target_ngrams = ngram_utils._ngrams(target_tokens, ngram)

    s = 0.
    for w1 in obs_ngrams:
        for w2 in target_ngrams:
            if dist_utils._is_str_match(w1, w2, threshold):
                s += 1.
    if ratio == 'count': 
        return s
    else: return np_utils._try_divide(s, len(obs_ngrams)*len(target_ngrams))
    

NGRAMS=[1,2,3]
RATIO=['count','ratio']
for ngram in NGRAMS:
    for ratio in RATIO:
        train['cooccurrence_close_%s_n%s'%(ratio,ngram)] = train[['question1','question2']].apply(lambda x: 
                    cooccurrence_ngram(x[0],x[1],threshold=0.7,ngram=ngram,ratio=ratio), axis=1)
# train.corr().ix[-1*len(NGRAMS)*len(RATIO):]
train.corr()[train.corr().index.str.contains('cooccurrence')]

Unnamed: 0,id,qid1,qid2,is_duplicate,jaccard_n123,dicedistence_n123,compression_dist,edit_dist,edit_dist_agg_n2,pos_of_question2_n1_in_question1_min,...,intersect_close_count_n2,intersect_close_ratio_n2,intersect_close_count_n3,intersect_close_ratio_n3,cooccurrence_close_count_n1,cooccurrence_close_ratio_n1,cooccurrence_close_count_n2,cooccurrence_close_ratio_n2,cooccurrence_close_count_n3,cooccurrence_close_ratio_n3
cooccurrence_close_count_n1,-0.013122,-0.013122,-0.013122,0.131526,0.561268,0.534027,-0.388786,-0.427734,0.03722,-0.112805,...,0.909778,0.530302,0.850685,0.457997,1.0,0.153259,0.909778,0.070105,0.850685,0.064748
cooccurrence_close_ratio_n1,0.023832,0.023832,0.023832,0.328474,0.640463,0.699828,-0.786163,-0.733716,-0.252815,-0.268398,...,0.255664,0.724433,0.268879,0.656784,0.153259,1.0,0.255664,0.895333,0.268879,0.791832
cooccurrence_close_count_n2,-0.027767,-0.027767,-0.027767,0.179281,0.728372,0.695477,-0.583789,-0.598518,0.014591,-0.169004,...,1.0,0.745167,0.954822,0.654655,0.909778,0.255664,1.0,0.266881,0.954822,0.232364
cooccurrence_close_ratio_n2,-0.008782,-0.008782,-0.008782,0.302772,0.632048,0.677373,-0.759505,-0.736952,-0.220926,-0.24339,...,0.266881,0.797931,0.270481,0.722184,0.070105,0.895333,0.266881,1.0,0.270481,0.912122
cooccurrence_close_count_n3,-0.024523,-0.024523,-0.024523,0.172974,0.765048,0.730787,-0.609192,-0.634673,0.027904,-0.182026,...,0.954822,0.732753,1.0,0.754092,0.850685,0.268879,0.954822,0.270481,1.0,0.310343
cooccurrence_close_ratio_n3,-0.07027,-0.07027,-0.07027,0.266992,0.615008,0.651725,-0.690567,-0.710414,-0.15337,-0.223456,...,0.232364,0.720548,0.310343,0.80104,0.064748,0.791832,0.232364,0.912122,0.310343,1.0


In [7]:
def LongestMatchSize(obs_corpus, target_corpus):
    return dist_utils._longest_match_size(obs_corpus, target_corpus)

def LongestMatchRatio(obs_corpus, target_corpus):
    return dist_utils._longest_match_ratio(obs_corpus, target_corpus)

train['LongestMatchSize'] = train[['question1','question2']].apply(lambda x: LongestMatchSize(x[0],x[1]), axis=1)
train['LongestMatchRatio'] = train[['question1','question2']].apply(lambda x: LongestMatchRatio(x[0],x[1]), axis=1)
train.corr()[train.corr().index.str.contains('LongestMatch')]

Unnamed: 0,id,qid1,qid2,is_duplicate,jaccard_n123,dicedistence_n123,compression_dist,edit_dist,edit_dist_agg_n2,pos_of_question2_n1_in_question1_min,...,intersect_close_count_n3,intersect_close_ratio_n3,cooccurrence_close_count_n1,cooccurrence_close_ratio_n1,cooccurrence_close_count_n2,cooccurrence_close_ratio_n2,cooccurrence_close_count_n3,cooccurrence_close_ratio_n3,LongestMatchSize,LongestMatchRatio
LongestMatchSize,0.102081,0.102081,0.102081,0.218323,0.374156,0.405512,-0.492384,-0.48579,-0.150488,0.027599,...,0.3756,0.408711,0.397615,0.380343,0.412382,0.350781,0.3756,0.319515,1.0,0.440454
LongestMatchRatio,0.079977,0.079977,0.079977,0.213539,0.172969,0.233214,-0.420301,-0.349987,-0.246562,-0.07936,...,-0.092456,0.267227,-0.221972,0.71029,-0.11967,0.639684,-0.092456,0.530724,0.440454,1.0


In [9]:
'''
QuestionQuality
IsInSpellCheckingList
'''

In [10]:
from collections import defaultdict

def _get_df_dict(target_corpus, ngram=1, token_pattern=" "):
    d = defaultdict(lambda : 1)
    for target in target_corpus:
        target_tokens = nlp_utils._tokenize(target, token_pattern)
        target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
        for w in set(target_ngrams):
            d[w] += 1
    return d

def _get_idf(word, idf_dict, N):
    return np.log((N - idf_dict[word] + 0.5)/(idf_dict[word] + 0.5))

def cooc_tfidf_ngram(obs, target, ngram=1, threshold=0.85, ratio="ratio", token_pattern=" "):
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    target_tokens = nlp_utils._tokenize(target, token_pattern)
    obs_ngrams = ngram_utils._ngrams(obs_tokens, ngram)
    target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
    val_list = []
    for w1 in obs_ngrams:
        s = 0.
        for w2 in target_ngrams:
            if dist_utils._is_str_match(w1, w2, threshold):
                s += 1.
        if ratio == "count":
            val_list.append(s * _get_idf(w1, idf_dict, doc_num))
        elif ratio == "ratio":
            val_list.append(np_utils._try_divide(s, len(target_ngrams)) * _get_idf(w1, idf_dict, doc_num))
            
    if len(val_list) == 0:
        val_list = [config.MISSING_VALUE_NUMERIC]
    return val_list

doc_num = train.shape[0]

for ngram in [1,2]:
    idf_dict = _get_df_dict(np.concatenate((train['question1'].values , train['question2'].values)), ngram=ngram)
    for ratio in ['count','ratio']:
        for target_name in ['question1','question2']:
            for obs_name in ['question1','question2']:
                if target_name != obs_name:
                    pos = train[['question1','question2']].apply(lambda x: cooc_tfidf_ngram(
                                obs=x[obs_name],target=x[target_name], ngram=ngram,ratio=ratio), axis=1)
#                     train["cooc_tfidf_%s_n%s_%s_min" % (obs_name, ngram, ratio)] = map(np.min, pos)
                    train["cooc_tfidf_%s_n%s_%s_mean" % (obs_name, ngram, ratio)] = map(np.mean, pos)
                    train["cooc_tfidf_%s_n%s_%s_median" % (obs_name, ngram, ratio)] = map(np.median, pos)
                    train["cooc_tfidf_%s_n%s_%s_max" % (obs_name, ngram, ratio)] = map(np.max, pos)
                    train["cooc_tfidf_%s_n%s_%s_std" % (obs_name, ngram, ratio)] = map(np.std, pos)
train.corr()[train.corr().index.str.contains('cooc_tfidf')]

Unnamed: 0,id,qid1,qid2,is_duplicate,jaccard_n123,dicedistence_n123,compression_dist,edit_dist,edit_dist_agg_n2,pos_of_question2_n1_in_question1_min,...,cooc_tfidf_question1_n2_count_max,cooc_tfidf_question1_n2_count_std,cooc_tfidf_question2_n2_ratio_mean,cooc_tfidf_question2_n2_ratio_median,cooc_tfidf_question2_n2_ratio_max,cooc_tfidf_question2_n2_ratio_std,cooc_tfidf_question1_n2_ratio_mean,cooc_tfidf_question1_n2_ratio_median,cooc_tfidf_question1_n2_ratio_max,cooc_tfidf_question1_n2_ratio_std
cooc_tfidf_question2_n1_count_mean,0.003989,0.003989,0.003989,0.315111,0.635442,0.665384,-0.702794,-0.625465,-0.241676,-0.182138,...,0.571941,0.570117,0.587249,0.51922,0.43224,0.436726,0.586654,0.440112,0.583396,0.582061
cooc_tfidf_question2_n1_count_median,-0.090565,-0.090565,-0.090565,0.192691,0.619518,0.610902,-0.563681,-0.537952,-0.225618,-0.182404,...,0.322563,0.323437,0.548403,0.598038,0.271173,0.273418,0.548101,0.518649,0.399391,0.41036
cooc_tfidf_question2_n1_count_max,0.086535,0.086535,0.086535,0.155608,0.34133,0.371928,-0.363684,-0.305797,-0.135749,0.103649,...,0.547275,0.462243,0.224502,0.140407,0.288522,0.245106,0.224415,0.117888,0.346485,0.304483
cooc_tfidf_question2_n1_count_std,0.090671,0.090671,0.090671,0.257174,0.466729,0.507125,-0.546591,-0.473529,-0.26264,0.00409,...,0.60532,0.547798,0.377138,0.23853,0.408312,0.385299,0.376335,0.228525,0.49808,0.459331
cooc_tfidf_question1_n1_count_mean,0.044628,0.044628,0.044628,0.323651,0.669951,0.699076,-0.703957,-0.640552,-0.017445,-0.074931,...,0.566548,0.591664,0.588435,0.458586,0.58209,0.572508,0.588429,0.514155,0.424021,0.456587
cooc_tfidf_question1_n1_count_median,-0.095041,-0.095041,-0.095041,0.174151,0.735778,0.718386,-0.636585,-0.647597,-0.010198,-0.208854,...,0.317519,0.318884,0.586724,0.569745,0.423187,0.416085,0.587563,0.641572,0.25579,0.273033
cooc_tfidf_question1_n1_count_max,0.056639,0.056639,0.056639,0.22218,0.335312,0.359008,-0.351301,-0.31558,-0.049986,0.089702,...,0.54561,0.449786,0.182511,0.114927,0.24885,0.215359,0.18217,0.121081,0.231717,0.205097
cooc_tfidf_question1_n1_count_std,0.077963,0.077963,0.077963,0.272164,0.478915,0.516322,-0.544003,-0.494883,-0.056859,0.086213,...,0.623527,0.572896,0.334842,0.216113,0.389645,0.351953,0.334206,0.226593,0.359995,0.343779
cooc_tfidf_question2_n1_ratio_mean,0.019067,0.019067,0.019067,0.316319,0.499107,0.547125,-0.668777,-0.576776,-0.280002,-0.159105,...,0.431774,0.501173,0.757793,0.548919,0.741336,0.735641,0.757621,0.579479,0.698421,0.726802
cooc_tfidf_question2_n1_ratio_median,-0.03826,-0.03826,-0.03826,0.215351,0.491605,0.501658,-0.502786,-0.467245,-0.278656,-0.159788,...,0.261734,0.298588,0.640449,0.59405,0.466169,0.46737,0.640353,0.602926,0.481934,0.511012


In [13]:
BM25_K1=config.BM25_K1
BM25_B=config.BM25_B

def _get_avg_ngram_doc_len(target_corpus, ngram=1, token_pattern=" "):
    lst = []
    for target in target_corpus:
        target_tokens = nlp_utils._tokenize(target, token_pattern)
        target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
        lst.append(len(target_ngrams))
    return np.mean(lst)

def bm25(obs, target, ngram=1, threshold=0.85, ratio="ratio", token_pattern=" ", b=None, k1=None, doc_len=None, idf_dict=idf_dict):
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    target_tokens = nlp_utils._tokenize(target, token_pattern)
    obs_ngrams = ngram_utils._ngrams(obs_tokens, ngram)
    target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
    K = k1 * (1 - b + b * np_utils._try_divide(len(target_ngrams), doc_len))
    val_list = []
    for w1 in obs_ngrams:
        s = 0.
        for w2 in target_ngrams:
            if dist_utils._is_str_match(w1, w2, threshold):
                s += 1.
        bm25 = s * _get_idf(w1, idf_dict, doc_num) * np_utils._try_divide(1 + k1, s + K)
        val_list.append(bm25)
    if len(val_list) == 0:
        val_list = [config.MISSING_VALUE_NUMERIC]
    return val_list

for ngram in [1,2]:
    idf_dict = _get_df_dict(np.concatenate((train['question1'].values , train['question2'].values)), ngram=ngram)
#     for ratio in ['count','ratio']:
    for target_name in ['question1','question2']:
        avg_target_len = _get_avg_ngram_doc_len(train[target_name].values, ngram=ngram)
        for obs_name in ['question1','question2']:
            if target_name != obs_name:
                pos = train[['question1','question2']].apply(lambda x: bm25(obs=x[obs_name],target=x[target_name], 
                                ngram=ngram,threshold=0.85,b=BM25_B, k1=BM25_K1, doc_len=avg_target_len), axis=1)
#                 train["bm25_%s_n%s_min" % (obs_name, ngram)] = map(np.min, pos)
                train["bm25_%s_n%s_mean" % (obs_name, ngram)] = map(np.mean, pos)
                train["bm25_%s_n%s_median" % (obs_name, ngram)] = map(np.median, pos)
                train["bm25_%s_n%s_max" % (obs_name, ngram)] = map(np.max, pos)
                train["bm25_%s_n%s_std" % (obs_name, ngram)] = map(np.std, pos)
train.corr()[train.corr().index.str.contains('bm25_')].sort_values(by='is_duplicate',ascending=0)

Unnamed: 0,id,qid1,qid2,is_duplicate,jaccard_n123,dicedistence_n123,compression_dist,edit_dist,edit_dist_agg_n2,pos_of_question2_n1_in_question1_min,...,bm25_question1_n1_max,bm25_question1_n1_std,bm25_question2_n2_mean,bm25_question2_n2_median,bm25_question2_n2_max,bm25_question2_n2_std,bm25_question1_n2_mean,bm25_question1_n2_median,bm25_question1_n2_max,bm25_question1_n2_std
bm25_question1_n1_mean,0.03648,0.03648,0.03648,0.352203,0.790041,0.849268,-0.865517,-0.832492,-0.094787,-0.255752,...,0.571958,0.268813,0.786613,0.584403,0.715651,0.694923,0.844578,0.668199,0.670111,0.67395
bm25_question2_n1_mean,0.022915,0.022915,0.022915,0.340193,0.792846,0.850719,-0.887646,-0.850095,-0.230319,-0.30215,...,0.63646,0.408147,0.846947,0.661692,0.689764,0.675161,0.826091,0.633913,0.721958,0.703803
bm25_question2_n1_median,0.035406,0.035406,0.035406,0.299429,0.693846,0.756189,-0.770685,-0.771992,-0.165842,-0.365725,...,0.445785,0.234845,0.745906,0.543764,0.583438,0.623651,0.743594,0.559853,0.623996,0.623892
bm25_question1_n1_median,0.007516,0.007516,0.007516,0.296185,0.66791,0.724268,-0.752738,-0.749062,-0.009808,-0.304165,...,0.355916,0.17071,0.657273,0.446939,0.599223,0.600207,0.721546,0.54644,0.532266,0.570665
bm25_question1_n2_mean,-0.013548,-0.013548,-0.013548,0.274326,0.851427,0.875016,-0.835873,-0.827952,-0.062225,-0.239055,...,0.395922,0.100099,0.960971,0.784017,0.727546,0.665076,1.0,0.854926,0.699136,0.646522
bm25_question2_n2_std,0.187795,0.187795,0.187795,0.270694,0.451326,0.541306,-0.626486,-0.575094,-0.042077,-0.167574,...,0.45579,0.360794,0.632415,0.269978,0.925471,1.0,0.665076,0.315418,0.900906,0.9592
bm25_question1_n2_std,0.166054,0.166054,0.166054,0.268527,0.435053,0.527615,-0.625563,-0.569202,-0.122542,-0.17464,...,0.474244,0.441958,0.672491,0.314522,0.917058,0.9592,0.646522,0.277646,0.936658,1.0
bm25_question1_n1_max,0.103296,0.103296,0.103296,0.264274,0.365579,0.420987,-0.508355,-0.451527,-0.300477,0.039928,...,1.0,0.715579,0.420601,0.27485,0.472655,0.45579,0.395922,0.240243,0.515742,0.474244
bm25_question2_n2_mean,-0.022076,-0.022076,-0.022076,0.246086,0.825447,0.848674,-0.828015,-0.812105,-0.157002,-0.245,...,0.420601,0.21532,1.0,0.838139,0.700719,0.632415,0.960971,0.794369,0.736143,0.672491
bm25_question2_n2_max,0.16231,0.16231,0.16231,0.244414,0.500077,0.577379,-0.640644,-0.599235,-0.02518,-0.156416,...,0.472655,0.361338,0.700719,0.354677,1.0,0.925471,0.727546,0.39147,0.957137,0.917058


In [57]:
# ------------------------ Vector Space Features -------------------------------

from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
import config
from utils import dist_utils, ngram_utils, nlp_utils, np_utils, pkl_utils
from utils import logging_utils, time_utils

class VectorSpace:
    ## word based
    def _init_word_bow(self, ngram, vocabulary=None):
        bow = CountVectorizer(min_df=3,
                                max_df=0.75,
                                max_features=None,
                                # norm="l2",
                                strip_accents="unicode",
                                analyzer="word",
                                token_pattern=r"\w{1,}",
                                ngram_range=(1, ngram),
                                vocabulary=vocabulary)
        return bow

    ## word based
    def _init_word_ngram_tfidf(self, ngram, vocabulary=None):
        tfidf = TfidfVectorizer(min_df=3,
                                max_df=0.75,                                
                                max_features=None,
                                norm="l2",
                                strip_accents="unicode",
                                analyzer="word",
                                token_pattern=r"\w{1,}",
                                ngram_range=(1, ngram),
                                use_idf=1,
                                smooth_idf=1,
                                sublinear_tf=1,
                                # stop_words="english",
                                vocabulary=vocabulary)
        return tfidf

    ## char based
    def _init_char_tfidf(self, include_digit=False):
        chars = list(string.ascii_lowercase)
        if include_digit:
            chars += list(string.digits)        
        vocabulary = dict(zip(chars, range(len(chars))))
        tfidf = TfidfVectorizer(strip_accents="unicode",
                                analyzer="char",
                                norm=None,
                                token_pattern=r"\w{1,}",
                                ngram_range=(1, 1), 
                                use_idf=0,
                                vocabulary=vocabulary)
        return tfidf

    ## char based ngram
    def _init_char_ngram_tfidf(self, ngram, vocabulary=None):
        tfidf = TfidfVectorizer(min_df=3,
                                max_df=0.75, 
                                max_features=None, 
                                norm="l2",
                                strip_accents="unicode", 
                                analyzer="char",
                                token_pattern=r"\w{1,}",
                                ngram_range=(1, ngram), 
                                use_idf=1,
                                smooth_idf=1,
                                sublinear_tf=1, 
                                # stop_words="english",
                                vocabulary=vocabulary)
        return tfidf

# ------------------------ LSA -------------------------------
class LSA_Ngram(VectorSpace):
    def __init__(self, corpus, obs_corpus, target_corpus=None, ngram=3, svd_dim=100, svd_n_iter=5):
        self.obs_corpus = obs_corpus
        self.ngram = ngram
        self.svd_dim = svd_dim
        self.svd_n_iter = svd_n_iter
        self.corpus = corpus
        self.target_corpus = target_corpus
        
    def word_transform(self):
        tfidf = self._init_word_ngram_tfidf(self.ngram)
        tfidf.fit(self.corpus)
        X = tfidf.transform(self.obs_corpus)
#         word2tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))
        svd = TruncatedSVD(n_components = self.svd_dim, 
                n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
        return svd.fit_transform(X)
    
    def char_transform(self):
        tfidf = self._init_char_ngram_tfidf(self.ngram)
        tfidf.fit(self.corpus)
        X = tfidf.transform(self.obs_corpus)
        svd = TruncatedSVD(n_components = self.svd_dim, 
                n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
        return svd.fit_transform(X)

    def pair_transform(self):
        ## tfidf
        tfidf = self._init_word_ngram_tfidf(ngram=self.ngram)
        tfidf.fit(self.corpus)
        X_obs = tfidf.transform(self.obs_corpus)
        tfidf = self._init_word_ngram_tfidf(ngram=self.ngram)
        tfidf.fit(self.corpus)
        X_target = tfidf.transform(self.target_corpus)
        X_tfidf = scipy.sparse.hstack([X_obs, X_target]).tocsr()
        ## svd
        svd = TruncatedSVD(n_components=self.svd_dim, n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
        X_svd = svd.fit_transform(X_tfidf)
        return X_svd
    
all_corpus = []
feats_corpus = ['question1','question2']
for f in feats_corpus:
    train[f] = train[f].astype(str)
    all_corpus += train[f].values.tolist()

for f in ['question1','question2']:
    lsa_word = LSA_Ngram(all_corpus,train[f], ngram=2, svd_dim=config.SVD_DIM, svd_n_iter=config.SVD_N_ITER)
    print lsa_word.word_transform()
    break
    
for f in ['question1','question2']:
    lsa_char = LSA_Ngram(all_corpus,train[f], ngram=5, svd_dim=config.SVD_DIM, svd_n_iter=config.SVD_N_ITER)
    print lsa_char.char_transform()
    break

lsa_pair = LSA_Ngram(all_corpus,train['question1'], target_corpus=train['question2'], ngram=2, svd_dim=config.SVD_DIM, svd_n_iter=config.SVD_N_ITER)
print lsa_pair.pair_transform()

[[ 0.48897685 -0.12364507 -0.12529131 ...,  0.19337968 -0.04179793
  -0.21828433]
 [ 0.76094565 -0.13066384 -0.24926977 ..., -0.19513835 -0.00652884
   0.01361816]
 [ 0.20219964  0.32453762 -0.08998923 ..., -0.13682609  0.11677467
   0.02283278]
 ..., 
 [ 0.13093228  0.32318256 -0.10861661 ...,  0.01381884 -0.01144202
  -0.05396418]
 [ 0.28598382  0.02573773  0.37589184 ..., -0.13925678  0.02044775
  -0.11142202]
 [ 0.36398638 -0.09518961 -0.13558644 ...,  0.04975152 -0.01402062
   0.02189748]]
[[ 0.30628278  0.13072922 -0.05537384 ...,  0.02647884 -0.07637475
   0.00863192]
 [ 0.28948935  0.18839956 -0.04557625 ..., -0.03926573 -0.08395679
  -0.00401218]
 [ 0.30066084 -0.16248022 -0.1117007  ..., -0.06727614 -0.0669044
   0.09360335]
 ..., 
 [ 0.22409702 -0.17455413 -0.12172682 ...,  0.12891339 -0.05281741
   0.05558532]
 [ 0.30431468  0.05538239  0.10953847 ..., -0.15892938 -0.1105081
   0.0027384 ]
 [ 0.28330364  0.18447348 -0.046028   ...,  0.19158112 -0.03420802
   0.02571635]]
[[

In [79]:
class TSNE_LSA_Ngram(LSA_Ngram):
    def __init__(self, corpus, obs_corpus, target_corpus=None, ngram=3, svd_dim=100, svd_n_iter=5):
        LSA_Ngram.__init__(self, corpus, obs_corpus, target_corpus, ngram, svd_dim, svd_n_iter)

    def tsne_word_transform(self):
        X_svd = self.word_transform()
        X_scaled = StandardScaler().fit_transform(X_svd)
        X_tsne = TSNE().fit_transform(X_scaled)
        return X_tsne
    
    def tsne_char_transform(self):
        X_svd = self.char_transform()
        X_scaled = StandardScaler().fit_transform(X_svd)
        X_tsne = TSNE().fit_transform(X_scaled)
        return X_tsne
    
    def tsne_pair_transform(self):
        X_svd = self.pair_transform()
        X_scaled = StandardScaler().fit_transform(X_svd)
        X_tsne = TSNE().fit_transform(X_scaled)
        return X_tsne


for f in ['question1','question2']:
    lsa_word = TSNE_LSA_Ngram(all_corpus,train[f], ngram=2, svd_dim=config.SVD_DIM, svd_n_iter=config.SVD_N_ITER)
    print lsa_word.tsne_word_transform()
    break
    
for f in ['question1','question2']:
    lsa_char = TSNE_LSA_Ngram(all_corpus,train[f], ngram=5, svd_dim=config.SVD_DIM, svd_n_iter=config.SVD_N_ITER)
    print lsa_char.tsne_char_transform()
    break

lsa_pair = TSNE_LSA_Ngram(all_corpus,train['question1'], target_corpus=train['question2'], ngram=2, svd_dim=config.SVD_DIM, svd_n_iter=config.SVD_N_ITER)
print lsa_pair.tsne_pair_transform()


[[ -5.02173026e+01   9.81634541e+01]
 [ -5.50605519e+01   5.28976124e+01]
 [ -1.45345964e+02   1.26105509e+01]
 [ -9.71501000e+01  -1.51336779e+01]
 [  9.53674103e+01   5.00098621e+01]
 [  6.59470030e+01   4.15461335e+01]
 [ -5.28130951e+01  -6.40893766e+01]
 [ -7.26328449e+01  -9.78179242e+00]
 [ -3.06216697e+00   2.34854209e+01]
 [ -6.36726993e+01   2.97892017e+01]
 [  2.94998169e+01  -4.11528362e+01]
 [  9.72237633e+00  -2.31139806e+01]
 [ -4.29106814e+01   5.45876588e+00]
 [ -3.27922544e+00   1.20207713e+02]
 [ -1.36735814e+01   5.63581756e+00]
 [  2.06460624e+02   2.19009850e+02]
 [  1.02913555e+02   2.21966611e+01]
 [  4.45491606e+01  -4.41626501e+01]
 [ -7.17631231e+01   7.41048310e+01]
 [  7.48757185e+01   6.20560869e+01]
 [  4.01956832e+01  -2.81648141e+01]
 [  2.04307677e+01   8.57176030e+00]
 [  1.04242928e+01  -6.43013667e+01]
 [  8.84815880e+01  -1.76258105e+02]
 [ -1.31826118e+02   1.16035510e+02]
 [ -9.25258676e+01   3.67804484e+01]
 [ -4.66930999e+01   4.11563876e+01]
 

In [21]:
class LSA_Ngram_Cooc(VectorSpace):
    def __init__(self, obs_corpus, target_corpus, 
            obs_ngram=1, target_ngram=1, svd_dim=100, svd_n_iter=5):
        self.obs_corpus = obs_corpus
        self.target_corpus = target_corpus
        self.obs_ngram = obs_ngram
        self.target_ngram = target_ngram
        self.svd_dim = svd_dim
        self.svd_n_iter = svd_n_iter
        self.obs_ngram_str = ngram_utils._ngram_str_map[self.obs_ngram]
        self.target_ngram_str = ngram_utils._ngram_str_map[self.target_ngram]

    def _get_cooc_terms(self, lst1, lst2, join_str):
        out = [""] * len(lst1) * len(lst2)
        cnt =  0
        for item1 in lst1:
            for item2 in lst2:
                out[cnt] = item1 + join_str + item2
                cnt += 1
        res = " ".join(out)
        return res

    def transform(self):
        obs_ngrams = list(map(lambda x: ngram_utils._ngrams(x.split(" "), self.obs_ngram, "_"), self.obs_corpus))
        target_ngrams = list(map(lambda x: ngram_utils._ngrams(x.split(" "), self.target_ngram, "_"), self.target_corpus))
        cooc_terms = list(map(lambda lst1,lst2: self._get_cooc_terms(lst1, lst2, "X"), obs_ngrams, target_ngrams))

        tfidf = self._init_word_ngram_tfidf(ngram=1)
        X = tfidf.fit_transform(cooc_terms)
        svd = TruncatedSVD(n_components=self.svd_dim, n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
        return svd.fit_transform(X)
    
    
lsa_word = LSA_Ngram_Cooc(train['question1'],train['question2'], svd_dim=config.SVD_DIM, svd_n_iter=config.SVD_N_ITER)
print lsa_word.transform()

[[ 0.38716338 -0.11492736  0.02693557 ..., -0.21959318  0.04572143
  -0.20302537]
 [ 0.37860533 -0.17642094 -0.11915297 ..., -0.02106806  0.06484059
  -0.14097846]
 [ 0.16008797  0.26690364  0.07742443 ...,  0.00483183 -0.03306614
   0.19714163]
 ..., 
 [ 0.18403166  0.3767789   0.17852834 ..., -0.06044721 -0.00204786
  -0.02433432]
 [ 0.35875601 -0.10181138 -0.30319755 ...,  0.02949101  0.00761067
  -0.05966009]
 [ 0.43520514 -0.10988795  0.11824322 ..., -0.02219283 -0.1385344
  -0.03518994]]


In [25]:
# ------------------------ LSA Cosine Similarity -------------------------------
class LSA_Ngram_CosineSim(VectorSpace):
    def __init__(self, obs_corpus, target_corpus, ngram=3, svd_dim=100, svd_n_iter=5):
        self.obs_corpus = obs_corpus
        self.target_corpus = target_corpus
        self.ngram = ngram
        self.svd_dim = svd_dim
        self.svd_n_iter = svd_n_iter

    def word_transform(self):
        ## get common vocabulary
        tfidf = self._init_word_ngram_tfidf(self.ngram)
        tfidf.fit(list(self.obs_corpus) + list(self.target_corpus))
        vocabulary = tfidf.vocabulary_
        ## obs tfidf
        tfidf = self._init_word_ngram_tfidf(self.ngram, vocabulary)
        X_obs = tfidf.fit_transform(self.obs_corpus)
        ## targetument tfidf
        tfidf = self._init_word_ngram_tfidf(self.ngram, vocabulary)
        X_target = tfidf.fit_transform(self.target_corpus)
        ## svd
        svd = TruncatedSVD(n_components = self.svd_dim, 
                n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
        svd.fit(scipy.sparse.vstack((X_obs, X_target)))
        X_obs = svd.transform(X_obs)
        X_target = svd.transform(X_target)
        ## cosine similarity
        sim = list(map(dist_utils._cosine_sim, X_obs, X_target))
        sim = np.asarray(sim).squeeze()
        return sim
    
    def char_transform(self):
        ## get common vocabulary
        tfidf = self._init_char_ngram_tfidf(self.ngram)
        tfidf.fit(list(self.obs_corpus) + list(self.target_corpus))
        vocabulary = tfidf.vocabulary_
        ## obs tfidf
        tfidf = self._init_char_ngram_tfidf(self.ngram, vocabulary)
        X_obs = tfidf.fit_transform(self.obs_corpus)
        ## targetument tfidf
        tfidf = self._init_char_ngram_tfidf(self.ngram, vocabulary)
        X_target = tfidf.fit_transform(self.target_corpus)
        ## svd
        svd = TruncatedSVD(n_components=self.svd_dim, 
                n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
        svd.fit(scipy.sparse.vstack((X_obs, X_target)))
        X_obs = svd.transform(X_obs)
        X_target = svd.transform(X_target)
        ## cosine similarity
        sim = list(map(dist_utils._cosine_sim, X_obs, X_target))
        sim = np.asarray(sim).squeeze()
        return sim
    
cosinesim_word = LSA_Ngram_CosineSim(train['question1'],train['question2'], ngram=3, svd_dim=config.SVD_DIM, svd_n_iter=config.SVD_N_ITER)
print cosinesim_word.word_transform()
cosinesim_char = LSA_Ngram_CosineSim(train['question1'],train['question2'], ngram=5, svd_dim=config.SVD_DIM, svd_n_iter=config.SVD_N_ITER)
print cosinesim_char.char_transform()


[ 0.97847646  0.68061563  0.95171286  0.21872809  0.94611844  0.96902353
  0.25696486  0.59150984  0.9846641   0.20744675  0.48132165  0.32993468
  0.82545877  0.97468417  0.99838001  0.52968499  0.99922397  0.2458281
  0.78626417  0.99988169  0.72661768  0.47052346  0.49049765  0.20727479
  0.35482186  0.9999447   0.89821017  0.02040301  0.98584412  0.88299513
  0.92618964  0.05305127  0.98456905  0.10727389  0.98793669  0.53883786
  0.96040861  0.44068233  0.91302845  0.16338309  0.27571471  0.99434496
  0.99971668  0.72264042  0.99988953  0.19729982 -0.08330814  0.63705422
  0.93522447  0.45299138  0.88233123  0.78467812  0.19955324  0.88649557
  0.09486967  0.13843015  0.61367832  0.82848928  0.9149584   0.04698079
  0.00418346  0.92522448  0.93782364  0.95337823  0.59217851  0.33222821
  0.79530532  0.43176658  0.86225878  0.39578685  0.97123748  0.88583323
  0.9787098   0.4108435   0.98901547  0.80988778 -0.00455441  0.42917243
  0.14127949  0.95090123  0.53664426  0.35051511  0.

201

In [56]:
# ------------------- Char distribution -------------------
class CharDistribution(VectorSpace):
    def __init__(self, obs_corpus, target_corpus):
        self.obs_corpus = obs_corpus
        self.target_corpus = target_corpus

    def normalize(self, text):
        # pat = re.compile("[a-z0-9]")
        pat = re.compile("[a-z]")
        group = pat.findall(text.lower())
        if group is None:
            res = " "
        else:
            res = "".join(group)
            res += " "
        return res

    def preprocess(self, corpus):
        return [self.normalize(text) for text in corpus]

    def get_distribution(self):
        ## obs tfidf
        tfidf = self._init_char_tfidf()
        X_obs = tfidf.fit_transform(self.preprocess(self.obs_corpus)).todense()
        X_obs = np.asarray(X_obs)
        # apply laplacian smoothing
        s = 1.
        X_obs = (X_obs + s) / (np.sum(X_obs, axis=1)[:,None] + X_obs.shape[1]*s)
        ## targetument tfidf
        tfidf = self._init_char_tfidf()
        X_target = tfidf.fit_transform(self.preprocess(self.target_corpus)).todense()
        X_target = np.asarray(X_target)
        X_target = (X_target + s) / (np.sum(X_target, axis=1)[:,None] + X_target.shape[1]*s)
        return X_obs, X_target

class CharDistribution_transform(CharDistribution):
    def __init__(self, obs_corpus, target_corpus, const_A=1., const_B=1.):
        CharDistribution.__init__(self, obs_corpus, target_corpus)
        self.const_A = const_A
        self.const_B = const_B

    def ratio_transform(self):
        X_obs, X_target = self.get_distribution()
        ratio = (X_obs + self.const_A) / (X_target + self.const_B)
        return ratio

    def cosine_transform(self):
        X_obs, X_target = self.get_distribution()
        ## cosine similarity
        sim = list(map(dist_utils._cosine_sim, X_obs, X_target))
        sim = np.asarray(sim).squeeze()
        return sim

    def kl_transform(self):
        X_obs, X_target = self.get_distribution()
        kl = dist_utils._KL(X_obs, X_target)
        return kl
    
# cosinesim_word = CharDistribution(train['question1'],train['question2'])
# print cosinesim_word.get_distribution()
cosinesim_word = CharDistribution_transform(train['question1'],train['question2'])
print cosinesim_word.ratio_transform()
print cosinesim_word.cosine_transform()
print cosinesim_word.kl_transform()

[[ 1.00735043  0.99754127  0.99875356 ...,  0.99875356  0.99754127
   0.99875356]
 [ 0.98521285  0.99487179  0.99487179 ...,  1.00502355  1.02025118
   1.00502355]
 [ 0.99404319  0.97285068  1.00521221 ...,  0.99845201  0.99694423
   0.99845201]
 ..., 
 [ 0.99753695  0.99753695  0.99524376 ...,  0.99874608  0.99874608
   0.99874608]
 [ 0.98924731  0.98924731  0.99966044 ...,  0.99988438  0.99988438
   1.01052145]
 [ 1.01079219  0.98275058  0.99855769 ...,  0.9992674   0.99855769
   0.9992674 ]]
[ 0.9826394   0.94292652  0.95321004  0.81583971  0.92166164  0.97178847
  0.79229621  0.95374099  0.99052999  0.89268778  0.91977606  0.91901409
  0.97485929  0.99200608  0.99795932  0.9512718   0.99629752  0.926243
  0.94867303  0.98153152  0.90606067  0.94434925  0.94616624  0.79632086
  0.82330525  0.98999437  0.96656932  0.94472788  0.98221291  0.92317469
  0.9654985   0.940977    0.98609112  0.89835154  0.97657288  0.94763479
  0.95091058  0.93958255  0.91248902  0.94261539  0.88117256  0.

In [17]:
from nltk.corpus import wordnet as wn
from utils import dist_utils, ngram_utils, nlp_utils, pkl_utils
from utils import logging_utils, time_utils

# tune the token pattern to get a better correlation with y_train
token_pattern = r"(?u)\b\w\w+\b"
# token_pattern = r"\w{1,}"
# token_pattern = r"\w+"
# token_pattern = r"[\w']+"
# token_pattern = " " 

class WordNet_Similarity:
    """Double aggregation features"""
    def __init__(self, metric="path"):
#         super().__init__(obs_corpus, target_corpus, aggregation_mode, None, aggregation_mode_prev)
        self.metric = metric
        if self.metric == "path":
            self.metric_func = lambda syn1, syn2: wn.path_similarity(syn1, syn2)
        elif self.metric == "lch":
            self.metric_func = lambda syn1, syn2: wn.lch_similarity(syn1, syn2)
        elif self.metric == "wup":
            self.metric_func = lambda syn1, syn2: wn.wup_similarity(syn1, syn2)
        else:
            raise(ValueError("Wrong similarity metric: %s, should be one of path/lch/wup."%self.metric))
            
    def _maximum_similarity_for_two_synset_list(self, syn_list1, syn_list2):
        s = 0.
        if syn_list1 and syn_list2:
            for syn1 in syn_list1:
                for syn2 in syn_list2:
                    try:
                        _s = self.metric_func(syn1, syn2)
                    except:
                        _s = config.MISSING_VALUE_NUMERIC
                    if _s and _s > s:
                        s = _s
        return s

    def transform_one(self, obs, target):
        obs_tokens = nlp_utils._tokenize(obs, token_pattern)
        target_tokens = nlp_utils._tokenize(target, token_pattern)
        obs_synset_list = [wn.synsets((obs_token).decode('utf-8')) for obs_token in obs_tokens]
        target_synset_list = [wn.synsets((target_token).decode('utf-8')) for target_token in target_tokens]
        val_list = []
        for obs_synset in obs_synset_list:
            _val_list = []
            for target_synset in target_synset_list:
                _s = self._maximum_similarity_for_two_synset_list(obs_synset, target_synset)
                _val_list.append(_s)
            if len(_val_list) == 0:
                _val_list = [config.MISSING_VALUE_NUMERIC]
            val_list.append( max(_val_list) )
        if len(val_list) == 0:
            val_list = [[config.MISSING_VALUE_NUMERIC]]
        return np.mean(val_list)
    
t = train.sample(n=10)
wn_list = ["path", "lch", "wup"]
for wn_method in wn_list:
    wn_sim = WordNet_Similarity(metric=wn_method)
    t.apply(lambda x: wn_sim.transform_one(x['question1'],x['question2']), axis=1)

290086    0.138095
72013     0.363054
366541    0.500000
67367     0.555556
89896     0.500000
217790    0.239914
275125    0.425397
608       0.513889
21845     0.532407
218776    0.127778
dtype: float64

290086    1.271662
72013     1.721377
366541    1.999934
67367     2.427426
89896     1.818793
217790    1.640691
275125    1.723845
608       2.375150
21845     1.875742
218776    0.904142
dtype: float64

290086    0.426494
72013     0.515385
366541    0.565171
67367     0.658333
89896     0.500000
217790    0.558175
275125    0.631258
608       0.706926
21845     0.625356
218776    0.270120
dtype: float64

In [4]:
from __future__ import division
import time, os, gc
import numpy as np
import pandas as pd
import scipy
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectPercentile, f_classif
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder,StandardScaler
from sklearn.decomposition import TruncatedSVD,PCA
from sklearn.feature_extraction import text
from sklearn.metrics import log_loss

import cPickle
import gensim
from fuzzywuzzy import fuzz
from nltk.corpus import stopwords
from tqdm import tqdm
from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
from nltk import word_tokenize
import config
stop_words = stopwords.words('english')

PATH = config.RAW_PATH
FEAT_PATH = config.FEAT_PATH
# train = pd.read_csv(PATH+'train.csv',nrows=config.TRAIN_SIZE)
# test = pd.read_csv(PATH+'test.csv',nrows=config.TEST_SIZE)
data = train#.ix[:100]

def wmd(s1,s2):
    s1 = str(s1).lower().split()
    s2 = str(s2).lower().split()
    stop_words = stopwords.words("english")
    s1 = [w for w in s1 if w not in stop_words]
    s2 = [w for w in s2 if w not in stop_words]
    return model.wmdistance(s1,s2)

def norm_wmd(s1,s2):
    s1 = str(s1).lower().split()
    s2 = str(s2).lower().split()
    stop_words = stopwords.words("english")
    s1 = [w for w in s1 if w not in stop_words]
    s2 = [w for w in s2 if w not in stop_words]
    return norm_model.wmdistance(s1, s2)


def sent2vec(s):
    words = str(s).lower().decode('utf-8')
    words = word_tokenize(words)
    words = [w for w in words if not w in stop_words]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(model[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    return v / np.sqrt((v ** 2).sum())

# data['len_q1'] = data.question1.apply(lambda x: len(str(x)))
# data['len_q2'] = data.question2.apply(lambda x: len(str(x)))
# data['diff_len'] = data.len_q1 - data.len_q2
# data['len_char_q1'] = data.question1.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
# data['len_char_q2'] = data.question2.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
# data['len_word_q1'] = data.question1.apply(lambda x: len(str(x).split()))
# data['len_word_q2'] = data.question2.apply(lambda x: len(str(x).split()))
# data['common_words'] = data.apply(lambda x: len(set(str(x['question1']).lower().split()).intersection(set(str(x['question2']).lower().split()))), axis=1)
# data['fuzz_qratio'] = data.apply(lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
# data['fuzz_WRatio'] = data.apply(lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
# data['fuzz_partial_ratio'] = data.apply(lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
# data['fuzz_partial_token_set_ratio'] = data.apply(lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
# data['fuzz_partial_token_sort_ratio'] = data.apply(lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
# data['fuzz_token_set_ratio'] = data.apply(lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
# data['fuzz_token_sort_ratio'] = data.apply(lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)

model = gensim.models.KeyedVectors.load_word2vec_format(PATH+'GoogleNews-vectors-negative300.bin.gz', binary=True)
# data['wmd'] = data.apply(lambda x: wmd(x['question1'], x['question2']), axis=1)

# norm_model =  gensim.models.KeyedVectors.load_word2vec_format(PATH+'GoogleNews-vectors-negative300.bin.gz', binary=True)
# norm_model.init_sims(replace=True)
# data['norm_wmd'] = data.apply(lambda x: norm_wmd(x['question1'], x['question2']), axis=1)

question1_vectors = np.zeros((data.shape[0], 300))
error_count = 0
for i, q in tqdm(enumerate(data.question1.values)):
    question1_vectors[i, :] = sent2vec(q)

question2_vectors  = np.zeros((data.shape[0], 300))
for i, q in tqdm(enumerate(data.question2.values)):
    question2_vectors[i, :] = sent2vec(q)

# data['cosine_distance'] = [cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
#                                                           np.nan_to_num(question2_vectors))]

# data['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
#                                                           np.nan_to_num(question2_vectors))]

# data['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
#                                                           np.nan_to_num(question2_vectors))]

# data['canberra_distance'] = [canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
#                                                           np.nan_to_num(question2_vectors))]

# data['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
#                                                           np.nan_to_num(question2_vectors))]

# data['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors),
#                                                           np.nan_to_num(question2_vectors))]

# data['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
#                                                           np.nan_to_num(question2_vectors))]


# data['skew_q1vec'] = [skew(x) for x in np.nan_to_num(question1_vectors)]
# data['skew_q2vec'] = [skew(x) for x in np.nan_to_num(question2_vectors)]
# data['kur_q1vec'] = [kurtosis(x) for x in np.nan_to_num(question1_vectors)]
# data['kur_q2vec'] = [kurtosis(x) for x in np.nan_to_num(question2_vectors)]

# cPickle.dump(question1_vectors, open(FEAT_PATH+'q1_w2v.pkl', 'wb'), -1)
# cPickle.dump(question2_vectors, open(FEAT_PATH+'q2_w2v.pkl', 'wb'), -1)

# data.to_csv(FEAT_PATH+'ab_features.csv', index=False)

# data



2750086it [10:59, 4170.62it/s]
2750086it [11:02, 4149.21it/s]


In [5]:
from utils import dist_utils, ngram_utils, nlp_utils
data['RMSE_distance'] = [dist_utils._rmse(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]