In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from nltk.corpus import stopwords

INPUT_PATH = '/home/lyz/work/kaggle/kaggle-quora-question-pairs/'

# 特征工程 + 树模型
# 词向量 + 孪生网络

In [8]:
df_train = pd.read_csv(INPUT_PATH + 'train.csv', nrows=5000)
df_test  = pd.read_csv(INPUT_PATH + 'test.csv', nrows=5000)

In [9]:
# idf
def get_weight(count, eps=10000, min_count=2):
    return 0 if count < min_count else 1 / (count + eps)


train_qs = pd.Series(
    df_train['question1'].tolist() + df_train['question2'].tolist()
).astype(str)

words = (" ".join(train_qs)).lower().split()
counts = Counter(words)
weights = {word: get_weight(count) for word, count in counts.items()}

In [10]:
stops = set(stopwords.words("english"))

def word_shares(row):
    
    # 第1种情况：句子1只包含停用词
    q1_list = str(row['question1']).lower().split()
    q1 = set(q1_list)
    q1words = q1.difference(stops)
    if len(q1words) == 0:
        return '0:0:0:0:0:0:0:0'
    
    # 第2种情况：句子2只包含停用词
    q2_list = str(row['question2']).lower().split()
    q2 = set(q2_list)
    q2words = q2.difference(stops)
    if len(q2words) == 0:
        return '0:0:0:0:0:0:0:0'

    
    words_hamming = sum(1 for i in zip(q1_list, q2_list) if i[0]==i[1])/max( len(q1_list), len(q2_list) )
    q1stops = q1.intersection(stops)
    q2stops = q2.intersection(stops)

    q1_2gram = set([i for i in zip(q1_list, q1_list[1:])])
    q2_2gram = set([i for i in zip(q2_list, q2_list[1:])])

    shared_2gram = q1_2gram.intersection(q2_2gram)

    shared_words = q1words.intersection(q2words)
    shared_weights = [weights.get(w, 0) for w in shared_words]
    q1_weights = [weights.get(w, 0) for w in q1words]
    q2_weights = [weights.get(w, 0) for w in q2words]
    total_weights = q1_weights + q2_weights
    
    R1 = np.sum(shared_weights) / np.sum(total_weights) #tfidf share
    R2 = len(shared_words) / (len(q1words) + len(q2words) - len(shared_words)) #count share
    R31 = len(q1stops) / len(q1words) #stops in q1
    R32 = len(q2stops) / len(q2words) #stops in q2
    Rcosine_denominator = (np.sqrt(np.dot(q1_weights,q1_weights))*np.sqrt(np.dot(q2_weights,q2_weights)))
    Rcosine = np.dot(shared_weights, shared_weights)/Rcosine_denominator
    if len(q1_2gram) + len(q2_2gram) == 0:
        R2gram = 0
    else:
        R2gram = len(shared_2gram) / (len(q1_2gram) + len(q2_2gram))
    return '{}:{}:{}:{}:{}:{}:{}:{}'.format(R1, R2, len(shared_words), R31, R32, R2gram, Rcosine, words_hamming)


In [11]:
df = pd.concat([df_train, df_test])
df['word_shares'] = df.apply(word_shares, axis=1)

train_test = pd.DataFrame()

train_test['word_match']       = df['word_shares'].apply(lambda x: float(x.split(':')[0]))
train_test['word_match_2root'] = np.sqrt(train_test['word_match'])
train_test['tfidf_word_match'] = df['word_shares'].apply(lambda x: float(x.split(':')[1]))
train_test['shared_count']     = df['word_shares'].apply(lambda x: float(x.split(':')[2]))

train_test['stops1_ratio']     = df['word_shares'].apply(lambda x: float(x.split(':')[3]))
train_test['stops2_ratio']     = df['word_shares'].apply(lambda x: float(x.split(':')[4]))
train_test['shared_2gram']     = df['word_shares'].apply(lambda x: float(x.split(':')[5]))
train_test['cosine']           = df['word_shares'].apply(lambda x: float(x.split(':')[6]))
train_test['words_hamming']    = df['word_shares'].apply(lambda x: float(x.split(':')[7]))
train_test['diff_stops_r']     = train_test['stops1_ratio'] - train_test['stops2_ratio']

train_test['len_q1'] = df['question1'].apply(lambda x: len(str(x)))
train_test['len_q2'] = df['question2'].apply(lambda x: len(str(x)))
train_test['diff_len'] = train_test['len_q1'] - train_test['len_q2']

train_test['caps_count_q1'] = df['question1'].apply(lambda x:sum(1 for i in str(x) if i.isupper()))
train_test['caps_count_q2'] = df['question2'].apply(lambda x:sum(1 for i in str(x) if i.isupper()))
train_test['diff_caps'] = train_test['caps_count_q1'] - train_test['caps_count_q2']

train_test['len_char_q1'] = df['question1'].apply(lambda x: len(str(x).replace(' ', '')))
train_test['len_char_q2'] = df['question2'].apply(lambda x: len(str(x).replace(' ', '')))
train_test['diff_len_char'] = train_test['len_char_q1'] - train_test['len_char_q2']

train_test['len_word_q1'] = df['question1'].apply(lambda x: len(str(x).split()))
train_test['len_word_q2'] = df['question2'].apply(lambda x: len(str(x).split()))
train_test['diff_len_word'] = train_test['len_word_q1'] - train_test['len_word_q2']

train_test['avg_world_len1'] = train_test['len_char_q1'] / train_test['len_word_q1']
train_test['avg_world_len2'] = train_test['len_char_q2'] / train_test['len_word_q2']
train_test['diff_avg_word'] = train_test['avg_world_len1'] - train_test['avg_world_len2']

train_test['exactly_same'] = (df['question1'] == df['question2']).astype(int)
train_test['duplicated'] = df.duplicated(['question1','question2']).astype(int)

def add_word_count(x, df, word):
    x['q1_' + word] = df['question1'].apply(lambda x: (word in str(x).lower())*1)
    x['q2_' + word] = df['question2'].apply(lambda x: (word in str(x).lower())*1)
    x[word + '_both'] = x['q1_' + word] * x['q2_' + word]
    
add_word_count(train_test, df, 'how')
add_word_count(train_test, df, 'what')
add_word_count(train_test, df, 'which')
add_word_count(train_test, df, 'who')
add_word_count(train_test, df, 'where')
add_word_count(train_test, df, 'when')
add_word_count(train_test, df, 'why')



In [12]:
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'eta': 0.1,
    'max_depth': 5,
}

cv_results = xgb.cv(
    params,
    xgb.DMatrix(train_test.iloc[:df_train.shape[0]], df_train['is_duplicate'].values),
    num_boost_round=100,
    seed=42,
    nfold=5,
    early_stopping_rounds=10
)
cv_results

Unnamed: 0,train-logloss-mean,train-logloss-std,test-logloss-mean,test-logloss-std
0,0.65458,0.000536,0.65668,0.001123
1,0.622795,0.00105,0.626756,0.001573
2,0.595998,0.001334,0.601964,0.002194
3,0.573223,0.001615,0.581153,0.002629
4,0.553685,0.001801,0.563505,0.00303
5,0.536679,0.002084,0.548681,0.002879
6,0.521496,0.00218,0.535619,0.00347
7,0.508277,0.002232,0.524311,0.00345
8,0.496489,0.002388,0.514817,0.003736
9,0.486039,0.002441,0.506007,0.004347
