In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot
from nltk.corpus import stopwords
from collections import Counter

%matplotlib inline

In [2]:
data_train = pd.read_csv('data/train.csv')
data_test = pd.read_csv('data/test.csv')

nlp_train_features = pd.read_csv('data/nlp_features_train.csv')

In [3]:
nlp_train_features.head()

Unnamed: 0.1,Unnamed: 0,ner_overlap,lemma_overlap,ner_q1_count,ner_q2_count,qn_word_overlap,bigram_overlap,trigram_overlap,root_match,subj_match,dobj_match
0,0,1,0.433333,1,1,1,10,9,1,1,1
1,1,0,0.5,0,0,1,1,0,0,0,0
2,2,0,0.5,0,0,1,1,0,0,0,0
3,3,0,0.5,0,0,0,0,0,0,0,0
4,4,1,0.46875,1,1,1,0,0,0,0,1


In [4]:
print data_train['question1'][0]
print data_train['question2'][0]
print data_train['is_duplicate'][0]

What is the step by step guide to invest in share market in india?
What is the step by step guide to invest in share market?
0


In [6]:
#feature generation
stops = set(stopwords.words("english"))

train_qs = pd.Series(data_train['question1'].tolist() + data_train['question2'].tolist()).astype(str)
test_qs = pd.Series(data_test['question1'].tolist() + data_test['question2'].tolist()).astype(str)

#word_share_match
def word_match_share(row):
    q1words = {}
    q2words = {}
    for word in str(row['question1']).lower().split():
        if word not in stops:
            q1words[word] = 1
    for word in str(row['question2']).lower().split():
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    shared_words_in_q1 = [w for w in q1words.keys() if w in q2words]
    shared_words_in_q2 = [w for w in q2words.keys() if w in q1words]
    R = (len(shared_words_in_q1) + len(shared_words_in_q2))/(len(q1words) + len(q2words))
    return R


#tfidf_share_match
def get_weight(count, eps=10000, min_count=2):
    if count < min_count:
        return 0
    else:
        return 1 / float(count + eps)

eps = 5000 
words = (" ".join(train_qs)).lower().split()
counts = Counter(words)
weights = {word: get_weight(count) for word, count in counts.items()}

def tfidf_word_match_share(row):
    q1words = {}
    q2words = {}
    for word in str(row['question1']).lower().split():
        if word not in stops:
            q1words[word] = 1
    for word in str(row['question2']).lower().split():
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    
    shared_weights = [weights.get(w, 0) for w in q1words.keys() if w in q2words] + [weights.get(w, 0) for w in q2words.keys() if w in q1words]
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]
    
    R = np.sum(shared_weights) / np.sum(total_weights)
    return R


train_word_match = data_train.apply(word_match_share, axis=1, raw=True)
tfidf_train_word_match = data_train.apply(tfidf_word_match_share, axis=1, raw=True)

#Features
x_train = pd.DataFrame()
x_test = pd.DataFrame()

x_train['word_match'] = train_word_match
x_train['q1len'] = data_train['question1'].str.len()
x_train['q2len'] = data_train['question2'].str.len()
x_train['tfidf_word_match'] = tfidf_train_word_match
x_train['q1_n_words'] = data_train['question1'].apply(lambda row: len(str(row).split(" ")))
x_train['q2_n_words'] = data_train['question2'].apply(lambda row: len(str(row).split(" ")))
x_train['ner_overlap'] = nlp_train_features['ner_overlap']
x_train['lemma_overlap'] = nlp_train_features['lemma_overlap']
x_train['ner_q1_count'] = nlp_train_features['ner_q1_count']
x_train['ner_q2_count'] = nlp_train_features['ner_q2_count']
x_train['qn_word_overlap'] = nlp_train_features['qn_word_overlap']
x_train['bigram_overlap'] = nlp_train_features['bigram_overlap']
x_train['trigram_overlap'] = nlp_train_features['trigram_overlap']
x_train['root_match'] = nlp_train_features['root_match']
x_train['subj_match'] = nlp_train_features['subj_match']
x_train['dobj_match'] = nlp_train_features['dobj_match']



x_test['word_match'] = data_test.apply(word_match_share, axis=1, raw=True)
x_test['q1len'] = data_test['question1'].str.len()
x_test['q2len'] = data_test['question2'].str.len()
x_test['tfidf_word_match'] = data_test.apply(tfidf_word_match_share, axis=1, raw=True)
x_test['q1_n_words'] = data_test['question1'].apply(lambda row: len(str(row).split(" ")))
x_test['q2_n_words'] = data_test['question2'].apply(lambda row: len(str(row).split(" ")))


y_train = data_train['is_duplicate'].values


In [7]:
#Since we have 37% positive class in our training data, and only 17% in the test data. 
#By re-balancing the data so our training set has 17% positives

pos_train = x_train[y_train == 1]
neg_train = x_train[y_train == 0]
p = 0.165

scale = ((len(pos_train) / float(len(pos_train) + len(neg_train))) / p) - 1
while scale > 1:
    neg_train = pd.concat([neg_train, neg_train])
    scale -=1
neg_train = pd.concat([neg_train, neg_train[:int(scale * len(neg_train))]])
print(len(pos_train) / float(len(pos_train) + len(neg_train)))

x_train = pd.concat([pos_train, neg_train])
y_train = (np.zeros(len(pos_train)) + 1).tolist() + np.zeros(len(neg_train)).tolist()
del pos_train, neg_train


# Finally, we split some of the data off for validation
from sklearn.cross_validation import train_test_split
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.2, random_state=4242)


0.191243661001


In [9]:
#baseline model : training using xgboost
import xgboost as xgb

#parameters for xgboost
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['eta'] = 0.02
params['max_depth'] = 4

d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]

bst = xgb.train(params, d_train, 1000, watchlist, early_stopping_rounds=50)



[0]	train-logloss:0.683348	valid-logloss:0.683384
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[1]	train-logloss:0.673934	valid-logloss:0.674007
[2]	train-logloss:0.664883	valid-logloss:0.664992
[3]	train-logloss:0.656176	valid-logloss:0.656321
[4]	train-logloss:0.647795	valid-logloss:0.647976
[5]	train-logloss:0.639724	valid-logloss:0.639937
[6]	train-logloss:0.631942	valid-logloss:0.632187
[7]	train-logloss:0.624444	valid-logloss:0.624723
[8]	train-logloss:0.617208	valid-logloss:0.617518
[9]	train-logloss:0.610233	valid-logloss:0.610579
[10]	train-logloss:0.603501	valid-logloss:0.603881
[11]	train-logloss:0.597	valid-logloss:0.597414
[12]	train-logloss:0.590717	valid-logloss:0.59116
[13]	train-logloss:0.584648	valid-logloss:0.585125
[14]	train-logloss:0.578781	valid-logloss:0.57929
[15]	train-logloss:0.573108	valid-logloss:0.57365
[16]	train-logloss:0.567621	valid-logloss:0.56819

[158]	train-logloss:0.375866	valid-logloss:0.378087
[159]	train-logloss:0.375694	valid-logloss:0.377919
[160]	train-logloss:0.375522	valid-logloss:0.377752
[161]	train-logloss:0.375354	valid-logloss:0.377589
[162]	train-logloss:0.375194	valid-logloss:0.377434
[163]	train-logloss:0.375031	valid-logloss:0.377277
[164]	train-logloss:0.37486	valid-logloss:0.377109
[165]	train-logloss:0.374705	valid-logloss:0.37696
[166]	train-logloss:0.374553	valid-logloss:0.376812
[167]	train-logloss:0.374403	valid-logloss:0.376666
[168]	train-logloss:0.374237	valid-logloss:0.376507
[169]	train-logloss:0.374093	valid-logloss:0.376367
[170]	train-logloss:0.373932	valid-logloss:0.37621
[171]	train-logloss:0.373798	valid-logloss:0.376078
[172]	train-logloss:0.37365	valid-logloss:0.37593
[173]	train-logloss:0.373497	valid-logloss:0.37578
[174]	train-logloss:0.373371	valid-logloss:0.375657
[175]	train-logloss:0.373224	valid-logloss:0.375514
[176]	train-logloss:0.373081	valid-logloss:0.375373
[177]	train-loglos

[317]	train-logloss:0.365279	valid-logloss:0.367791
[318]	train-logloss:0.365252	valid-logloss:0.367766
[319]	train-logloss:0.365219	valid-logloss:0.36773
[320]	train-logloss:0.365188	valid-logloss:0.367704
[321]	train-logloss:0.365177	valid-logloss:0.367693
[322]	train-logloss:0.365163	valid-logloss:0.367682
[323]	train-logloss:0.3651	valid-logloss:0.367617
[324]	train-logloss:0.36508	valid-logloss:0.367597
[325]	train-logloss:0.36505	valid-logloss:0.367564
[326]	train-logloss:0.365022	valid-logloss:0.367537
[327]	train-logloss:0.364995	valid-logloss:0.36751
[328]	train-logloss:0.364934	valid-logloss:0.367449
[329]	train-logloss:0.364924	valid-logloss:0.367438
[330]	train-logloss:0.364905	valid-logloss:0.367421
[331]	train-logloss:0.364868	valid-logloss:0.367383
[332]	train-logloss:0.364837	valid-logloss:0.367353
[333]	train-logloss:0.364808	valid-logloss:0.367322
[334]	train-logloss:0.364795	valid-logloss:0.367309
[335]	train-logloss:0.364729	valid-logloss:0.367246
[336]	train-loglos

[476]	train-logloss:0.360869	valid-logloss:0.363627
[477]	train-logloss:0.36084	valid-logloss:0.363602
[478]	train-logloss:0.360798	valid-logloss:0.363565
[479]	train-logloss:0.36077	valid-logloss:0.363542
[480]	train-logloss:0.360753	valid-logloss:0.363527
[481]	train-logloss:0.360723	valid-logloss:0.363499
[482]	train-logloss:0.360688	valid-logloss:0.363468
[483]	train-logloss:0.360648	valid-logloss:0.363432
[484]	train-logloss:0.360613	valid-logloss:0.3634
[485]	train-logloss:0.360573	valid-logloss:0.36336
[486]	train-logloss:0.360559	valid-logloss:0.363347
[487]	train-logloss:0.360545	valid-logloss:0.363334
[488]	train-logloss:0.360524	valid-logloss:0.363312
[489]	train-logloss:0.360484	valid-logloss:0.363274
[490]	train-logloss:0.360451	valid-logloss:0.363244
[491]	train-logloss:0.360409	valid-logloss:0.363201
[492]	train-logloss:0.360383	valid-logloss:0.363178
[493]	train-logloss:0.360369	valid-logloss:0.363169
[494]	train-logloss:0.36035	valid-logloss:0.363151
[495]	train-loglos

[635]	train-logloss:0.357477	valid-logloss:0.360547
[636]	train-logloss:0.357474	valid-logloss:0.360544
[637]	train-logloss:0.357467	valid-logloss:0.360541
[638]	train-logloss:0.357456	valid-logloss:0.360532
[639]	train-logloss:0.357437	valid-logloss:0.360517
[640]	train-logloss:0.357424	valid-logloss:0.360507
[641]	train-logloss:0.35742	valid-logloss:0.360505
[642]	train-logloss:0.357405	valid-logloss:0.360491
[643]	train-logloss:0.357384	valid-logloss:0.360472
[644]	train-logloss:0.35736	valid-logloss:0.360448
[645]	train-logloss:0.357353	valid-logloss:0.360441
[646]	train-logloss:0.35733	valid-logloss:0.360422
[647]	train-logloss:0.357297	valid-logloss:0.36039
[648]	train-logloss:0.357288	valid-logloss:0.360383
[649]	train-logloss:0.35727	valid-logloss:0.360368
[650]	train-logloss:0.357238	valid-logloss:0.360337
[651]	train-logloss:0.357217	valid-logloss:0.360317
[652]	train-logloss:0.357185	valid-logloss:0.360286
[653]	train-logloss:0.357172	valid-logloss:0.360276
[654]	train-loglo

[794]	train-logloss:0.355109	valid-logloss:0.358435
[795]	train-logloss:0.355098	valid-logloss:0.358427
[796]	train-logloss:0.355087	valid-logloss:0.35842
[797]	train-logloss:0.355083	valid-logloss:0.358416
[798]	train-logloss:0.355077	valid-logloss:0.358412
[799]	train-logloss:0.355072	valid-logloss:0.358408
[800]	train-logloss:0.35506	valid-logloss:0.358398
[801]	train-logloss:0.355048	valid-logloss:0.358388
[802]	train-logloss:0.355037	valid-logloss:0.358376
[803]	train-logloss:0.355033	valid-logloss:0.358375
[804]	train-logloss:0.35502	valid-logloss:0.358363
[805]	train-logloss:0.355002	valid-logloss:0.358343
[806]	train-logloss:0.354989	valid-logloss:0.358333
[807]	train-logloss:0.354983	valid-logloss:0.358328
[808]	train-logloss:0.354968	valid-logloss:0.358314
[809]	train-logloss:0.354955	valid-logloss:0.358301
[810]	train-logloss:0.354938	valid-logloss:0.358285
[811]	train-logloss:0.354926	valid-logloss:0.358276
[812]	train-logloss:0.354913	valid-logloss:0.358265
[813]	train-log

[953]	train-logloss:0.353387	valid-logloss:0.356941
[954]	train-logloss:0.353376	valid-logloss:0.356933
[955]	train-logloss:0.353366	valid-logloss:0.356927
[956]	train-logloss:0.35336	valid-logloss:0.356921
[957]	train-logloss:0.35335	valid-logloss:0.356913
[958]	train-logloss:0.353343	valid-logloss:0.356907
[959]	train-logloss:0.353341	valid-logloss:0.356905
[960]	train-logloss:0.353334	valid-logloss:0.3569
[961]	train-logloss:0.353318	valid-logloss:0.356884
[962]	train-logloss:0.353305	valid-logloss:0.356873
[963]	train-logloss:0.353295	valid-logloss:0.356864
[964]	train-logloss:0.353282	valid-logloss:0.356852
[965]	train-logloss:0.353277	valid-logloss:0.356847
[966]	train-logloss:0.35327	valid-logloss:0.356841
[967]	train-logloss:0.353258	valid-logloss:0.35683
[968]	train-logloss:0.353249	valid-logloss:0.356825
[969]	train-logloss:0.353236	valid-logloss:0.356815
[970]	train-logloss:0.353225	valid-logloss:0.356807
[971]	train-logloss:0.353223	valid-logloss:0.356805
[972]	train-loglos

In [None]:
#predictions
d_test = xgb.DMatrix(x_test)
p_test = bst.predict(d_test)

sub = pd.DataFrame()
sub['test_id'] = data_test['test_id']
sub['is_duplicate'] = p_test
sub.to_csv('data/quora/baseline_xgb.csv', index=False)