In [10]:
import numpy as np
import pandas as pd
from matplotlib import pyplot
from nltk.corpus import stopwords
from collections import Counter

%matplotlib inline

In [12]:
data_train = pd.read_csv('data/quora/train.csv')
data_test = pd.read_csv('data/quora/test.csv')

nlp_train_features = pd.read_csv('data/quora/nlp_features_train.csv')
train_cosine_sim = pd.read_pickle('data/quora/train_cosine_sim.pickle')
test_cosine_sim = pd.read_pickle('data/quora/test_cosine_sim.pickle')

In [13]:
test_cosine_sim.head()

Unnamed: 0,cosine_sim
0,0.514389
1,0.761217
2,0.838156
3,0.447083
4,1.0


In [14]:
print data_train['question1'][0]
print data_train['question2'][0]
print data_train['is_duplicate'][0]

What is the step by step guide to invest in share market in india?
What is the step by step guide to invest in share market?
0


In [15]:
#cosine similarity
def cosine(v1, v2):
    v1 = np.array(v1)
    v2 = np.array(v2)
    return np.dot(v1, v2) / (np.sqrt(np.sum(v1**2)) * np.sqrt(np.sum(v2**2)))

In [16]:
#feature generation
stops = set(stopwords.words("english"))

train_qs = pd.Series(data_train['question1'].tolist() + data_train['question2'].tolist()).astype(str)
test_qs = pd.Series(data_test['question1'].tolist() + data_test['question2'].tolist()).astype(str)

#word_share_match
def word_match_share(row):
    q1words = {}
    q2words = {}
    for word in str(row['question1']).lower().split():
        if word not in stops:
            q1words[word] = 1
    for word in str(row['question2']).lower().split():
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    shared_words_in_q1 = [w for w in q1words.keys() if w in q2words]
    shared_words_in_q2 = [w for w in q2words.keys() if w in q1words]
    R = (len(shared_words_in_q1) + len(shared_words_in_q2))/(len(q1words) + len(q2words))
    return R


#tfidf_share_match
def get_weight(count, eps=10000, min_count=2):
    if count < min_count:
        return 0
    else:
        return 1 / float(count + eps)

eps = 5000 
words = (" ".join(train_qs)).lower().split()
counts = Counter(words)
weights = {word: get_weight(count) for word, count in counts.items()}

def tfidf_word_match_share(row):
    q1words = {}
    q2words = {}
    for word in str(row['question1']).lower().split():
        if word not in stops:
            q1words[word] = 1
    for word in str(row['question2']).lower().split():
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    
    shared_weights = [weights.get(w, 0) for w in q1words.keys() if w in q2words] + [weights.get(w, 0) for w in q2words.keys() if w in q1words]
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]
    
    R = np.sum(shared_weights) / np.sum(total_weights)
    return R


train_word_match = data_train.apply(word_match_share, axis=1, raw=True)
tfidf_train_word_match = data_train.apply(tfidf_word_match_share, axis=1, raw=True)

#Features
x_train = pd.DataFrame()
x_test = pd.DataFrame()

x_train['word_match'] = train_word_match
x_train['q1len'] = data_train['question1'].str.len()
x_train['q2len'] = data_train['question2'].str.len()
x_train['tfidf_word_match'] = tfidf_train_word_match
x_train['q1_n_words'] = data_train['question1'].apply(lambda row: len(str(row).split(" ")))
x_train['q2_n_words'] = data_train['question2'].apply(lambda row: len(str(row).split(" ")))
x_train['cosine_sim'] = train_cosine_sim['cosine_sim']
#x_train['ner_overlap'] = nlp_train_features['ner_overlap']
#x_train['lemma_overlap'] = nlp_train_features['lemma_overlap']
#x_train['ner_q1_count'] = nlp_train_features['ner_q1_count']
#x_train['ner_q2_count'] = nlp_train_features['ner_q2_count']
#x_train['qn_word_overlap'] = nlp_train_features['qn_word_overlap']
#x_train['bigram_overlap'] = nlp_train_features['bigram_overlap']
#x_train['trigram_overlap'] = nlp_train_features['trigram_overlap']
#x_train['root_match'] = nlp_train_features['root_match']
#x_train['subj_match'] = nlp_train_features['subj_match']
#x_train['dobj_match'] = nlp_train_features['dobj_match']



x_test['word_match'] = data_test.apply(word_match_share, axis=1, raw=True)
x_test['q1len'] = data_test['question1'].str.len()
x_test['q2len'] = data_test['question2'].str.len()
x_test['tfidf_word_match'] = data_test.apply(tfidf_word_match_share, axis=1, raw=True)
x_test['q1_n_words'] = data_test['question1'].apply(lambda row: len(str(row).split(" ")))
x_test['q2_n_words'] = data_test['question2'].apply(lambda row: len(str(row).split(" ")))
x_test['cosine_sim'] = test_cosine_sim['cosine_sim']


y_train = data_train['is_duplicate'].values


In [17]:
#Since we have 37% positive class in our training data, and only 17% in the test data. 
#By re-balancing the data so our training set has 17% positives

pos_train = x_train[y_train == 1]
neg_train = x_train[y_train == 0]
p = 0.165

scale = ((len(pos_train) / float(len(pos_train) + len(neg_train))) / p) - 1
while scale > 1:
    neg_train = pd.concat([neg_train, neg_train])
    scale -=1
neg_train = pd.concat([neg_train, neg_train[:int(scale * len(neg_train))]])
print(len(pos_train) / float(len(pos_train) + len(neg_train)))

x_train = pd.concat([pos_train, neg_train])
y_train = (np.zeros(len(pos_train)) + 1).tolist() + np.zeros(len(neg_train)).tolist()
del pos_train, neg_train


# Finally, we split some of the data off for validation
from sklearn.cross_validation import train_test_split
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.2, random_state=4242)


0.191243661001


In [20]:
#baseline model : training using xgboost
import xgboost as xgb

#parameters for xgboost
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['eta'] = 0.02
params['max_depth'] = 4
params['scale_pos_weight'] = 0.8

d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]

bst = xgb.train(params, d_train, 1000, watchlist, early_stopping_rounds=50)



Will train until valid error hasn't decreased in 50 rounds.
[0]	train-logloss:0.683010	valid-logloss:0.683173
[1]	train-logloss:0.673487	valid-logloss:0.673573
[2]	train-logloss:0.664290	valid-logloss:0.664346
[3]	train-logloss:0.655148	valid-logloss:0.655488
[4]	train-logloss:0.646530	valid-logloss:0.646967
[5]	train-logloss:0.638656	valid-logloss:0.638803
[6]	train-logloss:0.630781	valid-logloss:0.630959
[7]	train-logloss:0.622800	valid-logloss:0.623320
[8]	train-logloss:0.615446	valid-logloss:0.616072
[9]	train-logloss:0.608550	valid-logloss:0.609047
[10]	train-logloss:0.601502	valid-logloss:0.602258
[11]	train-logloss:0.595236	valid-logloss:0.595675
[12]	train-logloss:0.588718	valid-logloss:0.589400
[13]	train-logloss:0.582531	valid-logloss:0.583299
[14]	train-logloss:0.576500	valid-logloss:0.577419
[15]	train-logloss:0.570874	valid-logloss:0.571728
[16]	train-logloss:0.565433	valid-logloss:0.566184
[17]	train-logloss:0.560096	valid-logloss:0.560854
[18]	train-logloss:0.554824	vali

In [21]:
#predictions
d_test = xgb.DMatrix(x_test)
p_test = bst.predict(d_test)

sub = pd.DataFrame()
sub['test_id'] = data_test['test_id']
sub['is_duplicate'] = p_test
sub.to_csv('data/quora/cosine_4_xgb.csv', index=False)