In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import Parallel, delayed

%matplotlib inline
from keras.utils import np_utils

train = pd.read_csv('../train.csv', sep=',', header=0)
#test = pd.read_csv('../test.csv', sep=',', header=0)

train = train.fillna(" ")
#test = test.fillna(" ")
print(train.shape)

import nltk
nltk.download('stopwords')
from nltk import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer 
from nltk.tokenize import RegexpTokenizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import log_loss

train.head(2)

Using TensorFlow backend.


(404290, 6)
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/timomoeller/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0


In [2]:
nltk.download('averaged_perceptron_tagger')
import difflib
from collections import Counter

def diff_ratios(st1, st2):
    seq = difflib.SequenceMatcher()
    seq.set_seqs(str(st1).lower(), str(st2).lower())
    return seq.ratio()

def noun_extracter(sentence):
    tokens = nltk.word_tokenize(sentence.lower())
    return np.array([w for w, t in nltk.pos_tag(tokens) if t[:1] in ['N']])
    
def word_match_share(q1,q2):
    q1 = q1.split()
    q2 = q2.split()
    if(len(q1) == 0 or len(q2) == 0):
        return 0
    else:
        both = set(q1).intersection(set(q2))
        return len(both) * 2 / (len(q1) + len(q2))
    
def removePunktAndStopwords(phrases):
    toInclude = set(['above','below','between','but','couldn','didn','doesn','down',
                     'few','hadn','haven','isn','just','mightn','mustn','needn','nor',
                     'not','off','once','only','out','over','should','shouldn','some',
                     'very','wasn','weren','won','wouldn','again','against','all',
                     'any','aren'])
    stop_wordsLarge = set(stopwords.words('english'))
    stop_words = set([w for w in stop_wordsLarge if w not in toInclude])
    tokenizer = RegexpTokenizer('[a-z]\w+')
    docs_toke = [tokenizer.tokenize(doc.lower()) for doc in phrases]
    stopwordsremoved = [[t for t in temp if t not in stop_words] for temp in docs_toke]
    return np.asarray([" ".join(w) for w in stopwordsremoved])

class LemmaTokenizer(object):
    def __init__(self):
        self.snbstem = SnowballStemmer('english')
        self.stop_words = set(stopwords.words('english'))
        self.stop_words.remove('but')
        self.stop_words.remove('not') # left outer join would be nice...
        self.tokenizer = RegexpTokenizer('[a-z]\w+')
    def lmtokenize(self,doc):
        temp = [self.snbstem.stem(t) for t in self.tokenizer.tokenize(doc.lower()) if t not in self.stop_words]
        return " ".join(temp)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/timomoeller/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
def getFeatures(df):
    questions1 = df["question1"].values
    questions2 = df["question2"].values

    stopWRe_q1 = removePunktAndStopwords(questions1)
    stopWRe_q2 = removePunktAndStopwords(questions2)

    df['question1_nouns'] = pd.Series(Parallel(n_jobs=7)(delayed(noun_extracter)(x) for x in questions1))
    df['question2_nouns'] = pd.Series(Parallel(n_jobs=7)(delayed(noun_extracter)(x) for x in questions2))
    df['z_len1'] = df.question1.map(lambda x: len(str(x)))
    df['z_len2'] = df.question2.map(lambda x: len(str(x)))
    df['z_word_len1'] = df.question1.map(lambda x: len(str(x).split()))
    df['z_word_len2'] = df.question2.map(lambda x: len(str(x).split()))
    df['z_noun_match'] = df.apply(lambda r: 
                                        sum([1 for w in r.question1_nouns if w in r.question2_nouns]), axis=1)
    df['z_match_ratio'] = pd.Series(Parallel(n_jobs=7)
                                       (delayed(diff_ratios)(x,y) for x,y in zip(stopWRe_q1,stopWRe_q2)))
    df['z_word_match'] = pd.Series(Parallel(n_jobs=7)
                                      (delayed(word_match_share)(x,y) for x,y in zip(questions1,questions2)))
    
    return df
    

train = getFeatures(train)
import pickle
pickle.dump(train,open( "trainFeatures.p", "wb" ))
#test = getFeatures(test)
col = [c for c in train.columns if c[:1]=='z']



In [None]:
preprocessor = LemmaTokenizer()
stemmed_q1 = Parallel(n_jobs=6)(delayed(preprocessor.lmtokenize)(sentence) for sentence in stopWRe_q1)
stemmed_q2 = Parallel(n_jobs=6)(delayed(preprocessor.lmtokenize)(sentence) for sentence in stopWRe_q2)

length1 = np.array([len(x.split()) for x in stopWRe_q1])
length2 = np.array([len(x.split()) for x in stopWRe_q2])

vect = CountVectorizer(max_df=0.6,min_df=4)
vect.fit(np.concatenate((stemmed_q1,stemmed_q2)))
bow_q1 = vect.transform(stemmed_q1)
bow_q2 = vect.transform(stemmed_q2)
vocab_q1 = np.asarray(["q1_" + x for x in vect.get_feature_names()])
vocab_q2 = np.asarray(["q2_" + x for x in vect.get_feature_names()])

In [5]:
features = sp.sparse.hstack([bow_q1,bow_q2,train[col].values])
labels = train["is_duplicate"]
feature_names = np.concatenate([vocab_q1,vocab_q2,np.array(col)])
print(features.shape)
print("done preprocessing")

(404290, 46186)
done preprocessing


In [28]:
# scaling of neg and pos examples (ratio as in test set) EVEN reduces logloss on training data ~6 percentpoints
pos_train = train[train['is_duplicate'] == 1]
neg_train = train[train['is_duplicate'] == 0]
p = 0.165 # kaggle test set ratio 
scale = ((len(pos_train) / (len(pos_train) + len(neg_train))) / p) - 1

while scale > 1:
    neg_train = pd.concat([neg_train, neg_train])
    scale -=1
neg_train = pd.concat([neg_train, neg_train[:int(scale * len(neg_train))]])
train2 = pd.concat([pos_train, neg_train])
print(train2.shape)
print(train.shape)

(780486, 15)
(404290, 15)


In [82]:
# xgb api with additional methods and params 

X_train, X_test, y_train, y_test = train_test_split(train2[col].values, train2["is_duplicate"].values, test_size=0.2, random_state=42)

# num_testing = 100000
# X_train = train[col].values[:-num_testing]
# X_test = train[col].values[-num_testing:]
# y_train = train["is_duplicate"].values[:-num_testing]
# y_test = train["is_duplicate"].values[-num_testing:]
# # classification

Dtrain = xgb.DMatrix(X_train,y_train,feature_names=col)
Dtest = xgb.DMatrix(X_test,y_test,feature_names=col)

params = {}
params["objective"] = "binary:logistic"
params['eval_metric'] = 'logloss'
params["max_depth"] = 6

watchlist = [(Dtest,"valid")]

bst = xgb.train(params, Dtrain, 200, watchlist, early_stopping_rounds=20, verbose_eval=10)

[0]	valid-logloss:0.571953
Will train until valid-logloss hasn't improved in 20 rounds.
[10]	valid-logloss:0.393677
[20]	valid-logloss:0.386118
[30]	valid-logloss:0.38215
[40]	valid-logloss:0.380261
[50]	valid-logloss:0.378966
[60]	valid-logloss:0.377763
[70]	valid-logloss:0.377107
[80]	valid-logloss:0.376484
[90]	valid-logloss:0.375726
[100]	valid-logloss:0.3752
[110]	valid-logloss:0.374837
[120]	valid-logloss:0.374335
[130]	valid-logloss:0.373795
[140]	valid-logloss:0.373538
[150]	valid-logloss:0.373186
[160]	valid-logloss:0.372921
[170]	valid-logloss:0.3727
[180]	valid-logloss:0.372453
[190]	valid-logloss:0.372327


In [78]:
preds = bst.predict(Dtest)
errors = np.abs(preds-y_test)
idx = np.argsort(errors)[::-1] + num_testing

for i in range(20):
    print(train.iloc[idx[i],3:5].values)
    print("error: %.3f with preds %.3f and truth %i" %(errors[idx[i]-num_testing],preds[idx[i]-num_testing],y_test[idx[i]-num_testing]))


[ 'Which is the best post graduate medical college to do MS in Orthopaedics?'
 'Which is the best private post graduate medical college for MS in orthopaedics?']
error: 1.000 with preds 0.000 and truth 1
['What are some of the best camping tools?'
 'What are some of the best camping blogs?']
error: 0.999 with preds 0.001 and truth 1
['What is the use of having flavors in the condoms?'
 'What is the use of flavoured condoms?']
error: 0.999 with preds 0.001 and truth 1
['How do I add add text to a IPython/Jupyter Notebook?'
 'How do I install IPython Notebook and Sublime Text on the same PC?']
error: 0.999 with preds 0.001 and truth 1
["How did Adam D'Angelo get so much learned people to sign up for Quora?"
 'How did Quora get initial traction?']
error: 0.998 with preds 0.002 and truth 1
['How do I recover my Gmail email addresses?'
 'How do I recover deleted emails in my gmail account?']
error: 0.998 with preds 0.002 and truth 1
['How you ever been raped?'
 'Have you ever been subjected

In [81]:
train.iloc[idx[0]]
#y_test[idx[0]-num_testing]

id                                                            133367
qid1                                                          213367
qid2                                                          213368
question1          Which is the best post graduate medical colleg...
question2          Which is the best private post graduate medica...
is_duplicate                                                       0
question1_nouns          [post, graduate, college, ms, orthopaedics]
question2_nouns          [post, graduate, college, ms, orthopaedics]
z_len1                                                            73
z_len2                                                            79
z_word_len1                                                       13
z_word_len2                                                       13
z_noun_match                                                       5
z_match_ratio                                               0.925926
z_word_match                      

In [42]:
zip(y_test.values,preds)


TypeError: 'zip' object is not subscriptable

In [20]:
plt.rcParams['figure.figsize'] = (10.0, 60.0)
xgb.plot_importance(bst); plt.show()