In [2]:
import argparse
import functools
from collections import defaultdict
import pickle

import numpy as np
import pandas as pd
import xgboost as xgb

from nltk.corpus import stopwords
from collections import Counter
from sklearn.metrics import log_loss
from sklearn.cross_validation import train_test_split

from xgboost import XGBClassifier

import re


def word_match_share(row, stops=None):
    q1words = {}
    q2words = {}
    for word in row['question1']:
        if word not in stops:
            q1words[word] = 1
    for word in row['question2']:
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    shared_words_in_q1 = [w for w in q1words.keys() if w in q2words]
    shared_words_in_q2 = [w for w in q2words.keys() if w in q1words]
    R = (len(shared_words_in_q1) + len(shared_words_in_q2))/(len(q1words) + len(q2words))
    return R

def jaccard(row):
    wic = set(row['question1']).intersection(set(row['question2']))
    uw = set(row['question1']).union(row['question2'])
    if len(uw) == 0:
        uw = [1]
    return (len(wic) / len(uw))

def common_words(row):
    return len(set(row['question1']).intersection(set(row['question2'])))

def total_unique_words(row):
    return len(set(row['question1']).union(row['question2']))

def total_unq_words_stop(row, stops):
    return len([x for x in set(row['question1']).union(row['question2']) if x not in stops])

def wc_diff(row):
    return abs(len(row['question1']) - len(row['question2']))

def wc_ratio(row):
    l1 = len(row['question1'])*1.0 
    l2 = len(row['question2'])
    if l2 == 0:
        return np.nan
    if l1 / l2:
        return l2 / l1
    else:
        return l1 / l2

def wc_diff_unique(row):
    return abs(len(set(row['question1'])) - len(set(row['question2'])))

def wc_ratio_unique(row):
    l1 = len(set(row['question1'])) * 1.0
    l2 = len(set(row['question2']))
    if l2 == 0:
        return np.nan
    if l1 / l2:
        return l2 / l1
    else:
        return l1 / l2

def wc_diff_unique_stop(row, stops=None):
    return abs(len([x for x in set(row['question1']) if x not in stops]) - len([x for x in set(row['question2']) if x not in stops]))

def wc_ratio_unique_stop(row, stops=None):
    l1 = len([x for x in set(row['question1']) if x not in stops])*1.0 
    l2 = len([x for x in set(row['question2']) if x not in stops])
    if l2 == 0:
        return np.nan
    if l1 / l2:
        return l2 / l1
    else:
        return l1 / l2

def same_start_word(row):
    if not row['question1'] or not row['question2']:
        return np.nan
    return int(row['question1'][0] == row['question2'][0])

def char_diff(row):
    return abs(len(''.join(row['question1'])) - len(''.join(row['question2'])))

def char_ratio(row):
    l1 = len(''.join(row['question1'])) 
    l2 = len(''.join(row['question2']))
    if l2 == 0:
        return np.nan
    if l1 / l2:
        return l2 / l1
    else:
        return l1 / l2

def char_diff_unique_stop(row, stops=None):
    return abs(len(''.join([x for x in set(row['question1']) if x not in stops])) - len(''.join([x for x in set(row['question2']) if x not in stops])))


def get_weight(count, eps=10000, min_count=2):
    if count < min_count:
        return 0
    else:
        return 1 / (count + eps)
    
def tfidf_word_match_share_stops(row, stops=None, weights=None):
    q1words = {}
    q2words = {}
    for word in row['question1']:
        if word not in stops:
            q1words[word] = 1
    for word in row['question2']:
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    
    shared_weights = [weights.get(w, 0) for w in q1words.keys() if w in q2words] + [weights.get(w, 0) for w in q2words.keys() if w in q1words]
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]
    
    R = np.sum(shared_weights) / np.sum(total_weights)
    return R

def tfidf_word_match_share(row, weights=None):
    q1words = {}
    q2words = {}
    for word in row['question1']:
        q1words[word] = 1
    for word in row['question2']:
        q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    
    shared_weights = [weights.get(w, 0) for w in q1words.keys() if w in q2words] + [weights.get(w, 0) for w in q2words.keys() if w in q1words]
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]
    
    R = np.sum(shared_weights) / np.sum(total_weights)
    return R


def build_features(data, stops, weights):
    X = pd.DataFrame()
    f = functools.partial(word_match_share, stops=stops)
    X['word_match'] = data.apply(f, axis=1, raw=True) #1

    f = functools.partial(tfidf_word_match_share, weights=weights)
    X['tfidf_wm'] = data.apply(f, axis=1, raw=True) #2

    f = functools.partial(tfidf_word_match_share_stops, stops=stops, weights=weights)
    X['tfidf_wm_stops'] = data.apply(f, axis=1, raw=True) #3

    X['jaccard'] = data.apply(jaccard, axis=1, raw=True) #4
    X['wc_diff'] = data.apply(wc_diff, axis=1, raw=True) #5
    X['wc_ratio'] = data.apply(wc_ratio, axis=1, raw=True) #6
    X['wc_diff_unique'] = data.apply(wc_diff_unique, axis=1, raw=True) #7
    X['wc_ratio_unique'] = data.apply(wc_ratio_unique, axis=1, raw=True) #8

    f = functools.partial(wc_diff_unique_stop, stops=stops)    
    X['wc_diff_unq_stop'] = data.apply(f, axis=1, raw=True) #9
    f = functools.partial(wc_ratio_unique_stop, stops=stops)    
    X['wc_ratio_unique_stop'] = data.apply(f, axis=1, raw=True) #10

    X['same_start'] = data.apply(same_start_word, axis=1, raw=True) #11
    X['char_diff'] = data.apply(char_diff, axis=1, raw=True) #12

    f = functools.partial(char_diff_unique_stop, stops=stops) 
    X['char_diff_unq_stop'] = data.apply(f, axis=1, raw=True) #13

#     X['common_words'] = data.apply(common_words, axis=1, raw=True)  #14
    X['total_unique_words'] = data.apply(total_unique_words, axis=1, raw=True)  #15

    f = functools.partial(total_unq_words_stop, stops=stops)
    X['total_unq_words_stop'] = data.apply(f, axis=1, raw=True)  #16
    
    X['char_ratio'] = data.apply(char_ratio, axis=1, raw=True) #17    

    return X

def clean_text(text):
    #substitute thousands
    matches = re.finditer(r'[0-9]+(?P<thousands>\s{0,2}k\b)', text, flags=re.I)
    result = ''
    len_offset = 0
    for match in matches:
        result += '{}000'.format(text[len(result)-len_offset:match.start('thousands')])
        len_offset += 3 - (match.end('thousands') - match.start('thousands'))
    result += text[len(result)-len_offset:]
    text = result
    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e-mail", "email", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"quikly", "quickly", text)
    text = re.sub(r" usa ", " America ", text)
    text = re.sub(r" USA ", " America ", text)
    text = re.sub(r" u s ", " America ", text)
    text = re.sub(r" uk ", " England ", text)
    text = re.sub(r" UK ", " England ", text)
    text = re.sub(r"china", "China", text)
    text = re.sub(r"chinese", "Chinese", text) 
    text = re.sub(r"imrovement", "improvement", text)
    text = re.sub(r"intially", "initially", text)
    text = re.sub(r" dms ", "direct messages ", text)  
    text = re.sub(r"demonitization", "demonetization", text) 
    text = re.sub(r"actived", "active", text)
    text = re.sub(r"kms", " kilometers ", text)
    text = re.sub(r"KMs", " kilometers ", text)
    text = re.sub(r" cs ", " computer science ", text) 
    text = re.sub(r" upvotes ", " up votes ", text)
    text = re.sub(r" iPhone ", " phone ", text)
    text = re.sub(r"\0rs ", " rs ", text) 
    text = re.sub(r"calender", "calendar", text)
    text = re.sub(r"ios", "operating system", text)
    text = re.sub(r"programing", "programming", text)
    text = re.sub(r"bestfriend", "best friend", text)
    text = re.sub(r"III", "3", text) 
    text = re.sub(r"the US", "America", text)
    text = re.sub(r" J K ", " JK ", text)
    return text.lower()

abbr_dict={
    "what's":"what is",
    "what're":"what are",
    "who's":"who is",
    "who're":"who are",
    "where's":"where is",
    "where're":"where are",
    "when's":"when is",
    "when're":"when are",
    "how's":"how is",
    "how're":"how are",

    "i'm":"i am",
    "we're":"we are",
    "you're":"you are",
    "they're":"they are",
    "it's":"it is",
    "he's":"he is",
    "she's":"she is",
    "that's":"that is",
    "there's":"there is",
    "there're":"there are",

    "i've":"i have",
    "we've":"we have",
    "you've":"you have",
    "they've":"they have",
    "who've":"who have",
    "would've":"would have",
    "not've":"not have",

    "i'll":"i will",
    "we'll":"we will",
    "you'll":"you will",
    "he'll":"he will",
    "she'll":"she will",
    "it'll":"it will",
    "they'll":"they will",

    "isn't":"is not",
    "wasn't":"was not",
    "aren't":"are not",
    "weren't":"were not",
    "can't":"can not",
    "couldn't":"could not",
    "don't":"do not",
    "didn't":"did not",
    "shouldn't":"should not",
    "wouldn't":"would not",
    "doesn't":"does not",
    "haven't":"have not",
    "hasn't":"has not",
    "hadn't":"had not",
    "won't":"will not",
    '["\'?,\.]':'',
    '\s+':' '}



In [None]:
df_train = pd.read_csv('../data/train_features.csv', encoding="ISO-8859-1") # abhishek
X_train_ab = df_train.iloc[:, 2:-1]
X_train_ab = X_train_ab.drop('euclidean_distance', axis=1)
X_train_ab = X_train_ab.drop('jaccard_distance', axis=1)

df_train = pd.read_csv('../train.csv')
df_train = df_train.fillna(' ')
df_test = pd.read_csv('../test.csv')
df_test = df_test.fillna(' ')

# remove class noise
df_train = pd.read_csv('../train.csv')
df_train = df_train.fillna(' ')
df_train.replace(abbr_dict,regex=True,inplace=True)
df_train.loc[:,'question1'] = df_train.question1.map(lambda x: x.lower())
df_train.loc[:,'question2'] = df_train.question2.map(lambda x: x.lower())

idx1 = df_train.is_duplicate == 0
idx2 = df_train.question1 == df_train.question2
idx3 = (idx1*1 + idx2*1) == 2
print("%i is duplicate rows switched" %np.sum(idx3))
idxint = np.nonzero(idx3)[0]
df_train.iloc[idxint,-1] = 1



# clean text
df_train.loc[:,'question1'] = df_train.question1.map(lambda x: clean_text(x))
df_train.loc[:,'question2'] = df_train.question2.map(lambda x: clean_text(x))


df_test.replace(abbr_dict,regex=True,inplace=True)
df_test.loc[:,'question1'] = df_test.question1.map(lambda x: clean_text(x))
df_test.loc[:,'question2'] = df_test.question2.map(lambda x: clean_text(x))




ques = pd.concat([df_train[['question1', 'question2']], \
    df_test[['question1', 'question2']]], axis=0).reset_index(drop='index')
q_dict = defaultdict(set)
for i in range(ques.shape[0]):
        q_dict[ques.question1[i]].add(ques.question2[i])
        q_dict[ques.question2[i]].add(ques.question1[i])

def q1_freq(row):
    return(len(q_dict[row['question1']]))

def q2_freq(row):
    return(len(q_dict[row['question2']]))

def q1_q2_intersect(row):
    return(len(set(q_dict[row['question1']]).intersection(set(q_dict[row['question2']]))))

df_train['q1_q2_intersect'] = df_train.apply(q1_q2_intersect, axis=1, raw=True)
df_train['q1_freq'] = df_train.apply(q1_freq, axis=1, raw=True)
df_train['q2_freq'] = df_train.apply(q2_freq, axis=1, raw=True)

df_test['q1_q2_intersect'] = df_test.apply(q1_q2_intersect, axis=1, raw=True)
df_test['q1_freq'] = df_test.apply(q1_freq, axis=1, raw=True)
df_test['q2_freq'] = df_test.apply(q2_freq, axis=1, raw=True)

test_leaky = df_test.loc[:, ['q1_q2_intersect','q1_freq','q2_freq']]
#del df_test

train_leaky = df_train.loc[:, ['q1_q2_intersect','q1_freq','q2_freq']]

# explore
stops = set(stopwords.words("english"))

df_train['question1'] = df_train['question1'].map(lambda x: str(x).lower().split())
df_train['question2'] = df_train['question2'].map(lambda x: str(x).lower().split())

train_qs = pd.Series(df_train['question1'].tolist() + df_train['question2'].tolist())

words = [x for y in train_qs for x in y]
counts = Counter(words)
weights = {word: get_weight(count) for word, count in counts.items()}

print('Building Features')
X_train = build_features(df_train, stops, weights)
X_train = pd.concat((X_train, X_train_ab, train_leaky), axis=1)
y_train = df_train['is_duplicate'].values

In [6]:
pickle.dump([X_train,y_train],open("../data/XandytrainForum158cleaned.p","wb"))
print("dumped")

dumped


In [3]:
temp = pickle.load(open("../data/XandytrainForum158.p","rb"))
X_train= temp[0]
y_train = temp[1]

In [4]:
#UPDownSampling
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1, random_state=4242)
pos_train = X_train[y_train == 1]
neg_train = X_train[y_train == 0]
X_train = pd.concat((neg_train, pos_train.iloc[:int(0.8*len(pos_train))], neg_train))
y_train = np.array([0] * neg_train.shape[0] + [1] * pos_train.iloc[:int(0.8*len(pos_train))].shape[0] + [0] * neg_train.shape[0])
print(np.mean(y_train))
del pos_train, neg_train

pos_valid = X_valid[y_valid == 1]
neg_valid = X_valid[y_valid == 0]
X_valid = pd.concat((neg_valid, pos_valid.iloc[:int(0.8 * len(pos_valid))], neg_valid))
y_valid = np.array([0] * neg_valid.shape[0] + [1] * pos_valid.iloc[:int(0.8 * len(pos_valid))].shape[0] + [0] * neg_valid.shape[0])
print(np.mean(y_valid))
del pos_valid, neg_valid

0.189780815727
0.189272444346


In [4]:
#UPSampling
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1, random_state=4242)
pos_train = X_train[y_train == 1]
neg_train = X_train[y_train == 0]
X_train = pd.concat((neg_train, pos_train, neg_train))
y_train = np.array([0] * neg_train.shape[0] + [1] * pos_train.shape[0] + [0] * neg_train.shape[0])
print(np.mean(y_train))
del pos_train, neg_train

pos_valid = X_valid[y_valid == 1]
neg_valid = X_valid[y_valid == 0]
X_valid = pd.concat((neg_valid, pos_valid, neg_valid))
y_valid = np.array([0] * neg_valid.shape[0] + [1] * pos_valid.shape[0] + [0] * neg_valid.shape[0])
print(np.mean(y_valid))
del pos_valid, neg_valid

0.226481905009
0.22590133115


In [5]:
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['eta'] = 0.02
params['max_depth'] = 7
params['subsample'] = 0.6
params['base_score'] = 0.19
params['nthread'] = 8

d_train = xgb.DMatrix(X_train, label=y_train)
d_valid = xgb.DMatrix(X_valid, label=y_valid)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]

bst = xgb.train(params, d_train, 2501, watchlist, early_stopping_rounds=50, verbose_eval=50)
print(log_loss(y_valid, bst.predict(d_valid)))
pickle.dump(bst,open("../data/XandytrainForum158cleanedBST.p","wb"))
print("dumped")

# no text cleaning, no class noise removal, updownsampling, eta 0.02:
# [2198]	train-logloss:0.142188	valid-logloss:0.182794

# text cleaning, class noise removed, updownsampling, eta 0.02:
#[2450]	train-logloss:0.142341	valid-logloss:0.186482


# text cleaning, class noise removed, only upsampling, eta 0.02:

#temptest scale pos weight 0.19:
#[500]	train-logloss:0.251567	valid-logloss:0.258253
#without scale pos weight
#[500]	train-logloss:0.185092	valid-logloss:0.194429

[0]	train-logloss:0.526197	valid-logloss:0.525457
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[50]	train-logloss:0.300284	valid-logloss:0.301808
[100]	train-logloss:0.249291	valid-logloss:0.252032
[150]	train-logloss:0.231175	valid-logloss:0.234622
[200]	train-logloss:0.222838	valid-logloss:0.22698
[250]	train-logloss:0.217739	valid-logloss:0.222674
[300]	train-logloss:0.21338	valid-logloss:0.21913
[350]	train-logloss:0.20973	valid-logloss:0.216531
[400]	train-logloss:0.206702	valid-logloss:0.214497
[450]	train-logloss:0.204056	valid-logloss:0.212929
[500]	train-logloss:0.201904	valid-logloss:0.211776
[550]	train-logloss:0.199906	valid-logloss:0.210805
[600]	train-logloss:0.198135	valid-logloss:0.209984
[650]	train-logloss:0.196319	valid-logloss:0.20923
[700]	train-logloss:0.19468	valid-logloss:0.208609
[750]	train-logloss:0.193117	valid-logloss:0.208096
[800]	train-logloss:0.1916

In [None]:
print('Building Test Features')
df_abhifeatures = pd.read_csv('../data/test_features.csv', encoding="ISO-8859-1") # already loaded and cleaned
x_test_ab = df_abhifeatures.iloc[:, 2:-1]
x_test_ab = x_test_ab.drop('euclidean_distance', axis=1)
x_test_ab = x_test_ab.drop('jaccard_distance', axis=1)

#df_test = pd.read_csv('../test.csv')
#df_test = df_test.fillna(' ')

df_test['question1'] = df_test['question1'].map(lambda x: str(x).split())
df_test['question2'] = df_test['question2'].map(lambda x: str(x).split())

x_test = build_features(df_test, stops, weights)
x_test = pd.concat((x_test, x_test_ab, test_leaky), axis=1)


pickle.dump(x_test,open("../data/XtestForum158cleaned.p","wb"))

In [15]:
import pickle
import pandas as pd
import xgboost as xgb

# df1 = pickle.load(open("../data/XandytrainForum158.p","rb"))
# df2 = pickle.load(open("../data/xandy158untouched.p","rb")) # they are the same except 2 freq vals, be cause of fillna(" ")

#x_test = pickle.load(open("../data/XtestForum158.p","rb"))
#bst = pickle.load(open("../data/XandytrainForum158UpdownBST.p","rb"))
#bst = pickle.load(open("../data/bstuntouched.p","rb"))



In [6]:
def convertPredictions(preds):
    meanTrain = 0.19  #no sampling: 0.37 || only upsampling: 0.226 || upDownsample 0.19
    a = 0.165 / meanTrain
    b = (1 - 0.165) / (1 - meanTrain)

    return (preds * a)  / (preds * a + (1 - preds) * b)


d_test = xgb.DMatrix(x_test)
p_test = bst.predict(d_test)

p_test_conv = convertPredictions(p_test)

df_test = pd.read_csv('../test.csv')
df_test["is_duplicate"] = pd.Series(p_test_conv,index=df_test.index)

header = ["test_id", "is_duplicate"]
df_test.to_csv('../submissions/predictions.csv', columns = header,index=False)

In [7]:
p1 = pd.read_csv("../submissions/predictions.csv")
p2 = pd.read_csv("../submissions/predictionsold.csv")

In [10]:
import numpy as np
print(np.mean(p1.is_duplicate))
print(np.mean(p2.is_duplicate))

0.0829372008725
0.0794072414555
