In [1]:
from __future__ import division

import re
import string

import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
import tqdm
import xgboost as xgb

from nltk.corpus import stopwords # Import the stop word list

%matplotlib inline

In [2]:
# Load the training pairs, discard the identifier columns and replace missing
# questions with empty strings so the text helpers always receive a string.
train = (
    pd.read_csv('train.csv')
      .drop(['id', 'qid1', 'qid2'], axis = 1)
      .fillna('')
)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404290 entries, 0 to 404289
Data columns (total 3 columns):
question1       404290 non-null object
question2       404290 non-null object
is_duplicate    404290 non-null int64
dtypes: int64(1), object(2)
memory usage: 9.3+ MB


In [3]:
# Preview the first rows of the cleaned training frame.
train.head()

Unnamed: 0,question1,question2,is_duplicate
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
test = pd.read_csv('test.csv')
# pop() hands back the 'test_id' column and removes it from the frame in one
# step; the ids are kept aside so they can be written out separately later.
ids = test.pop('test_id')
# Missing questions become empty strings, mirroring the training frame.
test = test.fillna('')
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2345796 entries, 0 to 2345795
Data columns (total 2 columns):
question1    object
question2    object
dtypes: object(2)
memory usage: 35.8+ MB


In [5]:
# Preview the first rows of the cleaned test frame.
test.head()

Unnamed: 0,question1,question2
0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,What but is the best way to send money from Ch...,What you send money to China?
3,Which food not emulsifiers?,What foods fibre?
4,"How ""aberystwyth"" start reading?",How their can I start reading?


In [6]:
# Matches one ASCII punctuation character. A regex substitution works on both
# byte and unicode strings and on Python 2 and 3, whereas the previous
# string.maketrans table only exists (and only works on byte strings) in Python 2.
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))

# Filled on first use so stopwords.words("english") is evaluated once instead
# of once per row.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return the English stop words as a frozenset, loading them on first use."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words("english"))
    return _STOP_WORD_CACHE['english']


def _tokenize( raw_text ):
    """Lower-case raw_text, replace each punctuation character with a space and split on whitespace."""
    return _PUNCTUATION_RE.sub(' ', raw_text.lower()).split()


def text_to_word_sets( raw_text ):
    """Turn a question into two sets of distinct lower-cased words.

    Punctuation is replaced by spaces before splitting, so "I'm" yields the
    two tokens 'i' and 'm'.

    Parameters
    ----------
    raw_text : str
        The question text; must be a string (may be empty).

    Returns
    -------
    pd.Series
        Element 0 is the set of all words, element 1 is the same set with the
        English stop words removed.
    """
    words = set(_tokenize(raw_text))
    return pd.Series([words, words - _english_stop_words()])

def text_to_word_lists( raw_text ):
    """Turn a question into two lists of lower-cased words, keeping order and repeats.

    Punctuation is replaced by spaces before splitting, so "I'm" yields the
    two tokens 'i' and 'm'.

    Parameters
    ----------
    raw_text : str
        The question text; must be a string (may be empty).

    Returns
    -------
    pd.Series
        Element 0 is the list of all words, element 1 is the same list with
        the English stop words filtered out.
    """
    words = _tokenize(raw_text)
    stops = _english_stop_words()
    return pd.Series([words, [w for w in words if w not in stops]])

def generate_features_from_sets(set1, set2):
    """Compute size, overlap and word-length features for two word sets.

    Returns a pd.Series holding, in order: the size of set1, the size of
    set2, the size of their intersection, the size of their union, the
    intersection/union ratio (0 when the union is empty), and the average
    word length of set1 and of set2 (0 for an empty set).
    """
    def average_length(words):
        # max(..., 1) guards the division when the set is empty
        return sum(len(word) for word in words) / max(len(words), 1)

    size1, size2 = len(set1), len(set2)
    shared_count = len(set1 & set2)
    union_count = len(set1 | set2)
    jaccard = shared_count / max(union_count, 1)

    return pd.Series([
        size1,
        size2,
        shared_count,
        union_count,
        jaccard,
        average_length(set1),
        average_length(set2),
    ])

# Column names for the seven values produced by generate_features_from_sets,
# listed in the order that function returns them.
features_from_sets = [
    '# of words 1',
    '# of words 2',
    '# of words in intersection',
    '# of words in union',
    'jaccard similarity',
    'average word length 1',
    'average word length 2'
]

# Copy instead of aliasing: extending all_features with further feature groups
# must not silently change features_from_sets, which is also used as the list
# of target columns when the set features are assigned.
all_features = list(features_from_sets)

In [7]:
n = 5
print train["question1"][n]
print train["question2"][n], '\n'
set_no_stops1 = text_to_word_sets( train["question1"][n] )[0]
set_no_stops2 = text_to_word_sets( train["question2"][n] )[0]
list_no_stops1 = text_to_word_lists( train["question1"][n] )[0]
list_no_stops2 = text_to_word_lists( train["question2"][n] )[0]
print set_no_stops1
print list_no_stops1
print set_no_stops1
print list_no_stops2, '\n'
print generate_features_from_sets(set_no_stops1, set_no_stops2)

Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?
I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me? 

set(['a', 'and', 'what', 'that', 'i', 'sun', 'cap', 'am', 'about', 'me', 'moon', 'capricorn', 'say', 'does', 'rising', 'astrology'])
['astrology', 'i', 'am', 'a', 'capricorn', 'sun', 'cap', 'moon', 'and', 'cap', 'rising', 'what', 'does', 'that', 'say', 'about', 'me']
set(['a', 'and', 'what', 'that', 'i', 'sun', 'cap', 'am', 'about', 'me', 'moon', 'capricorn', 'say', 'does', 'rising', 'astrology'])
['i', 'm', 'a', 'triple', 'capricorn', 'sun', 'moon', 'and', 'ascendant', 'in', 'capricorn', 'what', 'does', 'this', 'say', 'about', 'me'] 

0    16.00000
1    16.00000
2    11.00000
3    21.00000
4     0.52381
5     3.93750
6     3.81250
dtype: float64


In [8]:
set1 = text_to_word_sets( train["question1"][n] )[1]
set2 = text_to_word_sets( train["question2"][n] )[1]
list1 = text_to_word_lists( train["question1"][n] )[1]
list2 = text_to_word_lists( train["question2"][n] )[1]
print set1
print list1
print set2
print list2, '\n'
print generate_features_from_sets(set1, set2)

set(['capricorn', 'sun', 'cap', 'moon', 'say', 'rising', 'astrology'])
['astrology', 'capricorn', 'sun', 'cap', 'moon', 'cap', 'rising', 'say']
set(['ascendant', 'capricorn', 'sun', 'moon', 'say', 'triple'])
['triple', 'capricorn', 'sun', 'moon', 'ascendant', 'capricorn', 'say'] 

0    7.000000
1    6.000000
2    4.000000
3    9.000000
4    0.444444
5    5.285714
6    5.666667
dtype: float64


In [9]:
# tqdm.tqdm.pandas() registers progress_apply on pandas objects; it is called
# on its own because wrapping its return value in tqdm_notebook() only created
# an extra progress bar that was never iterated or closed.
# The helpers are passed directly; a pass-through lambda adds nothing.
tqdm.tqdm.pandas(desc = 'Question 1 to word sets:')
train[['word_set1', 'word_set_no_stops1']] = train['question1'].progress_apply(text_to_word_sets)

tqdm.tqdm.pandas(desc = 'Question 2 to word sets:')
train[['word_set2', 'word_set_no_stops2']] = train['question2'].progress_apply(text_to_word_sets)

tqdm.tqdm.pandas(desc = 'Question 1 to word lists:')
train[['word_list1', 'word_list_no_stops1']] = train['question1'].progress_apply(text_to_word_lists)

tqdm.tqdm.pandas(desc = 'Question 2 to word lists:')
train[['word_list2', 'word_list_no_stops2']] = train['question2'].progress_apply(text_to_word_lists)

Question 1 to word sets::   0%|          | 230/404290 [00:00<02:55, 2297.43it/s]




Question 1 to word sets:: 100%|██████████| 404290/404290 [02:45<00:00, 2436.64it/s]
Question 2 to word sets::   0%|          | 269/404290 [00:00<02:30, 2687.55it/s]




Question 2 to word sets:: 100%|██████████| 404290/404290 [02:57<00:00, 2277.92it/s]
Question 1 to word lists::   0%|          | 131/404290 [00:00<05:09, 1307.49it/s]




Question 1 to word lists:: 100%|██████████| 404290/404290 [03:02<00:00, 2219.63it/s]
Question 2 to word lists::   0%|          | 6/404290 [00:00<2:06:27, 53.28it/s]




Question 2 to word lists:: 100%|██████████| 404290/404290 [03:00<00:00, 2238.39it/s]


In [10]:
# Persist the raw questions together with the derived word-set and word-list
# columns. The file is written without a header row and without the index, so
# the column order below is the only record of the schema.
# NOTE(review): the set/list cells are presumably written as their string
# representation -- confirm how this file is meant to be read back.
train[['question1', 'question2',
       'word_set1', 'word_set2',
       'word_set_no_stops1', 'word_set_no_stops2',
       'word_list1', 'word_list2',
       'word_list_no_stops1', 'word_list_no_stops2',]].to_csv('train_sets_lists.csv', header = False, index = False)

In [11]:
# Register progress_apply directly; wrapping the call in tqdm_notebook() only
# created an extra progress bar that was never iterated or closed.
tqdm.tqdm.pandas(desc = 'Set intersections:')
# Row-wise apply: each row's two stop-word-free sets become the seven columns
# named in features_from_sets.
train[features_from_sets] = train.progress_apply(
    lambda row : generate_features_from_sets(row['word_set_no_stops1'], row['word_set_no_stops2']),
    axis = 1)
train.head()

Set intersections::   0%|          | 0/11 [00:00<?, ?it/s]




Set intersections:: 404291it [02:07, 3160.17it/s]                 


Unnamed: 0,question1,question2,is_duplicate,word_set1,word_set_no_stops1,word_set2,word_set_no_stops2,word_list1,word_list_no_stops1,word_list2,word_list_no_stops2,# of words 1,# of words 2,# of words in intersection,# of words in union,jaccard similarity,average word length 1,average word length 2
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,"{what, invest, is, india, share, by, to, step,...","{invest, share, step, india, guide, market}","{what, invest, is, share, by, to, step, in, th...","{invest, step, share, guide, market}","[what, is, the, step, by, step, guide, to, inv...","[step, step, guide, invest, share, market, india]","[what, is, the, step, by, step, guide, to, inv...","[step, step, guide, invest, share, market]",6.0,5.0,5.0,6.0,0.833333,5.166667,5.2
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,"{koh, what, diamond, i, of, is, kohinoor, noor...","{koh, story, diamond, kohinoor, noor}","{koh, what, diamond, would, government, i, koh...","{koh, diamond, would, government, back, kohino...","[what, is, the, story, of, kohinoor, koh, i, n...","[story, kohinoor, koh, noor, diamond]","[what, would, happen, if, the, indian, governm...","[would, happen, indian, government, stole, koh...",5.0,10.0,4.0,11.0,0.363636,5.4,5.8
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,"{a, using, i, of, while, increase, how, connec...","{increase, connection, internet, using, vpn, s...","{be, increased, how, through, can, dns, intern...","{hacking, speed, increased, dns, internet}","[how, can, i, increase, the, speed, of, my, in...","[increase, speed, internet, connection, using,...","[how, can, internet, speed, be, increased, by,...","[internet, speed, increased, hacking, dns]",6.0,5.0,2.0,9.0,0.222222,6.5,6.4
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,"{lonely, how, i, am, it, very, solve, can, why...","{lonely, solve, mentally}","{24, 23, is, when, by, the, remainder, find, m...","{24, 23, divided, remainder, find, math}","[why, am, i, mentally, very, lonely, how, can,...","[mentally, lonely, solve]","[find, the, remainder, when, math, 23, 24, mat...","[find, remainder, math, 23, 24, math, divided,...",3.0,6.0,0.0,9.0,0.0,6.333333,4.666667
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,"{and, di, one, water, oxide, which, in, quikly...","{di, one, water, oxide, carbon, quikly, sugar,...","{would, fish, water, which, in, survive, salt}","{water, survive, fish, salt, would}","[which, one, dissolve, in, water, quikly, suga...","[one, dissolve, water, quikly, sugar, salt, me...","[which, fish, would, survive, in, salt, water]","[fish, would, survive, salt, water]",10.0,5.0,2.0,13.0,0.153846,5.1,5.0


In [12]:
# Write the labels and the feature columns to separate CSV files; both are
# saved without header row and without index.
train[['is_duplicate']].to_csv('target.csv', header = False, index = False)
train[all_features].to_csv('00 train features.csv', header = False, index = False)

# Build the XGBoost training matrix from the same feature columns, labelled
# with is_duplicate, and store it in XGBoost's binary format.
dtrain = xgb.DMatrix(train[all_features], label = train['is_duplicate'])
dtrain.save_binary('00 dtrain features')

# Now process test data

In [None]:
# Register progress_apply directly; wrapping the call in tqdm_notebook() only
# created an extra progress bar that was never iterated or closed.
tqdm.tqdm.pandas(desc = 'Question 1 to word sets:')
test[['word_set1', 'word_set_no_stops1']] = test['question1'].progress_apply(text_to_word_sets)

Question 1 to word sets::   0%|          | 0/2345796 [00:00<?, ?it/s]




Question 1 to word sets::  96%|█████████▌| 2244074/2345796 [21:40<07:48, 217.14it/s]             

In [None]:
# Register progress_apply directly; wrapping the call in tqdm_notebook() only
# created an extra progress bar that was never iterated or closed.
tqdm.tqdm.pandas(desc = 'Question 2 to word sets:')
test[['word_set2', 'word_set_no_stops2']] = test['question2'].progress_apply(text_to_word_sets)

In [None]:
# Register progress_apply directly; wrapping the call in tqdm_notebook() only
# created an extra progress bar that was never iterated or closed.
tqdm.tqdm.pandas(desc = 'Question 1 to word lists:')
test[['word_list1', 'word_list_no_stops1']] = test['question1'].progress_apply(text_to_word_lists)

In [None]:
# Register progress_apply directly; wrapping the call in tqdm_notebook() only
# created an extra progress bar that was never iterated or closed.
tqdm.tqdm.pandas(desc = 'Question 2 to word lists:')
test[['word_list2', 'word_list_no_stops2']] = test['question2'].progress_apply(text_to_word_lists)

Question 1 to word sets::   0%|          | 0/2345796 [00:00<?, ?it/s]




Question 1 to word sets::  39%|███▉      | 914058/2345796 [05:53<08:00, 2980.80it/s] 

In [None]:
# Persist the raw test questions plus the word-set columns derived from
# question 1, without header row and without index.
# NOTE(review): unlike the training export, the question-2 sets and all
# word-list columns are commented out here -- confirm whether that is
# intentional or left over from a partial run.
test[['question1', 'question2',
      'word_set1', #'word_set2',
      'word_set_no_stops1',# 'word_set_no_stops2',
#       'word_list1', 'word_list2',
#       'word_list_no_stops1', 'word_list_no_stops2'
     ]].to_csv('test_sets_lists.csv', header = False, index = False)

In [None]:
# Register progress_apply directly; wrapping the call in tqdm_notebook() only
# created an extra progress bar that was never iterated or closed.
tqdm.tqdm.pandas(desc = 'Set intersections:')
# Row-wise apply: each row's two stop-word-free sets become the seven columns
# named in features_from_sets.
test[features_from_sets] = test.progress_apply(
    lambda row : generate_features_from_sets(row['word_set_no_stops1'], row['word_set_no_stops2']),
    axis = 1)
test.head()

In [13]:
# Write the test-set feature columns and the test ids to CSV (no header row,
# no index), then build and save the XGBoost matrix for the test features.
# NOTE(review): the *test* features go to '00 train.csv' while the training
# features were saved as '00 train features.csv' -- this looks like a
# copy-paste slip; confirm the intended file name with downstream consumers.
test[all_features].to_csv('00 train.csv', header = False, index = False)
ids.to_csv('ids.csv', header = False, index = False)
# No label is passed: this matrix carries features only.
dtest = xgb.DMatrix(test[all_features])
dtest.save_binary('01 test')

In [45]:
# Flag test pairs whose two questions are identical once case is ignored.
true_duplicates = test['question1'].str.lower().eq(test['question2'].str.lower())
true_duplicates.to_csv('true_duplicates', header = False, index = False)