# Just like the old saying:
### *All happy families resemble each other, while unhappy ones each have their own problems.*
# So we can assume:
### Duplicated questions resemble each other, while unduplicated questions have their own differences.

Suppose  if A=B and B=C, we assume that A=C

For example:  if we have this kind of data, and they are duplicated

    [qid1,qid2]
    [1,2]
    [1,3]
    [11,2]
    [12,2]

so we can get [1,2,3,11,12] are duplicated questions, 
so we can generate more positive training data:

    [1,2]
    [1,3]
    [1,11]
    [1,12]
    [2,3]
    [2,11]
    [2,12]
    [3,11]
    [3,12]
    [11,12]

Let's do it

In [2]:
import numpy as np
import pandas as pd
from IPython.display import  display
from collections import defaultdict
from itertools import combinations
pd.set_option('display.max_colwidth',-1)

In [3]:
train_df=pd.read_csv('data/train.csv')
train_df.head(2)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in share market in india?,What is the step by step guide to invest in share market?,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Diamond?,What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?,0


In [31]:
# only duplicated questions
ddf=train_df[train_df.is_duplicate==1]
print('Duplicated questions shape:',ddf.shape)
ddf.head(2)

Duplicated questions shape: (149263, 6)


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?,"I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?",1
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1


In [32]:
nddf=train_df[train_df.is_duplicate==0]
print(len(nddf))
nddf.head(2)

255027


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in share market in india?,What is the step by step guide to invest in share market?,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Diamond?,What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?,0


There are 149263 pairs duplicated questions

In [5]:
# get all duplicated questions
clean_ddf1=ddf[['qid1','question1']].drop_duplicates()
clean_ddf1.columns=['qid','question']
clean_ddf2=ddf[['qid2','question2']].drop_duplicates()
clean_ddf2.columns=['qid','question']
all_dqdf=clean_ddf1.append(clean_ddf2,ignore_index=True)
print(all_dqdf.shape)
all_dqdf.head(2)

(172286, 2)


Unnamed: 0,qid,question
0,11,Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?
1,15,How can I be a good geologist?


There are 172286 questions in the above df

In [6]:
# groupby qid1, and then we get all the combinations of id in each group
dqids12=ddf[['qid1','qid2']]
df12list=dqids12.groupby('qid1', as_index=False)['qid2'].agg({'dlist':(lambda x: list(x))})
print(len(df12list))
d12list=df12list.values
d12list=[[i]+j for i,j in d12list]
# get all the combinations of id, like (id1,id2)...
d12ids=set()
for ids in d12list:
    ids_len=len(ids)
    for i in range(ids_len):
        for j in range(i+1,ids_len):
            d12ids.add((ids[i],ids[j]))
print(len(d12ids))

86197


230884


In [8]:
# the same operation of qid2
dqids21=ddf[['qid2','qid1']]
display(dqids21.head(2))
df21list=dqids21.groupby('qid2', as_index=False)['qid1'].agg({'dlist':(lambda x: list(x))})
print(len(df21list))
ids2=df21list.qid2.values
d21list=df21list.values
d21list=[[i]+j for i,j in d21list]
d21ids=set()
for ids in d21list:
    ids_len=len(ids)
    for i in range(ids_len):
        for j in range(i+1,ids_len):
            d21ids.add((ids[i],ids[j]))
len(d21ids)

Unnamed: 0,qid2,qid1
5,12,11
7,16,15


86089


230768

In [8]:
# merge two set
dids=list(d12ids | d21ids)
len(dids)

363294

In [10]:
# let's define union-find function
def indices_dict(lis):
    d = defaultdict(list)
    for i,(a,b) in enumerate(lis):
        d[a].append(i)
        d[b].append(i)
    return d

def disjoint_indices(lis):
    d = indices_dict(lis)
    sets = []
    while len(d):
        que = set(d.popitem()[1])
        ind = set()
        while len(que):
            ind |= que 
            que = set([y for i in que 
                         for x in lis[i] 
                         for y in d.pop(x, [])]) - ind
        sets += [ind]
    return sets

def disjoint_sets(lis):
    return [set([x for i in s for x in lis[i]]) for s in disjoint_indices(lis)]

In [11]:
# split data into groups, so that each question in each group are duplicated
did_u=disjoint_sets(dids)
new_dids=[]
for u in did_u:
    new_dids.extend(list(combinations(u,2)))
len(new_dids)

228548

In [12]:
new_ddf=pd.DataFrame(new_dids,columns=['qid1','qid2'])
print('New duplicated shape:',new_ddf.shape)
display(new_ddf.head(2))

New duplicated shape: (228548, 2)


Unnamed: 0,qid1,qid2
0,28291,13817
1,28291,5902


In [13]:
# merge with all_dqdf to get question1 description
new_ddf=new_ddf.merge(all_dqdf,left_on='qid1',right_on='qid',how='left')
new_ddf.drop('qid',inplace=True,axis=1)
new_ddf.columns=['qid1','qid2','question1']
new_ddf.drop_duplicates(inplace=True)
print(new_ddf.shape)
new_ddf.head(2)

(228548, 3)


Unnamed: 0,qid1,qid2,question1
0,28291,13817,What are the safety precautions on handling shotguns proposed by the NRA in Maryland?
2,28291,5902,What are the safety precautions on handling shotguns proposed by the NRA in Maryland?


In [14]:
# the same operation with qid2
new_ddf=new_ddf.merge(all_dqdf,left_on='qid2',right_on='qid',how='left')
new_ddf.drop('qid',inplace=True,axis=1)
new_ddf.columns=['qid1','qid2','question1','question2']
new_ddf.drop_duplicates(inplace=True)
print(new_ddf.shape)
new_ddf.head(2)

(228548, 4)


Unnamed: 0,qid1,qid2,question1,question2
0,28291,13817,What are the safety precautions on handling shotguns proposed by the NRA in Maryland?,What are the safety precautions on handling shotguns proposed by the NRA in Massachisetts?
2,28291,5902,What are the safety precautions on handling shotguns proposed by the NRA in Maryland?,What are the safety precautions on handling shotguns proposed by the NRA in Illinois?


In [15]:
# is_duplicate flag
new_ddf['is_duplicate']=1
new_ddf.head(2)

Unnamed: 0,qid1,qid2,question1,question2,is_duplicate
0,28291,13817,What are the safety precautions on handling shotguns proposed by the NRA in Maryland?,What are the safety precautions on handling shotguns proposed by the NRA in Massachisetts?,1
2,28291,5902,What are the safety precautions on handling shotguns proposed by the NRA in Maryland?,What are the safety precautions on handling shotguns proposed by the NRA in Illinois?,1


In [16]:
# let random select 10 rows to check the result
new_ddf.sample(10)

Unnamed: 0,qid1,qid2,question1,question2,is_duplicate
135509,41598,20607,How can you lose gigantic love handles FAST?,How can someone lose weight quickly?,1
329378,453474,453475,What are some of the challenges you face as a writer?,Is it challenging to be a writer?,1
196204,390977,274724,How can I get fund for my startup like Flipboard?,How startup get funding?,1
293586,85683,85684,How do I install a kick start in yamaha fzs version 2.0?,How do I install a kick start in yamaha FZ-S version 2.0?,1
177094,480900,20204,"Why do my easy-to-understand questions keep getting marked as ""needing improvement"" on Quora?",Why does Quora always mark my question as needing clarification but does not specify the reason?,1
20767,24734,27851,How can I speaking fluently speaking English?,How the way to speak english fluently?,1
288342,340226,340227,How are sound waves formed?,How do waves form sound waves?,1
101468,38339,46707,Can somebody help me with the list of best top horror movies of all time?,What are some best horror movies?,1
290767,343656,343657,My btech percentage is below 60 will I get a job?Is there any companies accept below 60?,My btech percentage is below 60 will I get a job? Is there any companies accept below 60 percentage?,1
277955,68923,233615,How much time it will take for the beginners to learn the basic of guitars?,How much time did you take to learn guitar?,1


### It seemed quite make sense

In [33]:
# the orininal duplicated pairs count:
print('Old duplicates:  {}'.format(len(all_dqdf)))
# after we generate more data, then the duplicated pairs count:
print('New duplicates:  {}'.format(len(new_ddf)))
print('None duplicates: {}'.format(len(nddf)))

Old duplicates:  172286
New duplicates:  228548
None duplicates: 255027


In [36]:
new_ddf.to_csv('data/all_duplicates.csv', sep=",", index=True)
nddf.to_csv('data/all_none_duplicates', index=False)

In [37]:
new_ddf.head()

Unnamed: 0,qid1,qid2,question1,question2,is_duplicate
0,28291,13817,What are the safety precautions on handling shotguns proposed by the NRA in Maryland?,What are the safety precautions on handling shotguns proposed by the NRA in Massachisetts?,1
2,28291,5902,What are the safety precautions on handling shotguns proposed by the NRA in Maryland?,What are the safety precautions on handling shotguns proposed by the NRA in Illinois?,1
4,28291,5903,What are the safety precautions on handling shotguns proposed by the NRA in Maryland?,What are the safety precautions on handling shotguns proposed by the NRA in South Carolina?,1
6,28291,102817,What are the safety precautions on handling shotguns proposed by the NRA in Maryland?,What are the safety precautions on handling shotguns proposed by the NRA in North Carolina?,1
7,28291,57257,What are the safety precautions on handling shotguns proposed by the NRA in Maryland?,What are the safety precautions on handling shotguns ?,1


In [40]:
nddf.drop('id', 1).head()

Unnamed: 0,qid1,qid2,question1,question2,is_duplicate
0,1,2,What is the step by step guide to invest in share market in india?,What is the step by step guide to invest in share market?,0
1,3,4,What is the story of Kohinoor (Koh-i-Noor) Diamond?,What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?,0
2,5,6,How can I increase the speed of my internet connection while using a VPN?,How can Internet speed be increased by hacking through DNS?,0
3,7,8,Why am I mentally very lonely? How can I solve it?,"Find the remainder when [math]23^{24}[/math] is divided by 24,23?",0
4,9,10,"Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?",Which fish would survive in salt water?,0


In [45]:
all = new_ddf.append(nddf.drop('id', 1))
all = all.sample(frac=1)
all.head()
all.to_csv('data/new_dataset.csv')