### Quora paraphrase dataset processing

In [1]:
import pandas as pd

# Read the tab-separated question-pair file, indexed by its `id` column, and
# discard every row that has a missing value in any column.
quora = (
    pd.read_table('../data/quora/quora_duplicate_questions.tsv', index_col='id')
    .dropna()
)

In [2]:
# Preview the first rows: two question ids, two question texts and the
# `is_duplicate` label per pair.
quora.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [3]:
# Reshape each side of the pair table into a common (qid, question) layout so
# the two sides can be stacked on top of each other.
q1s = quora.loc[:, ['qid1', 'question1']].rename(
    columns=dict(qid1='qid', question1='question'))
q2s = quora.loc[:, ['qid2', 'question2']].rename(
    columns=dict(qid2='qid', question2='question'))

# One row per distinct question id; when a qid occurs several times the first
# occurrence is kept (rows from `q1s` come before rows from `q2s`).
sents = pd.concat([q1s, q2s]).drop_duplicates(subset='qid', keep='first')

In [4]:
from tqdm import tqdm

# Map every question id to the set of ids in its paraphrase class.
# Initially each question is alone in its own class.
classes = {qid: {qid} for qid in tqdm(sents['qid'].tolist())}

# Only pairs labelled as duplicates cause classes to be merged, and only the
# two id columns are needed, so iterate over plain Python ints instead of
# building a Series per row with `iterrows()`.
duplicates = quora.loc[quora.is_duplicate == 1, ['qid1', 'qid2']]
pairs = zip(duplicates['qid1'].tolist(), duplicates['qid2'].tolist())

for qid1, qid2 in tqdm(pairs, total=len(duplicates)):
    # Both questions already share one class object: nothing to merge.
    if classes[qid1] is classes[qid2]:
        continue
    # These two questions must end up in the same class: build the union and
    # point every member at that single shared set object.
    cls = classes[qid1] | classes[qid2]
    for qid in cls:
        classes[qid] = cls

537929it [01:06, 8039.02it/s]
149263it [00:20, 7127.09it/s]


In [5]:
# Collapse the per-question mapping into the distinct classes. Members are
# sorted so that a class has one canonical tuple regardless of the iteration
# order of the underlying set.
unique_classes = set(tuple(sorted(cls)) for cls in classes.values())

# One class per line, question ids separated by single spaces. Lines are
# written in sorted order so the file content is stable for the same data.
with open('../data/quora/classes.txt', 'w') as f:
    for cls in sorted(unique_classes):
        f.write(' '.join(map(str, cls)) + '\n')

In [6]:
# Write one question per line, ordered by ascending qid. The encoding is
# stated explicitly so the output does not depend on the platform default.
# NOTE(review): a question containing an embedded newline would span several
# lines here -- confirm whether downstream code relies on line positions.
with open('../data/quora/quora.txt', 'w', encoding='utf-8') as f:
    for question in tqdm(sents.sort_values('qid')['question']):
        f.write(question + '\n')

537929it [00:53, 10003.67it/s]


In [7]:
# Number of question ids in each distinct class.
lens = list(map(len, unique_classes))

In [13]:
# Histogram of class sizes: index = class size, value = number of classes
# with that size.
pd.Series(lens).value_counts()

1      388279
2       48210
3        7287
4        2242
5        1010
6         521
7         323
8         199
9         157
10         76
11         71
12         60
13         53
15         34
14         28
16         27
17         19
28         13
20         12
24         11
26         11
18         10
19         10
21          9
25          7
23          6
29          5
30          5
32          4
33          4
37          3
22          3
38          3
35          3
47          2
51          2
58          2
36          2
97          2
109         1
31          1
40          1
42          1
43          1
44          1
49          1
53          1
54          1
70          1
74          1
76          1
85          1
27          1
dtype: int64

In [12]:
# A class of size k contributes k - 1, so the sum equals the total number of
# question ids across all classes minus the number of classes. Summing over
# `lens` directly gives the same value as weighting each distinct size by its
# count.
s = sum(size - 1 for size in lens)
s

89190