# Building the data set

In this notebook, we load a data set that contains only good (query, paragraph) pairs. First, we create the query and paragraph embeddings using CBOW (optionally weighted with tf-idf).

Then, we keep only 200,000 rows as true pairs, and use 400,000 additional paragraphs, deliberately mismatched with the queries of those 200,000 rows, to create wrong pairs.

In [179]:
import sys
import os 
import pandas as pd
import numpy as np

# Load the paragraph table (alternative input: "../data/para_csv.csv").
data_path = "../data/para_csv_f1.csv"
df = pd.read_csv(data_path)

In [180]:
df.shape

(709421, 5)

In [181]:
df.head()

Unnamed: 0,pagename,section,para_id,para_text,rel_label
0,114,coverage,5c4d8a5fb15fa87ac096174957b3621b67d6a207,The 0114 dialing code includes the whole of S...,1
1,114,history,32c808b9a9d5407d31851cad1125f3e5e4af7dda,Switching to 6-digit numbers produced 90 000 a...,1
2,114,history,4c5c5db4621532149aaf828fca282bcd5a7e757a,Before 1965 Sheffield had 5-digit telephone n...,1
3,114,history,50b5cb599e17e542fa63c14324d123bffc41cec0,Transitioning to 7-digit numbers in 1995 invol...,1
4,114,history,f2523d06b2083c7b4fc048e012460db3d2ac069d,Until the 1980s Stocksbridge Oughtibridge a...,1


In [182]:
# Force the three text columns to plain Python strings (numeric-looking page
# names such as 114 would otherwise stay numbers and break concatenation).
for text_column in ('pagename', 'section', 'para_text'):
    df[text_column] = df[text_column].apply(str)

In [183]:
# The query is the text "<section> of <pagename>" for every row.
section_prefix = df['section'] + ' of '
df['query'] = section_prefix + df['pagename']

In [184]:
# Give every distinct pagename an integer id (numbered in order of first
# appearance) so that all rows belonging to the same page share that id.
ids = {pagename: position for position, pagename in enumerate(df['pagename'].unique())}

df['pagename_index'] = df['pagename'].map(ids)

## Embedding data

In [None]:
# If we want to train our own w2v vectors, we can adapt the code below 
'''
import gensim

questions = list(df['question1']) + list(df['question2'])

# tokenize
c = 0
for question in tqdm(questions):
    questions[c] = list(gensim.utils.tokenize(question, deacc=True, lower=True))
    c += 1

# train model
model = gensim.models.Word2Vec(questions, size=300, workers=16, iter=10, negative=20)

# trim memory
model.init_sims(replace=True)

# creta a dict 
w2v = dict(zip(model.index2word, model.syn0))
print "Number of tokens in Word2Vec:", len(w2v.keys())

# save model
model.save('data/3_word2vec.mdl')
model.save_word2vec_format('data/3_word2vec.bin', binary=True)
'''

In [19]:
# Run this code to use tf-idf weighted CBOW
'''
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# merge texts
text = list(df['query']) + list(df['para_text'])

tfidf = TfidfVectorizer(lowercase=False, ) # lowercase?
tfidf.fit_transform(text)

# dict key:word and value:tf-idf score
word2tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))
'''

"\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.feature_extraction.text import CountVectorizer\n# merge texts\ntext = list(df['query']) + list(df['para_text'])\n\ntfidf = TfidfVectorizer(lowercase=False, ) # lowercase?\ntfidf.fit_transform(text)\n\n# dict key:word and value:tf-idf score\nword2tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))\n"

In [48]:
# Disabled cell: extract (optionally tf-idf weighted) CBOW vectors with spaCy.
# The whole cell is kept inside ONE triple-quoted string. The optional tf-idf
# snippet is written with `#` comments, because a nested triple-quoted string
# would end the outer string early and leave indented code at top level
# (IndentationError when the cell is run).
# NOTE(review): mean_vec has shape (len(doc), 300) and `mean_vec += vec`
# broadcasts each word vector onto every row, so the final .mean(axis=0) is the
# SUM of the word vectors, not their average; an empty doc yields NaNs.
# Confirm what is intended before re-enabling this cell.
'''# extract word2vec vectors
import spacy
nlp = spacy.load('en')

def embed(column_name):
    vecs = []
    #for qu in tqdm(list(df[column_name])):
    for qu in list(df[column_name]):
        doc = nlp(qu) 
        mean_vec = np.zeros([len(doc), 300])
        for word in doc:   
            vec = word.vector
            # only if using tf-idf: fetch the idf score
            #try:
            #    idf = word2tfidf[str(word)]
            #except:
            #    #print word
            #    idf = 0
            # compute final vec
            mean_vec += vec # * idf
        mean_vec = mean_vec.mean(axis=0)
        vecs.append(mean_vec)
    return vecs
    
query_vecs = embed('query')
paragraph_vecs = embed('para_text')

df['query_CBOW'] = list(query_vecs)
df['paragraph_CBOW'] = list(paragraph_vecs)'''

In [49]:
#df.shape

(709421, 8)

# Creating False labels

In [185]:
# Build the dictionary {pagename: [distinct section names of that page]}.
pagenames = df['pagename'].values.tolist()
sections = df['section'].values.tolist()

# First collect every section seen for each page (duplicates included) ...
d = {}
for pagename, section in zip(pagenames, sections):
    d.setdefault(pagename, []).append(section)

# ... then collapse each list to its distinct values.
d = {pagename: list(set(page_sections)) for pagename, page_sections in d.items()}

In [186]:
# A page with a single section offers no alternative section name, so no wrong
# pair can be built from it: drop all rows of such pages.
print('length before %d' %df.shape[0])
uniques = [pagename for pagename, page_sections in d.items() if len(page_sections) == 1]
print('number of wikipedia pages with only 1 section : %d' %len(uniques))
has_single_section = df['pagename'].isin(uniques)
df = df[~has_single_section]
print('length after %d' %df.shape[0])

length before 709421
number of wikipedia pages with only 1 section : 84
length after 708106


In [187]:
# Keep a random half of the wikipedia pages so the data set does not get too big.
# np.random.permutation returns a shuffled copy, so this cell only needs the
# numpy import from the top of the notebook (the stdlib `random` module is not
# imported until a later cell, so random.shuffle raised NameError on a fresh
# kernel) and it does not shuffle the array of unique ids in place.
# NOTE: the draw is unseeded; call np.random.seed(...) first for a reproducible subset.
indexes = np.random.permutation(df.pagename_index.unique())
n = len(indexes)
print('number of wikipedia pages : ', n)
n = n // 2
print('number of wikipedia pages to keep', n)
keep_indexes = indexes[:n]

number of wikipedia pages :  36018
number of wikipedia pages to keep 18009


In [188]:
# Keep only the rows whose page id is among the selected ids in keep_indexes.
df = df[df.pagename_index.isin(keep_indexes)]

In [189]:
df.shape

(351440, 7)

In [190]:
# Every remaining row is a genuine (query, paragraph) pair.
df_true_pairs = df.copy()

# The wrong pairs start as an independent copy of the true pairs, re-indexed
# from 0; their queries are replaced by wrong ones in the following cells.
df_false_pairs = df_true_pairs.copy().reset_index(drop=True)

In [191]:
df_false_pairs.head()

Unnamed: 0,pagename,section,para_id,para_text,rel_label,query,pagename_index
0,14th united states congress,committees,9530c09b4dae41d24278d9049cee4f467ec9c121,Lists of committees and their party leaders.,1,committees of 14th united states congress,3
1,16th united states congress,committees,9530c09b4dae41d24278d9049cee4f467ec9c121,Lists of committees and their party leaders.,1,committees of 16th united states congress,4
2,58th united states congress,committees,9530c09b4dae41d24278d9049cee4f467ec9c121,Lists of committees and their party leaders.,1,committees of 58th united states congress,7
3,14th united states congress,members,197ea49b7ff0cb049d6db34a4bdeefc3ab9be297,This list is arranged by chamber then by stat...,1,members of 14th united states congress,3
4,16th united states congress,members,197ea49b7ff0cb049d6db34a4bdeefc3ab9be297,This list is arranged by chamber then by stat...,1,members of 16th united states congress,4


In [192]:
# change section name
import random 
#f = lambda x: random.choice(list(filter(lambda a: a != x[1], d[x[0]])))
#f = lambda x: random.choice(d[x[0]])

def f(pagename, section, sections_by_page=None):
    """Pick a random section of `pagename` that differs from `section`.

    Parameters
    ----------
    pagename : key identifying the page in the section dictionary.
    section : the current section name, which is excluded from the draw.
    sections_by_page : optional mapping {pagename: [section names]}. When
        omitted, the notebook-level dictionary `d` is used, so existing
        two-argument calls behave as before.

    Returns
    -------
    One of the page's section names, never equal to `section`.

    Raises
    ------
    KeyError if `pagename` is not in the mapping; IndexError (from
    random.choice) if the page has no section other than `section`.
    """
    if sections_by_page is None:
        # Fall back to the notebook-level {pagename: [sections]} dictionary.
        sections_by_page = d
    choices = [sec for sec in sections_by_page[pagename] if sec != section]
    return random.choice(choices)
    
# Draw a replacement section for every row, going through the rows in order.
df_false_pairs['new_section'] = [
    f(pagename, section)
    for pagename, section in zip(df_false_pairs['pagename'], df_false_pairs['section'])
]

In [195]:
df_false_pairs.head()

Unnamed: 0,pagename,section,para_id,para_text,rel_label,query,pagename_index,new_section
0,14th united states congress,committees,9530c09b4dae41d24278d9049cee4f467ec9c121,Lists of committees and their party leaders.,1,committees of 14th united states congress,3,members
1,16th united states congress,committees,9530c09b4dae41d24278d9049cee4f467ec9c121,Lists of committees and their party leaders.,1,committees of 16th united states congress,4,party summary
2,58th united states congress,committees,9530c09b4dae41d24278d9049cee4f467ec9c121,Lists of committees and their party leaders.,1,committees of 58th united states congress,7,party summary
3,14th united states congress,members,197ea49b7ff0cb049d6db34a4bdeefc3ab9be297,This list is arranged by chamber then by stat...,1,members of 14th united states congress,3,changes in membership
4,16th united states congress,members,197ea49b7ff0cb049d6db34a4bdeefc3ab9be297,This list is arranged by chamber then by stat...,1,members of 16th united states congress,4,party summary


In [196]:
# These rows now pair a paragraph with the wrong section: mark them as negatives (label 0).
df_false_pairs['rel_label'] = 0

# Rebuild the query text from the substituted section name.
wrong_section_prefix = df_false_pairs['new_section'] + ' of '
df_false_pairs['query'] = wrong_section_prefix + df_false_pairs['pagename']

In [197]:
df_false_pairs.head()

Unnamed: 0,pagename,section,para_id,para_text,rel_label,query,pagename_index,new_section
0,14th united states congress,committees,9530c09b4dae41d24278d9049cee4f467ec9c121,Lists of committees and their party leaders.,0,members of 14th united states congress,3,members
1,16th united states congress,committees,9530c09b4dae41d24278d9049cee4f467ec9c121,Lists of committees and their party leaders.,0,party summary of 16th united states congress,4,party summary
2,58th united states congress,committees,9530c09b4dae41d24278d9049cee4f467ec9c121,Lists of committees and their party leaders.,0,party summary of 58th united states congress,7,party summary
3,14th united states congress,members,197ea49b7ff0cb049d6db34a4bdeefc3ab9be297,This list is arranged by chamber then by stat...,0,changes in membership of 14th united states co...,3,changes in membership
4,16th united states congress,members,197ea49b7ff0cb049d6db34a4bdeefc3ab9be297,This list is arranged by chamber then by stat...,0,party summary of 16th united states congress,4,party summary


In [198]:
# Drop the helper column so both frames have identical columns again.
# The `columns=` keyword replaces the positional axis argument (`drop(label, 1)`),
# which recent pandas versions no longer accept.
df_false_pairs = df_false_pairs.drop(columns='new_section')

In [199]:
print(df_true_pairs.shape)
print(df_false_pairs.shape)

(351440, 7)
(351440, 7)


In [200]:
# Create the final data set: stack true and wrong pairs, then shuffle the rows.
# pd.concat replaces DataFrame.append, which recent pandas versions removed;
# ignore_index=True renumbers the rows 0..N-1 in one step.
df_final = pd.concat([df_true_pairs, df_false_pairs], ignore_index=True)

# Shuffle with a random permutation of row positions and renumber again so the
# index matches the new row order.
shuffled_positions = np.random.permutation(len(df_final))
df_final = df_final.iloc[shuffled_positions].reset_index(drop=True)

In [201]:
df_final.shape

(702880, 7)

In [202]:
df_final.head()

Unnamed: 0,pagename,section,para_id,para_text,rel_label,query,pagename_index
0,academi,role in the iraq war,44920381f7880d9026644910f734aacdf68fb77b,In June 2011 a federal judge dismissed Erik P...,1,role in the iraq war of academi,3002
1,wide-bandgap semiconductor,materials,3abc958463d33ea14cd2ec5e79d4899e9496df39,Aluminum nitride (AlN) can be used to fabrica...,1,materials of wide-bandgap semiconductor,35388
2,brexit,%22article 50%22 and the procedure for leaving...,d4ecf5f21477b5c5586149c4efd4cb7fe8377333,Various EU leaders have said that they will no...,0,the term %22brexit%22 of brexit,7561
3,mandaue city science high school,history,f8a178b8f18bd8d97c67f0496f704f8697ae0b74,The first batch of students were mostly honor ...,0,admission of mandaue city science high school,21558
4,palador pictures,acquired films,ea9624c488fc93bfaea823f0dae1bc59470f1360,The company has acquired these films from dif...,0,naming of palador pictures,25363


In [203]:
df_final[df_final.rel_label==1].shape

(351440, 7)

In [204]:
labels = df_final['rel_label'].values

# Fail loudly on unexpected labels. Printing a message and skipping the row
# would leave `y` shorter than the query/paragraph arrays, silently shifting
# every later label onto the wrong pair.
invalid_mask = ~np.isin(labels, [0, 1])
if invalid_mask.any():
    raise ValueError('label value not in [0,1] for %d rows' % int(invalid_mask.sum()))

# One-hot encode: relevant (1) -> [0, 1], not relevant (0) -> [1, 0].
y = [[0, 1] if label == 1 else [1, 0] for label in labels]

In [205]:
# Convert the list of one-hot label pairs built above into a numpy array.
y_array = np.array(y)

In [207]:
# Persist the balanced data set: queries, paragraphs and one-hot labels, in the
# same row order. ndarray.dump writes a pickle, despite the .csv file names.
balanced_outputs = (
    (df_final['query'].values, "../data/balanced_full_fold0_600K_query_text.csv"),
    (df_final['para_text'].values, "../data/balanced_full_fold0_600K_paragraph_text.csv"),
    (y_array, "../data/balanced_full_fold0_600K_labels.csv"),
)
for values, path in balanced_outputs:
    values.dump(path)

In [41]:
# Export the rows of the pages with ids 0..9 as a small frame for testing the model.
# NOTE(review): these rows are taken from df_final and are not removed from it.
test_page_ids = range(10)
is_test_page = df_final['pagename_index'].isin(test_page_ids)
df_test = df_final[is_test_page]
df_test.to_csv('../data/short_fold0_600K_test_df.csv')

In [82]:
# Save a 1000-row sample and the full set of texts together with their labels.
# The texts must come from df_final: y_array was built from df_final (true and
# wrong pairs, shuffled), whereas `df` holds only the unshuffled true pairs, so
# dumping texts from `df` misaligned them with the labels (and the full arrays
# even had different lengths).
# ndarray.dump writes a pickle, despite the .csv file names.
short_pairs = df_final.iloc[:1000]
short_pairs['query'].values.dump("../data/short_fold0_600K_query_text.csv")
short_pairs['para_text'].values.dump("../data/short_fold0_600K_paragraph_text.csv")
y_array[:1000].dump("../data/short_fold0_600K_labels.csv")
df_final['query'].values.dump("../data/fold0_600K_query_text.csv")
df_final['para_text'].values.dump("../data/fold0_600K_paragraph_text.csv")
y_array.dump("../data/fold0_600K_labels.csv")

In [131]:
'''# Creating short data set
df_final[:1000]['query_CBOW'].values.dump("../data/short_fold0_600K_query_CBOW.csv")
df_final[:1000]['paragraph_CBOW'].values.dump("../data/short_fold0_600K_paragraph_CBOW.csv")
df_final[:1000]['query'].values.dump("../data/short_fold0_600K_query.csv")
df_final[:1000]['para_text'].values.dump("../data/short_fold0_600K_paragraph.csv")
y_array[:1000].dump("../data/short_fold0_600K_labels.csv")'''

In [85]:
# Loading data
#q = np.array(np.load("../data/short_fold0_600K_query_text.csv").tolist())

In [None]:
# Re-loading data
q_text = np.array(np.load("../data/fold0_600K_query_text.csv").tolist())
p_text = np.array(np.load("../data/fold0_600K_paragraph_text.csv").tolist())
y = np.array(np.load("../data/fold0_600K_labels.csv").tolist())

In [None]:
# Build the "medium" query set from the first 50000 queries.
df_q = pd.DataFrame(q_text[:50000])
# Drop the name of the full query array before writing the subset.
del q_text
# Column 0 holds the query strings; ndarray.dump writes them as a pickle
# (despite the .csv file name).
df_q[0].values.dump("../data/medium_fold0_600K_query_text.csv")
#q_text[:50000].dump("../data/medium_fold0_600K_query_text.csv")

In [None]:
# Build the "medium" paragraph set from the first 50000 paragraphs.
df_p = pd.DataFrame(p_text[:50000])
# Drop the name of the full paragraph array before writing the subset.
del p_text
# Column 0 holds the paragraph strings; ndarray.dump writes them as a pickle
# (despite the .csv file name).
df_p[0].values.dump("../data/medium_fold0_600K_paragraph_text.csv")
#p_text[:50000].dump("../data/medium_fold0_600K_paragraph_text.csv")

In [None]:
# Save the labels of the first 50000 rows, matching the 50000-row text subsets.
y[:50000].dump("../data/medium_fold0_600K_labels.csv")

In [None]:
# The label file is a pickle written by ndarray.dump, so np.load needs
# allow_pickle=True on recent numpy versions (trusted, locally produced file).
y = np.array(np.load("../data/medium_fold0_600K_labels.csv", allow_pickle=True).tolist())

In [None]:
y.shape

In [42]:
# The label file is a pickle written by ndarray.dump, so np.load needs
# allow_pickle=True on recent numpy versions (trusted, locally produced file).
y = np.array(np.load("../data/short_fold0_600K_labels.csv", allow_pickle=True).tolist())

In [48]:
# Share of positive pairs: argmax of a one-hot row is 1 for [0, 1] (relevant)
# and 0 for [1, 0]. Averaging over the rows actually loaded replaces the
# hard-coded division by 1000, which was only right for a 1000-row file.
pos = np.argmax(y, axis=1).mean()

In [49]:
pos

0.311

In [50]:
1-pos

0.68900000000000006

In [66]:
# The label file is a pickle written by ndarray.dump, so np.load needs
# allow_pickle=True on recent numpy versions (trusted, locally produced file).
y = np.array(np.load("../data/medium_fold0_600K_labels.csv", allow_pickle=True).tolist())

In [67]:
y.shape

(50000, 2)

In [70]:
# Number of rows whose first one-hot component is 1, i.e. rows encoded as [1, 0].
count = sum(1 for row in y if row[0] == 1)

In [71]:
count

33307