# Building the data set

In this notebook, we load data with only good pairs. First, we create the query and paragraphs embeddings using CBOW (possibly with tf-idf). \\

Then, we only keep 200.000 lines as true pairs, and keep 400.000 additional paragraphs to wrongly associate them with the queries contained in the 200.000 previous lines to create wrong pairs.

In [14]:
import sys
import os 
import pandas as pd
import numpy as np

# load data
#df = pd.read_csv("../data/para_csv.csv")
df = pd.read_csv("../data/para_csv_f1.csv")

In [15]:
df.shape

(709421, 5)

In [16]:
df.head()

Unnamed: 0,pagename,section,para_id,para_text,rel_label
0,114,coverage,5c4d8a5fb15fa87ac096174957b3621b67d6a207,The 0114 dialing code includes the whole of S...,1
1,114,history,32c808b9a9d5407d31851cad1125f3e5e4af7dda,Switching to 6-digit numbers produced 90 000 a...,1
2,114,history,4c5c5db4621532149aaf828fca282bcd5a7e757a,Before 1965 Sheffield had 5-digit telephone n...,1
3,114,history,50b5cb599e17e542fa63c14324d123bffc41cec0,Transitioning to 7-digit numbers in 1995 invol...,1
4,114,history,f2523d06b2083c7b4fc048e012460db3d2ac069d,Until the 1980s Stocksbridge Oughtibridge a...,1


In [17]:
# encode questions to unicode
df['pagename'] = df['pagename'].apply(lambda x: str(x))
df['section'] = df['section'].apply(lambda x: str(x))
df['para_text'] = df['para_text'].apply(lambda x: str(x))

In [18]:
# adding a query column which is the concatenation of pagename + section
df['query'] = df['section'] + ' of ' + df['pagename']

In [19]:
# creating an index such that all paragraphs / query associated with same pagename share the same/unique index
ids = {}
i = 0
for pagename in df['pagename'].unique():
    ids[pagename] = i
    i+=1
    
df['pagename_index'] = df['pagename'].apply(lambda x: ids[x])

## Embedding data

In [None]:
# If we want to train our own w2v vectors, we can adapt the code below 
'''
import gensim

questions = list(df['question1']) + list(df['question2'])

# tokenize
c = 0
for question in tqdm(questions):
    questions[c] = list(gensim.utils.tokenize(question, deacc=True, lower=True))
    c += 1

# train model
model = gensim.models.Word2Vec(questions, size=300, workers=16, iter=10, negative=20)

# trim memory
model.init_sims(replace=True)

# creta a dict 
w2v = dict(zip(model.index2word, model.syn0))
print "Number of tokens in Word2Vec:", len(w2v.keys())

# save model
model.save('data/3_word2vec.mdl')
model.save_word2vec_format('data/3_word2vec.bin', binary=True)
'''

In [19]:
# Run this code to use tf-idf weighted CBOW
'''
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# merge texts
text = list(df['query']) + list(df['para_text'])

tfidf = TfidfVectorizer(lowercase=False, ) # lowercase?
tfidf.fit_transform(text)

# dict key:word and value:tf-idf score
word2tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))
'''

"\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.feature_extraction.text import CountVectorizer\n# merge texts\ntext = list(df['query']) + list(df['para_text'])\n\ntfidf = TfidfVectorizer(lowercase=False, ) # lowercase?\ntfidf.fit_transform(text)\n\n# dict key:word and value:tf-idf score\nword2tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))\n"

In [48]:
'''# exctract word2vec vectors
import spacy
nlp = spacy.load('en')

def embed(column_name):
    vecs = []
    #for qu in tqdm(list(df[column_name])):
    for qu in list(df[column_name]):
        doc = nlp(qu) 
        mean_vec = np.zeros([len(doc), 300])
        for word in doc:   
            vec = word.vector
            # only if using tf-idf
            '''
            # fetch df score
            try:
                idf = word2tfidf[str(word)]
            except:
                #print word
                idf = 0
            '''
            # compute final vec
            mean_vec += vec # * idf
        mean_vec = mean_vec.mean(axis=0)
        vecs.append(mean_vec)
    return vecs
    
query_vecs = embed('query')
paragraph_vecs = embed('para_text')

df['query_CBOW'] = list(query_vecs)
df['paragraph_CBOW'] = list(paragraph_vecs)'''

In [49]:
#df.shape

(709421, 8)

# Creating False labels

In [42]:
# create dictionnary {pagename : [list of section names]}
pagenames = df['pagename'].values.tolist()
sections = df['section'].values.tolist()
d = {}
i=0
for pagename in pagenames:
    if pagename not in d.keys():
        d[pagename]=[sections[i]]
    else:
        d[pagename].append(sections[i])    
    i+=1
    
for k in d.keys():
    d[k] = list(set(d[k]))

In [43]:
# remove wikipedia pages that have only 1 section (not possible to create fake labels)
print('length before %d' %df.shape[0])
uniques = []
for k in d.keys():
    if len(d[k])==1:
        uniques.append(k)
print('number of wikipedia pages with only 1 section : %d' %len(uniques))
df = df[~df.pagename.isin(uniques)]
print('length after %d' %df.shape[0])

length before 671841
number of wikipedia pages with only 1 section : 56
length after 670887


In [44]:
# keep some good pairs
num_true_pairs = 200000
df_true_pairs = df[:200000]

# build wrong pairs (queries from df_true_pairs associated with wrong paragraphs)
# two times more false pairs than true pairs
num_false_pairs = 400000
# copy twice good queries
df_false_pairs = df_true_pairs.append(df_true_pairs)
df_false_pairs.reset_index(drop=True,inplace=True)

In [45]:
# change section name
import random 
#f = lambda x: random.choice(list(filter(lambda a: a != x[1], d[x[0]])))
#f = lambda x: random.choice(d[x[0]])

def f(pagename, section): 
    # pick a random section name that belongs to the same pagename but is different from current section name
    choices = [sec for sec in d[pagename] if sec != section]
    return random.choice(choices)
    
df_false_pairs['section'] = df_false_pairs[['pagename', 'section']].apply(lambda row: f(row['pagename'], row['section']),axis=1)

In [61]:
# update the query
df_false_pairs['query'] = df_false_pairs['section'] + ' of ' + df_false_pairs['pagename']

# change labels to 1
df_false_pairs['rel_label']=0

In [63]:
# create final dataset by merge both true and wrong pairs
df_final = df_true_pairs.append(df_false_pairs)
df_final.reset_index(drop=True,inplace=True)
df_final = df_final.reindex(np.random.permutation(df_final.index))
df_final.reset_index(drop=True,inplace=True)

In [64]:
df_final.shape

(600000, 7)

In [None]:
'temp = df_final.head()'

In [86]:
'temp['query_CBOW'][0][1]

for i in range(len(temp['query_CBOW'][0])):
    #print(i)
    col_name = 'feature_'+str(i+1)
    temp[col_name] = temp['query_CBOW'].apply(lambda x: x[i])
    
    
    #df['J3'] = df.apply(lambda row:lst[row['J1']:row['J2']],axis=1)'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [99]:
#df_final.drop(['query_CBOW', 'paragraph_CBOW'],1).to_csv('../data/fold0_600K.csv', index=False, encoding='utf-8')

In [97]:
#df_final['query_CBOW'].values.dump("../data/fold0_600K_query_CBOW.csv")

In [98]:
#df_final['paragraph_CBOW'].values.dump("../data/fold0_600K_paragraph_CBOW.csv")

In [79]:
labels = df_final['rel_label'].values

y=[]
for i in range(len(labels)):
    if labels[i]==1:
        y.append([0, 1])
    elif labels[i]==0:
        y.append([1, 0])
    else:
        print('label value not in [0,1]') 

In [80]:
y_array = np.array(y)

In [82]:
df[:1000]['query'].values.dump("../data/short_fold0_600K_query_text.csv")
df[:1000]['para_text'].values.dump("../data/short_fold0_600K_paragraph_text.csv")
y_array[:1000].dump("../data/short_fold0_600K_labels.csv")
df['query'].values.dump("../data/fold0_600K_query_text.csv")
df['para_text'].values.dump("../data/fold0_600K_paragraph_text.csv")
y_array.dump("../data/fold0_600K_labels.csv")

In [131]:
'''# Creating short data set
df_final[:1000]['query_CBOW'].values.dump("../data/short_fold0_600K_query_CBOW.csv")
df_final[:1000]['paragraph_CBOW'].values.dump("../data/short_fold0_600K_paragraph_CBOW.csv")
df_final[:1000]['query'].values.dump("../data/short_fold0_600K_query.csv")
df_final[:1000]['para_text'].values.dump("../data/short_fold0_600K_paragraph.csv")
y_array[:1000].dump("../data/short_fold0_600K_labels.csv")'''

In [85]:
# Loading data
#q = np.array(np.load("../data/short_fold0_600K_query_text.csv").tolist())