# Building the data set

In this notebook, we load data with only good pairs. First, we create the query and paragraphs embeddings using CBOW (possibly with tf-idf). \\

Then, we only keep 200.000 lines as true pairs, and keep 400.000 additional paragraphs to wrongly associate them with the queries contained in the 200.000 previous lines to create wrong pairs.

In [42]:
import sys
import os 
import pandas as pd
import numpy as np

# load data
#df = pd.read_csv("../data/para_csv.csv")
df = pd.read_csv("../data/para_csv_f1.csv")

In [43]:
df.shape

(709421, 5)

In [44]:
df.head()

Unnamed: 0,pagename,section,para_id,para_text,rel_label
0,114,coverage,5c4d8a5fb15fa87ac096174957b3621b67d6a207,The 0114 dialing code includes the whole of S...,1
1,114,history,32c808b9a9d5407d31851cad1125f3e5e4af7dda,Switching to 6-digit numbers produced 90 000 a...,1
2,114,history,4c5c5db4621532149aaf828fca282bcd5a7e757a,Before 1965 Sheffield had 5-digit telephone n...,1
3,114,history,50b5cb599e17e542fa63c14324d123bffc41cec0,Transitioning to 7-digit numbers in 1995 invol...,1
4,114,history,f2523d06b2083c7b4fc048e012460db3d2ac069d,Until the 1980s Stocksbridge Oughtibridge a...,1


In [45]:
# encode questions to unicode
df['pagename'] = df['pagename'].apply(lambda x: str(x))
df['section'] = df['section'].apply(lambda x: str(x))
df['para_text'] = df['para_text'].apply(lambda x: str(x))

In [46]:
# adding a query column which is the concatenation of pagename + section
df['query'] = df['section'] + ' of ' + df['pagename']

In [47]:
df.head()

Unnamed: 0,pagename,section,para_id,para_text,rel_label,query
0,114,coverage,5c4d8a5fb15fa87ac096174957b3621b67d6a207,The 0114 dialing code includes the whole of S...,1,coverage of 0114
1,114,history,32c808b9a9d5407d31851cad1125f3e5e4af7dda,Switching to 6-digit numbers produced 90 000 a...,1,history of 0114
2,114,history,4c5c5db4621532149aaf828fca282bcd5a7e757a,Before 1965 Sheffield had 5-digit telephone n...,1,history of 0114
3,114,history,50b5cb599e17e542fa63c14324d123bffc41cec0,Transitioning to 7-digit numbers in 1995 invol...,1,history of 0114
4,114,history,f2523d06b2083c7b4fc048e012460db3d2ac069d,Until the 1980s Stocksbridge Oughtibridge a...,1,history of 0114


## Embedding data

In [None]:
# If we want to train our own w2v vectors, we can adapt the code below 
'''
import gensim

questions = list(df['question1']) + list(df['question2'])

# tokenize
c = 0
for question in tqdm(questions):
    questions[c] = list(gensim.utils.tokenize(question, deacc=True, lower=True))
    c += 1

# train model
model = gensim.models.Word2Vec(questions, size=300, workers=16, iter=10, negative=20)

# trim memory
model.init_sims(replace=True)

# creta a dict 
w2v = dict(zip(model.index2word, model.syn0))
print "Number of tokens in Word2Vec:", len(w2v.keys())

# save model
model.save('data/3_word2vec.mdl')
model.save_word2vec_format('data/3_word2vec.bin', binary=True)
'''

In [19]:
# Run this code to use tf-idf weighted CBOW
'''
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# merge texts
text = list(df['query']) + list(df['para_text'])

tfidf = TfidfVectorizer(lowercase=False, ) # lowercase?
tfidf.fit_transform(text)

# dict key:word and value:tf-idf score
word2tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))
'''

"\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.feature_extraction.text import CountVectorizer\n# merge texts\ntext = list(df['query']) + list(df['para_text'])\n\ntfidf = TfidfVectorizer(lowercase=False, ) # lowercase?\ntfidf.fit_transform(text)\n\n# dict key:word and value:tf-idf score\nword2tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))\n"

In [48]:
# exctract word2vec vectors
import spacy
nlp = spacy.load('en')

def embed(column_name):
    vecs = []
    #for qu in tqdm(list(df[column_name])):
    for qu in list(df[column_name]):
        doc = nlp(qu) 
        mean_vec = np.zeros([len(doc), 300])
        for word in doc:   
            vec = word.vector
            # only if using tf-idf
            '''
            # fetch df score
            try:
                idf = word2tfidf[str(word)]
            except:
                #print word
                idf = 0
            '''
            # compute final vec
            mean_vec += vec # * idf
        mean_vec = mean_vec.mean(axis=0)
        vecs.append(mean_vec)
    return vecs
    
query_vecs = embed('query')
paragraph_vecs = embed('para_text')

df['query_CBOW'] = list(query_vecs)
df['paragraph_CBOW'] = list(paragraph_vecs)

In [49]:
df.shape

(709421, 8)

In [50]:
# keep some good pairs
num_true_pairs = 200000
df_true_pairs = df[:200000]

# build wrong pairs (queries from df_true_pairs associated with wrong paragraphs)
# two times more false pairs than true pairs
num_false_pairs = 400000
# copy twice good queries
df_false_pairs = df_true_pairs.append(df_true_pairs)
df_false_pairs.reset_index(drop=True,inplace=True)
# create wrong paragraphs from the rest of the dataset
col_to_replace = ['para_id', 'para_text', 'paragraph_CBOW']
wrong_paragraps = df[col_to_replace][-num_false_pairs:]
# shuffle
wrong_paragraps = wrong_paragraps.reindex(np.random.permutation(wrong_paragraps.index))
wrong_paragraps.reset_index(drop=True,inplace=True)
# replace good paragraphs by wrong paragraphs
df_false_pairs[col_to_replace] = wrong_paragraps
# change label from 1 to 0
df_false_pairs['rel_label'] = np.zeros(df_false_pairs.shape[0])

In [51]:
#df_false_pairs.head()

In [52]:
# create final dataset by merge both true and wrong pairs
df_final = df_true_pairs.append(df_false_pairs)
df_final.reset_index(drop=True,inplace=True)
df_final = df_final.reindex(np.random.permutation(df_final.index))
df_final.reset_index(drop=True,inplace=True)

In [53]:
df_final.shape

(600000, 8)

In [54]:
df_final.head()

Unnamed: 0,pagename,section,para_id,para_text,rel_label,query,query_CBOW,paragraph_CBOW
0,air transports of heads of state and government,vatican city,6b90fe9d2b64788ac0347945f8415a5508403e73,There is a 37 m lighthouse in Kalpeni from w...,0.0,vatican city of air transports of heads of sta...,"[0.348935413407, -0.0663725803606, 0.205287225...","[1.23432165978, -0.121449815575, 0.66690607648..."
1,chlo%c3%a9 (artwork),damage and restoration,064f436ade9e08519918de1f9e202ad13a5e6374,Six officers were investigated for obstructing...,0.0,damage and restoration of chlo%c3%a9 (artwork),"[0.0838661515154, -0.172712320229, 0.074871452...","[5.87265612092, 0.174174621963, 1.69391192746,..."
2,avesta,structure and content,b8741f21b4afb18ef722b55a702fc674873268ed,Quality ammunition is readily available from a...,0.0,structure and content of avesta,"[0.193173049018, -0.0221950821579, 0.269326308...","[1.74746910855, -0.946291260421, 0.84430990682..."
3,accrington,regeneration and investment,c9f8841507be288a24b8656d8466f151f8d32c78,The following public-use airports are located ...,0.0,regeneration and investment of accrington,"[0.0842982104514, -0.0745575842448, 0.03157324...","[0.525899061002, -0.302285570651, 0.0253334054..."
4,ancient thera,geography,a2944fa6e948729ffb5b71127ac05f1bea0add80,In 1998 noted social worker Anna Hazare was...,0.0,geography of ancient thera,"[0.0351712554693, 0.000748720951378, 0.0625084...","[5.76314407323, -0.610725222592, 2.02003231443..."


In [None]:
temp = df_final.head()

In [86]:
temp['query_CBOW'][0][1]

for i in range(len(temp['query_CBOW'][0])):
    #print(i)
    col_name = 'feature_'+str(i+1)
    temp[col_name] = temp['query_CBOW'].apply(lambda x: x[i])
    
    
    #df['J3'] = df.apply(lambda row:lst[row['J1']:row['J2']],axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [99]:
df_final.drop(['query_CBOW', 'paragraph_CBOW'],1).to_csv('../data/fold0_600K.csv', index=False, encoding='utf-8')

In [97]:
df_final['query_CBOW'].values.dump("../data/fold0_600K_query_CBOW.csv")

In [98]:
df_final['paragraph_CBOW'].values.dump("../data/fold0_600K_paragraph_CBOW.csv")

In [105]:
labels = df_final['rel_label'].values

y=[]
for i in range(len(labels)):
    if labels[i]==1:
        y.append([0, 1])
    elif labels[i]==0:
        y.append([1, 0])
    else:
        print('label value not in [0,1]') 

In [118]:
y_array = np.array(y)

In [120]:
y_array.dump("../data/fold0_600K_labels.csv")

In [130]:
# Creating short data set
df_final[:1000]['query_CBOW'].values.dump("../data/short_fold0_600K_query_CBOW.csv")
df_final[:1000]['paragraph_CBOW'].values.dump("../data/short_fold0_600K_paragraph_CBOW.csv")
y_array[:1000].dump("../data/short_fold0_600K_labels.csv")