In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import tensorflow as tf
import numpy as np
import gensim
import gensim.models.keyedvectors as word2vec
from sklearn.metrics.pairwise import cosine_similarity
import pickle
from pandarallel import pandarallel

In [3]:
from parse_data import parseData

In [4]:
# Start pandarallel so the `parallel_apply` calls in later cells are available:
# progress bar on, 15 workers, 2500 MB of shared memory.
pandarallel.initialize(progress_bar=True,nb_workers=15,shm_size_mb=2500)



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


New pandarallel memory created - Size: 2500 MB
Pandarallel will run on 15 workers


In [5]:
# Raise pandas' display limits: show up to 500 columns and use a 1000-character output width.
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [None]:
# Load the baseline feature table. Later cells read its `table_id` column and copy
# `query_id`, `query` and `rel` from it.
baseline_f = pd.read_csv('./www2018-table/feature/features.csv')

In [6]:
# Load the GoogleNews vectors from the binary word2vec-format file; get_w2v_embd queries this model.
w2v_gn_model = word2vec.KeyedVectors.load_word2vec_format("./GoogleNews-vectors-negative300.bin", binary=True)  

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
# w2v_gn_model.similarity('computer','ram')

In [None]:
# w2v_gn_model.get_vector('computer')

In [7]:
# Load the saved gensim Word2Vec model; later cells query its `.wv` with `dbr:`-prefixed keys.
rdv2vec_model = gensim.models.Word2Vec.load('./DB2Vec_sg_200_5_5_15_2_500')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
# rdv2vec_model.wv.similarity('dbr:England','dbr:United_Kingdom')

In [None]:
rdv2vec_model.wv['dbr:SNCF_TGV_Atlantique']

In [None]:
tables_path = '/home/vibhav/table2vec/tables_redi2_1/'

def get_table_path(table_id):
    """Build the path of the ``re_tables-<N>.json`` file for a table id.

    ``<N>`` is the second ``-``-separated field of ``table_id``; for example
    ``'table-0875-680'`` resolves to ``tables_path + 're_tables-0875.json'``.
    An id without a ``-`` raises ``IndexError``.
    """
    file_number = table_id.split('-')[1]
    return ''.join([tables_path, 're_tables-', file_number, '.json'])

In [None]:
# Map every table_id to its JSON file path (runs through pandarallel).
baseline_f['table_path'] = baseline_f.table_id.parallel_apply(get_table_path)

In [None]:
# Call parseData(table_path, table_id) per row and split the returned string on single
# spaces, so `parsedTable` holds a list of tokens for each table.
baseline_f['parsedTable'] = baseline_f.parallel_apply(lambda x: parseData(x['table_path'],x['table_id']).split(' '),axis=1)

In [8]:
def get_w2v_embd(sentence):
    """Look up a word2vec vector for every token in ``sentence``.

    Parameters
    ----------
    sentence : iterable of str
        Tokens to embed, in order.

    Returns
    -------
    numpy.ndarray
        One row per token, in input order. Tokens missing from ``w2v_gn_model``
        get the model's ``'UNK'`` vector instead. An empty ``sentence`` yields an
        empty array of shape ``(0,)``.

    Raises
    ------
    KeyError
        If a token is missing and the model has no ``'UNK'`` entry either.
    """
    vectors = []
    unk_vector = None  # fetched lazily, at most once per call
    for word in sentence:
        try:
            vector = w2v_gn_model.get_vector(word)
        except KeyError:
            # Only "not in vocabulary" triggers the fallback; any other error is a
            # real problem and is allowed to propagate instead of being masked.
            if unk_vector is None:
                unk_vector = w2v_gn_model.get_vector('UNK')
            vector = unk_vector
        vectors.append(vector)
    return np.array(vectors)

In [None]:
# Embed each table's token list with get_w2v_embd (one vector per token).
baseline_f['w2v_embd_table'] = baseline_f.parsedTable.parallel_apply(get_w2v_embd)

In [None]:
baseline_f.head()

In [None]:
# Build the frame that collects the semantic features. `w2v_embd_table` is carried over
# because the early/late fusion cells read it from `semantic_f`; without it they fail
# with a KeyError. `.copy()` makes the new frame independent of `baseline_f`.
semantic_f = baseline_f.loc[:,['query_id','query','table_path','table_id','parsedTable','rel','w2v_embd_table']].copy()
# The path and token columns now live in `semantic_f`, so drop them from the baseline frame.
baseline_f.drop(columns=['table_path','parsedTable'],inplace=True)

In [None]:
# Embed each query: split it on single spaces and look up one vector per token.
semantic_f['w2v_embd_query'] = semantic_f['query'].parallel_apply(lambda x: get_w2v_embd(x.split(" ")))

In [9]:
def early_fusion(table,query):
    """Cosine similarity between the averaged table and averaged query vectors.

    Each input is averaged along axis 0, the two averages are reshaped to single
    rows, and the one similarity value between them is returned as a scalar.
    """
    table_centroid = np.average(table, axis=0)
    query_centroid = np.average(query, axis=0)
    similarity = cosine_similarity(
        table_centroid.reshape(1, -1),
        query_centroid.reshape(1, -1),
    )
    # Two single-row inputs give a 1x1 matrix; return its only entry.
    return similarity[0, 0]

In [None]:
# Early-fusion score per row: early_fusion on the row's table and query embeddings.
semantic_f['w2v_early_fusion'] = semantic_f.apply(lambda x: early_fusion(x['w2v_embd_table'],x['w2v_embd_query']),axis=1)

In [None]:
# semantic_f[semantic_f.w2v_early_fusion < 0.3].rel.value_counts()
# semantic_f[semantic_f.w2v_late_fusion_avg < 0.3].shape

In [10]:
def late_fusion(table,query):
    """Cosine similarity of every query vector with every table vector.

    Parameters
    ----------
    table, query : sequence of vectors (2-D array-like)
        One vector per row.

    Returns
    -------
    numpy.ndarray
        Flat array of ``len(query) * len(table)`` similarities, ordered
        query-major: all table vectors for the first query vector, then all for
        the second, and so on. Empty if either input is empty.
    """
    if len(query) == 0 or len(table) == 0:
        # No pairs to compare; cosine_similarity itself rejects empty input.
        return np.array([])
    # One call yields the full (len(query), len(table)) matrix. Flattening it
    # row-major gives the same order as a query-outer / table-inner double loop,
    # without one validated call per pair.
    return cosine_similarity(query, table).reshape(-1)

In [None]:
# Late-fusion vector per row: all pairwise query/table similarities from late_fusion.
semantic_f['w2v_late_fusion'] = semantic_f.parallel_apply(lambda x: late_fusion(x['w2v_embd_table'],x['w2v_embd_query']),axis=1)

In [None]:
# Reduce each row's pairwise-similarity vector to three scalar features: max, mean and sum.
semantic_f['w2v_late_fusion_max'] = semantic_f.w2v_late_fusion.parallel_apply(np.max)
semantic_f['w2v_late_fusion_avg'] = semantic_f.w2v_late_fusion.parallel_apply(np.average)
semantic_f['w2v_late_fusion_sum'] = semantic_f.w2v_late_fusion.parallel_apply(np.sum)

In [None]:
semantic_f.head()

In [None]:
from get_entities import getEntities

In [None]:
# Call getEntities once per distinct query string and keep the results in a dict,
# so repeated queries are not looked up again.
query_entities = {
    query_text: getEntities(query_text)
    for query_text in semantic_f['query'].unique()
}

# Attach the cached result to every row. The column name 'query_entites' (sic) is
# kept as-is because later cells refer to it by that spelling.
semantic_f['query_entites'] = semantic_f['query'].apply(lambda query_text: query_entities[query_text])

In [None]:
semantic_f.query_entites.iloc[1]

In [None]:
# Load the four CSV parts of the entity-annotated feature frame.
df_1 = pd.read_csv('./semantic_f_w2v_entities_1.csv')
df_2 = pd.read_csv('./semantic_f_w2v_entities_2.csv')
df_3 = pd.read_csv('./semantic_f_w2v_entities_3.csv')
df_4 = pd.read_csv('./semantic_f_w2v_entities_4.csv')

In [None]:
# Stack the four parts row-wise. ignore_index is not set, so each part keeps its own
# row labels and the combined index may contain duplicates.
df = pd.concat([df_1,df_2,df_3,df_4])

In [None]:
import ast

# `table_entities` comes back from CSV as text; parse each cell into the Python
# literal it spells out.
df['table_entities'] = df['table_entities'].apply(ast.literal_eval)

In [11]:
def get_g2v_embd(entities):
    """Look up a graph-embedding vector for every entity name in ``entities``.

    Each name is prefixed with ``'dbr:'`` and looked up in ``rdv2vec_model.wv``.

    Parameters
    ----------
    entities : iterable of str
        Entity names without the ``dbr:`` prefix.

    Returns
    -------
    numpy.ndarray
        One row per entity, in input order. Entities missing from the model get
        the vector stored under ``'dbr:UNK'``. An empty input yields an empty
        array of shape ``(0,)``.

    Raises
    ------
    KeyError
        If an entity is missing and the model has no ``'dbr:UNK'`` entry either.
    """
    vectors = []
    unk_vector = None  # fetched lazily, at most once per call
    for entity in entities:
        key = 'dbr:' + entity
        try:
            vector = rdv2vec_model.wv[key]
        except KeyError:
            # Only "key not in the model" triggers the fallback; any other error
            # is allowed to propagate instead of being masked.
            if unk_vector is None:
                unk_vector = rdv2vec_model.wv['dbr:UNK']
            vector = unk_vector
        vectors.append(vector)
    return np.array(vectors)

In [None]:
# Look up one graph embedding per entity, for the table side and for the query side.
# NOTE(review): `table_entities` is parsed with ast.literal_eval in an earlier cell, but no
# visible cell parses `query_entites`. If it is still a string (or a dict), get_g2v_embd
# iterates its characters (or keys) rather than entity names. Confirm before trusting
# the query-side embeddings.
df['g2v_embd_table'] = df.table_entities.apply(get_g2v_embd)

df['g2v_embd_query'] = df.query_entites.apply(get_g2v_embd)

In [None]:
# Early-fusion score per row on the graph embeddings (same early_fusion as for word2vec).
df['g2v_early_fusion'] = df.apply(lambda x: early_fusion(x['g2v_embd_table'],x['g2v_embd_query']),axis=1)

In [None]:
# Late-fusion vector per row on the graph embeddings: all pairwise query/table similarities.
df['g2v_late_fusion'] = df.parallel_apply(lambda x: late_fusion(x['g2v_embd_table'],x['g2v_embd_query']),axis=1)

In [None]:
# Reduce each row's pairwise-similarity vector to three scalar features: max, mean and sum.
df['g2v_late_fusion_max'] = df.g2v_late_fusion.apply(np.max)
df['g2v_late_fusion_avg'] = df.g2v_late_fusion.apply(np.average)
df['g2v_late_fusion_sum'] = df.g2v_late_fusion.apply(np.sum)

In [None]:
df.head()

In [None]:
# Persist the combined word2vec + graph-embedding features, without the index column.
# NOTE(review): array-valued columns (embeddings, late-fusion vectors) are written to CSV
# as text; confirm nothing downstream needs them back in numeric form.
df.to_csv('./semantic_f_w2v_g2v.csv',index=False)

# Testing with new filtered entities

In [30]:
# Rebind `df` to the feature file that carries the filtered entity columns
# (`table_entities_filter`, `query_entities_lst`); the frame built above is replaced.
# The head() output below shows an `Unnamed: 0` column, so this file was presumably
# saved with its index.
df = pd.read_csv('./semantic_f_w2v_g2v_upd_boe.csv')

In [37]:
df.head()

Unnamed: 0,query_id,query,table_path,table_id,parsedTable,rel,w2v_early_fusion,w2v_late_fusion,w2v_late_fusion_max,w2v_late_fusion_avg,w2v_late_fusion_sum,query_entites,query_entities_lst,table_entities,table_entities_filter,g2v_early_fusion,g2v_late_fusion,g2v_late_fusion_max,g2v_late_fusion_avg,g2v_late_fusion_sum,boe_early_fusion,boe_late_fusion,boe_late_fusion_max,boe_late_fusion_avg,boe_late_fusion_sum,g2v_embd_table,g2v_embd_query
0,1,world interest rates Table,/home/vibhav/table2vec/tables_redi2_1/re_table...,table-0875-680,"['experian', 'score', 'expected', 'annual', 'l...",0,0.572375,[ 0.07764402 0.04398614 0.062119 0.131361...,1.0,0.13295,18.081264,{'3': {'entity': '<dbpedia:Permanent_interest_...,"[Rate_risk, Inflation_targeting, Overnight_pol...","['Cash_advance', 'Subprime_lending', 'Credit_c...","[Cash_advance, Subprime_lending, Credit_card_b...",0.931634,"[0.9544216, 0.7464475, 0.9754204, 0.91376805, ...",0.976379,0.753584,165.788513,0.0,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,0.0,0.0,0.0,"[[-0.105519615, 0.16365984, -0.10385874, -0.01...","[[-0.14873455, 0.1397283, -0.13643967, -0.0689..."
1,1,world interest rates Table,/home/vibhav/table2vec/tables_redi2_1/re_table...,table-1020-619,"['annual', 'inflation', 'rates', 'aug', 'sep',...",0,0.681248,[ 0.13136129 0.06132873 0.00324154 0.019790...,1.0,0.160487,10.271143,{'3': {'entity': '<dbpedia:Permanent_interest_...,"[Rate_risk, Inflation_targeting, Overnight_pol...","['Ultra-Tories', 'Zero_interest-rate_policy', ...","[Ultra-Tories, Zero_interest-rate_policy, Inte...",0.948629,"[0.59270334, 0.88259375, 0.8775268, 0.82495767...",1.0,0.702878,161.661957,0.065938,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,1.0,0.004348,1.0,"[[-0.5667573, 0.059294734, -0.35778925, 0.0050...","[[-0.14873455, 0.1397283, -0.13643967, -0.0689..."
2,1,world interest rates Table,/home/vibhav/table2vec/tables_redi2_1/re_table...,table-0288-531,"['regret', 'interest', 'rates', 'rise', 'stati...",0,0.606677,[ 0.09074079 0.07134737 0.00324154 0.122679...,1.0,0.145743,12.825369,{'3': {'entity': '<dbpedia:Permanent_interest_...,"[Rate_risk, Inflation_targeting, Overnight_pol...","['Expected_utility_hypothesis', 'Loss_function...","[Expected_utility_hypothesis, Loss_function, V...",0.890927,"[0.6577677, 0.6136566, 0.8812906, 0.7719419, 0...",0.938203,0.63592,127.184067,0.0,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,0.0,0.0,0.0,"[[-0.01618436, 0.36752364, -0.41611332, -0.236...","[[-0.14873455, 0.1397283, -0.13643967, -0.0689..."
3,1,world interest rates Table,/home/vibhav/table2vec/tables_redi2_1/re_table...,table-0288-530,"['return', 'interest', 'rates', 'rise', 'stati...",0,0.617735,[ 0.0134912 0.07134737 0.00324154 0.122679...,1.0,0.140305,13.469313,{'3': {'entity': '<dbpedia:Permanent_interest_...,"[Rate_risk, Inflation_targeting, Overnight_pol...","['Allan_Saint-Maximin', 'Maximin_Coia', 'Saint...","[Allan_Saint-Maximin, Maximin_Coia, Saint-Maxi...",0.905172,"[0.5995204, 0.42961165, 0.53440076, 0.6587129,...",0.940689,0.638138,134.009064,0.0,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,0.0,0.0,0.0,"[[-0.28710952, 0.19961576, -0.22583863, 0.0329...","[[-0.14873455, 0.1397283, -0.13643967, -0.0689..."
4,1,world interest rates Table,/home/vibhav/table2vec/tables_redi2_1/re_table...,table-1000-57,"['security', 'nominal', 'interest', 'rates', '...",0,0.586386,[ 9.76208225e-02 4.97327670e-02 7.13473707e-...,1.0,0.139647,18.433437,{'3': {'entity': '<dbpedia:Permanent_interest_...,"[Rate_risk, Inflation_targeting, Overnight_pol...","['Quantitative_easing', 'Inflation_derivative'...","[Quantitative_easing, Inflation_derivative, El...",0.964024,"[0.8005127, 0.95039994, 0.96435654, 0.8281962,...",1.0,0.763481,145.061325,0.072548,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,1.0,0.005263,1.0,"[[-0.08840087, 0.19392557, -0.24297218, -0.365...","[[-0.14873455, 0.1397283, -0.13643967, -0.0689..."


In [32]:
import ast

# These list columns come back from CSV as text; parse them into Python lists.
# ast.literal_eval only accepts literals, so, unlike eval, a crafted cell value
# cannot execute code.
df['table_entities_filter'] = df['table_entities_filter'].apply(ast.literal_eval)
df['query_entities_lst'] = df['query_entities_lst'].apply(ast.literal_eval)

In [33]:
# Recompute the graph embeddings from the filtered entity lists, overwriting the
# `g2v_embd_*` columns loaded from the CSV.
df['g2v_embd_table'] = df.table_entities_filter.apply(get_g2v_embd)
df['g2v_embd_query'] = df.query_entities_lst.apply(get_g2v_embd)

In [34]:
# Recompute the early-fusion score from the refreshed graph embeddings.
df['g2v_early_fusion'] = df.apply(lambda x: early_fusion(x['g2v_embd_table'],x['g2v_embd_query']),axis=1)

In [35]:
# Recompute the pairwise late-fusion similarities from the refreshed graph embeddings.
df['g2v_late_fusion'] = df.parallel_apply(lambda x: late_fusion(x['g2v_embd_table'],x['g2v_embd_query']),axis=1)

In [36]:
# Reduce each refreshed pairwise-similarity vector to its max, mean and sum.
df['g2v_late_fusion_max'] = df.g2v_late_fusion.apply(np.max)
df['g2v_late_fusion_avg'] = df.g2v_late_fusion.apply(np.average)
df['g2v_late_fusion_sum'] = df.g2v_late_fusion.apply(np.sum)

In [41]:
# Save the frame with the updated graph-embedding features, without the index column.
# NOTE(review): array-valued columns are written to CSV as text; confirm nothing
# downstream needs them back in numeric form.
df.to_csv('./semantic_f_w2v_g2v_upd_boe_updg2v.csv',index=False)