In [81]:
import numpy as np
import pandas as pd
import os
import torch
from preprocess import loadpkl, print_table, savepkl, split_overflow_table, tokenize_table, read_table, tokenize_cell
from multiprocessing import Pool
from sklearn.metrics.pairwise import cosine_similarity
import pickle
from pandarallel import pandarallel

In [2]:
pandarallel.initialize(progress_bar=True, nb_workers=15, shm_size_mb=3000)

INFO: Pandarallel will run on 15 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [48]:
def get_emb(table):
    table_ = table[:]
    for row in table_:
        for j, cell in enumerate(row):
            for i, item in enumerate(cell):
                cell[i] = embeddings[w2i[item]]
            print(cell)
            row[j] = np.average(row[j], axis=0).tolist()
    return table_


def late_fusion(table, query):
    s = []
    for i in query:
        for j in table:
            sim = cosine_similarity(i.reshape(1, -1), j.reshape(1, -1))
            s.append(sim)
    s = np.array(s).reshape(-1)
    return s


def early_fusion(table, query):
    a = np.average(table, axis=0).reshape(1, -1)
    b = np.average(query, axis=0).reshape(1, -1)
    sim = cosine_similarity(a, b)
    return sim.reshape(-1)[0]


def mp(df, func, num_partitions):
    df_split = np.array_split(df, num_partitions)
    pool = Pool(num_partitions)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df


def t(df):
    df['table_emb'] = df.table_id.apply(
        lambda x: get_emb(tokenize_table(read_table(x)['data'])))
    return df

In [86]:
# X = loadpkl('./data/xp_2D_10-50.pkl')
vocab = loadpkl('./data/vocab_2D_10-50.pkl')
w2i = {w: i for i, w in enumerate(vocab)}

In [5]:
model = torch.load('./output/11_25_15_56_30/model.pt')
embeddings = model['embeddings.weight'].cpu().data.numpy()

In [6]:
# pool = Pool(processes=30)
# X = pool.map(get_emb, X)
# X = np.array(X)
# print(np.array(X[0][0][0]).shape)

# savepkl('./data/xp_2D_10-50_emb.pkl', X)

In [7]:
baseline_f = pd.read_csv('./baseline_f_t-emb.csv')

In [8]:
# baseline_f['table'] = baseline_f.table_id.parallel_apply(
#     lambda x: tokenize_table(read_table(x)['data']))
# baseline_f['table_emb'] = baseline_f.table.parallel_apply(
#     lambda x: get_emb(x))

In [45]:
cell = []
for i, item in enumerate([]):
    cell[i] = embeddings[w2i[item]]

In [89]:
get_emb(tokenize_table(read_table(baseline_f.table_id.iloc[3])['data']))

[array([ 0.5165264 , -0.22917534,  1.0252395 ,  0.902176  , -0.15047933,
        0.10167567,  1.0283812 ,  1.4609094 ,  0.6924674 , -1.3453443 ,
        0.10453058,  0.4861255 , -0.81123424,  0.6414397 , -0.21214366,
       -0.32476175,  0.771458  ,  0.49426663, -0.6745338 ,  0.6102869 ,
       -1.1919804 , -0.09778426, -0.6510015 ,  1.3467176 ,  0.10646191,
        1.2222006 , -0.64520806,  0.2986686 ,  1.0957872 , -0.2777685 ,
        0.05556099, -0.5152819 ,  0.50323164, -1.3070076 , -0.08184893,
        0.04602394,  1.8057117 ,  0.730136  , -0.38096783, -1.1287721 ,
        0.40267426,  0.35611024, -1.4601114 ,  0.78651583, -0.79491884,
        0.41408262,  0.2936212 , -0.7032055 , -0.2891237 , -2.2241604 ,
        1.6613888 ,  0.44477016,  0.55120015,  3.105822  , -1.189551  ,
       -0.04454695, -1.1345474 ,  1.3751402 ,  0.4560847 , -0.8363714 ,
        1.234668  ,  0.15720613, -0.20844367, -0.59328604,  0.0719127 ,
       -0.90313226, -0.35463053,  0.5526016 , -2.0843413 ,  0.9

  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


[[[0.5165264010429382,
   -0.22917534410953522,
   1.0252394676208496,
   0.902176022529602,
   -0.15047933161258698,
   0.10167567431926727,
   1.0283812284469604,
   1.460909366607666,
   0.6924673914909363,
   -1.3453443050384521,
   0.10453058034181595,
   0.48612549901008606,
   -0.8112342357635498,
   0.64143967628479,
   -0.2121436595916748,
   -0.3247617483139038,
   0.7714580297470093,
   0.4942666292190552,
   -0.6745337843894958,
   0.6102868914604187,
   -1.1919803619384766,
   -0.09778425842523575,
   -0.651001513004303,
   1.3467175960540771,
   0.10646191239356995,
   1.222200632095337,
   -0.6452080607414246,
   0.29866859316825867,
   1.0957871675491333,
   -0.27776849269866943,
   0.05556098744273186,
   -0.5152819156646729,
   0.5032316446304321,
   -1.3070075511932373,
   -0.08184892684221268,
   0.046023935079574585,
   1.8057117462158203,
   0.730135977268219,
   -0.3809678256511688,
   -1.1287721395492554,
   0.40267425775527954,
   0.35611024498939514,
   -1.460

In [52]:
xp = loadpkl('./data/xp_2D_10-50.pkl')

In [88]:
vocab.index(" ")

ValueError: ' ' is not in list

In [50]:
np.average([], axis=0).tolist()

nan

In [31]:
baseline_f.table_emb.iloc[3]

'[[[-0.33235064148902893, 0.3774985373020172, 0.9850264191627502, -0.22443778812885284, 0.6179551482200623, -1.2571191787719727, -1.4135297536849976, -0.5024814605712891, -0.9414803385734558, 0.3353623151779175, -0.448342889547348, -0.3551619350910187, -0.763576328754425, 1.1934080123901367, 0.688130795955658, 0.05433099344372749, -0.14577287435531616, -1.1966537237167358, 0.5481092929840088, -0.4246203899383545, -0.7090688347816467, 0.10370733588933945, -0.23486198484897614, -1.2638757228851318, 0.9908353686332703, 0.2814267575740814, -0.3183518648147583, -1.8097326755523682, 1.1041028499603271, -1.6187546253204346, -1.0153124332427979, -0.4797622561454773, -0.6852583289146423, -0.288412868976593, -0.6150086522102356, -0.33468225598335266, 0.257697731256485, -0.6699970364570618, -0.7880803942680359, -0.37315836548805237, -0.3436267077922821, -0.25363385677337646, 1.4531947374343872, 0.809532880783081, 1.6469230651855469, -0.7340391874313354, 0.44369441270828247, 0.79531329870224, -0.6

In [22]:
for i,v in enumerate(baseline_f.table_emb):
    print(i)
    eval(v)

0
1
2
3


NameError: name 'nan' is not defined

In [None]:
print(baseline_f[baseline_f['table_emb'] == 'nan'])
baseline_f['query_emb'] = baseline_f.query.apply(
    lambda x: tokenize_cell(x))
print(baseline_f.head())

In [9]:
# baseline_f = mp(baseline_f, t, 20)
# print(baseline_f.iloc[:2]['table_emb'])
# baseline_f.to_csv('./baseline_f_t-emb.csv',index=False)

In [None]:

# semantic_f['w2v_early_fusion'] = semantic_f.apply(
#     lambda x: early_fusion(x['w2v_embd_table'], x['w2v_embd_query']), axis=1)

# semantic_f['w2v_late_fusion'] = semantic_f.parallel_apply(
#     lambda x: late_fusion(x['w2v_embd_table'], x['w2v_embd_query']), axis=1)
# semantic_f['w2v_late_fusion_max'] = semantic_f.w2v_late_fusion.parallel_apply(
#     np.max)
# semantic_f['w2v_late_fusion_avg'] = semantic_f.w2v_late_fusion.parallel_apply(
#     np.average)
# semantic_f['w2v_late_fusion_sum'] = semantic_f.w2v_late_fusion.parallel_apply(
#     np.sum)


In [62]:
import torch

In [82]:
a = torch.rand((50,10,20,100))

In [83]:
a

tensor([[[[0.9839, 0.4114, 0.7876,  ..., 0.6464, 0.8292, 0.5808],
          [0.5222, 0.0083, 0.0892,  ..., 0.0439, 0.6292, 0.9426],
          [0.8353, 0.9286, 0.7864,  ..., 0.1873, 0.1795, 0.9356],
          ...,
          [0.7320, 0.4794, 0.0660,  ..., 0.4504, 0.6280, 0.4518],
          [0.5500, 0.4899, 0.8965,  ..., 0.5264, 0.4761, 0.4289],
          [0.8733, 0.7031, 0.8754,  ..., 0.8364, 0.2969, 0.5876]],

         [[0.1317, 0.3744, 0.1463,  ..., 0.1537, 0.3357, 0.2412],
          [0.8203, 0.6448, 0.9633,  ..., 0.0938, 0.2991, 0.7808],
          [0.6284, 0.8944, 0.9918,  ..., 0.8758, 0.2391, 0.8392],
          ...,
          [0.7893, 0.0627, 0.7928,  ..., 0.6556, 0.3692, 0.2124],
          [0.7162, 0.1713, 0.6986,  ..., 0.6243, 0.2457, 0.1893],
          [0.3342, 0.9964, 0.4578,  ..., 0.2276, 0.1621, 0.7278]],

         [[0.3185, 0.6586, 0.4469,  ..., 0.6466, 0.2872, 0.9849],
          [0.1739, 0.4090, 0.8391,  ..., 0.5325, 0.4388, 0.7971],
          [0.1986, 0.0738, 0.7301,  ..., 0

In [85]:
torch.from_numpy(np.average(a.cpu().data.numpy(), axis=2)).shape

torch.Size([50, 10, 100])