In [41]:
import numpy as np
import pandas as pd
import os
import torch
from preprocess import loadpkl, print_table, savepkl, split_overflow_table, tokenize_table, read_table, tokenize_cell
from multiprocessing import Pool
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,KFold
import pickle

In [32]:
def get_emb(table):
    if len(table) == 0:
        return [embeddings[w2i['<PAD>']]]
    for row in table:
        for j, cell in enumerate(row):
            if len(row[j]) == 0:
                row[j].append('<PAD>')
            for i, item in enumerate(cell):
                cell[i] = embeddings[w2i[item]]
            row[j] = np.average(row[j], axis=0).tolist()
    x = np.array(table)
    shape = x.shape
#     print(shape)
    table = x.reshape(shape[0]*shape[1],shape[2])
    return table.tolist()


def late_fusion(table, query):
    s = []
    for i in query:
        for j in table:
            i = np.array(i)
            j = np.array(j)
            sim = cosine_similarity(i.reshape(1, -1), j.reshape(1, -1))
            s.append(sim)
    s = np.array(s).reshape(-1)
    return s


def early_fusion(table, query):
    a = np.average(table, axis=0).reshape(1, -1)
    b = np.average(query, axis=0).reshape(1, -1)
    sim = cosine_similarity(a, b)
    return sim.reshape(-1)[0]


def mp(df, func, num_partitions):
    df_split = np.array_split(df, num_partitions)
    pool = Pool(num_partitions)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df


def t_emb(df):
    df['table_emb'] = df.table_id.apply(
        lambda x: get_emb(tokenize_table(read_table(x)['data'])))
    return df

def ltf(baseline_f):
    baseline_f['late_fusion'] = baseline_f.apply(lambda x: late_fusion(x['table_emb'], x['query_emb']), axis=1)
    return baseline_f

In [3]:
# X = loadpkl('./data/xp_2D_10-50.pkl')
vocab = loadpkl('./data/vocab_2D_10-50_complete.pkl')
w2i = {w: i for i, w in enumerate(vocab)}

In [4]:
model = torch.load('./output/11_25_15_56_30/model.pt')
embeddings = model['embeddings.weight'].cpu().data.numpy()

In [None]:
# pool = Pool(processes=30)
# X = pool.map(get_emb, X)
# X = np.array(X)
# print(np.array(X[0][0][0]).shape)

# savepkl('./data/xp_2D_10-50_emb.pkl', X)

In [5]:
baseline_f = pd.read_csv('./baseline_f_t-emb.csv')

In [6]:
baseline_f = mp(baseline_f, t_emb, 20)
print(baseline_f.iloc[:2]['table_emb'])
# baseline_f.to_csv('./baseline_f_t-emb.csv',index=False)

0    [[-0.8198481798171997, -0.3138501048088074, 0....
1    [[1.8271758556365967, -1.0623819828033447, -2....
Name: table_emb, dtype: object


In [7]:
baseline_f.head()

Unnamed: 0,query_id,query,table_id,row,col,nul,in_link,out_link,pgcount,tImp,...,csum,cavg,csim,remax,resum,reavg,resim,query_l,rel,table_emb
0,1,world interest rates Table,table-0875-680,8,2,0,31,21,51438,1.0,...,5.291894,0.048108,0.354686,0.241209,3.716354,0.033785,0.28113,4,0,"[[-0.8198481798171997, -0.3138501048088074, 0...."
1,1,world interest rates Table,table-1020-619,4,3,0,18,0,324,1.0,...,11.116121,0.101056,0.718895,1.0,8.075247,0.073411,0.71025,4,0,"[[1.8271758556365967, -1.0623819828033447, -2...."
2,1,world interest rates Table,table-0288-531,3,5,0,23,22,26419,0.5,...,0.0,0.0,0.0,0.067373,0.365818,0.003326,0.03368,4,0,"[[-0.33235064148902893, 0.3774985373020172, 0...."
3,1,world interest rates Table,table-0288-530,4,5,1,23,22,26419,0.5,...,0.0,0.0,0.0,0.067373,0.365818,0.003326,0.03368,4,0,"[[-0.33235064148902893, 0.3774985373020172, 0...."
4,1,world interest rates Table,table-1000-57,2,2,0,38,1,2268,1.0,...,10.147388,0.092249,0.372667,0.226134,4.564622,0.041497,0.279899,4,0,"[[-0.27183327078819275, -0.6750550270080566, -..."


In [8]:
baseline_f['query_tkn'] = baseline_f['query'].apply(lambda x: tokenize_cell(x))
baseline_f['query_emb'] = baseline_f['query_tkn'].apply(lambda x: [embeddings[w2i[item]] for item in x])

In [None]:
np.array(baseline_f['table_emb'].iloc[12]).shape

In [9]:
baseline_f['early_fusion'] = baseline_f.apply(lambda x: early_fusion(x['table_emb'], x['query_emb']), axis=1)

In [34]:
baseline_f.head()

Unnamed: 0,query_id,query,table_id,row,col,nul,in_link,out_link,pgcount,tImp,...,resum,reavg,resim,query_l,rel,table_emb,query_tkn,query_emb,early_fusion,late_fusion
0,1,world interest rates Table,table-0875-680,8,2,0,31,21,51438,1.0,...,3.716354,0.033785,0.28113,4,0,"[[-0.8198481798171997, -0.3138501048088074, 0....","[world, interest, rates, Table]","[[0.631871, -0.55642724, 0.8134155, 1.6779823,...",-0.109016,"[0.19790824260169904, -0.012302985297919955, -..."
1,1,world interest rates Table,table-1020-619,4,3,0,18,0,324,1.0,...,8.075247,0.073411,0.71025,4,0,"[[1.8271758556365967, -1.0623819828033447, -2....","[world, interest, rates, Table]","[[0.631871, -0.55642724, 0.8134155, 1.6779823,...",-0.128941,"[0.09213746773753934, -0.027414043227519364, -..."
2,1,world interest rates Table,table-0288-531,3,5,0,23,22,26419,0.5,...,0.365818,0.003326,0.03368,4,0,"[[-0.33235064148902893, 0.3774985373020172, 0....","[world, interest, rates, Table]","[[0.631871, -0.55642724, 0.8134155, 1.6779823,...",0.010459,"[-0.03477901916711623, 0.004363591376772238, 0..."
3,1,world interest rates Table,table-0288-530,4,5,1,23,22,26419,0.5,...,0.365818,0.003326,0.03368,4,0,"[[-0.33235064148902893, 0.3774985373020172, 0....","[world, interest, rates, Table]","[[0.631871, -0.55642724, 0.8134155, 1.6779823,...",-0.0723,"[-0.03477901916711623, 0.0002768481896604086, ..."
4,1,world interest rates Table,table-1000-57,2,2,0,38,1,2268,1.0,...,4.564622,0.041497,0.279899,4,0,"[[-0.27183327078819275, -0.6750550270080566, -...","[world, interest, rates, Table]","[[0.631871, -0.55642724, 0.8134155, 1.6779823,...",-0.038333,"[-0.010314776306679388, 0.03445066514260779, -..."


In [18]:
# baseline_f.to_csv('./baseline_f_tq-emb.csv',index=False)

In [33]:
baseline_f = mp(baseline_f, ltf, 20)

   query_id                       query        table_id  row  col  nul  \
0         1  world interest rates Table  table-0875-680    8    2    0   
1         1  world interest rates Table  table-1020-619    4    3    0   
2         1  world interest rates Table  table-0288-531    3    5    0   
3         1  world interest rates Table  table-0288-530    4    5    1   
4         1  world interest rates Table   table-1000-57    2    2    0   

   in_link  out_link  pgcount  tImp  ...     resum     reavg     resim  \
0       31        21    51438   1.0  ...  3.716354  0.033785  0.281130   
1       18         0      324   1.0  ...  8.075247  0.073411  0.710250   
2       23        22    26419   0.5  ...  0.365818  0.003326  0.033680   
3       23        22    26419   0.5  ...  0.365818  0.003326  0.033680   
4       38         1     2268   1.0  ...  4.564622  0.041497  0.279899   

   query_l  rel                                          table_emb  \
0        4    0  [[-0.8198481798171997, 

In [35]:
baseline_f['late_fusion_max'] = baseline_f.late_fusion.apply(
    np.max)
baseline_f['late_fusion_avg'] = baseline_f.late_fusion.apply(
    np.average)
baseline_f['late_fusion_sum'] = baseline_f.late_fusion.apply(
    np.sum)

In [36]:
baseline_f.head()

Unnamed: 0,query_id,query,table_id,row,col,nul,in_link,out_link,pgcount,tImp,...,query_l,rel,table_emb,query_tkn,query_emb,early_fusion,late_fusion,late_fusion_max,late_fusion_avg,late_fusion_sum
0,1,world interest rates Table,table-0875-680,8,2,0,31,21,51438,1.0,...,4,0,"[[-0.8198481798171997, -0.3138501048088074, 0....","[world, interest, rates, Table]","[[0.631871, -0.55642724, 0.8134155, 1.6779823,...",-0.109016,"[0.19790824260169904, -0.012302985297919955, -...",0.238511,-0.012274,-0.785514
1,1,world interest rates Table,table-1020-619,4,3,0,18,0,324,1.0,...,4,0,"[[1.8271758556365967, -1.0623819828033447, -2....","[world, interest, rates, Table]","[[0.631871, -0.55642724, 0.8134155, 1.6779823,...",-0.128941,"[0.09213746773753934, -0.027414043227519364, -...",0.153335,-0.018985,-0.911297
2,1,world interest rates Table,table-0288-531,3,5,0,23,22,26419,0.5,...,4,0,"[[-0.33235064148902893, 0.3774985373020172, 0....","[world, interest, rates, Table]","[[0.631871, -0.55642724, 0.8134155, 1.6779823,...",0.010459,"[-0.03477901916711623, 0.004363591376772238, 0...",0.154895,-7e-06,-0.000423
3,1,world interest rates Table,table-0288-530,4,5,1,23,22,26419,0.5,...,4,0,"[[-0.33235064148902893, 0.3774985373020172, 0....","[world, interest, rates, Table]","[[0.631871, -0.55642724, 0.8134155, 1.6779823,...",-0.0723,"[-0.03477901916711623, 0.0002768481896604086, ...",0.220914,-0.013451,-1.076088
4,1,world interest rates Table,table-1000-57,2,2,0,38,1,2268,1.0,...,4,0,"[[-0.27183327078819275, -0.6750550270080566, -...","[world, interest, rates, Table]","[[0.631871, -0.55642724, 0.8134155, 1.6779823,...",-0.038333,"[-0.010314776306679388, 0.03445066514260779, -...",0.141372,-0.01442,-0.230712


# Model

In [37]:
x_bf = ['row', 'col', 'nul', 'in_link', 'out_link', 'pgcount', 'tImp', 'tPF', 'leftColhits', 'SecColhits', 'bodyhits', 'PMI', 'qInPgTitle', 'qInTableTitle', 'yRank', 'csr_score', 'idf1', 'idf2', 'idf3', 'idf4', 'idf5', 'idf6', 'max', 'sum', 'avg', 'sim', 'emax', 'esum', 'eavg', 'esim', 'cmax', 'csum', 'cavg', 'csim', 'remax', 'resum', 'reavg', 'resim', 'query_l']
x_smf = ['early_fusion', 'late_fusion_max', 'late_fusion_avg', 'late_fusion_sum']
x_f = x_smf + x_bf
y_f = ['rel']

In [38]:
X = baseline_f[x_f]
y = baseline_f[y_f]

In [39]:
X

Unnamed: 0,early_fusion,late_fusion_max,late_fusion_avg,late_fusion_sum,row,col,nul,in_link,out_link,pgcount,...,esim,cmax,csum,cavg,csim,remax,resum,reavg,resim,query_l
0,-0.109016,0.238511,-0.012274,-0.785514,8,2,0,31,21,51438,...,0.971854,0.666667,5.291894,0.048108,0.354686,0.241209,3.716354,0.033785,0.281130,4
1,-0.128941,0.153335,-0.018985,-0.911297,4,3,0,18,0,324,...,0.983893,1.000000,11.116121,0.101056,0.718895,1.000000,8.075247,0.073411,0.710250,4
2,0.010459,0.154895,-0.000007,-0.000423,3,5,0,23,22,26419,...,0.753198,0.000000,0.000000,0.000000,0.000000,0.067373,0.365818,0.003326,0.033680,4
3,-0.072300,0.220914,-0.013451,-1.076088,4,5,1,23,22,26419,...,0.753198,0.000000,0.000000,0.000000,0.000000,0.067373,0.365818,0.003326,0.033680,4
4,-0.038333,0.141372,-0.014420,-0.230712,2,2,0,38,1,2268,...,0.954395,1.000000,10.147388,0.092249,0.372667,0.226134,4.564622,0.041497,0.279899,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3115,-0.059319,0.431836,-0.002602,-0.104094,5,4,0,407,164,83485,...,0.911832,0.000000,0.000000,0.000000,0.000000,0.119523,1.362889,0.010484,0.033985,2
3116,0.009275,0.300448,0.004826,2.258369,39,6,76,369,20,30512,...,0.906921,0.408248,3.872062,0.032267,0.221742,0.322749,4.308619,0.035905,0.157279,2
3117,0.127651,0.738549,0.022897,3.022354,22,3,0,0,0,826,...,0.926550,0.158114,0.965534,0.004198,0.030756,0.150756,4.949540,0.021520,0.112942,2
3118,-0.119844,0.654193,-0.012133,-2.135352,22,4,0,87,73,4080,...,0.883336,0.288675,2.499919,0.016666,0.049296,0.113228,1.261686,0.008411,0.031775,2


In [60]:
kfold = KFold(5, True, 42)
for i, indices in enumerate(kfold.split(X)):
    train, test = indices
    X_train, X_test, y_train, y_test = X.iloc[train], X.iloc[test], y.iloc[train], y.iloc[test]
    df1 = makeModel_getdf(X_train, X_test, y_train, y_test)
    df1.to_csv('./LTR_results/LTR_k5_{}.txt'.format(i), sep=' ', index=False, header=False)

  This is separate from the ipykernel package so we can avoid doing imports until


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=42), Label(value='0 / 42'))), HBox…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
  This is separate from the ipykernel package so we can avoid doing imports until


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=42), Label(value='0 / 42'))), HBox…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
  This is separate from the ipykernel package so we can avoid doing imports until


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=42), Label(value='0 / 42'))), HBox…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
  This is separate from the ipykernel package so we can avoid doing imports until


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=42), Label(value='0 / 42'))), HBox…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
  This is separate from the ipykernel package so we can avoid doing imports until


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=42), Label(value='0 / 42'))), HBox…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [56]:
def ms(df):
    df['model_score'] = df.apply(lambda x: getScore(x,clf),axis=1)
    return df

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
makeModel_getdf(X_train, X_test, y_train, y_test)

In [57]:
from pandarallel import pandarallel

In [58]:
pandarallel.initialize(progress_bar=True,nb_workers=15,shm_size_mb=2500)

INFO: Pandarallel will run on 15 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [59]:
def makeModel_getdf(X_train, X_test, y_train, y_test):
    clf = RandomForestClassifier(n_estimators=1000,max_features=3,random_state=42)
    clf.fit(X_train,y_train)
#     X_test = mp(X_test, ms, 15)
    X_test['model_score'] = X_test.parallel_apply(lambda x: getScore(x,clf),axis=1)
    df = generate_sorted_df(X_test,y_test)
    df1 = generate_trec_df(df)
    return df1

In [44]:
def getScore(row,clf):
    arr = clf.predict_proba(np.array(row).reshape(1,-1))
    return arr[0][1]+2*arr[0][2]

In [45]:
def generate_sorted_df(X,y):
#     df_temp = pd.concat([
#         X[['model_score']],
#         y],axis=1)
    df_temp = pd.concat([
        baseline_f.iloc[list(X.index)][['query_id', 'query', 'table_id']],
        X['model_score']],axis=1)

#     df_sorted = df_temp.sort_values(by=['query_id', 'model_score'], ascending=[True,False])

#     df1 = pd.DataFrame()
#     for i in range(1,61):
# #         print(i,df_sorted[df_sorted.query_id == i][:20].shape)
#         df1 = pd.concat([df1,df_sorted[df_sorted.query_id == i][:20]])
    return df_temp

In [46]:
def generate_trec_df(df1):
    l = []
    dic = dict(df1.query_id.value_counts())
    for i in dic:
        for j in range(1,dic[i]+1):
            l.append(j)
#     for i in range(60):
#         for j in range(1,21):
#             l.append(j)
    df2 = pd.DataFrame()
    df2['query_id'] = df1['query_id']
    df2['Q0'] = 'Q0'
    df2['table_id'] = df1['table_id']
    df2['rank'] = l
    df2['score'] = df1['model_score']
    df2['smarttable'] = 'smarttable'
    return df2