In [1]:
import argparse
from collections import OrderedDict
import logging
import os
import time
from multiprocessing import Pool

import numpy as np
import pandas as pd
import torch
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import KFold, train_test_split
from tqdm import tqdm

from data_preprocess2 import split_overflow_table
from utils import Config, loadpkl, make_dirs, mp, flatten_1_deg
import models
from dataset import T2VDataset

In [6]:
class TREC_model():
    """Cross-validated relevance classifier that writes TREC-style run files.

    For each of ``N_SPLITS`` shuffled folds a ``RandomForestClassifier`` is
    fitted on the training part, the held-out part is scored, and the scored
    rows are written to ``f"{file_path}{fold}.txt"`` (space separated, no
    header, no index).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the feature columns listed in ``BASE_FEATURES`` (plus
        ``SEMANTIC_FEATURES`` when ``config['trec']['semantic_f']`` is truthy),
        the label column ``rel`` and the columns ``query_id``, ``query`` and
        ``table_id``.
    output_dir : str
        Directory the run files are written to; created via ``make_dirs``.
    config : mapping
        Reads ``config['trec']['file_name']`` and ``config['trec']['semantic_f']``.
    """

    # Hand-crafted baseline feature columns.
    BASE_FEATURES = ['row', 'col', 'nul', 'in_link', 'out_link', 'pgcount', 'tImp', 'tPF', 'leftColhits', 'SecColhits', 'bodyhits', 'PMI', 'qInPgTitle', 'qInTableTitle', 'yRank', 'csr_score', 'idf1',
                     'idf2', 'idf3', 'idf4', 'idf5', 'idf6', 'max', 'sum', 'avg', 'sim', 'emax', 'esum', 'eavg', 'esim', 'cmax', 'csum', 'cavg', 'csim', 'remax', 'resum', 'reavg', 'resim', 'query_l']
    # Embedding-based similarity feature columns (optional).
    SEMANTIC_FEATURES = ['early_fusion', 'late_fusion_max',
                         'late_fusion_avg', 'late_fusion_sum']
    LABEL_COLUMNS = ['rel']

    N_SPLITS = 5        # number of cross-validation folds
    SEED = 42           # shared seed for fold shuffling and the forest
    N_PARTITIONS = 20   # partition count handed to the ``mp`` helper

    def __init__(self, data, output_dir, config):
        # Fold indices from KFold are positional. Resetting the index makes
        # labels and positions coincide, so label-based lookups later on pick
        # exactly the rows that were scored, even if ``data`` arrived with a
        # shuffled or duplicated index.
        self.data = data.reset_index(drop=True)
        self.config = config
        self.file_path = os.path.join(output_dir, config['trec']['file_name'])
        self.prep_data()
        make_dirs(output_dir)

    def prep_data(self):
        """Select the feature matrix ``self.X`` and label frame ``self.y``."""
        # Copy so the class-level list is never extended in place.
        x_f = list(self.BASE_FEATURES)
        if self.config['trec']['semantic_f']:
            x_f += self.SEMANTIC_FEATURES

        self.X = self.data[x_f]
        self.y = self.data[self.LABEL_COLUMNS]

    def train(self):
        """Run the cross-validation loop and write one run file per fold."""
        # Keyword arguments: recent scikit-learn releases no longer accept
        # ``shuffle`` / ``random_state`` positionally.
        kfold = KFold(n_splits=self.N_SPLITS, shuffle=True,
                      random_state=self.SEED)
        for i, (train_idx, test_idx) in enumerate(kfold.split(self.X)):
            X_train, X_test = self.X.iloc[train_idx], self.X.iloc[test_idx]
            y_train, y_test = self.y.iloc[train_idx], self.y.iloc[test_idx]
            df = self.makeModel_getdf(X_train, X_test, y_train, y_test)
            df.to_csv(f"{self.file_path}{i}.txt",
                      sep=' ', index=False, header=False)

    def makeModel_getdf(self, X_train, X_test, y_train, y_test):
        """Fit the forest on one fold and return the scored run frame.

        The fitted classifier is kept on ``self.clf`` because ``score_mp`` and
        ``getScore`` read it from there.
        """
        self.clf = RandomForestClassifier(
            n_estimators=1000,
            max_features=3,
            random_state=self.SEED)
        self.clf.fit(X_train, y_train.values.ravel())
        X_test = mp(X_test, self.score_mp, self.N_PARTITIONS)
        return self.generate_trec_df(self.generate_filtered_df(X_test, y_test))

    def _expected_relevance(self, proba):
        """Collapse class probabilities into one score per row.

        Each probability column is weighted by its class label taken from
        ``self.clf.classes_``. With labels ``0, 1, 2`` this equals
        ``p(1) + 2 * p(2)``. Using ``classes_`` keeps the score defined when a
        training fold happens to miss one of the labels. Labels must be numeric.
        """
        weights = np.asarray(self.clf.classes_, dtype=float)
        return np.asarray(proba, dtype=float) @ weights

    def score_mp(self, X_test):
        """Add a ``model_score`` column to ``X_test`` (in place) and return it.

        All rows are scored with a single ``predict_proba`` call.
        """
        if len(X_test) == 0:
            # ``mp`` may hand over an empty partition; nothing to predict.
            X_test['model_score'] = pd.Series(dtype=float)
            return X_test
        # Never feed a previously computed score back in as a feature.
        features = X_test.drop(columns=['model_score'], errors='ignore')
        X_test['model_score'] = self._expected_relevance(
            self.clf.predict_proba(features))
        return X_test

    def getScore(self, row):
        """Return the relevance score for a single feature row."""
        proba = self.clf.predict_proba(np.array(row).reshape(1, -1))
        return self._expected_relevance(proba)[0]

    def generate_filtered_df(self, X, y):
        """Join the identifying columns of ``self.data`` with the scores in ``X``.

        ``y`` is accepted for interface compatibility and is not used.
        """
        # ``X.index`` holds labels, so the lookup must be label based (.loc).
        meta = self.data.loc[X.index, ['query_id', 'query', 'table_id']]
        return pd.concat([meta, X['model_score']], axis=1)

    def generate_trec_df(self, df):
        """Build the six-column run frame from a scored frame.

        Output columns, in order: ``query_id``, ``Q0``, ``table_id``, ``rank``,
        ``score``, ``smarttable``. ``rank`` is the 1-based position of the row
        within its query when ordered by descending ``model_score``; ties keep
        their order of appearance. Input row order is preserved.
        """
        if len(df) == 0:
            ranks = np.array([], dtype=int)
        else:
            ranks = (
                df.groupby('query_id')['model_score']
                .rank(method='first', ascending=False, na_option='bottom')
                .astype(int)
                .to_numpy()
            )

        df_temp = pd.DataFrame()
        df_temp['query_id'] = df['query_id']
        df_temp['Q0'] = 'Q0'
        df_temp['table_id'] = df['table_id']
        df_temp['rank'] = ranks
        df_temp['score'] = df['model_score']
        df_temp['smarttable'] = 'smarttable'
        return df_temp

def get_args(argv=None):
    """Parse the command-line options of this script.

    Parameters
    ----------
    argv : list of str, optional
        Argument strings to parse. ``None`` (the default) parses
        ``sys.argv[1:]`` as before. Passing an explicit list lets the parser be
        used from a notebook, where ``sys.argv`` carries the kernel's own
        arguments.

    Returns
    -------
    argparse.Namespace
        With attributes ``path`` and ``model_name`` (both ``None`` when the
        option is not given).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--path",
                        help="path for the scores")
    parser.add_argument("-m", "--model_name")
    return parser.parse_args(argv)

class dotdict(dict):
    """dot.notation access to dictionary attributes

    ``d.key`` reads ``d['key']`` and yields ``None`` when the key is missing;
    ``d.key = v`` and ``del d.key`` write and delete the corresponding item
    (deleting a missing key raises ``KeyError``).
    """

    def __getattr__(self, name):
        # Protocol probes such as ``__deepcopy__`` or ``__getstate__`` must see
        # "attribute missing", not ``None``; otherwise ``hasattr`` reports
        # True, and copy/pickle on older Pythons go on to call ``None``.
        if name.startswith('__') and name.endswith('__'):
            raise AttributeError(name)
        return self.get(name)

    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__

In [116]:
class TREC_data_prep():
    """Turn tokenised tables and queries into embedding-similarity features.

    Parameters
    ----------
    model : callable
        Called on a batched index tensor; its output is moved to CPU and
        converted to nested lists (one entry per input row).
    config : mapping
        ``config['table_prep_params']`` is forwarded to ``T2VDataset.pad_table``.
    vocab : object
        Forwarded to ``T2VDataset.table_words2index``.
    """

    def __init__(self, model, config, vocab):
        self.model = model
        self.config = config
        self.vocab = vocab

    def convert2table(self, inp, typ):
        """Normalise a parsed token structure into rows -> cells -> tokens.

        * ``typ == 'table'``: an empty table becomes ``[[['<PAD>']]]``; empty
          cells of a non-empty table get a single ``'<PAD>'`` token appended
          (the passed-in lists are modified in place).
        * ``typ == 'query'``: the flat token list becomes one row with one
          single-token cell per token.

        Any other ``typ`` returns ``inp`` unchanged.
        """
        if typ == 'table':
            if len(inp) == 0:
                inp = [[['<PAD>']]]
            else:
                for row in inp:
                    for cell in row:
                        if len(cell) == 0:
                            cell.append('<PAD>')
        if typ == 'query':
            inp = [[[j] for j in inp]]
        return inp

    def _encode_chunks(self, raw, typ):
        """Parse one ``*_tkn`` cell and return its padded, indexed chunks.

        Returns ``np.array(...)`` of the output of
        ``T2VDataset.table_words2index``; the first axis enumerates the chunks
        produced by ``split_overflow_table``.
        """
        # NOTE(security): ``eval`` executes whatever the CSV cell contains.
        # Only run this on trusted files; if the cells are plain Python
        # literals, ``ast.literal_eval`` would be the safe replacement.
        inp = eval(raw)
        inp = self.convert2table(inp, typ)
        inp = split_overflow_table(inp)
        for i in range(len(inp)):
            inp[i] = T2VDataset.pad_table(
                self.config['table_prep_params'], inp[i], '<PAD>')
        inp = T2VDataset.table_words2index(self.vocab, inp)
        return np.array(inp)

    def prepare_data2(self, df, typ):
        """Return a new frame with a ``f"{typ}_ft"`` column of index arrays.

        Each input row is replaced by one output row per chunk of its
        ``f"{typ}_tkn"`` cell. Rows keep their original order and index label
        (a label repeats when a row yields several chunks). Rows that yield no
        chunk are dropped. ``df`` itself is not modified.
        """
        ft_col = f"{typ}_ft"
        tkn_col = f"{typ}_tkn"

        records, labels = [], []
        for label, row in df.iterrows():
            base = row.to_dict()
            for chunk in self._encode_chunks(row[tkn_col], typ):
                # One independent record per chunk. Repeating the same row
                # object would make every chunk share the last chunk's
                # features.
                record = dict(base)
                record[ft_col] = chunk
                records.append(record)
                labels.append(label)

        if not records:
            empty = df.iloc[0:0].copy()
            empty[ft_col] = pd.Series(dtype=object)
            return empty
        # Built once, instead of drop + concat per row (quadratic, and
        # label-based dropping removes too much once labels repeat).
        return pd.DataFrame(records, index=labels)

    def late_fusion(self, table, query):
        """Cosine similarity of every query vector with every table vector.

        Returns a flat array ordered query-major (all table vectors for the
        first query vector, then the second, ...). A 1-D input is treated as a
        single vector; iterating it element-wise would compare scalars, whose
        cosine similarity can only be -1, 0 or 1.
        """
        table_vecs = np.atleast_2d(np.asarray(table, dtype=float))
        query_vecs = np.atleast_2d(np.asarray(query, dtype=float))
        if table_vecs.size == 0 or query_vecs.size == 0:
            return np.array([])
        return cosine_similarity(query_vecs, table_vecs).reshape(-1)

    def early_fusion(self, table, query):
        """Cosine similarity between the mean table and mean query vector.

        A 1-D input is treated as a single vector (its mean is itself).
        """
        a = np.average(np.atleast_2d(table), axis=0).reshape(1, -1)
        b = np.average(np.atleast_2d(query), axis=0).reshape(1, -1)
        sim = cosine_similarity(a, b)
        return sim.reshape(-1)[0]

    def _embed(self, column, device):
        """Run ``self.model`` over a column of equally shaped index arrays."""
        # Stack first: building a tensor from a list of ndarrays is very slow.
        batch = torch.tensor(np.array(column.tolist()), device=device)
        # Inference only, so no autograd graph is needed.
        with torch.no_grad():
            return self.model(batch).cpu().detach().numpy().tolist()

    def pipeline(self, baseline_f, device):
        """Add the fusion features to ``baseline_f`` and return the new frame.

        Added columns: ``early_fusion``, ``late_fusion`` (array of pairwise
        similarities), ``late_fusion_max``, ``late_fusion_avg`` and
        ``late_fusion_sum``. The intermediate ``table_ft`` / ``query_ft``
        columns are dropped before returning.
        """
        baseline_f = self.prepare_data2(baseline_f, 'table')
        baseline_f = self.prepare_data2(baseline_f, 'query')
        # Use the model this instance was constructed with, not a global.
        baseline_f['table_ft'] = self._embed(baseline_f['table_ft'], device)
        baseline_f['query_ft'] = self._embed(baseline_f['query_ft'], device)

        baseline_f['early_fusion'] = baseline_f.apply(
            lambda x: self.early_fusion(x['table_ft'], x['query_ft']), axis=1)
        baseline_f['late_fusion'] = baseline_f.apply(
            lambda x: self.late_fusion(x['table_ft'], x['query_ft']), axis=1)

        baseline_f['late_fusion_max'] = baseline_f.late_fusion.apply(
            np.max)
        baseline_f['late_fusion_avg'] = baseline_f.late_fusion.apply(
            np.average)
        baseline_f['late_fusion_sum'] = baseline_f.late_fusion.apply(
            np.sum)

        baseline_f.drop(columns=['table_ft', 'query_ft'], inplace=True)

        return baseline_f

In [117]:
# Run configuration: the run directory and checkpoint name are pinned here
# instead of being parsed from the command line.
# args = get_args()
args = dotdict({"path":'output/5_25_11_25_39', 'model_name':'model_3.pt'})
start_time = time.time()

# The run directory holds the config.toml this run was trained with.
config = Config()
config.load(os.path.join(args.path, 'config.toml'))

vocab = loadpkl(config['input_files']['vocab_path'])
# Use the configured GPU when CUDA is available; otherwise fall back to CPU so
# the notebook still runs on machines without a GPU.
device = torch.device(
    f"cuda:{config['gpu']}" if torch.cuda.is_available() else "cpu")

In [118]:
# Build the network described by the run's config and move it to `device`.
model = models.create_model(
    config['model_props']['type'],
    params=(
        len(vocab),
        config['model_params']['embedding_dim'],
        device
    )
)
model.to(device)
# map_location remaps the checkpoint tensors onto `device`, so loading also
# works when the checkpoint was saved from a GPU that is not present here.
state_dict = torch.load(
    os.path.join(args.path, args.model_name), map_location=device)
# Keep only the first two children of `linear_layers`, turning the network
# into a feature extractor.
model.linear_layers = torch.nn.Sequential(
    *(list(model.linear_layers.children())[:2]))
# Load only the checkpoint entries the truncated model still has.
state_dict_ = OrderedDict(
    (name, state_dict[name]) for name in model.state_dict())
model.load_state_dict(state_dict_)
# Sanity check: the first parameter must now equal the checkpoint embeddings.
print(torch.equal(list(model.parameters())[
      0], state_dict_['embeddings.weight']))
print(model)
model.eval()

True
Table2Vec(
  (embeddings): Embedding(2533498, 100)
  (cnn_layers): Sequential(
    (0): Conv2d(100, 128, kernel_size=(3, 2), stride=(1, 1))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(128, 64, kernel_size=(2, 1), stride=(1, 1))
    (4): ReLU(inplace=True)
  )
  (linear_layers): Sequential(
    (0): Linear(in_features=640, out_features=256, bias=True)
    (1): ReLU(inplace=True)
  )
)


Table2Vec(
  (embeddings): Embedding(2533498, 100)
  (cnn_layers): Sequential(
    (0): Conv2d(100, 128, kernel_size=(3, 2), stride=(1, 1))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(128, 64, kernel_size=(2, 1), stride=(1, 1))
    (4): ReLU(inplace=True)
  )
  (linear_layers): Sequential(
    (0): Linear(in_features=640, out_features=256, bias=True)
    (1): ReLU(inplace=True)
  )
)

# Prepare the DataFrame and features for the model

In [None]:
# Read the baseline feature table and run it through the preparation pipeline,
# which adds the embedding-based fusion features.
trec = TREC_data_prep(model=model, config=config, vocab=vocab)
baseline_f = trec.pipeline(
    baseline_f=pd.read_csv(config['input_files']['baseline_f']),
    device=device,
)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [81]:
# Display the (rows, columns) shape of `baseline_f`.
baseline_f.shape

(6560, 46)

In [85]:
# Write `baseline_f` to CSV without the index column.
# NOTE(review): the target path is hard-coded relative to the working
# directory; consider reading it from `config` instead.
baseline_f.to_csv('./data/w_all_data/baseline_f_tq-tkn_lst_ft.csv',index=False)

In [None]:
# Train the cross-validated classifier (one run file per fold is written under
# `trec_model.file_path`), then hand the run-file prefix, the trec_eval binary
# and the qrels file to `ndcg_pipeline`.
# NOTE(review): `trec_path` and `ndcg_pipeline` are neither defined nor
# imported in the visible part of this notebook; this cell presumably relies
# on state from a removed cell. Confirm before a fresh Run All.
trec_model = TREC_model(data=baseline_f, output_dir=trec_path, config=config)
trec_model.train()
ndcg_score, ndcg_score_dict = ndcg_pipeline(
    trec_model.file_path,
    '../trec_eval/trec_eval',
    '../global_data/qrels.txt')

# Embed the prepared tables with the model

In [99]:
# Stack the per-row `table_ft` entries into one array and display its shape.
# NOTE(review): `TREC_data_prep.pipeline` drops `table_ft` before returning, so
# this cell presumably ran against an intermediate frame. Confirm.
np.array(baseline_f['table_ft'].tolist()).shape

(6560, 15, 5, 1)

In [109]:
# Run the model over all `table_ft` entries in one batch. This is inference
# only, so gradient tracking is switched off to avoid keeping the autograd
# graph for the whole batch in memory.
with torch.no_grad():
    out = model(torch.tensor(baseline_f['table_ft'].tolist(), device=device))

In [110]:
# Display the shape of the model output.
out.shape

torch.Size([6560, 256])

In [113]:
# Move the output to CPU, convert it to nested Python lists and display the
# number of top-level entries.
len(out.cpu().detach().numpy().tolist())

6560