### Generating the BERT embeddings for the tables which are used for the similarity-based search of few-shot examples

In [1]:
import sys
import os
import pandas as pd
import numpy as np
import ast
import torch
import json
import itertools
import random

In [5]:
import sys
# Extend the module search path so project-local modules can be imported
# (presumably needed for the `utils` import used by this notebook).
# NOTE(review): 'project_path' looks like a placeholder — replace it with the real project directory.
sys.path.append('project_path')

In [7]:
from transformers import BertTokenizer, BertModel
import torch

In [8]:
from utils import get_correct_anno, process_single_table_gpt

In [9]:
# The tokenizer and the encoder are loaded from the same pretrained checkpoint.
checkpoint_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)
model = BertModel.from_pretrained(checkpoint_name)

In [10]:
# Fixed seed for a dedicated random generator, so that anything drawing from
# this generator is reproducible across runs.
seed_nr = 42
generator = torch.Generator()
# manual_seed() returns the generator itself, so re-binding keeps the same object.
generator = generator.manual_seed(seed_nr)

In [12]:
# Load the labelled tables from the JSON dataset file.
# JSON text is UTF-8 by specification, so the encoding is given explicitly
# instead of relying on the platform's default text encoding.
data_path = "../data/Wiki_TabNER_final_labeled.json"
with open(data_path, 'r', encoding='utf-8') as f:
    ner_tables = json.load(f)

In [13]:
# Entity type names listed in label-id order; the ids are assigned starting at 1.
entity_types = [
    'Activity',
    'Organisation',
    'ArchitecturalStructure',
    'Event',
    'Place',
    'Person',
    'Work',
]
labels_dict = {name: label_id for label_id, name in enumerate(entity_types, start=1)}
# Reverse lookup: numeric label id -> entity type name.
labels_dict_rev = dict(zip(labels_dict.values(), labels_dict.keys()))
print(labels_dict)

{'Activity': 1, 'Organisation': 2, 'ArchitecturalStructure': 3, 'Event': 4, 'Place': 5, 'Person': 6, 'Work': 7}


### Generate the table embeddings for all tables and save them to a .npy file

In [14]:
# Sanity check: embed only the first table and look at the shape of the result.
first_table = ner_tables[0][0]
test = process_single_table_gpt(model, tokenizer, first_table)
test.shape

torch.Size([768])

In [15]:
# Embed every table and store all embeddings as a single array on disk.
# Only forward passes are needed here, so the loop runs under torch.no_grad()
# to avoid keeping autograd state for every table.
all_table_embeddings = []
with torch.no_grad():
    for table_entry in ner_tables:
        # The first element of each dataset entry is the table itself.
        table = table_entry[0]
        table_embedding = process_single_table_gpt(model, tokenizer, table)
        all_table_embeddings.append(table_embedding)

# Stack the per-table tensors in one step (one row per table) and convert once,
# rather than letting NumPy convert a Python list of tensors.
# NOTE(review): assumes every call returns a torch tensor of the same shape,
# as seen for the first table — confirm against `process_single_table_gpt`.
all_table_embeddings = torch.stack(all_table_embeddings).detach().cpu().numpy()
np.save('../data/bert_all_table_embeddings.npy', all_table_embeddings)

In [16]:
# Display the array shape: (number of tables, embedding size).
all_table_embeddings.shape

(61273, 768)

In [19]:
# Seeded random split of the tables: 31273 go to the train subset, 30000 to the test subset.
split_sizes = [31273, 30000]
train_set, test_set = torch.utils.data.random_split(
    ner_tables, split_sizes, generator=generator
)

In [20]:
# Select the embedding rows of the train tables; row j of `train_embeddings`
# belongs to the table with index `train_set.indices[j]`.
train_embeddings = np.take(all_table_embeddings, train_set.indices, axis=0)

### Find the 5 most similar train tables for each test table, save their indices, and then generate the prompt demonstrations

In [None]:
# For every test table, score all train tables by the dot product of their
# embeddings and keep the positions of the highest-scoring ones.
# The stored values are row positions in `train_embeddings`, not indices into
# the full table list.
# NOTE(review): the score is an unnormalised dot product, not cosine
# similarity — confirm that this is the intended similarity measure.
num_similar = 5
similar_sets = []
for idx in test_set.indices:
    scores = np.dot(all_table_embeddings[idx], train_embeddings.T)
    # argsort is ascending, so the last `num_similar` entries are the best
    # matches, with the most similar table last.
    top_similar_positions = np.argsort(scores, axis=0)[-num_similar:]
    similar_sets.append(tuple(top_similar_positions))

In [22]:
# Flatten the per-test-table tuples into one list of positions (duplicates are kept).
expand = list(itertools.chain.from_iterable(similar_sets))

#### Some tables appear in several of the similar sets, so each distinct table only needs to be stored once. For these tables, we get the correct annotations and prepare them as input to the prompt. We save them in a dict {tab_id: [example_rows, example_NER_annotations]}

In [34]:
# Number of distinct positions that occur in at least one similar set.
len(set(expand))

12802

In [54]:
# Generate the ground-truth prompt examples for the selected train tables.
# `expand` contains many repeated positions, and the result is keyed by table
# index, so each distinct position only needs to be processed once.
# dict.fromkeys() removes duplicates while keeping first-occurrence order,
# which leaves the key order of `similar_tables` the same as before.
# NOTE(review): assumes get_correct_anno returns the same result every time it
# is called for the same table — confirm.
similar_tables = {}
for i in dict.fromkeys(expand):
    # `i` is a position within the train subset; map it back to the index in ner_tables.
    tab_idx = train_set.indices[i]
    table = ner_tables[tab_idx][0]

    example_rows, example_NER_annotations, _ = get_correct_anno(table, labels_dict_rev)
    similar_tables[tab_idx] = [example_rows, example_NER_annotations]

In [55]:
# Spot check: map position 5439 of the train subset back to its index in ner_tables.
train_set.indices[5439]

35002

In [56]:
# Display the raw dataset entry stored at index 35002.
ner_tables[35002]

[['31890250-3',
  'Ladytron discography',
  'Compilation albums',
  'Compilation albums',
  [[[-1, 0], 'Title'],
   [[-1, 1], 'Album details'],
   [[-1, 2], 'Peak chart positions']],
  [[[0, 0], 'Softcore Jukebox'],
   [[0, 1], 'Released: 7 October 2003 Label: Emperor Norton Format: CD'],
   [[0, 2], '24'],
   [[1, 0], 'Best of Remixes'],
   [[1, 1], 'Released: 8 March 2011 Label: Nettwerk Format: Digital download'],
   [[1, 2], '—'],
   [[2, 0], 'Best of 00–10'],
   [[2, 1],
    'Released: 28 March 2011 Label: Nettwerk Format: CD , digital download'],
   [[2, 2], '—']],
  [[[[0, 0, 0, 16, 7]]],
   [[[0, 1, 32, 46, 2], [0, 1, 55, 57, 0]]],
   [[[1, 0, 0, 15, 7]]],
   [[[1, 1, 30, 38, 2], [1, 1, 47, 63, 7]]],
   [[[2, 0, 0, 13, 7]]],
   [[[2, 1, 31, 39, 2], [2, 1, 48, 50, 0], [2, 1, 53, 69, 7]]]]]]

In [57]:
# Display the prompt example stored for table 35002: [table text, entity annotations].
similar_tables[35002]

['Table:\nTitle|Album details|Peak chart positions\nSoftcore Jukebox|Released: 7 October 2003 Label: Emperor Norton Format: CD|24\nBest of Remixes|Released: 8 March 2011 Label: Nettwerk Format: Digital download|—\nBest of 00–10|Released: 28 March 2011 Label: Nettwerk Format: CD , digital download|—\n',
 [{'entity': 'Softcore Jukebox', 'type': 'Work', 'cell_index': [0, 0]},
  {'entity': 'Emperor Norton', 'type': 'Organisation', 'cell_index': [0, 1]},
  {'entity': 'Best of Remixes', 'type': 'Work', 'cell_index': [1, 0]},
  {'entity': 'Nettwerk', 'type': 'Organisation', 'cell_index': [1, 1]},
  {'entity': 'Digital download', 'type': 'Work', 'cell_index': [1, 1]},
  {'entity': 'Best of 00–10', 'type': 'Work', 'cell_index': [2, 0]},
  {'entity': 'Nettwerk', 'type': 'Organisation', 'cell_index': [2, 1]},
  {'entity': 'digital download', 'type': 'Work', 'cell_index': [2, 1]}]]

In [60]:
# Persist the prompt examples as JSON; the integer table indices become string keys in the file.
with open("../output/similar_tables_examples.json", "w") as out_file:
    out_file.write(json.dumps(similar_tables))