In [5]:
import pandas as pd
import re
from bs4 import BeautifulSoup
import numpy as np


from transformers import pipeline
#model finetuned for question-answering:
tqa = pipeline(task="table-question-answering", model="google/tapas-large-finetuned-wtq")

def cleantable(df):
    '''change multi-index column to single and 
    Iterate through each row and column in the DataFrame to remove non byte like characters'''
    
    df.columns = [' '.join(col).strip() for col in df.columns.values]

    for index, row in df.iterrows():
        for col in df.columns:
            df.at[index, col] = re.sub('[^a-zA-Z0-9]', '', str(row[col]))
    return df

In [3]:
# Read the HTML file into a Pandas dataframe
with open('2022ApJ...924...14P.html') as file:
    soup = BeautifulSoup(file, 'html.parser')
tables = pd.read_html(str(soup))


    


In [4]:
for t in tables:
    df = cleantable(t)
    question = 'are object names or ids mentioned? if yes what are they? give them in an array'
    print(tqa(table=df, query=question))

{'answer': 'COSMOSACSF814Wncnn, CANDELSACSF606Wndnn', 'coordinates': [(0, 0), (1, 0)], 'cells': ['COSMOSACSF814Wncnn', 'CANDELSACSF606Wndnn'], 'aggregator': 'NONE'}
{'answer': 'ndrizcrcorrn, ndrizcombinen, ncleann, nfinalwcsn, nfinalscalen, nfinalpixfracn, nskymethodn, nskysubn, ncombinetypen', 'coordinates': [(0, 0), (1, 0), (2, 0), (3, 0), (4, 0), (5, 0), (6, 0), (7, 0), (8, 0)], 'cells': ['ndrizcrcorrn', 'ndrizcombinen', 'ncleann', 'nfinalwcsn', 'nfinalscalen', 'nfinalpixfracn', 'nskymethodn', 'nskysubn', 'ncombinetypen'], 'aggregator': 'NONE'}
{'answer': 'KeckILRIS, KeckILRIS, KeckIMOSFIRE, KeckILRIS, KeckILRIS, KeckILRIS, KeckIMOSFIRE, LBTMODS1, KeckILRIS, KeckILRIS', 'coordinates': [(0, 0), (1, 0), (2, 0), (3, 0), (4, 0), (5, 0), (6, 0), (7, 0), (8, 0), (9, 0)], 'cells': ['KeckILRIS', 'KeckILRIS', 'KeckIMOSFIRE', 'KeckILRIS', 'KeckILRIS', 'KeckILRIS', 'KeckIMOSFIRE', 'LBTMODS1', 'KeckILRIS', 'KeckILRIS'], 'aggregator': 'NONE'}
{'answer': 'zf9775, zf11754, zf14000', 'coordinates':

In [6]:
a = df.values.tolist()
print(np.shape(a))

(35, 11)


In [None]:
df.columns

"Index(['ID \\xc2\\xa0', 'Isophotal\\n a\\n \\n Area (pixels)',\n       '\\n mF336W\\n \\xc2\\xa0', 'S/N (F336W)', '\\n mF435W\\n \\xc2\\xa0',\n       'S/N (F435W)', '\\n mF435W\\n \\n b\\n \\n (HSC Match)',\n       '\\n mF606W\\n \\xc2\\xa0', 'S/N (F606W)', '\\n mF814W\\n \\xc2\\xa0',\n       'S/N (F814W)'],\n      dtype='object')"

In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/990M [00:00<?, ?B/s]

In [16]:
input_text = 'are there object ids in a table with the following columns:'+str(df.columns)
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

outputs = model.generate(input_ids)
print(outputs)
print(tokenizer.decode(outputs[0]))

tensor([[    0,    33,   132,  3735,     3,    23,    26,     7,    16,     8,
           953,    28,     8,   826, 15752,    10,     1]])
<pad> are there object ids in the table with the following columns:</s>


In [27]:
oracle = pipeline(model="deepset/roberta-base-squad2")
oracle(question="What are the column names", context=str(df.columns),topk=5,handle_impossible_answer=True)

Downloading:   0%|          | 0.00/18.2k [00:00<?, ?B/s]

[{'score': 0.7181697487831116, 'start': 0, 'end': 0, 'answer': ''},
 {'score': 9.344970749225467e-05,
  'start': 136,
  'end': 149,
  'answer': "S/N (F435W)',"},
 {'score': 4.46161357103847e-05,
  'start': 136,
  'end': 149,
  'answer': "S/N (F435W)',"},
 {'score': 4.420016921358183e-05,
  'start': 8,
  'end': 19,
  'answer': 'ID \\xc2\\xa0'},
 {'score': 3.8521822716575116e-05,
  'start': 135,
  'end': 149,
  'answer': "'S/N (F435W)',"}]

In [11]:
print(input_text)

are there object ids in a table with the following columnsIndex(['ID \xc2\xa0', 'Isophotal\n a\n \n Area (pixels)',
       '\n mF336W\n \xc2\xa0', 'S/N (F336W)', '\n mF435W\n \xc2\xa0',
       'S/N (F435W)', '\n mF435W\n \n b\n \n (HSC Match)',
       '\n mF606W\n \xc2\xa0', 'S/N (F606W)', '\n mF814W\n \xc2\xa0',
       'S/N (F814W)'],
      dtype='object')
