In [23]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import pandas as pd
import torch
# Load the tokenizer and seq2seq model from the same TAPEX checkpoint.
checkpoint = "microsoft/tapex-large-finetuned-wtq"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Toy table: every cell is stored as a string, including the numeric column.
data = {
    "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
    "Number of movies": ["87", "53", "69"],
}
table = pd.DataFrame.from_dict(data)
question = "abc"  # placeholder text rather than a real question about the table

# Flatten table + query into model inputs (PyTorch tensors).
encoding = tokenizer(table=table, query=question, return_tensors="pt")

# Autoregressively generate the answer token ids.
outputs = model.generate(**encoding)

# Turn the generated ids back into text and keep the first (only) sequence.
decoded_answers = tokenizer.batch_decode(outputs, skip_special_tokens=True)
predicted_answer = decoded_answers[0]
print(predicted_answer)



 kalimba edwards


In [9]:
# Read the position-embedding limit from the model config.
# Relies on `model` having been loaded by an earlier cell in this kernel.
max_length = model.config.max_position_embeddings

In [10]:
# Display the limit read from the model config (1024 in the recorded output).
max_length

1024

In [11]:
# NOTE: this rebinds the global `tokenizer` to the *tabfact* checkpoint, while
# `model` (loaded in the first cell) comes from the *wtq* checkpoint. Every
# later cell that uses `tokenizer` picks up this one.
tokenizer = AutoTokenizer.from_pretrained("microsoft/tapex-large-finetuned-tabfact")
# Tokenize a bare column name and map the tokens to vocabulary ids.
# The recorded output shows the name lower-cased and split into two sub-words.
col_tokens = tokenizer.tokenize("Actors")
input_ids = tokenizer.convert_tokens_to_ids(col_tokens)
print(col_tokens)
print(input_ids)

['act', 'ors']
[7257, 994]


In [12]:
# Display the tokenizer's maximum input length (1024 in the recorded output,
# the same value as the model's max_position_embeddings shown earlier).
tokenizer.model_max_length

1024

In [13]:
# Experiment: encode a 3-row, 2-column table with an empty query string.
# The tokenizer warns that no query was given; the recorded encoding has 38 ids.
# Rebinds the kernel-global `data`, `table` and `encoding`.
data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]}
table = pd.DataFrame.from_dict(data)
encoding = tokenizer(table, "", return_tensors="pt")
print(encoding)

You provide nothing to query with respect to the table.


{'input_ids': tensor([[    0,  9119,  4832,  5552,  1721,   346,     9,  4133,  3236,   112,
          4832,  5378,   625,   181,  2582,  1721,  8176,  3236,   132,  4832,
          2084,   261,  6782,  2269,  2927, 12834,  1721,  4268,  3236,   155,
          4832,  5473, 26875, 42771,  6071,  1721,  5913,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [14]:
# Experiment: same two columns but only one row, still with an empty query.
# The recorded encoding shrinks to 18 ids.
# Rebinds the kernel-global `data`, `table` and `encoding`.
data = {"Actors": ["Brad Pitt"], "Number of movies": ["87"]}
table = pd.DataFrame.from_dict(data)
encoding = tokenizer(table, "", return_tensors="pt")
print(encoding)

You provide nothing to query with respect to the table.


{'input_ids': tensor([[   0, 9119, 4832, 5552, 1721,  346,    9, 4133, 3236,  112, 4832, 5378,
          625,  181, 2582, 1721, 8176,    2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [15]:
# Experiment: a single column with three rows and an empty query.
# Only the input ids are printed; the recorded output has 28 ids.
# Rebinds the kernel-global `data`, `table` and `encoding`.
data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"]}
table = pd.DataFrame.from_dict(data)
encoding = tokenizer(table, "", return_tensors="pt")
print(encoding['input_ids'])

You provide nothing to query with respect to the table.


tensor([[    0,  9119,  4832,  5552,  3236,   112,  4832,  5378,   625,   181,
          2582,  3236,   132,  4832,  2084,   261,  6782,  2269,  2927, 12834,
          3236,   155,  4832,  5473, 26875, 42771,  6071,     2]])


In [16]:
# Experiment: a single column with two rows and an empty query.
# The recorded output has 21 ids.
# Rebinds the kernel-global `data`, `table` and `encoding`.
data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio"]}
table = pd.DataFrame.from_dict(data)
encoding = tokenizer(table, "", return_tensors="pt")
print(encoding['input_ids'])

You provide nothing to query with respect to the table.


tensor([[    0,  9119,  4832,  5552,  3236,   112,  4832,  5378,   625,   181,
          2582,  3236,   132,  4832,  2084,   261,  6782,  2269,  2927, 12834,
             2]])


In [17]:
# Experiment: the smallest non-empty table (one column, one row), empty query.
# The recorded output has 12 ids.
# Rebinds the kernel-global `data`, `table` and `encoding`.
data = {"Actors": ["Brad Pitt"]}
table = pd.DataFrame.from_dict(data)
encoding = tokenizer(table, "", return_tensors="pt")
print(encoding['input_ids'])

You provide nothing to query with respect to the table.


tensor([[   0, 9119, 4832, 5552, 3236,  112, 4832, 5378,  625,  181, 2582,    2]])


In [18]:
# Edge case: two column headers but no rows, with an empty query.
# The tokenizer emits an additional "empty table" warning and the recorded
# output is just the ids [0, 2] - presumably only the start/end special tokens
# remain (TODO confirm the id meanings against the tokenizer's vocabulary).
# Rebinds the kernel-global `data`, `table` and `encoding`.
data = {"Actors": [], "Number of movies": []}
table = pd.DataFrame.from_dict(data)
encoding = tokenizer(table, "", return_tensors="pt")
print(encoding['input_ids'])

You provide an empty table, or all cells contain much tokens (e.g., >= 1024 tokens). Please carefully check the corresponding table with the query : .
You provide nothing to query with respect to the table.


tensor([[0, 2]])


In [19]:
# Edge case: one column whose only cell is an empty string, with an empty query.
# Unlike the row-less table, the recorded output still contains 8 ids, so the
# table scaffolding appears to be encoded even though the cell has no text.
# Rebinds the kernel-global `data`, `table` and `encoding`.
data = {"Actors": [""]}
table = pd.DataFrame.from_dict(data)
encoding = tokenizer(table, "", return_tensors="pt")
print(encoding['input_ids'])

You provide nothing to query with respect to the table.


tensor([[   0, 9119, 4832, 5552, 3236,  112, 4832,    2]])


In [20]:
# Edge case: both the column name and its only cell are empty strings.
# The recorded output again has 8 ids; compared with the {"Actors": [""]} case
# a single id differs (1437 appears where 5552 did).
# Rebinds the kernel-global `data`, `table` and `encoding`.
data = {"": [""]}
table = pd.DataFrame.from_dict(data)
encoding = tokenizer(table, "", return_tensors="pt")
print(encoding['input_ids'])

You provide nothing to query with respect to the table.


tensor([[   0, 9119, 4832, 1437, 3236,  112, 4832,    2]])


In [32]:
def get_decoder_output_before_lm_head(model, tokenizer, table, verbose=True):
    """Return the decoder's last hidden states (the LM-head input) for ``table``.

    The table is tokenized together with an empty query, the single encoded
    example is duplicated along the batch dimension, and the batch is pushed
    through ``model.model.encoder`` and then ``model.model.decoder``.

    Args:
        model: Object exposing ``model.model.encoder`` and
            ``model.model.decoder`` (e.g. a BART-style seq2seq LM wrapper).
        tokenizer: Callable invoked as ``tokenizer(table, "", return_tensors="pt")``
            that returns a mapping with ``'input_ids'`` and, optionally,
            ``'attention_mask'``.
        table: The table to encode; passed to ``tokenizer`` unchanged.
        verbose: When true (the default), print the intermediate tensor shapes.

    Returns:
        The first element of the decoder output. Because the input is
        duplicated, its leading (batch) dimension is twice that of the
        tokenizer output.
    """
    # Tokenize the table with an empty query.
    encoding = tokenizer(table, "", return_tensors="pt")
    input_ids = encoding['input_ids']

    attention_mask = encoding.get('attention_mask', None)
    if attention_mask is None:
        # No mask supplied: treat every position as a real token instead of
        # failing on `None.shape` / `torch.cat` below.
        attention_mask = torch.ones_like(input_ids)

    if verbose:
        print(input_ids.shape)
        print(attention_mask.shape)

    # Duplicate the single example so the model sees a batch of two identical rows.
    batch_input_ids = torch.cat([input_ids, input_ids], dim=0)
    batch_attention_mask = torch.cat([attention_mask, attention_mask], dim=0)
    if verbose:
        print(batch_input_ids.shape)
        print(batch_attention_mask.shape)

    # Run the encoder directly to obtain its hidden states.
    encoder_outputs = model.model.encoder(batch_input_ids, attention_mask=batch_attention_mask)
    encoder_hidden_states = encoder_outputs[0]
    if verbose:
        print(encoder_hidden_states.shape)

    # NOTE(review): the decoder is fed the same token ids as the encoder (not
    # shifted-right targets); kept as in the original - confirm this is intended.
    decoder_outputs = model.model.decoder(
        input_ids=batch_input_ids,
        attention_mask=batch_attention_mask,
        encoder_hidden_states=encoder_hidden_states,
        # Mask encoder padding in cross-attention as well; without this the
        # decoder would attend to padded encoder positions.
        encoder_attention_mask=batch_attention_mask,
    )

    # The first output of the decoder contains the last hidden states.
    return decoder_outputs[0]

# Example usage:
# NOTE: `table` is whatever an earlier cell last bound in this kernel (hidden
# state). The recorded [1, 38] input shape matches the 3-row, 2-column table,
# not the table from the cell directly above in the notebook.
decoder_output = get_decoder_output_before_lm_head(model, tokenizer, table)
print(decoder_output.shape)


You provide nothing to query with respect to the table.


torch.Size([1, 38])
torch.Size([1, 38])
torch.Size([2, 38])
torch.Size([2, 38])
torch.Size([2, 38, 1024])
torch.Size([2, 38, 1024])
