In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import pandas as pd

# Load the WTQ-finetuned TAPEX checkpoint: tokenizer plus seq2seq model.
tokenizer = AutoTokenizer.from_pretrained("microsoft/tapex-large-finetuned-wtq")
model = AutoModelForSeq2SeqLM.from_pretrained("microsoft/tapex-large-finetuned-wtq")

# Example inputs: a small two-column table and a placeholder question.
data = {
    "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
    "Number of movies": ["87", "53", "69"],
}
table = pd.DataFrame(data)
question = "abc"

# Encode the (table, question) pair as PyTorch tensors.
encoding = tokenizer(table=table, query=question, return_tensors="pt")

# Generate the answer token ids autoregressively.
outputs = model.generate(**encoding)

# Turn the generated ids back into text and keep the first sequence.
predicted_answer = tokenizer.batch_decode(
    outputs,
    skip_special_tokens=True,
)[0]
print(predicted_answer)

In [None]:
# NOTE: this rebinds the global `tokenizer` (previously the WTQ checkpoint's
# tokenizer) to the TabFact checkpoint's tokenizer; cells run after this one
# pick up the new binding.
tokenizer = AutoTokenizer.from_pretrained("microsoft/tapex-large-finetuned-tabfact")

# Split a plain string into tokens, then map those tokens to vocabulary ids.
col_tokens = tokenizer.tokenize("Brad Pitt")
input_ids = tokenizer.convert_tokens_to_ids(col_tokens)

# Show the tokens on the first line and their ids on the second.
print(col_tokens, input_ids, sep="\n")

In [None]:
# Bare expression: the notebook displays the repr of the current `tokenizer`.
tokenizer

In [None]:
# Two columns, three rows, empty query: print the complete encoding.
data = {
    "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
    "Number of movies": ["87", "53", "69"],
}
table = pd.DataFrame(data)
encoding = tokenizer(table=table, query="", return_tensors="pt")
print(encoding)

In [None]:
# Two columns, a single row, empty query: print the complete encoding.
data = {
    "Actors": ["Brad Pitt"],
    "Number of movies": ["87"],
}
table = pd.DataFrame(data)
encoding = tokenizer(table=table, query="", return_tensors="pt")
print(encoding)

In [None]:
# One column, three rows, empty query: print only the token ids.
data = {
    "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
}
table = pd.DataFrame(data)
encoding = tokenizer(table=table, query="", return_tensors="pt")
print(encoding["input_ids"])

In [None]:
# One column, two rows, empty query: print only the token ids.
data = {
    "Actors": ["Brad Pitt", "Leonardo Di Caprio"],
}
table = pd.DataFrame(data)
encoding = tokenizer(table=table, query="", return_tensors="pt")
print(encoding["input_ids"])

In [None]:
# One column, one row, empty query: print only the token ids.
data = {
    "Actors": ["Brad Pitt"],
}
table = pd.DataFrame(data)
encoding = tokenizer(table=table, query="", return_tensors="pt")
print(encoding["input_ids"])

In [None]:
# Named column whose only cell is the empty string, empty query:
# print only the token ids.
data = {
    "Actors": [""],
}
table = pd.DataFrame(data)
encoding = tokenizer(table=table, query="", return_tensors="pt")
print(encoding["input_ids"])

In [None]:
# Empty column name and an empty-string cell, empty query:
# print only the token ids.
data = {
    "": [""],
}
table = pd.DataFrame(data)
encoding = tokenizer(table=table, query="", return_tensors="pt")
print(encoding["input_ids"])

In [None]:
def get_decoder_output_before_lm_head(model, tokenizer, table, question):
    """Return the decoder's first output for a (table, question) pair.

    The pair is tokenized once; the resulting ids are run through
    ``model.model.encoder`` and then through ``model.model.decoder``, which
    cross-attends to the encoder's hidden states. No LM head is applied.

    Args:
        model: Object exposing ``model.model.encoder`` and
            ``model.model.decoder`` (presumably a BART-style seq2seq LM such
            as TAPEX -- confirm against the caller).
        tokenizer: Callable invoked as
            ``tokenizer(table, question, return_tensors="pt")``. Its result
            must provide ``"input_ids"`` and may provide ``"attention_mask"``.
        table: Table handed to the tokenizer unchanged.
        question: Question handed to the tokenizer unchanged.

    Returns:
        The first element of the decoder's output (its last hidden states).

    NOTE(review): the decoder receives the same, unshifted ids as the
    encoder. BART's own forward pass feeds right-shifted ids to the decoder
    when none are given -- confirm that the unshifted form is intended here.
    """
    # Tokenize the table and question
    encoding = tokenizer(table, question, return_tensors="pt")
    input_ids = encoding["input_ids"]
    # Stays None when the tokenizer does not return a mask.
    attention_mask = encoding.get("attention_mask")

    # Directly use the model's BART structure to get the encoder's output
    encoder_outputs = model.model.encoder(input_ids, attention_mask=attention_mask)

    # The decoder consumes the same ids as the encoder, so one padding mask
    # serves both purposes: `attention_mask` masks padding in the decoder's
    # self-attention, and `encoder_attention_mask` keeps cross-attention from
    # attending to padded encoder positions.
    decoder_outputs = model.model.decoder(
        input_ids=input_ids,
        attention_mask=attention_mask,
        encoder_hidden_states=encoder_outputs[0],
        encoder_attention_mask=attention_mask,
    )

    # The first output of the decoder contains the last hidden states
    return decoder_outputs[0]

# Example usage. Relies on the globals `model`, `tokenizer`, `table` and
# `question` as left by whichever earlier cells ran last.
decoder_output = get_decoder_output_before_lm_head(
    model=model,
    tokenizer=tokenizer,
    table=table,
    question=question,
)
print(decoder_output.shape)
