<a href="https://colab.research.google.com/github/vkamma/nlp_exploration/blob/main/Google_TAPAS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
# !pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cu102.html

In [31]:
# from transformers import TapasConfig, TapasForQuestionAnswering

# for example, the base sized model with default SQA configuration
# model = TapasForQuestionAnswering.from_pretrained('google/tapas-base')

# or, the base sized model with WTQ configuration
# config = TapasConfig.from_pretrained('google/tapas-base-finetuned-wtq')
# model = TapasForQuestionAnswering.from_pretrained('google/tapas-base', config=config)

# or, the base sized model with WikiSQL configuration
# config = TapasConfig.from_pretrained('google/tapas-base-finetuned-wikisql-supervised')
# model = TapasForQuestionAnswering.from_pretrained('google/tapas-base', config=config)

In [32]:
from transformers import TapasTokenizer, TapasForQuestionAnswering
import pandas as pd

In [33]:
# Both the tokenizer and the QA model are loaded from the same checkpoint,
# so the checkpoint name is kept in one place.
MODEL_NAME = 'google/tapas-base-finetuned-wtq'

tokenizer = TapasTokenizer.from_pretrained(MODEL_NAME)
model = TapasForQuestionAnswering.from_pretrained(MODEL_NAME)

In [34]:
# Toy table used for the queries below. Every cell is stored as a string,
# including the numeric-looking columns.
data = {
    'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
    'Age': ["56", "45", "59"],
    'Number of movies': ["87", "53", "69"],
}
table = pd.DataFrame(data)

In [35]:
# Natural-language questions asked against `table`; all of them are
# passed to the tokenizer together in a single call.
queries = [
    "What is the name of the first actor?",
    "How many movies has George Clooney played in?",
    "What is the total number of movies?",
]

In [36]:
# Encode the table together with all queries, then run the model on the batch.
inputs = tokenizer(
    table=table,
    queries=queries,
    padding="max_length",
    return_tensors="pt",
)
outputs = model(**inputs)

# Detach both logit tensors before handing them to the tokenizer's decoder,
# which returns the selected cell coordinates and one aggregation index per query.
cell_selection_logits = outputs.logits.detach()
aggregation_logits = outputs.logits_aggregation.detach()
predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
    inputs,
    cell_selection_logits,
    aggregation_logits,
)

In [37]:
# Aggregation operator names, positioned so that the list index is the
# predicted aggregation index (0 -> NONE, 1 -> SUM, 2 -> AVERAGE, 3 -> COUNT).
id2aggregation = dict(enumerate(["NONE", "SUM", "AVERAGE", "COUNT"]))

# Translate every predicted index into its operator name.
aggregation_predictions_string = list(
    map(id2aggregation.__getitem__, predicted_aggregation_indices)
)

In [38]:
# One answer per query: a lone coordinate yields that cell's value as-is,
# anything else yields the selected cell values joined with ", ".
answers = [
  table.iat[coordinates[0]]
  if len(coordinates) == 1
  else ", ".join([table.iat[coordinate] for coordinate in coordinates])
  for coordinates in predicted_answer_coordinates
]

In [39]:
# Render the table that the queries were asked against.
display(table)

Unnamed: 0,Actors,Age,Number of movies
0,Brad Pitt,56,87
1,Leonardo Di Caprio,45,53
2,George Clooney,59,69


In [40]:
# Print each query followed by its predicted answer. When an aggregation
# operator other than NONE was predicted, it is shown before the cell values.
print()
for query, answer, predicted_agg in zip(queries, answers, aggregation_predictions_string):
  print(query)
  aggregation_prefix = "" if predicted_agg == "NONE" else predicted_agg + " > "
  print("Predicted answer: " + aggregation_prefix + answer)


What is the name of the first actor?
Predicted answer: Brad Pitt
How many movies has George Clooney played in?
Predicted answer: COUNT > 69
What is the total number of movies?
Predicted answer: COUNT > 87, 53, 69
