In [66]:
import pandas as pd
from datasets import Dataset
import torch
import numpy as np
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling

In [13]:
train_df = pd.read_csv('./dataset/final/train.csv', usecols=['source', 'label'])
train_df['source'] = train_df['source'].str.lower()
train_df['label'] = train_df['label'].str.lower()
train_ds = Dataset.from_pandas(train_df)

test_df = pd.read_csv('./dataset/final/test.csv', usecols=['source', 'label'])
test_df['source'] = test_df['source'].str.lower()
test_df['label'] = test_df['label'].str.lower()
test_ds = Dataset.from_pandas(test_df)

dev_df = pd.read_csv('./dataset/final/dev.csv', usecols=['source', 'label'])
dev_df['source'] = dev_df['source'].str.lower()
dev_df['label'] = dev_df['label'].str.lower()
dev_ds = Dataset.from_pandas(dev_df)

all_df = pd.concat((train_df, test_df, dev_df))
all_ds = Dataset.from_pandas(all_df)

In [3]:
tokenizer = AutoTokenizer.from_pretrained('./results_seq2seq/', local_files_only=True)
model = AutoModelForSeq2SeqLM.from_pretrained('./results_seq2seq/', local_files_only=True)

In [8]:
from VisAwareTranslation import postprocessing

nl_template_cnt = 0
nl_template_match = 0

for source, label in zip(test_df['source'], test_df['label']):
    input_ids = tokenizer(source, return_tensors="pt", max_length=512, padding=True, truncation=True).input_ids 
    outputs = model.generate(input_ids)
    
    decoded = ''.join(tokenizer.convert_ids_to_tokens(outputs[0])[1:-1]).replace('▁', ' ').strip()
    
    pred = postprocessing(label, decoded)

    nl_template_cnt += 1

    if ' '.join(label.replace('"', "'").split()) == ' '.join(pred.replace('"', "'").split()):
        nl_template_match += 1

10 . 9 . company_id - int , rank - int , company - str , headquarters - str , main_industry - str , sales_billion - float , profits_billion - float , assets_billion - float , market_value - float | how many companies that are not headquartered in the united states for each main industry ? show me a bar chart , and could you display by the total number from high to low ?
mark bar encoding x main_industry y aggregate count main_industry transform filter headquarters != 'usa' sort y desc
mark bar encoding x main_industry y aggregate count main_industry transform filter main_industry != 'united states' sort y desc

19 . 8 . id - int , name - str , headquarters - str , industry - str , sales_billion - float , profits_billion - float , assets_billion - float , market_value_billion - float | , and order by the y axis in ascending .
mark bar encoding x industry y aggregate count industry transform sort y asc
mark bar encoding x headquarters y aggregate none market_value_billion transform sort 

KeyboardInterrupt: 

In [4]:
source = test_df['source'][100]

In [5]:
input_ids = tokenizer(source, return_tensors="pt", max_length=512, padding=True, truncation=True).input_ids 
outputs = model.generate(input_ids)

decoded = ''.join(tokenizer.convert_ids_to_tokens(outputs[0])[1:-1]).replace('▁', ' ').strip()

In [6]:
source

'11 . 7 . id - int , train_number - int , name - str , origin - str , destination - str , time - str , interval - str | find the number of trains starting from each origin plot them as bar chart , and order in asc by the y-axis .'

In [7]:
decoded

'mark bar encoding x origin y aggregate count origin transform sort y asc'

In [14]:
tokenizer = AutoTokenizer.from_pretrained('./results_causal/', local_files_only=True)
model = AutoModelForCausalLM.from_pretrained('./results_causal/', local_files_only=True)

In [102]:
source = '10 . 9 . company_id - int , rank - int , company - str , headquarters - str , main_industry - str , sales_billion - float , profits_billion - float , assets_billion - float , market_value - float | what is the market value of every'

input_ids = tokenizer(source, return_tensors="pt").input_ids 

logits = model(input_ids).logits[:, -1, :]

pred_ids = torch.argsort(logits)[0, -5:]
pred_words = [tokenizer.decode(pred_id) for pred_id in pred_ids]

In [103]:
pred_words

[' major', ' industry', ' asset', ' manufacturer', ' company']

In [94]:
pred_ids = torch.argsort(logits)[0, -5:]
probs = logits[0][pred_ids] 
probs

tensor([-113.4453, -113.0419, -111.8864, -111.2938, -106.9644],
       grad_fn=<IndexBackward>)

In [101]:
logits[0][pred_ids]

tensor([-113.4453, -113.0419, -111.8864, -111.2938, -106.9644],
       grad_fn=<IndexBackward>)

In [96]:
probs

array([0.20380668, 0.203082  , 0.20100617, 0.19994149, 0.19216363],
      dtype=float32)

In [97]:
pred_ids

tensor([ 1688,  2831, 11171, 11554,  1664])