In [19]:
import json
from sklearn.model_selection import train_test_split
from transformers import (
    TextDataset,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
)
from transformers import BartTokenizer, BartForConditionalGeneration

# Load the training and testing datasets
with open('sqlData-train.json') as train_file:
    train_data = json.load(train_file)

with open('sqlData-test.json') as test_file:
    test_data = json.load(test_file)

# Split the training dataset into training and validation sets
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)


In [20]:
#tokenizer = T5Tokenizer.from_pretrained('t5-base')

tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

def format_dataset(data):
    formatted_data = []
    for item in data:
        formatted_data.append(
            {
                'question': item['question'],
                'answer': f"translate English to SQL: {item['answer']}",
            }
        )
    return formatted_data

train_data = format_dataset(train_data)
val_data = format_dataset(val_data)
test_data = format_dataset(test_data)


def tokenize_data(data):
    tokenized_data = tokenizer.batch_encode_plus(
        [(item['question'], item['answer']) for item in data],
        truncation=True,
        padding='max_length',
        max_length=512,
        return_tensors='pt',
    )
    return tokenized_data

train_data = tokenize_data(train_data)
val_data = tokenize_data(val_data)
test_data = tokenize_data(test_data)



In [21]:
#model = T5ForConditionalGeneration.from_pretrained('t5-base')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')

#data_collator = DataCollatorForText2TextGeneration(tokenizer=tokenizer, padding=True)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=1e-4,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=data_collator,
)


In [22]:
trainer.train()

KeyError: 'Indexing with integers (to access backend Encoding for a given batch index) is not available when using Python based tokenizers'