In [21]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import Trainer, TrainingArguments
from datasets import load_dataset


In [22]:
train_data = load_dataset('json', data_files='sqlData-train.json')['train']
test_data = load_dataset('json', data_files='sqlData-test.json')['train']


Found cached dataset json (/home/studio-lab-user/.cache/huggingface/datasets/json/default-d2d8c133d9f8e450/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset json (/home/studio-lab-user/.cache/huggingface/datasets/json/default-18de05e3278fbf27/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

In [23]:
def preprocess_data(examples):
    table_name = "oura_sleep"  # Replace "your_table_name" with the actual name of your table
    examples['input_text'] = examples['question'] + " table: " + table_name
    examples['target_text'] = examples['answer']
    return examples


In [24]:
train_data = train_data.map(preprocess_data)
test_data = test_data.map(preprocess_data)

Loading cached processed dataset at /home/studio-lab-user/.cache/huggingface/datasets/json/default-d2d8c133d9f8e450/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-3edc1a2f53039c70.arrow
Loading cached processed dataset at /home/studio-lab-user/.cache/huggingface/datasets/json/default-18de05e3278fbf27/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-a4af2438f027f0ad.arrow


In [25]:
tokenizer = T5Tokenizer.from_pretrained('t5-base')

def tokenize_function(examples):
    input_text = examples['input_text']
    target_text = examples['target_text']
    tokenized_inputs = tokenizer.batch_encode_plus(
        input_text, 
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors='pt'
    )
    tokenized_targets = tokenizer.batch_encode_plus(
        target_text, 
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors='pt'
    )
    tokenized_inputs['labels'] = tokenized_targets['input_ids']
    return tokenized_inputs

train_data = train_data.map(tokenize_function, batched=True)
test_data = test_data.map(tokenize_function, batched=True)


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Loading cached processed dataset at /home/studio-lab-user/.cache/huggingface/datasets/json/default-d2d8c133d9f8e450/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-de60f28021a1778c.arrow
Loading cached processed dataset at /home/studio-lab-user/.cache/huggingface/datasets/json/default-18de05e3278fbf27/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-d8f8333231e9d63e.arrow


In [28]:
model = T5ForConditionalGeneration.from_pretrained('t5-base')
model.resize_token_embeddings(len(tokenizer))
per_device_train_batch_size = 2  # 4 Increase or decrease the batch size as needed
num_train_epochs = 1  # 3 Reduce the number of epochs as needed


training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data
)

trainer.train()





Step,Training Loss
10,17.2476
20,17.6241
30,16.8989
40,14.0333
50,12.7786
60,11.8041
70,9.9005
80,7.5324
90,5.0712
100,2.9778


TrainOutput(global_step=237, training_loss=5.136633045059719, metrics={'train_runtime': 6181.8562, 'train_samples_per_second': 0.077, 'train_steps_per_second': 0.038, 'total_flos': 288037082234880.0, 'train_loss': 5.136633045059719, 'epoch': 1.0})

In [29]:
eval_results = trainer.evaluate(eval_dataset=test_data)
print(eval_results)

{'eval_loss': 0.04891989007592201, 'eval_runtime': 226.531, 'eval_samples_per_second': 0.525, 'eval_steps_per_second': 0.265, 'epoch': 1.0}


In [30]:
def generate_sql_query(input_text):
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    output = model.generate(input_ids, max_length=512)
    decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
    return decoded_output

In [32]:
input_text = "How long I had deep sleep last night?"
sql_query = generate_sql_query(input_text)
print(sql_query)

How long did it take to get deep sleep last night?


In [34]:
def generate_sql_query(input_text):
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    output = model.generate(input_ids, max_length=512, num_beams=2, early_stopping=True)
    decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
    return decoded_output

input_text = "What is the total sales for each category?"
sql_query = generate_sql_query(input_text)
print(sql_query)


Was is the total sales for each category?
