In [1]:
import json

# Load the training dataset from JSON file
with open("sqlData-train.json", "r") as file:
    train_data = json.load(file)

# Extract questions and queries from the training data
train_questions = []
train_queries = []
for entry in train_data:
    train_questions.append(entry["question"])
    train_queries.append(entry["answer"])


In [2]:
# Load the testing dataset from JSON file
with open("sqlData-test.json", "r") as file:
    test_data = json.load(file)

# Extract questions and queries from the testing data
test_questions = []
test_queries = []
for entry in test_data:
    test_questions.append(entry["question"])
    test_queries.append(entry["answer"])


In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration


In [4]:

# Define the fixed schema attributes
schema_attributes = [
  "summary_date",
  "bedtime_start",
  "bedtime_end",
  "score",
  "score_total",
  "score_disturbances",
  "score_efficiency",
  "score_latency",
  "score_rem",
  "score_deep",
  "score_alignment",
  "total",
  "duration",
  "awake",
  "light",
  "rem",
  "deep",
  "onset_latency",
  "restless",
  "efficiency",
  "midpoint_time",
  "hr_lowest",
  "hr_average",
  "rmssd",
  "breath_average",
  "temperature_delta",
  "hypnogram_5min",
  "hr_5min",
  "rmssd_5min"
]


In [5]:
tokenizer = T5Tokenizer.from_pretrained("t5-base")

# Prepare input sequences with fixed schema attributes for training
train_input_sequences = []
for question in train_questions:
    train_input_sequence = f"question: {question} table: {', '.join(schema_attributes)}"
    train_input_sequences.append(train_input_sequence)

train_input_encodings = tokenizer.batch_encode_plus(
    train_input_sequences,
    padding=True,
    truncation=True,
    return_tensors="pt"
)
train_input_ids = train_input_encodings["input_ids"]
train_attention_mask = train_input_encodings["attention_mask"]

# Prepare input sequences with fixed schema attributes for testing
test_input_sequences = []
for question in test_questions:
    test_input_sequence = f"question: {question} table: {', '.join(schema_attributes)}"
    test_input_sequences.append(test_input_sequence)

test_input_encodings = tokenizer.batch_encode_plus(
    test_input_sequences,
    padding=True,
    truncation=True,
    return_tensors="pt"
)
test_input_ids = test_input_encodings["input_ids"]
test_attention_mask = test_input_encodings["attention_mask"]

# Prepare target sequences (SQL queries) for training and testing
train_target_encodings = tokenizer.batch_encode_plus(
    train_queries,
    padding=True,
    truncation=True,
    return_tensors="pt"
)
train_target_ids = train_target_encodings["input_ids"]
train_target_attention_mask = train_target_encodings["attention_mask"]

test_target_encodings = tokenizer.batch_encode_plus(
    test_queries,
    padding=True,
    truncation=True,
    return_tensors="pt"
)
test_target_ids = test_target_encodings["input_ids"]
test_target_attention_mask = test_target_encodings["attention_mask"]


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
import torch
from torch.optim import AdamW


# Model Training
model = T5ForConditionalGeneration.from_pretrained("t5-base")


# Set up the training parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Train the model
num_epochs = 5
batch_size = 2
for epoch in range(num_epochs):
    epoch_loss = 0.0
    for i in range(0, len(train_input_ids), batch_size):
        input_batch = train_input_ids[i : i + batch_size].to(device)
        attention_mask_batch = train_attention_mask[i : i + batch_size].to(device)
        target_batch = train_target_ids[i : i + batch_size].to(device)
        target_attention_mask_batch = train_target_attention_mask[i : i + batch_size].to(device)

        model.zero_grad()
        outputs = model(
            input_ids=input_batch,
            attention_mask=attention_mask_batch,
            labels=target_batch,
            decoder_attention_mask=target_attention_mask_batch,
            return_dict=True
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs} | Loss: {epoch_loss}")

#import pandas as pd
    
# Model Evaluation
#import json

# Model Evaluation
model.eval()

test_predictions = []
for i in range(0, len(test_input_ids), batch_size):
    input_batch = test_input_ids[i : i + batch_size].to(device)
    attention_mask_batch = test_attention_mask[i : i + batch_size].to(device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_batch,
            attention_mask=attention_mask_batch,
            max_length=100,
            num_beams=4,
            early_stopping=True
        )

    decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    test_predictions.extend(decoded_outputs)




Epoch 1/5 | Loss: 89.04612409707624
Epoch 2/5 | Loss: 13.432238040608354


In [9]:
test_predictions

['Select rmssd from oura_sleep where summary_date=LATEST',
 'Select score_latency from oura_sleep where summary_date=LATEST',
 'Select duration from oura_sleep where summary_date=LATEST',
 'Select deep from oura_sleep where summary_date=LATEST',
 'Select hr_lowest from oura_sleep where summary_date=LATEST',
 'Select score_alignment from oura_sleep where summary_date=LATEST',
 'Select score_disturbances from oura_sleep where summary_date=LATEST',
 'Select score_deep from oura_sleep where summary_date=LATEST',
 'Select score_light from oura_sleep where summary_date=LATEST',
 'Select score_disturbances from oura_sleep where summary_date=LATEST',
 'Select score_efficiency from oura_sleep where summary_date=LATEST',
 'Select score from oura_sleep where summary_date=LATEST',
 'Select hr_average from oura_sleep where summary_date=LATEST',
 'Select awake from oura_sleep where summary_date=LATEST',
 'Select rmssd from oura_sleep where summary_date=LATEST',
 'Select efficiency from oura_sleep wh

In [None]:
    
# Create a list of dictionaries for each prediction
predictions_list = []
for question, prediction in zip(test_questions, test_predictions):
    prediction_entry = {
        "question": question,
        "prediction": prediction
    }
    predictions_list.append(prediction_entry)

# Save the predictions to a JSON file
with open("predictions.json", "w") as file:
    json.dump(predictions_list, file)

In [10]:
# Save the model
model.save_pretrained("t5-test")

In [15]:
# Define the input question
input_question = "when I went to bed yesterday?"

# Tokenize and encode the input question
input_sequence = f"question: {input_question} table: {', '.join(schema_attributes)}"
input_encoding = tokenizer.encode_plus(
    input_sequence,
    padding="longest",
    truncation=True,
    return_tensors="pt"
)
input_ids = input_encoding["input_ids"]
attention_mask = input_encoding["attention_mask"]

# Generate predictions
output_ids = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=100,
    num_beams=4,
    early_stopping=True
)

# Decode the output predictions
output_predictions = tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Print the output predictions
print("Input Question:", input_question)
print("Predicted SQL Query:", output_predictions)


Input Question: when I went to bed yesterday?
Predicted SQL Query: Select score from oura_sleep where summary_date=LATEST
