In [1]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration

from torch.optim import AdamW

2023-06-14 04:47:47.443443: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Step 3: Load and Preprocess the Data
class SQLDataset(Dataset):
    """Question/answer pairs read from a JSON file and tokenized on access.

    The file must hold a JSON array of objects, each with a ``'question'``
    and an ``'answer'`` key. The pairs are stored as raw strings; tokenization
    happens lazily in ``__getitem__``.

    Args:
        data_path: Path of the JSON file to load.
        tokenizer: Callable tokenizer. ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')``
            must return a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors of shape ``(1, max_length)``.
        max_length: Length every question and answer is padded/truncated to.
            Defaults to 512, the value that was previously hard-coded.
    """

    def __init__(self, data_path, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Explicit encoding so loading does not depend on the platform default.
        with open(data_path, 'r', encoding='utf-8') as file:
            raw_data = json.load(file)

        self.data = [(item['question'], item['answer']) for item in raw_data]

    def __len__(self):
        """Return the number of question/answer pairs."""
        return len(self.data)

    def _encode(self, text):
        """Tokenize ``text`` and return ``(token_ids, attention_mask)`` as 1-D tensors."""
        encoded = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # squeeze(0) drops only the batch dimension; a bare squeeze() would
        # also collapse a length-1 sequence into a 0-d tensor.
        token_ids = encoded['input_ids'].squeeze(0)
        # Use the mask produced by the tokenizer instead of `token_ids.bool()`,
        # which is only correct when the pad id is 0 and no real token has
        # id 0. Cast to bool to keep the dtype callers already receive.
        attention_mask = encoded['attention_mask'].squeeze(0).bool()
        return token_ids, attention_mask

    def __getitem__(self, index):
        """Return the tokenized pair at ``index``.

        Returns:
            dict with ``'input_ids'`` / ``'attention_mask'`` for the question
            and ``'target_ids'`` / ``'target_attention_mask'`` for the answer.
            Ids are 1-D tensors of length ``max_length``; masks are bool
            tensors of the same length.
        """
        question, answer = self.data[index]
        input_ids, attention_mask = self._encode(question)
        target_ids, target_attention_mask = self._encode(answer)
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'target_ids': target_ids,
            'target_attention_mask': target_attention_mask
        }

# One tokenizer instance is shared by both dataset splits.
tokenizer = T5Tokenizer.from_pretrained('t5-base')
train_dataset, test_dataset = (
    SQLDataset(split_path, tokenizer)
    for split_path in ('sqlData-train.json', 'sqlData-test.json')
)


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [3]:
# Step 4: Create Data Loaders
# Only the training split is shuffled; the test split keeps file order.
batch_size = 2  # Adjust according to your system resources
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
)


In [4]:

# Step 5: Define the Model
# Load the 't5-base' checkpoint as the model to fine-tune.
model = T5ForConditionalGeneration.from_pretrained(
    pretrained_model_name_or_path='t5-base',
)

In [None]:
# Step 6: Train the Model
# Fine-tunes `model` on `train_dataloader` and prints the mean loss per epoch.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = AdamW(model.parameters(), lr=1e-4)

num_epochs = 3  # Adjust as needed
total_steps = len(train_dataloader) * num_epochs

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        target_ids = batch['target_ids'].to(device)
        target_attention_mask = batch['target_attention_mask'].to(device)

        # Padding positions are set to -100 so the cross-entropy loss ignores
        # them; otherwise the loss is dominated by pad tokens, since every
        # target is padded to a fixed length.
        labels = target_ids.masked_fill(~target_attention_mask.bool(), -100)

        # Pass only `labels`: T5 derives the decoder inputs by shifting the
        # labels right and prepending the decoder start token. Slicing the
        # targets by hand (decoder_input_ids=targets[:, :-1],
        # labels=targets[:, 1:]) never feeds the start token and never trains
        # the first target token, which does not match how generate() decodes.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # max(..., 1) avoids a ZeroDivisionError when the dataloader is empty.
    avg_loss = total_loss / max(len(train_dataloader), 1)
    print(f'Epoch {epoch+1}/{num_epochs} - Average Loss: {avg_loss:.4f}')