In [36]:
# Import Libraries
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, BertConfig,BertForSequenceClassification,  get_linear_schedule_with_warmup

from torch.optim import AdamW
from sklearn.preprocessing import LabelEncoder


In [37]:
# Step 1: Load and Preprocess the Data
class SQLDataset(Dataset):
    def __init__(self, data_path, tokenizer, label_encoder):
        self.data = []
        self.tokenizer = tokenizer
        self.label_encoder = label_encoder

        with open(data_path, 'r') as file:
            raw_data = json.load(file)
        
        for item in raw_data:
            question = item['question']
            answer = item['answer']
            self.data.append((question, answer))
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        question, answer = self.data[index]
        encoded_input = self.tokenizer.encode_plus(
            question,
            max_length=128,  # Adjust according to your needs
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        input_ids = encoded_input['input_ids'].squeeze()
        attention_mask = encoded_input['attention_mask'].squeeze()
        
        answer_encoded = self.label_encoder.transform([answer])
        answer_tensor = torch.tensor(answer_encoded, dtype=torch.long)
        
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'answer': answer_tensor
        }



In [51]:
# Step 2: Initialize Label Encoder and Tokenizer

label_encoder = LabelEncoder()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


# Step 3: Load and Encode the Data
train_dataset = SQLDataset('sqlData-train.json', tokenizer, label_encoder)
test_dataset = SQLDataset('sqlData-test.json', tokenizer, label_encoder)

# Fit the label encoder on the combined SQL statements
all_labels = [item[1] for item in train_dataset.data] + [item[1] for item in test_dataset.data]
label_encoder.fit(all_labels)

# Get the unique labels
unique_labels = label_encoder.classes_
num_labels = len(unique_labels)
num_labels

269

In [46]:
# Step 4: Create Data Loaders
batch_size = 16  # Adjust according to your system resources
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)


In [54]:

# Step 5: Define the Model
class TextToSQLModel(torch.nn.Module):
    def __init__(self, encoder, encoder_config, decoder_config, num_labels):
        super(TextToSQLModel, self).__init__()
        self.encoder = encoder.from_pretrained('bert-base-uncased', config=encoder_config)
        self.decoder = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=decoder_config)
        self.decoder.classifier = torch.nn.Linear(self.decoder.classifier.in_features, num_labels)
    
    def forward(self, input_ids, attention_mask):
        encoder_output = self.encoder(input_ids, attention_mask=attention_mask)
        pooled_output = encoder_output.pooler_output
        decoder_output = self.decoder(pooled_output)
        return decoder_output.logits
    

In [55]:
# Step 6: Train the Model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

encoder_config = BertConfig()
decoder_config = BertConfig(num_labels=num_labels)
model = TextToSQLModel(BertModel, encoder_config, decoder_config, num_labels)
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 5  # Adjust as needed

total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

loss_fn = torch.nn.CrossEntropyLoss()

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        answer = batch['answer'].squeeze().to(device)

        model.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.view(-1)  # Reshape the logits tensor

        loss = loss_fn(logits, answer)
        total_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}')

# Step 7: Evaluate the Model
model.eval()
correct_predictions = 0
total_predictions = 0

with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        answer = batch['answer'].squeeze().to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.view(-1)

        _, predicted_labels = torch.max(logits, 1)
        total_predictions += answer.size(0)
        correct_predictions += (predicted_labels == answer).sum().item()

accuracy = correct_predictions / total_predictions
print(f'Test Accuracy: {accuracy:.4f}')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerN

RuntimeError: The expanded size of the tensor (768) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [16, 768].  Tensor sizes: [1, 512]