In [None]:
# Log in to the Hugging Face Hub from inside the notebook.
# NOTE(review): no cell visible in this notebook pushes to the Hub or loads a
# private repository, so this login looks optional here - confirm before
# treating it as a required step.
from huggingface_hub import notebook_login

notebook_login()

In [1]:
import json
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report

2023-06-13 11:41:34.161226: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:

from torch.optim import AdamW

In [3]:
# Define Dataset Class
class SQLDataset(Dataset):
    """Torch dataset of (question, SQL answer) pairs read from a JSON file.

    The JSON file must hold a list of objects, each with a "question" and an
    "answer" key. Every sample is tokenized as a sentence pair and
    padded/truncated to a fixed length.

    NOTE(review): the label is 1 whenever "answer" is non-empty and 0
    otherwise. If every record carries an answer, the dataset contains a
    single class and a classifier trained on it cannot learn a useful decision
    boundary - verify that the data really includes negative examples.

    Args:
        data_path: Path of the JSON file to load.
        tokenizer: Hugging Face tokenizer; it is called directly with the
            question/answer pair.
        max_length: Fixed sequence length used for padding and truncation.
    """

    def __init__(self, data_path, tokenizer, max_length=128):
        self.data = self.load_data(data_path)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of records loaded from the JSON file."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize record ``index`` and return model-ready tensors.

        Returns:
            dict with "input_ids" and "attention_mask" (1-D tensors of length
            ``max_length``) and a scalar int64 "labels" tensor.

        Raises:
            KeyError: if the record lacks "question" or "answer".
        """
        item = self.data[index]
        question = item["question"]
        sql = item["answer"]  # the SQL text is stored under the "answer" key

        # Calling the tokenizer directly is the supported entry point for
        # encoding a (text, text_pair) sample.
        encoded_inputs = self.tokenizer(
            question,
            sql,
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
            return_tensors="pt",
        )

        # return_tensors="pt" adds a leading batch dimension of size 1. Drop
        # only that dimension: a bare squeeze() would also collapse the
        # sequence dimension when max_length == 1.
        input_ids = encoded_inputs["input_ids"].squeeze(0)
        attention_mask = encoded_inputs["attention_mask"].squeeze(0)

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": torch.tensor(1 if sql else 0, dtype=torch.long),
        }

    def load_data(self, data_path):
        """Read and return the parsed JSON content of ``data_path``."""
        # Explicit UTF-8 so non-ASCII questions load the same on every
        # platform instead of depending on the locale default encoding.
        with open(data_path, "r", encoding="utf-8") as file:
            return json.load(file)



In [4]:

# Function to Compute Metrics
def compute_metrics(labels, preds):
    """Score raw model outputs against the reference labels.

    Args:
        labels: Reference class indices, one per sample.
        preds: Per-class scores; the arg-max over dim 1 is taken as the
            predicted class for each row.

    Returns:
        dict with "accuracy" (result of ``accuracy_score``) and
        "classification_report" (result of ``classification_report``).
    """
    predicted_classes = torch.argmax(preds, dim=1)
    return {
        "accuracy": accuracy_score(labels, predicted_classes),
        "classification_report": classification_report(labels, predicted_classes),
    }

In [6]:
# Set Paths
train_data_path = "sqlData-train.json"
test_data_path = "sqlData-test.json"
output_dir = "./fine_tuned_model"  # the fine-tuned model and tokenizer are saved here

# Set Hyperparameters
batch_size = 16
learning_rate = 2e-5
num_epochs = 3
num_labels = 2  # size of the classification head requested from the model

# Reproducibility: seed torch's RNG before any stochastic step runs (the
# shuffled training DataLoader and the freshly initialised classifier head),
# so Restart & Run All reproduces the reported numbers.
seed = 42
torch.manual_seed(seed)

# Load Tokenizer
# Uses the same "bert-base-uncased" checkpoint name as the model loaded later
# in this notebook, so token ids line up with the model's embedding table.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [7]:
# Load Data
# Each SQLDataset reads its whole JSON file at construction time; tokenization
# happens later, per sample, inside __getitem__.
train_dataset = SQLDataset(train_data_path, tokenizer)
test_dataset = SQLDataset(test_data_path, tokenizer)

In [8]:
# Create Data Loaders
# Only the training loader shuffles; the test loader keeps the dataset order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)


In [None]:
# Load BERT Model
# Pretrained "bert-base-uncased" with a sequence-classification head sized by
# num_labels. Attention maps and hidden states are switched off because only
# the loss and logits are read from the outputs in this notebook.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
    output_attentions=False,
    output_hidden_states=False,
)


In [11]:
# Optimizer
# AdamW over all model parameters. No learning-rate scheduler is created in
# this notebook, so the learning rate stays at `learning_rate` throughout.
optimizer = AdamW(model.parameters(), lr=learning_rate)


In [13]:
# Training Loop
# Fine-tunes `model` for `num_epochs` passes over `train_loader` and prints the
# mean batch loss after each epoch.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

for epoch in range(num_epochs):
    total_loss = 0.0
    for batch in train_loader:
        # Clear gradients *before* the forward/backward pass. If an earlier run
        # of this cell was interrupted between backward() and step(), the
        # leftover gradients would otherwise be added to the first update.
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Passing `labels` makes the model return the loss alongside the logits.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )

        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        # Cap the global gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs} - Average Loss: {avg_loss:.4f}")

Epoch 1/3 - Average Loss: 0.1090
Epoch 2/3 - Average Loss: 0.0023
Epoch 3/3 - Average Loss: 0.0008


In [15]:
# Save the fine-tuned model
# The model and the tokenizer are written to the same directory so that both
# can be reloaded from it with from_pretrained later in the notebook.
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.txt',
 './fine_tuned_model/added_tokens.json')

In [16]:
# Evaluation
# Runs the model over `test_loader` without gradient tracking, then reports the
# mean batch loss, accuracy and the per-class classification report.
model.eval()
eval_loss = 0.0
batch_logits = []  # per-batch logits, moved to CPU
batch_targets = []  # per-batch reference labels, moved to CPU

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        batch_labels = batch["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=batch_labels,
        )

        eval_loss += outputs.loss.item()
        batch_logits.append(outputs.logits.detach().cpu())
        batch_targets.append(batch_labels.detach().cpu())

eval_loss /= len(test_loader)
preds = torch.cat(batch_logits, dim=0)
labels = torch.cat(batch_targets, dim=0)

metrics = compute_metrics(labels, preds)
accuracy = metrics["accuracy"]
# Keep the report text under its own name. Binding it to `classification_report`
# would overwrite the imported sklearn function, so re-running this cell (or
# calling compute_metrics again) would fail with "'str' object is not callable".
report_text = metrics["classification_report"]

print(f"\nEvaluation Loss: {eval_loss:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"\nClassification Report:\n{report_text}")


Evaluation Loss: 0.0005
Accuracy: 1.0000

Classification Report:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       119

    accuracy                           1.00       119
   macro avg       1.00      1.00      1.00       119
weighted avg       1.00      1.00      1.00       119



In [17]:
# Reload the fine-tuned model from the directory it was saved to. Using
# `output_dir` (instead of repeating the path as a literal) keeps this cell in
# sync with the save step if the output location is ever changed.
model = BertForSequenceClassification.from_pretrained(output_dir)


In [18]:
# Reload the tokenizer saved next to the fine-tuned model; `output_dir` keeps
# this in sync with the save step.
tokenizer = BertTokenizer.from_pretrained(output_dir)
input_text = "How much deep sleep I got last night?"
# NOTE(review): training samples were encoded as (question, SQL) pairs, but
# only the question is encoded here. Confirm that single-sentence input is what
# the model is meant to receive at inference time.
encoded_input = tokenizer(
    input_text,
    padding="max_length",
    max_length=128,
    truncation=True,
    return_tensors="pt",
)


In [19]:
# Single forward pass over the encoded sample with gradient tracking disabled.
model.eval()
input_ids = encoded_input["input_ids"]
attention_mask = encoded_input["attention_mask"]
with torch.no_grad():
    outputs = model(attention_mask=attention_mask, input_ids=input_ids)

# The index of the largest logit is taken as the predicted class.
predicted_class = outputs.logits.argmax(dim=1).item()


In [20]:
# Display the predicted class index for the sample question.
# NOTE(review): the classification report printed earlier in this notebook
# lists only class 1 in the test data, so a prediction of 1 here may not be
# informative - confirm the data contains both classes.
predicted_class

1