<a href="https://colab.research.google.com/github/vishal7379/Colab/blob/main/NL_2_SQL_Complete.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
!pip install -q transformers datasets accelerate sentencepiece evaluate

In [1]:
import random
import torch

from datasets import Dataset
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments
)

torch.backends.cuda.matmul.allow_tf32 = True  # free speed on T4

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

Device: cuda


In [2]:
model_name = "t5-base"

tokenizer = T5Tokenizer.from_pretrained(model_name)

model = T5ForConditionalGeneration.from_pretrained(model_name)

model.gradient_checkpointing_enable()   # MUST for T4
model.to(device)

model.generation_config.max_length = 128
model.generation_config.num_beams = 4

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading weights:   0%|          | 0/257 [00:00<?, ?it/s]

In [3]:
SELECT_WORDS = [
    "list", "show", "display", "fetch", "retrieve",
    "give me", "return", "provide"
]

GT_WORDS = [
    "greater than", "above", "higher than",
    "more than", "over", "exceeding"
]

AGG_MAP = {
    "average": "AVG",
    "mean": "AVG",
    "avg": "AVG",

    "sum": "SUM",
    "total": "SUM",

    "minimum": "MIN",
    "lowest": "MIN",

    "maximum": "MAX",
    "highest": "MAX"
}

COUNT_WORDS = [
    "count", "number of", "total"
]

In [4]:
COLUMNS = [
    "id","name","age","salary","price",
    "quantity","score","rating","dept_id"
]

def random_identifier(prefix):
    return prefix + "_" + str(random.randint(1, 99999))

def create_schema():
    t1 = random_identifier("table")
    t2 = random_identifier("table")

    cols1 = random.sample(COLUMNS, 4)
    cols2 = random.sample(COLUMNS, 4)

    schema = f"""
table {t1}({', '.join(cols1)})
table {t2}({', '.join(cols2)})
"""
    return t1, t2, cols1, cols2, schema

In [5]:
def generate_example():
    t1, t2, c1, c2, schema = create_schema()

    col = random.choice(c1)
    col2 = random.choice(c1)

    examples = []

    # SELECT
    q = f"{random.choice(SELECT_WORDS)} {col} from {t1}"
    sql = f"SELECT {col} FROM {t1};"
    examples.append((schema, q, sql))


    # WHERE
    q = f"{random.choice(SELECT_WORDS)} {col} from {t1} where {col2} is {random.choice(GT_WORDS)} 10"
    sql = f"SELECT {col} FROM {t1} WHERE {col2} > 10;"
    examples.append((schema, q, sql))


    # AGGREGATION
    agg_word, agg_sql = random.choice(list(AGG_MAP.items()))

    q = f"what is the {agg_word} {col} in {t1}"
    sql = f"SELECT {agg_sql}({col}) FROM {t1};"
    examples.append((schema, q, sql))


    # AGG + WHERE (VERY IMPORTANT)
    q = f"what is the {agg_word} {col} in {t1} where {col2} is {random.choice(GT_WORDS)} 5"
    sql = f"SELECT {agg_sql}({col}) FROM {t1} WHERE {col2} > 5;"
    examples.append((schema, q, sql))


    # COUNT
    q = f"{random.choice(COUNT_WORDS)} rows in {t1}"
    sql = f"SELECT COUNT(*) FROM {t1};"
    examples.append((schema, q, sql))


    # GROUP BY
    q = f"{random.choice(COUNT_WORDS)} rows grouped by {col}"
    sql = f"SELECT {col}, COUNT(*) FROM {t1} GROUP BY {col};"
    examples.append((schema, q, sql))


    # HAVING
    q = f"{random.choice(COUNT_WORDS)} rows grouped by {col} having count {random.choice(GT_WORDS)} 5"
    sql = f"SELECT {col}, COUNT(*) FROM {t1} GROUP BY {col} HAVING COUNT(*) > 5;"
    examples.append((schema, q, sql))


    # JOIN
    join_col = random.choice(list(set(c1).intersection(set(c2))) or ["id"])

    q = f"{random.choice(SELECT_WORDS)} {col} by joining {t1} and {t2} on {join_col}"
    sql = f"""
SELECT {col}
FROM {t1}
JOIN {t2}
ON {t1}.{join_col} = {t2}.{join_col};
"""
    examples.append((schema, q, sql))

    return examples

In [6]:
data = []

for _ in range(8000):   # ~64k STRONG samples
    for schema, question, sql in generate_example():

        inp = f"""
Translate English to SQL.

Schema:
{schema}

Question:
{question}
"""

        data.append({
            "input": inp.strip(),
            "output": sql.strip()
        })

print("Dataset size:", len(data))

Dataset size: 64000


In [7]:
dataset = Dataset.from_list(data)
dataset = dataset.train_test_split(test_size=0.1)

train_ds = dataset["train"]
val_ds = dataset["test"]

In [8]:
MAX_INPUT = 256
MAX_TARGET = 128


def tokenize(batch):

    # Tokenize inputs
    model_inputs = tokenizer(
        batch["input"],
        max_length=MAX_INPUT,
        truncation=True,
        padding=False   # VERY IMPORTANT → dynamic padding later
    )

    # Tokenize targets
    labels = tokenizer(
        batch["output"],
        max_length=MAX_TARGET,
        truncation=True,
        padding=False
    )

    model_inputs["labels"] = labels["input_ids"]

    return model_inputs


train_ds = train_ds.map(
    tokenize,
    batched=True,
    remove_columns=train_ds.column_names  # prevents trainer warnings + saves RAM
)

val_ds = val_ds.map(
    tokenize,
    batched=True,
    remove_columns=val_ds.column_names
)

Map:   0%|          | 0/57600 [00:00<?, ? examples/s]

Map:   0%|          | 0/6400 [00:00<?, ? examples/s]

In [9]:
training_args = TrainingArguments(
    output_dir="./t5-nl2sql",

    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,

    learning_rate=7e-5,
    weight_decay=0.01,

    num_train_epochs=6,

    eval_strategy="epoch",
    save_strategy="epoch",

    logging_steps=100,

    fp16=True,
    save_total_limit=2,
    report_to="none"
)

In [10]:
# Data collator (handles dynamic padding + label masking correctly)
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model
)


trainer = Trainer(
    model=model,
    args=training_args,

    train_dataset=train_ds,
    eval_dataset=val_ds,

    processing_class=tokenizer,   # REQUIRED in Transformers 5.x
    data_collator=data_collator
)

In [11]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.006712,0.001959
2,0.004597,0.000955


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [12]:
trainer.save_model("t5-nl2sql-final")
tokenizer.save_pretrained("t5-nl2sql-final")

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

('t5-nl2sql-final/tokenizer_config.json', 't5-nl2sql-final/tokenizer.json')

In [1]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

device = "cuda" if torch.cuda.is_available() else "cpu"

model_path = "t5-nl2sql-final"

tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path).to(device)

model.eval()   # VERY IMPORTANT

Loading weights:   0%|          | 0/257 [00:00<?, ?it/s]

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [2]:
def generate_sql(schema, question):

    prompt = f"""
Translate English to SQL.

Schema:
{schema}

Question:
{question}
"""

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=256
    ).to(device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=128,   # better than max_length for generation
            num_beams=5,          # improves correctness
            early_stopping=True
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [3]:
schema = """
astronauts(id, mission_hours, agency_id)
agencies(id, agency_name)
"""

In [6]:
print(generate_sql(
    schema,
    "what is the average mission_hours for astronauts group by agency_id"
))

SELECT AVG(mission_hours) FROM astronauts GROUP BY agency_id;
