<a href="https://colab.research.google.com/github/vishal7379/Colab/blob/main/Welcome_To_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ============================================================
# STEP 1: SYNTHETIC NL → SQL DATASET (LARGE + CORRECT)
# ============================================================

import random
import json

TABLE_SCHEMAS = {
    "employee": ["id", "name", "salary", "department", "age"],
    "orders": ["order_id", "customer_id", "amount", "order_date"],
    "customer": ["customer_id", "name", "city"]
}

AGGS = ["SUM", "AVG", "COUNT", "MAX", "MIN"]
OPS = [">", "<", "="]

def select_all():
    table = random.choice(list(TABLE_SCHEMAS))
    return (
        f"show all {table}",
        f"SELECT * FROM {table}",
        {table: TABLE_SCHEMAS[table]}
    )

def where_clause():
    table = "employee"
    val = random.choice([30000, 50000, 70000])
    op = random.choice(OPS)
    return (
        f"show employees where salary {op} {val}",
        f"SELECT * FROM employee WHERE salary {op} {val}",
        {"employee": TABLE_SCHEMAS["employee"]}
    )

def group_by():
    agg = random.choice(AGGS)
    return (
        f"{agg.lower()} salary per department",
        f"SELECT department, {agg}(salary) FROM employee GROUP BY department",
        {"employee": TABLE_SCHEMAS["employee"]}
    )

generators = [select_all, where_clause, group_by]

def generate_dataset(n=50000):
    data = []
    for _ in range(n):
        q, sql, schema = random.choice(generators)()
        data.append({
            "question": q,
            "sql": sql,
            "schema": schema
        })
    return data

dataset = generate_dataset(50000)

with open("custom_nl2sql.json", "w") as f:
    json.dump(dataset, f, indent=2)

print("✅ Dataset size:", len(dataset))
print(dataset[0])


✅ Dataset size: 50000
{'question': 'show all orders', 'sql': 'SELECT * FROM orders', 'schema': {'orders': ['order_id', 'customer_id', 'amount', 'order_date']}}


In [None]:
# ============================================================
# STEP 2: PREPROCESS DATASET
# ============================================================

import re

def normalize(text):
    return re.sub(r"\s+", " ", text.lower()).strip()

processed = []

for item in dataset:
    schema_text = ""
    for table, cols in item["schema"].items():
        schema_text += f"table {table} columns: {', '.join(cols)} ; "

    encoder_input = schema_text + " question: " + normalize(item["question"])
    decoder_output = normalize(item["sql"])

    processed.append({
        "encoder_input": encoder_input,
        "decoder_output": decoder_output
    })

with open("processed_data.json", "w") as f:
    json.dump(processed, f, indent=2)

print("✅ Example:")
print(processed[0])


✅ Example:
{'encoder_input': 'table orders columns: order_id, customer_id, amount, order_date ;  question: show all orders', 'decoder_output': 'select * from orders'}


In [None]:
# ============================================================
# STEP 3: BUILD VOCAB
# ============================================================

from collections import Counter

def build_vocab(sentences, min_freq=1):
    counter = Counter()
    for s in sentences:
        counter.update(s.split())

    vocab = {"<pad>":0, "<sos>":1, "<eos>":2, "<unk>":3}
    for word, freq in counter.items():
        if freq >= min_freq:
            vocab[word] = len(vocab)
    return vocab

encoder_texts = [x["encoder_input"] for x in processed]
decoder_texts = [x["decoder_output"] for x in processed]

src_vocab = build_vocab(encoder_texts)
tgt_vocab = build_vocab(decoder_texts)

print("✅ Encoder vocab:", len(src_vocab))
print("✅ Decoder vocab:", len(tgt_vocab))


✅ Encoder vocab: 39
✅ Decoder vocab: 27


In [None]:
# ============================================================
# STEP 4: DATASET
# ============================================================

import torch
from torch.utils.data import Dataset, DataLoader

class NL2SQLDataset(Dataset):
    def __init__(self, data, src_vocab, tgt_vocab, max_len=60):
        self.data = data
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.max_len = max_len

    def encode(self, text, vocab):
        tokens = text.split()
        ids = [vocab.get(t, vocab["<unk>"]) for t in tokens]
        ids = ids[:self.max_len]
        return ids + [vocab["<pad>"]] * (self.max_len - len(ids))

    def __getitem__(self, idx):
        src = self.encode(self.data[idx]["encoder_input"], self.src_vocab)
        tgt = [self.tgt_vocab["<sos>"]] + \
              self.encode(self.data[idx]["decoder_output"], self.tgt_vocab) + \
              [self.tgt_vocab["<eos>"]]

        return torch.tensor(src), torch.tensor(tgt)

    def __len__(self):
        return len(self.data)

dataset = NL2SQLDataset(processed, src_vocab, tgt_vocab)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

x, y = next(iter(loader))
print(x.shape, y.shape)


torch.Size([32, 60]) torch.Size([32, 62])


In [None]:
# ============================================================
# STEP 5: ENCODER
# ============================================================

import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_dim=128, hidden_dim=256):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.rnn = nn.GRU(emb_dim, hidden_dim, batch_first=True)

    def forward(self, x):
        emb = self.embedding(x)
        outputs, hidden = self.rnn(emb)
        return outputs, hidden


In [None]:
class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.attn = nn.Linear(hidden_dim * 2, hidden_dim)
        self.v = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        hidden = hidden.unsqueeze(1).repeat(1, encoder_outputs.size(1), 1)
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), 2)))
        scores = self.v(energy).squeeze(2)
        weights = torch.softmax(scores, dim=1)
        context = torch.bmm(weights.unsqueeze(1), encoder_outputs)
        return context.squeeze(1)


In [None]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_dim=128, hidden_dim=256):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.attn = Attention(hidden_dim)
        self.rnn = nn.GRU(emb_dim + hidden_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden, encoder_outputs):
        emb = self.embedding(x.unsqueeze(1))
        context = self.attn(hidden.squeeze(0), encoder_outputs)
        rnn_input = torch.cat((emb, context.unsqueeze(1)), 2)
        output, hidden = self.rnn(rnn_input, hidden)
        return self.fc(output.squeeze(1)), hidden


In [None]:
# ============================================================
# STEP 8: TRAIN
# ============================================================

device = "cuda" if torch.cuda.is_available() else "cpu"

encoder = Encoder(len(src_vocab)).to(device)
decoder = Decoder(len(tgt_vocab)).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(
    list(encoder.parameters()) + list(decoder.parameters()),
    lr=0.001
)

for epoch in range(5):
    total_loss = 0
    for src, tgt in loader:
        src, tgt = src.to(device), tgt.to(device)

        enc_out, hidden = encoder(src)
        dec_input = tgt[:, 0]

        loss = 0
        for t in range(1, tgt.size(1)):
            output, hidden = decoder(dec_input, hidden, enc_out)
            loss += criterion(output, tgt[:, t])
            dec_input = tgt[:, t]

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1} Loss:", total_loss / len(loader))


Epoch 1 Loss: nan


KeyboardInterrupt: 

In [1]:
# ============================================================
# STEP 1: SYNTHETIC NL → SQL DATASET
# ============================================================

import random
import json
import re
import torch
import torch.nn as nn
from collections import Counter
from torch.utils.data import Dataset, DataLoader

TABLE_SCHEMAS = {
    "employee": ["id", "name", "salary", "department", "age"],
    "orders": ["order_id", "customer_id", "amount", "order_date"],
    "customer": ["customer_id", "name", "city"]
}

AGGS = ["SUM", "AVG", "COUNT", "MAX", "MIN"]
OPS = [">", "<", "="]

def select_all():
    table = random.choice(list(TABLE_SCHEMAS))
    return (
        f"show all {table}",
        f"SELECT * FROM {table}",
        {table: TABLE_SCHEMAS[table]}
    )

def where_clause():
    val = random.choice([30000, 50000, 70000])
    op = random.choice(OPS)
    return (
        f"show employees where salary {op} {val}",
        f"SELECT * FROM employee WHERE salary {op} {val}",
        {"employee": TABLE_SCHEMAS["employee"]}
    )

def group_by():
    agg = random.choice(AGGS)
    return (
        f"{agg.lower()} salary per department",
        f"SELECT department, {agg}(salary) FROM employee GROUP BY department",
        {"employee": TABLE_SCHEMAS["employee"]}
    )

dataset = []
for _ in range(50000):
    q, sql, schema = random.choice([select_all, where_clause, group_by])()
    dataset.append({"question": q, "sql": sql, "schema": schema})


# ============================================================
# STEP 2: PREPROCESS
# ============================================================

def normalize(text):
    return re.sub(r"\s+", " ", text.lower()).strip()

processed = []
for item in dataset:
    schema_text = ""
    for t, cols in item["schema"].items():
        schema_text += f"table {t} columns: {', '.join(cols)} ; "
    processed.append({
        "encoder_input": schema_text + " question: " + normalize(item["question"]),
        "decoder_output": normalize(item["sql"])
    })


# ============================================================
# STEP 3: BUILD VOCAB
# ============================================================

def build_vocab(sentences):
    counter = Counter()
    for s in sentences:
        counter.update(s.split())
    vocab = {"<pad>":0, "<sos>":1, "<eos>":2, "<unk>":3}
    for w in counter:
        vocab[w] = len(vocab)
    return vocab

src_vocab = build_vocab([x["encoder_input"] for x in processed])
tgt_vocab = build_vocab([x["decoder_output"] for x in processed])


# ============================================================
# STEP 4: DATASET
# ============================================================

class NL2SQLDataset(Dataset):
    def __init__(self, data, src_vocab, tgt_vocab, max_len=60):
        self.data = data
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.max_len = max_len

    def encode(self, text, vocab):
        ids = [vocab.get(t, vocab["<unk>"]) for t in text.split()]
        return ids[:self.max_len]

    def __getitem__(self, idx):
        # SOURCE (already fixed length)
        src = self.encode(self.data[idx]["encoder_input"], self.src_vocab)
        src = src + [self.src_vocab["<pad>"]] * (self.max_len - len(src))

        # TARGET (FIXED LENGTH NOW)
        tgt = self.encode(self.data[idx]["decoder_output"], self.tgt_vocab)
        tgt = [self.tgt_vocab["<sos>"]] + tgt + [self.tgt_vocab["<eos>"]]

        tgt = tgt[:self.max_len]
        tgt = tgt + [self.tgt_vocab["<pad>"]] * (self.max_len - len(tgt))

        return torch.tensor(src), torch.tensor(tgt)

    def __len__(self):
        return len(self.data)

# ============================================================
# CREATE DATALOADER (MISSING STEP)
# ============================================================

dataset_obj = NL2SQLDataset(processed, src_vocab, tgt_vocab, max_len=60)

loader = DataLoader(
    dataset_obj,
    batch_size=32,
    shuffle=True
)

# ============================================================
# STEP 5–7: MODEL
# ============================================================

class Encoder(nn.Module):
    def __init__(self, vocab, emb=128, hid=256):
        super().__init__()
        self.emb = nn.Embedding(vocab, emb, padding_idx=0)
        self.rnn = nn.GRU(emb, hid, batch_first=True)

    def forward(self, x):
        e = self.emb(x)
        return self.rnn(e)

class Attention(nn.Module):
    def __init__(self, hid):
        super().__init__()
        self.fc = nn.Linear(hid * 2, hid)
        self.v = nn.Linear(hid, 1, bias=False)

    def forward(self, h, enc):
        h = h.unsqueeze(1).repeat(1, enc.size(1), 1)
        e = torch.tanh(self.fc(torch.cat((h, enc), 2)))
        w = torch.softmax(self.v(e).squeeze(2), 1)
        return torch.bmm(w.unsqueeze(1), enc).squeeze(1)

class Decoder(nn.Module):
    def __init__(self, vocab, emb=128, hid=256):
        super().__init__()
        self.emb = nn.Embedding(vocab, emb)
        self.attn = Attention(hid)
        self.rnn = nn.GRU(emb + hid, hid, batch_first=True)
        self.fc = nn.Linear(hid, vocab)

    def forward(self, x, h, enc):
        e = self.emb(x.unsqueeze(1))
        c = self.attn(h.squeeze(0), enc)
        out, h = self.rnn(torch.cat((e, c.unsqueeze(1)), 2), h)
        return self.fc(out.squeeze(1)), h


# ============================================================
# STEP 8: TRAIN (NaN-SAFE)
# ============================================================

device = "cuda" if torch.cuda.is_available() else "cpu"

encoder = Encoder(len(src_vocab)).to(device)
decoder = Decoder(len(tgt_vocab)).to(device)

PAD_IDX = tgt_vocab["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(
    list(encoder.parameters()) + list(decoder.parameters()), lr=0.001
)

for epoch in range(5):
    encoder.train(); decoder.train()
    total_loss = 0

    for src, tgt in loader:
        src, tgt = src.to(device), tgt.to(device)
        enc_out, hidden = encoder(src)

        dec_input = tgt[:, 0]
        loss = 0
        optimizer.zero_grad()

        for t in range(1, tgt.size(1)):
            out, hidden = decoder(dec_input, hidden, enc_out)
            loss += criterion(out, tgt[:, t])
            dec_input = tgt[:, t]

        loss.backward()

        torch.nn.utils.clip_grad_norm_(
            list(encoder.parameters()) + list(decoder.parameters()), 1.0
        )

        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1} | Loss: {total_loss / len(loader):.4f}")


NameError: name 'loader' is not defined