<a href="https://colab.research.google.com/github/vishal7379/Colab/blob/main/NL_SQL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!pip install transformers torch nltk sqlparse




In [6]:
import random, torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from tqdm import tqdm


In [7]:
SCHEMAS = [
    {
        "tables": {
            "employees": ["id","name","salary","dept_id"],
            "departments": ["id","name"]
        },
        "join": ("employees","departments","dept_id","id")
    }
]

AGGS = ["sum","avg","count","max","min"]


In [8]:
def generate_example():
    db = random.choice(SCHEMAS)
    tables = db["tables"]
    main = list(tables.keys())[0]
    cols = tables[main]

    intent = random.choice([
        "SELECT","WHERE","AGG","AGG_WHERE","JOIN","JOIN_WHERE","NESTED"
    ])

    if intent=="SELECT":
        col=random.choice(cols)
        q=f"show {col} of {main}"
        sql=f"SELECT {main}.{col} FROM {main}"

    elif intent=="WHERE":
        col=random.choice(cols)
        val=random.choice([10,20,50,100])
        q=f"get {col} from {main} where {col} > {val}"
        sql=f"SELECT {main}.{col} FROM {main} WHERE {main}.{col} > {val}"

    elif intent=="AGG":
        agg=random.choice(AGGS)
        col=random.choice(cols)
        q=f"show {agg} of {col} from {main}"
        sql=f"SELECT {agg.upper()}({main}.{col}) FROM {main}"

    elif intent=="AGG_WHERE":
        agg=random.choice(AGGS)
        col=random.choice(cols)
        val=random.choice([20,50,100])
        q=f"show {agg} of {col} from {main} where {col} > {val}"
        sql=f"SELECT {agg.upper()}({main}.{col}) FROM {main} WHERE {main}.{col} > {val}"

    elif intent=="JOIN":
        t1,t2,c1,c2=db["join"]
        q=f"show {t1} and {t2} names"
        sql=f"SELECT {t1}.name , {t2}.name FROM {t1} JOIN {t2} ON {t1}.{c1} = {t2}.{c2}"

    elif intent=="JOIN_WHERE":
        t1,t2,c1,c2=db["join"]
        val=random.choice([20,50,100])
        q=f"show {t1} and {t2} names where {t1}.{c1} > {val}"
        sql=f"SELECT {t1}.name , {t2}.name FROM {t1} JOIN {t2} ON {t1}.{c1} = {t2}.{c2} WHERE {t1}.{c1} > {val}"

    else:
        q=f"find employees earning more than average salary"
        sql="SELECT name FROM employees WHERE salary > ( SELECT AVG(salary) FROM employees )"

    return {"question":q,"schema":tables,"sql":sql}
DATA = [generate_example() for _ in range(60000)]


In [9]:
def sql_to_ast(sql):
    sql=sql.lower()
    tokens=["<QUERY>"]

    for kw in ["select","from","join","where","group by","having","order by","intersect","union","except"]:
        if kw in sql:
            tokens.append(f"<{kw.replace(' ','_').upper()}>")

    if sql.count("select") > 1:
        tokens.append("<SUBQUERY>")

    tokens.append("</QUERY>")
    return tokens


In [10]:
AST_VOCAB={"<PAD>":0,"<BOS>":1,"<EOS>":2}
idx=3

def add(tok):
    global idx
    if tok not in AST_VOCAB:
        AST_VOCAB[tok]=idx
        idx+=1

for ex in DATA:
    for t in sql_to_ast(ex["sql"]):
        add(t)

INV_AST_VOCAB={v:k for k,v in AST_VOCAB.items()}


In [11]:
MODEL_NAME="microsoft/MiniLM-L12-H384-uncased"
tokenizer=AutoTokenizer.from_pretrained(MODEL_NAME)

class NL2SQLDataset(Dataset):
    def __init__(self,data): self.data=data
    def __len__(self): return len(self.data)

    def __getitem__(self,i):
        ex=self.data[i]
        schema=" | ".join([f"{t}.{c}" for t,cs in ex["schema"].items() for c in cs])
        text=f"question: {ex['question']} schema: {schema}"

        enc=tokenizer(text,padding="max_length",truncation=True,
                      max_length=128,return_tensors="pt")

        tgt=[AST_VOCAB["<BOS>"]] + \
            [AST_VOCAB[t] for t in sql_to_ast(ex["sql"])] + \
            [AST_VOCAB["<EOS>"]]

        return enc["input_ids"].squeeze(0),enc["attention_mask"].squeeze(0),torch.tensor(tgt)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [12]:
def collate_fn(batch):
    ids, masks, tgts = zip(*batch)

    ids = torch.stack(ids)
    masks = torch.stack(masks)

    max_len = max(len(t) for t in tgts)
    tgt_pad = torch.zeros(len(tgts), max_len, dtype=torch.long)

    for i, t in enumerate(tgts):
        tgt_pad[i, :len(t)] = t

    return ids, masks, tgt_pad


In [13]:
train,val=train_test_split(DATA,test_size=0.1)
train_loader=DataLoader(NL2SQLDataset(train),batch_size=32,shuffle=True,collate_fn=collate_fn)

val_loader=DataLoader(NL2SQLDataset(val),batch_size=32,shuffle=True,collate_fn=collate_fn)


In [14]:
class Encoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = AutoModel.from_pretrained(MODEL_NAME)
        self.dropout = nn.Dropout(0.2)

    def forward(self, ids, mask):
        out = self.model(ids, attention_mask=mask).last_hidden_state
        return self.dropout(out)


class Decoder(nn.Module):
    def __init__(self, hidden, vocab):
        super().__init__()
        self.emb = nn.Embedding(vocab, hidden)
        self.lstm = nn.LSTM(hidden, hidden, batch_first=True, dropout=0.2)
        self.attn = nn.MultiheadAttention(hidden, num_heads=8, batch_first=True)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(hidden, vocab)

    def forward(self, tgt, enc_out):
        emb = self.dropout(self.emb(tgt))
        out, _ = self.lstm(emb)
        out, _ = self.attn(out, enc_out, enc_out)
        out = self.dropout(out)
        return self.fc(out)
loss_fn = nn.CrossEntropyLoss(ignore_index=0, label_smoothing=0.1)


class NL2SQL(nn.Module):
    def __init__(self, vocab):
        super().__init__()
        self.encoder = Encoder()
        self.decoder = Decoder(384, vocab)

    def forward(self, ids, mask, tgt):
        enc_out = self.encoder(ids, mask)
        return self.decoder(tgt, enc_out)


In [None]:
device="cuda" if torch.cuda.is_available() else "cpu"
model=NL2SQL(len(AST_VOCAB)).to(device)

opt=torch.optim.AdamW(model.parameters(),lr=2e-5)
loss_fn=nn.CrossEntropyLoss(ignore_index=0)

def train_epoch(loader):
    model.train(); total=0
    for x,m,t in tqdm(loader):
        x,m,t=x.to(device),m.to(device),t.to(device)
        out=model(x,m,t[:,:-1])
        loss=loss_fn(out.reshape(-1,len(AST_VOCAB)),t[:,1:].reshape(-1))
        opt.zero_grad(); loss.backward(); opt.step()
        total+=loss.item()
    return total/len(loader)

for e in range(5):
    tr=train_epoch(train_loader)
    print(f"Epoch {e+1} | Loss {tr:.4f}")


100%|██████████| 1688/1688 [06:11<00:00,  4.55it/s]


Epoch 1 | Loss 0.3283


 53%|█████▎    | 901/1688 [03:18<02:51,  4.59it/s]

In [None]:
def infer_structure(question, schema):
    model.eval()

    text = f"question: {question} schema: {schema}"
    enc = tokenizer(
        text, padding="max_length", truncation=True,
        max_length=128, return_tensors="pt"
    ).to(device)

    tgt = torch.tensor([[AST_VOCAB["<BOS>"]]], device=device)

    for _ in range(20):
        logits = model(enc["input_ids"], enc["attention_mask"], tgt)
        nxt = logits[:, -1].argmax(-1, keepdim=True)
        tgt = torch.cat([tgt, nxt], dim=1)

        if nxt.item() == AST_VOCAB["<EOS>"]:
            break

    tokens = [INV_AST_VOCAB[t.item()] for t in tgt[0]]
    return tokens


In [None]:
schema = "employees.id | employees.name | employees.salary | employees.dept_id | departments.id | departments.name"

print(infer_structure("show employee and department names", schema))
print(infer_structure("find employees earning more than average salary", schema))
print(infer_structure("show avg salary by department", schema))
