<a href="https://colab.research.google.com/github/vishal7379/Colab/blob/main/Welcome_To_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sympy --upgrade

import random
import string
import json
import re
import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split


In [None]:
# Single seed shared by Python's `random` and by torch (CPU and all CUDA devices).
SEED = 42

random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print("Device:", device)


In [None]:
def random_token(min_len=4, max_len=10):
    """Return a random word made of lowercase ASCII letters.

    The length is drawn uniformly from ``[min_len, max_len]`` (both ends
    inclusive) and each letter is then drawn independently, with replacement.
    """
    size = random.randint(min_len, max_len)
    letters = random.choices(string.ascii_lowercase, k=size)
    return "".join(letters)


def generate_schema():
    """Build a random database schema.

    Returns:
        dict[str, list[str]]: maps each random table name to its list of
        unique column names. Every table gets 3-7 random columns plus the
        fixed anchor column ``"id"``.

    Between 1 and 5 tables are requested; if two random table names collide
    the later one overwrites the earlier, so fewer tables may be returned.
    """
    schema = {}

    num_tables = random.randint(1, 5)

    for _ in range(num_tables):
        table = random_token()

        num_cols = random.randint(3, 7)
        cols = [random_token() for _ in range(num_cols)]
        cols.append("id")   # anchor column

        # De-duplicate while keeping insertion order. The previous
        # `list(set(cols))` ordered columns by string hash, which is
        # randomized per interpreter run, so a fixed random seed did not
        # reproduce the same schema (and hence the same dataset).
        schema[table] = list(dict.fromkeys(cols))

    return schema


In [None]:
# Building blocks sampled by the query generator.

# Aggregate functions.
AGGS = [
    "SUM",
    "AVG",
    "COUNT",
    "MAX",
    "MIN",
]

# Comparison operators used in WHERE / HAVING predicates.
OPS = [
    ">",
    "<",
    ">=",
    "<=",
    "!=",
]

# Join keywords.
JOIN_TYPES = [
    "JOIN",
    "LEFT JOIN",
    "RIGHT JOIN",
]

# ORDER BY directions.
SORT = [
    "ASC",
    "DESC",
]


In [None]:
def serialize_schema(schema):
    """Render a schema as a flat, tagged string appended to the question.

    Args:
        schema: mapping of table name -> list of column names.

    Returns:
        str: ``" <SCHEMA> t1 : c1 , c2 | t2 : c3 </SCHEMA> "`` with the
        columns of every table in a freshly shuffled order.

    The caller's ``schema`` is left untouched: a copy of each column list is
    shuffled, rather than the list stored in the dict.
    """
    parts = []

    for table, cols in schema.items():
        # Shuffle a copy so the model cannot memorize column positions,
        # without reordering the caller's lists as a side effect.
        shuffled = list(cols)
        random.shuffle(shuffled)

        parts.append(f"{table} : " + " , ".join(shuffled))

    return " <SCHEMA> " + " | ".join(parts) + " </SCHEMA> "


In [None]:
def generate_example():
    """Generate one synthetic (question, SQL) training pair.

    A random schema is created, one table is picked as the main table and one
    query template ("intent") is sampled. Every value that appears in the SQL
    (columns, operator, literal, aggregate, join type) is also mentioned in
    the question, so the target is derivable from the input.

    Returns:
        dict: ``{"question": ..., "sql": ...}`` where ``question`` is the
        lower-cased natural-language request followed by the serialized
        schema, and ``sql`` is the query with all whitespace runs collapsed
        to single spaces.

    Notes:
        * A "JOIN" intent on a single-table schema falls through to the
          nested-query template, since there is nothing to join with.
    """
    schema = generate_schema()

    tables = list(schema.keys())
    main = random.choice(tables)
    cols = schema[main]

    intent = random.choice([
        "SELECT", "WHERE", "GROUP",
        "HAVING", "ORDER", "LIMIT",
        "JOIN", "NESTED"
    ])

    ################################
    # SELECT
    ################################

    if intent == "SELECT":

        chosen = random.sample(cols, random.randint(1, min(3, len(cols))))

        question = f"get {', '.join(chosen)} from {main}"

        sql = f"SELECT {', '.join(chosen)} FROM {main}"

    ################################
    # WHERE
    ################################

    elif intent == "WHERE":

        # Two distinct columns: one projected, one filtered on.
        chosen = random.sample(cols, 2)
        op = random.choice(OPS)
        val = random.randint(1, 1000)

        question = f"find {chosen[0]} from {main} where {chosen[1]} {op} {val}"

        sql = (
            f"SELECT {chosen[0]} "
            f"FROM {main} "
            f"WHERE {chosen[1]} {op} {val}"
        )

    ################################
    # GROUP
    ################################

    elif intent == "GROUP":

        # Previously "GROUP" was sampled but had no branch of its own, so it
        # silently produced nested-query examples instead of GROUP BY ones.
        group = random.choice(cols)
        agg_col = random.choice(cols)
        agg = random.choice(AGGS)

        question = f"group {main} by {group} with {agg.lower()} {agg_col}"

        sql = (
            f"SELECT {group}, {agg}({agg_col}) "
            f"FROM {main} "
            f"GROUP BY {group}"
        )

    ################################
    # GROUP + HAVING
    ################################

    elif intent == "HAVING":

        group = random.choice(cols)
        agg_col = random.choice(cols)
        agg = random.choice(AGGS)
        op = random.choice(OPS)
        val = random.randint(1, 500)

        question = f"group {main} by {group} having {agg.lower()} {agg_col} {op} {val}"

        sql = (
            f"SELECT {group}, {agg}({agg_col}) "
            f"FROM {main} "
            f"GROUP BY {group} "
            f"HAVING {agg}({agg_col}) {op} {val}"
        )

    ################################
    # ORDER
    ################################

    elif intent == "ORDER":

        col = random.choice(cols)
        direction = random.choice(SORT)

        question = f"order {main} by {col} {direction.lower()}"

        sql = (
            f"SELECT {col} "
            f"FROM {main} "
            f"ORDER BY {col} {direction}"
        )

    ################################
    # LIMIT
    ################################

    elif intent == "LIMIT":

        col = random.choice(cols)
        limit = random.randint(1, 20)

        question = f"top {limit} rows of {col} from {main}"

        sql = (
            f"SELECT {col} "
            f"FROM {main} "
            f"LIMIT {limit}"
        )

    ################################
    # JOIN
    ################################

    elif intent == "JOIN" and len(tables) > 1:

        t2 = random.choice([t for t in tables if t != main])

        c1 = random.choice(schema[main])
        c2 = random.choice(schema[t2])

        join = random.choice(JOIN_TYPES)

        # The question now states the join type and both join columns; the
        # old text ("join A with B") left them out, so the randomly chosen
        # SQL could not be predicted from the question.
        question = f"{join.lower()} {main} with {t2} on {c1} = {c2}"

        sql = (
            f"SELECT {main}.{c1}, {t2}.{c2} "
            f"FROM {main} "
            f"{join} {t2} "
            f"ON {main}.{c1} = {t2}.{c2}"
        )

    ################################
    # NESTED (also the fallback for JOIN on a single-table schema)
    ################################

    else:

        col = random.choice(cols)
        agg = random.choice(AGGS)

        # Name the sampled aggregate in the question. The old text always
        # said "average" even when the sub-query used SUM/COUNT/MAX/MIN.
        question = f"find {col} from {main} greater than {agg.lower()}"

        sql = (
            f"SELECT {col} "
            f"FROM {main} "
            f"WHERE {col} > "
            f"(SELECT {agg}({col}) FROM {main})"
        )

    full_question = question + serialize_schema(schema)

    return {
        "question": full_question.lower(),
        # Collapse any whitespace runs so the target is a single-line query.
        "sql": " ".join(sql.split())
    }


In [None]:
# Materialize the synthetic corpus in memory: 80,000 question/SQL pairs.
DATA = [generate_example() for _ in range(80000)]

# Persist a copy next to the notebook; the encoding is pinned so the file
# does not depend on the platform's default text encoding.
with open("nl2sql.json", "w", encoding="utf-8") as f:
    json.dump(DATA, f)

# The emoji had been stored as mojibake ("ðŸš€": UTF-8 bytes read as cp1252).
print("Dataset Ready 🚀")


In [None]:
# Alternatives are tried left to right: qualified names (table.column),
# comparison operators, SQL keywords, aggregate names, punctuation, bare
# identifiers, and finally integer literals.
_SQL_TOKEN_PATTERN = re.compile(
    r"[A-Za-z_]+\.[A-Za-z_]+"
    r"|>=|<=|!=|=|>|<"
    r"|\bselect\b|\bfrom\b|\bwhere\b|\bjoin\b|\bon\b"
    r"|\bgroup\b|\bby\b|\bhaving\b|\border\b|\blimit\b"
    r"|\bavg\b|\bsum\b|\bcount\b|\bmax\b|\bmin\b"
    r"|\(|\)|,"
    r"|[A-Za-z_]+"
    r"|\d+"
)


def sql_tokenize(sql):
    """Split a SQL string into a list of lower-cased tokens.

    The input is lower-cased first; characters that match none of the
    pattern's alternatives (e.g. whitespace) are dropped.
    """
    lowered = sql.lower()
    return _SQL_TOKEN_PATTERN.findall(lowered)


In [None]:
# Token -> id tables, pre-seeded with the reserved special tokens.
# Source side (questions): padding and unknown only.
ENC_VOCAB = {
    "<PAD>": 0,
    "<UNK>": 1,
}

# Target side (SQL): also carries begin/end-of-sequence markers.
DEC_VOCAB = {
    "<PAD>": 0,
    "<UNK>": 1,
    "<BOS>": 2,
    "<EOS>": 3,
}

def add(vocab, t):
    """Register token ``t`` in ``vocab`` under the next free id.

    Tokens already present keep their existing id; a new token receives the
    current size of the vocabulary as its id. ``vocab`` is modified in place.
    """
    # len(vocab) is evaluated before any insertion, so a new token gets the
    # id equal to the vocabulary size prior to adding it.
    vocab.setdefault(t, len(vocab))

# Populate both vocabularies from the generated corpus: whitespace-split
# question tokens feed the encoder table, regex-tokenized SQL feeds the
# decoder table.
# NOTE(review): this runs over all of DATA, i.e. before the train/validation
# split made later in the file — confirm that is intended.
for ex in DATA:

    question_tokens = ex["question"].split()
    sql_tokens = sql_tokenize(ex["sql"])

    for t in question_tokens:
        add(ENC_VOCAB, t)

    for t in sql_tokens:
        add(DEC_VOCAB, t)

# Report the final vocabulary sizes (encoder, decoder).
print(len(ENC_VOCAB), len(DEC_VOCAB))


In [None]:
class NL2SQLDataset(Dataset):
    """Torch dataset turning question/SQL dicts into fixed-length id tensors.

    Args:
        data: sequence of dicts with ``"question"`` and ``"sql"`` string
            entries.
        max_src_len: length every source sequence is truncated/padded to
            (default 180, the previously hard-coded value).
        max_tgt_len: length every target sequence is truncated/padded to
            (default 100, the previously hard-coded value).

    Token ids are looked up in the module-level ``ENC_VOCAB`` / ``DEC_VOCAB``
    tables at item-access time.
    """

    # Reserved ids; they match the "<PAD>" / "<UNK>" entries the vocabularies
    # are seeded with.
    PAD_ID = 0
    UNK_ID = 1

    def __init__(self, data, max_src_len=180, max_tgt_len=100):
        self.data = data
        self.max_src_len = max_src_len
        self.max_tgt_len = max_tgt_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)

    @classmethod
    def _pad_or_truncate(cls, ids, length):
        """Return ``ids`` cut to ``length`` and right-padded with PAD_ID."""
        ids = ids[:length]
        return ids + [cls.PAD_ID] * (length - len(ids))

    def __getitem__(self, i):
        """Return ``(src, tgt)`` id tensors for example ``i``.

        ``src`` holds the whitespace-split question tokens; ``tgt`` holds
        ``<BOS>``, the tokenized SQL and ``<EOS>``. Unknown tokens map to
        UNK_ID. A target longer than ``max_tgt_len`` is truncated, which
        drops its trailing ``<EOS>``.
        """
        ex = self.data[i]

        src = [ENC_VOCAB.get(t, self.UNK_ID) for t in ex["question"].split()]

        tgt = (
            [DEC_VOCAB["<BOS>"]]
            + [DEC_VOCAB.get(t, self.UNK_ID) for t in sql_tokenize(ex["sql"])]
            + [DEC_VOCAB["<EOS>"]]
        )

        src = self._pad_or_truncate(src, self.max_src_len)
        tgt = self._pad_or_truncate(tgt, self.max_tgt_len)

        return torch.tensor(src), torch.tensor(tgt)


In [None]:
# Width of the token embeddings and Transformer layers built by the
# Encoder / Decoder classes below.
D_MODEL = 512


class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch of embeddings.

    Args:
        d_model: size of the last (feature) dimension of the inputs.
        max_len: longest sequence length the precomputed table supports.

    NOTE(review): a later cell in this file defines another
    ``PositionalEncoding``; whichever definition runs last is the one in use.
    """

    def __init__(self, d_model, max_len=512):
        super().__init__()

        pe = torch.zeros(max_len, d_model)

        pos = torch.arange(0, max_len).unsqueeze(1)

        div = torch.exp(
            torch.arange(0, d_model, 2) *
            (-torch.log(torch.tensor(10000.0)) / d_model)
        )

        angles = pos * div

        # Even feature indices carry the sine, odd indices the cosine.
        pe[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one fewer odd index than even index,
        # so trim the cosine columns to fit instead of raising.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Register as a buffer so the table follows `.to(device)` with the
        # rest of the module instead of being copied on every forward pass.
        # `persistent=False` keeps it out of state_dict, so checkpoint keys
        # are the same as when this was a plain attribute.
        self.register_buffer("pe", pe.unsqueeze(0), persistent=False)

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions."""
        # `.to` is a no-op once the module lives on x's device; it is kept so
        # a module that was never moved still works with inputs elsewhere.
        return x + self.pe[:, :x.size(1)].to(x.device)


In [None]:
class Encoder(nn.Module):
    """Transformer encoder over question token ids.

    Args:
        vocab: number of entries in the source vocabulary.

    Stack: embedding (id 0 is the padding index) -> positional encoding ->
    4 pre-norm encoder layers with 8 heads and a 1536-wide feed-forward.

    NOTE(review): a later cell in this file defines another ``Encoder``;
    whichever definition runs last is the one in use.
    """

    def __init__(self, vocab):
        super().__init__()

        self.emb = nn.Embedding(vocab, D_MODEL, padding_idx=0)
        self.pos = PositionalEncoding(D_MODEL)

        block = nn.TransformerEncoderLayer(
            d_model=D_MODEL,
            nhead=8,
            dim_feedforward=1536,
            dropout=0.1,
            batch_first=True,
            norm_first=True,
        )

        self.enc = nn.TransformerEncoder(block, num_layers=4)

    def forward(self, x):
        """Encode a batch of token ids; positions holding id 0 are masked out."""
        pad_positions = x.eq(0)

        embedded = self.emb(x)
        with_positions = self.pos(embedded)

        return self.enc(with_positions, src_key_padding_mask=pad_positions)



class Decoder(nn.Module):
    """Transformer decoder producing logits over the SQL vocabulary.

    Args:
        vocab: number of entries in the target vocabulary.

    Stack: embedding -> positional encoding -> 4 pre-norm decoder layers
    (8 heads, 1536-wide feed-forward) -> linear projection whose weight
    matrix is shared with the embedding.

    NOTE(review): a later cell in this file defines another ``Decoder``;
    whichever definition runs last is the one in use.
    """

    def __init__(self, vocab):
        super().__init__()

        self.emb = nn.Embedding(vocab, D_MODEL)
        self.pos = PositionalEncoding(D_MODEL)

        block = nn.TransformerDecoderLayer(
            d_model=D_MODEL,
            nhead=8,
            dim_feedforward=1536,
            dropout=0.1,
            batch_first=True,
            norm_first=True,
        )

        self.dec = nn.TransformerDecoder(block, num_layers=4)

        self.fc = nn.Linear(D_MODEL, vocab)

        # Tie the output projection to the input embedding matrix.
        self.fc.weight = self.emb.weight

    def forward(self, y, mem, mask):
        """Return logits for target ids ``y`` given encoder output ``mem``.

        ``mask`` is forwarded as the memory key-padding mask.
        """
        steps = y.size(1)

        # True strictly above the diagonal: a position may not attend to
        # later positions.
        future = torch.ones(steps, steps, device=y.device).triu(diagonal=1).bool()

        embedded = self.pos(self.emb(y))

        hidden = self.dec(
            embedded,
            mem,
            tgt_mask=future,
            memory_key_padding_mask=mask,
        )

        return self.fc(hidden)


In [None]:
# Model hyper-parameters read by the Encoder / Decoder classes defined in the
# following cells.
D_MODEL = 512      # embedding / Transformer layer width
N_HEADS = 8        # attention heads per layer
NUM_LAYERS = 4     # layers in the encoder stack and in the decoder stack
FF_DIM = 2048      # feed-forward width inside each layer
DROPOUT = 0.1      # dropout passed to each Transformer layer


In [None]:
import math
import torch.nn as nn

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch of embeddings.

    Args:
        d_model: size of the last (feature) dimension of the inputs.
        max_len: longest sequence length the precomputed table supports.
    """

    def __init__(self, d_model, max_len=512):
        super().__init__()

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)

        div_term = torch.exp(
            torch.arange(0, d_model, 2) *
            (-math.log(10000.0) / d_model)
        )

        angles = position * div_term

        # Even feature indices carry the sine, odd indices the cosine.
        pe[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one fewer odd index than even index,
        # so trim the cosine columns to fit instead of raising.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Register as a buffer so the table follows `.to(device)` with the
        # rest of the module instead of being copied on every forward pass.
        # `persistent=False` keeps it out of state_dict, so checkpoint keys
        # are the same as when this was a plain attribute.
        self.register_buffer("pe", pe.unsqueeze(0), persistent=False)

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions."""
        # `.to` is a no-op once the module lives on x's device; it is kept so
        # a module that was never moved still works with inputs elsewhere.
        return x + self.pe[:, :x.size(1)].to(x.device)


In [None]:
class Encoder(nn.Module):
    """Transformer encoder over question token ids.

    Args:
        vocab_size: number of entries in the source vocabulary.

    Layer sizes come from the module-level D_MODEL, N_HEADS, FF_DIM, DROPOUT
    and NUM_LAYERS settings. Token id 0 is treated as padding both by the
    embedding and by the attention mask.
    """

    def __init__(self, vocab_size):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, D_MODEL, padding_idx=0)
        self.pos = PositionalEncoding(D_MODEL)

        layer_config = dict(
            d_model=D_MODEL,
            nhead=N_HEADS,
            dim_feedforward=FF_DIM,
            dropout=DROPOUT,
            batch_first=True,
            norm_first=True,
        )
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(**layer_config),
            num_layers=NUM_LAYERS,
        )

    def forward(self, x):
        """Encode a batch of token ids; positions holding id 0 are masked out."""
        is_pad = x.eq(0)
        hidden = self.pos(self.embedding(x))
        return self.encoder(hidden, src_key_padding_mask=is_pad)


In [None]:
class Decoder(nn.Module):
    """Transformer decoder producing logits over the SQL vocabulary.

    Args:
        vocab_size: number of entries in the target vocabulary.

    Layer sizes come from the module-level D_MODEL, N_HEADS, FF_DIM, DROPOUT
    and NUM_LAYERS settings. The output projection shares its weight matrix
    with the input embedding.
    """

    def __init__(self, vocab_size):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, D_MODEL)
        self.pos = PositionalEncoding(D_MODEL)

        layer_config = dict(
            d_model=D_MODEL,
            nhead=N_HEADS,
            dim_feedforward=FF_DIM,
            dropout=DROPOUT,
            batch_first=True,
            norm_first=True,
        )
        self.decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(**layer_config),
            num_layers=NUM_LAYERS,
        )

        self.fc = nn.Linear(D_MODEL, vocab_size)

        # Weight tying: the projection reuses the embedding matrix.
        self.fc.weight = self.embedding.weight

    def forward(self, y, memory, src_mask):
        """Return logits for target ids ``y`` given encoder output ``memory``.

        ``src_mask`` is forwarded as the memory key-padding mask.
        """
        steps = y.size(1)

        # True strictly above the diagonal: a position may not attend to
        # later positions.
        no_peek = torch.ones(steps, steps, device=y.device).triu(diagonal=1).bool()

        hidden = self.pos(self.embedding(y))

        decoded = self.decoder(
            hidden,
            memory,
            tgt_mask=no_peek,
            memory_key_padding_mask=src_mask,
        )

        return self.fc(decoded)


In [None]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

# Hold out 10% of the examples for validation; the split is seeded.
train_data, val_data = train_test_split(DATA, test_size=0.1, random_state=42)

train_dataset = NL2SQLDataset(train_data)
val_dataset = NL2SQLDataset(val_data)

# Only the training batches are reshuffled; validation keeps dataset order.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)


In [None]:
# Prefer the GPU when one is visible to torch, otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# One model per side, each sized by its own vocabulary and moved to `device`.
src_vocab_size = len(ENC_VOCAB)
tgt_vocab_size = len(DEC_VOCAB)

enc = Encoder(src_vocab_size).to(device)
dec = Decoder(tgt_vocab_size).to(device)


In [None]:
import torch.optim as optim

# A single optimizer updates the encoder and the decoder together.
model_params = [*enc.parameters(), *dec.parameters()]
optimizer = optim.AdamW(model_params, lr=1e-4)

# Cosine learning-rate decay with a period of 15 scheduler steps.
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=15)

# Targets equal to 0 (the padding id) are ignored by the loss.
loss_fn = nn.CrossEntropyLoss(ignore_index=0, label_smoothing=0.05)


In [None]:
EPOCHS = 15

# Collected once; the same tensors are gradient-clipped on every step.
clip_params = list(enc.parameters()) + list(dec.parameters())


def _teacher_forced_loss(src, tgt):
    """Return the loss for one batch under teacher forcing.

    The decoder is fed ``tgt`` without its last position and is scored
    against ``tgt`` without its first position (inputs shifted by one).
    """
    memory = enc(src)
    logits = dec(tgt[:, :-1], memory, (src == 0))
    return loss_fn(
        logits.reshape(-1, len(DEC_VOCAB)),
        tgt[:, 1:].reshape(-1),
    )


for epoch in range(EPOCHS):

    # ---- training pass ------------------------------------------------
    enc.train()
    dec.train()

    running = 0

    for x, y in train_loader:
        x, y = x.to(device), y.to(device)

        optimizer.zero_grad()

        loss = _teacher_forced_loss(x, y)
        loss.backward()

        # Cap the global gradient norm at 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(clip_params, 1.0)

        optimizer.step()

        running += loss.item()

    train_loss = running / len(train_loader)

    # ---- validation pass (no gradients, dropout disabled) ---------------
    enc.eval()
    dec.eval()

    running = 0

    with torch.no_grad():
        for x, y in val_loader:
            x, y = x.to(device), y.to(device)
            running += _teacher_forced_loss(x, y).item()

    val_loss = running / len(val_loader)

    # The learning-rate schedule advances once per epoch.
    scheduler.step()

    print(f"""
Epoch {epoch+1}

Train Loss: {train_loss:.3f}
Val Loss:   {val_loss:.3f}
""")
