In [1]:
# ====== SETUP + INSTALL ======
# Mount Google Drive at /content/drive (Colab-only: relies on the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Make the project folder on Drive the working directory; the notebook's relative paths (model/..., data/...) are resolved from here.
%cd /content/drive/MyDrive/DuÃõÃ£ aÃÅn - baÃÅo caÃÅo/Do_an_HK3_DT2210L_Text_To_SQL/vi_t5_text2sql

/content/drive/MyDrive/DuÃõÃ£ aÃÅn - baÃÅo caÃÅo/Do_an_HK3_DT2210L_Text_To_SQL/vi_t5_text2sql


In [3]:
# ======== Import Libraries ========
!pip install tabulate transformers gradio
!pip install Unidecode
import gradio as gr
import re
import json
import pandas as pd
import sqlite3
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from tabulate import tabulate
from unidecode import unidecode


Collecting gradio
  Downloading gradio-5.29.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.0 (from gradio)
  Downloading gradio_client-1.10.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6

In [4]:
# ======= CONFIG ========
# All paths below are relative to the current working directory.
# Directory handed to from_pretrained() for both the tokenizer and the model.
model_dir = "model/Final_model"
# SQLite database file opened by run_query().
db_path = "data/processing/SQLite_real_estate.db"
# Comma-separated list of "name[type]" column descriptors.
# The backslash continuations are inside the string literal, so the leading
# spaces of the continued lines are part of the value; the parsing code strips
# each entry before splitting it.
schema = "address[str], area[float], frontage[float], access_road[float], house_direction[str], balcony_direction[str], \
          floors[int], bedrooms[int], bathrooms[int], legal_status[str], furniture_state[str], price[float], city[str], district[str], \
          ward[str], cluster_label[str]"

# Read the nested location mapping, shaped {city: {district: [ward, ...]}}
with open("data/processing/locations.json", "r", encoding="utf-8") as f:
    nested = json.load(f)

# Flatten the mapping into one {"city", "district", "ward"} record per ward
flat = []
for city, districts in nested.items():
    for district, wards in districts.items():
        for ward in wards:
            flat.append(dict(city=city, district=district, ward=ward))

# Persist the flat records as JSON; this is the file loaded as `locations` later on
filepath_location = "data/processing/locations_flat.json"
with open(filepath_location, "w", encoding="utf-8") as f:
    json.dump(flat, f, ensure_ascii=False, indent=2)

# Also write a CSV copy of the same records for easier viewing
pd.DataFrame(flat).to_csv("data/processing/locations_flat.csv", index=False)

# ======= LOAD MODEL ========
# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Tokenizer and weights both come from the fine-tuned checkpoint directory.
tokenizer = T5Tokenizer.from_pretrained(model_dir)
model = T5ForConditionalGeneration.from_pretrained(model_dir)

# Inference only: switch to eval mode and move the weights to the chosen device.
model.eval()
model.to(device)

# ======= STEP 1: Process the input question ========
# Turn every "name[type]" entry of the schema string into a (name, type) tuple.
full_schema = [
    (name, dtype.strip("]"))
    for name, dtype in (entry.strip().split("[") for entry in schema.split(","))
]

# Column name -> phrases whose presence in a (lowercased) question marks the
# column as relevant. Matching is a plain substring test.
SCHEMA_KEYWORDS = {
    'address': ['ƒë·ªãa ch·ªâ'],
    'area': ['di·ªán t√≠ch', 'm2', 'm√©t vu√¥ng'],
    'price': ['gi√°', 't·ª∑', 'tri·ªáu', 'bao nhi√™u ti·ªÅn'],
    'frontage': ['m·∫∑t ti·ªÅn'],
    'access_road': ['ƒë∆∞·ªùng v√†o', 'ƒë∆∞·ªùng', 'h·∫ªm'],
    'house_direction': ['h∆∞·ªõng nh√†'],
    'balcony_direction': ['h∆∞·ªõng ban c√¥ng'],
    'floors': ['t·∫ßng', 'l·∫ßu'],
    'bedrooms': ['ph√≤ng ng·ªß'],
    'bathrooms': ['ph√≤ng t·∫Øm', 'wc'],
    'legal_status': ['ph√°p l√Ω', 's·ªï h·ªìng', 's·ªï ƒë·ªè'],
    'furniture_state': ['n·ªôi th·∫•t'],
    'city': ['th√†nh ph·ªë'],
    'district': ['qu·∫≠n', 'huy·ªán'],
    'ward': ['ph∆∞·ªùng', 'x√£'],
    'cluster_label': ['ph√¢n kh√∫c'],
}

def extract_relevant_columns_from_question(question: str, schema: list) -> list:
    """Select the schema columns that the question appears to talk about.

    Args:
        question: The user question; it is lowercased before matching.
        schema: Iterable of ``(column_name, dtype)`` pairs.

    Returns:
        The ``(column_name, dtype)`` pairs, in schema order, for which at least
        one phrase listed in ``SCHEMA_KEYWORDS`` occurs as a substring of the
        question. ``('price', 'float')`` is appended when the price keyword
        occurs but the pair was not selected from ``schema``.
    """
    lowered = question.lower()

    # A column is picked as soon as any one of its keywords is found.
    selected = [
        (name, kind)
        for name, kind in schema
        if any(keyword in lowered for keyword in SCHEMA_KEYWORDS.get(name, ()))
    ]

    # Fallback: make sure the price column is present when the question mentions price.
    price_column = ('price', 'float')
    if 'gi√°' in lowered and price_column not in selected:
        selected.append(price_column)
    return selected

def generate_input_text(question: str, schema_columns: list) -> str:
    """Render the model prompt from a question and its relevant columns.

    Args:
        question: The (already normalized) question text.
        schema_columns: Iterable of ``(column_name, dtype)`` pairs.

    Returns:
        A single string holding the question followed by ``| Schema:`` and the
        columns rendered as comma-separated ``name[dtype]`` items.
    """
    rendered = []
    for name, kind in schema_columns:
        rendered.append("{}[{}]".format(name, kind))
    columns_text = ", ".join(rendered)
    return f"C√¢u h·ªèi: {question} | Schema: {columns_text}"

def normalize_question(text: str) -> str:
    """
    Normalize a natural-language question before it is sent to the model.

    - Lowercases the text and strips surrounding whitespace.
    - Rewrites every known alias of Ho Chi Minh City (abbreviations such as
      "hcm", "tphcm", "sg", "saigon" and the spelled-out variants listed
      below) to one canonical spelling.

    Aliases are matched as whole tokens and the longest alias wins, so
    "hcm city" is replaced as one unit and "sg" inside an unrelated word such
    as "msg" is left alone.

    NOTE: monetary expressions are not converted to numbers here.

    Returns:
        The normalized string. If normalization fails (for example ``text``
        is not a string) the error is printed and ``text`` is returned as it
        was when the failure happened.
    """
    try:
        text = text.lower().strip()

        # Known spellings and abbreviations of Ho Chi Minh City
        hcm_aliases = [
            "th√†nh ph·ªë h·ªì ch√≠ minh", "tp h·ªì ch√≠ minh", "tp. h·ªì ch√≠ minh", "tphcm",
            "tp hcm", "tp.hcm", "hcm", "hcm city",
            "s√†i g√≤n", "tp sg", "th√†nh ph·ªë sg", "sg", "saigon"
        ]

        # Regex alternation takes the first alternative that matches, so the
        # longest aliases go first ("hcm city" must be tried before "hcm").
        ordered_aliases = sorted(hcm_aliases, key=len, reverse=True)
        alias_pattern = "|".join(re.escape(alias) for alias in ordered_aliases)

        # (?<!\w) and (?!\w) stop an alias from matching inside a longer word.
        return re.sub(rf"(?<!\w)(?:{alias_pattern})(?!\w)", "h·ªì ch√≠ minh", text)
    except Exception as e:
        print(f"Error in normalize_question: {e}")
        return text

def load_locations(filepath: str):
    """Load the known locations as a list of city/district/ward records.

    Args:
        filepath: Path to the location table. Paths ending in ``.csv`` are
            read as CSV, anything else as JSON.

    Returns:
        A list of ``{"city", "district", "ward"}`` dicts with incomplete rows
        and duplicates removed. Any failure is printed and yields ``[]``.
    """
    try:
        if filepath.endswith(".csv"):
            table = pd.read_csv(filepath)
        else:
            table = pd.read_json(filepath)

        # Keep only the three location columns, then drop partial and repeated rows.
        unique_rows = table[['city', 'district', 'ward']].dropna().drop_duplicates()
        return unique_rows.to_dict(orient="records")
    except Exception as e:
        print(f"Error loading locations: {e}")
        return []

# Load the flattened location records once; callers pass this list to extract_location_from_question_v2.
locations = load_locations(filepath_location)
def extract_location_from_question_v2(question: str, locations: list) -> dict:
    """Find the city/district/ward that a question refers to.

    Both the question and the location names are lowercased and passed through
    ``unidecode`` before comparison; names are matched as whole words.

    Matching runs in three passes over ``locations``:
      1. the first record whose ward AND district both occur in the question
         is returned immediately;
      2. otherwise every record whose district occurs sets district and city
         (the last such record wins);
      3. if no district was found, every record whose city occurs sets the
         city (the last such record wins).

    Returns:
        A dict with keys ``city``, ``district`` and ``ward``; unmatched levels
        are ``None``. Any exception is printed and yields an all-``None`` dict.
    """
    try:
        haystack = unidecode(question.lower())

        def mentioned(name: str):
            # Whole-word search for an already-normalized name.
            return re.search(rf'\b{re.escape(name)}\b', haystack)

        # Pass 1: ward together with its own district
        for entry in locations:
            ward_key = unidecode(entry['ward'].lower())
            district_key = unidecode(entry['district'].lower())
            if mentioned(ward_key) and mentioned(district_key):
                return {'city': entry['city'], 'district': entry['district'], 'ward': entry['ward']}

        result = {'city': None, 'district': None, 'ward': None}

        # Pass 2: district only
        for entry in locations:
            district_key = unidecode(entry['district'].lower())
            if mentioned(district_key):
                result['district'] = entry['district']
                result['city'] = entry['city']

        # Pass 3: city only, and only when no district was found
        for entry in locations:
            city_key = unidecode(entry['city'].lower())
            if not result['district'] and mentioned(city_key):
                result['city'] = entry['city']

        return result
    except Exception as e:
        print(f"[extract_location_from_question_v2] Error: {e}")
        return {'city': None, 'district': None, 'ward': None}



# ======= STEP 2: Sinh SQL t·ª´ model ========
def inference_sql_from_question(raw_question: str) -> str:
    """Generate the raw SQL for a natural-language question with the model.

    The question is normalized, the schema columns whose keywords occur in it
    are selected, and both are rendered into the model prompt. Decoding is
    greedy (``num_beams=1``) and limited to ``max_length=64``.

    The decoded text is returned as-is. It is intentionally not passed through
    ``smart_fix_sql`` / ``fix_location_in_sql`` here: the caller in this
    notebook applies those steps itself and logs each intermediate result.

    Args:
        raw_question: The question exactly as typed by the user.

    Returns:
        The decoded model output, or ``""`` when any step raises (the error is
        printed).
    """
    try:
        question = normalize_question(raw_question)
        relevant_cols = extract_relevant_columns_from_question(question, full_schema)
        input_text = generate_input_text(question, relevant_cols)

        # NOTE: truncation=True without max_length only truncates when the
        # tokenizer defines a maximum length of its own; otherwise the library
        # warns and does not truncate.
        inputs = tokenizer(input_text, return_tensors="pt", truncation=True).to(device)
        outputs = model.generate(
            inputs.input_ids,
            max_length=64,
            num_beams=1,
            decoder_start_token_id=model.config.decoder_start_token_id,
            pad_token_id=tokenizer.pad_token_id
        )
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        print(f"Error during inference: {e}")
        return ""


# ======= STEP 3: Fix SQL sau khi sinh ========
def smart_fix_sql(sql: str, question: str = "") -> str:
    """
    Apply heuristic, regex-based repairs to model-generated SQL.

    Steps, in order:
    1. Insert a missing ``AND`` between a value ending in a digit and the next
       ``column =`` / ``column >`` / ``column <`` predicate.
    2. Collapse a word repeated back-to-back into a single occurrence.
    3. When the question contains both range cue words, rewrite
       ``price >= X AND price <= Y`` as ``price BETWEEN X AND Y``.
    4. When the question contains a "less than" cue, turn ``>=`` on
       price/quantity into ``<``; when it contains a "more than" cue, turn
       ``<=`` into ``>``.

    Returns:
        The repaired SQL. If a step raises, the error is printed and the SQL
        as repaired so far is returned.
    """
    try:
        # Steps 1-2: purely structural rewrites, independent of the question
        structural_rewrites = (
            (r"(\d[\)']?)\s+([a-zA-Z_]+\s*=)", r"\1 AND \2"),
            (r"(\d[\)']?)\s+([a-zA-Z_]+\s*[><])", r"\1 AND \2"),
            (r"\b(\w+)\b(?:\s+\1\b)+", r"\1"),
        )
        for pattern, replacement in structural_rewrites:
            sql = re.sub(pattern, replacement, sql)

        # Step 3: explicit price range -> BETWEEN (bounds come from the first match)
        if "t·ª´" in question and "ƒë·∫øn" in question:
            first_range = re.search(r"price\s*>=\s*(\d+\.?\d*)\s*AND\s*price\s*<=\s*(\d+\.?\d*)", sql)
            if first_range:
                low, high = first_range.group(1), first_range.group(2)
                sql = re.sub(
                    r"price\s*>=\s*\d+\.?\d*\s*AND\s*price\s*<=\s*\d+\.?\d*",
                    f"price BETWEEN {low} AND {high}",
                    sql
                )

        # Step 4: flip comparison operators that contradict the question's wording
        operator_flips = (
            (['d∆∞·ªõi', '√≠t h∆°n', 'r·∫ª h∆°n', 'th·∫•p h∆°n'], r"(price|quantity)\s*>=\s*(\d+)", r"\1 < \2"),
            (['tr√™n', 'nhi·ªÅu h∆°n', 'cao h∆°n', 'ƒë·∫Øt h∆°n'], r"(price|quantity)\s*<=\s*(\d+)", r"\1 > \2"),
        )
        for cues, pattern, replacement in operator_flips:
            if any(cue in question for cue in cues):
                sql = re.sub(pattern, replacement, sql)

        return sql
    except Exception as e:
        print(f"Error in smart_fix_sql: {e}")
        return sql

def fix_location_in_sql(sql: str, matched_location: dict) -> str:
    """Replace the location predicates of a SQL statement with trusted ones.

    Every ``city = '...'``, ``district = '...'`` and ``ward = '...'`` predicate
    already present in ``sql`` is removed, and predicates built from
    ``matched_location`` are added to the WHERE clause instead, before any
    ORDER BY / GROUP BY / LIMIT / HAVING part.

    Compared with a plain strip-and-append, this also:
      * removes a ``WHERE`` / ``AND`` / ``OR`` left dangling when the stripped
        predicate was the only (or last) condition, so the output never
        contains ``WHERE  AND ...`` or a bare trailing ``WHERE``;
      * doubles single quotes inside the inserted values;
      * keeps a trailing ``;`` at the very end of the statement.

    Args:
        sql: The SQL statement to patch.
        matched_location: Dict with optional ``ward``, ``district`` and
            ``city`` entries; falsy entries (or a falsy dict) add nothing.

    Returns:
        The patched SQL. On any exception the error is printed and the
        original ``sql`` is returned unchanged.
    """
    try:
        location = matched_location or {}

        statement = sql.strip()
        has_semicolon = statement.endswith(";")
        if has_semicolon:
            statement = statement[:-1].rstrip()

        # Drop existing location predicates together with the AND that introduces them.
        statement = re.sub(
            r"\s*(?:\bAND\s+)?\b(?:city|district|ward)\s*=\s*'[^']*'",
            "",
            statement,
            flags=re.IGNORECASE,
        )
        # Repair connectors left behind when the first predicate was removed.
        statement = re.sub(r"\bWHERE\s+AND\b", "WHERE", statement, flags=re.IGNORECASE)
        statement = re.sub(r"\bAND\s+AND\b", "AND", statement, flags=re.IGNORECASE)

        # Split in front of ORDER BY / GROUP BY / LIMIT / HAVING so that new
        # conditions land inside the WHERE clause.
        clause = re.search(
            r"\b(?:order\s+by|group\s+by|limit|having)\b", statement, flags=re.IGNORECASE
        )
        if clause:
            head, tail = statement[:clause.start()], statement[clause.start():].strip()
        else:
            head, tail = statement, ""

        # A WHERE/AND/OR with nothing after it would be invalid SQL.
        head = re.sub(r"\s+(?:WHERE|AND|OR)\s*$", "", head.rstrip(), flags=re.IGNORECASE)

        conditions = []
        for column in ("ward", "district", "city"):
            value = location.get(column)
            if value:
                # Double embedded single quotes so the literal stays well-formed.
                escaped = str(value).replace("'", "''")
                conditions.append(f"{column} = '{escaped}'")

        if conditions:
            has_where = re.search(r"\bWHERE\b", head, flags=re.IGNORECASE) is not None
            head += (" AND " if has_where else " WHERE ") + " AND ".join(conditions)

        result = f"{head} {tail}".strip() if tail else head.strip()
        if has_semicolon:
            result += ";"
        return result
    except Exception as e:
        print(f"[fix_location_in_sql] Error: {e}")
        return sql

# ======= STEP 4: Th·ª±c hi·ªán truy v·∫•n SQLite ========

def run_query(sql):
    """Execute ``sql`` against the SQLite database at ``db_path``.

    At most the first 200 rows are fetched, so a broad query cannot pull the
    whole table into memory just to build a preview.

    Args:
        sql: The SQL text to execute. It is run verbatim.

    Returns:
        A DataFrame with the (up to 200) result rows and the cursor's column
        names. A statement that produces no result set (for example an empty
        string) yields an empty DataFrame. If execution raises, the error is
        printed and a one-row, one-column DataFrame carrying the error message
        is returned.
    """
    conn = None
    try:
        conn = sqlite3.connect(db_path)
        cursor = conn.execute(sql)

        # description is None when the statement returns no rows at all
        # (empty SQL, DDL, ...); there is nothing to tabulate in that case.
        if cursor.description is None:
            return pd.DataFrame()

        cols = [desc[0] for desc in cursor.description]
        # Preview limit: never materialize more than 200 rows.
        rows = cursor.fetchmany(200)
        return pd.DataFrame(rows, columns=cols)
    except Exception as e:
        print(f"Error running SQL query: {e}")
        return pd.DataFrame(columns=["L·ªói"], data=[[str(e)]])
    finally:
        # Close the connection on every path, including failures.
        if conn is not None:
            conn.close()


In [None]:
# ====== GRADIO UI (Blocks layout) ======
# Components are created inside the Blocks context in the order written here:
# title and description, question box, submit button, result table.
with gr.Blocks(theme='soft') as demo:
    gr.Markdown("# üè° Text-to-SQL Real Estate Chatbot")
    gr.Markdown("Nh·∫≠p c√¢u h·ªèi t·ª± nhi√™n, h·ªá th·ªëng s·∫Ω tr·∫£ k·∫øt qu·∫£ t·ª´ CSDL b·∫•t ƒë·ªông s·∫£n.")

    # Free-text question typed by the user.
    input_box = gr.Textbox(
        label="C√¢u h·ªèi c·ªßa b·∫°n",
        placeholder="V√≠ d·ª•: T√¨m nh√† d∆∞·ªõi 3 t·ª∑ ·ªü G√≤ V·∫•p",
        lines=2
    )

    submit_btn = gr.Button("üì§ Submit")

    # Read-only table that displays the DataFrame returned by the query handler.
    result_table = gr.Dataframe(label="K·∫øt qu·∫£ truy v·∫•n", interactive=False)

    # Handler run when Submit is pressed (defined inside the Blocks context)
    def handle_query(user_input):
        """Run the full question -> SQL -> result pipeline for one question.

        Each stage prints its intermediate result so the whole trace shows up
        in the notebook output. Returns the DataFrame produced by run_query
        for the final SQL.
        """
        print("** B∆∞·ªõc 1: Chu·∫©n ho√° c√¢u h·ªèi...")
        normalized_q = normalize_question(user_input)
        print("C√¢u h·ªèi sau chu·∫©n ho√°:", normalized_q)

        print("- B∆∞·ªõc 2: Tr√≠ch xu·∫•t ƒë·ªãa danh...")
        matched_location = extract_location_from_question_v2(normalized_q, locations)
        print("ƒê·ªãa danh:", matched_location)

        # Rebuild the model prompt here purely so it can be printed
        relevant_cols = extract_relevant_columns_from_question(normalized_q, full_schema)
        input_text = generate_input_text(normalized_q, relevant_cols)
        print("‚Üí Input cho m√¥ h√¨nh:", input_text)

        print("- B∆∞·ªõc 3: Sinh SQL t·ª´ m√¥ h√¨nh...")
        # The original user text is passed; inference_sql_from_question normalizes it again itself.
        raw_sql = inference_sql_from_question(user_input)
        print("SQL raw:", raw_sql)

        print("- B∆∞·ªõc 4: Fix logic SQL...")
        sql_fixed = smart_fix_sql(raw_sql, normalized_q)
        print("SQL logic fixed:", sql_fixed)

        print("- B∆∞·ªõc 5: Fix l·∫°i ƒë·ªãa danh trong SQL...")
        final_sql = fix_location_in_sql(sql_fixed, matched_location)
        print("SQL final:", final_sql)

        print("- B∆∞·ªõc 6: Th·ª±c hi·ªán truy v·∫•n v√† tr·∫£ k·∫øt qu·∫£...")
        return run_query(final_sql)

    # On click, call handle_query with the textbox content and show the returned DataFrame in the table.
    submit_btn.click(fn=handle_query, inputs=input_box, outputs=result_table)

# debug=True keeps this cell running so errors and logs stay visible in the notebook output.
demo.launch(debug=True)


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://96722cc56bff158966.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


** B∆∞·ªõc 1: Chu·∫©n ho√° c√¢u h·ªèi...
C√¢u h·ªèi sau chu·∫©n ho√°: t√¨m nh√† ·ªü ph√∫ nhu·∫≠n gi√° d∆∞·ªõi 3 t·ª∑
- B∆∞·ªõc 2: Tr√≠ch xu·∫•t ƒë·ªãa danh...
ƒê·ªãa danh: {'city': 'H·ªì Ch√≠ Minh', 'district': 'Ph√∫ Nhu·∫≠n', 'ward': None}
‚Üí Input cho m√¥ h√¨nh: C√¢u h·ªèi: t√¨m nh√† ·ªü ph√∫ nhu·∫≠n gi√° d∆∞·ªõi 3 t·ª∑ | Schema: price[float]
- B∆∞·ªõc 3: Sinh SQL t·ª´ m√¥ h√¨nh...




SQL raw: SELECT * FROM price_house WHERE price < 3000000000
- B∆∞·ªõc 4: Fix logic SQL...
SQL logic fixed: SELECT * FROM price_house WHERE price < 3000000000
- B∆∞·ªõc 5: Fix l·∫°i ƒë·ªãa danh trong SQL...
SQL final: SELECT * FROM price_house WHERE price < 3000000000 AND district = 'Ph√∫ Nhu·∫≠n' AND city = 'H·ªì Ch√≠ Minh'
- B∆∞·ªõc 6: Th·ª±c hi·ªán truy v·∫•n v√† tr·∫£ k·∫øt qu·∫£...
** B∆∞·ªõc 1: Chu·∫©n ho√° c√¢u h·ªèi...
C√¢u h·ªèi sau chu·∫©n ho√°: t√¨m nh√† ·ªü ph√∫ nhu·∫≠n gi√° d∆∞·ªõi 3 t·ª∑, di·ªán t√≠ch tr√™n 30m2
- B∆∞·ªõc 2: Tr√≠ch xu·∫•t ƒë·ªãa danh...
ƒê·ªãa danh: {'city': 'H·ªì Ch√≠ Minh', 'district': 'Ph√∫ Nhu·∫≠n', 'ward': None}
‚Üí Input cho m√¥ h√¨nh: C√¢u h·ªèi: t√¨m nh√† ·ªü ph√∫ nhu·∫≠n gi√° d∆∞·ªõi 3 t·ª∑, di·ªán t√≠ch tr√™n 30m2 | Schema: area[float], price[float]
- B∆∞·ªõc 3: Sinh SQL t·ª´ m√¥ h√¨nh...




SQL raw: SELECT * FROM price_house WHERE price < 3000000000
- B∆∞·ªõc 4: Fix logic SQL...
SQL logic fixed: SELECT * FROM price_house WHERE price < 3000000000
- B∆∞·ªõc 5: Fix l·∫°i ƒë·ªãa danh trong SQL...
SQL final: SELECT * FROM price_house WHERE price < 3000000000 AND district = 'Ph√∫ Nhu·∫≠n' AND city = 'H·ªì Ch√≠ Minh'
- B∆∞·ªõc 6: Th·ª±c hi·ªán truy v·∫•n v√† tr·∫£ k·∫øt qu·∫£...
** B∆∞·ªõc 1: Chu·∫©n ho√° c√¢u h·ªèi...
C√¢u h·ªèi sau chu·∫©n ho√°: t√¨m nh√† ·ªü h√† n·ªôi gi√° d∆∞·ªõi 2 t·ª∑ di·ªán t√≠ch tr√™n 20m2
- B∆∞·ªõc 2: Tr√≠ch xu·∫•t ƒë·ªãa danh...
ƒê·ªãa danh: {'city': 'H√† N·ªôi', 'district': None, 'ward': None}
‚Üí Input cho m√¥ h√¨nh: C√¢u h·ªèi: t√¨m nh√† ·ªü h√† n·ªôi gi√° d∆∞·ªõi 2 t·ª∑ di·ªán t√≠ch tr√™n 20m2 | Schema: area[float], price[float]
- B∆∞·ªõc 3: Sinh SQL t·ª´ m√¥ h√¨nh...




SQL raw: SELECT * FROM price_house WHERE price < 2000000000 AND area > 20 AND city = 'H√† N·ªôi'
- B∆∞·ªõc 4: Fix logic SQL...
SQL logic fixed: SELECT * FROM price_house WHERE price < 2000000000 AND area > 20 AND city = 'H√† N·ªôi'
- B∆∞·ªõc 5: Fix l·∫°i ƒë·ªãa danh trong SQL...
SQL final: SELECT * FROM price_house WHERE price < 2000000000 AND area > 20  AND city = 'H√† N·ªôi'
- B∆∞·ªõc 6: Th·ª±c hi·ªán truy v·∫•n v√† tr·∫£ k·∫øt qu·∫£...
