In [4]:
from typing import Dict, List
import json

In [None]:
# Read the raw hotel export and keep one trimmed record per hotel id.
file_path = "./data/raw_hotel.json"

with open(file_path, 'r', encoding='utf-8') as file:
    hotels_list = json.load(file)

# The first record seen for an id wins; later duplicates are ignored.
# Only the id, name and address fields are carried over.
unique_hotels = {}
for hotel in hotels_list:
    hotel_id = hotel.get("id")
    if hotel_id in unique_hotels:
        continue
    unique_hotels[hotel_id] = {
        field: hotel.get(field) for field in ("id", "name", "address")
    }

filtered_hotels_list = [*unique_hotels.values()]

In [None]:
def save_to_file(data: List[Dict], filename: str):
    """Write `data` to `filename` as a JSON array with one entry per line.

    The file holds "[" followed by each entry on its own line (comma after
    every entry except the last) and a closing "]". Non-ASCII characters are
    written as-is in UTF-8. A summary line is printed when done.
    """
    last_index = len(data) - 1

    with open(filename, 'w', encoding='utf-8') as out:
        out.write("[")

        for index, entry in enumerate(data):
            # No trailing comma after the final entry, so the array stays valid JSON.
            separator = '' if index == last_index else ','
            out.write(f"{json.dumps(entry, ensure_ascii=False)}{separator}\n")

        out.write("]")

    print(f'Saved {len(data)} entries to {filename}')

In [58]:
# Persist the de-duplicated records (id, name, address only); a later cell reads this path back.
save_to_file(filtered_hotels_list, "./hotel_addresses.json")

Saved 100 entries to ./hotel_addresses.json


In [59]:
# Sanity check: number of unique hotel ids kept after de-duplication.
len(filtered_hotels_list)

100

In [3]:
from openai import OpenAI
import requests
import os
from dotenv import load_dotenv

# Load .env file
# This must run before the os.getenv lookups below; os.getenv yields None for any unset key.
load_dotenv()

# NOTE(review): HF_TOKEN is not referenced by any other visible cell.
HF_TOKEN = os.getenv("HF_TOKEN")
# Alternative key variable for OpenRouter (disabled):
# OPENROUTER_API_KEY = os.getenv("GROK4_API_KEY")
OPENROUTER_API_KEY = os.getenv("GPTOSS120B_API_KEY")
# Read by call_groq_api to build the Authorization header.
GROQCLOUD_API_KEY = os.getenv("GROQCLOUD_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


# OpenAI-compatible client pointed at the OpenRouter endpoint; used by call_openrouter_api.
OPENROUTER_CLIENT = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=OPENROUTER_API_KEY
)

# Client with the default base URL; used by call_openai_api.
OPENAI_CLIENT = OpenAI(api_key=OPENAI_API_KEY)


def call_openai_api(prompt: str) -> str:
    """Send `prompt` as a single user message via OPENAI_CLIENT and return the reply text.

    The request uses the "gpt-4.1-nano" model. Any exception raised while
    calling the API or reading the reply is printed and swallowed; in that
    case the function returns None (despite the `str` annotation), so callers
    must check for it.
    """
    user_message = {"role": "user", "content": prompt}

    try:
        response = OPENAI_CLIENT.chat.completions.create(
            model="gpt-4.1-nano",
            messages=[user_message],
        )
        # Only the first choice is used.
        return response.choices[0].message.content
    except Exception as error:
        print(f"API called failed: {error}")
        return None


def call_openrouter_api(prompt: str) -> str:
    """Send `prompt` as a single user message via OPENROUTER_CLIENT and return the reply text.

    The request uses the "tngtech/deepseek-r1t2-chimera:free" model. Any
    exception is printed and swallowed; the function then returns None
    (despite the `str` annotation), so callers must check for it.
    """
    # Alternative model (disabled): "deepseek/deepseek-chat-v3.1:free"
    model_name = "tngtech/deepseek-r1t2-chimera:free"
    user_message = {"role": "user", "content": prompt}

    try:
        completion = OPENROUTER_CLIENT.chat.completions.create(
            model=model_name,
            messages=[user_message]
        )
    except Exception as error:
        print(f"API called failed: {error}")
        return None

    try:
        # Only the first choice is used.
        return completion.choices[0].message.content
    except Exception as error:
        print(f"API called failed: {error}")
        return None


def call_groq_api(prompt: str, timeout: float = 60.0) -> str:
    """Send `prompt` as a single user message to the Groq chat-completions endpoint.

    Args:
        prompt: Text sent as the content of the only (user) message.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The content of the first choice in the response, or None when the
        request fails, times out, returns an error status, or the response
        body does not have the expected shape (the error is printed).
    """
    url = "https://api.groq.com/openai/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {GROQCLOUD_API_KEY}"
    }
    payload = {
        "model": "meta-llama/llama-4-maverick-17b-128e-instruct",
        "messages": [
            {
                "role": "user",
                "content": prompt
            }
        ]
    }

    try:
        # requests has no default timeout: without one a stalled connection
        # would block the notebook cell indefinitely.
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
        response.raise_for_status()
        result = response.json()

        return result["choices"][0]["message"]["content"]
    except Exception as e:
        print(f"GROQ API call failed: {e}")
        return None

In [18]:
# Prompt template with one `{hotel_info}` placeholder: asks for 6 short, informal
# Vietnamese questions about a hotel's address, each including the hotel's name.
ADDRESS_PROMPT = """
Generate 6 short and informal questions in Vietnamese asking for the address of a hotel and the hotel's name must be included:
- Short, concise questions
- Each question should be simple and straight to the point.
The hotel: {hotel_info}
Output: a list of 6 string (no explanation, no extra text, remove ordinal number).
"""

# Prompt template with one `{hotel_info}` placeholder: asks for 10 Vietnamese Q&A pairs
# about a hotel's room types, returned as a JSON list of {"question", "answer"} objects.
# Braces in the example are doubled so that str.format leaves them as literal braces.
ROOMTYPES_PROMPT_EN = """
Create 10 short, natural Vietnamese Q&A pairs about a hotel's room types.

GLOBAL RULES:
- Every question & answer MUST explicitly include the hotel’s name.
- Do NOT invent room types; use only the provided room_types.
- Keep facts (names, sizes) exactly as given.   

TONE
- Friendly, relaxed, natural Vietnamese.
- One short sentence per answer (max 20 words) unless intent = list_all.
- No fluff or apologies.

INTENTS & ANSWERING STYLE:
- presence_check: Question like “does it have X?” → List ALL room types concisely with "chỉ có các loại phòng: ".
- list_all: Question like “what room types are there?” → List ALL room types concisely.
- size_info: Question like “how big is room X?” → Give the size for X only. Do NOT list other rooms.
- compare: Question comparing two rooms → State the key difference (e.g., size) only.

OUTPUT FORMAT:
- Return a JSON list of 10 objects.
- Each object has fields: "question", "answer".
- Answers must be short, natural Vietnamese (no extra explanations).

The hotel info:
{hotel_info}

EXAMPLE FORMAT (illustrative only; do not copy verbatim):
[
  {{"question": "Pullman Saigon Centre Hotel có phòng Superior không?",
    "answer": "Chỉ có các loại phòng: Superior (~26 m2); Deluxe (~30 m2); Executive (~34 m2); Suite (~45 m2)."}},
  {{"question": "Pullman Saigon Centre Hotel có phòng Family không?",
    "answer": "Chỉ có các loại phòng: Superior (~26 m2); Deluxe (~30 m2); Executive (~34 m2); Suite (~45 m2)."}},
  {{"question": "Pullman Saigon Centre Hotel có những loại phòng nào?", 
    "answer": "Superior (~26 m2); Deluxe (~30 m2); Executive (~34 m2); Suite (~45 m2)."}},
  {{"question": "Suite ở Pullman Saigon Centre Hotel rộng bao nhiêu?",
    "answer": "Rộng khoảng 45 m2."}},
  {{"question": "Deluxe và Executive ở Pullman Saigon Centre Hotel khác gì nhau?",
    "answer": "Executive rộng hơn Deluxe (~34 m2 so với ~30 m2)."}}
]
"""

# Prompt template with one `{hotel_info}` placeholder: asks for 10 Vietnamese Q&A pairs
# that each combine at least two hotel fields, returned as a strict JSON list.
# Braces in the example are doubled so that str.format leaves them as literal braces.
# Line breaks inside example answers are written as `\\n` so the rendered prompt shows
# the JSON escape `\n`; a raw line break inside a JSON string is not valid JSON.
GENERAL_MULTI_FIELD_ONLY_QA_PROMPT_EN = """
Create 10 short, casual Vietnamese Q&A pairs about a hotel's general information,
using ONLY multi-field questions (no single-field items).

ALLOWED FIELDS
- Use only: name, address, room_types, phone, email (exactly as provided).
- Do NOT invent info beyond these fields.

TONE & LENGTH
- Friendly, relaxed Vietnamese; straight to the point.
- Answers: 2–5 short lines, labeled. No extra commentary.

REQUIRED MULTI-FIELD COMBINATIONS (cover all across the 10 items)
- Include AT LEAST 2 items with intent = full_profile (ask “thông tin/<hotel_name> có gì/...”).
- Also include a mix of these multi-field intents across the remaining items:
  - contacts_both: phone + email
  - address_plus_contact: address + (phone OR email)
  - rooms_plus_contact: room_types + (phone OR email)
  - triple_contact: address + phone + email
  - name_card: name + address + (phone OR email)
  - summary_card: name + address + phone + email
  - rooms_plus_address: room_types + address
(Repeat/reshape combos as needed to reach 10; but every item must include ≥2 fields.)

ANSWER FORMAT (labeled lines; use exact facts)
- Tên: <name>
- Địa chỉ: <address>
- Các loại phòng: <room_types>     # semicolon-separated as given
- SĐT: <phone>
- Email: <email>

OUTPUT RULES
- Return a strict JSON list of 10 objects.
- Each object has exactly two keys: "question", "answer" (no other keys).
- Inside "answer", separate the labeled lines with the escaped sequence \\n (never a raw line break).
- Every question MUST explicitly include the hotel’s name.
- Keep names/sizes/diacritics exactly as given.

Hotel info:
{hotel_info}

EXAMPLE FORMAT (illustrative only; do not copy verbatim):
[
  {{
    "question": "Cho mình địa chỉ và số liên lạc của Amory Apartment được không?",
    "answer": "Địa chỉ: 88 Le Lai, District 1, Ho Chi Minh City, Vietnam\\nSĐT: +84 28 3925 2525"
  }},
  {{
    "question": "Cho mình phone và email của Amory Apartment nhé?",
    "answer": "SĐT: +84 28 3925 2525\\nEmail: booking@amoryapartment.com"
  }},
  {{
    "question": "Cho mình tóm tắt nhanh Amory Apartment (đầy đủ thông tin) được không?",
    "answer": "Tên: Amory Apartment\\nĐịa chỉ: 88 Le Lai, District 1, Ho Chi Minh City, Vietnam\\nCác loại phòng: Studio (~35 m2); 1-Bedroom Apartment (~55 m2); 2-Bedroom Apartment (~80 m2)\\nSĐT: +84 28 3925 2525\\nEmail: booking@amoryapartment.com"
  }}
]
"""

# Prompt template with one `{hotel_info}` placeholder: asks for 3 short, informal
# Vietnamese questions about a hotel's email, each including the hotel's name.
EMAIL_PROMPT = """
Generate 3 short and informal questions in Vietnamese asking for the email of a hotel and the hotel's name must be included:
- Short, concise questions
- Each question should be simple and straight to the point.
The hotel: {hotel_info}
Output: a list of 3 string (no explanation, no extra text, remove ordinal number).
"""

# Prompt template with one `{hotel_info}` placeholder: asks for 3 short, informal
# Vietnamese questions about a hotel's phone number, each including the hotel's name.
PHONE_PROMPT = """
Generate 3 short and informal questions in Vietnamese asking for the phone number of a hotel and the hotel's name must be included:
- Short, concise questions
- Each question should be simple and straight to the point.
The hotel: {hotel_info}
Output: a list of 3 string (no explanation, no extra text, remove ordinal number).
"""

# Prompt template with a `{COUNT}` placeholder (used twice): asks for exactly COUNT
# general-purpose Vietnamese Q&A pairs (chit-chat, safety refusals, basic facts) as a
# single JSON array. Braces in the example are doubled so str.format keeps them literal.
# The example is a plain, valid JSON array with no markdown fence, so it agrees with the
# OUTPUT FORMAT rules instead of demonstrating fenced, comma-less objects.
BASELINE_GENERAL_PROMPT = """

You are an **expert dataset curator** tasked with creating a **clean, safe, and conversational dataset** for training a **general-purpose Vietnamese AI assistant**.  
Your output must follow the structure and tone of a helpful, friendly AI that can chat naturally, answer basic questions, and refuse unsafe or illegal requests.

---

## 🎯 GOAL
Generate **EXACTLY `{COUNT}`** Q&A pairs in **Vietnamese** that fit into three categories:

1. **Persona & Chit-chat (≈40–50%)**  
   Greetings, introductions, capabilities, small talk, polite interactions.  
   *Examples:* “Bạn là ai?”, “Chào buổi sáng!”, “Bạn giúp được gì?”

2. **Safety / Policy Refusals (≈25–35%)**  
   Reject illegal, harmful, or unethical requests clearly and safely.  
   *Examples:* “Làm sao hack tài khoản?”, “Cho mình mật khẩu của người nổi tiếng.”

3. **Basic Knowledge / Common Facts (≈20–30%)**  
   Simple factual or definitional questions.  
   *Examples:* “AI là gì?”, “JSON dùng để gì?”, “2 cộng 2 bằng mấy?”

---

## 🗣️ STYLE & TONE
- Use a **friendly, concise, and natural Vietnamese tone**.  
- Each answer: **1–2 sentences**.  
- Avoid slang or excessive emoji (use only if appropriate).  
- Be **polite and approachable**, not robotic or verbose.

---

## ✅ CONTENT RULES
- **Persona & chit-chat:** brief, friendly, realistic assistant behavior.  
- **Safety/policy:**  
  - Always **refuse clearly** for illegal, dangerous, or unethical prompts.  
  - Optionally add a **short positive alternative or safety reminder** (1 sentence max).  
- **Knowledge:** give short, correct, general explanations.  
- **Never** provide or request personal data (PII).  
- **Never** hallucinate real-time info (e.g., current news, live events). Instead, say you don’t have access and suggest checking reliable sources.

---

## 🧾 OUTPUT FORMAT
- Output a **single JSON array** (no markdown, no comments).  
- Each object has **two fields**:
  - `"question"` → Vietnamese user question  
  - `"answer"` → short, natural assistant reply  
- **No extra keys, commas, or markdown.**

---

## 🔍 QUALITY RULES
- Exactly **`{COUNT}`** pairs — no more, no less.  
- Diverse phrasing; avoid duplicates or near-duplicates.  
- All answers must be **safe, polite, and policy-compliant**.  
- Refuse harmful requests but stay calm and neutral.

---

## 💡 EXAMPLE (illustrative only)
[
  {{"question": "Bạn là ai?", "answer": "Mình là trợ lý AI, giúp trả lời câu hỏi và hỗ trợ thông tin chung một cách ngắn gọn."}},
  {{"question": "Làm sao để hack tài khoản?", "answer": "Mình không hỗ trợ hoạt động bất hợp pháp. Hãy bảo vệ tài khoản bằng các cách an toàn."}},
  {{"question": "AI là gì?", "answer": "AI là trí tuệ nhân tạo—công nghệ giúp máy móc học và thực hiện nhiệm vụ thông minh ở mức cơ bản."}}
]
"""

# Prompt template with one `{hotel_info}` placeholder: asks the model to return the same
# hotel JSON object with only `room_types` normalized (sizes added where missing).
# NOTE(review): no visible cell formats or sends this prompt — confirm where it is used.
HOTEL_INFO_PROMPT = """
You are a hotel room-type normalizer.

TASK
- Input: one JSON object named `hotel_info`.
- The `room_types` field is a single string listing room categories, separated by semicolons `;`.
- For each room type:
  - If it ALREADY has a size (e.g., “~34 m²” or “(~38–42 m²)”), KEEP it unchanged.
  - If it does NOT have a size, ADD an approximate size using the estimation rules below (NO web lookups).
  - Preserve any notes in parentheses (e.g., “Club Lounge access”, “kitchen, garden, pool access”).
- Do NOT invent or remove room types. Keep the original order.
- Standardize units to “m²” with a space before it (e.g., “45 m²”).
- Output a NEW object identical to `hotel_info` except `room_types` is normalized.

SIZE ESTIMATION RULES (if missing)
1) Prefer internal inference from sizes already present for the same hotel.
   - Example: if “Executive ~64 m²” exists, then “Deluxe” is likely smaller → ~30–38 m².
2) If you cannot infer, use these common ranges:
   • Superior: ~22–28 m²  
   • Deluxe: ~28–35 m²  
   • Premier: ~30–36 m²  
   • Club / Club Room / R Club Guest Room: ~34–42 m²  
   • Executive Room: ~34–45 m²  
   • Suite (unspecified): ~45–80 m²  
   • Executive/Deluxe Suite: ~58–70 m² (unless internal inference suggests otherwise)  
   • Junior Suite: ~40–55 m²  
   • Family Room/Bungalow: ~40–70 m²  
   • 1-Bedroom Apartment: ~40–55 m²  
   • 2-Bedroom Apartment: ~65–90 m²  
   • 3-Bedroom Apartment: ~95–130 m²  
   • Villa 1–3 BR: ~90–180 m²; 4+ BR: ~180–300 m²  
   • Presidential Suite: ~150–300 m²
- When a single value is appropriate, use “~NN m²”. When a range is better, use “(~A–B m²)”.
- Use integers only (no decimals) unless already present in the input.
- Never rename room types.

NORMALIZED `room_types` STRING FORMAT
- Each entry:
  `Room Name (~NN m²)` or `Room Name (~A–B m²)`
  If notes exist:
  `Room Name (~NN m², <notes>)`
- Join entries with `; ` (semicolon + space). No period at the end.

INPUT
The hotel: 
{hotel_info}

OUTPUT (MANDATORY)
- Return ONLY the updated JSON object (no explanations), same fields as `hotel_info`, with the normalized `room_types` string.

HARD CONSTRAINTS
- Do not add/remove fields beyond the existing six.
- Do not modify values of fields other than `room_types`.
- Do not output anything except the single JSON object.

"""

In [23]:
import json

# Reload the saved hotel records and split each field into its own parallel list.
# NOTE(review): every record must contain 'room_types', 'phone' and 'email', but the
# save cell earlier in this notebook writes only id/name/address to this path, so the
# file on disk was presumably enriched elsewhere — confirm before a fresh Run All.
file_path = "./hotel_addresses.json"

with open(file_path, 'r', encoding='utf-8') as file:
    raw_data = json.load(file)

list_addresses = [record['address'] for record in raw_data]
list_hotels_name = [record['name'] for record in raw_data]
list_roomtypes = [record['room_types'] for record in raw_data]
list_phones = [record['phone'] for record in raw_data]
list_emails = [record['email'] for record in raw_data]

# Quick visual check: print every hotel name followed by a separator.
hotel_fields = zip(list_addresses, list_hotels_name, list_roomtypes, list_phones, list_emails)
for address, hotel_name, room_type, phone, email in hotel_fields:
    print(hotel_name, "\n===\n", sep="\n")

Renaissance Riverside Saigon Hotel

===

Ramana Saigon Hotel

===

D1 Mansion - Zenity Luxury Apartment - Rooftop Pool - Saigon Centre

===

Saigon Court Serviced Apartment

===

The Hammock Hotel Ben Thanh

===

Mari Queen Hotel

===

Pullman Saigon Centre Hotel

===

Hotel Iris

===

Oscar Saigon Hotel

===

Capri By Fraser . Hotel

===

Sheraton Saigon Grand Opera Hotel

===

Lam Kinh Hotel

===

Lief Pulo Hotel Saigon

===

Lucky Star Hotel

===

Cap Town Hotel

===

Hoang Yen Hotel - Phu My Hung

===

First Hotel

===

White Diamond Hotel - The Art

===

Equatorial Ho Chi Minh City

===

A25 Hotel

===

Indochine Ben Thanh Hotel & Apartments

===

Paragon Saigon Hotel

===

KN Holiday Villa

===

Saigon Domaine Luxury Residences

===

Novotel Saigon Centre Hotel

===

Bon Ami Hotel

===

Saigon Hotel

===

The Royal LandPark - Phu My Hung

===

Sabina Residence HCM

===

L’Harmonie Hotel

===

A&EM Saigon Hotel

===

Oakwood Residence Saigon

===

Aluna Ben Thanh Hotel

===

Amory

In [15]:
def add_variation_prompts(list_questions: List[str], answer: str, hotel_name: str) -> List[Dict]:
    """Pair every question with the same answer and hotel name.

    Returns one dict per question, in input order, with the keys
    'hotel_name', 'question' and 'answer'. An empty question list yields
    an empty list.
    """
    return [
        {'hotel_name': hotel_name, 'question': text, 'answer': answer}
        for text in list_questions
    ]


def extract_list_questions(response: str) -> List[str]:
    """Split a model reply into clean question strings, one per non-empty line.

    Each line is trimmed first and then a leading list marker is removed:
    a run of bullet characters (-, *, •) or an ordinal such as "1." / "2)"
    followed by whitespace. Lines that end up empty (blank lines, a lone
    bullet) are dropped.

    Args:
        response: Raw text returned by the model. None or an empty string
            (for example after a failed API call) is accepted.

    Returns:
        The cleaned questions in their original order; [] when there are none.
    """
    import re  # local import so this cell does not depend on an edit to the import cell

    if not response:
        return []

    # Ordinals need trailing whitespace so that text like "1.5 triệu" is left intact.
    list_marker = r'^(?:[-*•]+\s*|\d+[.)]\s+)'

    list_questions = []
    for line in response.split('\n'):
        # Trim BEFORE removing the marker, otherwise an indented "  - text" keeps its dash.
        question = re.sub(list_marker, '', line.strip()).strip()
        if question:
            list_questions.append(question)

    return list_questions

In [None]:
# Generate room-type Q&A pairs per hotel and collect them in list_conversations.
list_conversations = []
list_hotels = raw_data

# Only the first hotel is processed here; widen the range to cover more hotels.
for i in range(1):
    hotel = list_hotels[i]

    prompt = ROOMTYPES_PROMPT_EN.format(hotel_info=hotel)
    response = call_openai_api(prompt)
    print(response)

    # call_openai_api returns None when the request failed; skip instead of crashing.
    if response is None:
        print(f"Skipping {hotel['name']}: no response from the API.")
        continue

    # Replies may be wrapped in markdown fences; drop them before parsing.
    cleaned_response = response.replace('```json', '').replace('```', '').strip()
    try:
        new_conversations = json.loads(cleaned_response)
    except json.JSONDecodeError as e:
        print(f"Skipping {hotel['name']}: response is not valid JSON ({e}).")
        continue

    if not isinstance(new_conversations, list):
        print(f"Skipping {hotel['name']}: expected a JSON list of Q&A objects.")
        continue

    # Keep only well-formed objects and tag each pair with the hotel it belongs to.
    new_conversations = [conv for conv in new_conversations if isinstance(conv, dict)]
    for conv in new_conversations:
        conv['hotel_name'] = hotel['name']

    list_conversations.extend(new_conversations)
    print(f"🦉 Current length of list_conversations: {len(list_conversations)}")

len(list_conversations)

In [21]:
# Ask the model for the baseline Q&A dataset and parse its reply into list_data.
requested_count = 200
prompt = BASELINE_GENERAL_PROMPT.format(COUNT=requested_count)
response = call_openrouter_api(prompt)

# call_openrouter_api returns None on failure; stop with a clear message rather than
# an AttributeError from calling .replace on None.
if response is None:
    raise RuntimeError("OpenRouter call failed; see the error printed above.")

# Replies may be wrapped in markdown fences; drop them before parsing.
response = response.replace('```json', '').replace('```', '')
print(response)
list_data = json.loads(response)

# The prompt demands an exact count, but the model does not always comply.
if len(list_data) != requested_count:
    print(f"⚠️ Requested {requested_count} pairs but received {len(list_data)}.")


[
{"question": "Xin chào! Bạn có khỏe không?", "answer": "Mình là AI nên không có cảm giác, nhưng cảm ơn bạn đã quan tâm! Bạn thế nào?"},
{"question": "Bạn tên gì vậy?", "answer": "Mình là trợ lý ảo, bạn có thể đặt tên cho mình nếu muốn!"},
{"question": "Trời hôm nay đẹp nhỉ?", "answer": "Ừm, một ngày đẹp trời sẽ giúp tâm trạng tốt hơn đó bạn!"},
{"question": "Bạn biết làm thơ không?", "answer": "Mình có thể thử vài câu đơn giản, nhưng chắc không hay bằng con người đâu!"},
{"question": "Tạm biệt nhé!", "answer": "Chào bạn! Hẹn gặp lại khi bạn cần giúp đỡ."},
{"question": "Bạn ăn sáng chưa?", "answer": "Mình là AI nên không cần ăn uống, nhưng cảm ơn bạn đã hỏi thăm!"},
{"question": "Bạn sống ở đâu thế?", "answer": "Mình tồn tại trên internet nên không có địa chỉ cụ thể đâu bạn!"},
{"question": "Kể chuyện cười đi!", "answer": "Mình chưa giỏi kể chuyện cười lắm, nhưng sẽ cố gắng cải thiện!"},
{"question": "Bạn có người yêu chưa?", "answer": "Haha, mình là AI nên không có tình cảm như con

In [13]:
def save_to_file(data: List[Dict], filename: str):
    """Dump `data` into `filename` as a JSON array, one entry per line.

    Layout: "[" + each entry serialized with json.dumps (non-ASCII kept as-is),
    with ",\\n" after every entry except the last, which gets "\\n", + "]".
    Prints how many entries were saved.

    NOTE: this notebook also defines save_to_file in an earlier cell with the
    same behavior; whichever definition ran last is the one in effect.
    """
    total = len(data)

    with open(filename, 'w', encoding='utf-8') as file:
        file.write("[")

        for position, entry in enumerate(data, start=1):
            file.write(json.dumps(entry, ensure_ascii=False))
            # Comma after every entry but the last keeps the array valid JSON.
            file.write("\n" if position == total else ",\n")

        file.write("]")

    print(f'Saved {total} entries to {filename}')

In [22]:
# Write the parsed baseline Q&A pairs (list_data) to disk.
save_to_file(list_data, '5-baseline-general-01.json')
# Alternative save for the room-type conversations (disabled):
# save_to_file(list_conversations, '5-hotel_roomtypes-02.json')

Saved 187 entries to 5-baseline-general-01.json
