In [23]:
import os
from dotenv import load_dotenv

# Load .env file
load_dotenv()

# Read the API credentials from the process environment.
# A variable that is not set resolves to None.
HF_TOKEN = os.environ.get("HF_TOKEN")
# NOTE(review): the OpenRouter key is read from SHISHA_API_KEY rather than an
# OPENROUTER_*-named variable — confirm this matches the .env file.
OPENROUTER_API_KEY = os.environ.get("SHISHA_API_KEY")
GROQCLOUD_API_KEY = os.environ.get("GROQCLOUD_API_KEY")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Get data from .json

In [24]:
import json

# Location of the raw hotel records, relative to the current working directory.
file_path = "./data/hotel.json"

# Read the file as UTF-8 text and decode the JSON document it contains.
with open(file_path, mode='r', encoding='utf-8') as file:
    hotels_list = json.loads(file.read())

# Report how many top-level records were loaded.
print(len(hotels_list))

20


In [45]:
# Prompt template asking the model for 6 multi-turn conversations
# (8 alternating human/gpt messages each) about one hotel.
# Written in str.format syntax: `{hotel_information}` is the only placeholder,
# and the doubled braces (`{{` / `}}`) render as literal braces in the final prompt.
# NOTE(review): no cell visible in this notebook section uses this template
# (the generation loops format PROMPT_FAQ) — confirm it is still needed.
PROMPT_CONVERSATIONAL = """
You are a data conversion assistant. Your task is to transform each Q&A pair into 6 diverse multi-turn conversations in ShareGPT fine-tuning format.

Each input Q&A pair relates to a specific hotel. Your output must help the assistant learn to respond naturally and accurately for that **specific hotel**.

For each input Q&A pair, generate 6 JSON objects. Each object represents a distinct conversation with:
- 4 user questions and 4 assistant answers (8 messages in total).
- The **first user question** in each conversation must explicitly mention the hotel name (e.g., “Renaissance Riverside”, “Khách sạn Mường Thanh”) in a natural and varied way.
- Questions should ask the same core idea in different ways, using varied tone, length, and style (casual, formal, brief, detailed, etc.).
- **Only one conversation** (out of the six) should include a general question about the hotel’s overall information (e.g., “Cho tôi thông tin khách sạn Renaissance Riverside”, “Nói cho tôi biết về khách sạn Mường Thanh”) and the answer is full description of the hotel.
- **Only one different conversation** (out of the six) include a question asking specifically about the hotel’s location or address.

- Assistant replies should convey the same meaning (i.e., the original answer), but phrased to match the tone and wording of each question.

Each JSON object must follow the ShareGPT JSONL schema:
- Must have a top-level field: `conversations`.
- `conversations` is a list of 8 alternating messages.
- Messages alternate between `{{"from": "human", "value": ...}}` and `{{"from": "gpt", "value": ...}}`.

Output: a list of 6 JSON objects (no explanation, no extra text).

Here is hotel's information:
{hotel_information}
"""

# Prompt template asking the model for 30 single-turn FAQ pairs about one hotel,
# each object carrying a `conversations` list (2 messages) and a `hotel_name`.
# Filled in with PROMPT_FAQ.format(hotel_information=...); `{hotel_information}`
# is the only placeholder, and the doubled braces (`{{` / `}}`) render as
# literal braces in the final prompt.
# NOTE(review): the recorded outputs in this notebook show the model returning
# `role`/`content` message keys in one run and the requested `from`/`value`
# keys in another — downstream consumers may need to normalize the schema.
PROMPT_FAQ = """
You are a data conversion assistant. Your task is to transform each Q&A pair into 30 diverse FAQ-style conversations in ShareGPT fine-tuning format.

Each input Q&A pair relates to a specific hotel. Your output must help the assistant learn to respond naturally and accurately for that **specific hotel**.

For each input Q&A pair, generate 30 JSON objects. Each object represents one distinct question–answer pair with:
- 1 user message and 1 assistant message (2 messages total).
- The user message must explicitly mention the hotel name (e.g., “Renaissance Riverside”, “Khách sạn Mường Thanh”) in a natural and varied way.
- Questions should vary in tone, length, and phrasing style (e.g., casual, formal, detailed, short).
- One of the 30 questions should be a **general request** for information about the hotel.
- A different one of the 30 should specifically ask about the **hotel's address or location**.

If the original Q&A is about **room types**, you must:
- Identify and group similar room types together. For example, “1 King Bed, Standard Room” and “Standard Room” should both be considered “Standard Room” with each types will have list of its rooms.
- The assistant’s answer must clearly list all available room categories in full.
- For each distinct room category, generate a separate question that asks specifically about **the detailed room options within that category**, and respond by listing all specific room names that belong to it.

- Assistant replies should match the question's style but always convey the same core answer from the original Q&A.
- Ensure every question is clearly distinct in wording and style from the others.

Format:
- Each JSON object must follow the ShareGPT JSONL schema:
- Top-level field: `conversations` & `hotel_name`
- `conversations` is a list of 2 messages: `{{"from": "human", "value": ...}}` and `{{"from": "gpt", "value": ...}}`.
- `hotel_name` is the name of the hotel.

Output: a list of 30 JSON objects (no explanation, no extra text).

Here is hotel's information:
{hotel_information}
"""

## Send to LLM

In [26]:
from openai import OpenAI 

# Client pointed at OpenRouter: the OpenAI SDK class is reused with the
# base URL overridden and the OpenRouter key supplied.
CLIENT = OpenAI(
    api_key=OPENROUTER_API_KEY,
    base_url="https://openrouter.ai/api/v1",
)

# Client for OpenAI itself (no base_url override).
OPENAI_CLIENT = OpenAI(
    api_key=OPENAI_API_KEY,
)


In [27]:
from typing import Dict, List
import json 
import requests
import re

def call_openai_api(prompt: str) -> str:
    """Send ``prompt`` as a single user message to ``gpt-4.1-mini``.

    The request goes through the module-level ``OPENAI_CLIENT``.

    Args:
        prompt: Full text of the user message.

    Returns:
        The message content of the first returned choice, or ``None`` when
        any exception is raised (the error is printed, not re-raised).
    """
    user_turn = {"role": "user", "content": prompt}

    try:
        completion = OPENAI_CLIENT.chat.completions.create(
            model="gpt-4.1-mini",
            messages=[user_turn],
        )
        first_choice = completion.choices[0]
        return first_choice.message.content
    except Exception as e:
        # Best-effort call: report the failure and let the caller handle None.
        print(f"API called failed: {e}")
        return None

def call_openrouter_api(prompt: str) -> str:
    """Send ``prompt`` as a single user message to ``moonshotai/kimi-dev-72b:free``.

    The request goes through the module-level ``CLIENT``.

    Args:
        prompt: Full text of the user message.

    Returns:
        The message content of the first returned choice, or ``None`` when
        any exception is raised (the error is printed, not re-raised).
    """
    chat_messages = [{"role": "user", "content": prompt}]

    try:
        response = CLIENT.chat.completions.create(
            model="moonshotai/kimi-dev-72b:free",
            messages=chat_messages,
        )
        reply = response.choices[0].message.content
        return reply
    except Exception as e:
        # Best-effort call: report the failure and let the caller handle None.
        print(f"API called failed: {e}")
        return None
    
def call_groq_api(prompt: str, timeout: float = 60.0) -> str:
    """Send ``prompt`` as a single user message to the Groq chat-completions endpoint.

    The request is a plain HTTP POST authenticated with ``GROQCLOUD_API_KEY``
    and targets the ``meta-llama/llama-4-maverick-17b-128e-instruct`` model.

    Args:
        prompt: Full text of the user message.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may wait indefinitely, which would stall the
            whole notebook on a hung connection.

    Returns:
        The message content of the first returned choice, or ``None`` when
        the request fails, times out, returns an HTTP error status, or the
        response body does not have the expected shape (the error is printed).
    """
    url = "https://api.groq.com/openai/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {GROQCLOUD_API_KEY}"
    }
    payload = {
        "model": "meta-llama/llama-4-maverick-17b-128e-instruct",
        "messages": [
            {
                "role": "user",
                "content": prompt
            }
        ]
    }

    try:
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
        # Turn 4xx/5xx responses into exceptions so they reach the handler below.
        response.raise_for_status()
        result = response.json()

        return result["choices"][0]["message"]["content"]
    except Exception as e:
        # Best-effort call: report the failure and let the caller handle None.
        print(f"GROQ API call failed: {e}")
        return None

def extract_conversations(api_response: str) -> List[Dict]:
    """Parse an LLM reply into a list of conversation objects.

    The reply is expected to be a JSON array of objects. To cope with common
    model deviations the parser also:

    * unwraps a surrounding Markdown code fence (```` ```json ... ``` ````);
    * falls back to the outermost ``[...]`` span when extra text surrounds
      the array;
    * wraps a single JSON object in a list, so callers that extend a list
      with the result never extend it with the object's keys;
    * drops array items that are not JSON objects.

    Args:
        api_response: Raw text returned by the model; may be ``None`` or empty.

    Returns:
        A list of dicts. Empty when there is no response, the text cannot be
        parsed as JSON, or the parsed value is neither an object nor an array.
    """
    if not api_response:
        print("No response")
        return []

    text = api_response.strip()

    # Unwrap a Markdown code fence, with or without a language tag.
    fenced = re.match(r"^```[\w-]*\s*(.*?)\s*```$", text, re.DOTALL)
    if fenced:
        text = fenced.group(1)

    try:
        parsed = json.loads(text)
    except json.JSONDecodeError as e:
        # The model may have added prose around the array; retry on the
        # outermost bracketed span before giving up.
        start, end = text.find("["), text.rfind("]")
        if start == -1 or end <= start:
            print(f"Failed to parse API response: {e}")
            return []
        try:
            parsed = json.loads(text[start:end + 1])
        except json.JSONDecodeError:
            print(f"Failed to parse API response: {e}")
            return []

    if isinstance(parsed, dict):
        return [parsed]
    if isinstance(parsed, list):
        return [item for item in parsed if isinstance(item, dict)]

    print(f"Failed to parse API response: unexpected JSON type {type(parsed).__name__}")
    return []

def save_to_file(data: List[Dict], filename: str):
    """Write ``data`` to ``filename`` as a JSON array, one entry per line.

    The file starts with ``[`` and ends with ``]``. Each entry is serialized
    with ``ensure_ascii=False`` and followed by a newline; every entry except
    the last has a comma before that newline. An empty ``data`` produces ``[]``.
    A summary line is printed after the file is closed.

    Args:
        data: Entries to serialize.
        filename: Destination path; an existing file is overwritten.
    """
    with open(filename, 'w', encoding='utf-8') as out:
        out.write("[")

        final_position = len(data) - 1
        out.writelines(
            json.dumps(item, ensure_ascii=False)
            + ("\n" if position == final_position else ",\n")
            for position, item in enumerate(data)
        )

        out.write("]")

    print(f'Saved {len(data)} entries to {filename}')

## Loop through each hotel JSON object

In [28]:
import time

In [29]:
# Accumulates the conversation objects generated for every processed hotel.
list_conversations = []

for index, hotel in enumerate(hotels_list):
    # Only records whose language is "vi" are converted.
    if hotel['language'] != "vi":
        continue

    prompt = PROMPT_FAQ.format(hotel_information=str(hotel))
    response = call_openai_api(prompt)

    print(response)

    hotel_conversations = extract_conversations(response)

    # A failed API call or an unparseable reply yields nothing; report it as a
    # failure instead of logging a misleading success with 0 conversations.
    if not hotel_conversations:
        print(f"Failed to process hotel: {index} (no conversations extracted)")
        continue

    print(f"Processed successfully hotel: {index} with {len(hotel_conversations)} conversations")

    list_conversations += hotel_conversations

[
  {
    "hotel_name": "Renaissance Riverside Saigon Hotel",
    "conversations": [
      {
        "role": "human",
        "content": "Bạn có thể cung cấp thông tin tổng quan về Renaissance Riverside Saigon Hotel được không?"
      },
      {
        "role": "gpt",
        "content": "Renaissance Riverside Saigon Hotel là khách sạn 5 sao tọa lạc ngay trung tâm Thành phố Hồ Chí Minh. Khách sạn có 336 phòng với cửa sổ nhìn ra thành phố, nhiều tiện nghi hiện đại cùng 2 nhà hàng và bar sang trọng. Vị trí thuận tiện cách sân bay Tân Sơn Nhất khoảng 30 phút lái xe và gần các điểm tham quan nổi tiếng như Nhà hát Lớn, Chợ Bến Thành."
      }
    ]
  },
  {
    "hotel_name": "Renaissance Riverside Saigon Hotel",
    "conversations": [
      {
        "role": "human",
        "content": "Khách sạn Renaissance Riverside Saigon nằm ở địa chỉ nào vậy?"
      },
      {
        "role": "gpt",
        "content": "Renaissance Riverside Saigon Hotel tọa lạc tại địa chỉ 8 - 15, Ton Duc Thang, Quận 1,

In [40]:
# Display the last element of list_conversations (the most recently appended entry).
list_conversations[-1]

{'hotel_name': 'Vân Anh Luxury Hotel',
 'conversations': [{'role': 'human',
   'content': 'Vân Anh Luxury Hotel có phục vụ phòng twin/double không?'},
  {'role': 'gpt',
   'content': 'Khách sạn có phòng Twin/Double Room, ngoài ra còn nhiều lựa chọn phòng Family và Deluxe phù hợp với nhu cầu giường đôi hoặc nhiều giường.'}]}

In [46]:
# Generate conversations for one specific hotel and append them to the
# existing list_conversations.
# NOTE: list_conversations is not reset here, so re-running this cell appends
# the same hotel's conversations again.
TARGET_HOTEL_ID = 21

for index, hotel in enumerate(hotels_list):
    # Skip everything except the Vietnamese-language record with the target id.
    if hotel['language'] != "vi" or hotel['id'] != TARGET_HOTEL_ID:
        continue

    prompt = PROMPT_FAQ.format(hotel_information=str(hotel))
    response = call_openai_api(prompt)

    print(response)

    hotel_conversations = extract_conversations(response)

    # A failed API call or an unparseable reply yields nothing; report it as a
    # failure instead of logging a misleading success with 0 conversations.
    if not hotel_conversations:
        print(f"Failed to process hotel: {index} (no conversations extracted)")
        continue

    print(f"Processed successfully hotel: {index} with {len(hotel_conversations)} conversations")

    list_conversations += hotel_conversations

[
  {
    "conversations": [
      {
        "from": "human",
        "value": "Bạn có thể cho tôi biết các loại phòng hiện có tại khách sạn The Hammock Hotel Ben Thanh được không?"
      },
      {
        "from": "gpt",
        "value": "Tại The Hammock Hotel Ben Thanh, có các loại phòng chính bao gồm: Cozy Double Nest, Deluxe Double Nest, Family Room, Executive Double Nest, Superior Nest, Small Nest, Extra Larger Nest, Medium Nest, Deluxe Nest, The Nest, và một số loại phòng đặc biệt khác. Mỗi loại phòng có những lựa chọn cụ thể và tiện nghi khác nhau để đáp ứng nhu cầu đa dạng của khách."
      }
    ],
    "hotel_name": "The Hammock Hotel Ben Thanh"
  },
  {
    "conversations": [
      {
        "from": "human",
        "value": "Mình muốn biết chi tiết về các lựa chọn phòng thuộc nhóm Family Room tại The Hammock Hotel Ben Thanh có thể cung cấp cho mình không?"
      },
      {
        "from": "gpt",
        "value": "Trong nhóm Family Room tại The Hammock Hotel Ben Thanh, bạn có

## Save to file

In [47]:
# Write every collected conversation object to a single JSON file.
save_to_file(list_conversations, '0-preprocessing.json')

Saved 349 entries to 0-preprocessing.json


## Push to Hugging Face

In [31]:
# from huggingface_hub import login

# login(token=HF_TOKEN)

In [32]:
# from datasets import Dataset
# import json

# with open("test.json", "r", encoding="utf-8") as file:
#     data = json.load(file)

# dataset = Dataset.from_list(data)

In [33]:
# from huggingface_hub import HfApi

# dataset_name = "tyanfarm/hotel_conversations_118"
# api = HfApi()
# api.create_repo(repo_id=dataset_name, repo_type="dataset", exist_ok=True)

In [34]:
# dataset.push_to_hub(dataset_name)