In [None]:
import os
from dotenv import load_dotenv

# Read variables from a .env file into the process environment
load_dotenv()

# API credentials; each one is None when the variable is not set
HF_TOKEN = os.environ.get("HF_TOKEN")
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")

## Load hotel data from the JSON file

In [None]:
import json

file_path = "./data/hotel.json"

# Parse the hotel records stored on disk
with open(file_path, mode='r', encoding='utf-8') as hotel_file:
    hotels_list = json.loads(hotel_file.read())

# Quick sanity check: how many records were loaded
print(len(hotels_list))

In [None]:
# Instruction prompt sent to the LLM once per hotel record.
# The text is filled in with str.format: `{hotel_information}` is the only
# placeholder, and the doubled braces `{{ }}` render as literal `{ }` in the
# inline JSON message examples.
PROMPT_TEMPLATE = """
You are a data conversion assistant. Your task is to transform each Q&A pair into 6 diverse multi-turn conversations in ShareGPT fine-tuning format.

Each input Q&A pair relates to a specific hotel. Your output must help the assistant learn to respond naturally and accurately for that **specific hotel**.

For each input Q&A pair, generate 6 JSON objects. Each object represents a distinct conversation with:
- 4 user questions and 4 assistant answers (8 messages in total).
- The **first user question** in each conversation must explicitly mention the hotel name (e.g., “Renaissance Riverside”, “Khách sạn Mường Thanh”) in a natural and varied way.
- Questions should ask the same core idea in different ways, using varied tone, length, and style (casual, formal, brief, detailed, etc.).
- **Only one conversation** (out of the six) should include a general question about the hotel’s overall information (e.g., “Cho tôi thông tin khách sạn Renaissance Riverside”, “Nói cho tôi biết về khách sạn Mường Thanh”) and the answer is full description of the hotel.
- **Only one different conversation** (out of the six) include a question asking specifically about the hotel’s location or address.

- Assistant replies should convey the same meaning (i.e., the original answer), but phrased to match the tone and wording of each question.

Each JSON object must follow the ShareGPT JSONL schema:
- Must have a top-level field: `conversations`.
- `conversations` is a list of 8 alternating messages.
- Messages alternate between `{{"from": "human", "value": ...}}` and `{{"from": "gpt", "value": ...}}`.

Output: a list of 6 JSON objects (no explanation, no extra text).

Here is hotel's information:
{hotel_information}
"""

## Send to LLM

In [None]:
from openai import OpenAI 

# Shared OpenAI SDK client, pointed at the OpenRouter endpoint and
# authenticated with the OpenRouter key loaded from the environment.
CLIENT = OpenAI(api_key=OPENROUTER_API_KEY, base_url="https://openrouter.ai/api/v1")


In [None]:
from typing import Dict, List

def call_openrouter_api(prompt: str) -> str:
    """Send `prompt` as a single user message and return the reply text.

    Any exception raised while calling the API or reading the reply is
    reported on stdout and converted into a ``None`` return value, so the
    caller must be prepared to receive ``None``.
    """
    user_message = {"role": "user", "content": prompt}

    try:
        reply = CLIENT.chat.completions.create(
            model="qwen/qwen3-235b-a22b-2507:free",
            messages=[user_message],
        )
        # Only the first choice of the completion is used.
        first_choice = reply.choices[0]
        return first_choice.message.content
    except Exception as error:
        print(f"API called failed: {error}")
        return None

def extract_conversations(api_response: str) -> List[Dict]:
    """Parse the LLM reply into a list of conversation objects.

    Args:
        api_response: Raw text returned by the model, or ``None``/empty when
            the API call failed.

    Returns:
        A list of dicts parsed from the reply. An empty list is returned when
        the reply is missing, is not valid JSON, or does not decode to a list
        (or to a single object carrying a ``conversations`` field).
    """
    if not api_response:
        return []

    payload = api_response.strip()

    # Models frequently wrap JSON in a Markdown code fence (```json ... ```)
    # even when told not to; strip the fence so the payload can be parsed.
    if payload.startswith("```"):
        first_newline = payload.find("\n")
        payload = payload[first_newline + 1:] if first_newline != -1 else ""
        payload = payload.rstrip()
        if payload.endswith("```"):
            payload = payload[:-3]

    try:
        conversations_data = json.loads(payload)
    except json.JSONDecodeError as e:
        print(f"Failed to parse API response: {e}")
        return []

    # A single conversation object instead of a list: wrap it so the caller
    # always receives a list (extending a list with a dict would add its keys).
    if isinstance(conversations_data, dict):
        if "conversations" in conversations_data:
            return [conversations_data]
        print("Failed to parse API response: JSON object has no 'conversations' field")
        return []

    if not isinstance(conversations_data, list):
        print("Failed to parse API response: expected a JSON list")
        return []

    # Keep only object entries; anything else cannot be a conversation record.
    return [entry for entry in conversations_data if isinstance(entry, dict)]

def save_to_file(data: List[Dict], filename: str) -> None:
    """Write `data` to `filename` as one UTF-8 encoded JSON array.

    The whole list is serialized as a single JSON document so that the file
    can be read back with ``json.load``. Non-ASCII text (e.g. Vietnamese) is
    written as-is rather than as ``\\uXXXX`` escapes.

    Args:
        data: Conversation objects to persist; an empty list produces ``[]``.
        filename: Destination path; an existing file is overwritten.
    """
    with open(filename, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=2)
        file.write("\n")
    print(f'Saved {len(data)} entries to {filename}')

## Loop through each hotel JSON object

In [None]:
# Flat list collecting the conversations generated for every hotel
list_conversations = []

for index, hotel in enumerate(hotels_list):
    # Give the model real JSON rather than a Python repr (single quotes,
    # None/True); ensure_ascii=False keeps non-ASCII text readable.
    hotel_information = hotel if isinstance(hotel, str) else json.dumps(hotel, ensure_ascii=False)
    prompt = PROMPT_TEMPLATE.format(hotel_information=hotel_information)
    response = call_openrouter_api(prompt)

    hotel_conversations = extract_conversations(response)
    # Report failures explicitly instead of logging every hotel as a success.
    if hotel_conversations:
        print(f"Processed successfully hotel: {index} with {len(hotel_conversations)} conversations")
    else:
        print(f"No conversations generated for hotel: {index} (API call or response parsing failed)")

    list_conversations.extend(hotel_conversations)

In [None]:
# Persist every generated conversation to disk before uploading
save_to_file(data=list_conversations, filename='test.json')

## Push to Hugging Face

In [None]:
from huggingface_hub import login

# Authenticate with the Hugging Face Hub using the token read from the environment
login(token=HF_TOKEN)

In [None]:
from datasets import Dataset
import json

with open("test.json", "r", encoding="utf-8") as file:
    raw_text = file.read()

try:
    # Normal case: the file holds one JSON document (a list of conversations).
    data = json.loads(raw_text)
except json.JSONDecodeError:
    # Also accept the comma-terminated, one-object-per-line layout
    # (`{...},` on every line), which is not valid JSON as a whole:
    # parse each non-empty line on its own after dropping the trailing comma.
    data = [
        json.loads(line.rstrip().rstrip(","))
        for line in raw_text.splitlines()
        if line.strip()
    ]

dataset = Dataset.from_list(data)

In [None]:
from huggingface_hub import HfApi

# Target dataset repository on the Hugging Face Hub
dataset_name = "tyanfarm/hotel_conversations_118"

api = HfApi()
# exist_ok=True is passed so the call can be repeated for the same repo id
api.create_repo(
    repo_id=dataset_name,
    repo_type="dataset",
    exist_ok=True,
)

In [None]:
# Upload the dataset to the Hub repository named by `dataset_name`
dataset.push_to_hub(dataset_name)