In [None]:
import polars as pl
from dotenv import load_dotenv
from openai import OpenAI
import os

import numpy as np

In [None]:
# Set up the OpenAI client used by the question-generation cells below.
load_dotenv()  # read variables from a local .env file into the process environment

# No credentials are passed explicitly here.
# NOTE(review): presumably the client picks up the API key from the environment
# populated by load_dotenv() above — confirm the expected variable name in .env.
client = OpenAI()

In [None]:
# Load the source posts. Later cells read the columns id, title, selftext and
# comment1..comment5 from this frame.
# NOTE(review): the path is relative, so the notebook must be run from its own folder.
df = pl.read_csv("../data/japantravel_posts_with_comments.csv")

In [61]:
# The five comment columns, in display order.
comment_cols = [f"comment{idx}" for idx in range(1, 6)]

# Replace missing text with empty strings so the concatenation below never
# yields a null for a post with fewer than five comments.
df = df.with_columns(
    [pl.col(col_name).fill_null('') for col_name in ["selftext", *comment_cols]]
)

# One labelled piece per comment: "Comment <n>: <text>".
labelled = [
    f"Comment {idx}: " + pl.col(col_name)
    for idx, col_name in enumerate(comment_cols, start=1)
]

# Join the labelled pieces with newlines into a single expression.
combined_expr = labelled[0]
for piece in labelled[1:]:
    combined_expr = combined_expr + "\n" + piece

# Store the merged text and discard the individual comment columns.
df = df.with_columns(combined_expr.alias("comments_combined")).drop(comment_cols)


In [62]:
# One dict per row (column name -> value), so each record can be unpacked
# into the prompt template with str.format(**doc).
documents = df.to_dicts()

In [83]:
# Prompt sent to the chat model for each record. It is filled with
# str.format(**doc), so {title}, {selftext} and {comments_combined} must be
# keys of every record; the doubled braces {{ }} are escapes that render as
# literal braces in the JSON example.
prompt_template = """
You are planning a trip to Japan. 
Formulate 5 questions and its answer related to planning a Japan trip based on the travel recommendation database. 
The record should contain the answer to the questions and the questions should be complete and not too short. 
If possible, use as few words as possible from the record.

Record:
Title: {title}
Content: {selftext}
Comments: {comments_combined}

Provide the output strictly in JSON format as a list of dictionaries, 
where each object has the fields "question" and "answer", like this:

[
  {{"question": "example question 1", "answer": "example answer 1"}},
  {{"question": "example question 2", "answer": "example answer 2"}},
  ...
]
"""

In [12]:
def generate_questions(doc, model='gpt-4o-mini'):
    """Ask the chat model to write question/answer pairs for one record.

    Args:
        doc: Mapping that provides every placeholder used by the module-level
            ``prompt_template`` (``title``, ``selftext``, ``comments_combined``).
            Extra keys such as ``id`` are ignored by ``str.format``.
        model: Name of the chat-completions model to call. Defaults to
            ``'gpt-4o-mini'``, the value that used to be hard-coded, so
            existing one-argument calls behave as before.

    Returns:
        The raw message content of the first choice in the response. The
        prompt asks for a JSON list, but the text is returned unparsed.

    Raises:
        KeyError: If ``doc`` lacks a placeholder required by the template.
    """
    prompt = prompt_template.format(**doc)

    # Single-turn request: the whole filled-in prompt is sent as one user message.
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content



In [93]:
from tqdm.notebook import tqdm

# Raw model output for every record, keyed by the record's id.
# NOTE: re-running this cell starts from an empty dict and repeats one API
# call per record.
results = {}

for doc in tqdm(documents):
    # generate_questions() fills the prompt template itself, so nothing is
    # formatted here (the previous extra format call was never used).
    results[doc['id']] = generate_questions(doc)

  0%|          | 0/445 [00:00<?, ?it/s]

In [107]:
import re
import json
from json_repair import repair_json

def robust_json_loads(text: str):
    """
    Parse ``text`` as JSON, falling back to json-repair on failure.

    The text is first handed to ``json.loads`` unchanged. Only when that
    raises ``json.JSONDecodeError`` is it passed through ``repair_json``
    and the repaired string parsed instead. A failure while parsing the
    repaired string is not caught.
    """
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        # Second attempt: let json-repair rewrite the text, then parse that.
        parsed = json.loads(repair_json(text))
    return parsed
    
# Matches the Markdown code-fence markers the model may wrap around its JSON.
fence_pattern = re.compile(r"```json|```")

# Strip the fences and surrounding whitespace from each raw reply, then parse
# it; keys are the same ids used in `results`.
parsed_results = {
    doc_id: robust_json_loads(fence_pattern.sub("", reply).strip())
    for doc_id, reply in results.items()
}


In [113]:
# Flatten {id: [qa, ...]} into one entry per question/answer pair, kept in
# two shapes: (question, answer, id) tuples and dict rows.
final_results_list = []
final_results = []
for doc_id, qa_pairs in parsed_results.items():
    for qa in qa_pairs:
        question = qa['question']
        answer = qa['answer']
        final_results_list.append((question, answer, doc_id))
        row = {'id': doc_id, 'question': question, 'answer': answer}
        final_results.append(row)

In [None]:
# Turn the flattened rows (id, question, answer) into a frame and save it as
# CSV; an existing file at this relative path is overwritten.
df_ground_truth = pl.DataFrame(final_results)
df_ground_truth.write_csv('../data/ground_truth.csv') 