# LLM Book Processing Notebook

In [1]:
# 1) OpenAI setup (API key uit .env)

!pip install -q openai python-dotenv

import os
from dotenv import load_dotenv
from openai import OpenAI

# Load variables from a local .env file (python-dotenv) before reading the key.
load_dotenv()

# Fail fast when the key is absent or empty, so later API calls never run
# with a missing credential.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if OPENAI_API_KEY is None or OPENAI_API_KEY == "":
    raise ValueError("API key not found. Zet OPENAI_API_KEY in je .env")

# Shared client object used by the API calls further down the notebook.
client = OpenAI(api_key=OPENAI_API_KEY)


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [8]:
!pip install -q pandas openpyxl

import os
import time
import warnings
import pandas as pd

# Set the locale variables in this process's environment (also inherited by
# any child process started from the notebook).
os.environ["LC_ALL"] = "C.UTF-8"
os.environ["LANG"] = "C.UTF-8"

# Silence the specific openpyxl UserWarning about unknown workbook extensions;
# all other warnings stay visible.
warnings.filterwarnings(
    "ignore",
    message="Unknown extension is not supported and will be removed",
    category=UserWarning,
    module="openpyxl"
)

# Input workbook. The default is the original absolute path; set the
# ENRICH_EXCEL_PATH environment variable to run against another file without
# editing the notebook.
EXCEL_PATH = os.environ.get(
    "ENRICH_EXCEL_PATH",
    r"/home/yasin/Git/Cross-domain-recommender/experiment/notebook/complete dataset.xlsx",
)
# Sheet selector passed to pandas.read_excel (0 = first sheet).
SHEET_NAME = 0

# Column names used by the enrichment loop.
NAME_COL = "name"                      # required: item title
OVERVIEW_COL = "source_overview"       # optional: context text for the prompt
TYPE_COL = "item_type"                 # optional
GENRE_COL = "Simplified genre"         # optional
ENRICHED_COL = "enriched_description"  # required: rows with a missing value here get filled

# Model and retry/throttle settings for the API calls.
MODEL = "gpt-4o-mini"
MAX_RETRIES = 5                # attempts per item before giving up
SLEEP_BETWEEN_CALLS_SEC = 0.3  # pause after each processed row


def is_missing(value) -> bool:
    """Return True when a cell value should be treated as empty.

    A value is missing when it is None, a pandas/NumPy missing marker
    (NaN, NaT, pd.NA), or a string that is empty or whitespace-only.

    Args:
        value: Any scalar read from a DataFrame cell.

    Returns:
        A plain Python bool.
    """
    if value is None:
        return True
    # pd.isna("") is False, so blank strings must be checked explicitly.
    if isinstance(value, str):
        return value.strip() == ""
    try:
        # bool() turns numpy.bool_ into a real bool; for array-like input it
        # raises and the fallback below is used.
        return bool(pd.isna(value))
    except Exception:
        return str(value).strip() == ""


def build_prompt(name, item_type, genre, overview) -> str:
    """Build the Dutch user prompt asking for an enriched description.

    Args:
        name: Item title, inserted verbatim.
        item_type: Optional item type; None or empty becomes "".
        genre: Optional simplified genre; None or empty becomes "".
        overview: Optional context text. When it is empty or whitespace-only,
            the prompt tells the model that no context is available.

    Returns:
        The prompt text. With context it ends in a ``Context:`` section that
        wraps the stripped overview in triple double quotes.
    """
    # str() keeps non-string cell values (e.g. numbers) from breaking .strip().
    item_type = str(item_type or "").strip()
    genre = str(genre or "").strip()
    context = str(overview or "").strip()

    prompt = f"""
Schrijf een verrijkte, informatieve beschrijving (enriched_description) voor dit item.

Naam: "{name}"
Type: "{item_type}"
Simplified genre: "{genre}"

Eisen:
- 120 tot 200 woorden
- Nederlands
- Objectief en informatief
- Geen spoilers buiten de hoofdlijn
- Thema's, motieven, toon, stijl en narratieve structuur impliciet verwerkt
- Eén samenhangende alinea
""".strip()

    if context:
        # Trailing newline kept so the layout matches the previous output.
        prompt += f'\n\nContext:\n"""{context}"""\n'
    else:
        prompt += "\n\nGeen context beschikbaar; blijf algemeen."

    return prompt


def generate_enriched_description(name, item_type, genre, overview) -> str:
    """Request an enriched description from the chat model, with retries.

    The prompt comes from ``build_prompt``. Each attempt calls
    ``client.chat.completions.create`` with ``MODEL``; a reply shorter than
    40 characters counts as a failure. Failed attempts are logged and retried
    after a back-off of ``min(2 ** attempt, 20)`` seconds.

    Args:
        name: Item title.
        item_type: Optional item type.
        genre: Optional simplified genre.
        overview: Optional context text.

    Returns:
        The stripped model reply.

    Raises:
        RuntimeError: When all ``MAX_RETRIES`` attempts fail. The last
            underlying exception is attached as ``__cause__``.
    """
    prompt = build_prompt(name, item_type, genre, overview)
    last_error = None

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            response = client.chat.completions.create(
                model=MODEL,
                messages=[
                    {"role": "system", "content": "Geef uitsluitend de enriched_description terug."},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.4,
            )

            # content may be None; treat that as an empty (too short) reply.
            text = (response.choices[0].message.content or "").strip()
            if len(text) < 40:
                raise ValueError("Output te kort")
            return text

        except Exception as exc:
            last_error = exc
            print(f"[WARN] Poging {attempt}/{MAX_RETRIES} mislukt: {exc}")
            # No point waiting after the final attempt; raise straight away.
            if attempt < MAX_RETRIES:
                time.sleep(min(2 ** attempt, 20))

    raise RuntimeError("AI-call faalde na alle retries") from last_error


# Load the workbook. pd.read_excel returns a dict of frames when sheet_name
# is a list or None; in that case use the first sheet.
raw = pd.read_excel(EXCEL_PATH, sheet_name=SHEET_NAME)
df = raw[next(iter(raw))] if isinstance(raw, dict) else raw

missing_columns = [c for c in (NAME_COL, ENRICHED_COL) if c not in df.columns]
if missing_columns:
    raise ValueError(f"Ontbrekende kolommen: {missing_columns}")

# An all-empty column is read as float64; object dtype lets strings be
# written into it without a dtype mismatch.
df[ENRICHED_COL] = df[ENRICHED_COL].astype(object)

# astype(bool) keeps the mask boolean even when the frame has no rows.
rows_to_process = df[df[ENRICHED_COL].apply(is_missing).astype(bool)].index.tolist()
print(f"Rijen te verrijken: {len(rows_to_process)} / {len(df)}")

# Always write a separate .xlsx next to the input. str.replace would return
# the input path unchanged for other extensions and overwrite the source.
output_path = os.path.splitext(EXCEL_PATH)[0] + "_enriched.xlsx"


def _optional_cell(row_idx, column):
    """Return the stripped cell text, or None if the column or value is absent."""
    if column not in df.columns:
        return None
    value = df.at[row_idx, column]
    return None if is_missing(value) else str(value).strip()


try:
    for index, row_idx in enumerate(rows_to_process, start=1):
        raw_name = df.at[row_idx, NAME_COL]
        # Check before str(): str(NaN) is "nan", which would be sent as a title.
        if is_missing(raw_name):
            continue
        name = str(raw_name).strip()
        if not name:
            continue

        item_type = _optional_cell(row_idx, TYPE_COL)
        genre = _optional_cell(row_idx, GENRE_COL)
        overview = _optional_cell(row_idx, OVERVIEW_COL)

        print(f"[{index}/{len(rows_to_process)}] {name}")
        df.at[row_idx, ENRICHED_COL] = generate_enriched_description(name, item_type, genre, overview)

        time.sleep(SLEEP_BETWEEN_CALLS_SEC)
finally:
    # Save whatever was produced, also when a row fails after all retries or
    # the run is interrupted, so completed API calls are not lost.
    df.to_excel(output_path, index=False)
    print("Opgeslagen:", output_path)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
df type: <class 'pandas.core.frame.DataFrame'>
Kolommen: ['db_ID', 'source_id', 'item_type', 'name', 'vote_count', 'vote_average', 'source_overview', 'Year', 'source_genres', 'Simplified genre', 'created_by / director / author', 'enriched_description']
Rijen te verrijken: 70 / 140
[1/70] Verrijken: The Long Walk
[2/70] Verrijken: Lost Worlds: Volume 1: Zothique, Averoigne and Others
[3/70] Verrijken: Buy Jupiter and Other Stories
[4/70] Verrijken: Incarnate (Newsoul, #1)
[5/70] Verrijken: Zero Echo Shadow Prime
[6/70] Verrijken: Seize the Fire (Star Trek: Typhon Pact, #2)
[7/70] Verrijken: Emerald Envisage
[8/70] Verrijken: The Doomsday Vault (Clockwork Empire #1)
[9/70] Verrijken: Grasshopper Jungle
[10/70] Verrijken: Tin Woodman
[11/7