In [1]:
# 1) OpenAI setup (API key uit .env)

!pip install -q openai python-dotenv

import os
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise ValueError("API key not found. Zet OPENAI_API_KEY in je .env")

client = OpenAI(api_key=OPENAI_API_KEY)


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
!pip install -q pandas openpyxl

import os
import time
import warnings
import pandas as pd

os.environ["LC_ALL"] = "C.UTF-8"
os.environ["LANG"] = "C.UTF-8"

warnings.filterwarnings(
    "ignore",
    message="Unknown extension is not supported and will be removed",
    category=UserWarning,
    module="openpyxl"
)

# ====== CONFIG ======
EXCEL_PATH = r"/home/yasin/Git/Cross-domain-recommender/experiment/notebook/complete dataset.xlsx"
SHEET_NAME = 0

NAME_COL = "name"
OVERVIEW_COL = "source_overview"
TYPE_COL = "item_type"
GENRE_COL = "Simplified genre"
ENRICHED_COL = "enriched_description"

# Optionele kolommen (als ze bestaan worden ze gebruikt, anders leeg)
YEAR_COL = "Year"
SOURCE_GENRES_COL = "source_genres"
CREATED_BY_COL = "created_by"

MODEL = "gpt-4o-mini"
MAX_RETRIES = 5
SLEEP_BETWEEN_CALLS_SEC = 0.3


def is_missing(value) -> bool:
    if value is None:
        return True
    try:
        return pd.isna(value)
    except Exception:
        return str(value).strip() == ""


def safe_get_str(df: pd.DataFrame, row_idx: int, col: str) -> str:
    """Return string value for cell, or empty string if column missing/NaN/blank."""
    if col not in df.columns:
        return ""
    v = df.at[row_idx, col]
    if is_missing(v):
        return ""
    return str(v).strip()


def build_prompt_v2(
    name: str,
    item_type: str,
    year: str,
    source_genres: str,
    simplified_genre: str,
    created_by: str,
    overview: str
) -> str:
    # Fallbacks zodat placeholders altijd bestaan
    name = (name or "").strip()
    item_type = (item_type or "").strip()
    year = (year or "").strip()
    source_genres = (source_genres or "").strip()
    simplified_genre = (simplified_genre or "").strip()
    created_by = (created_by or "").strip()
    overview = (overview or "").strip()

    # Maak de INPUT-sectie altijd geldig, ook als velden leeg zijn
    prompt = f"""
ROLE
You are an expert Data Curator. You receive raw data about a {item_type if item_type else "media item"}
and produce one coherent, semantically rich, atmospheric synopsis.

INPUT
---
Title: {name}
Year: {year}
Genre Tags: {source_genres} (Primary Genre: {simplified_genre})
Creators/Authors: {created_by}
Raw Description: {overview if overview else "N/A"}
---

INSTRUCTIONS
• Write a spoiler-free synopsis of 200–300 words.
• Focus on setting, central tensions, themes, and atmosphere.
• Rewrite inconsistent, minimal, or marketing-like source material into a smooth,
  neutral-literary narrative style.
• Use your own knowledge of the item only to deepen atmosphere, thematic resonance,
  and implicit tension — never to introduce factual details not inferable from the input.
• Do not invent plot points or specify events that are not supported by the input; interpret, do not fabricate.

• Subtly but clearly reflect the provided genre tags in mood, thematic accents,
  world-building, and terminology. Allow elements of {simplified_genre} and the broader tags
  to resonate through the synopsis without naming genres explicitly.
• Integrate 2–4 genre-typical features (e.g., Sci-Fi elements, adventure-driven dynamics,
  emotional or moral conflict) as semantic anchors for embedding quality.

• Avoid all metadata: no years, no factual trivia, no names of actors, directors, or authors.
• No meta-language: do not say things like “this film/series/book tells the story…”
• Maintain stylistic consistency: neutral-literary tone with clear structure:
  setting introduction → core conflict → thematic layer → closing atmosphere.

OUTPUT
Only output the final synopsis.
""".strip()

    return prompt


def generate_enriched_description(name, item_type, year, source_genres, simplified_genre, created_by, overview) -> str:
    prompt = build_prompt_v2(
        name=name,
        item_type=item_type,
        year=year,
        source_genres=source_genres,
        simplified_genre=simplified_genre,
        created_by=created_by,
        overview=overview,
    )

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            response = client.chat.completions.create(
                model=MODEL,
                messages=[
                    {"role": "system", "content": "Only output the final synopsis. No extra text."},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.4,
            )

            text = (response.choices[0].message.content or "").strip()
            if len(text) < 120:
                raise ValueError("Output too short")
            return text

        except Exception as exc:
            wait = min(2 ** attempt, 20)
            print(f"[WARN] Attempt {attempt}/{MAX_RETRIES} failed: {exc}")
            time.sleep(wait)

    raise RuntimeError("AI call failed after all retries")


# ====== READ EXCEL ======
raw = pd.read_excel(EXCEL_PATH, sheet_name=SHEET_NAME)
df = raw[next(iter(raw))] if isinstance(raw, dict) else raw

missing_columns = [c for c in (NAME_COL, ENRICHED_COL) if c not in df.columns]
if missing_columns:
    raise ValueError(f"Missing required columns: {missing_columns}")

# Alleen verrijken als enriched_description leeg is
rows_to_process = df[df[ENRICHED_COL].apply(is_missing)].index.tolist()
print(f"Rows to enrich: {len(rows_to_process)} / {len(df)}")

for index, row_idx in enumerate(rows_to_process, start=1):
    name = safe_get_str(df, row_idx, NAME_COL)
    if not name:
        continue

    item_type = safe_get_str(df, row_idx, TYPE_COL)
    simplified_genre = safe_get_str(df, row_idx, GENRE_COL)
    overview = safe_get_str(df, row_idx, OVERVIEW_COL)

    year = safe_get_str(df, row_idx, YEAR_COL)
    source_genres = safe_get_str(df, row_idx, SOURCE_GENRES_COL)
    created_by = safe_get_str(df, row_idx, CREATED_BY_COL)

    print(f"[{index}/{len(rows_to_process)}] {name}")
    df.at[row_idx, ENRICHED_COL] = generate_enriched_description(
        name=name,
        item_type=item_type,
        year=year,
        source_genres=source_genres,
        simplified_genre=simplified_genre,
        created_by=created_by,
        overview=overview,
    )

    time.sleep(SLEEP_BETWEEN_CALLS_SEC)

# ====== WRITE OUTPUT ======
output_path = EXCEL_PATH.replace(".xlsx", "_enriched.xlsx")
df.to_excel(output_path, index=False)
print("Saved:", output_path)


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Rows to enrich: 140 / 140
[1/140] The Mandalorian



As he traverses desolate planets and bustling spaceports, the bounty hunter encounters a diverse array of characters—some allies, others adversaries—each with their own agendas and secrets. The weight of his past looms large, shaping his choices and challenging his resolve. In this unforgiving environment, the hunter must grapple with themes of identity, loyalty, and the quest for redemption, all while evading the shadows of his own making.

The atmosphere is thick with tension, as the hunter's journey unfolds against a backdrop of stunning vistas and gritty underbellies, where every corner turned could lead to fortune or peril. With each encounter, the stakes rise, and the hunter's resolve is tested, forcing him to confront not only the dangers of the galaxy but also the deeper conflicts within himself. In this relentless pursuit of purpose, the hunter discovers that sometimes, the greatest battles are fought not against others, but within one's own soul.' has dtype incompatible with

[2/140] The 100
[3/140] Westworld
[4/140] Black Mirror
[5/140] The X-Files
[6/140] Game of Thrones
[7/140] Wednesday
[8/140] Attack on Titan
[9/140] House of the Dragon
[10/140] The Originals
[11/140] Money Heist
[12/140] Sherlock
[13/140] Better Call Saul
[14/140] Mr. Robot
[15/140] Dexter
[16/140] Breaking Bad
[17/140] The Good Doctor
[18/140] Grey's Anatomy
[19/140] Euphoria
[20/140] Peaky Blinders
[21/140] The Haunting of Hill House
[22/140] Scream Queens
[23/140] 1899
[24/140] The Outsider
[25/140] It
[26/140] The Big Bang Theory
[27/140] Friends
[28/140] How I Met Your Mother
[29/140] The Office
[30/140] Brooklyn Nine-Nine
[31/140] Yo soy Betty, la fea
[32/140] Teresa
[33/140] La hija del Mariachi
[34/140] Amor Real
[35/140] My Heart is Yours
[36/140] The Godfather
[37/140] The Shawshank Redemption
[38/140] Fight Club
[39/140] One Flew Over the Cuckoo's Nest
[40/140] Once Upon a Time in America
[41/140] Star Wars
[42/140] Inception
[43/140] The Matrix
[44/140] Eternal Sunshine of