In [1]:
import pandas as pd, json, random, pathlib

# Step 1: Load dataset and keep only rows that have a rating
TARGET_COL = "vote_average"  # This is the value we want the model to predict

df = pd.read_csv("sample_ordered.csv")
# Rows without a target value cannot be used as labelled examples, so drop
# them; reset_index(drop=True) renumbers the remaining rows from 0.
df = df[df[TARGET_COL].notna()].reset_index(drop=True)

# Step 2: Helper function to safely convert any value to a string
def safe(val) -> str:
    """Render a single cell value as a one-line string.

    Lists and dicts are serialised as JSON (non-ASCII characters kept as-is).
    Values that pd.isna() reports as missing (e.g. None, NaN) become "".
    Anything else goes through str(), with carriage returns and newlines
    replaced by spaces and surrounding whitespace stripped.
    """
    # Containers must be handled before pd.isna(): for a list, pd.isna()
    # returns an element-wise array whose truth value is ambiguous, so using
    # it in an `if` raises ValueError instead of formatting the list.
    if isinstance(val, (list, dict)):
        return json.dumps(val, ensure_ascii=False)
    if pd.isna(val):
        return ""
    return str(val).replace("\r", " ").replace("\n", " ").strip()

# Step 3: Format each row into a prompt using all available feature columns
def row_to_prompt(row) -> str:
    """Build the prompt text for one row.

    Args:
        row: Mapping of column name to value, e.g. a pandas Series as yielded
            by ``DataFrame.iterrows()`` or a plain dict.

    Returns:
        The prompt string: a "### MOVIE FEATURES" section with one
        "column: value" line for every column except TARGET_COL, followed by
        the "### TASK" instruction and an open "### ANSWER" header after
        which the caller appends the label.
    """
    # Walk the row's own (column, value) pairs instead of the module-level
    # DataFrame, so the result depends only on the argument passed in.
    lines = [
        f"{col}: {safe(value)}"
        for col, value in row.items()
        if col != TARGET_COL
    ]
    features_block = "\n".join(lines)
    return (
        "### MOVIE FEATURES\n"
        f"{features_block}\n\n"
        "### TASK\n"
        "Predict this movie's TMDB rating on a 0–10 scale (one decimal place).\n\n"
        "### ANSWER\n"
    )

# Step 4: Pair every prompt with its label to form the JSONL records
# NOTE(review): the label is the raw str() of the target value, while the
# prompt asks for one decimal place — confirm the column is already rounded.
records = [
    # One record per row: the prompt with the target value appended as text
    {"text": "".join([row_to_prompt(movie), str(movie[TARGET_COL])])}
    for _idx, movie in df.iterrows()
]

# Step 5: Shuffle and split into training and dev sets (80/20 split)
random.seed(42)  # fixed seed so the shuffle, and therefore the split, is repeatable
random.shuffle(records)

# The dev set is the first 20% of the shuffled records (dev_size is at least 1);
# everything after it is the training set.
dev_size = max(1, round(0.2 * len(records)))
dev = records[:dev_size]
train = records[dev_size:]

# Step 6: Save train and dev sets to JSONL files
for out_name, out_rows in (("train.jsonl", train), ("dev.jsonl", dev)):
    # One JSON object per line, joined with "\n" (no newline after the last one)
    serialized = [json.dumps(rec, ensure_ascii=False) for rec in out_rows]
    pathlib.Path(out_name).write_text("\n".join(serialized), encoding="utf-8")

print(f"Saved {len(train)} train rows and {len(dev)} dev rows.")

Saved 40 train rows and 10 dev rows.
