In [1]:
import os, gzip, json, requests, pandas as pd
from tqdm.auto import tqdm
import random
from pathlib import Path

In [14]:
# Cell: build a JSONL of popular post-1980 films, each with two distinct leads
# ---------------------------------------------------------------------------

# -------- tunable parameters --------
N_MOVIES = 1_000            # number of films kept by the popularity cut
YEAR_MIN = 1980             # lower bound applied to startYear
STREAM_ROWS = 1_000_000     # rows per chunk when streaming the principals file
CACHE_DIR = "imdb_cache"    # directory holding the downloaded .tsv.gz files
OUT_PATH = "movies.jsonl"   # destination of the generated JSONL
# ------------------------------------

# Base URL of the IMDb dataset dumps and the file name behind each short key.
IMDB_BASE = "https://datasets.imdbws.com/"
FILES = dict(
    basics="title.basics.tsv.gz",
    ratings="title.ratings.tsv.gz",
    princip="title.principals.tsv.gz",
    names="name.basics.tsv.gz",
)

def cached(path_key):
    """Return the local path of an IMDb dataset file, downloading it on first use.

    Parameters
    ----------
    path_key : str
        Key into ``FILES`` (e.g. ``"basics"``) selecting which dump to fetch.

    Returns
    -------
    str
        Path of the file inside ``CACHE_DIR``.

    Raises
    ------
    KeyError
        If ``path_key`` is not a key of ``FILES``.
    requests.HTTPError
        If the server answers with an error status.

    The download is streamed into ``<path>.part`` and only renamed to the
    final name once it has completed, so an interrupted download can never be
    mistaken for a valid cached file on the next call.
    """
    fname = FILES[path_key]
    os.makedirs(CACHE_DIR, exist_ok=True)
    path = os.path.join(CACHE_DIR, fname)
    if os.path.exists(path):
        return path

    url = IMDB_BASE + fname
    partial = path + ".part"
    print(f"↳ downloading {fname}")
    try:
        with requests.get(url, stream=True, timeout=60) as r:
            r.raise_for_status()
            # A missing Content-Length becomes None so tqdm shows an
            # open-ended counter instead of a 0-byte total.
            size = int(r.headers.get("Content-Length", 0)) or None
            with open(partial, "wb") as f, tqdm(
                total=size, unit="B", unit_scale=True, desc=fname
            ) as bar:
                for chunk in r.iter_content(1 << 20):
                    f.write(chunk)
                    bar.update(len(chunk))
        # Rename only after the whole body was written successfully.
        os.replace(partial, path)
    except BaseException:
        # Also covers KeyboardInterrupt (kernel interrupt): drop the
        # incomplete file before re-raising.
        if os.path.exists(partial):
            os.remove(partial)
        raise
    return path

# 1️⃣  modern feature films with votes
# Every gzip handle is opened in a `with` block so it is released as soon as
# pandas has finished parsing the file.
# NOTE(review): the IMDb TSVs are presumably not CSV-quoted; if titles that
# contain '"' parse incorrectly, pass quoting=csv.QUOTE_NONE — confirm.
cols = ["tconst", "primaryTitle", "titleType", "isAdult", "startYear"]
with gzip.open(cached("basics")) as gz:
    basics = pd.read_csv(gz, sep="\t",
                         usecols=cols, na_values="\\N", low_memory=False)
# Non-numeric years become NaN and are dropped by the >= comparison below.
basics["startYear"] = pd.to_numeric(basics["startYear"], errors="coerce")
basics = basics[(basics["titleType"] == "movie") &
                (basics["isAdult"] == 0) &
                (basics["startYear"] >= YEAR_MIN)]

with gzip.open(cached("ratings")) as gz:
    ratings = pd.read_csv(gz, sep="\t",
                          usecols=["tconst", "numVotes"], na_values="\\N")
ratings["numVotes"] = pd.to_numeric(ratings["numVotes"], errors="coerce")
# Inner join keeps only films that have a rating row; most-voted first.
films = (basics.merge(ratings, on="tconst", how="inner")
               .dropna(subset=["numVotes"])
               .sort_values("numVotes", ascending=False))

# 2️⃣  stream principals to keep memory low
tset = set(films["tconst"])
keep_cols = ["tconst", "ordering", "category", "nconst"]
chunks = []
with gzip.open(cached("princip")) as gz:
    reader = pd.read_csv(gz, sep="\t", usecols=keep_cols,
                         na_values="\\N", chunksize=STREAM_ROWS)
    for chunk in tqdm(reader, desc="scan principals"):
        # Keep only acting credits that belong to one of the selected films.
        chunk = chunk[(chunk["category"].isin(["actor", "actress"])) &
                      (chunk["tconst"].isin(tset))]
        chunks.append(chunk)
principals = pd.concat(chunks, ignore_index=True)

# 3️⃣  attach performer names
with gzip.open(cached("names")) as gz:
    names = pd.read_csv(gz, sep="\t",
                        usecols=["nconst", "primaryName"],
                        na_values="\\N", low_memory=False)
cast = principals.merge(names, on="nconst", how="left").dropna(subset=["primaryName"])

# Sort credits by their ordering value within each film.
cast["ordering"] = pd.to_numeric(cast["ordering"], errors="coerce")
cast = cast.dropna(subset=["ordering"]).sort_values(["tconst", "ordering"])

# 4️⃣  first two distinct actor names per film
def first_two_unique(series):
    """Return the first two distinct values of ``series``, in encounter order.

    Iteration stops as soon as two distinct values have been collected; the
    result holds fewer than two items when ``series`` has fewer distinct
    values.
    """
    picked = []
    already = set()
    for candidate in series:
        if candidate in already:
            continue
        already.add(candidate)
        picked.append(candidate)
        if len(picked) == 2:
            break
    return picked

# One list of (up to) two distinct lead names per film; `cast` is already
# sorted by ordering within each tconst, so the first names are the top-billed.
pairs = (cast.groupby("tconst")["primaryName"]
              .apply(first_two_unique))
pairs = pairs[pairs.str.len() == 2]             # ensure two uniques

# 5️⃣  popularity cut (top-voted)
# groupby sorts `pairs.index` by tconst, so indexing `films` with it would
# reorder the rows by tconst and lose the numVotes ranking. Filtering with
# isin() keeps `films` in its existing most-voted-first order, so head()
# really selects the N_MOVIES most-voted films that have two leads.
eligible = (films[films["tconst"].isin(pairs.index)]
            .head(N_MOVIES)
            .set_index("tconst"))

# 6️⃣  write JSONL
print(f"• writing {len(eligible)} lines → {OUT_PATH}")
with open(OUT_PATH, "w", encoding="utf-8") as f:
    # ids are 1-based and follow the popularity ranking.
    for idx, (tc, row) in enumerate(eligible.iterrows(), 1):
        first, second = pairs[tc]
        json.dump({"first_actor": first,
                   "second_actor": second,
                   "movie_title": row["primaryTitle"],
                   "id": idx}, f, ensure_ascii=False)
        f.write("\n")
print("✅ done")

scan principals: 0it [00:00, ?it/s]

• writing 1000 lines → movies.jsonl
✅ done


In [None]:
# NOTE(review): DATA_DIR is not defined in the visible cells — presumably a
# pathlib.Path configured elsewhere; confirm before running on a fresh kernel.
rmra_path = DATA_DIR / "real_movies_real_actors" / "2025-05-26_11-58-04" / "metadata" / "metadata.jsonl"

# Fixed seed so the shuffled dataset can be regenerated exactly.
SHUFFLE_SEED = 0
rng = random.Random(SHUFFLE_SEED)

# Load the data (blank lines, e.g. a trailing newline, are skipped)
with open(rmra_path, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# Extract actor lists
first_actors = [item["first_actor"] for item in data]
second_actors = [item["second_actor"] for item in data]

# Shuffle independently, so both columns are decoupled from the movie and
# from each other. If a name occurs in both columns, a record can end up
# with the same actor twice.
rng.shuffle(first_actors)
rng.shuffle(second_actors)

# Reassign shuffled actors
for i, item in enumerate(data):
    item["first_actor"] = first_actors[i]
    item["second_actor"] = second_actors[i]

# Optional: print a few to verify
for item in data[:5]:
    print(item)

{'first_actor': 'Michael Emil', 'second_actor': 'Eloy Herrera', 'movie_title': 'Dama de noche', 'id': 1}
{'first_actor': 'Sasha Montenegro', 'second_actor': 'Ray Milland', 'movie_title': 'Kate & Leopold', 'id': 2}
{'first_actor': 'George C. Scott', 'second_actor': 'Emilio Álvarez', 'movie_title': 'Another Time, Another Place', 'id': 3}
{'first_actor': 'Mark Hamill', 'second_actor': 'Vittorio Mezzogiorno', 'movie_title': 'Shiva und die Galgenblume', 'id': 4}
{'first_actor': 'Daisy Granados', 'second_actor': 'Ferenc Begányi', 'movie_title': 'La rosa de los vientos', 'id': 5}


In [None]:
# Destination of the shuffled metadata; parent directories are created on demand.
output_path = DATA_DIR / "real_movies_real_actors_shuffled" / "2025-06-04_11-58-04" / "metadata" / "metadata.jsonl"

output_path.parent.mkdir(parents=True, exist_ok=True)

# One JSON object per line, non-ASCII characters written as-is.
with open(output_path, "w", encoding="utf-8") as out_file:
    out_file.writelines(
        json.dumps(record, ensure_ascii=False) + "\n" for record in data
    )

In [None]:
import pandas as pd
import json
import os
import ast
import random

# Input CSV, output JSONL and the knobs of this cell.
INPUT_CSV = ARTIFACTS_DIR / "Top_10000_Movies_IMDb.csv"
OUTPUT_JSONL = DATA_DIR / "real_movies_real_actors" / "2025-05-26_11-58-04" / "metadata" / "metadata.jsonl"
N_MOVIES = 1000
SHUFFLE_ACTORS = False  # True shuffles the actor pairs; False keeps the original pairings

df = pd.read_csv(INPUT_CSV)

# Fail early when a column needed below is absent
required_columns = {'Stars', 'Movie Name'}
if not (required_columns <= set(df.columns)):
    raise ValueError(f"Dataset must contain the columns: {required_columns}")

# Extract exactly two unique actor names
def get_first_two_stars(stars_str):
    """Return the first two distinct, non-empty names from a stringified list.

    Parameters
    ----------
    stars_str : str
        Text of a Python list literal of names, e.g.
        ``"['Tim Robbins', 'Morgan Freeman']"``.

    Returns
    -------
    list[str] | None
        The first two distinct names after stripping whitespace, or ``None``
        when the value cannot be parsed, is not a list/tuple of strings, or
        holds fewer than two distinct non-empty names.
    """
    try:
        parsed = ast.literal_eval(stars_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Unparseable cell (including NaN for a missing value).
        return None

    # A bare string literal would otherwise be iterated character by character.
    if not isinstance(parsed, (list, tuple)):
        return None

    leads = []
    for star in parsed:
        if not isinstance(star, str):
            return None
        name = star.strip()
        # Skip blanks and repeated names so the two leads are always distinct.
        if name and name not in leads:
            leads.append(name)
            if len(leads) == 2:
                return leads
    return None

# Filter for rows with two valid actors.
# .copy() makes the filtered frame independent of the original, so the column
# assignments below do not raise SettingWithCopyWarning.
df["LeadActors"] = df["Stars"].apply(get_first_two_stars)
df = df[df["LeadActors"].notnull()].copy()
df["first_actor"] = df["LeadActors"].str[0]
df["second_actor"] = df["LeadActors"].str[1]

# Take top N after filtering
df_top = df.head(N_MOVIES).copy()

# Check for edge case: fewer than N_MOVIES after filtering
if len(df_top) < N_MOVIES:
    print(f"⚠️ Only {len(df_top)} valid movies available after filtering.")

# Shuffle actor pairs if specified (pairs stay together, only their movie changes)
if SHUFFLE_ACTORS:
    actor_pairs = list(zip(df_top["first_actor"], df_top["second_actor"]))
    random.shuffle(actor_pairs)


def _json_safe(value):
    """Map a missing scalar (NaN/NA/None) to None so it is written as JSON null.

    json.dump would otherwise emit the bare token NaN, which is not valid JSON.
    """
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return None
    return value


# Write to JSONL
os.makedirs(os.path.dirname(OUTPUT_JSONL), exist_ok=True)
with open(OUTPUT_JSONL, 'w', encoding='utf-8') as f:
    for idx, (i, row) in enumerate(df_top.iterrows()):
        if SHUFFLE_ACTORS:
            first_actor, second_actor = actor_pairs[idx]
        else:
            first_actor, second_actor = row["first_actor"], row["second_actor"]

        # Optional columns use row.get(), which yields None when the CSV
        # lacks the column.
        json.dump({
            "id": idx + 1,
            "movie_title": row["Movie Name"],
            "first_actor": first_actor,
            "second_actor": second_actor,
            "rating": _json_safe(row.get("Rating")),
            "runtime": _json_safe(row.get("Runtime")),
            "genre": _json_safe(row.get("Genre")),
            "metascore": _json_safe(row.get("Metascore")),
            "plot": _json_safe(row.get("Plot")),
            "directors": _json_safe(row.get("Directors")),
            "votes": _json_safe(row.get("Votes")),
            "gross": _json_safe(row.get("Gross"))
        }, f, ensure_ascii=False)
        f.write('\n')

status = "shuffled" if SHUFFLE_ACTORS else "original"
print(f"✅ Successfully wrote {len(df_top)} {status} entries")

In [9]:
# Peek at the raw "Stars" strings of the first five rows.
print(df["Stars"].iloc[:5].tolist())

["['Tim Robbins', 'Morgan Freeman', 'Bob Gunton', 'William Sadler']", "['Marlon Brando', 'Al Pacino', 'James Caan', 'Diane Keaton']", "['Yûgô Sakô', 'Koichi Saski', 'Arun Govil', 'Nikhil Kapoor', 'Edie Mirman', 'Rael Padamsee']", "['Kemal Sunal', 'Münir Özkul', 'Halit Akçatepe', 'Tarik Akan']", "['Vishal Mourya', 'Karan Kandhapan', 'Babushan Mohanty', 'Dipanwit Dashmohapatra', 'Manaswani Takri']"]
