# Ingesting the [TMDB](https://www.kaggle.com/datasets/tmdb/tmdb-movie-metadata) movie database into ApertureDB

This notebook works against an instance of ApertureDB, which can run either in the [cloud](https://cloud.aperturedata.io) or as [local Docker containers](https://docs.aperturedata.io/Setup/server/Local).

The dataset is hosted on Kaggle and is available through a Croissant (mlcroissant) link.


In [13]:
%pip install --quiet mlcroissant pandas dotenv


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


## Import all the modules needed

In [14]:
import json
from typing import List

import mlcroissant as mlc
import pandas as pd

from tqdm import tqdm

from aperturedb.Subscriptable import Subscriptable
from aperturedb.ParallelLoader import ParallelLoader
from aperturedb.Query import QueryBuilder
from aperturedb.CommonLibrary import (
    create_connector
)
from aperturedb.Utils import Utils


## Load croissant records into dataframes

In [15]:

# Location of the Croissant JSON-LD description of the TMDB dataset on Kaggle
CROISSANT_URL = 'https://www.kaggle.com/datasets/tmdb/tmdb-movie-metadata/croissant/download'

# Load the dataset description from that URL
croissant_dataset = mlc.Dataset(CROISSANT_URL)

# List the record sets the dataset exposes
record_sets = croissant_dataset.metadata.record_sets
print(f"{record_sets=}")

def deserialize_record(record):
    """
    Convert a raw Croissant field value into a native Python object.

    Bytes are decoded as UTF-8. Text that holds valid JSON (for example the
    cast column, which is a JSON array) is parsed; any other text is returned
    as the decoded string.

    Args:
        record: A single field value: bytes, str, or any other type.

    Returns:
        The parsed JSON value when the text is valid JSON, the decoded text
        when it is not, or the value unchanged when it is neither bytes nor str.
    """
    deserialized = record.decode('utf-8') if isinstance(record, bytes) else record
    if isinstance(deserialized, str):
        try:
            deserialized = json.loads(deserialized)
        except ValueError:
            # json.JSONDecodeError is a ValueError: the text is simply not JSON
            # (titles, overviews, ...), so keep it as a plain string. Catching only
            # this lets KeyboardInterrupt and genuine errors propagate.
            pass
    return deserialized

# Fetch the records and put them in DataFrames. Downloading the archive and
# parsing its files is managed by the croissant library.
# Croissant records map naturally onto DataFrames. TMDB has 2 record sets:
# the first holds the credits (cast and crew), the second holds the movies.
# The association between the two is the movie id.
record_set_df_0 = pd.DataFrame(croissant_dataset.records(record_set=record_sets[0].uuid))
record_set_df_1 = pd.DataFrame(croissant_dataset.records(record_set=record_sets[1].uuid))

# Display the first few records from each record set, reusing the DataFrames
# loaded above instead of reading every record set a second time.
PREVIEW_ROWS = 5
for record_set_df in (record_set_df_0, record_set_df_1):
    for _, row in record_set_df.head(PREVIEW_ROWS).iterrows():
        preview = {column: deserialize_record(row[column]) for column in record_set_df.columns}
        print(json.dumps(preview, indent=2, default=str))


  -  [Metadata(TMDB 5000 Movie Dataset)] Property "http://mlcommons.org/croissant/citeAs" is recommended, but does not exist.


record_sets=[RecordSet(uuid="tmdb_5000_credits.csv"), RecordSet(uuid="tmdb_5000_movies.csv")]
{
  "tmdb_5000_credits.csv/movie_id": 19995,
  "tmdb_5000_credits.csv/title": "Avatar",
  "tmdb_5000_credits.csv/cast": [
    {
      "cast_id": 242,
      "character": "Jake Sully",
      "credit_id": "5602a8a7c3a3685532001c9a",
      "gender": 2,
      "id": 65731,
      "name": "Sam Worthington",
      "order": 0
    },
    {
      "cast_id": 3,
      "character": "Neytiri",
      "credit_id": "52fe48009251416c750ac9cb",
      "gender": 1,
      "id": 8691,
      "name": "Zoe Saldana",
      "order": 1
    },
    {
      "cast_id": 25,
      "character": "Dr. Grace Augustine",
      "credit_id": "52fe48009251416c750aca39",
      "gender": 1,
      "id": 10205,
      "name": "Sigourney Weaver",
      "order": 2
    },
    {
      "cast_id": 4,
      "character": "Col. Quaritch",
      "credit_id": "52fe48009251416c750ac9cf",
      "gender": 2,
      "id": 32747,
      "name": "Stephen Lang",

In [16]:
# Open a connection to the ApertureDB instance and wrap it in the Utils helper
client = create_connector()
utils = Utils(client)

# WARNING: this removes every object in the connected database so that the
# ingestion below starts from an empty instance. Do not run against data you need.
utils.remove_all_objects()
utils.summary()


Database: garfield
Version: 0.18.1
Status:  0
Info:    OK
------------------ Entities -----------------
Total entities types:    0
---------------- Connections ----------------
Total connections types: 0
------------------ Totals -------------------
Total nodes: 0
Total edges: 0


In [17]:

def _credit_commands(person: dict, movie_ref: int, ref: int, connection_class: str,
                     connection_properties: dict) -> List[dict]:
    """Build the PROFESSIONAL command and its connection to the movie for one cast/crew credit."""
    professional_parameters = dict(_ref=ref, properties=dict(
        id=person["id"],
        name=person["name"],
        gender=person["gender"]
    ), if_not_found=dict(id=["==", person["id"]]))
    professional = QueryBuilder.add_command("PROFESSIONAL", professional_parameters)

    # "class" is a Python keyword, so it cannot be passed as a dict() keyword argument.
    # It is set before the command is built so the result does not depend on
    # add_command keeping a reference to this dict.
    connection_parameters = dict(src=movie_ref, dst=ref, properties=connection_properties)
    connection_parameters["class"] = connection_class
    connection = QueryBuilder.add_command("_Connection", connection_parameters)
    return [professional, connection]


def make_movie(j: dict) -> List[dict]:
    """
    Create the commands that add one MOVIE, its PROFESSIONAL objects, and the
    CAST / CREW connections between them.
    The movie is the root object (_ref 1); every cast and crew credit gets the next
    free _ref and is connected back to the movie.
    Each call to this function creates one transaction to be executed in the database.

    Args:
        j (dict): a merged record from the dataset. The following keys are read:
            "tmdb_5000_credits.csv/movie_id", "tmdb_5000_credits.csv/title",
            "tmdb_5000_credits.csv/cast" (iterable of dicts with id, name, gender,
            character, cast_id), "tmdb_5000_credits.csv/crew" (iterable of dicts with
            id, name, gender, department, job, credit_id),
            "tmdb_5000_movies.csv/budget", "tmdb_5000_movies.csv/overview",
            "tmdb_5000_movies.csv/popularity".

    Returns:
        List[dict]: A list of commands to be executed in the database: the movie
        first, then a (PROFESSIONAL, connection) pair per cast credit followed by
        a pair per crew credit.

    Raises:
        KeyError: if the record or one of its credits lacks a key listed above.
    """
    movie_ref = 1
    movie_id = j["tmdb_5000_credits.csv/movie_id"]
    # The movie "id" is stored as a string and reused as the if_not_found constraint.
    movie_parameters = dict(_ref=movie_ref, properties=dict(
        id=str(movie_id),
        movie_id=movie_id,
        title=str(j["tmdb_5000_credits.csv/title"]),
        budget=j["tmdb_5000_movies.csv/budget"],
        overview=str(j["tmdb_5000_movies.csv/overview"]),
        popularity=j["tmdb_5000_movies.csv/popularity"]
    ), if_not_found=dict(id=["==", str(movie_id)]))
    transaction = [QueryBuilder.add_command("MOVIE", movie_parameters)]

    # References must be unique within a transaction, so keep counting across
    # both the cast and the crew lists.
    next_ref = movie_ref + 1
    for cast_info in j["tmdb_5000_credits.csv/cast"]:
        transaction.extend(_credit_commands(
            cast_info, movie_ref, next_ref, "CAST",
            dict(character=cast_info["character"], cast_id=cast_info["cast_id"])))
        next_ref += 1

    for crew_info in j["tmdb_5000_credits.csv/crew"]:
        transaction.extend(_credit_commands(
            crew_info, movie_ref, next_ref, "CREW",
            dict(department=crew_info["department"],
                 job=crew_info["job"],
                 credit_id=crew_info["credit_id"])))
        next_ref += 1

    return transaction


In [18]:

# Merge the two DataFrames on the movie id: credits (left) joined with the
# movie metadata (right).
records = record_set_df_0.merge(
    record_set_df_1,
    right_on="tmdb_5000_movies.csv/id",
    left_on="tmdb_5000_credits.csv/movie_id")

# The column labels are the same for every row, so look them up once.
columns = records.columns

# Build one transaction (list of commands) per merged movie record.
collection = []
# iterrows() is a generator without a length; passing total lets tqdm draw a real progress bar.
for _, row in tqdm(records.iterrows(), total=len(records)):
    j = {c: deserialize_record(row[c]) for c in columns}
    collection.append(make_movie(j))


class MovieParser(Subscriptable):
    """
    Thin wrapper that exposes the pre-built movie transactions through the
    Subscriptable interface, so they can be handed to the loader.
    """

    def __init__(self, collection):
        # Sequence with one list of commands per movie
        self.collection = collection

    def __len__(self):
        """Number of transactions available."""
        return len(self.collection)

    def getitem(self, key):
        """Return the transaction at position key together with an empty list."""
        # Second element is presumably the blob list expected by the loader;
        # these commands carry no binary data - TODO confirm against aperturedb docs.
        return self.collection[key], []


# Create indexes on the properties used to look objects up, before loading.
utils.create_entity_index("MOVIE", "id")
utils.create_entity_index("PROFESSIONAL", "id")
utils.create_connection_index("CAST", "cast_id")
# CREW connections are written with department, job and credit_id; there is no
# crew_id property, so the index goes on credit_id.
utils.create_connection_index("CREW", "credit_id")

parser = MovieParser(collection)
loader = ParallelLoader(client)
# Treat status 2 as success as well as 0. Presumably 2 is what the server returns
# when an if_not_found add matches an existing object - confirm against the ApertureDB docs.
ParallelLoader.setSuccessStatus([0, 2])
loader.ingest(parser, batchsize=100, numthreads=8, stats=True)


4803it [00:03, 1215.36it/s]
Progress: 100%|██████████| 4.80k/4.80k [00:05<00:00, 905items/s]  

Total time (s): 5.308330059051514
Total queries executed: 55
Avg Query time (s): 0.5149770953438498
Query time std: 0.48228668156863974
Avg Query Throughput (q/s): 15.534671487978326
Overall insertion throughput (element/s): 904.8043257615738
Total inserted elements: 4803
Total successful commands: 476479



