In [1]:
import pandas as pd
import requests
from urllib.parse import quote
from joblib import Memory
import tiktoken

Note: before running this notebook, generate `video-metadata.json` by running `npm run fetch-video-metadata`.

In [2]:
# Load the per-video metadata; `duration` must be parseable by pd.to_timedelta.
df = pd.read_json("video-metadata.json")
# Express each video's length in hours (seconds / 3600).
df["hours"] = pd.to_timedelta(df["duration"]).dt.total_seconds().div(3600)
# Render floats with two decimals in every table displayed below.
pd.set_option("display.float_format", "{:.2f}".format)

In [3]:
# Step 1: total recorded hours per (date, body) pair.
# Step 2: average those daily totals per body. The mean is taken only over
# dates on which the body has at least one video in the data.
stats = (
    df.groupby(["date", "body"], dropna=False)["hours"]
    .sum()
    .groupby(level="body", dropna=False)
    .mean()
    .to_frame("hours_per_day")
)
stats

Unnamed: 0_level_0,hours_per_day
body,Unnamed: 1_level_1
Economic and Social Council,12.18
Fifth Committee,1.32
First Committee,3.37
Fourth Committee,2.59
General Assembly,10.52
Second Committee,3.78
Security Council,4.9
Sixth Committee,3.88
Third Committee,4.54
Trusteeship Council,0.8


In [4]:
# AssemblyAI transcription cost at a flat rate of 0.15 per hour of audio
# (presumably USD -- confirm against current AssemblyAI pricing).
stats["assemblyai_cost_per_day"] = stats["hours_per_day"] * 0.15
# NOTE(review): hours_per_day is averaged over days that have recordings, so
# multiplying by 365 assumes every body meets every day -- likely an upper bound.
stats["assemblyai_cost_per_year"] = stats["assemblyai_cost_per_day"] * 365
# Drop any existing "Total" row before summing so that re-running this cell
# does not fold the previous total into the new one (which would double it).
stats.loc["Total"] = stats.drop(index="Total", errors="ignore").sum(numeric_only=True)
stats

Unnamed: 0_level_0,hours_per_day,assemblyai_cost_per_day,assemblyai_cost_per_year
body,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Economic and Social Council,12.18,1.83,667.12
Fifth Committee,1.32,0.2,72.29
First Committee,3.37,0.51,184.67
Fourth Committee,2.59,0.39,141.88
General Assembly,10.52,1.58,575.87
Second Committee,3.78,0.57,206.81
Security Council,4.9,0.74,268.52
Sixth Committee,3.88,0.58,212.51
Third Committee,4.54,0.68,248.66
Trusteeship Council,0.8,0.12,43.56


In [5]:
memory = Memory(location=".cache")


@memory.cache
def fetch_text(id):
    """Download a video's transcript and return it as plain text.

    The function is wrapped with joblib's ``memory.cache`` (cache directory
    ``.cache``), so repeated calls with the same ``id`` can be served from
    disk instead of hitting the network again.

    Args:
        id: Video identifier. It is fully URL-quoted (``safe=""``, so "/" is
            escaped too) and appended to the JSON endpoint URL.

    Returns:
        str: The ``text`` of every sentence of every paragraph of every
        statement in the transcript, joined with newline characters.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx status.
        requests.exceptions.Timeout: If the server does not respond within 30 seconds.
        KeyError: If the JSON payload lacks the expected transcript structure.
    """
    url = "https://webtv.unfck.org/json/" + quote(id, safe="")
    # Bound the wait (requests has no default timeout) and fail loudly on HTTP
    # errors instead of trying to parse an error response as a transcript.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    data = response.json()
    transcript = data["transcript"]["data"]
    return "\n".join(
        sent["text"]
        for statement in transcript
        for para in statement["paragraphs"]
        for sent in para["sentences"]
    )


In [6]:
# Tokenizer that tiktoken associates with the "gpt-5" model name; used below
# to count tokens in the transcript text.
encoding = tiktoken.encoding_for_model("gpt-5")

In [7]:
# Sample the first 30 videos that have a transcript. `.copy()` makes this an
# independent frame, so the column assignments below do not write into a slice
# of `df` (which triggers SettingWithCopyWarning in pandas).
transcripts = df[df["hasTranscript"]].iloc[:30].copy()
transcripts["text"] = transcripts["id"].apply(fetch_text)
# Token count per transcript: encode the text, then take the length.
transcripts["n_tokens"] = transcripts["text"].apply(encoding.encode).apply(len)
# Average token density over the sample: total tokens / total hours.
tokens_per_hour = float(transcripts["n_tokens"].sum() / transcripts["hours"].sum())
tokens_per_hour


7493.898366826542

In [8]:
def cost(hours):
    """Estimate the OpenAI API cost of processing ``hours`` of video.

    Per-hour token volumes come from `npm run usage-benchmark`; the per-million
    token prices are the gpt-5 / gpt-5-mini rates from
    https://developers.openai.com/api/docs/pricing. The "mini" figures below
    are the original "cheap" tier (presumably gpt-5-mini -- confirm).

    Args:
        hours: Number of hours of video (a scalar; any value supporting
            ``*``, ``-``, ``/`` and ``+`` with numbers works).

    Returns:
        The estimated cost, in the currency of the quoted prices.
    """

    def charge(tokens, price_per_million):
        # Prices are quoted per one million tokens.
        return tokens * price_per_million / 1_000_000

    # Main-model volumes: cached input is billed separately from the rest.
    main_cached = hours * 7000
    main_uncached = hours * 35_000 - main_cached
    main_output = hours * 30_000

    # Cheap-model volumes: no cached input is assumed.
    mini_cached = hours * 0
    mini_uncached = hours * 190_000 - mini_cached
    mini_output = hours * 70_000

    total = charge(main_uncached, 1.25)
    total = total + charge(main_cached, 0.125)
    total = total + charge(main_output, 10)
    total = total + charge(mini_uncached, 0.25)
    total = total + charge(mini_cached, 0.025)
    total = total + charge(mini_output, 2)
    return total


# Price each row's average daily hours with `cost`, then scale to a
# 365-day year.
stats["openai_cost_per_day"] = stats["hours_per_day"].map(cost)
stats["openai_cost_per_year"] = stats["openai_cost_per_day"].mul(365)
stats

Unnamed: 0_level_0,hours_per_day,assemblyai_cost_per_day,assemblyai_cost_per_year,openai_cost_per_day,openai_cost_per_year
body,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Economic and Social Council,12.18,1.83,667.12,6.38,2327.7
Fifth Committee,1.32,0.2,72.29,0.69,252.23
First Committee,3.37,0.51,184.67,1.77,644.35
Fourth Committee,2.59,0.39,141.88,1.36,495.06
General Assembly,10.52,1.58,575.87,5.5,2009.29
Second Committee,3.78,0.57,206.81,1.98,721.6
Security Council,4.9,0.74,268.52,2.57,936.92
Sixth Committee,3.88,0.58,212.51,2.03,741.5
Third Committee,4.54,0.68,248.66,2.38,867.63
Trusteeship Council,0.8,0.12,43.56,0.42,151.98
