In [80]:
import pandas as pd
import requests
from urllib.parse import quote
from joblib import Memory
import tiktoken

In [81]:
# Load the video metadata and derive each recording's length in hours
# from its "duration" column.
df = pd.read_json("video-metadata.json").assign(
    hours=lambda frame: pd.to_timedelta(frame["duration"]).dt.total_seconds() / 3600
)
# Show floats with two decimals in every rendered table.
pd.set_option("display.float_format", "{:.2f}".format)

In [82]:
# Two-level aggregation:
#   1. sum the recorded hours for every (date, body) pair;
#   2. average those per-date totals within each body.
# dropna=False keeps rows with a missing date/body as their own group.
# Only dates that actually appear for a body contribute to its mean.
stats = (
    df.groupby(["date", "body"], dropna=False)
    .agg(hours=("hours", "sum"))
    .groupby("body", dropna=False)
    .agg(hours_per_day=("hours", "mean"))
)
stats

Unnamed: 0_level_0,hours_per_day
body,Unnamed: 1_level_1
Economic and Social Council,12.18
Fifth Committee,1.32
First Committee,3.37
Fourth Committee,2.59
General Assembly,10.52
Second Committee,3.78
Security Council,4.9
Sixth Committee,3.88
Third Committee,4.54
Trusteeship Council,0.8


In [None]:
# Transcription price per hour of audio (presumably USD — TODO confirm
# against the current AssemblyAI price list).
ASSEMBLYAI_COST_PER_HOUR = 0.15

# Drop a "Total" row left over from a previous run of this cell; otherwise
# the sum below would count the old total as if it were another body.
stats = stats.drop(index="Total", errors="ignore")

stats["assemblyai_cost_per_day"] = stats["hours_per_day"] * ASSEMBLYAI_COST_PER_HOUR
# NOTE(review): hours_per_day is averaged over the dates present in the data
# for each body, so multiplying by 365 likely gives an upper bound — confirm.
stats["assemblyai_cost_per_year"] = stats["assemblyai_cost_per_day"] * 365
stats.loc["Total"] = stats.sum(numeric_only=True)
stats

Unnamed: 0_level_0,hours_per_day,assemblyai_cost_day,assemblyai_cost_year
body,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Economic and Social Council,12.18,1.83,667.12
Fifth Committee,1.32,0.2,72.29
First Committee,3.37,0.51,184.67
Fourth Committee,2.59,0.39,141.88
General Assembly,10.52,1.58,575.87
Second Committee,3.78,0.57,206.81
Security Council,4.9,0.74,268.52
Sixth Committee,3.88,0.58,212.51
Third Committee,4.54,0.68,248.66
Trusteeship Council,0.8,0.12,43.56


In [84]:
# Disk-backed memoization store: results of decorated functions are kept
# under ./.cache and reused on later calls with the same arguments.
memory = Memory(location=".cache")


@memory.cache
def fetch_text(id):
    """Download the transcript for one video and return it as plain text.

    Parameters
    ----------
    id : str
        Video identifier; it is percent-encoded (including any "/") before
        being appended to the endpoint URL.

    Returns
    -------
    str
        Every sentence of the transcript, one per line, in document order
        (statement -> paragraph -> sentence).

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not respond within the timeout.
    KeyError
        If the JSON payload lacks the expected transcript structure.
    """
    url = "https://webtv.unfck.org/json/" + quote(id, safe="")
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    transcript = response.json()["transcript"]["data"]
    return "\n".join(
        sent["text"]
        for statement in transcript
        for para in statement["paragraphs"]
        for sent in para["sentences"]
    )


In [85]:
# Tokenizer that tiktoken associates with the "gpt-5" model name; used to count tokens.
encoding = tiktoken.encoding_for_model("gpt-5")

In [86]:
# Sample: the first 10 videos that have a transcript.
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of df (which would trigger SettingWithCopyWarning).
transcripts = df[df["hasTranscript"]].iloc[:10].copy()
transcripts["text"] = transcripts["id"].apply(fetch_text)
# Tokenize each transcript and keep only the token count.
transcripts["n_tokens"] = transcripts["text"].apply(
    lambda text: len(encoding.encode(text))
)
# Pooled rate over the sample: total tokens divided by total hours.
tokens_per_hour = float(transcripts["n_tokens"].sum() / transcripts["hours"].sum())
tokens_per_hour


7851.193626066534

In [87]:
def cost(
    n_tokens,
    input_price_per_million=1.750,
    output_price_per_million=14,
    output_multiplier=2,
):
    """Estimate the price of one LLM pass over ``n_tokens`` tokens.

    The model is assumed to read ``n_tokens`` input tokens and to emit
    ``output_multiplier * n_tokens`` output tokens.

    Parameters
    ----------
    n_tokens : number
        Token count of the text being processed.
    input_price_per_million : number, default 1.750
        Price charged per one million input tokens.
    output_price_per_million : number, default 14
        Price charged per one million output tokens.
    output_multiplier : number, default 2
        Output tokens as a multiple of ``n_tokens``; the default of 2
        assumes substantive token use for reasoning.

    Returns
    -------
    number
        Estimated cost, in the same currency as the prices.
    """
    cost_input = n_tokens * input_price_per_million / 1_000_000
    cost_output = n_tokens * output_price_per_million / 1_000_000
    return cost_input + output_multiplier * cost_output

# Daily token volume per body, scaled from the sampled tokens-per-hour rate.
stats["n_tokens"] = stats["hours_per_day"].mul(tokens_per_hour)
# Doubled for the 2 stages: speaker identification + topic identification.
stats["openai_cost_per_day"] = stats["n_tokens"].map(cost).mul(2)
stats["openai_cost_per_year"] = stats["openai_cost_per_day"].mul(365)
stats

Unnamed: 0_level_0,hours_per_day,assemblyai_cost_day,assemblyai_cost_year,n_tokens,openai_cost_per_day,openai_cost_per_year
body,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Economic and Social Council,12.18,1.83,667.12,95665.72,5.69,2077.62
Fifth Committee,1.32,0.2,72.29,10366.18,0.62,225.13
First Committee,3.37,0.51,184.67,26482.19,1.58,575.13
Fourth Committee,2.59,0.39,141.88,20346.25,1.21,441.87
General Assembly,10.52,1.58,575.87,82579.72,4.91,1793.42
Second Committee,3.78,0.57,206.81,29656.79,1.76,644.07
Security Council,4.9,0.74,268.52,38506.36,2.29,836.26
Sixth Committee,3.88,0.58,212.51,30474.67,1.81,661.83
Third Committee,4.54,0.68,248.66,35658.71,2.12,774.42
Trusteeship Council,0.8,0.12,43.56,6246.06,0.37,135.65
