In [1]:
# Cell 1 — Setup
from pathlib import Path
import json
import pandas as pd
import numpy as np

# 🔧 Change this to your CSV path (the input file read by pd.read_csv in Cell 2).
# A relative path is resolved against the notebook's current working directory.
csv_path = Path("videos.csv")  # e.g., "my_youtube.csv"
# JSON array file that Cell 4 loads (if it exists), merges the cleaned rows into, and rewrites.
json_path = Path("youtube-scrape.json")

In [2]:
# Cell 2 — Load CSV (treat everything as string to avoid ID corruption)
# We'll still parse published_date separately if needed.

# Read all columns as text so IDs and leading zeros survive, and switch off
# pandas' default NA parsing: with the defaults, literal cell values such as
# "NA", "null" or "None" are silently replaced by NaN even under dtype=str.
# With keep_default_na=False, empty cells arrive as "" instead of NaN;
# clean_row in Cell 3 turns "" into None, so missing values still end up as null.
df = pd.read_csv(csv_path, dtype=str, keep_default_na=False)

# Normalize column names (strip surrounding spaces if any)
df = df.rename(columns=str.strip)

# Make sure these columns exist (optional; remove/adjust if your CSV differs).
# This is only a warning: later cells operate on whatever columns are present.
expected_cols = [
    "source","channel_name","channel_id","video_id","title","url","description",
    "published_date","keyword","trust_score","query","view_count","like_count",
    "duration","educational_score"
]
missing = [c for c in expected_cols if c not in df.columns]
if missing:
    print("Warning: Missing columns in CSV:", missing)

df.head()

Unnamed: 0,source,channel_name,channel_id,video_id,title,url,description,published_date,keyword,trust_score,query,view_count,like_count,duration,educational_score
0,trusted_channel,TED-Ed,UCsooa4yRKGN_zEE8iknghZA,2njn71TqkjA,What Earth in 2050 could look like - Shannon O...,https://www.youtube.com/watch?v=2njn71TqkjA,What could our future world look like if we co...,2024-03-28T15:01:38Z,climate change,0.9,,,,,
1,trusted_channel,TED-Ed,UCsooa4yRKGN_zEE8iknghZA,ZCKRjP_DMII,Can wildlife adapt to climate change? - Erin E...,https://www.youtube.com/watch?v=ZCKRjP_DMII,View full lesson: http://ed.ted.com/lessons/ca...,2016-03-03T16:17:52Z,climate change,0.9,,,,,
2,trusted_channel,TED-Ed,UCsooa4yRKGN_zEE8iknghZA,JYZpxRy5Mfg,Underwater farms vs. climate change - Ayana El...,https://www.youtube.com/watch?v=JYZpxRy5Mfg,Dive into the world of aquaculture and see how...,2019-06-13T15:01:11Z,climate change,0.9,,,,,
3,trusted_channel,TED-Ed,UCsooa4yRKGN_zEE8iknghZA,3hxE7Af98AI,What if there were 1 trillion more trees? - Je...,https://www.youtube.com/watch?v=3hxE7Af98AI,How can trees help in the fight against climat...,2020-10-27T15:00:30Z,climate change,0.9,,,,,
4,trusted_channel,TED-Ed,UCsooa4yRKGN_zEE8iknghZA,_vDZmVXtA7k,What the oil industry doesn’t want you to know...,https://www.youtube.com/watch?v=_vDZmVXtA7k,Uncover the oil industry's decades-long campai...,2024-07-25T15:00:40Z,climate change,0.9,,,,,


In [3]:
# Cell 3 — Clean & convert NaNs to None, and coerce numeric fields if present

def coerce_numeric(val):
    """Convert a raw cell value to a number when its text is numeric.

    Parameters
    ----------
    val : Any
        Raw value; it is stringified and stripped before parsing.

    Returns
    -------
    int | float | str | None
        * ``None`` for ``None``, a float NaN, empty text, or the text "nan"
          (case-insensitive).
        * ``int`` when the text is a whole number, including signed values
          such as "-5" or "+7".
        * ``float`` for any other finite number (e.g. "0.9").
        * the stripped text itself when it is not a finite number
          (e.g. "PT10M8S", "inf").
    """
    if val is None or (isinstance(val, float) and np.isnan(val)):
        return None
    text = str(val).strip()
    if text == "" or text.lower() == "nan":
        return None

    # Prefer int: trying int() directly also accepts a leading sign, which a
    # plain str.isdigit() check rejects (it would turn "-5" into -5.0).
    try:
        return int(text)
    except ValueError:
        pass

    try:
        number = float(text)
    except ValueError:
        return text  # leave as string if not numeric

    # "inf"/"infinity" parse as floats but json.dump would emit them as the
    # non-standard tokens Infinity/-Infinity; keep the original text instead.
    if not np.isfinite(number):
        return text
    return number

def clean_row(row: dict) -> dict:
    """Return a cleaned copy of one record.

    Every value is stringified and stripped. ``None``, empty text and the
    text "nan" (case-insensitive) become ``None``. The known numeric columns,
    when present, are then passed through ``coerce_numeric``. Key order of
    the input is preserved.
    """
    numeric_fields = ("trust_score", "view_count", "like_count", "duration", "educational_score")

    def _normalize(value):
        # None stays None; blank / "nan" text is treated as missing.
        if value is None:
            return None
        text = str(value).strip()
        return None if text.lower() in ("", "nan") else text

    cleaned = {name: _normalize(value) for name, value in row.items()}

    # Optional: coerce known numeric fields
    for field in numeric_fields:
        if field in cleaned:
            cleaned[field] = coerce_numeric(cleaned[field])

    return cleaned

# One cleaned dict per CSV row, in the DataFrame's row order.
records = list(map(clean_row, df.to_dict(orient="records")))
len(records)

131

In [4]:
# Cell 4 — Append to youtube-scrape.json (create if not exists)
# This writes an array of JSON objects. If the file exists, it loads, appends, de-duplicates by video_id, and saves.

# Load whatever is already stored at json_path. Only a top-level JSON array is
# usable; anything else is reported and we start from an empty list.
existing = []
if json_path.exists():
    problem = None
    with json_path.open("r", encoding="utf-8") as f:
        try:
            data = json.load(f)
        except json.JSONDecodeError:
            problem = "Warning: Existing JSON invalid. Starting fresh array."
        else:
            if isinstance(data, list):
                existing = data
            else:
                problem = "Warning: Existing JSON is not a list. Starting fresh array."

    if problem is not None:
        print(problem)
        # The save step rewrites json_path, which would destroy the unusable
        # file for good. Move it aside first (after the handle above is
        # closed) so its contents can still be inspected or repaired.
        # An older backup with the same name is overwritten.
        backup_path = json_path.with_name(json_path.name + ".bak")
        json_path.replace(backup_path)
        print(f"Previous file preserved as {backup_path}")

# Merge: previously stored records first, then the rows cleaned from the CSV
combined = existing + records

# De-duplicate by `video_id` if present
def dedup_by_key(items, key="video_id"):
    """Drop later items whose ``key`` value repeats an earlier one.

    Parameters
    ----------
    items : iterable
        Records to filter, normally dicts. Order is preserved.
    key : str
        Field whose value identifies a record.

    Returns
    -------
    list
        The first item for each distinct key value. Items that cannot be
        de-duplicated are always kept: those with no ``key`` (or a ``None``
        value), items that are not dict-like (``None``, strings, lists, ...),
        and items whose key value is unhashable.
    """
    seen = set()
    unique = []
    for item in items:
        # Data loaded from an arbitrary JSON array may contain non-dict
        # entries; only objects exposing .get() can carry the key.
        getter = getattr(item, "get", None)
        ident = getter(key) if callable(getter) else None
        if ident is not None:
            try:
                is_duplicate = ident in seen
            except TypeError:
                # Unhashable key value (e.g. a list): cannot track it, keep the item.
                unique.append(item)
                continue
            if is_duplicate:
                continue
            seen.add(ident)
        unique.append(item)
    return unique

combined = dedup_by_key(combined, key="video_id")

# Save. Serialise into a sibling temp file first and only then swap it into
# place: opening json_path with "w" truncates it immediately, so an error
# raised inside json.dump would otherwise leave the existing file empty or
# half-written.
tmp_path = json_path.with_name(json_path.name + ".tmp")
with tmp_path.open("w", encoding="utf-8") as f:
    json.dump(combined, f, ensure_ascii=False, indent=2)
tmp_path.replace(json_path)

print(f"Wrote {len(combined)} total records to {json_path}")

Wrote 131 total records to youtube-scrape.json


In [5]:
# Cell 5 — Quick peek
# Preview the final five cleaned records built in Cell 3 (optional)
pd.DataFrame(records).iloc[-5:]

Unnamed: 0,source,channel_name,channel_id,video_id,title,url,description,published_date,keyword,trust_score,query,view_count,like_count,duration,educational_score
126,search,TED,,MHZMQLDr-OA,A Controversial Play — and What It Taught Me A...,https://www.youtube.com/watch?v=MHZMQLDr-OA,When playwright David Finnigan launched a new ...,2024-09-09T11:00:18Z,,,global warming science documentary,729871.0,20838.0,PT10M8S,1.0
127,search,Hot Mess,,yy3VK6OYBbU,The Basics of Climate Science | Essentials of...,https://www.youtube.com/watch?v=yy3VK6OYBbU,PBS Member Stations rely on viewers like you. ...,2020-07-06T18:52:38Z,,,global warming science documentary,62141.0,1649.0,PT16M,0.7
128,search,Verge Science,,6tesHVSZJOg,This is what sea level rise will do to coastal...,https://www.youtube.com/watch?v=6tesHVSZJOg,Sea level rise is already redrawing coastlines...,2019-04-23T14:00:00Z,,,sea level rise documentary educational,957513.0,16318.0,PT7M19S,0.8
129,search,TED-Ed,,p4pWafuvdrY,How do ocean currents work? - Jennifer Verduin,https://www.youtube.com/watch?v=p4pWafuvdrY,Dive into the science of ocean currents (inclu...,2019-01-31T16:01:10Z,,,sea level rise documentary educational,3065326.0,40181.0,PT4M34S,0.7
130,search,TED-Ed,,25LW_PG2ZuI,Why isn’t the Netherlands underwater? - Stefan Al,https://www.youtube.com/watch?v=25LW_PG2ZuI,Dig into the incredible engineering of the Net...,2020-03-24T15:01:27Z,,,sea level rise documentary educational,3611215.0,98979.0,PT5M24S,0.8
