In [1]:
from pathlib import Path
import sqlite3

print("CWD:", Path.cwd())
ASSETS = Path("/content/assets")
DB     = Path("/content/music.db") if Path("/content/music.db").exists() else Path("music.db")
print("ASSETS:", ASSETS.resolve(), "exists:", ASSETS.exists())
print("DB:", DB.resolve(), "exists:", DB.exists())

with sqlite3.connect(DB) as con:
    cols = [r[1] for r in con.execute("PRAGMA table_info(tracks);")]
    print("tracks columns:", cols)
    n = con.execute("SELECT COUNT(*) FROM tracks").fetchone()[0] if cols else 0
    print("current track count:", n)


CWD: /content
ASSETS: /content/assets exists: True
DB: /content/music.db exists: True
tracks columns: ['id', 'url', 'local_path', 'title', 'artist', 'album', 'duration', 'bitrate', 'sample_rate', 'channels', 'filesize', 'ext', 'added_at', 'license_id', 'genre']
current track count: 0


In [2]:
import requests, time

def discover_first_jobs(query, max_releases=2):
    session = requests.Session()
    adv = "https://archive.org/advancedsearch.php"
    jobs = []
    page=1
    while len(jobs) < max_releases:
        r = session.get(adv, params={
            "q": query, "fl[]":["identifier","title","creator","licenseurl"],
            "output":"json", "rows":50, "page":page
        }, timeout=25)
        r.raise_for_status()
        docs = r.json().get("response",{}).get("docs",[])
        if not docs: break
        for d in docs[:max_releases-len(jobs)]:
            ident  = d.get("identifier")
            title  = d.get("title") or ident
            artist = d.get("creator") or "Unknown"
            licurl = d.get("licenseurl") or "https://archive.org/about/terms.php"
            md = session.get(f"https://archive.org/metadata/{ident}", timeout=25).json()
            files = md.get("files", [])
            urls = [f"https://archive.org/download/{ident}/{f['name']}"
                    for f in files if f.get("name","").lower().endswith((".mp3",".ogg",".m4a",".aac",".wav",".flac"))]
            jobs.append({"ident":ident,"title":title,"artist":artist,"lic":licurl,"urls":urls[:5]})
            time.sleep(0.2)
        page += 1
    return jobs

probe = discover_first_jobs('collection:netlabels AND mediatype:audio AND subject:(ambient)', max_releases=2)
probe


[{'ident': 'gv442',
  'title': '[GV-442] Astral & Shit - Aqua Marina',
  'artist': 'Astral & Shit',
  'lic': 'http://creativecommons.org/licenses/by-nc-nd/3.0/',
  'urls': ['https://archive.org/download/gv442/Astral & Shit - aqua marina - 01 taiga.mp3',
   'https://archive.org/download/gv442/Astral & Shit - aqua marina - 01 taiga.ogg',
   'https://archive.org/download/gv442/Astral & Shit - aqua marina - 02 tail.mp3',
   'https://archive.org/download/gv442/Astral & Shit - aqua marina - 02 tail.ogg',
   'https://archive.org/download/gv442/Astral & Shit - aqua marina - 03 jellyfish.mp3']},
 {'ident': 'polygon009',
  'title': 'Boy Makes Music - Untitled (Live at The Halifax Experimental Music Festival 9) [polygon009]',
  'artist': 'Boy Makes Music',
  'lic': 'http://creativecommons.org/licenses/by-nc-sa/2.0/',
  'urls': ['https://archive.org/download/polygon009/boy_makes_music-untitled-live_at_hemf-9.mp3']}]

In [4]:
import sqlite3, pathlib
DB = pathlib.Path("/content/music.db") if pathlib.Path("/content/music.db").exists() else pathlib.Path("music.db")
with sqlite3.connect(DB) as con:
    cols = {r[1] for r in con.execute("PRAGMA table_info(tracks);")}
    if "genre" not in cols:
        con.execute("ALTER TABLE tracks ADD COLUMN genre TEXT;")
        print("✅ added genre")
    else:
        print("✔ genre present")


✔ genre present


In [7]:
import requests, os, time, concurrent.futures, sqlite3, re
from urllib.parse import urlparse
from pathlib import Path

ASSETS = Path("/content/assets")
ASSETS.mkdir(parents=True, exist_ok=True)
DB = Path("/content/music.db")

def _safe(name:str)->str:
    return re.sub(r"[^0-9A-Za-z._ -]+","_",name).replace(" ","_")

def read_audio_metadata(path):
    # minimal placeholder (extend later with mutagen)
    return {"title": os.path.basename(path), "artist": "Unknown", "album": "", "duration": 0}

def upsert_track_full(meta, url="", license_code="", license_url="", genre_tag=""):
    with sqlite3.connect(DB) as con:
        c = con.cursor()
        c.execute("""CREATE TABLE IF NOT EXISTS tracks (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            title TEXT, artist TEXT, album TEXT,
            duration REAL, url TEXT, local_path TEXT,
            license_code TEXT, license_url TEXT,
            genre TEXT
        );""")
        c.execute("""INSERT INTO tracks
                     (title, artist, album, duration, url, local_path, license_code, license_url, genre)
                     VALUES (?,?,?,?,?,?,?,?,?)""",
                  (meta["title"], meta["artist"], meta["album"], meta["duration"],
                   url, str(meta.get("path", "")), license_code, license_url, genre_tag))
        con.commit()

def fast_import_from_internet_archive(query, max_items=50, max_workers=8,
                                      max_per_identifier=3, genre_tag="mixed"):
    session = requests.Session()
    search_url = "https://archive.org/advancedsearch.php"
    rows = []
    page = 1
    while len(rows) < max_items:
        r = session.get(search_url, params={
            "q": query,
            "fl[]":["identifier","title","creator","licenseurl"],
            "output":"json", "rows":50, "page":page
        }, timeout=25)
        r.raise_for_status()
        docs = r.json().get("response",{}).get("docs",[])
        if not docs: break
        rows.extend(docs)
        if len(docs) < 50: break
        page += 1
        time.sleep(0.5)
    print(f"Found {len(rows)} results")

    def download_release(doc):
        ident  = doc["identifier"]
        title  = doc.get("title") or ident
        artist = doc.get("creator") or "Unknown"
        licurl = doc.get("licenseurl") or "https://archive.org/about/terms.php"
        md = session.get(f"https://archive.org/metadata/{ident}", timeout=25).json()
        files = md.get("files", [])
        count = 0
        for f in files:
            nm = f.get("name","").lower()
            if nm.endswith((".mp3",".ogg",".m4a",".aac",".wav",".flac")):
                url = f"https://archive.org/download/{ident}/{nm}"
                local = ASSETS / _safe(f"{artist}-{title}-{nm}")
                try:
                    with requests.get(url, stream=True, timeout=30) as rr:
                        rr.raise_for_status()
                        with open(local,"wb") as ff:
                            for chunk in rr.iter_content(1024*64):
                                if chunk: ff.write(chunk)
                    meta = read_audio_metadata(local)
                    meta["path"]=local
                    upsert_track_full(meta,url=url,license_code="CC",license_url=licurl,genre_tag=genre_tag)
                    count += 1
                except Exception as e:
                    print("skip:", nm, "-", e)
                if count>=max_per_identifier: break
        return count

    total=0
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as ex:
        for n in ex.map(download_release, rows[:max_items]):
            total+=n
    print("✅ downloaded & indexed", total, "tracks")
    return total


In [9]:
# --- FIXED, FAST, ROBUST IA IMPORTER (drop-in) ---
!pip -q install mutagen tqdm

import os, re, time, sqlite3, requests
from pathlib import Path
from urllib.parse import urlparse
from requests.adapters import HTTPAdapter, Retry
from mutagen import File as MF
from tqdm.auto import tqdm

ROOT   = Path("/content")
ASSETS = ROOT / "assets"; ASSETS.mkdir(parents=True, exist_ok=True)
DB     = ROOT / "music.db"

# --- DB schema (safe; includes genre) ---
def ensure_schema():
    with sqlite3.connect(DB) as con:
        con.execute("""
        CREATE TABLE IF NOT EXISTS tracks(
          id INTEGER PRIMARY KEY,
          title TEXT,
          artist TEXT,
          album TEXT,
          duration REAL,
          url TEXT,
          local_path TEXT,
          license_code TEXT,
          license_url TEXT,
          genre TEXT
        )""")
        # ensure genre exists even if table was older
        cols = [r[1] for r in con.execute("PRAGMA table_info(tracks);")]
        if "genre" not in cols:
            con.execute("ALTER TABLE tracks ADD COLUMN genre TEXT;")
    print("✅ schema ok at", DB)

def upsert_track(meta):
    with sqlite3.connect(DB) as con:
        # If this local_path exists, update; else insert.
        row = con.execute("SELECT id FROM tracks WHERE local_path=?", (meta["local_path"],)).fetchone()
        if row:
            con.execute("""UPDATE tracks SET title=?,artist=?,album=?,duration=?,url=?,license_code=?,license_url=?,genre=COALESCE(?,genre)
                           WHERE local_path=?""",
                        (meta["title"], meta["artist"], meta["album"], float(meta["duration"] or 0),
                         meta["url"], meta["license_code"], meta["license_url"], meta["genre"], meta["local_path"]))
        else:
            con.execute("""INSERT INTO tracks(title,artist,album,duration,url,local_path,license_code,license_url,genre)
                           VALUES(?,?,?,?,?,?,?,?,?)""",
                        (meta["title"], meta["artist"], meta["album"], float(meta["duration"] or 0),
                         meta["url"], meta["local_path"], meta["license_code"], meta["license_url"], meta["genre"]))
        con.commit()

def _safe(name:str)->str:
    return re.sub(r"[^0-9A-Za-z._ -]+","_",name).strip().replace(" ","_")

def _ext_from_headers(resp, fallback=".mp3"):
    ct = (resp.headers.get("Content-Type","") or "").lower()
    if "audio/flac" in ct: return ".flac"
    if "audio/wav"  in ct or "x-wav" in ct: return ".wav"
    if "audio/ogg"  in ct: return ".ogg"
    if "audio/mpeg" in ct: return ".mp3"
    if "audio/mp4"  in ct or "aac" in ct or "m4a" in ct: return ".m4a"
    return fallback

def read_audio_metadata(path: Path):
    info = {"title": path.stem, "artist":"Unknown", "album":"Unknown", "duration":0}
    try:
        mfe = MF(path, easy=True)
        if mfe:
            info["title"]  = (mfe.get("title",  [info["title"]]) or [info["title"]])[0]
            info["artist"] = (mfe.get("artist", ["Unknown"])     or ["Unknown"])[0]
            info["album"]  = (mfe.get("album",  ["Unknown"])     or ["Unknown"])[0]
        mf = MF(path)
        if mf and getattr(mf, "info", None):
            info["duration"] = float(getattr(mf.info, "length", 0) or 0)
    except Exception:
        pass
    return info

# Shared requests Session with retries/backoff and UA
def make_session():
    s = requests.Session()
    retries = Retry(
        total=5, backoff_factor=0.6,
        status_forcelist=[429,500,502,503,504],
        allowed_methods=["GET","HEAD"]
    )
    s.mount("https://", HTTPAdapter(max_retries=retries))
    s.headers.update({"User-Agent": "Mozilla/5.0 (TheVoidSpeaks/1.0)"})
    return s

# Keep filename CASE for URL; only use lower() for extension check
def pick_audio_files(files, max_per_identifier=3):
    picked = []
    for f in files:
        name_orig = f.get("name","")         # keep original case
        name_lc   = name_orig.lower()
        if name_lc.endswith((".mp3",".ogg",".m4a",".aac",".wav",".flac")):
            picked.append(name_orig)
        if len(picked) >= max_per_identifier:
            break
    return picked

def fast_import_from_internet_archive(
    query="collection:netlabels AND mediatype:audio",
    max_items=100,
    max_workers=8,
    max_per_identifier=3,
    genre_tag=None,
    polite_delay=0.2
):
    ensure_schema()
    s = make_session()

    # 1) discover identifiers/releases
    jobs = []
    page = 1
    adv = "https://archive.org/advancedsearch.php"
    with tqdm(desc="Discovering", unit="rel") as pbar:
        while len(jobs) < max_items:
            r = s.get(adv, params={
                "q": query,
                "fl[]": ["identifier","title","creator","licenseurl"],
                "output": "json",
                "rows": 50,
                "page": page
            }, timeout=25)
            r.raise_for_status()
            docs = r.json().get("response",{}).get("docs",[])
            if not docs:
                break
            for d in docs:
                if len(jobs) >= max_items: break
                ident  = d.get("identifier")
                title  = d.get("title") or ident
                artist = d.get("creator") or "Unknown"
                licurl = d.get("licenseurl") or "https://archive.org/about/terms.php"
                md = s.get(f"https://archive.org/metadata/{ident}", timeout=25).json()
                files = md.get("files", [])
                picks = pick_audio_files(files, max_per_identifier=max_per_identifier)
                for fname in picks:
                    url = f"https://archive.org/download/{ident}/{fname}"  # ✅ original case
                    jobs.append({
                        "url": url, "title": title, "artist": artist, "album": ident,
                        "license_code": "CC/PD", "license_url": licurl, "genre": genre_tag
                    })
                time.sleep(polite_delay)
            page += 1
            pbar.update(1)

    if not jobs:
        print("No candidates found for query.")
        return 0

    # 2) download sequentially (safer for IA; still fast enough)
    ok = 0; fail = 0
    for j in tqdm(jobs, desc="Downloading", unit="trk"):
        try:
            # decide filename from artist-title + server ext
            ext = os.path.splitext(urlparse(j["url"]).path)[1] or ".mp3"
            fname = _safe(f"{j['artist']}-{j['title']}{ext}")
            out = ASSETS / fname

            if out.exists() and out.stat().st_size > 0:
                meta = read_audio_metadata(out)
                upsert_track({
                    "title":  meta["title"] or j["title"],
                    "artist": meta["artist"] or j["artist"],
                    "album":  meta["album"]  or j["album"],
                    "duration": meta["duration"] or 0,
                    "url": j["url"],
                    "local_path": str(out),
                    "license_code": j["license_code"],
                    "license_url": j["license_url"],
                    "genre": j["genre"]
                })
                ok += 1
                continue

            with s.get(j["url"], stream=True, timeout=45) as r:
                r.raise_for_status()
                # adjust ext if Content-Type says different
                real_ext = _ext_from_headers(r, ext or ".mp3")
                if real_ext and real_ext != ext:
                    ext = real_ext
                    fname = _safe(f"{j['artist']}-{j['title']}{ext}")
                    out = ASSETS / fname
                with open(out, "wb") as f:
                    for chunk in r.iter_content(1024*64):
                        if chunk: f.write(chunk)

            meta = read_audio_metadata(out)
            upsert_track({
                "title":  meta["title"] or j["title"],
                "artist": meta["artist"] or j["artist"],
                "album":  meta["album"]  or j["album"],
                "duration": meta["duration"] or 0,
                "url": j["url"],
                "local_path": str(out),
                "license_code": j["license_code"],
                "license_url": j["license_url"],
                "genre": j["genre"]
            })
            ok += 1
        except Exception as e:
            print("skip:", j["url"], "-", type(e).__name__, e)
            fail += 1

    print(f"Done: {ok} ok, {fail} skipped/failed")
    return ok


In [11]:
import sqlite3, pathlib
DB = pathlib.Path("/content/music.db")

with sqlite3.connect(DB) as con:
    cur = con.cursor()
    cur.execute("""
    CREATE TABLE IF NOT EXISTS tracks(
      id INTEGER PRIMARY KEY,
      title TEXT,
      artist TEXT,
      album TEXT,
      duration REAL,
      url TEXT,
      local_path TEXT,
      license_code TEXT,
      license_url TEXT,
      genre TEXT
    )
    """)
    cols = [r[1] for r in cur.execute("PRAGMA table_info(tracks);")]

    need = {
        "license_code": "TEXT",
        "license_url": "TEXT",
        "genre": "TEXT",
        "duration": "REAL",
        "local_path": "TEXT"
    }
    for col, sqltype in need.items():
        if col not in cols:
            cur.execute(f"ALTER TABLE tracks ADD COLUMN {col} {sqltype};")

    con.commit()
print("✅ Database schema fixed and ready.")


✅ Database schema fixed and ready.


In [12]:
fast_import_from_internet_archive(
    query='collection:netlabels AND mediatype:audio AND subject:(ambient)',
    max_items=20,
    max_per_identifier=2,
    genre_tag="ambient"
)


✅ schema ok at /content/music.db


Discovering: 0rel [00:00, ?rel/s]

Downloading:   0%|          | 0/21 [00:00<?, ?trk/s]

Done: 21 ok, 0 skipped/failed


21

In [13]:
genres = {
    "ambient": "collection:netlabels AND mediatype:audio AND subject:(ambient OR atmosphere)",
    "lofi": "collection:netlabels AND mediatype:audio AND subject:(lofi OR chillhop OR chill)",
    "electronic": "collection:netlabels AND mediatype:audio AND subject:(electronic OR synth OR downtempo)",
    "rock": "collection:netlabels AND mediatype:audio AND subject:(rock OR alternative OR indie)",
    "jazz": "collection:netlabels AND mediatype:audio AND subject:(jazz OR blues OR swing)",
    "hiphop": "collection:netlabels AND mediatype:audio AND subject:(hiphop OR rap OR beats)",
}

for g, q in genres.items():
    fast_import_from_internet_archive(query=q, max_items=1000, max_per_identifier=3, genre_tag=g)


✅ schema ok at /content/music.db


Discovering: 0rel [00:00, ?rel/s]

Downloading:   0%|          | 0/1000 [00:00<?, ?trk/s]

Done: 1000 ok, 0 skipped/failed
✅ schema ok at /content/music.db


Discovering: 0rel [00:00, ?rel/s]

Downloading:   0%|          | 0/1001 [00:00<?, ?trk/s]

Done: 1001 ok, 0 skipped/failed
✅ schema ok at /content/music.db


Discovering: 0rel [00:00, ?rel/s]

Downloading:   0%|          | 0/1001 [00:00<?, ?trk/s]

skip: https://archive.org/download/SomeAncientHoliday/Track 02. Mrs. Danvers - Estuary 883 FDC#3 dedicated.mp3 - HTTPError 404 Client Error: Not Found for url: https://ia600207.us.archive.org/9/items/SomeAncientHoliday/Track%2002.%20Mrs.%20Danvers%20-%20Estuary%20883%20FDC#3%20dedicated.mp3
Done: 1000 ok, 1 skipped/failed
✅ schema ok at /content/music.db


Discovering: 0rel [00:00, ?rel/s]

Downloading:   0%|          | 0/1001 [00:00<?, ?trk/s]

skip: https://archive.org/download/SBTR005BONEYARDSKULLBONG/the Boneyard Sk%C3%BCllbong Rock n Roll Horror Show - 1134.mp3 - HTTPError 404 Client Error: Not Found for url: https://ia803204.us.archive.org/15/items/SBTR005BONEYARDSKULLBONG/the%20Boneyard%20Sk%C3%BCllbong%20Rock%20n%20Roll%20Horror%20Show%20-%201134.mp3
skip: https://archive.org/download/SBTR005BONEYARDSKULLBONG/the Boneyard Sk%C3%BCllbong Rock n Roll Horror Show - 1134.ogg - HTTPError 404 Client Error: Not Found for url: https://ia903204.us.archive.org/15/items/SBTR005BONEYARDSKULLBONG/the%20Boneyard%20Sk%C3%BCllbong%20Rock%20n%20Roll%20Horror%20Show%20-%201134.ogg
skip: https://archive.org/download/SBTR005BONEYARDSKULLBONG/the Boneyard Sk%C3%BCllbong Rock n Roll Horror Show - American Horror Story.mp3 - HTTPError 404 Client Error: Not Found for url: https://ia803204.us.archive.org/15/items/SBTR005BONEYARDSKULLBONG/the%20Boneyard%20Sk%C3%BCllbong%20Rock%20n%20Roll%20Horror%20Show%20-%20American%20Horror%20Story.mp3
Done:

Discovering: 0rel [00:00, ?rel/s]

Downloading:   0%|          | 0/1001 [00:00<?, ?trk/s]

Done: 1001 ok, 0 skipped/failed
✅ schema ok at /content/music.db


Discovering: 0rel [00:00, ?rel/s]

Downloading:   0%|          | 0/1002 [00:00<?, ?trk/s]

skip: https://archive.org/download/hnr029/01.FuckYourMessiah.flac - OSError [Errno 28] No space left on device
skip: https://archive.org/download/hnr029/01.FuckYourMessiah.mp3 - OSError [Errno 28] No space left on device
skip: https://archive.org/download/hnr029/01.FuckYourMessiah.ogg - OSError [Errno 28] No space left on device
skip: https://archive.org/download/kreislauf140/01_sniff_the_sun_dust.mp3 - OSError [Errno 28] No space left on device
skip: https://archive.org/download/kreislauf140/02_rubberneck.mp3 - OSError [Errno 28] No space left on device
skip: https://archive.org/download/kreislauf140/03_hot_coppers.mp3 - OSError [Errno 28] No space left on device
skip: https://archive.org/download/ende595-pastel-boi-singles-2/01 -  pastel boi - smeared.flac - OSError [Errno 28] No space left on device
skip: https://archive.org/download/ende595-pastel-boi-singles-2/01 -  pastel boi - smeared.mp3 - OSError [Errno 28] No space left on device: '/content/assets/Unknown-_ENDE595__-pastel_bo