## Identify sources of media files and download them

### Load the data

In [None]:
import codecs
import json
import os
import time
from urllib.parse import urljoin

import bs4
import requests
from IPython.display import clear_output


DATA_DIR = "data"
WAYBACK_BASE_URL = "https://web.archive.org"

# File name suffixes that identify an HTML document.
html_extensions = [".html", ".htm"]

# Recursively scan DATA_DIR and remember the path of every HTML file found,
# in the order os.walk reports them.
html_files = []
for folder, _subfolders, names in os.walk(DATA_DIR):
    html_files.extend(
        os.path.join(folder, name)
        for name in names
        if name.endswith(tuple(html_extensions))
    )


def download_file(url, dest, retries=3, sleep=1, timeout=30):
    """Download ``url`` to ``dest``, retrying on failure.

    Nothing is downloaded when ``dest`` already exists, so an interrupted
    run can be resumed without fetching files again.

    Args:
        url: Address of the file to fetch.
        dest: Local path the response body is written to.
        retries: Maximum number of download attempts.
        sleep: Seconds to wait between two attempts.
        timeout: Seconds to wait for the server before an attempt is
            considered failed.

    Returns:
        True if ``dest`` already existed or was written successfully,
        False if every attempt failed.
    """
    if os.path.isfile(dest):
        print(f"File {dest} already exists.")
        return True
    print(f"Downloading {url} to {dest}")
    for i in range(retries):
        print(f"Attempt {i+1}/{retries}")
        try:
            response = requests.get(url, timeout=timeout)
            # Treat 4xx/5xx answers as failures; otherwise the body of an
            # error page would be stored under the media file's name.
            response.raise_for_status()
            with open(dest, "wb") as f:
                f.write(response.content)
            return True
        except (requests.RequestException, OSError) as e:
            print(f"Error: {e}")
            # Waiting only makes sense when another attempt follows.
            if i + 1 < retries:
                time.sleep(sleep)
    return False


# Tags whose `src` attribute may point at an embedded resource.
SOURCE_TAGS = (
    "link",
    "script",
    "iframe",
    "embed",
    "audio",
    "video",
    "source",
    "track",
    "object",
    "img",
)

# One entry per media type:
# (JSON file name, plural label, singular label, download sub-directory,
#  file extensions).
# A sub-directory of None means the URLs are only listed, not downloaded
# (this is the case for audio files).
MEDIA_TYPES = (
    ("midi.json", "MIDI files", "MIDI file", "mid", (".mid",)),
    (
        "audio.json",
        "audio files",
        None,
        None,
        (
            ".mp3",
            ".wav",
            ".ogg",
            ".aac",
            ".flac",
            ".alac",
            ".aiff",
            ".dsd",
            ".wma",
            ".opus",
            ".m4a",
        ),
    ),
    (
        "images.json",
        "images",
        "image",
        "images",
        (
            ".jpg",
            ".jpeg",
            ".png",
            ".gif",
            ".svg",
            ".webp",
            ".bmp",
            ".ico",
            ".tiff",
            ".tif",
        ),
    ),
    (
        "videos.json",
        "videos",
        "video",
        "videos",
        (".mp4", ".webm", ".ogg", ".avi", ".flv", ".mov", ".wmv", ".mkv"),
    ),
    ("swf.json", "Shockwave files", "Shockwave file", "swf", (".swf",)),
    ("vrml.json", "VRML files", "VRML file", "vrml", (".wrl",)),
)


def _read_html(path):
    """Return the text of ``path``, or None when it cannot be read.

    UTF-8 is tried first; latin-1 is used only if the UTF-8 read fails, so
    a successfully decoded UTF-8 page is never re-read with the wrong
    encoding.
    """
    try:
        with codecs.open(path, "r", "utf-8") as f:
            return f.read()
    except (OSError, UnicodeError) as e:
        print(f"Error reading file: {e} in utf-8, trying latin-1")

    try:
        with codecs.open(path, "r", "latin-1") as f:
            return f.read()
    except (OSError, UnicodeError) as e:
        print(f"Error reading file: {e} in latin-1")
    return None


def _save_json(data, path):
    """Write ``data`` to ``path`` as indented JSON."""
    with open(path, "w") as f:
        json.dump(data, f, indent=2)


def _collect_media(sources, html_dir, json_name, plural, singular, subdir, extensions):
    """List the URLs in ``sources`` with one of ``extensions`` and fetch them.

    The matching URLs are saved to ``json_name`` inside ``html_dir``. When
    ``subdir`` is not None, that directory is created inside ``html_dir``
    and every URL is downloaded into it.
    """
    # Compare in lower case so that names such as "SONG.MID" are found too.
    urls = [src for src in sources if src.lower().endswith(extensions)]

    json_path = os.path.join(html_dir, json_name)
    print(f"Found {len(urls)} {plural}, saving results to {json_path}")
    _save_json(urls, json_path)

    if subdir is None:
        return urls

    target_dir = os.path.join(html_dir, subdir)
    os.makedirs(target_dir, exist_ok=True)
    print(f"Dowloading {len(urls)} {plural} to {target_dir}")
    for url in urls:
        print(f"Downloading {singular}:", url)
        download_file(url, os.path.join(target_dir, os.path.basename(url)))
    return urls


print(f"Found {len(html_files)} HTML files.")

# NOTE: all result files are written next to the HTML file under fixed
# names, so several pages in the same directory overwrite each other's
# JSON files.
for file_path in html_files:
    print("Processing file:", file_path)
    html_dir = os.path.dirname(file_path)

    html_content = _read_html(file_path)
    if not html_content:
        print("Error reading file, skipping...")
        continue

    soup = bs4.BeautifulSoup(html_content, "html.parser")

    ########## Extract basic metadata ##########
    # Pages without a <title> or without a description must not abort the
    # whole run; the missing value is stored as null instead.
    title_tag = soup.title
    title = title_tag.string if title_tag is not None else None
    description_tag = soup.find("meta", attrs={"name": "description"})
    description = (
        description_tag.get("content") if description_tag is not None else None
    )

    metadata = {
        "title": title,
        "description": description,
    }
    _save_json(metadata, os.path.join(html_dir, "metadata.json"))

    ########## Extract links ##########
    links = [a.get("href") for a in soup.find_all("a") if a.get("href")]
    _save_json(links, os.path.join(html_dir, "links.json"))

    ########## Extract sources ##########
    # Resolve every `src` against the Wayback Machine host. urljoin keeps
    # absolute URLs as they are and inserts the missing "/" for relative
    # ones, which plain string concatenation does not.
    sources = []
    for tag in SOURCE_TAGS:
        for element in soup.find_all(tag):
            src = element.get("src")
            if src:
                sources.append(urljoin(WAYBACK_BASE_URL, src))

    sources_path = os.path.join(html_dir, "sources.json")
    print(f"Found {len(sources)} sources, saving results to {sources_path}")
    _save_json(sources, sources_path)

    ########## Extract and download media by type ##########
    for json_name, plural, singular, subdir, extensions in MEDIA_TYPES:
        _collect_media(
            sources, html_dir, json_name, plural, singular, subdir, extensions
        )

    # Keep the notebook output short: drop the log of the finished page.
    clear_output()

### Congrats! You have downloaded the media assets from the HTML pages!
Check deep in your `data` folder to see the downloaded assets.