In [12]:
import asyncio
import base64
import os
from urllib.parse import urljoin

import cv2
import requests
from anthropic import AsyncAnthropic
from bs4 import BeautifulSoup
from tqdm import tqdm

In [13]:
# Root URL of the Manim documentation site. Its 39-character length is the
# offset that link_to_file_name slices off the front of each link.
LINK_PREFIX = "https://docs.manim.community/en/stable/"
# Directory that downloaded images and videos are written to and read back from.
MEDIA_OUTPUT_DIR = "downloaded_media"
# Directory that receives one summary text file per input page.
SUMMARY_OUTPUT_DIR = "summary"
# Directory holding the saved pages; the first line of each file is read as the page's URL.
INPUT_DIR = "page_content"

# Shared async API client, created with default settings.
# NOTE(review): presumably picks up credentials from the environment — confirm.
client = AsyncAnthropic()

In [14]:
def link_to_file_name(link):
    """Turn a documentation URL into a flat file name.

    The first 39 characters (the length of the docs URL prefix) are dropped
    and every remaining path separator becomes a dot, so the result can be
    used as a single file name inside one directory.

    Args:
        link: Absolute URL as a string.

    Returns:
        The part of ``link`` after the first 39 characters with "/" replaced
        by ".". Links shorter than that yield an empty string.
    """
    relative_part = link[39:]
    segments = relative_part.split('/')
    return '.'.join(segments)

In [15]:
def extract_text_and_media(path):
    """Parse a saved HTML page into an ordered list of text and media entries.

    Every ``<img>`` and ``<video>`` tag with a resolvable source is swapped for
    a marker string, the page text is split on that marker, and the text
    pieces are interleaved with the media in document order.

    Args:
        path: Path of the HTML file to read.

    Returns:
        A list of 2-tuples: ``("text", str)`` for non-empty text pieces,
        ``("image", src)`` for images and ``("video", src)`` for videos, where
        ``src`` is the (possibly relative) source attribute of the tag.
        Media tags without any source are removed and not reported.
    """
    with open(path, "rb") as f:
        data = f.read()
    soup = BeautifulSoup(data, "html.parser")
    marker = "<<<MEDIA>>>"
    media_list = []

    # Collect media sources and replace each media tag with the marker.
    for tag in soup.find_all(["img", "video"]):
        if tag.name == "img":
            kind = "image"
            # Lazy-loaded images keep their URL in "data-src" (ASCII hyphen).
            src = tag.get("src") or tag.get("data-src")
        else:
            kind = "video"
            src = tag.get("src")
            if not src:
                source_tag = tag.find("source")
                if source_tag is not None:
                    src = source_tag.get("src")

        if not src:
            # Nothing to download for this tag: drop it without a marker so
            # the text/media alignment below stays intact.
            tag.extract()
            continue

        media_list.append((kind, src))
        tag.replace_with(marker)

    # Split the remaining text on the marker; piece i precedes media i.
    full_text = soup.get_text()
    parts = [piece.strip() for piece in full_text.split(marker)]

    # Interleave text pieces with the media that followed them.
    result = []
    for i, media in enumerate(media_list):
        if parts[i]:
            result.append(("text", parts[i]))
        result.append(media)
    # Trailing text after the last media item (or the whole page if no media).
    if len(parts) > len(media_list) and parts[-1]:
        result.append(("text", parts[-1]))

    return result

In [16]:
def save_images_and_videos(url, content, timeout=30):
    """Download every image and video listed in ``content`` into MEDIA_OUTPUT_DIR.

    Each media source is resolved against ``url``, fetched, and written to a
    file named by ``link_to_file_name``. Progress is reported with
    ``[saved]`` / ``[failed]`` / ``[skipped]`` lines on stdout; a failure for
    one item does not stop the remaining downloads.

    Args:
        url: URL of the page the media belongs to (base for relative sources).
        content: Iterable of ``(kind, info)`` tuples; only entries whose kind
            is "image" or "video" are downloaded.
        timeout: Seconds to wait for each request before giving up.
    """
    os.makedirs(MEDIA_OUTPUT_DIR, exist_ok=True)

    # The context manager closes the pooled connections when we are done.
    with requests.Session() as session:
        for t, info in content:
            if t not in ("image", "video"):
                continue
            if not info:
                # urljoin(url, None) would resolve to the page itself.
                print(f"[skipped] {t} without a source on {url}")
                continue

            full_url = urljoin(url, info)
            file_name = link_to_file_name(full_url)

            try:
                response = session.get(full_url, timeout=timeout)
            except requests.RequestException as exc:
                print(f"[failed] {full_url} ({exc})")
                continue

            if response.status_code == 200:
                # Same path construction as the code that reads the file back.
                with open(os.path.join(MEDIA_OUTPUT_DIR, file_name), "wb") as f:
                    f.write(response.content)
                print(f"[saved] {file_name}")
            else:
                print(f"[failed] {full_url} (status code {response.status_code})")

In [None]:
def _detect_image_media_type(image_data):
    """Return the MIME type implied by the leading bytes of ``image_data``.

    Recognises JPEG, GIF and WebP signatures; anything else is reported as
    "image/png".
    """
    if image_data.startswith(b"\xff\xd8\xff"):
        return "image/jpeg"
    if image_data.startswith((b"GIF87a", b"GIF89a")):
        return "image/gif"
    if image_data[:4] == b"RIFF" and image_data[8:12] == b"WEBP":
        return "image/webp"
    return "image/png"


def _image_block(image_data, media_type):
    """Build one base64 image content block for the messages API."""
    return {
        "type": "image",
        "source": {
            "type": "base64",
            "media_type": media_type,
            "data": base64.standard_b64encode(image_data).decode("utf-8"),
        }
    }


def _read_middle_frame_png(path):
    """Return the middle frame of the video at ``path`` as PNG bytes, or None if it cannot be read."""
    cap = cv2.VideoCapture(path)
    try:
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_count // 2)
        ok, frame = cap.read()
    finally:
        # Release the capture even if seeking or decoding raised.
        cap.release()

    if not ok or frame is None:
        return None
    encoded, buffer = cv2.imencode(".png", frame)
    if not encoded:
        return None
    return buffer.tobytes()


def construct_message_with_media(url, content):
    """Build a single user message from interleaved text and media entries.

    Text entries become text blocks. Images are read from MEDIA_OUTPUT_DIR and
    sent as base64 image blocks labelled with their detected format. Videos
    are represented by their middle frame, encoded as PNG. SVG images, media
    without a source, and videos with no readable frame are left out.

    Args:
        url: URL of the page, used to resolve relative media sources.
        content: Iterable of ``(kind, info)`` tuples where kind is "text",
            "image" or "video".

    Returns:
        ``{"role": "user", "content": [...]}`` with the blocks in input order.

    Raises:
        OSError: If a referenced image file cannot be opened.
    """
    message_content = []

    for t, info in content:
        if t not in ("image", "video"):
            message_content.append({ "type": "text", "text": info })
            continue

        if not info:
            # No source to resolve, so there is no downloaded file to attach.
            continue

        full_url = urljoin(url, info)
        path = os.path.join(MEDIA_OUTPUT_DIR, link_to_file_name(full_url))

        if t == "image":
            # No svg (matched case-insensitively).
            if path.lower().endswith("svg"):
                continue

            with open(path, "rb") as f:
                image_data = f.read()
            media_type = _detect_image_media_type(image_data)
        else:
            image_data = _read_middle_frame_png(path)
            if image_data is None:
                print(f"[skipped] could not read a frame from {path}")
                continue
            media_type = "image/png"

        message_content.append(_image_block(image_data, media_type))

    return { "role": "user", "content": message_content }

In [8]:
def run_save():
    """Download the media of every saved page found in INPUT_DIR.

    For each file, the first line is read as the page URL, the page is parsed
    into text/media entries, and the media entries are downloaded.
    """
    for entry in os.listdir(INPUT_DIR):
        page_path = os.path.join(INPUT_DIR, entry)

        # The page URL is stored on the first line of the saved file.
        with open(page_path, "rb") as page_file:
            first_line = page_file.readline()
        page_url = first_line.decode('utf-8').strip()

        save_images_and_videos(page_url, extract_text_and_media(page_path))

# run_save()

In [11]:
results = []

for file_name in tqdm(os.listdir(INPUT_DIR)):
    path = os.path.join(INPUT_DIR, file_name)
        
    with open(path, "rb") as f:
        url = f.readline().decode('utf-8').strip()
        
    content = extract_text_and_media(path)
    user_message = [construct_message_with_media(url, content)]
        
    query = client.messages.create(
        model="claude-3-5-haiku-20241022",
        temperature=1,
        max_tokens=8192,
        system="You are an expert technical writer. You will be given the content of some documentation for the Manim library. Please summarize the information very succinctly while also making sure that nothing is left out. Minimizing number of characters is of high importance here, keep everything in around 1 paragraph unless the information is actually really important! Furthermore, if something isn't of importance to the actual documentation (e.g. changelog, how to contribute, etc.) just say 'Not Relevant' and don't output a summary <-- very important (e.g. say 'Not Relevant' for changelog stuffs). Output only the summary/response and nothing more -- NO commentary",
        messages=user_message
    )
    results.append((file_name, query))

for file_name, query in tqdm(results):
    query = await query
    info = query.content[0].text

    with open(os.path.join(SUMMARY_OUTPUT_DIR, file_name.replace("html", "txt")), "w", encoding="utf-8") as f:
        f.write(info)


  results = []
100%|██████████| 539/539 [01:46<00:00,  5.06it/s]
100%|██████████| 539/539 [1:27:58<00:00,  9.79s/it]
