In [8]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

# Function to extract the manga name from the URL
def extract_manga_name(url):
    """Derive a folder-friendly manga name from a chapter URL.

    The URL path is taken without its leading/trailing slashes, everything
    from the first "-chapter" marker onwards is dropped, and any remaining
    "/" is replaced with "_" so the result is a single path component.

    Args:
        url: Chapter URL, e.g. "https://host/some-title-chapter-14/".

    Returns:
        The name as a string (empty when the URL has no path).
    """
    trimmed_path = urlparse(url).path.strip("/")
    # Keep only what precedes the first "-chapter"; if the marker is absent
    # the whole path is kept.
    series_slug, _, _ = trimmed_path.partition("-chapter")
    return series_slug.replace("/", "_")

# Function to sanitize filenames
def sanitize_filename(filename):
    """Strip a filename down to alphanumerics, spaces, dots and underscores.

    Every other character is dropped (not replaced), and trailing whitespace
    is removed from the result. Leading whitespace is kept.

    Args:
        filename: The raw name to clean.

    Returns:
        The cleaned name as a string.
    """
    allowed_punctuation = {" ", ".", "_"}
    kept_chars = []
    for ch in filename:
        if ch.isalnum() or ch in allowed_punctuation:
            kept_chars.append(ch)
    return "".join(kept_chars).rstrip()

# Function to download images in order from the "readerarea" div
def download_manga_images(
    url,
    save_root="Z:/github/Autotranslate_Manga/MangaAutoTranslator/dataset/in/jap",
    timeout=30,
):
    """Download the page images of one manga chapter, preserving page order.

    Fetches the chapter page, looks for the ``<div id="readerarea">`` element
    and downloads every ``<img>`` inside it whose URL path ends in a supported
    image extension. Files are written to ``<save_root>/<manga name>/`` and
    named ``001.ext``, ``002.ext``, ... after the position of the ``<img>``
    tag inside the reader area. Tags that are skipped (no ``src`` or an
    unsupported extension) still consume a number, so numbering can have gaps.

    Args:
        url: URL of the chapter page.
        save_root: Directory under which the per-manga folder is created.
            Defaults to the path this notebook originally hard-coded.
        timeout: Timeout in seconds passed to every HTTP request so that a
            stalled server cannot hang the cell forever.

    Returns:
        None. Progress and per-image failures are reported with ``print``.

    Raises:
        requests.RequestException: If the chapter page itself cannot be
            fetched (including ``HTTPError`` from a non-2xx response).
            Failures of individual images are caught and printed instead.
    """
    manga_name = extract_manga_name(url)
    save_dir = os.path.join(save_root, manga_name)  # Folder for the manga
    os.makedirs(save_dir, exist_ok=True)

    # Extensions (lower-case, with dot) that count as page images.
    allowed_extensions = (".png", ".jpg", ".jpeg", ".webp")

    # One session for the page and all images, so connections are reused.
    with requests.Session() as session:
        response = session.get(url, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")

        # Only images inside the reader container are chapter pages.
        readerarea = soup.find("div", id="readerarea")
        if not readerarea:
            print("No 'readerarea' div found on the page.")
            return

        for index, img in enumerate(readerarea.find_all("img"), start=1):
            img_src = img.get("src")
            if not img_src:
                continue

            img_url = urljoin(url, img_src.strip())

            # Take the extension from the URL *path* so that query strings or
            # fragments ("page.jpg?v=2") do not defeat the filter, and so the
            # filter and the saved filename always agree.
            img_extension = os.path.splitext(urlparse(img_url).path)[1]
            if img_extension.lower() not in allowed_extensions:
                continue

            # Zero-padded index keeps files sorted in reading order.
            img_path = os.path.join(save_dir, f"{index:03d}{img_extension}")

            try:
                img_response = session.get(img_url, timeout=timeout)
                img_response.raise_for_status()
                with open(img_path, "wb") as img_file:
                    img_file.write(img_response.content)
                print(f"Downloaded: {img_path}")
            except Exception as e:
                # Deliberately best-effort: one broken image must not abort
                # the rest of the chapter.
                print(f"Failed to download {img_url}: {e}")


In [10]:
# Example usage: download every page image of a single chapter.
# NOTE: running this cell performs live HTTP requests and writes image files
# to disk; progress is printed one line per image.
manga_url = "https://rawkuma.com/jitsu-wa-imouto-deshita-chapter-14/"
download_manga_images(manga_url)

Downloaded: Z:/github/Autotranslate_Manga/MangaAutoTranslator/dataset/in/jap\jitsu-wa-imouto-deshita\001.jpg
Downloaded: Z:/github/Autotranslate_Manga/MangaAutoTranslator/dataset/in/jap\jitsu-wa-imouto-deshita\002.jpg
Downloaded: Z:/github/Autotranslate_Manga/MangaAutoTranslator/dataset/in/jap\jitsu-wa-imouto-deshita\003.jpg
Downloaded: Z:/github/Autotranslate_Manga/MangaAutoTranslator/dataset/in/jap\jitsu-wa-imouto-deshita\004.jpg
Downloaded: Z:/github/Autotranslate_Manga/MangaAutoTranslator/dataset/in/jap\jitsu-wa-imouto-deshita\005.jpg
Downloaded: Z:/github/Autotranslate_Manga/MangaAutoTranslator/dataset/in/jap\jitsu-wa-imouto-deshita\006.jpg
Downloaded: Z:/github/Autotranslate_Manga/MangaAutoTranslator/dataset/in/jap\jitsu-wa-imouto-deshita\007.jpg
Downloaded: Z:/github/Autotranslate_Manga/MangaAutoTranslator/dataset/in/jap\jitsu-wa-imouto-deshita\008.jpg
Downloaded: Z:/github/Autotranslate_Manga/MangaAutoTranslator/dataset/in/jap\jitsu-wa-imouto-deshita\009.jpg
Downloaded: Z:/gith