<a href="https://colab.research.google.com/github/yuhrtm/Tibame_GAD245/blob/main/%E6%8A%93ptt_beauty%E7%85%A7%E7%89%87.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 抓 PTT Beauty 的照片（指定網址）

In [None]:
import os
import urllib.request as req
from bs4 import BeautifulSoup
import json
import re
import socket

# Configuration constants
# Sent as the User-Agent header on every HTTP request made by this script.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0"
# Lower-case link extensions (without the dot) that are treated as images.
SUPPORTED_IMAGE_EXTENSIONS = {"jpg", "jpeg", "png", "gif"}
TIMEOUT = 10  # Timeout for each network request, in seconds
BASE_FOLDER = "drive/MyDrive/PTT_Beauty"  # Root folder; one sub-folder is created per article

def create_folder(path):
    """Ensure the directory ``path`` exists, creating missing parents.

    A directory that is already present is left as-is and is not an error.
    """
    if os.path.isdir(path):
        return
    # exist_ok still matters here: another process may create the directory
    # between the check above and this call.
    os.makedirs(path, exist_ok=True)

def sanitize_filename(filename):
    """Return ``filename`` with characters that are invalid in file names removed.

    The removed characters are the backslash, slash, asterisk, question
    mark, colon, double quote, angle brackets and the pipe. Everything else
    (including spaces and dots) is kept unchanged.
    """
    illegal_chars = re.compile(r'[\\/*?:"<>|]')
    return illegal_chars.sub("", filename)

def fetch_html(url):
    """Download ``url`` and parse the response as HTML.

    Args:
        url: Address of the page to fetch.

    Returns:
        The parsed ``BeautifulSoup`` document, or ``None`` when the request
        or the parsing fails. Failures are printed rather than raised so a
        single bad page does not abort the caller.
    """
    try:
        request = req.Request(url)
        request.add_header("User-Agent", USER_AGENT)
        # Use a context manager so the HTTP connection is always released.
        # BeautifulSoup consumes the whole body inside its constructor, so
        # the response can safely be closed as soon as parsing returns.
        with req.urlopen(request, timeout=TIMEOUT) as response:
            return BeautifulSoup(response, "html.parser")
    except Exception as e:
        print(f"無法獲取 URL: {url}, 錯誤原因: {e}")
        return None

def download_image(href, save_path):
    """Download the resource at ``href`` and store it at ``save_path``.

    Args:
        href: URL of the image to download.
        save_path: Destination file path; an existing file is overwritten.

    Returns:
        ``True`` when the file was written, ``False`` when the download or
        the write failed. Failures are printed rather than raised.
    """
    try:
        request = req.Request(href)
        request.add_header("User-Agent", USER_AGENT)
        # Fetch the complete body before touching the disk, so a failed or
        # timed-out download does not leave an empty file behind.
        with req.urlopen(request, timeout=TIMEOUT) as img:
            data = img.read()
        with open(save_path, "wb") as f:
            f.write(data)
        print(f"  成功下載圖片: {save_path}")
        return True
    except Exception as e:
        print(f"  下載失敗: {href}, 錯誤原因: {e}")
        return False

def extract_title(html):
    """Return the article title taken from the ``og:title`` meta tag.

    Args:
        html: Parsed document exposing a BeautifulSoup-style ``find``.

    Returns:
        The stripped ``content`` of the ``og:title`` meta tag. The
        placeholder ``"未知標題"`` is returned when the tag is missing, has no
        ``content`` attribute, or the content is empty or whitespace only.
    """
    title_tag = html.find("meta", property="og:title")
    if title_tag is None:
        return "未知標題"

    content = title_tag.attrs.get("content")
    title = content.strip() if isinstance(content, str) else ""
    # An empty title would later be used as a folder name, so treat it the
    # same as a missing one.
    return title or "未知標題"

def extract_images(html, folder_path):
    """Download every image linked from ``html`` into ``folder_path``.

    A link counts as an image when the extension of its URL *path* is in
    ``SUPPORTED_IMAGE_EXTENSIONS``. Query strings and fragments are ignored
    for that check, so ``.../a.jpg?x=1`` is recognised and saved as
    ``a.jpg``. Each distinct URL is downloaded only once.

    Args:
        html: Parsed document exposing a BeautifulSoup-style ``find_all``.
        folder_path: Directory that receives the files; created if missing.

    Returns:
        A list of ``{"url": ..., "file_path": ...}`` dicts, one per image
        link that was attempted, in document order.
    """
    # Imported locally so this function does not depend on a file-level import.
    from urllib.parse import urlsplit

    create_folder(folder_path)
    images = []
    seen = set()
    for link in html.find_all("a", href=True):
        href = link["href"]
        if href in seen:
            # The same URL linked twice would only overwrite the same file.
            continue
        seen.add(href)

        try:
            url_path = urlsplit(href).path
        except ValueError:
            # Malformed URL (e.g. unbalanced IPv6 brackets): not downloadable.
            continue

        # URL paths always use "/", regardless of the local OS separator.
        file_name = url_path.rsplit("/", 1)[-1]
        ext = file_name.rsplit(".", 1)[-1].lower() if "." in file_name else ""
        if ext not in SUPPORTED_IMAGE_EXTENSIONS:
            continue

        save_path = os.path.join(folder_path, sanitize_filename(file_name))
        download_image(href, save_path)
        images.append({"url": href, "file_path": save_path})
    return images

def process_article(url):
    """Download one article's images and write a ``metadata.json`` next to them.

    The files go into a sub-folder of ``BASE_FOLDER`` named after the first
    30 characters of the article title, with invalid file-name characters
    removed. If that name is unusable, the last segment of ``url`` is used
    instead.

    Args:
        url: Address of the article page.

    Returns:
        ``None``. Nothing is written when the page cannot be fetched.
    """
    print(f"\n正在處理文章: {url}")
    html = fetch_html(url)
    if html is None:
        return

    title = extract_title(html)
    print(f"文章標題: {title}")
    sanitized_title = sanitize_filename(title[:30])  # keep the folder name short

    # A name that is empty or made only of dots and spaces ("", ".", "..")
    # would make os.path.join resolve to BASE_FOLDER itself or its parent,
    # mixing files from different articles or writing outside the output
    # tree. Fall back to the article id taken from the URL.
    if not sanitized_title.strip(" ."):
        article_id = url.rstrip("/").rsplit("/", 1)[-1]
        sanitized_title = sanitize_filename(article_id)
        if not sanitized_title.strip(" ."):
            sanitized_title = "未知標題"

    # Create the per-article sub-folder under the root folder.
    article_folder = os.path.join(BASE_FOLDER, sanitized_title)
    create_folder(article_folder)

    # Download the images, then record what was fetched.
    images = extract_images(html, article_folder)

    metadata = {
        "article_url": url,
        "title": title,
        "images": images
    }

    metadata_path = os.path.join(article_folder, "metadata.json")
    with open(metadata_path, "w", encoding="utf-8") as f:
        json.dump(metadata, f, ensure_ascii=False, indent=4)
    print(f"元資料已儲存至: {metadata_path}")
    print(f"完成處理文章: {title}")
    print("-" * 50)

def process_index_page(index_url):
    """Fetch a board index page and process every article it links to.

    Each ``div`` with class ``title`` is inspected. When it contains an
    anchor with an ``href``, that path is appended to ``https://www.ptt.cc``
    and handed to ``process_article``. Entries without a link are skipped.
    Nothing happens if the index page cannot be fetched.
    """
    print(f"正在處理索引頁面: {index_url}")
    index_html = fetch_html(index_url)
    if not index_html:
        return

    # Lazily pair each title entry with its first anchor (None when absent).
    anchors = (entry.find("a") for entry in index_html.find_all("div", class_="title"))
    for anchor in anchors:
        if not anchor or "href" not in anchor.attrs:
            continue
        process_article(f"https://www.ptt.cc{anchor['href']}")

def main(index_url="https://www.ptt.cc/bbs/Beauty/index3950.html"):
    """Entry point: scrape every article linked from one board index page.

    Args:
        index_url: Index page to process. Defaults to the page this script
            was originally written for, so ``main()`` behaves as before.

    Ctrl-C and unexpected errors are reported on stdout instead of being
    propagated.
    """
    # Make sure the root output folder exists before anything is downloaded.
    create_folder(BASE_FOLDER)

    try:
        process_index_page(index_url)
    except KeyboardInterrupt:
        print("\n程式已中止！")
    except Exception as e:
        print(f"主程式執行時發生錯誤: {e}")

# Run the scraper only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()


正在處理索引頁面: https://www.ptt.cc/bbs/Beauty/index3950.html

正在處理文章: https://www.ptt.cc/bbs/Beauty/M.1737162050.A.EA1.html
文章標題: [正妹] Cosplay 1614 日本 尤妮
  成功下載圖片: drive/MyDrive/PTT_Beauty/[正妹] Cosplay 1614 日本 尤妮/VYxMbjE.jpeg
  成功下載圖片: drive/MyDrive/PTT_Beauty/[正妹] Cosplay 1614 日本 尤妮/eD1CSNX.jpeg
  成功下載圖片: drive/MyDrive/PTT_Beauty/[正妹] Cosplay 1614 日本 尤妮/xT2mpqY.jpeg
  成功下載圖片: drive/MyDrive/PTT_Beauty/[正妹] Cosplay 1614 日本 尤妮/6CEIy7Z.jpeg
  成功下載圖片: drive/MyDrive/PTT_Beauty/[正妹] Cosplay 1614 日本 尤妮/JuCSc3Y.jpeg
  成功下載圖片: drive/MyDrive/PTT_Beauty/[正妹] Cosplay 1614 日本 尤妮/e8dN5uA.png
元資料已儲存至: drive/MyDrive/PTT_Beauty/[正妹] Cosplay 1614 日本 尤妮/metadata.json
完成處理文章: [正妹] Cosplay 1614 日本 尤妮
--------------------------------------------------

正在處理文章: https://www.ptt.cc/bbs/Beauty/M.1737164418.A.B9B.html
文章標題: [正妹] 襯衫下的驚喜
  成功下載圖片: drive/MyDrive/PTT_Beauty/[正妹] 襯衫下的驚喜/EC72Vej5_o.png
  成功下載圖片: drive/MyDrive/PTT_Beauty/[正妹] 襯衫下的驚喜/iBq2bzgs_o.png
  成功下載圖片: drive/MyDrive/PTT_Beauty/[正妹] 襯衫下的驚喜/vjnkcs4S_o.png