In [None]:
import requests
from bs4 import BeautifulSoup
import os
import re
import time

# Start page of the crawl. Placeholder value: replace it with the real site
# URL before running this cell.
BASE_URL = "https://*** fill the web you needed"

# URLs that crawl_recursive has already processed.
VISITED = set()

# Headers passed to every requests.get call in this cell.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Target directory for downloaded documents; created on load if it is missing.
DOWNLOAD_DIR = "samurai_workshop_downloads"
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

def download_file(url, filename):
    """Stream the resource at ``url`` into ``DOWNLOAD_DIR/filename``.

    The body is written to a temporary ``<filename>.part`` file first and is
    renamed to its final name only after the transfer has finished. An aborted
    download therefore never leaves a truncated document under the final name
    and never overwrites a complete copy from an earlier run.

    Args:
        url: Absolute URL of the file to fetch.
        filename: Name of the target file inside ``DOWNLOAD_DIR``.

    Returns:
        True if the file was saved, False if the request or the write failed.
        Failures are printed, not raised.
    """
    filepath = os.path.join(DOWNLOAD_DIR, filename)
    partial_path = filepath + '.part'
    try:
        # Using the response as a context manager releases the connection even
        # when the streamed body is not consumed to the end.
        with requests.get(url, stream=True, headers=HEADERS, timeout=30) as response:
            response.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(partial_path, filepath)
        print(f"  √ 已下载: {filename}")
        return True
    except Exception as e:
        # Deliberately broad: a single failed download must not stop the crawl.
        try:
            os.remove(partial_path)
        except OSError:
            pass  # no partial file was created, or it is already gone
        print(f"  × 下载失败 {url}: {e}")
        return False

def clean_filename(title):
    """Return ``title`` with characters that are unsafe in file names removed.

    Removes the characters ``\\ / : * ? " < > |`` as well as ASCII control
    characters (0x00-0x1F, e.g. newlines and tabs that can appear inside link
    text), then strips surrounding whitespace.

    Args:
        title: Arbitrary text, typically the visible text of a link.

    Returns:
        The cleaned string; may be empty.
    """
    return re.sub(r'[\\/:*?"<>|\x00-\x1f]', '', title).strip()

def is_document_link(href):
    """Tell whether ``href`` points at a downloadable document.

    A link counts as a document when it ends in one of the extensions pdf,
    ppt/pptx, doc/docx, xls/xlsx/xlsm or zip (case-insensitive). The check is
    applied to the raw link first and, if that fails, to the link with its
    ``?query`` and ``#fragment`` removed, so ``slides.pdf?download=1`` and
    ``slides.pdf#page=2`` are recognised as well.

    Args:
        href: Link target, relative or absolute.

    Returns:
        The ``re.Match`` object for the extension (truthy) or ``None``.
    """
    pattern = r'\.(pdf|pptx?|docx?|xls[xm]?|zip)$'
    match = re.search(pattern, href, re.IGNORECASE)
    if match is None:
        path_only = href.split('#', 1)[0].split('?', 1)[0]
        match = re.search(pattern, path_only, re.IGNORECASE)
    return match

def get_absolute_url(base, link):
    """Resolve the ``href`` value ``link`` against the page URL ``base``.

    Resolution is delegated to ``urllib.parse.urljoin``, which handles
    ``../`` segments, protocol-relative links (``//host/path``) and links
    with their own scheme. Links such as ``mailto:`` or ``javascript:`` are
    returned unchanged, so they do not start with ``http`` and can be filtered
    out by the caller. The fragment is dropped from the result because it
    never identifies a different resource. Empty and fragment-only links
    resolve to ``base`` itself.

    Heuristic: when the last path segment of ``base`` contains no dot
    (``http://host/dir``) the base is treated as a directory, so a relative
    link resolves to ``http://host/dir/link``. When it looks like a file
    (``http://host/dir/page.html``) the link resolves next to that file.

    Args:
        base: URL of the page on which the link was found.
        link: Raw ``href`` value.

    Returns:
        The resolved URL as a string, or the stripped ``link`` itself if the
        inputs cannot be parsed as URLs.
    """
    from urllib.parse import urldefrag, urljoin, urlsplit, urlunsplit

    link = link.strip()
    try:
        if not link or link.startswith('#'):
            # Same document: avoid producing ever-growing "base/#top" URLs.
            return urldefrag(base).url
        base_parts = urlsplit(base)
        last_segment = base_parts.path.rsplit('/', 1)[-1]
        if last_segment and '.' not in last_segment:
            # Directory-like base without trailing slash.
            base = urlunsplit(base_parts._replace(path=base_parts.path + '/'))
        return urldefrag(urljoin(base, link)).url
    except ValueError:
        # urllib rejects some malformed URLs (e.g. broken IPv6 literals).
        return link

def crawl_recursive(url, depth=1, max_depth=3):
    """Fetch ``url``, download the documents it links to and crawl its sub-pages.

    Every ``<a href>`` that ``is_document_link`` accepts is downloaded (once per
    document URL) as ``<link text>_<original file name>``. Every other
    http(s) link on the same host as the current page is crawled recursively.
    Links to other hosts are not followed, so the crawl stays on the site it
    started from.

    Args:
        url: Page to fetch.
        depth: Depth of ``url`` in the crawl; the start page is 1.
        max_depth: Pages deeper than this are not fetched.

    Side effects:
        Adds page and document URLs to ``VISITED``, prints progress and writes
        files through ``download_file``. Errors while fetching or processing a
        page are printed, not raised.
    """
    from urllib.parse import urlsplit

    if depth > max_depth or url in VISITED:
        return
    VISITED.add(url)
    indent = '  ' * depth
    print(f"{indent}→ [{depth}] {url}")
    try:
        resp = requests.get(url, headers=HEADERS, timeout=30)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'html.parser')
        anchors = soup.find_all('a', href=True)
        # Accept the requested host and, after a redirect, the final host.
        allowed_hosts = {
            urlsplit(url).netloc.lower(),
            urlsplit(resp.url).netloc.lower(),
        }

        # Download every document link.
        for a in anchors:
            href = a['href']
            if not is_document_link(href):
                continue
            file_url = get_absolute_url(url, href)
            if file_url in VISITED:
                continue  # already fetched via another link or page
            VISITED.add(file_url)
            # File name = last path segment without query string or fragment.
            original_filename = os.path.basename(file_url.split('#')[0].split('?')[0])
            title = clean_filename(a.get_text(strip=True)) or "file"
            final_filename = f"{title}_{original_filename}"
            download_file(file_url, final_filename)

        # Recurse into sub-pages: http(s) only, no documents, same host only.
        for a in anchors:
            abs_link = get_absolute_url(url, a['href'])
            if not abs_link.startswith('http') or is_document_link(abs_link):
                continue
            try:
                link_host = urlsplit(abs_link).netloc.lower()
            except ValueError:
                continue  # malformed URL, cannot be crawled
            if link_host not in allowed_hosts:
                continue
            crawl_recursive(abs_link, depth + 1, max_depth)
    except Exception as e:
        # Deliberately broad: a broken page must not abort the whole crawl.
        print(f"{indent}× 访问失败 {url}: {e}")

if __name__ == "__main__":
    # Crawl from the configured start page, up to six levels deep.
    crawl_recursive(BASE_URL, depth=1, max_depth=6)
    # Report the configured directory instead of repeating its name literally,
    # so the message stays correct when DOWNLOAD_DIR is changed.
    print(f"\n递归爬虫任务完成。请检查 '{DOWNLOAD_DIR}' 目录。")

In [None]:
import requests
from bs4 import BeautifulSoup
import os
import re
import time

# Start page of the crawl. Placeholder value (blank): fill in the real site
# URL before running this cell.
BASE_URL = "  "

# URLs that crawl_recursive has already processed.
VISITED = set()

# URL prefixes crawl_recursive may descend into beyond the second level;
# filled in by the __main__ section of this cell.
ALLOWED_PREFIXES = set()

# Headers passed to every requests.get call in this cell.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Target directory for downloaded documents; created on load if it is missing.
DOWNLOAD_DIR = "samurai_workshop_downloads"
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

def download_file(url, filename):
    """Stream the resource at ``url`` into ``DOWNLOAD_DIR/filename``.

    The body is written to a temporary ``<filename>.part`` file first and is
    renamed to its final name only after the transfer has finished. An aborted
    download therefore never leaves a truncated document under the final name
    and never overwrites a complete copy from an earlier run.

    Args:
        url: Absolute URL of the file to fetch.
        filename: Name of the target file inside ``DOWNLOAD_DIR``.

    Returns:
        True if the file was saved, False if the request or the write failed.
        Failures are printed, not raised.
    """
    filepath = os.path.join(DOWNLOAD_DIR, filename)
    partial_path = filepath + '.part'
    try:
        # Using the response as a context manager releases the connection even
        # when the streamed body is not consumed to the end.
        with requests.get(url, stream=True, headers=HEADERS, timeout=30) as response:
            response.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(partial_path, filepath)
        print(f"  √ 已下载: {filename}")
        return True
    except Exception as e:
        # Deliberately broad: a single failed download must not stop the crawl.
        try:
            os.remove(partial_path)
        except OSError:
            pass  # no partial file was created, or it is already gone
        print(f"  × 下载失败 {url}: {e}")
        return False

def clean_filename(title):
    """Return ``title`` with characters that are unsafe in file names removed.

    Removes the characters ``\\ / : * ? " < > |`` as well as ASCII control
    characters (0x00-0x1F, e.g. newlines and tabs that can appear inside link
    text), then strips surrounding whitespace.

    Args:
        title: Arbitrary text, typically the visible text of a link.

    Returns:
        The cleaned string; may be empty.
    """
    return re.sub(r'[\\/:*?"<>|\x00-\x1f]', '', title).strip()

def is_document_link(href):
    """Tell whether ``href`` points at a downloadable document.

    A link counts as a document when it ends in one of the extensions pdf,
    ppt/pptx, doc/docx, xls/xlsx/xlsm or zip (case-insensitive). The check is
    applied to the raw link first and, if that fails, to the link with its
    ``?query`` and ``#fragment`` removed, so ``slides.pdf?download=1`` and
    ``slides.pdf#page=2`` are recognised as well.

    Args:
        href: Link target, relative or absolute.

    Returns:
        The ``re.Match`` object for the extension (truthy) or ``None``.
    """
    pattern = r'\.(pdf|pptx?|docx?|xls[xm]?|zip)$'
    match = re.search(pattern, href, re.IGNORECASE)
    if match is None:
        path_only = href.split('#', 1)[0].split('?', 1)[0]
        match = re.search(pattern, path_only, re.IGNORECASE)
    return match

def get_absolute_url(base, link):
    """Resolve the ``href`` value ``link`` against the page URL ``base``.

    Resolution is delegated to ``urllib.parse.urljoin``, which handles
    ``../`` segments, protocol-relative links (``//host/path``) and links
    with their own scheme. Links such as ``mailto:`` or ``javascript:`` are
    returned unchanged, so they do not start with ``http`` and can be filtered
    out by the caller. The fragment is dropped from the result because it
    never identifies a different resource. Empty and fragment-only links
    resolve to ``base`` itself.

    Heuristic: when the last path segment of ``base`` contains no dot
    (``http://host/dir``) the base is treated as a directory, so a relative
    link resolves to ``http://host/dir/link``. When it looks like a file
    (``http://host/dir/page.html``) the link resolves next to that file.

    Args:
        base: URL of the page on which the link was found.
        link: Raw ``href`` value.

    Returns:
        The resolved URL as a string, or the stripped ``link`` itself if the
        inputs cannot be parsed as URLs.
    """
    from urllib.parse import urldefrag, urljoin, urlsplit, urlunsplit

    link = link.strip()
    try:
        if not link or link.startswith('#'):
            # Same document: avoid producing ever-growing "base/#top" URLs.
            return urldefrag(base).url
        base_parts = urlsplit(base)
        last_segment = base_parts.path.rsplit('/', 1)[-1]
        if last_segment and '.' not in last_segment:
            # Directory-like base without trailing slash.
            base = urlunsplit(base_parts._replace(path=base_parts.path + '/'))
        return urldefrag(urljoin(base, link)).url
    except ValueError:
        # urllib rejects some malformed URLs (e.g. broken IPv6 literals).
        return link

def collect_allowed_prefixes(base_url):
    """Collect the absolute URL prefixes of the second-level directories.

    Fetches ``base_url`` and turns every non-document http(s) link on that
    page pointing at the same host into a prefix: query string and fragment
    are removed and exactly one trailing ``/`` is enforced.

    Two kinds of links are skipped on purpose:

    * links to other hosts. Hosts are compared exactly, so ``x.com.evil.org``
      is not mistaken for ``x.com`` as a plain string-prefix test would do;
    * links to ``base_url`` itself or to one of its ancestors (e.g. a "Home"
      link), because such a prefix would match the whole site and disable the
      restriction the prefixes are meant to provide.

    Args:
        base_url: Start page of the crawl.

    Returns:
        A set of prefix strings, each ending in ``/``. The set is empty when
        the page cannot be fetched; the error is printed, not raised.
    """
    from urllib.parse import urlsplit

    prefixes = set()
    try:
        resp = requests.get(base_url, headers=HEADERS, timeout=30)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'html.parser')
        base_host = urlsplit(base_url).netloc.lower()
        base_dir = base_url.split('?')[0].split('#')[0].rstrip('/') + '/'
        for a in soup.find_all('a', href=True):
            abs_link = get_absolute_url(base_url, a['href'])
            # Only http(s) links that are not document files.
            if not abs_link.startswith('http') or is_document_link(abs_link):
                continue
            try:
                link_host = urlsplit(abs_link).netloc.lower()
            except ValueError:
                continue  # malformed URL
            # Only links on the host of base_url.
            if link_host != base_host:
                continue
            # Drop query string and fragment, normalise the trailing slash.
            prefix = abs_link.split('?')[0].split('#')[0].rstrip('/') + '/'
            if base_dir.startswith(prefix):
                continue  # the start page itself or one of its ancestors
            prefixes.add(prefix)
    except Exception as e:
        # Deliberately broad: the caller continues with an empty prefix set.
        print(f"收集第二层目录失败: {e}")
    return prefixes

def crawl_recursive(url, depth=1, max_depth=6):
    """Fetch ``url``, download the documents it links to and crawl its sub-pages.

    Every ``<a href>`` that ``is_document_link`` accepts is downloaded (once per
    document URL) as ``<link text>_<original file name>``. Every other
    http(s) link on the same host as the current page is crawled recursively.
    Links to other hosts are not followed. Pages deeper than the second
    level are processed only when their URL starts with one of
    ``ALLOWED_PREFIXES``.

    Args:
        url: Page to fetch.
        depth: Depth of ``url`` in the crawl; the start page is 1.
        max_depth: Pages deeper than this are not fetched.

    Side effects:
        Adds page and document URLs to ``VISITED``, prints progress and writes
        files through ``download_file``. Errors while fetching or processing a
        page are printed, not raised.
    """
    from urllib.parse import urlsplit

    if depth > max_depth or url in VISITED:
        return
    # Scope restriction: below level 2 only URLs under ALLOWED_PREFIXES.
    if depth > 2 and not any(url.startswith(prefix) for prefix in ALLOWED_PREFIXES):
        return
    VISITED.add(url)
    indent = '  ' * depth
    print(f"{indent}→ [{depth}] {url}")
    try:
        resp = requests.get(url, headers=HEADERS, timeout=30)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'html.parser')
        anchors = soup.find_all('a', href=True)
        # Accept the requested host and, after a redirect, the final host.
        allowed_hosts = {
            urlsplit(url).netloc.lower(),
            urlsplit(resp.url).netloc.lower(),
        }

        # Download every document link.
        for a in anchors:
            href = a['href']
            if not is_document_link(href):
                continue
            file_url = get_absolute_url(url, href)
            if file_url in VISITED:
                continue  # already fetched via another link or page
            VISITED.add(file_url)
            # File name = last path segment without query string or fragment.
            original_filename = os.path.basename(file_url.split('#')[0].split('?')[0])
            title = clean_filename(a.get_text(strip=True)) or "file"
            final_filename = f"{title}_{original_filename}"
            download_file(file_url, final_filename)

        # Recurse into sub-pages: http(s) only, no documents, same host only.
        # Links are resolved against the requested URL so that they line up
        # with the prefixes collected from the start page.
        for a in anchors:
            abs_link = get_absolute_url(url, a['href'])
            if not abs_link.startswith('http') or is_document_link(abs_link):
                continue
            try:
                link_host = urlsplit(abs_link).netloc.lower()
            except ValueError:
                continue  # malformed URL, cannot be crawled
            if link_host not in allowed_hosts:
                continue
            crawl_recursive(abs_link, depth + 1, max_depth)
    except Exception as e:
        # Deliberately broad: a broken page must not abort the whole crawl.
        print(f"{indent}× 访问失败 {url}: {e}")

if __name__ == "__main__":
    # No `global` statement here: this code already runs at module level, and
    # declaring ALLOWED_PREFIXES global after its module-level assignment is
    # a SyntaxError ("assigned to before global declaration") that prevents
    # the whole cell from compiling. A plain assignment rebinds the module
    # variable that crawl_recursive reads.
    ALLOWED_PREFIXES = collect_allowed_prefixes(BASE_URL)
    print("允许递归的第二层目录:")
    # Sorted so the listing is stable between runs.
    for p in sorted(ALLOWED_PREFIXES):
        print("  ", p)
    crawl_recursive(BASE_URL, depth=1, max_depth=6)
    # Report the configured directory instead of repeating its name literally.
    print(f"\n递归爬虫任务完成。请检查 '{DOWNLOAD_DIR}' 目录。")