In [1]:
# [单元格1：导入库]
import re
import time
import random
import json
from urllib.parse import urljoin, urlencode
import requests
from bs4 import BeautifulSoup
from tenacity import retry, wait_fixed, stop_after_attempt, retry_if_exception_type
import pandas as pd

print("所有依赖导入完成")

所有依赖导入完成


In [2]:
# [单元格2：配置参数]
PROFILE_URL = "https://www.douban.com/people/253231903/"

# 手动输入 cookie
COOKIES = input("请输入你的 Douban Cookie： ")

OUT_PREFIX = "my_douban_data"

MOBILE_UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)

# 输入页数
PAGES = int(input("输入页数: "))
print(f"将抓取 {PAGES} 页数据")

请输入你的 Douban Cookie：  ll="108288"; bid=EphK7hIfyU8; _pk_id.100001.8cb4=b9803c68ca29e27b.1758790568.; push_noty_num=0; push_doumail_num=0; _vwo_uuid_v2=D6E979802290DA50DBF2DD61AA791A1C0|39270a048722a389cff05b117b602d4b; dbcl2="253231903:1KTKV/As35Q"; __utmv=30149280.25323; __utmz=30149280.1759511831.3.3.utmcsr=localhost:8888|utmccn=(referral)|utmcmd=referral|utmcct=/; ct=y; ck=Lkgg; ap_v=0,6.0; __utmc=30149280; frodotk_db="8cd4beafe6ffe240a67e3c0fb4ffa6c0"; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1759818007%2C%22https%3A%2F%2Fwww.google.com%2F%22%5D; _pk_ses.100001.8cb4=1; __utma=30149280.1370431101.1759501949.1759815600.1759818007.7; __utmt=1; __utmb=30149280.3.9.1759818007
输入页数:  45


将抓取 45 页数据


In [3]:
# [单元格3：定义工具函数]
def parse_cookie_str(cookie_str: str) -> dict:
    """把 'a=1; b=2; dbcl2="xxx"' 转成 dict"""
    jar = {}
    for part in cookie_str.split(";"):
        if "=" in part:
            k, v = part.split("=", 1)
            jar[k.strip()] = v.strip()
    return jar

def text_or_none(el):
    return el.get_text(strip=True) if el else None

def safe_int(s):
    try:
        return int(re.sub(r"[^\d]", "", s))
    except Exception:
        return None

print("工具函数定义完成")

工具函数定义完成


In [4]:
# [单元格4：定义类客户端]
class DoubanClient:
    def __init__(self, cookies: str | None = None, base: str = "https://www.douban.com"):
        self.base = base.rstrip("/") + "/"
        self.sess = requests.Session()
        self.sess.headers.update({
            "User-Agent": MOBILE_UA,
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Referer": "https://www.douban.com/",
            "Connection": "keep-alive",
        })
        if cookies:
            self.sess.cookies.update(parse_cookie_str(cookies))

    @retry(wait=wait_fixed(2), stop=stop_after_attempt(3), reraise=True)
    def get(self, url: str, params: dict | None = None) -> requests.Response:
        resp = self.sess.get(url, params=params, timeout=15)
        if resp.status_code in (403, 429):
            raise requests.HTTPError(f"HTTP {resp.status_code}")
        resp.raise_for_status()
        return resp

    def soup(self, path_or_url: str, params: dict | None = None) -> BeautifulSoup:
        url = path_or_url if path_or_url.startswith("http") else urljoin(self.base, path_or_url)
        r = self.get(url, params=params)
        return BeautifulSoup(r.text, "lxml")

print("豆瓣客户端类定义完成")

豆瓣客户端类定义完成


In [5]:
# [单元格5：定义解析函数]
def parse_profile_summary(soup: BeautifulSoup) -> dict:
    """个人主页概要"""
    out = {}
    
    # 昵称
    h1 = soup.select_one("#db-usr-profile h1, h1")
    out["nickname"] = text_or_none(h1)
    
    # 签名
    signature = soup.select_one(".signature, .user-intro, #user_intro")
    out["signature_or_intro"] = text_or_none(signature)
    
    # 地点
    pl_items = soup.select(".user-info .pl")
    for pl in pl_items:
        t = pl.get_text(" ", strip=True)
        if "常居" in t or "所在地" in t:
            out["location"] = t.replace("常居:", "").replace("所在地:", "").strip()
    
    # 关注/被关注
    follow_links = soup.select('a[href*="/contacts"] , a[href*="/rev_contacts"]')
    for a in follow_links:
        href = a.get("href", "")
        if "/contacts" in href and "/rev_contacts" not in href:
            out["following"] = safe_int(a.get_text())
        elif "/rev_contacts" in href:
            out["followers"] = safe_int(a.get_text())
    
    # 各种计数
    counters = {}
    for a in soup.select("a[href]"):
        txt = a.get_text(strip=True)
        href = a.get("href", "")
        if "movie" in href and any(k in txt for k in ["看过", "在看", "想看", "电影"]):
            counters.setdefault("movie", set()).add(txt)
        if "book" in href and any(k in txt for k in ["读过", "在读", "想读", "读书"]):
            counters.setdefault("book", set()).add(txt)
        if "music" in href and any(k in txt for k in ["听过", "在听", "想听", "音乐"]):
            counters.setdefault("music", set()).add(txt)
        if "note" in href and "日记" in txt:
            counters.setdefault("note", set()).add(txt)
        if "status" in href and "广播" in txt:
            counters.setdefault("status", set()).add(txt)
    out["counters_raw"] = {k: sorted(list(v)) for k, v in counters.items()}
    
    return out

def parse_status_page(soup: BeautifulSoup) -> list[dict]:
    """广播页面"""
    items = []
    for it in soup.select(".status-item, .new-status, .status-wrapper"):
        ts = text_or_none(it.select_one(".created_at, .pubtime, .timestamp"))
        content = text_or_none(it.select_one(".status-content, .bd, .content, .text"))
        link_el = it.select_one("a[href*='/status/']")
        link = link_el.get("href") if link_el else None
        items.append({"time": ts, "text": content, "link": link})
    return items

def parse_collect_grid(soup: BeautifulSoup) -> list[dict]:
    """标记页面网格 - 含评分"""
    rows = []
    for it in soup.select(".grid-view .item, .collect-list .item, .interest-list .item"):
        title_el = it.select_one(".title a, .info h2 a, .title a.nbg")
        title = text_or_none(title_el)
        link = title_el.get("href") if title_el else None
        info = text_or_none(it.select_one(".intro, .pub, .info"))
        date = text_or_none(it.select_one(".date, .time, .collect-date"))
        poster_el = it.select_one("img")
        poster = poster_el.get("src") if poster_el else None
        
        # 增强评分提取 - 尝试多种选择器
        rating = None
        
        # 方法1: 查找星级评分 (class包含rating)
        rating_els = it.select("[class*='rating'], .stars, .rating_num, .ll.rating_num")
        for rating_el in rating_els:
            # 处理 class="rating4-t" 这样的评分
            class_str = ' '.join(rating_el.get('class', []))
            if 'rating' in class_str:
                match = re.search(r'rating(\d+)-t', class_str)
                if match:
                    rating = int(match.group(1)) * 2  # 5星制转10分制
                    break
            
            # 处理数字评分
            rating_text = rating_el.get_text(strip=True)
            rating_match = re.search(r'(\d+\.?\d*)', rating_text)
            if rating_match:
                try:
                    rating = float(rating_match.group(1))
                    break
                except ValueError:
                    continue
        
        # 方法2: 查找包含"推荐"或星号的文本
        if rating is None:
            interest_el = it.select_one(".interest-ext, .comment, .tags")
            if interest_el:
                interest_text = interest_el.get_text()
                if '推荐' in interest_text or '★' in interest_text:
                    # 简单判断为有评分但未显示具体分数
                    rating = "有评价"
        
        rows.append({
            "title": title, 
            "info": info, 
            "mark_time": date, 
            "link": link, 
            "poster": poster,
            "rating": rating
        })
    return rows
print("解析器函数定义完成")

解析器函数定义完成


In [6]:
# [单元格6：定义抓取函数]
def crawl_profile(client: DoubanClient, profile_url: str) -> dict:
    """抓取个人主页概要"""
    print("正在抓取个人主页概要...")
    soup = client.soup(profile_url)
    out = parse_profile_summary(soup)
    m = re.search(r"/people/([^/]+)/", profile_url)
    out["uid"] = m.group(1) if m else None
    print("个人主页概要抓取完成")
    return out

def crawl_statuses(client: DoubanClient, uid: str, pages: int = 2, delay=(1.2, 2.5)) -> list[dict]:
    """抓取广播"""
    print(f"正在抓取广播，共{pages}页...")
    all_rows = []
    for p in range(1, pages + 1):
        print(f"  正在抓取第 {p}/{pages} 页广播...")
        url = f"https://www.douban.com/people/{uid}/statuses"
        soup = client.soup(url + "?" + urlencode({"p": p}))
        rows = parse_status_page(soup)
        all_rows.extend(rows)
        time.sleep(random.uniform(*delay))
    print(f"广播抓取完成，共{len(all_rows)}条")
    return all_rows

def crawl_collect(client: DoubanClient, uid: str, cat: str = "movie", interest: str = "collect",
                  pages: int = 2, delay=(1.2, 2.5)) -> list[dict]:
    """抓取标记内容"""
    print(f"正在抓取{cat}的{interest}，共{pages}页...")
    all_rows = []
    
    domain = {
        "movie": "https://movie.douban.com",
        "book":  "https://book.douban.com",
        "music": "https://music.douban.com",
    }.get(cat, "https://www.douban.com")
    
    path = {"wish": "wish", "do": "do", "collect": "collect"}.get(interest, "collect")
    base_url = f"{domain}/people/{uid}/{path}"
    
    for p in range(pages):
        print(f"  正在抓取第 {p+1}/{pages} 页{cat}的{interest}...")
        params = {"start": p * 15, "sort": "time"}
        try:
            soup = client.soup(base_url, params=params)
            rows = parse_collect_grid(soup)
            all_rows.extend(rows)
        except Exception as e:
            try:
                fallback_params = {"cat": cat, "sort": "time", "start": p * 15, "mode": "grid"}
                soup = client.soup(f"https://www.douban.com/people/{uid}/{path}", params=fallback_params)
                rows = parse_collect_grid(soup)
                all_rows.extend(rows)
            except Exception:
                print(f"[WARN] {cat}/{interest} 第 {p+1} 页抓取失败：{e}")
        time.sleep(random.uniform(*delay))
    
    print(f"{cat}的{interest}抓取完成，共{len(all_rows)}条")
    return all_rows

print("抓取函数定义完成")

抓取函数定义完成


In [7]:
# [单元格7：初始化客户端]
print("正在初始化豆瓣客户端...")
client = DoubanClient(cookies=COOKIES)
print("豆瓣客户端初始化完成")

正在初始化豆瓣客户端...
豆瓣客户端初始化完成


In [8]:
# [单元格8：开始抓取数据]
print("=" * 50)
print("开始抓取豆瓣数据...")
print("=" * 50)

# 主页概要
prof = crawl_profile(client, PROFILE_URL)
uid = prof.get("uid")
print(f"用户ID: {uid}")
print(f"昵称: {prof.get('nickname')}")

# 广播（保留功能但不获取）
# statuses = crawl_statuses(client, uid=uid, pages=PAGES)
statuses = []
print("跳过广播抓取")

# 电影（只获取这部分）
movies_wish = crawl_collect(client, uid, cat="movie", interest="wish", pages=PAGES)
movies_do   = crawl_collect(client, uid, cat="movie", interest="do", pages=PAGES)
movies_done = crawl_collect(client, uid, cat="movie", interest="collect", pages=PAGES)

# 图书（保留功能但不获取）
# books_wish  = crawl_collect(client, uid, cat="book",  interest="wish", pages=PAGES)
# books_do    = crawl_collect(client, uid, cat="book",  interest="do",   pages=PAGES)
# books_done  = crawl_collect(client, uid, cat="book",  interest="collect", pages=PAGES)
books_wish = []
books_do = []
books_done = []
print("跳过图书抓取")

# 音乐（保留功能但不获取）
# music_wish  = crawl_collect(client, uid, cat="music", interest="wish", pages=PAGES)
# music_do    = crawl_collect(client, uid, cat="music", interest="do",   pages=PAGES)
# music_done  = crawl_collect(client, uid, cat="music", interest="collect", pages=PAGES)
music_wish = []
music_do = []
music_done = []
print("跳过音乐抓取")

print("所有数据抓取完成！")

开始抓取豆瓣数据...
正在抓取个人主页概要...
个人主页概要抓取完成
用户ID: 253231903
昵称: jiam18(编辑)
跳过广播抓取
正在抓取movie的wish，共45页...
  正在抓取第 1/45 页movie的wish...
  正在抓取第 2/45 页movie的wish...
  正在抓取第 3/45 页movie的wish...
  正在抓取第 4/45 页movie的wish...
  正在抓取第 5/45 页movie的wish...
  正在抓取第 6/45 页movie的wish...
  正在抓取第 7/45 页movie的wish...
  正在抓取第 8/45 页movie的wish...
  正在抓取第 9/45 页movie的wish...
  正在抓取第 10/45 页movie的wish...
  正在抓取第 11/45 页movie的wish...
  正在抓取第 12/45 页movie的wish...
  正在抓取第 13/45 页movie的wish...
  正在抓取第 14/45 页movie的wish...
  正在抓取第 15/45 页movie的wish...
  正在抓取第 16/45 页movie的wish...
  正在抓取第 17/45 页movie的wish...
  正在抓取第 18/45 页movie的wish...
  正在抓取第 19/45 页movie的wish...
  正在抓取第 20/45 页movie的wish...
  正在抓取第 21/45 页movie的wish...
  正在抓取第 22/45 页movie的wish...
  正在抓取第 23/45 页movie的wish...
  正在抓取第 24/45 页movie的wish...
  正在抓取第 25/45 页movie的wish...
  正在抓取第 26/45 页movie的wish...
  正在抓取第 27/45 页movie的wish...
  正在抓取第 28/45 页movie的wish...
  正在抓取第 29/45 页movie的wish...
  正在抓取第 30/45 页movie的wish...
  正在抓取第 31/45 页movie的wish...
  正在抓取第 32/4

In [9]:
# [单元格9：预览]
import pandas as pd

print("数据预览:")

# 已看电影
try:
    df_done = pd.read_csv(f"{OUT_PREFIX}_movies_done.csv")
    df_done_clean = df_done.drop(['link', 'poster', 'info'], axis=1, errors='ignore')
    print(f"\n已看电影 ({len(df_done_clean)}部) - 前10部:")
    display(df_done_clean.head(10))
except:
    print("无已看电影数据")

# 想看电影  
try:
    df_wish = pd.read_csv(f"{OUT_PREFIX}_movies_wish.csv")
    df_wish_clean = df_wish.drop(['link', 'poster', 'info'], axis=1, errors='ignore')
    print(f"\n想看电影 ({len(df_wish_clean)}部) - 前10部:")
    display(df_wish_clean.head(10))
except:
    print("无想看电影数据")

数据预览:

已看电影 (30部) - 前10部:


Unnamed: 0,title,mark_time,rating
0,合法副本 / Copie conforme/ 似是有缘人(港) / 爱情对白(台),2025-09-15,10
1,沙丘2 / Dune: Part Two/ 沙丘：第二部(台) / 沙丘瀚战：第二章(港),2025-09-08,6
2,爱情短片 / Krótki film o miłości/ 情路长短调(台) / 关于爱情的短片,2025-08-31,10
3,乱/ 乱日本战国时代 / Revolt,2025-08-23,10
4,蓝白红三部曲之红 / Trois couleurs: Rouge/ 红 / 红色情深(台),2025-08-11,10
5,蓝白红三部曲之白 / Trois couleurs: Blanc/ 白 / 白色情迷(台),2025-08-10,10
6,性梦爱三部曲：梦 / Drømmer/ 性爱梦之做梦(港) / 同夢奇緣之夢(港),2025-08-09,10
7,蓝白红三部曲之蓝 / Trois couleurs: Bleu/ 蓝 / 蓝色情挑(台),2025-08-05,10
8,于洛先生的假期 / Les vacances de Monsieur Hulot/ 妙人异迹...,2025-08-03,6
9,F1：狂飙飞车 / F1: The Movie/ F1：赛道风云 / F1电影(台),2025-07-10,8



想看电影 (30部) - 前10部:


Unnamed: 0,title,mark_time,rating
0,生生长流 / زندگی و دیگر هیچ/ 春风吹又生(港) / 生活在继续,2025-09-25,
1,痛苦与荣耀 / Dolor y gloria/ 万千痛爱在一身(港) / Pain & Glory,2025-09-21,
2,关于我母亲的一切 / Todo sobre mi madre/ 论尽我阿妈(港) / 我的母...,2025-09-15,
3,"巴黎，我爱你 / Paris, je t'aime/ 我爱巴黎(港) / Paris, I ...",2025-09-14,
4,地球之夜 / Night on Earth/ 地球这分钟 / 地球一夜,2025-09-09,
5,随风而逝 / باد ما را خواهد برد/ 风再起时(港) / 风带着我来(台),2025-09-08,
6,樱桃的滋味 / طعم گیلاس/ 樱桃之味 / Taste of Cherry,2025-09-08,
7,何处是我朋友的家 / خانه دوست کجاست؟‎/ 踏破铁鞋无觅处(港) / Whe...,2025-09-08,
8,楢山节考 / 楢山節考/ The Ballad of Narayama,2025-09-08,
9,红樱桃/ Red Cherry / Красная вишня,2025-09-07,


In [10]:
# [单元格10：保存数据]
print("正在保存数据...")

# 汇总 JSON
bundle = {
    "profile": prof,
    "movies": {"wish": movies_wish, "doing": movies_do, "done": movies_done},
    "meta": {"pages": PAGES}
}

# 写 JSON
json_path = f"{OUT_PREFIX}.json"
with open(json_path, "w", encoding="utf-8") as f:
    json.dump(bundle, f, ensure_ascii=False, indent=2)
print(f"[OK] 写入 {json_path}")

# 导出 CSV
def to_csv(rows, name):
    if rows:
        df = pd.DataFrame(rows)
        df.to_csv(f"{OUT_PREFIX}_{name}.csv", index=False, encoding="utf-8-sig")
        print(f"[OK] 写入 {OUT_PREFIX}_{name}.csv ({len(rows)}条)")

to_csv(movies_wish, "movies_wish")
to_csv(movies_do, "movies_doing")
to_csv(movies_done, "movies_done")

print("所有数据保存完成！")

正在保存数据...
[OK] 写入 my_douban_data.json
[OK] 写入 my_douban_data_movies_wish.csv (125条)
[OK] 写入 my_douban_data_movies_done.csv (665条)
所有数据保存完成！
