In [28]:
from __future__ import annotations
import multitasking
import requests
from tqdm import tqdm
import pandas as pd
import json
from retry import retry
from bs4 import BeautifulSoup

In [44]:
def get_uid_by_url_token(url_token: str) -> str:
    """
    根据知乎用户 url_token 获取其 uid

    Parameters
    ----------
    url_token : 知乎用户 url_token
        例如主页为:https://www.zhihu.com/people/la-ge-lang-ri-96-69 的用户
        其 url_token 为: la-ge-lang-ri-96-69

        注意,此参数类型为字符串

    Return
    ------
    str : 用户 uid
    """
    headers = {
        "authority": "www.zhihu.com",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36 Edg/88.0.705.68",
        "x-requested-with": "fetch",
        "content-type": "multipart/form-data; boundary=----WebKitFormBoundarycwskcLmf85lBwPKR",
        "accept": "*/*",
        "origin": "https://www.zhihu.com",
        "sec-fetch-site": "same-origin",
        "sec-fetch-mode": "cors",
        "sec-fetch-dest": "empty",
        "referer": "https://www.zhihu.com/",
        "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    }

    url = "https://api.zhihu.com/people/" + url_token
    response = requests.get(url, headers=headers)
    uid = response.json()["id"]
    return uid


@retry(tries=3)
def get_user_answers(url_token: str, max_count: int = 100000) -> pd.DataFrame:
    """
    获取用户的回答脚本数据列表

    Parameters
    ----------
    url_token : 知乎用户 url_token
        例如主页为:https://www.zhihu.com/people/la-ge-lang-ri-96-69 的用户
        其 url_token 为: la-ge-lang-ri-96-69

        注意,此参数类型为字符串

    max_count : 限制获取的最大回答数(默认为 100000)

    Return
    ------
    DataFrame : 包含用户回答数据的 DataFrame


    """
    headers = {
        "User-Agent": "osee2unifiedRelease/4318 osee2unifiedReleaseVersion/7.7.0 Mozilla/5.0 (iPhone; CPU iPhone OS 14_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148",
        "X-APP-BUILD-VERSION": "4318",
        "x-app-bundleid": "com.zhihu.ios",
        "X-APP-ZA": "OS=iOS&Release=14.5&Model=iPhone10,1&VersionName=7.7.0&VersionCode=4318&Width=750&Height=1334&DeviceType=Phone&Brand=Apple&OperatorType=46009",
    }

    operations = {
        "作者名称": ["author", lambda x: x["name"]],
        "作者ID": ["author", lambda x: x["id"]],
        "作者token": ["author", lambda x: x["url_token"]],
        "回答点赞数": ["voteup_count", lambda x: x],
        "回答时间": ["created_time", lambda x: x],
        "更新时间": ["updated_time", lambda x: x],
        "回答ID": ["url", lambda x: x.split("/")[-1]],
        "问题ID": ["question", lambda x: x["id"]],
        "问题内容": ["question", lambda x: x["title"]],
    }
    try:
        uid = get_uid_by_url_token(url_token)
    except:
        return pd.DataFrame(columns=operations.keys())
    bar: tqdm = None
    offset = 0
    limit = 20
    dfs: list[pd.DataFrame] = []
    url = f"https://api.zhihu.com/members/{uid}/answers"
    while 1:
        params = (
            ("limit", f"{limit}"),
            ("offset", f"{offset}"),
        )

        response = requests.get(url, headers=headers, params=params)

        if response.json().get("paging") is None:
            return pd.DataFrame(columns=operations.keys())
        total = response.json()["paging"]["totals"]
        if bar is None:
            bar = tqdm(total=total, desc="获取回答数据中")
        bar.update(limit)
        data = response.json().get("data")
        raw_df = pd.DataFrame(data)
        if len(raw_df) == 0 or offset >= total or offset > max_count:
            break
        df = pd.DataFrame(columns=operations.keys())
        for new_column, (old_column, operation) in operations.items():
            df[new_column] = raw_df[old_column].apply(operation)
        dfs.append(df)
        offset += 20

    bar.close()
    df = pd.concat(dfs)
    return df


def get_answer_content(qid: str, aid) -> str:
    """
    根据回答ID和问题ID获取回答内容

    Parameters
    ----------
    qid : 问题ID
    aid : 回答ID
    例如一个回答链接为: https://www.zhihu.com/question/438404653/answer/1794419766

    其 qid 为 438404653

    其 aid 为 1794419766

    注意,这两个参数均为字符串

    Return
    ------
    str : 回答内容
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 14_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1 Mobile/15E148 Safari/604.1",
        "Host": "www.zhihu.com",
    }
    url = f"https://www.zhihu.com/question/{qid}/answer/{aid}"
    response = requests.get(url, headers=headers)

    soup = BeautifulSoup(response.text, "html.parser")
    content = " ".join([p.text.strip() for p in soup.find_all("p")])
    return content


def reformat_csv_to_openassitant(df: pd.DataFrame) -> pd.DataFrame:
    """
    Reformat the downloaded CSV into either Instruction or Text format
    so that it could be directly ingested into the training pipeline.

    Parameters
    ----------
    df: the downloaded panda dataframe

    Return
    ------
    DataFrame: reformatted dataframe
    """
    new_df = pd.DataFrame()
    new_df["INSTRUCTION"] = df["问题内容"]
    new_df["RESPONSE"] = df["回答内容"]
    new_df["SOURCE"] = "Zhihu"
    new_df["METADATA"] = df.apply(
        lambda x: json.dumps(
            {
                "回答点赞数": x["回答点赞数"],
                "回答时间": x["回答时间"],
                "更新时间": x["更新时间"],
            },
            ensure_ascii=False,
        ),
        axis=1,
    )

    return new_df


def save_answers_to_csv(url_token: str, csv_path: str, max_count: int = 10000) -> None:
    """
    根据用户 url_token 获取用户回答数据,并保存到 csv 文件中

    Parameters
    ----------
    url_token : 知乎用户 url_token
        例如主页为:https://www.zhihu.com/people/la-ge-lang-ri-96-69 的用户
        其 url_token 为: la-ge-lang-ri-96-69

        注意,此参数类型为字符串

    csv_path : 待保存的回答数据 csv 路径
        例如: '回答数据.csv'

    max_count : 限制获取的最大回答数(可选,默认为 100000)

    Return
    ------
    DataFrame:包含用户多个回答数据的 DataFrame
    """
    df = get_user_answers(url_token, max_count=max_count)
    if len(df) == 0:
        print("url_token 可能有误!")
        return
    content_list = {}

    @retry(tries=3)
    @multitasking.task
    def start(qid: str, aid: str):
        content = get_answer_content(qid, aid)
        content_list[
            qid
        ] = content  # make sure the qid and aid answer are corresponding during multitasking
        bar.update()

    bar = tqdm(total=len(df), desc="获取回答内容")
    for row in df.iloc:
        qid, aid = row["问题ID"], row["回答ID"]
        start(qid, aid)
    multitasking.wait_for_tasks()
    df["回答内容"] = df["问题ID"].apply(lambda x: content_list[x])
    updated_df = reformat_csv_to_openassitant(df)
    updated_df.to_csv(csv_path, encoding="utf-8-sig", index=None)
    bar.close()
    print(f"url_token 为 {url_token} 的用户回答数据已存储到文件:{csv_path}")

    return updated_df

In [45]:
url_token = "nicole-97-93"
# 回答数据保存路径
csv_path = "nicole-97-93.csv"
# 调用函数获取数据
df = save_answers_to_csv(url_token, csv_path)



获取回答数据中:   0%|                                                                | 0/7 [00:00<?, ?it/s][A[A

获取回答数据中: 40it [00:00, 112.11it/s]                                                                   [A[A


获取回答内容:   0%|                                                                  | 0/7 [00:00<?, ?it/s][A[A

获取回答内容:  14%|████████▎                                                 | 1/7 [00:00<00:03,  1.86it/s][A[A

获取回答内容:  29%|████████████████▌                                         | 2/7 [00:00<00:01,  2.91it/s][A[A

获取回答内容:  57%|█████████████████████████████████▏                        | 4/7 [00:00<00:00,  4.45it/s][A[A

获取回答内容:  71%|█████████████████████████████████████████▍                | 5/7 [00:01<00:00,  3.56it/s][A[A

获取回答内容: 100%|██████████████████████████████████████████████████████████| 7/7 [00:01<00:00,  4.95it/s][A[A

url_token 为 nicole-97-93 的用户回答数据已存储到文件:nicole-97-93.csv





In [46]:
df.METADATA.iloc[0]

'{"回答点赞数": 1, "回答时间": 1637027769, "更新时间": 1637027769}'

In [47]:
df

Unnamed: 0,INSTRUCTION,RESPONSE,SOURCE,METADATA
0,call of silence在动漫哪里出现过？,日本漫画家谏山创创作的《进击的巨人》里面有,Zhihu,"{""回答点赞数"": 1, ""回答时间"": 1637027769, ""更新时间"": 16370..."
1,高三毕业不打暑假工就是不上进吗？,看你怎么想了吧，你要做的目的是什么，不做的目的是什么，这点要想清楚。不做的目的可以是抓紧学习...,Zhihu,"{""回答点赞数"": 0, ""回答时间"": 1597887713, ""更新时间"": 15978..."
2,如何看待成都一男子公交站外持刀伤人致死?,知道后第一反应觉得非常的震惊，从月黑风高上升到光天化日之下杀人。 我已经有自己被遇害的画面了...,Zhihu,"{""回答点赞数"": 2, ""回答时间"": 1592904740, ""更新时间"": 15929..."
3,养猫该不该让猫进卧室？,我是不推荐让猫猫进房间，在哪都可以睡啦。 我合租的是三房一厅两卫的套间，两只猫。 从小就不让...,Zhihu,"{""回答点赞数"": 5, ""回答时间"": 1587792940, ""更新时间"": 15877..."
4,如何看待 N 号房事件又 10 名男性运营者被逮捕，多为未成年人，最小 12 岁？,青少年性教育真的很重要。 含蓄这个性格有利有弊。要正确的认识性教育，稍微偏一点都不行。我不希...,Zhihu,"{""回答点赞数"": 0, ""回答时间"": 1586245854, ""更新时间"": 15862..."
5,分享一下你的狗狗的可爱照片和小故事好吗？,关于我家拉布拉多起名多多的可爱照片太多了！！因为它实在是太可爱了哈哈哈哈哈 多多第一天晚上到...,Zhihu,"{""回答点赞数"": 0, ""回答时间"": 1582905777, ""更新时间"": 15829..."
6,该不该在亲戚家公司上班？,千万不要去，我就说些亲身经历。希望你永远都不要与我感同身受（因为别去是最好的呵呵....） ...,Zhihu,"{""回答点赞数"": 61, ""回答时间"": 1543734202, ""更新时间"": 1543..."


In [16]:
from __future__ import annotations

import dataclasses
import re
import time
from dataclasses import dataclass
from typing import List, Union

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from loguru import logger
from playwright.sync_api import Locator, Page, sync_playwright
from playwright.async_api import async_playwright
from tqdm import tqdm

In [17]:
@dataclass
class Content_Data:
    question_id: int
    answer_id: int
    author_id: str
    question_title: str
    content: str
    upvotes: str
    answer_creation_time: str


def get_answer_content(qid: int, aid: int, question_str: str) -> str:
    """
    根据回答ID和问题ID获取回答内容
    Parameters
    ----------
    qid : 问题ID
    aid : 回答ID
    例如一个回答链接为: https://www.zhihu.com/question/438404653/answer/1794419766
    其 qid 为 438404653
    其 aid 为 1794419766
    注意,这两个参数均为字符串
    Return
    ------
    str : 回答内容
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 14_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1 Mobile/15E148 Safari/604.1",
        "Host": "www.zhihu.com",
    }
    url = f"https://www.zhihu.com/question/{qid}/answer/{aid}"
    response = requests.get(url, headers=headers)

    soup = BeautifulSoup(response.text, "html.parser")
    content = " ".join([p.text.strip() for p in soup.find_all("p")])
    """
    "<meta itemProp="dateCreated" content="2023-02-20T13:19:30.000Z"/>"
    last time from meta tag with item prop attributes seems to be the post creation datetime. I verified by looking at page online

    """
    answer_creation_time_div = soup.find_all(
        "meta",
        {"itemprop": "dateCreated"},
    )
    answer_creation_time_content = ""
    if len(answer_creation_time_div) > 0:
        answer_creation_time_content = answer_creation_time_div[-1].attrs["content"]
    upvotes = (
        soup.find(
            "button",
            {"class": "Button VoteButton VoteButton--up"},
        )
        .get_text()
        .replace("\u200b", "")
    )
    author_ids = soup.find_all(
        "meta",
        {"itemprop": "url"},
    )
    author_id_div = [x for x in author_ids if "/people/" in x.attrs["content"]]
    author_id = author_id_div[0].attrs["content"]
    return Content_Data(
        question_id=qid,
        answer_id=aid,
        author_id=author_id,
        question_title=question_str,
        content=content,
        upvotes=upvotes,
        answer_creation_time=answer_creation_time_content,
    )


def get_all_href(page: Union[Page, Locator]) -> List[str]:
    hrefs = page.evaluate(
        """() => {
            let links = document.querySelectorAll('[href]');
            let hrefs = [];
            for (let link of links) {
                hrefs.push(link.href);
            }
            return hrefs;
        }"""
    )
    valid_hrefs = [x for x in hrefs if isinstance(x, str) and "https://" in x]
    return valid_hrefs


"""
Scrape people from round table topics. Save a list of zhihu people profile url to csv
"""


def scrape_people_roundtable():
    headless = False
    all_ppl_df = pd.DataFrame()
    roundtable_topic_scrolldown = 20
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=headless, timeout=60000)
        page = browser.new_page()
        page.goto("https://zhihu.com/roundtable")
        # Scroll down roundtable topic to get more topic urls
        for _ in range(roundtable_topic_scrolldown):
            page.keyboard.down("End")
            page.wait_for_timeout(1000)

        hrefs = get_all_href(page)
        relevent_hrefs = [x for x in hrefs if "https://www.zhihu.com/roundtable/" in x]
        np.random.shuffle(relevent_hrefs)
        # Earlier round table topic might not have started yet. The offset roundtable topic is arbitrary.

        starting_offset = 4
        for topic_url in tqdm(relevent_hrefs[starting_offset:]):
            try:
                page.goto(topic_url)
                all_hrefs = get_all_href(page)
                people_urls = [x for x in all_hrefs if "/people/" in x]
                latest_people_id = pd.DataFrame({"people_id": people_urls})
                all_ppl_df = pd.concat([all_ppl_df, latest_people_id])
            except Exception as e1:
                logger.error(e1)

            all_ppl_df.to_csv("people.csv")






In [19]:

headless = False
pattern = r"/question/\d+/answer/\d+"
all_payloads = []
roundtable_topic_scrolldown = 20
async with async_playwright() as p:
    browser = await p.chromium.launch(headless=headless, timeout=60000)
    page = await browser.new_page()
    page.goto("https://zhihu.com/roundtable")
    # Scroll down roundtable topic to get more topic urls
    for _ in range(roundtable_topic_scrolldown):
        page.keyboard.down("End")
        page.wait_for_timeout(1000)

    hrefs = get_all_href(page)
    relevent_hrefs = [x for x in hrefs if "https://www.zhihu.com/roundtable/" in x]
#         print(relevent_hrefs)
#         np.random.shuffle(relevent_hrefs)
#         # Earlier round table topic might not have started yet. The offset roundtable topic is arbitrary.

#         starting_offset = 4
#         for topic_url in tqdm(relevent_hrefs[starting_offset:]):
#             try:
#                 page.goto(topic_url)
#                 all_hrefs = get_all_href(page)
#                 question_urls = set(
#                     [x for x in all_hrefs if "/question/" in x and "waiting" not in x]
#                 )
#                 # people_urls = [x for x in all_hrefs if "/people/" in x]
#                 for qId in question_urls:
#                     qUrl = qId.replace("?write", "")

#                     page.goto(qUrl)
#                     question_title = page.locator(
#                         ".QuestionHeader-title"
#                     ).all_inner_texts()[0]
#                     all_hrefs = get_all_href(page.locator(".QuestionAnswers-answers"))
#                     # search for all question-answer url
#                     matches_question_answer_url = set(
#                         [
#                             s
#                             for s in all_hrefs
#                             if isinstance(s, str) and re.search(pattern, s)
#                         ]
#                     )

#                     for k in matches_question_answer_url:
#                         elem = k.split("/")
#                         qId = int(elem[-3])
#                         aId = int(elem[-1])

#                         complete_content_data = get_answer_content(
#                             qId, aId, question_title
#                         )

#                         content_data_dict = dataclasses.asdict(complete_content_data)
#                         all_payloads.append(content_data_dict)
#                         time.sleep(1)
#             except Exception as e1:
#                 logger.error(e1)
#             tmp_df = pd.json_normalize(all_payloads)
#             print(tmp_df)
#             tmp_df.to_csv("tmp.csv")

Error: 
╔════════════════════════════════════════════════════════════════════════════════════════════════╗
║ Looks like you launched a headed browser without having a XServer running.                     ║
║ Set either 'headless: true' or use 'xvfb-run <your-playwright-app>' before running Playwright. ║
║                                                                                                ║
║ <3 Playwright Team                                                                             ║
╚════════════════════════════════════════════════════════════════════════════════════════════════╝
=========================== logs ===========================
<launching> /home/ubuntu/.cache/ms-playwright/chromium-1048/chrome-linux/chrome --disable-field-trial-config --disable-background-networking --enable-features=NetworkService,NetworkServiceInProcess --disable-background-timer-throttling --disable-backgrounding-occluded-windows --disable-back-forward-cache --disable-breakpad --disable-client-side-phishing-detection --disable-component-extensions-with-background-pages --disable-component-update --no-default-browser-check --disable-default-apps --disable-dev-shm-usage --disable-extensions --disable-features=ImprovedCookieControls,LazyFrameLoading,GlobalMediaControls,DestroyProfileOnBrowserClose,MediaRouter,DialMediaRouteProvider,AcceptCHFrame,AutoExpandDetailsElement,CertificateTransparencyComponentUpdater,AvoidUnnecessaryBeforeUnloadCheckSync,Translate --allow-pre-commit-input --disable-hang-monitor --disable-ipc-flooding-protection --disable-popup-blocking --disable-prompt-on-repost --disable-renderer-backgrounding --disable-sync --force-color-profile=srgb --metrics-recording-only --no-first-run --enable-automation --password-store=basic --use-mock-keychain --no-service-autorun --export-tagged-pdf --no-sandbox --user-data-dir=/tmp/playwright_chromiumdev_profile-7nFPor --remote-debugging-pipe --no-startup-window
<launched> pid=120754
[pid=120754][err] [120754:120754:0301/054408.272049:ERROR:ozone_platform_x11.cc(239)] Missing X server or $DISPLAY
[pid=120754][err] [120754:120754:0301/054408.273382:ERROR:env.cc(255)] The platform failed to initialize.  Exiting.
============================================================