In [1]:
import requests
import time
import json
from bs4 import BeautifulSoup

In [4]:
import requests
import time
import json
from bs4 import BeautifulSoup


PTT_URL = 'https://www.ptt.cc'


def get_web_page(url, timeout=10):
    """Fetch *url* and return the response body as text.

    The request always carries the ``over18=1`` cookie
    (presumably to pass the board's age-confirmation page -- confirm).

    Args:
        url: Absolute URL to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The response text when the status code is 200; ``None`` when the
        request fails or any other status code is returned.
    """
    try:
        resp = requests.get(
            url=url,
            cookies={'over18': '1'},
            timeout=timeout,
        )
    except requests.RequestException as exc:
        # Network-level failures follow the same "return None" contract as
        # a non-200 response, so callers have a single failure path.
        print('Request failed:', url, exc)
        return None
    if resp.status_code != 200:
        print('Invalid url:', resp.url)
        return None
    return resp.text


def _parse_push_count(push_str):
    """Convert the text of an ``nrec`` cell into an integer push count."""
    if not push_str:
        return 0
    try:
        return int(push_str)
    except ValueError:
        # Non-numeric markers: '爆' is mapped to 99 and anything starting
        # with 'X' (e.g. 'X1', 'X2') to -10; any other text counts as 0.
        if push_str == '爆':
            return 99
        if push_str.startswith('X'):
            return -10
        return 0


def get_articles(dom, date):
    """Extract the articles posted on *date* from one board index page.

    Args:
        dom: HTML text of the index page. ``None`` or an empty string (for
            example after a failed fetch) yields an empty result.
        date: Date string compared against the stripped text of each
            entry's ``date`` cell.

    Returns:
        A tuple ``(articles, prev_url)``. ``articles`` is a list of dicts
        with the keys ``title``, ``href``, ``push_count`` and ``author``.
        ``prev_url`` is the ``href`` of the second link in the paging
        button group, or ``None`` when that link or its ``href`` is absent.
    """
    if not dom:
        return [], None

    soup = BeautifulSoup(dom, 'html5lib')

    # Link to the previous page: the second anchor of the paging group.
    # It may be missing entirely or carry no href, so look it up defensively.
    prev_url = None
    paging_div = soup.find('div', 'btn-group btn-group-paging')
    if paging_div is not None:
        paging_links = paging_div.find_all('a')
        if len(paging_links) > 1:
            prev_url = paging_links[1].get('href')

    articles = []  # collected article records
    for d in soup.find_all('div', 'r-ent'):
        date_div = d.find('div', 'date')
        if date_div is None or date_div.text.strip() != date:
            continue  # not posted on the requested date

        # An entry without a hyperlink is skipped
        # (the original comment says such articles were deleted).
        link = d.find('a')
        if link is None:
            continue

        nrec_div = d.find('div', 'nrec')
        author_div = d.find('div', 'author')
        articles.append({
            'title': link.text,
            'href': link['href'],
            'push_count': _parse_push_count(nrec_div.text if nrec_div else ''),
            'author': author_div.text if author_div else '',
        })
    return articles, prev_url


def get_author_ids(posts, pattern):
    """Return the set of distinct author names that contain *pattern*.

    Each item of *posts* is a mapping with an ``'author'`` entry; an author
    is included when *pattern* occurs anywhere inside that string.
    """
    authors = (entry['author'] for entry in posts)
    return {name for name in authors if pattern in name}

# Script entry point: collect today's articles from the Gossiping board,
# print the popular ones, dump everything to gossiping.json, and finally
# print the author ids containing '5566'.
if __name__ == '__main__':
    current_page = get_web_page(PTT_URL + '/bbs/Gossiping/index.html')
    if current_page:
        articles = []  # all of today's articles
        # Today's date with the leading '0' removed, to match the format
        # shown on the site (per the original comment).
        today = time.strftime("%m/%d").lstrip('0')
        current_articles, prev_url = get_articles(current_page, today)
        # While the current page has articles from today, keep them and
        # step back to the previous page to look for more.
        while current_articles:
            articles += current_articles
            if not prev_url:
                break  # no previous page to walk back to
            current_page = get_web_page(PTT_URL + prev_url)
            if not current_page:
                # A failed fetch ends paging but keeps what was collected,
                # instead of crashing inside the parser.
                break
            current_articles, prev_url = get_articles(current_page, today)

        # Report and store the collected articles.
        print('今天有', len(articles), '篇文章')
        threshold = 50
        print('熱門文章(> %d 推):' % (threshold))
        for a in articles:
            if a['push_count'] > threshold:
                print(a)
        with open('gossiping.json', 'w', encoding='utf-8') as f:
            json.dump(articles, f, indent=2, sort_keys=True, ensure_ascii=False)
        print('\n')
        # Print every distinct author id containing '5566'.
        print(get_author_ids(articles, '5566'))

今天有 429 篇文章
熱門文章(> 50 推):
{'title': '[新聞] 女偶像「宅男太噁」宣布退團 揭地上滿是', 'href': '/bbs/Gossiping/M.1625708102.A.580.html', 'push_count': 99, 'author': 'squall021'}
{'title': '[問卦] 黃暐瀚朱學恆網路聲量異常大？', 'href': '/bbs/Gossiping/M.1625707138.A.00F.html', 'push_count': 77, 'author': 'bezbol'}
{'title': '[問卦] 家裡第一台電腦是用什麼作業系統?', 'href': '/bbs/Gossiping/M.1625707173.A.CB5.html', 'push_count': 56, 'author': 'KLGlikeshit'}
{'title': '[問卦] 外來種八哥過度繁殖，現在電線桿上看不到麻雀', 'href': '/bbs/Gossiping/M.1625706138.A.8CF.html', 'push_count': 91, 'author': 'likedog'}
{'title': '[問卦] 一台ATM 四個女的在排隊?', 'href': '/bbs/Gossiping/M.1625705157.A.698.html', 'push_count': 56, 'author': 'greenhua'}
{'title': 'Re: [新聞] 民進黨比國民黨可怕\u3000柯文哲：控制媒體到我懷疑自己是共', 'href': '/bbs/Gossiping/M.1625705252.A.FE1.html', 'push_count': 99, 'author': 'angellll'}
{'title': '[新聞] 中壢湯包老闆遭控「休息室性侵女員工」\u3000', 'href': '/bbs/Gossiping/M.1625703139.A.EDC.html', 'push_count': 59, 'author': 'Eliphalet'}
{'title': '[新聞] 韓國瑜為孩子準備營養補品！親自送貨到', 'href': '/bbs/Gossiping/M.162