### Ptt 論壇爬蟲(適用各個主題版)
### Ptt Crawler

In [1]:
from bs4 import BeautifulSoup
from typing import Tuple
import requests
import re

# comment's ip of some boards was hidden.
hide_comment_ip_board = [
    'Stock',
    'Sex'
]


class PttCrawler:
    """Fetch post information on PTT.

    Guide
    1. Initialize the board you need.
    2. Get the number of maximum page.
    3. Generate urls of each page.
    4. Take API of the post in one page.
    """

    def __init__(self, board_name):
        """Initialize.

        Initialize with board homepage,
        We need cookie for passing.
        """
        self.board_name = board_name
        self.url = f'https://www.ptt.cc/bbs/{board_name}/index.html'

        session = requests.Session()
        session.get(self.url)
        ck = session.cookies.get_dict()
        ck['over18'] = '1'
        ck = ';'.join([k + '=' + v for k, v in ck.items()])

        # go homepage of the board first for header para.
        header = {
            'cookie': ck,
        }
        self.header = header

    def get_total_page(self) -> int:
        """Get the total number of pages of this board."""
        res = requests.get(self.url, headers=self.header)
        soup = BeautifulSoup(res.text, 'html.parser')
        last_page_url = soup.findAll('a', text='‹ 上頁')[0]['href']
        total_page = int(re.findall(r'/index(\w+)', last_page_url)[0])
        self.total_page = total_page

        return total_page

    def get_urls_of_posts(self, page_no: int) -> list:
        """Generate post's url of selected page."""
        if page_no > self.total_page:
            raise ValueError('page_no is more than the max it had.')

        url = f'https://www.ptt.cc/bbs/{self.board_name}/index{page_no}.html'
        res = requests.get(url, headers=self.header)
        soup = BeautifulSoup(res.text, 'html.parser')
        post_urls = [
            a['href'].split('/')[-1]
            for a in soup.find_all(
                'a', text=re.compile('\[+')
            )
        ]
        return post_urls

    def get_api_of_whole_page(self, page_no: int) -> dict:
        """Go through whole page with get api of each post."""
        output = {}
        urls = self.get_urls_of_posts(page_no)
        for i in len(range(urls)):
            url = f'https:// www.ptt.cc/bbs/{self.board_name}/{urls[i]}'
            output[i] = self.get_api_of_post(url)
        return output

    def get_api_of_post(self, post_url: str) -> dict:
        """Take all element on a page."""
        soup = self.get_soup_of_post(post_url)
        content = self.get_post_content(soup)
        author, board, title, post_time = self.get_post_info(soup)
        comment = self.get_post_comment(soup)

        return {
            'url': post_url,
            'board': author,
            'author': board,
            'title': title,
            'post_time': post_time,
            'content': content,
            'comment': comment,
        }

    def get_soup_of_post(self, post_url) -> BeautifulSoup:
        """Get source of a post."""
        url = f'https://www.ptt.cc/bbs/{self.board_name}/{post_url}'
        res = requests.get(url, headers=self.header)
        soup = BeautifulSoup(res.text, 'html.parser')

        return soup

    def get_post_content(self, post_soup: BeautifulSoup) -> str:
        """Description of process.

        Strings of contents are contained in 'main-content' without tagging.
        We should retrive content by its tag not belong to div and span.
        (Type of content belong to 'NavigableString')
        """
        output = ""
        for c in post_soup.find_all(
            'div', {'id': 'main-content'}
        )[0].contents:

            if (c.name not in ['div', 'span']) \
                    and (c.string is not None):
                output += c.string

        return output.strip()

    def get_post_info(
        self, post_soup: BeautifulSoup
    ) -> Tuple[str, str, str, str]:
        """Get basic information of a post."""
        author, board, title, post_time = post_soup.find_all(
            'span', {'class': 'article-meta-value'}
        )

        return (
            author.getText(), board.getText(),
            title.getText(), post_time.getText()
        )

    def get_post_comment(self, post_soup: BeautifulSoup) -> dict:
        """Get comments of a post."""
        comment_ele = post_soup.find_all('div', {'class': 'push'})
        comments = {}
        if len(comment_ele) > 0:
            for idx, div in enumerate(comment_ele):

                push_tag, user, comment, info = div.find_all('span')

                # some board with no ip address for comments,
                # e.g. Stock, Sex..
                if self.board_name in hide_comment_ip_board:
                    comment_date, comment_time = info.getText().split()
                    ip = None
                else:
                    ip, comment_date, comment_time = info.getText().split()

                comments[idx] = {
                    'push_tag': push_tag.getText().strip(),
                    'user': user.getText().strip(),
                    'comment': comment.getText().strip(),
                    'ip': ip,
                    'comment_date': comment_date.strip(),
                    'comment_time': comment_time.strip(),
                }
            return comments
        else:
            return comments

In [2]:
board = PttCrawler('Gossiping')
total_pages = board.get_total_page()
print(total_pages)

39779


In [3]:
urls_of_one_page = board.get_urls_of_posts(total_pages)
print(urls_of_one_page)

['M.1622810270.A.4BE.html', 'M.1622810303.A.CC6.html', 'M.1622810329.A.779.html', 'M.1622810342.A.989.html', 'M.1622810346.A.BCF.html', 'M.1622810390.A.1D4.html', 'M.1622810429.A.AF2.html', 'M.1622810442.A.636.html', 'M.1622810443.A.9DD.html', 'M.1622810446.A.1DB.html', 'M.1622810495.A.176.html', 'M.1622810521.A.113.html', 'M.1622810530.A.0CC.html', 'M.1622810540.A.0F4.html', 'M.1622810550.A.549.html', 'M.1622810551.A.785.html', 'M.1622810588.A.769.html', 'M.1622810655.A.993.html', 'M.1622810656.A.CEB.html']


In [4]:
board.get_api_of_post(urls_of_one_page[0])

{'url': 'M.1622810270.A.4BE.html',
 'board': 'Elipton (壞人變態色鬼)',
 'author': 'Gossiping',
 'title': 'Re: [問卦] 我有資格開幹了吧！',
 'post_time': 'Fri Jun  4 20:37:48 2021',
 'content': '1，3，4在死亡案例的簡介都看得到\n\n今天有個發病日只是發燒\n\n3天就掰了的3X歲男子\n\n5的話\n\n只能說正常吧\n\n一個剛畢業的醫學生可能會為自己的無力痛哭\n\n久了以後\n\n他除了把資源盡可能地公平分配\n\n你還要他怎樣呢？\n\n陪你一起多愁善感嗎？\n\n--',
 'comment': {0: {'push_tag': '→',
   'user': 'kiddingsa',
   'comment': ': 冤有頭債有主 弒母之仇 別留到下輩子',
   'ip': '223.137.28.81',
   'comment_date': '06/04',
   'comment_time': '20:40'},
  1: {'push_tag': '推',
   'user': 'sulichun',
   'comment': ': 你是那位醫師嗎？我覺得你也蠻冷血的',
   'ip': '36.230.68.247',
   'comment_date': '06/04',
   'comment_time': '20:40'},
  2: {'push_tag': '推',
   'user': 'anous',
   'comment': ': 就每個人都以為自己最重要，怎麼可以不關',
   'ip': '114.136.97.188',
   'comment_date': '06/04',
   'comment_time': '20:40'},
  3: {'push_tag': '→',
   'user': 'anous',
   'comment': ': 心我，前幾天砍護理師的那個廢物還不是這',
   'ip': '114.136.97.188',
   'comment_date': '06/04',
   'comment_time': '20:40'}

In [5]:
board.get_api_of_post(urls_of_one_page[1])

{'url': 'M.1622810303.A.CC6.html',
 'board': 'derekhsu (華麗的天下無雙)',
 'author': 'Gossiping',
 'title': '[爆卦] 疫情整理:英國確診狂飆 馬國回歸校正爆量',
 'post_time': 'Fri Jun  4 20:38:15 2021',
 'content': '- 馬來西亞病例持續回歸校正，6/2日的新增死亡病例數竟比原先增加60%，達到單日128\n例，馬來西亞目前的疫情有被大幅低估可能。\n\n- 哥倫比亞新增確診與死亡病例數均創歷史新高。\n\n- 印度昨天新增確診病例數創下近期新低。\n\n\n\n- 繼昨天新增確診病例數暴增之後，今天墨西哥死亡病例數也暴增，可能是有於回歸校正\n的關係，因此墨西哥疫情也有被低估可能。\n\n- 阿富汗疫情持續惡化，新增確診病例數創歷史新高。\n\n- 非洲南部多國疫情進入緊急狀態。\n\n\n\n- 德國慕尼黑大學(Munich University)科學家的一項重大新研究發現，封鎖對降低該國\n的冠狀病毒感染率沒有效果。\n([https://t.co/JQQDhzvOmw?amp=1](https://t.co/JQQDhzvOmw?amp=1))\n\n- 根據YouGov的調查，大多數美國人認爲冠狀病毒大流行起源於武漢病毒學研究所，近四\n分之一的美國人認爲這是故意發佈的。\n([https://t.co/tXnFT6ZVSu?amp=1](https://t.co/tXnFT6ZVSu?amp=1))\n\n- 最新消息:英國將批准輝瑞爲12歲至15歲人群接種的冠狀病毒疫苗\n\n- \n([https://t.co/NUZtAuxlHT?amp=1](https://t.co/NUZtAuxlHT?amp=1))（湯姆克魯斯：\n氣氣氣氣氣氣氣）\n\n\n\n([https://t.co/Q82dQSoZLz?amp=1](https://t.co/Q82dQSoZLz?amp=1))\n\n- 有症狀病例上升了80%，西北部和蘇格蘭受影響最嚴重\n([https://t.co/B82ao6nuHB?amp=1](https://t.co/B82ao6nuHB?amp=1))\n\n- 新的數據表明，在