### Ptt 論壇爬蟲(適用各個主題版)
### Ptt Crawler

In [1]:
from bs4 import BeautifulSoup
from typing import Tuple
import requests
import re
import pandas as pd

# Boards whose comment (push) lines do not show the commenter's IP address,
# so the trailing info field of a comment holds only the date and the time.
hide_comment_ip_board = [
    'Stock',
    'Sex'
]


class PttCrawler:
    """Fetch post information from one PTT board.

    Typical usage:
    1. Create a crawler for the board you need.
    2. Call ``get_total_page`` to learn the highest numbered page index.
    3. Call ``get_urls_of_posts`` to list the posts on one page.
    4. Call ``get_api_of_post`` (one post) or ``get_api_of_whole_page``
       (every post of a page) to get the parsed data.
    """

    def __init__(self, board_name):
        """Prepare the request header for ``board_name``.

        The board homepage is visited once to collect its cookies, and the
        ``over18`` flag is added so age-restricted boards can be read.
        """
        self.board_name = board_name
        self.url = f'https://www.ptt.cc/bbs/{board_name}/index.html'
        # Filled in by get_total_page(); None means "not fetched yet".
        self.total_page = None

        # Visit the homepage of the board first to obtain its cookies.
        session = requests.Session()
        session.get(self.url)
        ck = session.cookies.get_dict()
        ck['over18'] = '1'
        ck = ';'.join([k + '=' + v for k, v in ck.items()])

        self.header = {
            'cookie': ck,
        }

    def get_total_page(self) -> int:
        """Return the page index found in the board's "previous page" link.

        The value is also cached on ``self.total_page``.

        Raises:
            IndexError: if the homepage has no "previous page" link or the
                link does not contain an ``index<N>`` part.
        """
        res = requests.get(self.url, headers=self.header)
        soup = BeautifulSoup(res.text, 'html.parser')
        last_page_url = soup.find_all('a', text='‹ 上頁')[0]['href']
        total_page = int(re.findall(r'/index(\w+)', last_page_url)[0])
        self.total_page = total_page

        return total_page

    def get_urls_of_posts(self, page_no: int) -> list:
        """Return the file names of the posts listed on page ``page_no``.

        Each item is the last path segment of a post link (for example
        ``'M.1654966541.A.93B.html'``), which is the form expected by
        ``get_api_of_post``.

        Raises:
            ValueError: if ``page_no`` is greater than ``self.total_page``.
        """
        # Fetch the page count on demand so this method also works when
        # get_total_page() has not been called yet.
        if getattr(self, 'total_page', None) is None:
            self.get_total_page()

        if page_no > self.total_page:
            raise ValueError('page_no is more than the max it had.')

        url = f'https://www.ptt.cc/bbs/{self.board_name}/index{page_no}.html'
        res = requests.get(url, headers=self.header)
        soup = BeautifulSoup(res.text, 'html.parser')
        # Keep only the links whose text contains a '[' (e.g. "[新聞] ...").
        post_urls = [
            a['href'].split('/')[-1]
            for a in soup.find_all(
                'a', text=re.compile(r'\[+')
            )
        ]
        return post_urls

    def get_api_of_whole_page(self, page_no: int) -> dict:
        """Return the parsed data of every post on page ``page_no``.

        The result maps the position of the post on the page (0, 1, ...)
        to the dict produced by ``get_api_of_post``.
        """
        # get_api_of_post() expects the bare post file name; the full URL
        # is built later by get_soup_of_post().
        return {
            idx: self.get_api_of_post(post_url)
            for idx, post_url in enumerate(self.get_urls_of_posts(page_no))
        }

    def get_api_of_post(self, post_url: str) -> dict:
        """Return all parsed elements of one post.

        Args:
            post_url: file name of the post, as returned by
                ``get_urls_of_posts``.

        Returns:
            A dict with the keys ``url``, ``board``, ``author``, ``title``,
            ``post_time``, ``content`` and ``comment``.
        """
        soup = self.get_soup_of_post(post_url)
        content = self.get_post_content(soup)
        author, board, title, post_time = self.get_post_info(soup)
        comment = self.get_post_comment(soup)

        return {
            'url': post_url,
            'board': board,
            'author': author,
            'title': title,
            'post_time': post_time,
            'content': content,
            'comment': comment,
        }

    def get_soup_of_post(self, post_url: str) -> 'BeautifulSoup':
        """Download the post ``post_url`` (a file name) and parse its HTML."""
        url = f'https://www.ptt.cc/bbs/{self.board_name}/{post_url}'
        res = requests.get(url, headers=self.header)
        soup = BeautifulSoup(res.text, 'html.parser')

        return soup

    def get_post_content(self, post_soup: 'BeautifulSoup') -> str:
        """Return the body text of a post.

        The body is stored directly inside the ``main-content`` element
        without a tag of its own, so the direct children that are neither
        ``div`` nor ``span`` and that carry a string are concatenated.

        Raises:
            IndexError: if the page has no ``main-content`` element.
        """
        main_content = post_soup.find_all('div', {'id': 'main-content'})[0]

        return "".join(
            node.string
            for node in main_content.contents
            if node.name not in ('div', 'span') and node.string is not None
        ).strip()

    def get_post_info(
        self, post_soup: 'BeautifulSoup'
    ) -> Tuple[str, str, str, str]:
        """Return ``(author, board, title, post_time)`` of a post.

        Raises:
            ValueError: if the page does not contain exactly four
                ``article-meta-value`` spans.
        """
        author, board, title, post_time = post_soup.find_all(
            'span', {'class': 'article-meta-value'}
        )

        return (
            author.getText(), board.getText(),
            title.getText(), post_time.getText()
        )

    def get_post_comment(self, post_soup: 'BeautifulSoup') -> dict:
        """Return the comments of a post.

        The result maps a running index (0, 1, ...) to a dict with the keys
        ``push_tag``, ``user``, ``comment``, ``ip``, ``comment_date`` and
        ``comment_time``. ``ip`` is ``None`` when the comment does not show
        one. An empty dict is returned for a post without comments.

        Raises:
            ValueError: if the info field of a comment has an unexpected
                number of whitespace-separated parts.
        """
        comments = {}
        for div in post_soup.find_all('div', {'class': 'push'}):
            spans = div.find_all('span')
            # A regular comment row has exactly four spans; skip any other
            # element that merely shares the 'push' class instead of
            # failing on the unpacking below.
            if len(spans) != 4:
                continue
            push_tag, user, comment, info = spans

            # The info field is "<ip> <date> <time>", or just
            # "<date> <time>" when no IP address is shown (always the case
            # on the boards in hide_comment_ip_board, e.g. Stock, Sex).
            info_parts = info.getText().split()
            if self.board_name in hide_comment_ip_board \
                    or len(info_parts) == 2:
                comment_date, comment_time = info_parts
                ip = None
            else:
                ip, comment_date, comment_time = info_parts

            # Number by kept comments so the keys stay contiguous.
            comments[len(comments)] = {
                'push_tag': push_tag.getText().strip(),
                'user': user.getText().strip(),
                # [1:] drops the first character of the stripped text
                # (presumably the ':' PTT puts before the comment body).
                'comment': comment.getText().strip()[1:],
                'ip': ip,
                'comment_date': comment_date.strip(),
                'comment_time': comment_time.strip(),
            }

        return comments


In [2]:
# Create a crawler for the 'Stock' board; the constructor visits the board
# homepage once to collect the cookies used by later requests.
board = PttCrawler('Stock')

# Fetch the page index reported by the board homepage. This also stores it
# on board.total_page, which get_urls_of_posts() checks against.
total_pages = board.get_total_page()
print(total_pages)

5164


In [3]:
# Demo 1
# List the post file names found on one page of the target board.
urls_of_one_page = board.get_urls_of_posts(total_pages)

# Fetch and parse the first post of that page. The result is a dict with
# the keys:
# ['url', 'board', 'author', 'title', 'post_time', 'content', 'comment']
data = board.get_api_of_post(urls_of_one_page[0])
data

{'url': 'M.1654966541.A.93B.html',
 'board': 'Stock',
 'author': 'DrowningPool (My broken dreams)',
 'title': '[新聞] 台股擂台 郭哲榮本周押寶晶華鳳凰',
 'post_time': 'Sun Jun 12 00:55:37 2022',
 'content': '原文標題：台股擂台/挑戰者「華爾街大亨」郭哲榮 本周押寶晶華、鳳凰\n\n原文連結：https://money.udn.com/money/story/5607/6381406\n\n發布時間：2022-6-12\n\n記者署名：張瀞文\n\n原文內容：\n\n上周戰報\n在上周，「華爾街大亨」。\n\n他上周持有的五檔個股中，以六福（2705）漲5.2%最優，主要是百張以上大戶籌碼集中，\n後市持續看好。但鳳凰漲多拉回，前一周大漲一成多，上周回吐3.3%，高檔整理，表現最\n弱，但也沒大跌。\n\n本周布局\n在新的一周，郭哲榮的現金比重維持在零，持有個股數目也維持在五檔不變，每檔都各配\n置20%的比重。其中，，全都是受惠於解封的觀光產業，郭哲榮持續看\n好解封後的受惠股。\n\n在新的投資組合中，郭哲榮最看好續抱股。隨著全球疫苗覆蓋率提升，邊境逐漸解封\n，加上政府國旅補助措施即將上路，晶華旗下擁有國際觀光旅館及餐飲等事業，業績將逐\n漸回溫，後市看好。\n\n還有三檔續抱股，看好和投資的理由都不變。將受惠下半年有望開放出境旅遊題材激\n勵，還有日本6月10日開放團客入境釋出的商機；同樣可望受惠於國境解封；旗\n下聚集飯店、遊樂園、餐飲等事業，有望迎接民眾的報復性出遊潮，且百張以上大戶籌碼\n激增，顯示法人看好後市。\n\n還有一檔新增個股，受惠國旅補助，也是解封概念股。\n\n最佳一檔\n晶華（2707）\n理由：觀光股的績優股\n\n多空診斷室\n本周有許多重要經濟事件，；美國聯準會（Fed）利率決議，預計升息2碼；加上台積電除息，但美\n國10年期公債殖利率再度衝破3%，也讓科技股承受壓力。所以本周台股要挑戰季線，難度\n不低，但只要有任何舒緩通貨膨脹的利多出現，台股就有機會放量突破。\n\n投資錦囊妙計\n近期台股走勢較為震盪，現階段的盤面群龍無首，加上量能低迷，目前選股方向，可

In [4]:
# data['comment'] maps a comment index to a dict of fields, so DataFrame()
# yields one column per comment; transpose to get one row per comment.
df = pd.DataFrame(data['comment']).T
df

Unnamed: 0,push_tag,user,comment,ip,comment_date,comment_time
0,推,kevin28,觀光保重,,06/12,00:56
1,推,avmm9898,星期一隨便選一檔股票放空都是賺,,06/12,00:57
2,推,dbdudsorj,觀光股下台一鞠躬,,06/12,00:58
3,推,yellone741,下週誰虧的少就算贏,,06/12,01:02
4,推,davidaustin,解封股,,06/12,01:02
...,...,...,...,...,...,...
83,推,Boxun0404,謝謝投資長放過金項鍊,,06/12,17:14
84,推,billionaire,https://i.imgur.com/hqZqP77.jpg,,06/12,19:53
85,噓,jkjkallen,這也可以炒,,06/12,22:32
86,噓,sa87a16,馬B 點名我觀光股幹嘛,,06/12,23:13


In [5]:
# demo2
urls_of_one_page = board.get_urls_of_posts(total_pages)
data = board.get_api_of_post(urls_of_one_page[1])
data

{'url': 'M.1654991667.A.8F8.html',
 'board': 'Stock',
 'author': 'Coffeewater (淡淡的咖啡)',
 'title': '[新聞] 美國物價大漲 拜登怪罪普丁通膨',
 'post_time': 'Sun Jun 12 07:54:25 2022',
 'content': '-------------------------------發文提醒----------------------------------\n1.發文前請先詳閱[新聞]分類發文規範，未依規範發文將受處份。\n2.連結過長請善用 https://bit.ly/ 等縮網址服務，連結不能點擊者板規1-2-2處份。\n3.心得/評論請盡量充實，心得過短或濫竽充數將以板規 1-2-3、4-4 水桶處份。\n4.發文請依照格式文章標明段落，不符合格式者依4-1刪文處分。\n------------------------ 按ctrl+y以上內容。 ----------------------\n\n原文標題：\n\n美國物價大漲 拜登怪罪普丁通膨\n\n原文連結：\n\nhttps://www.chinatimes.com/newspapers/20220612000495-260301?chdtv\n\n發布時間：\n\n2022/6/12\n\n記者署名：\n\n王少筠\n\n原文內容：\n\n美國5月消費者物價指數（CPI）年增8.6％，創40年來最大漲幅。全美普通無鉛汽油平均\n價格也在11日來到每加侖5.004美元，堪比2008年金融海嘯時的油價。拜登總統10日發表\n聲明，將物價上漲怪罪於「普丁通膨」（Putin’s Price Hike），表示「我們持續捍衛\n烏克蘭自由的同時，也必須迅速採取更多行動，來降低美國的物價。」\n\n\n美國勞工部統計局10日公布的數據重賞拜登一巴掌，因為一天前，白宮官員才透露5月CPI\n報告有望出現通膨趨緩、經濟穩定增長等趨勢，不料實際數據卻事與願違，不僅能源價格\n較過去一年飆升34.6％，糧食價格也上漲10.1％，整體CPI更增加整整1％。\n\n拜登表示，「普丁通膨」席捲全球，自俄軍入侵烏克蘭以來，汽油、能源、糧食的價格每\n月都在上漲，他呼籲國會協助減輕通膨造成的財政負擔，並

In [6]:
# data['comment'] maps a comment index to a dict of fields, so DataFrame()
# yields one column per comment; transpose to get one row per comment.
df = pd.DataFrame(data['comment']).T
df

Unnamed: 0,push_tag,user,comment,ip,comment_date,comment_time
0,推,y1896547,抗俄保美 美國價值 新美國模式,,06/12,07:55
1,推,KyleSeager,那烏俄戰爭前的通膨是？？,,06/12,07:57
2,→,AustinRivers,=.=a,,06/12,07:57
3,噓,a38654,美國蔡英文 都是阿共的錯！,,06/12,07:59
4,推,cpz,美國jobs讚,,06/12,08:02
...,...,...,...,...,...,...
152,推,Crazyfire,普維拉 沒戰爭就改怪川普了,,06/12,17:42
153,→,Rickyyy,左膠就是滿嘴仁義道德，做事亂七八糟,,06/12,18:49
154,推,JANYUJEN,普丁又沒說不賣你，自己不買怪人家？,,06/12,19:07
155,推,jrbobo,美國EE,,06/12,21:34
