### Ptt 論壇爬蟲(適用各個主題版)
### Ptt Crawler

In [61]:
from bs4 import BeautifulSoup
from typing import Tuple
import requests
import re
import pandas as pd

# comment's ip of some boards was hidden.
hide_comment_ip_board = [
    'Stock',
    'Sex'
]


class PttCrawler:
    """Fetch post information on PTT.

    Guide
    1. Initialize the board you need.
    2. Get the number of maximum page.
    3. Generate urls of each page.
    4. Take API of the post in one page.
    """

    def __init__(self, board_name):
        """Initialize.

        Initialize with board homepage,
        We need cookie for passing.
        """
        self.board_name = board_name
        self.url = f'https://www.ptt.cc/bbs/{board_name}/index.html'

        session = requests.Session()
        session.get(self.url)
        ck = session.cookies.get_dict()
        ck['over18'] = '1'
        ck = ';'.join([k + '=' + v for k, v in ck.items()])

        # go homepage of the board first for header para.
        header = {
            'cookie': ck,
        }
        self.header = header

    def get_total_page(self) -> int:
        """
        Get the total number of pages of this board.
        """

        res = requests.get(self.url, headers=self.header)
        soup = BeautifulSoup(res.text, 'html.parser')
        last_page_url = soup.findAll('a', text='‹ 上頁')[0]['href']
        total_page = int(re.findall(r'/index(\w+)', last_page_url)[0])
        self.total_page = total_page

        return total_page

    def get_urls_of_posts(self, page_no: int) -> list:
        """
        Generate post's url of selected page.
        """

        if page_no > self.total_page:
            raise ValueError('page_no is more than the max it had.')

        url = f'https://www.ptt.cc/bbs/{self.board_name}/index{page_no}.html'
        res = requests.get(url, headers=self.header)
        soup = BeautifulSoup(res.text, 'html.parser')
        post_urls = [
            a['href'].split('/')[-1]
            for a in soup.find_all(
                'a', text=re.compile('\[+')
            )
        ]
        return post_urls

    def get_api_of_whole_page(self, page_no: int) -> dict:
        """
        Go through whole page with get api of each post.
        """

        output = {}
        urls = self.get_urls_of_posts(page_no)
        for i in len(range(urls)):
            url = f'https:// www.ptt.cc/bbs/{self.board_name}/{urls[i]}'
            output[i] = self.get_api_of_post(url)
        return output

    def get_api_of_post(self, post_url: str) -> dict:
        """
        Take all element on a page.
        """

        soup = self.get_soup_of_post(post_url)
        content = self.get_post_content(soup)
        author, board, title, post_time = self.get_post_info(soup)
        comment = self.get_post_comment(soup)

        return {
            'url': post_url,
            'board': author,
            'author': board,
            'title': title,
            'post_time': post_time,
            'content': content,
            'comment': comment,
        }

    def get_soup_of_post(self, post_url) -> BeautifulSoup:
        """
        Get source of a post.
        """

        url = f'https://www.ptt.cc/bbs/{self.board_name}/{post_url}'
        res = requests.get(url, headers=self.header)
        soup = BeautifulSoup(res.text, 'html.parser')

        return soup

    def get_post_content(self, post_soup: BeautifulSoup) -> str:
        """Description of process.

        Strings of contents are contained in 'main-content' without tagging.
        We should retrive content by its tag not belong to div and span.
        (Type of content belong to 'NavigableString')
        """

        output = ""
        for c in post_soup.find_all(
            'div', {'id': 'main-content'}
        )[0].contents:

            if (c.name not in ['div', 'span']) \
                    and (c.string is not None):
                output += c.string

        return output.strip()

    def get_post_info(
        self, post_soup: BeautifulSoup
    ) -> Tuple[str, str, str, str]:
        """Get basic information of a post."""
        author, board, title, post_time = post_soup.find_all(
            'span', {'class': 'article-meta-value'}
        )

        return (
            author.getText(), board.getText(),
            title.getText(), post_time.getText()
        )

    def get_post_comment(self, post_soup: BeautifulSoup) -> dict:
        """
        Get comments of a post.
        """

        comment_ele = post_soup.find_all('div', {'class': 'push'})
        comments = {}
        if len(comment_ele) > 0:
            for idx, div in enumerate(comment_ele):

                push_tag, user, comment, info = div.find_all('span')

                # some board with no ip address for comments,
                # e.g. Stock, Sex..
                if self.board_name in hide_comment_ip_board:
                    comment_date, comment_time = info.getText().split()
                    ip = None
                else:
                    ip, comment_date, comment_time = info.getText().split()
                
                comments[idx] = {
                    'push_tag': push_tag.getText().strip(),
                    'user': user.getText().strip(),
                    'comment': comment.getText().strip()[1:],
                    'ip': ip,
                    'comment_date': comment_date.strip(),
                    'comment_time': comment_time.strip(),
                }
            return comments
        else:
            return comments


In [83]:
board = PttCrawler('Stock')
total_pages = board.get_total_page()
print(total_pages)

5011


In [84]:
urls_of_one_page = board.get_urls_of_posts(total_pages)
data = board.get_api_of_post(urls_of_one_page[0])
data

{'url': 'M.1623755866.A.8EB.html',
 'board': 'nyk3 (阿晃)',
 'author': 'Stock',
 'title': '[新聞] 13家上市櫃公司投資神準 成另類法人股神',
 'post_time': 'Tue Jun 15 19:17:44 2021',
 'content': '原文標題：13家上市櫃公司投資神準 成另類法人股神\n\n原文連結：https://ctee.com.tw/news/stocks/474680.html\n\n發布時間：2021.06.15\n\n原文內容：\n工商時報 李娟萍ꀊ\n台股熱錢滾滾，上市櫃公司大股東也貢獻資金活水，包括宏益（1452）、集盛（1455）等\n13檔公司，靠著投資精準，成為市場上新的另類法人股神。\n\n台股近期當沖盛行，當沖客被各界視為台股新勢力，但兆豐投顧董事長李秀利觀察到，還\n有一股新的另類法人，投資眼光精準，堪稱新股神。\n\n包括威剛、台航、中航、台驊、集盛、宏益、榮運、長榮鋼、豐興、宏遠證、康和證、華\n新、彩晶等13檔個股，除本業表現不錯，靠著投資股票，業外收入也頗豐。\n過往市場熟知擅於投資股票的上市櫃公司有中環、錸德、京城銀、龍邦等為主，今年因傳\n產鋼鐵、紡織、航運股成為當紅炸子雞，造就新一批另類法人。\n\n舉例來說，台航持有陽明4.1萬張，並參加陽明私募，成本甚低，隨著陽明股價不斷飆漲\n，台航持有陽明的潛在獲利大增。\n\n此外，榮運、長榮鋼持有長榮、長榮航，中航持有陸海、中菲行及陽明。台驊投控持有萬\n海及長榮，而券商宏遠證、康和證及威剛也紛紛加入航海王投資行列，被市場同視為航運\n好夥伴。\n\n而紡織及鋼鐵同業之間，則樂於交互投資，今年第一季獲利創下佳績的集盛及宏益，在業\n外皆有不錯的嶄獲，集盛持有南紡、陽明、儒鴻、南帝及群創等，股價今年紛紛大漲，也\n相對加持了集盛的獲利。\n\n而鋼筋大廠豐興在3～4月逢低大買中鋼、東鋼，也被各界進場時機神準，業界相互幫襯，\n進一步推升紡織及鋼鐵「粽子行情」升溫。\n\n心得/評論：                             ※必需填寫滿20字\n不是all in航運就會賺錢嗎？ㄚ還有紡織跟鋼鐵啦\n"業界相互幫襯"你買我股票,我當你股東的概念...\n\n-

In [85]:
df = pd.DataFrame(data['comment']).T
df

Unnamed: 0,push_tag,user,comment,ip,comment_date,comment_time
0,推,Player01,互相吹捧,,06/15,19:19
1,推,piercingX,怎麼沒有富旺跟皇翔投顧,,06/15,19:20
2,→,lolic,本業不好好做整天炒股的爛公司都能捧,,06/15,19:24
3,推,PTTcrazy,買中鋼，明天跌停吧,,06/15,19:39
4,→,hiyuy,貨櫃三雄交叉持股會不會變永動機,,06/15,19:42
5,推,CARBON001,怎麼可以少了偉大的中環投顧！,,06/15,19:55
6,噓,tommy1,還以為說投資3558 根本沒量啊,,06/15,20:15
7,→,asdxdew,中環有提到阿,,06/15,20:15
8,推,abc25202003,少了中環,,06/15,20:24
9,→,drsunyatsen,3558躺著中槍,,06/15,20:48


In [86]:
urls_of_one_page = board.get_urls_of_posts(total_pages)
data = board.get_api_of_post(urls_of_one_page[1])
data

{'url': 'M.1623756044.A.D7B.html',
 'board': 'Arizona989 (心平氣和，和氣生財)',
 'author': 'Stock',
 'title': '[新聞] 陸前五月糧食進口 大增五成',
 'post_time': 'Tue Jun 15 19:20:42 2021',
 'content': '原文標題：\n陸前五月糧食進口 大增五成\n   \n原文連結：\nhttps://reurl.cc/ogZA7V\n\n發布時間：\n2021-06-15\n  \n\n\n原文內容：\n\n聯合國糧農組織數據顯示，今年5月全球糧價年漲近40%，創十年來新高。而大陸前五個月\n糧食進口量年增幅50.6%，總量高達6,667.5萬噸。大陸進口最多的糧食就是大豆，主因大\n豆是美中貿易的主力產品，大豆搾油後的豆粕也是養豬的重要飼料。\n\n\n網易財經報導，按照正常的情況，國際糧價上漲，大陸應該減少糧食進口才對，但為何不\n減反增？專家稱，在大陸進口的糧食當中，主糧如大米、小麥的占比很少，首先就可以排\n除大陸主糧緊缺的情況。\n\n其次，大陸進口的糧食以大豆為主，但大豆並不是主食，境內大面積種植豆類的性價比不\n高，因此不如從海外進口。大豆在大陸主要有兩個作用，一個是煉成植物油，還有就是充\n當養殖企業的口糧，養殖業在近幾年時間擴張很快，對於大豆、玉米等農作物的需求自然\n就增加。\n\n騰訊網報導，大陸海關總署數據顯示，今年前五個月，大陸共進口大豆3,823.4噸，占全\n部糧食進口量的57.34%。分析稱，美中貿易紛爭緩和帶動大豆進口量，而未來兩個月的大\n豆進口量預計將超過1,000萬噸。\n\n分析稱，糧食自給率是評估一個國家糧食自給程度的重要指標，自給率越高，國際糧價對\n於地區糧食的影響就越小。一般來說，糧食自給率在100%，說明是完全自給；一旦小於90\n%，糧食供求的風險就會增大。而大陸商務部數據顯示，進口糧只占大陸境內糧食消耗的2\n%左右，三大主糧（稻榖、小麥與玉米）的自給率很高，基本上可以滿足自給。\n\n2020年大陸全國糧食產量約超過6.5億噸。按照今年前五個月進口糧食6,667.5噸來計算，\n全年進口量也只有1.6萬噸，在大陸自產糧數據來看，進口的部分幾乎可以忽略不計。換\n句話來說，國

In [87]:
df = pd.DataFrame(data['comment']).T
df

Unnamed: 0,push_tag,user,comment,ip,comment_date,comment_time
0,→,Orianna,豬啊 去年豬少 現在疾病沒了 豬變多了,,06/15,19:23
1,推,smallroad,大成噴噴,,06/15,19:24
2,→,Orianna,豬價漲 變成他們農民愛養豬 數量比豬瘟時多很多,,06/15,19:25
3,→,peter080808,現在中國豬肉價超的好嗎,,06/15,19:34
4,推,R521,應該剛收成 南半球的產季 剛採收完,,06/15,19:36
5,推,s7598261,海運利多吧,,06/15,19:45
6,→,fallinlove15,進口增加 黃豆佔一半左右 剩下難道都不是糧食？,,06/15,19:52
7,→,fallinlove15,之前的大雨讓四川一帶農作物收成不好不是？,,06/15,19:53
8,推,peter080808,進口糧食只佔中國糧食消耗的2%而已,,06/15,19:55
9,推,kamichu,華為用iot技術跑去養豬了？,,06/15,20:07
