<a href="https://colab.research.google.com/github/tingchun0113/ptt-crawler/blob/main/ptt_crawler_v1_0_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install and import dependencies

In [None]:
!pip install requests_html
import requests, urllib.parse, pandas as pd
from requests_html import HTML
from bs4 import BeautifulSoup
from multiprocessing import Pool
from google.colab import files

## Variables

In [54]:
# Name of the PTT board to crawl.
board = 'Loan'
# Site root; also used as the base when resolving relative links.
domain = 'https://www.ptt.cc/'
# Index page of the board, where crawling starts.
url = f'{domain}bbs/{board}/index.html'

# Number of index pages to crawl.
num_pages = 5
# Keep a post when its title contains ANY of these keywords,
# e.g. title_keywords = ['房貸', '房屋'] (title contains '房貸' or '房屋').
# An empty list keeps every post.
title_keywords = ['桃園']

## Functions

In [49]:
def fetch(url):
  """Download `url` with a single GET request and return the response.

  Args:
    url: Absolute URL of the page to download.

  Returns:
    The response object returned by `requests.get`.
  """
  # One request is enough: the previous version first issued a cookie-less
  # GET whose result was thrown away, doubling the traffic for every page.
  # The `over18` cookie is presumably what lets the request past PTT's
  # age-confirmation page -- TODO confirm.
  return requests.get(url, cookies={'over18': '1'})

def parse_article_entries(doc, el):
  """Return every element of the HTML text `doc` matching selector `el`.

  Args:
    doc: HTML source as a string.
    el: Selector passed to `HTML.find` (e.g. 'div.r-ent').

  Returns:
    Whatever `HTML(html=doc).find(el)` returns (the matched elements).
  """
  return HTML(html=doc).find(el)

def parse_article_meta(entry):
    """Extract title, push count, date, author and link from one list entry.

    Args:
        entry: Element supporting `find(selector, first=True)`; matches must
            expose `.text` and, for the link, `.attrs`.

    Returns:
        dict with keys 'title', 'push', 'date', 'author', 'link' (in that
        order). When the author or the title anchor is missing, both
        'author' and 'link' are set to '[Deleted]'.
    """
    def text_of(selector):
        # `find` yields None for a missing node, so `.text` raises
        # AttributeError in that case.
        return entry.find(selector, first=True).text

    meta = {
        'title': text_of('div.title'),
        'push': text_of('div.nrec'),
        'date': text_of('div.date'),
    }

    try:
        author = text_of('div.author')
        link = entry.find('div.title > a', first=True).attrs['href']
    except AttributeError:
        # No author node or no anchor inside the title: treat as deleted.
        author = link = '[Deleted]'

    meta['author'] = author
    meta['link'] = link
    return meta

def get_metadata_from(url):
    """Fetch one index page and return its post metadata plus a follow-up link.

    Args:
        url: URL of the index page to download.

    Returns:
        (metadata, next_link): `metadata` is a list with one
        `parse_article_meta` result per 'div.r-ent' entry; `next_link` is the
        href of the second '.action-bar a.btn.wide' element, resolved against
        `domain`.

    Raises:
        IndexError: if the page has fewer than two matching buttons.
    """
    page_source = fetch(url).text
    entries = parse_article_entries(page_source, 'div.r-ent')

    # The second wide button in the action bar is used as the page to visit
    # next -- presumably the "previous (older) page" control; verify against
    # the live markup.
    nav_buttons = HTML(html=page_source).find('.action-bar a.btn.wide')
    target_href = nav_buttons[1].attrs.get('href')
    next_link = urllib.parse.urljoin(domain, target_href)

    return list(map(parse_article_meta, entries)), next_link

def get_paged_meta(url, num_pages):
    """Collect post metadata from `num_pages` consecutive index pages.

    Args:
        url: URL of the first index page.
        num_pages: How many pages to visit; each page supplies the link to
            the one visited after it.

    Returns:
        A flat list of the metadata of every visited page, in visiting order.
    """
    gathered = []
    page_url = url

    for _ in range(num_pages):
        page_posts, follow_up = get_metadata_from(page_url)
        gathered.extend(page_posts)
        # Resolve against the site root so a relative link also works.
        page_url = urllib.parse.urljoin(domain, follow_up)

    return gathered

def create_fields(titles, dates, links, filtered_meta, meta):
    """Append one post's fields to the four accumulator lists (in place).

    Args:
        titles: List receiving `meta['title']`.
        dates: List receiving `meta['date']`.
        links: List receiving `meta['link']` resolved against `domain`.
        filtered_meta: List receiving the `meta` dict itself.
        meta: Post metadata with 'title', 'date' and 'link' keys.

    Returns:
        The same four lists, as a tuple (titles, dates, links, filtered_meta).
    """
    titles.append(meta['title'])
    dates.append(meta['date'])

    # Store an absolute URL so the link is usable outside the site.
    absolute_link = urllib.parse.urljoin(domain, meta['link'])
    links.append(absolute_link)

    filtered_meta.append(meta)
    return (titles, dates, links, filtered_meta)

def filter_metadata(url, num_pages):
    """Crawl `num_pages` index pages and keep posts matching `title_keywords`.

    A post is kept when its title contains at least one keyword from the
    module-level `title_keywords`; when that list is empty every post is kept.

    Args:
        url: URL of the first index page.
        num_pages: Number of index pages to crawl.

    Returns:
        (titles, dates, links, filtered_meta): four parallel lists describing
        the kept posts, filled through `create_fields`.
    """
    titles, dates, links, filtered_meta = [], [], [], []

    for meta in get_paged_meta(url, num_pages):
        # any() keeps each post at most once; looping over the keywords and
        # appending per hit duplicated posts whose title matched more than
        # one keyword.
        if not title_keywords or any(kw in meta['title'] for kw in title_keywords):
            create_fields(titles, dates, links, filtered_meta, meta)

    return titles, dates, links, filtered_meta

def parse_content_data(main_container):
    """Return the article body text held by `main_container`.

    The text is split into lines, everything from the first line that is
    exactly '--' onward is dropped, and so are the first two lines
    (presumably the blank line and the header/meta line of the article page
    -- verify against the live markup).

    Args:
        main_container: Object exposing the page text as `.text`, or None
            when the page has no such container.

    Returns:
        The body as a single string, or '[Deleted]' when `main_container`
        has no usable `.text`.
    """
    try:
        lines = main_container.text.split('\n')
    except AttributeError:
        # None (container not found) or an object without text.
        return '[Deleted]'

    for idx, line in enumerate(lines):
        # Only a line consisting of '--' ends the body. Splitting on every
        # '--' truncated posts containing divider lines ('----------') or
        # inline double dashes.
        if line.strip() == '--':
            # The extra '' keeps the trailing newline the body had before.
            lines = lines[:idx] + ['']
            break

    return '\n'.join(lines[2:])

def parse_content_from(link):
    """Download the article at `link` and return its body text.

    Args:
        link: Absolute URL of the article page.

    Returns:
        The result of `parse_content_data` applied to the element with
        id 'main-container' (or to None when the page has no such element).
    """
    page_source = fetch(link).text
    container = BeautifulSoup(page_source, 'html.parser').find(id='main-container')
    return parse_content_data(container)

def get_contents(metadata):
    """Download the body of every post in `metadata` using a process pool.

    Args:
        metadata: Iterable of post dicts; entries without a 'link' key are
            skipped. Links are resolved against `domain`.

    Returns:
        A list with one `parse_content_from` result per link, in the same
        order as the links. Empty when there is nothing to download.
    """
    post_links = [
        urllib.parse.urljoin(domain, meta['link'])
        for meta in metadata if 'link' in meta
    ]

    if not post_links:
        # Nothing matched: do not spin up 8 worker processes for no work.
        return []

    with Pool(processes=8) as pool:
        # pool.map returns the results as a list in input order.
        return pool.map(parse_content_from, post_links)

## Preview crawled data






In [55]:
# Crawl the board index pages and keep only the posts matching the keywords.
titles, dates, links, filtered_meta = filter_metadata(url, num_pages)
# Download the body of every kept post (same order as the lists above).
contents = get_contents(filtered_meta)

# One row per post; columns: post date, title, article URL, article content.
df = pd.DataFrame(
    dict(zip(
        ['發文日期', '標題', '文章網址', '文章內容'],
        [dates, titles, links, contents],
    ))
)
df

Unnamed: 0,發文日期,標題,文章網址,文章內容
0,10/31,[問題] 新北跟桃園房貸,https://www.ptt.cc/bbs/Loan/M.1635680661.A.EE8...,[房屋資訊]\n地點: 新北和桃園地區(機場捷運沿線)\n類型： 電梯大樓\n坪數: 未定，...
1,11/01,[問題] 首購桃園房貸,https://www.ptt.cc/bbs/Loan/M.1635735875.A.C46...,\n【房屋資訊】\n地點：桃園平鎮區\n類型：電梯大樓\n坪數：權狀34.1坪\n屋齡：24...
2,10/30,[問題] 桃園中壢回復型房貸或理財型房貸,https://www.ptt.cc/bbs/Loan/M.1635582359.A.589...,[房屋資訊]\n地點：桃園中壢中原大學附近\n用途：投資\n類型：電梯大樓/無車位/套房/2...


## Save to CSV

In [53]:
# Write the preview DataFrame to a CSV file in the current working directory.
df.to_csv('ptt_data.csv')
# Hand the file to google.colab's download helper (Colab-only API).
files.download('ptt_data.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>