<a href="https://colab.research.google.com/github/tingchun0113/ptt-crawler/blob/main/ptt_crawler_v1_0_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install and import dependencies

In [None]:
!pip install requests_html
import requests, urllib.parse, re, pandas as pd
from requests_html import HTML
from bs4 import BeautifulSoup
from multiprocessing import Pool
from itertools import repeat
from google.colab import files

## Variables

In [8]:
# Crawl target: the board name and the index page the crawl starts from.
board = 'Loan'  # board name
domain = 'https://www.ptt.cc/'
url = '{}bbs/{}/index.html'.format(domain, board)

# Crawl scope.
num_pages = 10  # number of pages to be crawled
# Keep a post when its title contains any of these keywords,
# e.g. title_keywords = ['房貸', '房屋'] (title contains '房貸' or '房屋').
title_keywords = ['房貸']

## Functions

In [9]:
def fetch(url):
  """Download `url` with a single GET request and return the response.

  The request carries the cookie `over18=1` (presumably so PTT's
  age-confirmation prompt is skipped -- confirm against the site).

  Args:
    url: Address of the page to download.

  Returns:
    The response object returned by `requests.get`.
  """
  # One request only: an extra cookie-less GET whose response is discarded
  # would double the traffic sent to the server for every page.
  return requests.get(url, cookies={'over18': '1'})

def parse_article_entries(doc, el):
  """Parse `doc` as HTML and return every element matching selector `el`.

  Args:
    doc: HTML markup as text.
    el: Selector passed to the parsed document's `find`.

  Returns:
    Whatever `HTML(html=doc).find(el)` yields for the selector.
  """
  return HTML(html=doc).find(el)

def parse_article_meta(entry):
  """Build the metadata dict for one post entry.

  Args:
    entry: Parsed element exposing `find(selector, first=True)`.

  Returns:
    A dict with the keys 'title', 'push', 'date', 'author' and 'link'.
    'author' and 'link' are both '[Deleted]' when looking either of them
    up raises AttributeError (e.g. the title has no anchor element).
  """
  def first_match(selector):
    return entry.find(selector, first=True)

  # These three lookups are not guarded: a missing element propagates
  # as AttributeError to the caller.
  required = (
    ('title', 'div.title'),
    ('push', 'div.nrec'),
    ('date', 'div.date'),
  )
  meta = {key: first_match(selector).text for key, selector in required}

  try:
    author = first_match('div.author').text
    link = first_match('div.title > a').attrs['href']
  except AttributeError:
    author = link = '[Deleted]'

  meta['author'] = author
  meta['link'] = link
  return meta

def get_metadata_from(url):
  """Fetch one index page and return its post metadata plus a paging link.

  Args:
    url: Address of the index page to download.

  Returns:
    A `(metadata, next_link)` tuple: `metadata` is a list with one dict per
    'div.r-ent' entry, `next_link` is the href of the second
    '.action-bar a.btn.wide' control joined onto `domain`.

  Raises:
    IndexError: When the page has fewer than two such controls.
  """
  def resolve_paging_link(doc):
    # The second wide action-bar button is used as the page to visit next
    # (presumably PTT's "previous/older page" button -- TODO confirm).
    buttons = HTML(html=doc).find('.action-bar a.btn.wide')
    return urllib.parse.urljoin(domain, buttons[1].attrs.get('href'))

  page = fetch(url)
  entries = parse_article_entries(page.text, 'div.r-ent')
  next_link = resolve_paging_link(page.text)

  return list(map(parse_article_meta, entries)), next_link

def get_paged_meta(url, num_pages):
  """Collect post metadata from `num_pages` consecutive index pages.

  Args:
    url: Address of the first index page.
    num_pages: How many pages to visit, following each page's paging link.

  Returns:
    One flat list holding the metadata dicts of every visited page, in
    visiting order.
  """
  collected_meta = []
  page_url = url

  for _ in range(num_pages):
    page_posts, paging_link = get_metadata_from(page_url)
    collected_meta.extend(page_posts)
    # Resolve against the site root so a relative link also works.
    page_url = urllib.parse.urljoin(domain, paging_link)

  return collected_meta

def create_fields(titles, dates, links, filtered_meta, meta):
  """Append one post's fields to the running column lists (in place).

  Args:
    titles: List receiving `meta['title']`.
    dates: List receiving `meta['date']`.
    links: List receiving `meta['link']` joined onto `domain`; nothing is
      appended when the link is falsy.
    filtered_meta: List receiving the `meta` dict itself.
    meta: Metadata dict of the post being recorded.

  Returns:
    The same four lists, as the tuple
    `(titles, dates, links, filtered_meta)`.
  """
  for column, key in ((titles, 'title'), (dates, 'date')):
    column.append(meta[key])

  href = meta['link']
  if href:
    links.append(urllib.parse.urljoin(domain, href))

  filtered_meta.append(meta)
  return titles, dates, links, filtered_meta

def filter_metadata(url, num_pages):
  """Crawl `num_pages` index pages and keep the posts that pass the title filter.

  A post is kept when its title contains at least one entry of the
  module-level `title_keywords`; an empty `title_keywords` keeps every post.

  Args:
    url: Address of the first index page.
    num_pages: Number of index pages to crawl.

  Returns:
    The tuple `(titles, dates, links, filtered_meta)` of parallel lists
    filled by `create_fields`, one entry per kept post.
  """
  titles, dates, links, filtered_meta = [], [], [], []

  for meta in get_paged_meta(url, num_pages):
    # any() records a post once even when its title matches several
    # keywords; appending per matching keyword would duplicate the row.
    if not title_keywords or any(keyword in meta['title'] for keyword in title_keywords):
      create_fields(titles, dates, links, filtered_meta, meta)

  return titles, dates, links, filtered_meta

def parse_content_from(link):
  """Download an article page and return its body text.

  The body is the text of the element with id 'main-container' up to the
  first '※ 發信站' marker, with its first two lines dropped.

  Args:
    link: Address of the article page.

  Returns:
    The body text, or '[Deleted]' when extracting it raises AttributeError
    (e.g. the page has no 'main-container' element).
  """
  page = fetch(link)
  container = BeautifulSoup(page.text, 'html.parser').find(id='main-container')

  try:
    before_footer = container.text.split('※ 發信站')[0]
  except AttributeError:
    return '[Deleted]'

  # Skip the first two lines of the container text.
  body_lines = before_footer.split('\n')[2:]
  return '\n'.join(body_lines)

def get_contents(metadata):
  """Download the article body for every post in `metadata`.

  Args:
    metadata: Iterable of post metadata dicts; each dict's 'link' entry is
      joined onto `domain` to form the article address.

  Returns:
    A list with one `parse_content_from` result per metadata entry, in the
    same order as `metadata`.
  """
  # Build the addresses from the argument rather than from a notebook-level
  # `links` variable, so the result always lines up with `metadata`.
  article_urls = [urllib.parse.urljoin(domain, meta['link']) for meta in metadata]
  if not article_urls:
    # Nothing to download: skip starting worker processes.
    return []

  with Pool(processes=8) as pool:
    return pool.map(parse_content_from, article_urls)

## Preview crawled data






In [10]:
# Crawl the index pages, then download the body of every post that was kept.
titles, dates, links, filtered_meta = filter_metadata(url, num_pages)
contents = get_contents(filtered_meta)

# One row per kept post; the column lists are parallel to each other.
df = pd.DataFrame({
    '發文日期': dates,
    '標題': titles,
    '文章網址': links,
    '文章內容': contents,
})
df

Unnamed: 0,發文日期,標題,文章網址,文章內容
0,11/29,[問題] 新北公務員首購房貸,https://www.ptt.cc/bbs/Loan/M.1638196377.A.2E6...,1.地點：新北市中和區（永安市場-景安）\n2.坪數：權狀坪數約18坪\n3.用途：自住\n...
1,11/30,[問題] 台中-房貸轉貸/增貸,https://www.ptt.cc/bbs/Loan/M.1638203458.A.0E8...,1.地點：台中北屯\n2.用途：自住\n3.屋齡：6年\n4.房屋類型：社區大樓\n5.坪數...
2,11/30,[問題]竹北首購房貸 30-40年,https://www.ptt.cc/bbs/Loan/M.1638205422.A.62B...,[房屋資訊]\n地點：竹北市鹿場里\n坪數：大樓/權狀57含車位6\n用途：首購自住(名下無...
3,11/30,[問題] 國泰房貸增貸,https://www.ptt.cc/bbs/Loan/M.1638238308.A.3C3...,\n請教有在國泰辦過增貸的大大\n想在設定內額度已還近半的房貸增貸\n直接問客服 或是找原本...
4,6/19,【 房貸-要注意些什麼 】,https://www.ptt.cc/bbs/Loan/M.1403108148.A.386...,\n\n提供自己本身在銀行服務的經驗\n\n給大家一些申辦房貸時的提醒\n\n\n1.購屋買...
...,...,...,...,...
68,11/22,[問題] 桃園青埔房貸,https://www.ptt.cc/bbs/Loan/M.1637581072.A.024...,\n地點:桃園大園區門牌，離高鐵很近\n類型:社區大樓\n坪數:權狀約80坪（含車位15....
69,11/22,[問題] 台北房貸,https://www.ptt.cc/bbs/Loan/M.1637593142.A.63F...,地點：台北市萬華區\n用途：自住\n型態：華厦（4樓/總樓層9樓）\n屋齡：37年\n房價：...
70,11/23,[問題] 台中房貸,https://www.ptt.cc/bbs/Loan/M.1637633817.A.A08...,地點：台中市\n用途：自住\n型態：透天\n屋齡：22年\n房價：3234\n權狀坪數：土地...
71,11/23,[問題] 台北首購房貸,https://www.ptt.cc/bbs/Loan/M.1637636616.A.82B...,[房屋資訊]\n地點：台北市中山區長春路\n房型：電梯大樓\n坪數：除車位約27坪\n用途：...


## Add columns (e.g. housing_prices, annual_incomes)

In [11]:
# Keywords for each added column (e.g. the keywords that identify the
# "房價" (housing price) column in an article line).
housing_prices_keywords = [
    '房價',
    '售價',
    '屋價',
    '成交',
]
annual_incomes_keywords = [
    '收入',
    '年收',
    '年薪',
]

# Column value lists, filled in below.
housing_prices = []
annual_incomes = []

def get_data_from(content, keywords):
  """Return the value of the first "label: value" line that mentions a keyword.

  Args:
    content: Article text; it is scanned line by line.
    keywords: Substrings to look for in each line.

  Returns:
    The stripped text that follows the line's first ':' or '：'. Returns ''
    when no line both mentions a keyword and contains such a delimiter.
  """
  for line in content.split('\n'):
    if not any(keyword in line for keyword in keywords):
      continue
    # maxsplit=1 keeps a value that itself contains a colon in one piece.
    parts = re.split(r':|：', line, maxsplit=1)
    if len(parts) == 2:
      return parts[1].strip()
    # The keyword appeared in free text without a delimiter; a later line
    # may still carry the labelled value, so keep scanning.

  # Always hand back a string so the resulting column has a uniform type.
  return ''

# Extract one value per article for each added column. This is cheap string
# parsing, so it runs in-process: a worker pool would add start-up and
# pickling cost and requires workers to be able to import get_data_from,
# which is defined inside the notebook.
housing_prices = [get_data_from(content, housing_prices_keywords) for content in contents]
annual_incomes = [get_data_from(content, annual_incomes_keywords) for content in contents]

df['房價'] = housing_prices
df['收入'] = annual_incomes
df

Unnamed: 0,發文日期,標題,文章網址,文章內容,房價,收入
0,11/29,[問題] 新北公務員首購房貸,https://www.ptt.cc/bbs/Loan/M.1638196377.A.2E6...,1.地點：新北市中和區（永安市場-景安）\n2.坪數：權狀坪數約18坪\n3.用途：自住\n...,1000萬,去年扣繳憑單70萬
1,11/30,[問題] 台中-房貸轉貸/增貸,https://www.ptt.cc/bbs/Loan/M.1638203458.A.0E8...,1.地點：台中北屯\n2.用途：自住\n3.屋齡：6年\n4.房屋類型：社區大樓\n5.坪數...,,80
2,11/30,[問題]竹北首購房貸 30-40年,https://www.ptt.cc/bbs/Loan/M.1638205422.A.62B...,[房屋資訊]\n地點：竹北市鹿場里\n坪數：大樓/權狀57含車位6\n用途：首購自住(名下無...,,來信談 負債比偏高 另一半可當保人
3,11/30,[問題] 國泰房貸增貸,https://www.ptt.cc/bbs/Loan/M.1638238308.A.3C3...,\n請教有在國泰辦過增貸的大大\n想在設定內額度已還近半的房貸增貸\n直接問客服 或是找原本...,,
4,6/19,【 房貸-要注意些什麼 】,https://www.ptt.cc/bbs/Loan/M.1403108148.A.386...,\n\n提供自己本身在銀行服務的經驗\n\n給大家一些申辦房貸時的提醒\n\n\n1.購屋買...,,
...,...,...,...,...,...,...
68,11/22,[問題] 桃園青埔房貸,https://www.ptt.cc/bbs/Loan/M.1637581072.A.024...,\n地點:桃園大園區門牌，離高鐵很近\n類型:社區大樓\n坪數:權狀約80坪（含車位15....,2500萬,
69,11/22,[問題] 台北房貸,https://www.ptt.cc/bbs/Loan/M.1637593142.A.63F...,地點：台北市萬華區\n用途：自住\n型態：華厦（4樓/總樓層9樓）\n屋齡：37年\n房價：...,1900,約140萬/其它60萬
70,11/23,[問題] 台中房貸,https://www.ptt.cc/bbs/Loan/M.1637633817.A.A08...,地點：台中市\n用途：自住\n型態：透天\n屋齡：22年\n房價：3234\n權狀坪數：土地...,3234,250-300萬
71,11/23,[問題] 台北首購房貸,https://www.ptt.cc/bbs/Loan/M.1637636616.A.82B...,[房屋資訊]\n地點：台北市中山區長春路\n房型：電梯大樓\n坪數：除車位約27坪\n用途：...,2360萬,年收100+萬


## Save to CSV

In [12]:
# Write the table to a CSV file, then hand that file to the browser
# through Colab's download helper.
output_csv = 'ptt_data.csv'
df.to_csv(output_csv)
files.download(output_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>