<a href="https://colab.research.google.com/github/tingchun0113/ptt-crawler/blob/main/ptt_crawler_v1_0_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install and import dependencies

In [None]:
!pip install requests_html
import requests, urllib.parse, re, pandas as pd
from requests_html import HTML
from bs4 import BeautifulSoup
from multiprocessing import Pool
from itertools import repeat
from google.colab import files

## Variables

In [None]:
# Name of the PTT board to crawl.
board = 'Loan'
domain = 'https://www.ptt.cc/'
# Index page of the board; this is the URL the crawl starts from.
url = f'{domain}bbs/{board}/index.html'

# Number of index pages to crawl.
num_pages = 10
# Title filter: a post is kept when its title contains any of these keywords,
# e.g. ['房貸', '房屋'] keeps titles containing '房貸' or '房屋'.
# An empty list keeps every post.
title_keywords = ['房貸']

## Functions

In [None]:
def fetch(url):
  """Download a page with a single GET request.

  The request carries the cookie ``over18=1`` (presumably what lets the
  request through PTT's age-confirmation page -- confirm against the site).

  The previous version issued an extra cookie-less GET first and threw its
  response away, which doubled the traffic for every page and article.

  Args:
    url: Absolute URL to download.

  Returns:
    The ``requests.Response`` for ``url``.
  """
  return requests.get(url, cookies={'over18': '1'})

def parse_article_entries(doc, el):
  """Return every element of an HTML document that matches a CSS selector.

  Args:
    doc: HTML markup as a string.
    el: CSS selector to search for.

  Returns:
    Whatever ``requests_html.HTML.find`` yields for ``el`` (the matching
    elements).
  """
  return HTML(html=doc).find(el)

def parse_article_meta(entry):
  """Collect title, push count, date, author and link from one list entry.

  Args:
    entry: Object exposing ``find(selector, first=True)`` whose result has
      ``.text`` and ``.attrs``.

  Returns:
    dict with the keys ``title``, ``push``, ``date``, ``author`` and ``link``.
    If looking up the author or the title link raises ``AttributeError``
    (a lookup returned ``None``), both ``author`` and ``link`` are set to
    ``'[Deleted]'``.
  """
  always_present = (
    ('title', 'div.title'),
    ('push', 'div.nrec'),
    ('date', 'div.date'),
  )
  meta = {key: entry.find(css, first=True).text for key, css in always_present}

  try:
    author = entry.find('div.author', first=True).text
    link = entry.find('div.title > a', first=True).attrs['href']
  except AttributeError:
    # A missing node invalidates both fields, even if the author was found.
    author = link = '[Deleted]'

  meta['author'] = author
  meta['link'] = link
  return meta

def get_metadata_from(url):
  """Fetch one index page and return its post metadata plus the follow-up URL.

  Args:
    url: URL of a board index page.

  Returns:
    ``(metadata, next_link)``: a list with one ``parse_article_meta`` dict per
    ``div.r-ent`` entry, and the absolute URL of the page to crawl next.
  """

  def parse_next_link(doc):
    # The follow-up page is the href of the second '.action-bar a.btn.wide'
    # control (presumably PTT's "previous page" button -- verify against the
    # live markup). A missing href yields None, which urljoin turns into
    # `domain` itself.
    buttons = HTML(html=doc).find('.action-bar a.btn.wide')
    href = buttons[1].attrs.get('href')
    return urllib.parse.urljoin(domain, href)

  page_html = fetch(url).text
  entries = parse_article_entries(page_html, 'div.r-ent')
  next_link = parse_next_link(page_html)

  return [parse_article_meta(entry) for entry in entries], next_link

def get_paged_meta(url, num_pages):
  """Walk ``num_pages`` index pages starting at ``url`` and pool their metadata.

  Args:
    url: URL of the first index page.
    num_pages: How many pages to visit; 0 visits none.

  Returns:
    One flat list with the metadata dicts of every visited page, in visit
    order.
  """
  all_posts = []
  page_url = url

  for _page in range(num_pages):
    page_posts, following = get_metadata_from(page_url)
    all_posts.extend(page_posts)
    # Resolve against the site root before the next round.
    page_url = urllib.parse.urljoin(domain, following)

  return all_posts

def create_fields(titles, dates, links, filtered_meta, meta):
  """Append one post's fields to the running result lists (mutated in place).

  Args:
    titles: List receiving ``meta['title']``.
    dates: List receiving ``meta['date']``.
    links: List receiving the absolute article URL. Nothing is appended when
      ``meta['link']`` is falsy, so this list can end up shorter than the
      others.
    filtered_meta: List receiving ``meta`` itself.
    meta: Metadata dict with the keys ``title``, ``date`` and ``link``.

  Returns:
    The same four list objects, as ``(titles, dates, links, filtered_meta)``.
  """
  titles.append(meta['title'])
  dates.append(meta['date'])

  relative_link = meta['link']
  if relative_link:
    absolute_link = urllib.parse.urljoin(domain, relative_link)
    links.append(absolute_link)

  filtered_meta.append(meta)
  return titles, dates, links, filtered_meta

def filter_metadata(url, num_pages):
  """Crawl index pages and keep the posts whose title matches the keyword filter.

  The filter is the module-level ``title_keywords`` list: a post is kept when
  its title contains at least one keyword, and every post is kept when the
  list is empty.

  Each post is recorded at most once. The previous version appended a post
  once per matching keyword, so a title containing two keywords produced two
  identical rows.

  Args:
    url: URL of the first index page.
    num_pages: Number of index pages to crawl.

  Returns:
    ``(titles, dates, links, filtered_meta)`` as filled in by
    ``create_fields`` for every kept post.
  """
  titles, dates, links, filtered_meta = [], [], [], []

  for meta in get_paged_meta(url, num_pages):
    keep = (not title_keywords
            or any(keyword in meta['title'] for keyword in title_keywords))
    if keep:
      create_fields(titles, dates, links, filtered_meta, meta)

  return titles, dates, links, filtered_meta

def parse_content_from(link):
  """Download an article page and return its body text.

  The text of the ``#main-container`` element is cut at the first
  ``'※ 發信站'`` marker, and the first two lines of what remains are dropped.

  Args:
    link: Absolute URL of the article.

  Returns:
    The remaining lines joined with newlines, or ``'[Deleted]'`` when the
    page has no ``#main-container`` element.
  """
  page = fetch(link)
  container = BeautifulSoup(page.text, 'html.parser').find(id='main-container')

  try:
    before_footer = container.text.split('※ 發信站')[0]
  except AttributeError:
    # `container` is None when the page lacks the expected element.
    return '[Deleted]'

  return '\n'.join(before_footer.split('\n')[2:])

def get_contents(metadata):
  """Download the article body for every post in ``metadata``, in parallel.

  The URLs are derived from ``metadata`` itself. The previous version ignored
  its argument and read the notebook-global ``links`` list, so it only worked
  if that global happened to exist and matched the metadata. URLs are built
  with the same rule ``create_fields`` uses: join ``domain`` with each truthy
  ``meta['link']`` and skip falsy links.

  Args:
    metadata: Iterable of metadata dicts, each with a ``link`` key.

  Returns:
    List of article bodies in the order of the derived URLs; ``[]`` when
    there is nothing to download.
  """
  article_links = [
    urllib.parse.urljoin(domain, meta['link'])
    for meta in metadata
    if meta['link']
  ]
  if not article_links:
    # No work to do: skip starting the worker processes.
    return []

  with Pool(processes=8) as pool:
    return pool.map(parse_content_from, article_links)

## Preview crawled data






In [None]:
# Crawl the index pages, then download the body of every kept post.
titles, dates, links, filtered_meta = filter_metadata(url, num_pages)
contents = get_contents(filtered_meta)

# One row per post: post date, title, article URL, article body.
df = pd.DataFrame(
    data={
        '發文日期': dates,
        '標題': titles,
        '文章網址': links,
        '文章內容': contents,
    }
)
df

Unnamed: 0,發文日期,標題,文章網址,文章內容
0,3/07,[問題] 基隆首購房貸,https://www.ptt.cc/bbs/Loan/M.1646605162.A.2E6...,[房屋資訊]\n地點：基隆市信義區\n用途：自住首購\n屋齡：41年\n房屋類型：公寓\n權...
1,3/07,[問題] 首購房貸 基隆 新北,https://www.ptt.cc/bbs/Loan/M.1646608990.A.016...,[房屋資訊]\n地點：基隆市信義區\n用途：自住首購\n屋齡：40年\n房屋類型：透天\n權...
2,3/07,[問題] 新北首購房貸,https://www.ptt.cc/bbs/Loan/M.1646614566.A.D2C...,[房屋資訊]\n1.地點：新北市新莊區 (頭前重劃區)\n2.貸款用途：自住(首購)\n3....
3,3/07,[問題] 台中房貸 首購,https://www.ptt.cc/bbs/Loan/M.1646616330.A.74A...,[房屋資訊]\n1.地點：台中 東勢區重劃區\n2.屋齡：新成屋 2年\n3.房屋類型︰住家...
4,3/07,[問題] 新竹竹東 房貸增貸,https://www.ptt.cc/bbs/Loan/M.1646617822.A.BFC...,1.地點：新竹縣竹東鎮（兩房一廳一衛一車）\n2.貸款用途：投資理財\n3.屋齡：約3年\n...
...,...,...,...,...
76,3/01,[問題] 工程師首購房貸 （想用太太名字）,https://www.ptt.cc/bbs/Loan/M.1646104999.A.ED9...,\n\n1. 地點：土城海山\n2. 屋齡：2年\n3. 房屋類型：大樓\n4. 權狀坪數...
77,3/01,[問題] 桃園房貸增貸/轉增貸,https://www.ptt.cc/bbs/Loan/M.1646105605.A.660...,1.地點：桃園區\n2.貸款用途：自住\n3.屋齡：9年\n4.權狀坪數: 約50坪(含一平...
78,3/01,[問題] 高雄首購房貸,https://www.ptt.cc/bbs/Loan/M.1646113095.A.A91...,［房屋資訊 ]\n1.地點：高雄市三民區九如一路\n2.屋齡：29年\n3.房屋類型︰電梯大...
79,3/01,問題/房貸增貸,https://www.ptt.cc/bbs/Loan/M.1646115409.A.246...,1.地點：高雄市甲仙區\n2.貸款用途：買新北預售屋\n3.屋齡：36\n4.權狀坪數: 房...


## Add columns

In [None]:
# Keywords that locate the source line for each new column
# (e.g. the '房價' column is filled from a line containing any housing-price keyword).
housing_prices_keywords = [
    '房價',
    '售價',
    '屋價',
    '成交',
]
annual_incomes_keywords = [
    '收入',
    '年收',
    '年薪',
]

# Start both result lists empty.
housing_prices, annual_incomes = [], []

def get_data_from(content, keywords):
  """Return the value written after the first "keyword: value" line of a post.

  Lines are scanned top to bottom. The first line that contains any keyword
  and a colon (``:``, ``：`` or ``︰``) wins; everything after its first colon
  is returned, stripped of surrounding whitespace.

  Changes from the previous version:
    * always returns a string (it used to return ``None`` when no line
      matched, but ``''`` when the matching line had no colon);
    * a keyword line without a colon no longer ends the search -- later
      lines are still tried;
    * the ``︰`` colon variant is recognised as a separator;
    * only the first colon splits the line, so values that themselves
      contain a colon are not truncated.

  Args:
    content: Article body as one newline-separated string.
    keywords: Substrings that identify the wanted line.

  Returns:
    The extracted value, or ``''`` when no usable line exists.
  """
  for line in content.split('\n'):
    if not any(keyword in line for keyword in keywords):
      continue
    label_and_value = re.split(r'[:：︰]', line, maxsplit=1)
    if len(label_and_value) == 2:
      return label_and_value[1].strip()
    # Keyword appears in running text without a "label: value" shape.
  return ''

# Extract both columns with list comprehensions rather than a process pool:
# the parsing is cheap string work, and a pool needs its workers to resolve
# the notebook-defined `get_data_from`, which does not work on every platform.
housing_prices = [
    get_data_from(content, housing_prices_keywords) for content in contents
]
annual_incomes = [
    get_data_from(content, annual_incomes_keywords) for content in contents
]

# Add the extracted values as the housing-price and income columns.
df['房價'] = housing_prices
df['收入'] = annual_incomes
df

Unnamed: 0,發文日期,標題,文章網址,文章內容,房價,收入
0,3/07,[問題] 基隆首購房貸,https://www.ptt.cc/bbs/Loan/M.1646605162.A.2E6...,[房屋資訊]\n地點：基隆市信義區\n用途：自住首購\n屋齡：41年\n房屋類型：公寓\n權...,150萬（還沒買價格上不確定）,45萬
1,3/07,[問題] 首購房貸 基隆 新北,https://www.ptt.cc/bbs/Loan/M.1646608990.A.016...,[房屋資訊]\n地點：基隆市信義區\n用途：自住首購\n屋齡：40年\n房屋類型：透天\n權...,900萬（還沒買價格上不確定）,80萬
2,3/07,[問題] 新北首購房貸,https://www.ptt.cc/bbs/Loan/M.1646614566.A.D2C...,[房屋資訊]\n1.地點：新北市新莊區 (頭前重劃區)\n2.貸款用途：自住(首購)\n3....,2120萬,95-100萬
3,3/07,[問題] 台中房貸 首購,https://www.ptt.cc/bbs/Loan/M.1646616330.A.74A...,[房屋資訊]\n1.地點：台中 東勢區重劃區\n2.屋齡：新成屋 2年\n3.房屋類型︰住家...,2088,無
4,3/07,[問題] 新竹竹東 房貸增貸,https://www.ptt.cc/bbs/Loan/M.1646617822.A.BFC...,1.地點：新竹縣竹東鎮（兩房一廳一衛一車）\n2.貸款用途：投資理財\n3.屋齡：約3年\n...,約800萬（現利率1.31%、30年）,約120萬
...,...,...,...,...,...,...
76,3/01,[問題] 工程師首購房貸 （想用太太名字）,https://www.ptt.cc/bbs/Loan/M.1646104999.A.ED9...,\n\n1. 地點：土城海山\n2. 屋齡：2年\n3. 房屋類型：大樓\n4. 權狀坪數...,2350,
77,3/01,[問題] 桃園房貸增貸/轉增貸,https://www.ptt.cc/bbs/Loan/M.1646105605.A.660...,1.地點：桃園區\n2.貸款用途：自住\n3.屋齡：9年\n4.權狀坪數: 約50坪(含一平...,參考實價登錄約1400萬,200萬以上
78,3/01,[問題] 高雄首購房貸,https://www.ptt.cc/bbs/Loan/M.1646113095.A.A91...,［房屋資訊 ]\n1.地點：高雄市三民區九如一路\n2.屋齡：29年\n3.房屋類型︰電梯大...,835,60萬
79,3/01,問題/房貸增貸,https://www.ptt.cc/bbs/Loan/M.1646115409.A.246...,1.地點：高雄市甲仙區\n2.貸款用途：買新北預售屋\n3.屋齡：36\n4.權狀坪數: 房...,,80萬以上


## Save to CSV

In [None]:
# Write the table to a CSV file, then have Colab offer it for download.
csv_name = 'ptt_data.csv'
df.to_csv(csv_name)
files.download(csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>