<a href="https://colab.research.google.com/github/tingchun0113/ptt-crawler/blob/main/ptt_crawler_v1_0_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install and import dependencies

In [None]:
!pip install requests_html
import requests, urllib.parse, re, pandas as pd
from requests_html import HTML
from bs4 import BeautifulSoup
from multiprocessing import Pool
from itertools import repeat
from google.colab import files

## Variables

In [2]:
board = 'Loan'  # name of the PTT board to crawl
domain = 'https://www.ptt.cc/'
url = f'{domain}bbs/{board}/index.html'  # index page the crawl starts from

num_pages = 10  # number of pages to be crawled
# Titles must contain at least one of these keywords,
# e.g. title_keywords = ['房貸', '房屋'] keeps titles containing '房貸' or '房屋'
title_keywords = ['房貸']

## Functions

In [3]:
def fetch(url):
  """Download ``url`` with a single GET request and return the response.

  Args:
    url: Absolute URL of the page to download.

  Returns:
    The ``requests`` response object; callers read its ``.text``.
  """
  # The 'over18' cookie is sent with every request
  # (presumably to skip PTT's age-confirmation page -- TODO confirm).
  # Only one request is issued per call so each page is downloaded once.
  return requests.get(url, cookies={'over18': '1'})

def parse_article_entries(doc, el):
  """Parse ``doc`` as HTML and return the elements matching selector ``el``.

  Args:
    doc: HTML source text of a page.
    el: Selector string handed to ``HTML.find``.

  Returns:
    Whatever ``HTML.find`` yields for the selector (one item per match).
  """
  return HTML(html=doc).find(el)

def parse_article_meta(entry):
  """Extract title, push count, date, author and link from one post entry.

  Args:
    entry: Object exposing ``find(selector, first=True)`` whose result has
      ``.text`` and ``.attrs`` (one item returned by parse_article_entries).

  Returns:
    Dict with the keys 'title', 'push', 'date', 'author' and 'link'.
    When the author or the title link element is missing, both 'author'
    and 'link' are set to '[Deleted]'.
  """
  def text_of(selector):
    # Text of the first element matching the selector
    return entry.find(selector, first=True).text

  meta = {
    'title': text_of('div.title'),
    'push': text_of('div.nrec'),
    'date': text_of('div.date'),
  }

  try:
    author = text_of('div.author')
    link = entry.find('div.title > a', first=True).attrs['href']
  except AttributeError:
    # find() gave None for one of the elements, so there is nothing to link to
    author = link = '[Deleted]'

  meta['author'] = author
  meta['link'] = link
  return meta

def get_metadata_from(url):
  """Fetch one index page and return its post metadata and the next page URL.

  Args:
    url: URL of the board index page to read.

  Returns:
    Tuple ``(metadata, next_link)``: a list with one parse_article_meta dict
    per 'div.r-ent' entry, and the absolute URL taken from the second
    '.action-bar a.btn.wide' control (presumably the older-page button --
    TODO confirm against the PTT markup).

  Raises:
    IndexError: if the page has fewer than two matching paging controls.
  """
  response = fetch(url)
  entries = parse_article_entries(response.text, 'div.r-ent')

  # Resolve the paging link before building the metadata list
  paging_controls = HTML(html=response.text).find('.action-bar a.btn.wide')
  paging_href = paging_controls[1].attrs.get('href')
  next_link = urllib.parse.urljoin(domain, paging_href)

  return [parse_article_meta(entry) for entry in entries], next_link

def get_paged_meta(url, num_pages):
  """Collect post metadata from ``num_pages`` consecutive index pages.

  Args:
    url: URL of the first index page.
    num_pages: How many pages to walk, following each page's next link.

  Returns:
    A single flat list with the metadata dicts of every visited page.
  """
  gathered = []
  page_url = url

  for _page in range(num_pages):
    page_meta, following = get_metadata_from(page_url)
    gathered.extend(page_meta)
    # Continue from the link reported by the page just read
    page_url = urllib.parse.urljoin(domain, following)

  return gathered

def create_fields(titles, dates, links, filtered_meta, meta):
  """Append one post's fields to the running result lists (in place).

  Args:
    titles: List receiving ``meta['title']``.
    dates: List receiving ``meta['date']``.
    links: List receiving the absolute post URL; nothing is appended when
      ``meta['link']`` is falsy.
    filtered_meta: List receiving the ``meta`` dict itself.
    meta: Metadata dict of a single post.

  Returns:
    The same four lists, as the tuple (titles, dates, links, filtered_meta).
  """
  for bucket, key in ((titles, 'title'), (dates, 'date')):
    bucket.append(meta[key])

  relative_link = meta['link']
  if relative_link:
    # Links on the index page are site-relative; make them absolute
    links.append(urllib.parse.urljoin(domain, relative_link))

  filtered_meta.append(meta)
  return titles, dates, links, filtered_meta

def filter_metadata(url, num_pages):
  """Crawl the index pages and keep the posts whose title matches a keyword.

  A post is kept when ``title_keywords`` is empty, or when its title contains
  at least one of the keywords. Each post is collected at most once, even if
  its title contains several of the keywords.

  Args:
    url: URL of the first index page.
    num_pages: Number of index pages to crawl.

  Returns:
    Tuple ``(titles, dates, links, filtered_meta)`` of lists filled by
    create_fields for every kept post.
  """
  titles, dates, links, filtered_meta = [], [], [], []

  for meta in get_paged_meta(url, num_pages):
    # any() stops at the first hit, so a title matching two keywords
    # does not produce two identical rows
    if not title_keywords or any(keyword in meta['title'] for keyword in title_keywords):
      create_fields(titles, dates, links, filtered_meta, meta)

  return titles, dates, links, filtered_meta

def parse_content_from(link):
  """Download a post and return its body text.

  The body is the text of the element with id 'main-container', cut at the
  first '※ 發信站' marker and with its first two lines dropped.

  Args:
    link: URL of the post.

  Returns:
    The body text, or '[Deleted]' when the page has no 'main-container'
    element.
  """
  page = BeautifulSoup(fetch(link).text, 'html.parser')
  container = page.find(id='main-container')

  try:
    before_footer = container.text.split('※ 發信站')[0]
  except AttributeError:
    # find() returned None: the container is missing from the page
    return '[Deleted]'

  return '\n'.join(before_footer.split('\n')[2:])

def get_contents(metadata):
  """Download the body text of every post described in ``metadata``.

  Args:
    metadata: List of post metadata dicts, each with a 'link' key
      (as collected by filter_metadata).

  Returns:
    List of body texts from parse_content_from, in the order of the posts
    that have a link.
  """
  # Build the URLs from the argument instead of reading a notebook-level
  # variable, so the result depends only on what the caller passes in.
  # Entries with an empty link are skipped, as create_fields does.
  post_links = [
    urllib.parse.urljoin(domain, meta['link'])
    for meta in metadata
    if meta['link']
  ]

  with Pool(processes=8) as pool:
    return pool.map(parse_content_from, post_links)

## Preview crawled data






In [None]:
# Crawl the index pages, keep the posts matching title_keywords,
# then download the body of each kept post
titles, dates, links, filtered_meta = filter_metadata(url, num_pages)
contents = get_contents(filtered_meta)

# One row per kept post
df = pd.DataFrame({
  '發文日期': dates,
  '標題': titles,
  '文章網址': links,
  '文章內容': contents,
})
df

## Add columns

In [5]:
# Keywords for the added columns (e.g. which keywords mark a line as belonging to the 「房價」 column)
housing_prices_keywords = ['房價', '售價', '屋價', '成交']
annual_incomes_keywords = ['收入', '年收', '年薪']

# Placeholders; both names are reassigned with the pool.starmap results further down in this cell
housing_prices = []
annual_incomes = []

def get_data_from(content, keywords):
  """Return the value written after the colon on the first usable keyword line.

  The content is scanned line by line. The first line that contains any of
  the keywords AND a colon (ASCII ':' or full-width '：') wins; the text
  between its first and second colon is returned, stripped. Keyword lines
  without a colon are skipped and the scan continues.

  Args:
    content: Post body text; lines are separated by newlines.
    keywords: Strings that identify the wanted line.

  Returns:
    The extracted value, or '' when no line provides one. A string is
    always returned, so the resulting column has a single type.
  """
  for line in content.split('\n'):
    if not any(keyword in line for keyword in keywords):
      continue

    parts = re.split(r':|：', line)
    if len(parts) > 1:
      return parts[1].strip()
    # Keyword mentioned outside a "label: value" line; keep looking

  return ''

# Scan every post body in parallel, once per added column
with Pool(processes=8) as pool:
  housing_prices = pool.starmap(
    get_data_from,
    [(content, housing_prices_keywords) for content in contents],
  )
  annual_incomes = pool.starmap(
    get_data_from,
    [(content, annual_incomes_keywords) for content in contents],
  )

# Attach the extracted values as new columns and show the table
df['房價'] = housing_prices
df['收入'] = annual_incomes
df

Unnamed: 0,發文日期,標題,文章網址,文章內容,房價,收入
0,12/15,[問題] 新北首購房貸,https://www.ptt.cc/bbs/Loan/M.1639556459.A.919...,[房屋資訊]\n地點：新北市三重區德厚街\n坪數：20.63\n房價：1198萬 (開價)\...,1198萬 (開價),68萬
1,12/15,[問題] 房貸 台中首購 醫師,https://www.ptt.cc/bbs/Loan/M.1639557479.A.078...,"地點：台中市南屯區\n屋齡：0年（預售）\n類型：電梯大樓\n坪數：約47坪 (雙車位,另計...",2200萬,200萬
2,12/16,[問題] 台北首購房貸,https://www.ptt.cc/bbs/Loan/M.1639627736.A.5B2...,[房屋資訊]\n\n地點：台北市文山區\n\n類型：公寓頂樓+頂加\n\n坪數：權狀23.4...,1250萬左右,
3,6/19,【 房貸-要注意些什麼 】,https://www.ptt.cc/bbs/Loan/M.1403108148.A.386...,\n\n提供自己本身在銀行服務的經驗\n\n給大家一些申辦房貸時的提醒\n\n\n1.購屋買...,,
4,12/14,[問題] 桃園房貸首購,https://www.ptt.cc/bbs/Loan/M.1639483566.A.9C1...,[房屋資訊]\n地點：桃園市中壢區環西路二段\n坪數：33.5坪(含平面車位）\n房價：89...,890萬,120萬
5,12/15,[問題] 台中首購房貸,https://www.ptt.cc/bbs/Loan/M.1639501095.A.652...,地點：台中市南屯區向上路二段\n屋齡：27年\n類型：電梯大樓\n坪數：約29坪 (含車位）...,850萬,80萬
6,12/15,[請益] 新北中和46年公寓85或8+1房貸,https://www.ptt.cc/bbs/Loan/M.1639530859.A.E3D...,徵求新北中和46年公寓85或8+1房貸\n若有以上條件請站內\n若可貸30年或以上的更好\n...,,
7,12/15,[問題] 台北市南港區首購房貸(45年公寓),https://www.ptt.cc/bbs/Loan/M.1639550275.A.A6F...,地點：台北市南港區合順街\n屋年：45年\n類型：公寓\n所在樓層：4樓+5樓頂加\n坪數：...,1000萬,110萬
8,12/15,[問題]基隆首購房貸,https://www.ptt.cc/bbs/Loan/M.1639554609.A.B41...,[房屋資訊]\n地點：基隆市\n房價：透天約1060萬\n坪數：59坪(含平面車位)\n用途...,透天約1060萬,約120萬
9,12/14,[問題] 台中首購房貸,https://www.ptt.cc/bbs/Loan/M.1639457767.A.D9B...,1.地點：台中市北屯區太順路\n2.屋齡：2年\n3.房屋類型：電梯大樓\n4.坪數：47....,1260萬,年收約82萬


## Save to CSV

In [6]:
# Write the table (including its index column) to 'ptt_data.csv' in the working directory
df.to_csv('ptt_data.csv')
# Hand the file to google.colab's download helper so it is saved to the user's machine
files.download('ptt_data.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>