<a href="https://colab.research.google.com/github/tingchun0113/ptt-crawler/blob/main/ptt_crawler_v1_0_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install and import dependencies

In [None]:
!pip install requests_html
import requests, urllib.parse, re, pandas as pd
from requests_html import HTML
from bs4 import BeautifulSoup
from multiprocessing import Pool
from google.colab import files

## Variables

In [None]:
board = 'Loan'  # name of the PTT board to crawl
domain = 'https://www.ptt.cc/'
url = ''.join((domain, 'bbs/', board, '/index.html'))

num_pages = 5  # number of index pages to crawl
# A post is kept when its title contains at least one of these keywords,
# e.g. ['房貸', '房屋'] keeps titles containing '房貸' or '房屋'.
# filter_metadata keeps every post when this list is empty.
title_keywords = ['房貸']
# Keywords looked for in the lines of a post body to locate the housing price.
housing_prices_keywords = ('房價', '售價', '屋價', '成交')
# Keywords looked for in the lines of a post body to locate the annual income.
annual_incomes_keywords = ('收入', '年收', '年薪')

## Functions

In [None]:
def fetch(url):
  """Request ``url`` with the ``over18`` cookie set and return the response.

  Args:
    url: URL passed straight to ``requests.get``.

  Returns:
    The response object produced by ``requests.get``.
  """
  # Issue exactly one request. A second, cookie-less GET would only double the
  # traffic because its response is never used.
  # NOTE(review): the 'over18' cookie presumably bypasses an age-confirmation
  # page on restricted boards - confirm against the site.
  return requests.get(url, cookies={'over18': '1'})

def parse_article_entries(doc, el):
  """Parse the HTML text ``doc`` and return whatever ``find(el)`` yields.

  Args:
    doc: HTML source text.
    el: Selector handed to ``HTML.find``.
  """
  return HTML(html=doc).find(el)

def parse_article_meta(entry):
    """Build the metadata dict for one post entry of an index page.

    Args:
        entry: Element exposing ``find(selector, first=True)``.

    Returns:
        A dict with the keys 'title', 'push', 'date', 'author' and 'link'.
        When looking up the author or the link raises ``AttributeError``
        (a lookup returned ``None``), both 'author' and 'link' are set to
        '[Deleted]'.
    """
    def first_match(selector):
        return entry.find(selector, first=True)

    meta = {}
    meta['title'] = first_match('div.title').text
    meta['push'] = first_match('div.nrec').text
    meta['date'] = first_match('div.date').text

    try:
        author = first_match('div.author').text
        link = first_match('div.title > a').attrs['href']
    except AttributeError:
        # Either element is missing: mark both fields, even if the author
        # itself was readable.
        author = link = '[Deleted]'

    meta['author'] = author
    meta['link'] = link
    return meta

def get_metadata_from(url):
    """Fetch one index page and return its post metadata and a follow-up link.

    Args:
        url: URL of the index page to download.

    Returns:
        A ``(metadata, next_link)`` tuple: ``metadata`` is a list with one
        ``parse_article_meta`` dict per 'div.r-ent' entry, and ``next_link``
        is the href of the second '.action-bar a.btn.wide' control joined
        onto ``domain``.
    """
    page_source = fetch(url).text
    entries = parse_article_entries(page_source, 'div.r-ent')

    # NOTE(review): index 1 is presumably the "previous (older) page" button
    # of the paging bar - confirm against the live markup.
    paging_controls = HTML(html=page_source).find('.action-bar a.btn.wide')
    next_link = urllib.parse.urljoin(domain, paging_controls[1].attrs.get('href'))

    metadata = []
    for entry in entries:
        metadata.append(parse_article_meta(entry))
    return metadata, next_link

def get_paged_meta(url, num_pages):
    """Collect post metadata from ``num_pages`` consecutive index pages.

    Starting at ``url``, each page is read with ``get_metadata_from`` and the
    link it returns (joined onto ``domain``) becomes the next page to read.

    Args:
        url: URL of the first index page.
        num_pages: How many pages to walk.

    Returns:
        One flat list holding the metadata dicts of every visited page, in
        visiting order.
    """
    all_meta = []
    page_url = url

    for _page in range(num_pages):
        page_meta, following_link = get_metadata_from(page_url)
        all_meta.extend(page_meta)
        page_url = urllib.parse.urljoin(domain, following_link)

    return all_meta

def create_fields(titles, dates, links, filtered_meta, meta):
    """Append the fields of one post to the four accumulator lists.

    Args:
        titles: List receiving ``meta['title']``.
        dates: List receiving ``meta['date']``.
        links: List receiving ``meta['link']`` joined onto ``domain``.
        filtered_meta: List receiving ``meta`` itself.
        meta: Metadata dict of the post.

    Returns:
        The same four lists, mutated in place, as a tuple.
    """
    post_title = meta['title']
    titles.append(post_title)

    post_date = meta['date']
    dates.append(post_date)

    absolute_link = urllib.parse.urljoin(domain, meta['link'])
    links.append(absolute_link)

    filtered_meta.append(meta)
    return (titles, dates, links, filtered_meta)

def filter_metadata(url, num_pages):
    """Crawl index pages and keep the posts whose title matches a keyword.

    Args:
        url: URL of the first index page.
        num_pages: Number of index pages to crawl.

    Returns:
        A ``(titles, dates, links, filtered_meta)`` tuple of parallel lists,
        one item per kept post. Every post is kept when ``title_keywords`` is
        empty; otherwise a post is kept when its title contains at least one
        keyword.
    """
    titles, dates, links, filtered_meta = [], [], [], []

    for meta in get_paged_meta(url, num_pages):
        # any() records a post once even if its title contains several of the
        # keywords; looping over the keywords and appending on every hit
        # would produce duplicate rows.
        if not title_keywords or any(keyword in meta['title'] for keyword in title_keywords):
            create_fields(titles, dates, links, filtered_meta, meta)

    return titles, dates, links, filtered_meta

def parse_content_from(link):
    """Download a post and return its body text.

    The text of the element with id 'main-container' is cut at the first
    '※ 發信站' marker, and the first two lines of what remains are dropped.

    Args:
        link: URL of the post.

    Returns:
        The remaining lines joined with newlines, or '[Deleted]' when reading
        the container raises ``AttributeError`` (no such element was found).
    """
    page = fetch(link)
    container = BeautifulSoup(page.text, 'html.parser').find(id='main-container')

    try:
        before_footer = container.text.split('※ 發信站', 1)[0]
        body_lines = before_footer.split('\n')[2:]
    except AttributeError:
        return '[Deleted]'
    return '\n'.join(body_lines)

def _first_labelled_value(link, keywords):
    """Return the value after the colon on the first usable keyword line.

    The post body is scanned line by line. A line is a candidate when it
    contains any of ``keywords``; its value is the text between the first and
    the second colon (ASCII ':' or full-width '：'), stripped of whitespace.

    Args:
        link: URL of the post, passed to ``parse_content_from``.
        keywords: Iterable of substrings that mark a relevant line.

    Returns:
        The extracted value, or '' when no keyword line carries a colon.
    """
    for line in parse_content_from(link).split('\n'):
        if not any(keyword in line for keyword in keywords):
            continue
        parts = re.split(r':|：', line)
        if len(parts) > 1:
            return parts[1].strip()
        # The keyword appeared in free text without a "label: value" shape;
        # keep scanning, since a later line may still carry the value.
    # Always hand back a string so callers never receive None.
    return ''


def get_housing_price_from(link):
    """Extract the housing price stated in the post at ``link``.

    Args:
        link: URL of the post.

    Returns:
        The text following the colon on the first line that contains one of
        ``housing_prices_keywords`` and a colon, or '' when there is none.
    """
    return _first_labelled_value(link, housing_prices_keywords)


def get_annual_income_from(link):
    """Extract the annual income stated in the post at ``link``.

    Args:
        link: URL of the post.

    Returns:
        The text following the colon on the first line that contains one of
        ``annual_incomes_keywords`` and a colon, or '' when there is none.
    """
    return _first_labelled_value(link, annual_incomes_keywords)

def get_contents(metadata):
    """Download every post in ``metadata`` and extract body, price and income.

    Args:
        metadata: Iterable of metadata dicts; entries without a 'link' key
            are skipped.

    Returns:
        A ``(contents, housing_prices, annual_incomes)`` tuple of lists, each
        aligned with the posts that had a link.
    """
    post_links = []
    for meta in metadata:
        if 'link' in meta:
            post_links.append(urllib.parse.urljoin(domain, meta['link']))

    # Each mapper receives only the link and downloads the post itself.
    extractors = (parse_content_from, get_housing_price_from, get_annual_income_from)
    with Pool(processes=8) as pool:
        contents, housing_prices, annual_incomes = [
            pool.map(extract, post_links) for extract in extractors
        ]

    return contents, housing_prices, annual_incomes

## Preview crawled data






In [None]:
# Crawl the index pages, then download every matching post.
titles, dates, links, filtered_meta = filter_metadata(url, num_pages)
contents, housing_prices, annual_incomes = get_contents(filtered_meta)

# One row per kept post; the column labels are the headers written to the CSV.
df = pd.DataFrame({
    '發文日期': dates,
    '標題': titles,
    '文章網址': links,
    '文章內容': contents,
    '房價': housing_prices,
    '收入': annual_incomes,
})
df

Unnamed: 0,發文日期,標題,文章網址,文章內容,房價,收入
0,11/09,[問題] 新北首購房貸,https://www.ptt.cc/bbs/Loan/M.1636469039.A.4C5...,[房屋資訊]\n地點：新北市新店區\n類型：電梯大樓 (12/15)\n坪數：34.37坪\...,1600萬,120萬
1,11/09,[問題] 竹北首購房貸,https://www.ptt.cc/bbs/Loan/M.1636472875.A.F3A...,1.地點：竹北\n2.用途：首購\n3.屋齡：約25年\n4.房屋類型：電梯大樓\n5.房價...,約600萬,90w
2,11/09,[問題] 台北市首購套房貸款,https://www.ptt.cc/bbs/Loan/M.1636472884.A.84B...,[房屋資訊]\n地點：台北市中山區(捷運站350公尺)\n類型：電梯大樓(4/12)\n坪數...,1180萬,200+
3,11/10,[問題] 台北首購房貸,https://www.ptt.cc/bbs/Loan/M.1636475502.A.239...,地點：台北市內湖區\n類型：公寓2樓/共5樓\n坪數：總計約26坪\n用途：自住\n屋齡：3...,1350（需鑑價）,約100萬 / 約77萬
4,11/10,[問題] 台中首購房貸(合庫),https://www.ptt.cc/bbs/Loan/M.1636518693.A.943...,[房屋資訊]\n1.地點：台中西屯 (櫻花路)\n2.用途：首購\n3.屋齡：0年\n4.房...,約1000萬,110w
5,11/10,[問題] 台南首購房貸,https://www.ptt.cc/bbs/Loan/M.1636523822.A.806...,\n1.地點:台南市新營區\n2.貸款用途:自住\n3.房價:1360萬\n4.屋齡:202...,1360萬,今年約200萬/年，年底轉職預計250萬+/年
6,11/10,[問題] 三重新成屋第二間房貸需求,https://www.ptt.cc/bbs/Loan/M.1636528089.A.408...,\n\n 過去一年留職停薪沒收入\n 只有名下一間不動產的租金所得\n ...,,
7,11/10,[問題] 高雄公教首購房貸,https://www.ptt.cc/bbs/Loan/M.1636528729.A.E0B...,\n地點：高雄前鎮區31期\n屋齡：15\n房屋類型：電梯大樓\n坪數：約46坪\n房價：1...,1200萬,70萬/130萬
8,6/19,【 房貸-要注意些什麼 】,https://www.ptt.cc/bbs/Loan/M.1403108148.A.386...,\n\n提供自己本身在銀行服務的經驗\n\n給大家一些申辦房貸時的提醒\n\n\n1.購屋買...,,
9,11/09,[問題] 新北淡水首購房貸,https://www.ptt.cc/bbs/Loan/M.1636387666.A.4DE...,\n\n[房屋資訊]\n地點：新北市淡水區(竹圍站附近)\n類型：電梯大樓 (19/23)\...,780萬,150-200萬


## Save to CSV

In [None]:
# Write the table to a CSV file, then hand that file to files.download.
csv_filename = 'ptt_data.csv'
df.to_csv(csv_filename)
files.download(csv_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>