<a href="https://colab.research.google.com/github/tingchun0113/ptt-crawler/blob/main/ptt_crawler_v1_0_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install and import dependencies

In [None]:
!pip install requests_html
import requests, urllib.parse, re, pandas as pd
from requests_html import HTML
from bs4 import BeautifulSoup
from multiprocessing import Pool
from google.colab import files



## Variables

In [None]:
# Name of the PTT board to crawl.
board = 'Loan'
domain = 'https://www.ptt.cc/'
# Address of the board's index page, where crawling starts.
url = f'{domain}bbs/{board}/index.html'

# Number of index pages to be crawled.
num_pages = 5
# Substrings looked for in post titles; a post is kept when its title contains
# at least one of them, e.g. ['房貸', '房屋'] keeps titles containing '房貸' or '房屋'.
# An empty list keeps every post.
title_keywords = ['房貸']

## Functions

In [None]:
def fetch(url):
  """Download ``url`` and return the ``requests`` response object.

  The request carries the ``over18=1`` cookie (presumably PTT's age
  confirmation flag -- confirm against the site).

  Args:
    url: Absolute URL to request.

  Returns:
    The response returned by ``requests.get``.
  """
  # Exactly one request: issuing a cookie-less request first only doubled
  # the network traffic, because its response was never used.
  return requests.get(url, cookies={'over18': '1'})

def parse_article_entries(doc, el):
  """Return every element of the HTML text ``doc`` matching selector ``el``.

  Args:
    doc: HTML source as a string.
    el: Selector passed to ``HTML.find`` (e.g. ``'div.r-ent'``).

  Returns:
    Whatever ``requests_html.HTML.find`` yields for the selector.
  """
  return HTML(html=doc).find(el)

def parse_article_meta(entry):
    """Build the metadata dict for one post entry of a board index page.

    Args:
        entry: Object exposing ``find(selector, first=True)`` whose results
            carry ``.text`` and ``.attrs``.

    Returns:
        dict with keys ``title``, ``push``, ``date``, ``author``, ``link``
        (in that order). When reading the author or the link raises
        ``AttributeError``, both are set to ``'[Deleted]'``.
    """
    record = {}
    # Plain text fields: output key -> selector inside the entry.
    for key, selector in (
        ('title', 'div.title'),
        ('push', 'div.nrec'),
        ('date', 'div.date'),
    ):
        record[key] = entry.find(selector, first=True).text

    try:
        author = entry.find('div.author', first=True).text
        href = entry.find('div.title > a', first=True).attrs['href']
    except AttributeError:
        # Either lookup failing marks the whole post as deleted.
        author = href = '[Deleted]'

    record['author'] = author
    record['link'] = href
    return record

def get_metadata_from(url):
    """Fetch one board index page and extract its posts and the follow-up link.

    Args:
        url: Address of the index page to download.

    Returns:
        Tuple ``(metadata, next_link)``: a list with one
        ``parse_article_meta`` dict per ``div.r-ent`` element, and the
        absolute URL of the page to crawl next.
    """

    def parse_next_link(doc):
        # The follow-up page is taken from the second wide action-bar button
        # (presumably PTT's "previous page" button -- TODO confirm).
        buttons = HTML(html=doc).find('.action-bar a.btn.wide')
        return urllib.parse.urljoin(domain, buttons[1].attrs.get('href'))

    page_source = fetch(url).text
    entries = parse_article_entries(page_source, 'div.r-ent')
    following_url = parse_next_link(page_source)

    return list(map(parse_article_meta, entries)), following_url

def get_paged_meta(url, num_pages):
    """Collect post metadata from ``num_pages`` consecutive index pages.

    Args:
        url: Address of the first index page.
        num_pages: How many pages to visit; each page supplies the address
            of the next one.

    Returns:
        One flat list holding the metadata dicts of all visited pages, in
        visiting order.
    """
    gathered = []
    page_url = url

    for _page in range(num_pages):
        page_posts, next_href = get_metadata_from(page_url)
        gathered.extend(page_posts)
        # Resolve against the site root before requesting the next page.
        page_url = urllib.parse.urljoin(domain, next_href)

    return gathered

def create_fields(titles, dates, links, filtered_meta, meta):
    """Append one post's fields to the accumulator lists (mutated in place).

    Args:
        titles: List receiving ``meta['title']``.
        dates: List receiving ``meta['date']``.
        links: List receiving the absolute article URL.
        filtered_meta: List receiving ``meta`` itself.
        meta: Metadata dict with ``title``, ``date`` and ``link`` keys.

    Returns:
        The same four lists, as ``(titles, dates, links, filtered_meta)``.

    Note:
        Nothing is appended to ``links`` when ``meta['link']`` is falsy, so
        in that case ``links`` ends up shorter than the other lists.
    """
    titles.append(meta['title'])
    dates.append(meta['date'])

    href = meta['link']
    if href:
        absolute_url = urllib.parse.urljoin(domain, href)
        links.append(absolute_url)

    filtered_meta.append(meta)
    return titles, dates, links, filtered_meta

def filter_metadata(url, num_pages):
    """Crawl ``num_pages`` index pages and keep the posts matching the title filter.

    A post is kept when ``title_keywords`` is empty, or when its title
    contains at least one of the keywords. Each post is kept at most once,
    even if several keywords occur in its title.

    Args:
        url: Address of the first index page.
        num_pages: Number of index pages to crawl.

    Returns:
        Tuple ``(titles, dates, links, filtered_meta)`` of lists filled by
        ``create_fields`` for every kept post.
    """
    titles, dates, links, filtered_meta = [], [], [], []

    for meta in get_paged_meta(url, num_pages):
        # `any` stops at the first hit, so a title containing two keywords
        # no longer produces two identical rows.
        if not title_keywords or any(
            keyword in meta['title'] for keyword in title_keywords
        ):
            create_fields(titles, dates, links, filtered_meta, meta)

    return titles, dates, links, filtered_meta

def parse_content_from(link):
    """Download an article page and return its body text.

    The text of the ``#main-container`` element is cut at the first
    ``'※ 發信站'`` marker, and the first two newline-separated segments are
    dropped.

    Args:
        link: Absolute URL of the article.

    Returns:
        The remaining text joined with newlines, or ``'[Deleted]'`` when
        reading the container text raises ``AttributeError``.
    """
    page = fetch(link)
    container = BeautifulSoup(page.text, 'html.parser').find(id='main-container')

    try:
        # Everything before the marker; the whole text if it never occurs.
        before_marker = container.text.partition('※ 發信站')[0]
    except AttributeError:
        return '[Deleted]'

    return '\n'.join(before_marker.split('\n')[2:])

def get_contents(metadata):
    """Download the article body for every post in ``metadata``.

    Args:
        metadata: Iterable of metadata dicts, each with a ``link`` key
            holding the article href (relative or absolute).

    Returns:
        List of article texts from ``parse_content_from``, in the same order
        as the posts that have a non-empty link.
    """
    # Build the URLs from the argument instead of reading the notebook-level
    # `links` variable, so the function no longer depends on hidden state.
    # This mirrors the rule `create_fields` uses to build its link list.
    article_links = [
        urllib.parse.urljoin(domain, meta['link'])
        for meta in metadata
        if meta['link']
    ]

    with Pool(processes=8) as pool:
        return pool.map(parse_content_from, article_links)

## Preview crawled data






In [None]:
# Crawl the board index and keep the posts that pass the title filter.
titles, dates, links, filtered_meta = filter_metadata(url, num_pages)
# Download the body text of every kept post.
contents = get_contents(filtered_meta)

# One row per post; columns are post date, title, article URL, article content.
df = pd.DataFrame(
    dict(zip(
        ('發文日期', '標題', '文章網址', '文章內容'),
        (dates, titles, links, contents),
    ))
)
df

Unnamed: 0,發文日期,標題,文章網址,文章內容
0,11/29,[問題] 頭份首購房貸,https://www.ptt.cc/bbs/Loan/M.1638163960.A.1FC...,希望貸款期限：30年 - 35年\n房子目前的鑑價： 大約800萬\n希望貸款金額：735 ...
1,11/29,[問題] 台中首購房貸,https://www.ptt.cc/bbs/Loan/M.1638170754.A.66B...,［房屋資訊]\n地點：台中市南區\n房型：電梯大樓\n坪數：47坪\n車位：機械\n用途：自...
2,6/19,【 房貸-要注意些什麼 】,https://www.ptt.cc/bbs/Loan/M.1403108148.A.386...,\n\n提供自己本身在銀行服務的經驗\n\n給大家一些申辦房貸時的提醒\n\n\n1.購屋買...
3,11/28,[問題] 首購房貸,https://www.ptt.cc/bbs/Loan/M.1638098595.A.97C...,\n目前名下沒房 最近想要買房\n房屋地點在台中南屯捷運站附近，\n房齡約四年 約55多坪...
4,11/28,[問題] 竹北樓中樓套房貸款,https://www.ptt.cc/bbs/Loan/M.1638105206.A.DEA...,\n〔房屋資訊]\n1. 地點：竹北家樂福附近\n2. 屋齡：15年\n3. 房屋類型︰電梯...
5,11/28,[問題] 台北首購房貸,https://www.ptt.cc/bbs/Loan/M.1638108338.A.B41...,[房屋資訊]\n地點：台北市北投區\n類型：公寓\n坪數：權狀31坪\n用途：自住\n屋齡：...
6,11/28,[問題] 桃園首購房貸,https://www.ptt.cc/bbs/Loan/M.1638109044.A.01E...,\n[房屋資訊]\n地點：桃園市龜山區\n類型：公寓（華夏）\n坪數：權狀45坪\n用途：自...
7,11/28,Re: [請益]徵求8+1或是9成房貸,https://www.ptt.cc/bbs/Loan/M.1638111152.A.123...,我也有同樣貸9成的需求\n房屋在竹北\n首購\n希望也能寄給我站內信\n謝謝\n\n\n\n...
8,11/29,[問題] 台北公寓房貸,https://www.ptt.cc/bbs/Loan/M.1638116862.A.0D7...,\n[房屋資訊]\n地點：台北市士林區\n房型：3房2廳2衛\n坪數：34.82坪\n車位：...
9,11/29,[問題] 桃園內壢 公教人員房貸,https://www.ptt.cc/bbs/Loan/M.1638120899.A.281...,已簽約\n尋找鑑價/貸款成數與利率較優銀行\n\n\n[房屋資訊]\n地點：桃園市中壢區忠孝...


## Add columns (e.g. housing_prices, annual_incomes)

In [None]:
# Keywords looked for in the article content to locate the housing price line.
housing_prices_keywords = '房價', '售價', '屋價', '成交'
# Keywords looked for in the article content to locate the annual income line.
annual_incomes_keywords = '收入', '年收', '年薪'
housing_prices = []
annual_incomes = []

def _extract_labelled_value(content, keywords):
    """Return the value of the first "label: value" line mentioning a keyword.

    Lines are scanned top to bottom. A line qualifies when it contains any of
    ``keywords`` and at least one ASCII or full-width colon; the text between
    the first and second colon (or the end of the line) is returned stripped.

    Args:
        content: Article text, lines separated by ``'\\n'``.
        keywords: Iterable of substrings identifying the wanted line.

    Returns:
        The extracted value, or ``''`` when no line qualifies.
    """
    for line in content.split('\n'):
        if not any(keyword in line for keyword in keywords):
            continue
        parts = re.split(r':|：', line)
        if len(parts) > 1:
            return parts[1].strip()
        # Keyword used in running text without a colon: keep scanning rather
        # than giving up, since a properly labelled line may still follow.
    # Always a string, so callers never receive None.
    return ''

def get_housing_prices_from(content):
    """Extract the housing price stated in ``content`` (``''`` if none found)."""
    return _extract_labelled_value(content, housing_prices_keywords)

def get_annual_incomes_from(content):
    """Extract the annual income stated in ``content`` (``''`` if none found)."""
    return _extract_labelled_value(content, annual_incomes_keywords)

# Run both extractors over every article body using a pool of 8 workers.
with Pool(processes=8) as worker_pool:
    housing_prices, annual_incomes = [
        worker_pool.map(extractor, contents)
        for extractor in (get_housing_prices_from, get_annual_incomes_from)
    ]

# Attach the extracted values to the table as the housing-price and income columns.
df['房價'] = housing_prices
df['收入'] = annual_incomes
df

Unnamed: 0,發文日期,標題,文章網址,文章內容,房價,收入
0,11/29,[問題] 頭份首購房貸,https://www.ptt.cc/bbs/Loan/M.1638163960.A.1FC...,希望貸款期限：30年 - 35年\n房子目前的鑑價： 大約800萬\n希望貸款金額：735 ...,,大約 88萬
1,11/29,[問題] 台中首購房貸,https://www.ptt.cc/bbs/Loan/M.1638170754.A.66B...,［房屋資訊]\n地點：台中市南區\n房型：電梯大樓\n坪數：47坪\n車位：機械\n用途：自...,1050萬,年薪170萬
2,6/19,【 房貸-要注意些什麼 】,https://www.ptt.cc/bbs/Loan/M.1403108148.A.386...,\n\n提供自己本身在銀行服務的經驗\n\n給大家一些申辦房貸時的提醒\n\n\n1.購屋買...,,
3,11/28,[問題] 首購房貸,https://www.ptt.cc/bbs/Loan/M.1638098595.A.97C...,\n目前名下沒房 最近想要買房\n房屋地點在台中南屯捷運站附近，\n房齡約四年 約55多坪...,,無固定收入 但個人資產有四千萬以上
4,11/28,[問題] 竹北樓中樓套房貸款,https://www.ptt.cc/bbs/Loan/M.1638105206.A.DEA...,\n〔房屋資訊]\n1. 地點：竹北家樂福附近\n2. 屋齡：15年\n3. 房屋類型︰電梯...,350萬,100萬
5,11/28,[問題] 台北首購房貸,https://www.ptt.cc/bbs/Loan/M.1638108338.A.B41...,[房屋資訊]\n地點：台北市北投區\n類型：公寓\n坪數：權狀31坪\n用途：自住\n屋齡：...,1千萬左右,100萬
6,11/28,[問題] 桃園首購房貸,https://www.ptt.cc/bbs/Loan/M.1638109044.A.01E...,\n[房屋資訊]\n地點：桃園市龜山區\n類型：公寓（華夏）\n坪數：權狀45坪\n用途：自...,1450萬,約160萬
7,11/28,Re: [請益]徵求8+1或是9成房貸,https://www.ptt.cc/bbs/Loan/M.1638111152.A.123...,我也有同樣貸9成的需求\n房屋在竹北\n首購\n希望也能寄給我站內信\n謝謝\n\n\n\n...,,年薪200-300萬
8,11/29,[問題] 台北公寓房貸,https://www.ptt.cc/bbs/Loan/M.1638116862.A.0D7...,\n[房屋資訊]\n地點：台北市士林區\n房型：3房2廳2衛\n坪數：34.82坪\n車位：...,1830萬,防疫進半年月收6-8萬，醫學中心月收4.5-5萬
9,11/29,[問題] 桃園內壢 公教人員房貸,https://www.ptt.cc/bbs/Loan/M.1638120899.A.281...,已簽約\n尋找鑑價/貸款成數與利率較優銀行\n\n\n[房屋資訊]\n地點：桃園市中壢區忠孝...,925萬,年收100萬


## Save to CSV

In [None]:
# Write the table to a CSV file, then pass that file to Colab's download helper.
csv_name = 'ptt_data.csv'
df.to_csv(csv_name)
files.download(csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>