In [1]:
import requests
import urllib.parse
from bs4 import BeautifulSoup

# from utils import pretty_print

INDEX = 'https://www.ptt.cc/bbs/Boy-Girl/index.html'

def get_posts_on_page(url):
    """Scrape one board index page and return its posts plus a paging link.

    The page at ``url`` is downloaded and parsed with the ``lxml`` parser.
    Every ``div.r-ent`` entry that has a link inside its ``div.title`` is
    turned into a dict with the keys ``title``, ``link``, ``push``, ``date``
    and ``author``. Entries without a link are skipped (presumably deleted
    posts -- verify against the live markup), and so are entries whose title
    contains "[公告]" or "置底".

    Returns:
        A ``(posts, next_link)`` tuple. ``next_link`` is the ``href`` of the
        second ``a.btn`` inside ``div.btn-group-paging`` (presumably the
        "previous page" button -- TODO confirm); it is ``None`` when that
        anchor carries no ``href``.
    """
    page = BeautifulSoup(requests.get(url).text, 'lxml')

    collected = []
    for entry in page.find_all('div', 'r-ent'):
        anchor = entry.find('div', 'title').find('a')
        if not anchor:
            # No link inside the title cell: nothing to follow, skip it.
            continue

        heading = anchor.getText().strip()
        if any(marker in heading for marker in ("[公告]", "置底")):
            # Announcements and pinned threads are not regular posts.
            continue

        collected.append(dict(
            title=heading,
            link=anchor.get('href'),
            push=entry.find('div', 'nrec').getText(),
            date=entry.find('div', 'date').getText(),
            author=entry.find('div', 'author').getText(),
        ))

    paging_buttons = page.find('div', 'btn-group-paging').find_all('a', 'btn')
    return collected, paging_buttons[1].get('href')


def get_pages(num):
    """Collect posts from up to ``num`` index pages, starting at ``INDEX``.

    Each page is scraped with ``get_posts_on_page``; the paging link it
    returns is resolved against ``INDEX`` and used as the next page to visit.

    Args:
        num: Maximum number of index pages to crawl.

    Returns:
        A flat list of post dicts in crawl order. Fewer than ``num`` pages
        are visited when a page provides no further paging link.
    """
    page_url = INDEX
    all_posts = list()
    for _ in range(num):
        posts, link = get_posts_on_page(page_url)
        all_posts += posts
        if not link:
            # urljoin(INDEX, None) returns INDEX itself, which would restart
            # the crawl from the first page and append duplicate posts.
            break
        page_url = urllib.parse.urljoin(INDEX, link)

    return all_posts

In [2]:
def fetch_article_content(post):
    """Download one article and return its metadata, body text and pushers.

    Args:
        post: A post summary dict providing at least the keys ``link``,
            ``title``, ``push`` and ``author``. ``link`` is resolved against
            ``INDEX`` to build the article URL.

    Returns:
        A dict with the keys:
        ``title``, ``push``, ``author`` -- copied from ``post``;
        ``datetime`` -- text of the ``article-meta-value`` span in the third
        ``div.article-metaline``, or ``''`` when the article does not have
        that line;
        ``content`` -- the text left in ``#main-content`` after pushes,
        metadata lines and all ``span`` elements are removed, stripped;
        ``pusher`` -- user ids of the pushes, in page order.

    Raises:
        AttributeError: if the page has no ``#main-container`` /
            ``#main-content`` element (``find`` returns ``None``).
    """
    articleLink = urllib.parse.urljoin(INDEX, post['link'])
    response = requests.get(articleLink)
    soup = BeautifulSoup(response.text, 'lxml')
    mainContent = soup.find('div', id='main-container').find('div', id='main-content')

    # Record who pushed, then drop the push rows so they do not end up in
    # the body text.
    pusherList = list()
    for p in mainContent.find_all('div', 'push'):
        uid = p.find('span', 'push-userid')
        # Some "push" rows carry no user id span at all; skip the id but
        # still remove the row.
        if uid is not None:
            pusherList.append(uid.getText())
        p.decompose()

    # Read the post time from the third metadata line, then drop all
    # metadata lines. Articles with fewer lines get an empty timestamp
    # instead of raising IndexError.
    metas = mainContent.find_all('div', 'article-metaline')
    articleTime = ''
    if len(metas) > 2:
        timeValue = metas[2].find('span', 'article-meta-value')
        if timeValue is not None:
            articleTime = timeValue.getText()
    for meta in metas:
        meta.decompose()

    header = mainContent.find('div', 'article-metaline-right')
    if header is not None:
        header.decompose()

    # Every remaining <span> is treated as noise and removed before the
    # body text is extracted.
    for item in mainContent.find_all('span'):
        item.decompose()
    articleContent = mainContent.getText().strip()

    return {
        "title": post['title'],
        "push": post['push'],
        "author": post['author'],
        "datetime": articleTime,
        "content": articleContent,
        "pusher": pusherList,
    }

In [3]:
from multiprocessing import Pool

def get_articles(postList):
    """Fetch the full article for every post summary using worker processes.

    Each element of ``postList`` is passed to ``fetch_article_content`` on a
    pool of 8 processes. The pool is shut down when the call finishes.

    Returns:
        The list produced by ``Pool.map``: one result per input post, in
        input order.
    """
    workers = Pool(processes=8)
    with workers:
        return workers.map(fetch_article_content, postList)

In [4]:
from datetime import datetime
import time
import os
import csv

# Script entry point: crawl the newest index pages, download every article,
# sort the articles newest-first and export each one as a CSV and a TXT file.
if __name__ == '__main__':
    pages = 5  # number of index pages to crawl

    start = time.time()

    postList = get_pages(pages)
    article_infos = get_articles(postList)

    print('花費: %f 秒' % (time.time() - start))

    print('共%d項結果' % len(article_infos))

    def _posted_at(article):
        """Sort key: the article's parsed timestamp.

        Articles whose ``datetime`` text does not match the expected format
        map to ``datetime.min`` so they sort last (the sort is descending)
        instead of aborting the whole export.
        """
        try:
            return datetime.strptime(article['datetime'], '%a %b %d %H:%M:%S %Y')
        except ValueError:
            return datetime.min

    # sorting by datetime, newest first
    start = time.time()
    article_infos = sorted(article_infos, key=_posted_at, reverse=True)
    print('花費: %f 秒做sorting' % (time.time() - start))

    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    outputDir_csv = "Output/csv/"
    os.makedirs(outputDir_csv, exist_ok=True)
    outputDir_txt = "Output/txt/"
    os.makedirs(outputDir_txt, exist_ok=True)

    # Output files are numbered 1, 2, 3, ... in sorted order.
    docID = 1
    for article in article_infos:

        # write csv file (utf-8-sig adds a BOM to the UTF-8 output)
        with open(outputDir_csv + str(docID) + '.csv', 'w', newline='', encoding='utf-8-sig') as f:
            spamwriter = csv.writer(f, delimiter=',',
                                    quotechar='"', quoting=csv.QUOTE_MINIMAL)
            header = ["Author", "Title", "Push", "Time", "Content"]
            spamwriter.writerow(header)
            row = (article['author'], article['title'], article['push'], article['datetime'], article['content'])
            spamwriter.writerow(row)

        # write txt file; the encoding is explicit because the locale default
        # may be unable to encode the article text
        with open(outputDir_txt + str(docID) + '.txt', 'w', encoding='utf-8') as f:
            f.write("Author:"+'\t'+article['author']+'\n')
            f.write("Title:"+'\t'+article['title']+'\n')
            f.write("Push:"+'\t'+article['push']+'\n')
            f.write("Time:"+'\t'+article['datetime']+'\n')
            f.write("\nContent:\n"+article['content']+'\n')
        docID += 1

花費: 5.252650 秒
共77項結果
花費: 0.007368 秒做sorting
