# 載入所需套件

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from concurrent import futures
from tqdm import tqdm
from datetime import datetime
import re

# 定義進入每篇文章爬取內文、暱稱、日期與推文資訊的函數

In [2]:
def details_crawler(link_list,timeout=10):
    """Download one article page and extract its body, author info and pushes.

    Parameters
    ----------
    link_list : sequence
        ``[url, index]``. ``index`` is returned unchanged as the first element
        of the result so the caller can match results to requests.
    timeout : float, optional
        Seconds passed to ``requests.get`` so that a stalled connection cannot
        block a worker thread forever. Defaults to 10.

    Returns
    -------
    tuple
        ``(index, info, name, date, comment, total_score)``:
        ``info`` is the article text, ``name`` the nickname found in
        parentheses in the first meta value, ``date`` a ``datetime`` parsed
        from the fourth meta value, ``comment`` a list with one dict per push
        and ``total_score`` the sum of the push scores. Any of ``info``,
        ``name`` and ``date`` that cannot be extracted is ``np.nan``.

    Raises
    ------
    Whatever ``requests.get`` raises (connection errors, timeouts); these are
    deliberately not swallowed here.
    """
    link=link_list[0]
    i=link_list[1]
    r=requests.get(link,timeout=timeout)
    s=BeautifulSoup(r.text,'html.parser')

    # Without a #main-content element there is nothing to parse; hand back an
    # empty record instead of raising and aborting the caller's whole batch.
    main=s.select('#main-content')
    if not main:
        return i,np.nan,np.nan,np.nan,[],0
    content=main[0]

    # Article body: keep what precedes the first '-----' (or, failing that,
    # the first '--') and drop the first line of that text.
    # NOTE(review): a body that itself contains '--' is cut short at that
    # point - confirm whether splitting on a line holding only '--' is wanted.
    full_text=content.text
    if '-----' in full_text:
        temp_info=full_text.split('-----')[0]
    else:
        temp_info=full_text.split('--')[0]
    info='\n'.join(temp_info.split('\n')[1:])

    # Nickname and date come from the '.article-meta-value' elements.
    temp=content.select('.article-meta-value')
    name=np.nan
    date=np.nan
    if temp:
        # Accept any non-empty text between the parentheses, so nicknames
        # containing spaces or punctuation are kept as well.
        found=re.search(r'\((.+)\)',temp[0].text)
        if found:
            name=found.group(1)
    if len(temp)>3:
        try:
            date=datetime.strptime(temp[3].text.strip(),'%a %b %d %H:%M:%S %Y')
        except ValueError:
            # Unparseable timestamp: leave the date missing.
            date=np.nan

    # Pushes and score.
    comment=[]
    total_score=0

    for e in content.select('.push'):
        try:
            push_tag=e.select('.hl.push-tag')[0].text.replace(' ','')
            push_user=e.select('.f3.hl.push-userid')[0].text
            raw_content=e.select('.f3.push-content')[0].text
            push_time=e.select('.push-ipdatetime')[0].text.strip('\n').strip(' ')
        except IndexError:
            # A '.push' row lacking the expected children is skipped on its
            # own; the remaining pushes are still collected.
            continue

        # Remove only the single leading ':' separator, then surrounding
        # spaces, so colons that belong to the text (e.g. ':)') survive.
        if raw_content.startswith(':'):
            raw_content=raw_content[1:]
        push_content=raw_content.strip(' ')

        if '推' in push_tag:
            score=1
        elif '噓' in push_tag:
            score=-1
        else:
            score=0

        total_score+=score

        comment.append({'帳號':push_user,
                        '內容':push_content,
                        '心情':push_tag,
                        '分數':score,
                        '時間':push_time})

    return i,info,name,date,comment,total_score

# 定義爬取單頁 WomenTalk 板文章資訊的函數

In [3]:
def ptt_womentalk_crawler(url,timeout=10):
    """Crawl one board index page and every article it links to.

    Parameters
    ----------
    url : str
        Address of the index page to read.
    timeout : float, optional
        Seconds passed to ``requests.get`` for the index page so a stalled
        connection cannot hang the crawl. Defaults to 10.

    Returns
    -------
    pandas.DataFrame
        One row per crawled article with the columns
        標題, 時間, 帳號, 暱稱, 推數, 分數, 內容, 推文, 網址.
    """
    response=requests.get(url,timeout=timeout)
    soup=BeautifulSoup(response.text,'html.parser')

    titles,ids,recommends,links=[],[],[],[]

    for entry in soup.select('div.r-ent'):
        author=entry.select('div.author')[0].text
        anchors=entry.select('div.title a')
        # Entries whose author is '-' are skipped, and so is any entry that
        # has no title link, because there is no article page to follow.
        if author=='-' or not anchors:
            continue
        titles.append(anchors[0].text)
        links.append('https://www.ptt.cc'+anchors[0]['href'])
        ids.append(author)
        recommends.append(entry.select('div.nrec')[0].text)

    # details_crawler takes [link, position] and echoes the position back.
    link_list=[[link,i] for i,link in enumerate(links)]

    # Fetch the article pages concurrently on a pool of worker threads.
    with futures.ThreadPoolExecutor(max_workers=8) as executor:
        results=list(executor.map(details_crawler,link_list))

    # Store each result at the position it reports so rows line up with links.
    details=[None]*len(links)
    for i,*fields in results:
        details[i]=fields  # [info, name, date, comment, total_score]

    df=pd.DataFrame({
        '標題':titles,
        '時間':[d[2] for d in details],
        '帳號':ids,
        '暱稱':[d[1] for d in details],
        '推數':recommends,
        '分數':[d[4] for d in details],
        '內容':[d[0] for d in details],
        '推文':[d[3] for d in details],
        '網址':links
    })

    return df

# 定義爬取多頁 WomenTalk 板文章資訊的函數

In [4]:
def page_condition_function(page,timeout=10):
    """Crawl up to ``page`` index pages, starting at the board's index.html.

    After each page the link held by the second anchor inside
    ``div.btn-group-paging`` is followed to reach the next page to crawl.
    The walk stops early when that anchor is missing or has no ``href``.

    Parameters
    ----------
    page : int
        Maximum number of index pages to crawl.
    timeout : float, optional
        Seconds passed to ``requests.get`` when reading the paging links.
        Defaults to 10.

    Returns
    -------
    pandas.DataFrame
        The per-page frames concatenated with a fresh index; an empty frame
        when no page was crawled (for example ``page`` is 0).
    """
    url='https://www.ptt.cc/bbs/WomenTalk/index.html'
    dfs=[]

    for n in tqdm(range(page)):
        dfs.append(ptt_womentalk_crawler(url))

        # The paging link is only needed if another page will be crawled.
        if n==page-1:
            break

        # NOTE: this fetches the same index page ptt_womentalk_crawler has
        # just read, because that function does not expose the paging link.
        response=requests.get(url,timeout=timeout)
        soup=BeautifulSoup(response.text,'html.parser')
        paging=soup.select('div.btn-group-paging a')

        # No second anchor, or an anchor without href: nowhere left to go.
        prev_href=paging[1].get('href') if len(paging)>1 else None
        if not prev_href:
            break
        url='https://www.ptt.cc'+prev_href

    # pd.concat raises on an empty list, so return an empty frame instead.
    if not dfs:
        return pd.DataFrame()

    df=pd.concat(dfs,ignore_index=True)

    return df

# 抓取指定頁數的 WomenTalk 板文章資訊

In [5]:
# Number of index pages to crawl, counted from the board's index.html.
N_PAGES=10
df=page_condition_function(N_PAGES)
# Display a preview only; the complete frame stays available in `df`.
df.head(10)

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:49<00:00,  3.92s/it]


Unnamed: 0,標題,時間,帳號,暱稱,推數,分數,內容,推文,網址
0,[閒聊] 中秋烤肉大家都去哪裡買食才卡划算,2020-09-22 15:17:51,yanguchi,caffery,2,2,去年是買外面一組那種套裝式的烤肉箱\n覺得偏貴東西不多新鮮度也還好\n今年拒絕同事的烤肉聚\...,"[{'帳號': 'eva19452002', '內容': '一年才一次中秋節，在那邊精打細算...",https://www.ptt.cc/bbs/WomenTalk/M.1600759073....
1,[閒聊] 怎麼把螞蟻徹底趕走?????,2020-09-22 15:19:28,Jeffders,濃,3,3,每次看到有人發問殺蟑螂的文章\n底下留言都會提到一點絕\n\n但我現在比較困擾的是…螞蟻!!...,"[{'帳號': 'eva19452002', '內容': '搬家', '心情': '→', ...",https://www.ptt.cc/bbs/WomenTalk/M.1600759170....
2,[問題] 喜歡看男生穿什麼衣服?,2020-09-22 15:27:07,arnold3,憂鬱型男阿雲,9,8,我看女生穿個短裙或是絲襪 還是透一點的 回頭率就很高 就能很簡單獲得\n很多男生關心 容易吸...,"[{'帳號': 'chocopoodle', '內容': 'https://i.imgur....",https://www.ptt.cc/bbs/WomenTalk/M.1600759629....
3,[閒聊] 人生巔峰在哪？,2020-09-22 15:39:15,aleley,,4,3,其中一個表姊最近生了\n\n是個頭好壯壯的兒子\n\n看她發限時動態說一家很幸福\n\n也為...,"[{'帳號': 'shao9850', '內容': '每次照鏡子的時候', '心情': '推...",https://www.ptt.cc/bbs/WomenTalk/M.1600760357....
4,[難過] 還有人能比我更廢嗎,2020-09-22 15:40:14,emillyliuu,DANISELIU_,3,3,今年得知得延畢一整學年了\n原本只是延畢一學期的\n怪我選課沒有注意好到底缺了什麼課\n選了...,"[{'帳號': 'minifat', '內容': '這文都不知道怎回 很多都在搞公司了問剛上...",https://www.ptt.cc/bbs/WomenTalk/M.1600760416....
5,[公告] 九月置底閒聊區,2020-09-01 00:31:26,gn01765288,PTT金庸,爆,125,九月到了\n\n暑假過了\n\n開學了\n\n閒聊區可以閒聊喔~~\n\n但基本的PTT禮儀...,"[{'帳號': 'hua828797', '內容': '錢', '心情': '推', '分數...",https://www.ptt.cc/bbs/WomenTalk/M.1598891489....
6,[公告] WomenTalk 板規 (2020.9.3),NaT,hua828797,,,0,║ ║\n ...,[],https://www.ptt.cc/bbs/WomenTalk/M.1599127903....
7,[公告] 板規宣導 禁止贈送禮物,2020-09-15 00:05:52,hua828797,我想吃雲之上夢見的腳皮,2,5,大家晚安\n\n\n板規13. 文章、推文不得出現資金事宜、贈送禮物或任何徵人相關訊息。\n...,"[{'帳號': 'kuijun228', '內容': '那送p幣怎辦 完了', '心情': ...",https://www.ptt.cc/bbs/WomenTalk/M.1600099556....
8,[公告] 板主徵選,2020-09-15 22:35:31,Vedan,味丹,8,18,[總覽]\n\n\n 一、徵選及就任資格\n\n 二...,"[{'帳號': 'YYC042', '內容': '我選 大家投我嗎？', '心情': '推'...",https://www.ptt.cc/bbs/WomenTalk/M.1600180533....
9,Re: [閒聊] 有人還在用iPhone 6s嗎,2020-09-22 12:42:43,silverado,西門啦,1,1,※ 引述《eric112 (eric)》之銘言：\n: iPhone第一支有2G Ram的手...,"[{'帳號': 'meishan31', '內容': '電池變大30%那空間夠嗎?', '心...",https://www.ptt.cc/bbs/WomenTalk/M.1600749765....


# 將 WomenTalk 板文章資訊匯出成 Excel 檔

In [6]:
# Write the crawled frame to an .xlsx workbook without the row index.
# NOTE(review): the '推文' column holds Python lists of dicts - confirm the
# Excel engine in use can write such cells.
df.to_excel(excel_writer='Womentalk板文章資訊.xlsx',index=False)