## 爬蟲實作

參考資料：https://www.jamleecute.com/python-web-crawler-beautifulsoup-%E7%B6%B2%E8%B7%AF%E7%88%AC%E8%9F%B2/

In [15]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests

In [2]:
# URL of the page to scrape (the PTT hot-boards listing).
url = "https://www.ptt.cc/bbs/hotboards.html"

# Fetch the page. The timeout keeps this cell from hanging indefinitely
# if the server never answers.
r = requests.get(url, timeout=10)

# requests.get() returns a Response object even for HTTP error statuses,
# so fail loudly here instead of parsing an error page further down.
r.raise_for_status()

# On success the page's HTML source is in the `text` attribute; keep it
# in `web_content` for parsing.
web_content = r.text

In [3]:
from bs4 import BeautifulSoup

因為從這個 PTT 網頁的 HTML 結構，可以看到我們想爬取的看板名稱資料所在的 Element 是一個 div，並且用 class “board-name” 來指定套用樣式，所以我們可以找出所有 class 為 “board-name” 的 div Element。

In [4]:
# Parse the HTML source with Beautiful Soup, using the lxml parser.
soup = BeautifulSoup(markup=web_content, features='lxml')

# Collect every <div> element whose class is "board-name".
boardNameElements = soup.find_all(name='div', attrs={'class': 'board-name'})
boardNameElements

[<div class="board-name">Gossiping</div>,
 <div class="board-name">C_Chat</div>,
 <div class="board-name">Stock</div>,
 <div class="board-name">NBA</div>,
 <div class="board-name">Baseball</div>,
 <div class="board-name">Lifeismoney</div>,
 <div class="board-name">sex</div>,
 <div class="board-name">HatePolitics</div>,
 <div class="board-name">KoreaStar</div>,
 <div class="board-name">MobileComm</div>,
 <div class="board-name">movie</div>,
 <div class="board-name">car</div>,
 <div class="board-name">Beauty</div>,
 <div class="board-name">e-shopping</div>,
 <div class="board-name">LoL</div>,
 <div class="board-name">WomenTalk</div>,
 <div class="board-name">Tech_Job</div>,
 <div class="board-name">BabyMother</div>,
 <div class="board-name">Boy-Girl</div>,
 <div class="board-name">Japan_Travel</div>,
 <div class="board-name">PC_Shopping</div>,
 <div class="board-name">creditcard</div>,
 <div class="board-name">marriage</div>,
 <div class="board-name">joke</div>,
 <div class="board-name">

In [5]:
# Pull the text content (the board name) out of each element.
boardNames = [element.get_text() for element in boardNameElements]

In [6]:
# Show the extracted board names.
boardNames

['Gossiping',
 'C_Chat',
 'Stock',
 'NBA',
 'Baseball',
 'Lifeismoney',
 'sex',
 'HatePolitics',
 'KoreaStar',
 'MobileComm',
 'movie',
 'car',
 'Beauty',
 'e-shopping',
 'LoL',
 'WomenTalk',
 'Tech_Job',
 'BabyMother',
 'Boy-Girl',
 'Japan_Travel',
 'PC_Shopping',
 'creditcard',
 'marriage',
 'joke',
 'AllTogether',
 'home-sale',
 'NSwitch',
 'marvel',
 'japanavgirls',
 'Steam',
 'iOS',
 'Tainan',
 'ToS',
 'MakeUp',
 'PlayStation',
 'Kaohsiung',
 'KoreaDrama',
 'Japandrama',
 'KR_Entertain',
 'CVS',
 'PCReDive',
 'TaichungBun',
 'BeautySalon',
 'Salary',
 'CFantasy',
 'StupidClown',
 'basketballTW',
 'FATE_GO',
 'BuyTogether',
 'Option',
 'NBA_Film',
 'HardwareSale',
 'Hearthstone',
 'WOW',
 'BTS',
 'KanColle',
 'Gamesale',
 'TypeMoon',
 'PokeMon',
 'Headphone',
 'mobilesales',
 'EAseries',
 'AC_In',
 'Aviation',
 'YuanChuang',
 'SportLottery',
 'MuscleBeach',
 'Hsinchu',
 'biker',
 'GetMarry',
 'KoreanPop',
 'forsale',
 'Food',
 'TaiwanDrama',
 'China-Drama',
 'Zastrology',
 'Soft_Jo

In [11]:
# Next, extract each board's popularity value.
#
# In the page source the figure sits inside <div class="board-nuser">,
# wrapped in a <span>; the element's text content is that figure.
popularityElements = soup.find_all(name='div', attrs={'class': 'board-nuser'})

# The extracted text is a string, so convert each one to an integer.
popularities = [int(element.get_text()) for element in popularityElements]

In [12]:
# Show the extracted popularity values.
popularities

[17232,
 3850,
 3582,
 2649,
 2460,
 2431,
 1777,
 1523,
 1411,
 1378,
 1370,
 1268,
 1260,
 1253,
 1193,
 965,
 904,
 865,
 742,
 741,
 722,
 670,
 606,
 581,
 580,
 547,
 535,
 520,
 485,
 474,
 441,
 441,
 435,
 422,
 409,
 386,
 383,
 371,
 370,
 353,
 347,
 343,
 324,
 319,
 311,
 310,
 309,
 301,
 298,
 296,
 260,
 259,
 252,
 249,
 242,
 241,
 234,
 232,
 228,
 224,
 219,
 218,
 214,
 213,
 212,
 209,
 208,
 207,
 203,
 196,
 194,
 194,
 187,
 185,
 183,
 183,
 178,
 177,
 171,
 170,
 168,
 166,
 156,
 143,
 142,
 139,
 137,
 135,
 133,
 131,
 130,
 128,
 127,
 126,
 122,
 120,
 120,
 119,
 118,
 118,
 116,
 115,
 114,
 112,
 110,
 107,
 105,
 102,
 102,
 100,
 99,
 97,
 97,
 97,
 97,
 95,
 95,
 94,
 90,
 90,
 89,
 89,
 86,
 86,
 86,
 84,
 84,
 83]

In [14]:
# Print the results: each board's popularity followed by its name.
# Iterates `popularities` (the list of integer popularity values); the
# earlier spelling `pupularities` is not defined anywhere in the visible
# notebook and raises NameError on a fresh run.
for bn, popu in zip(boardNames, popularities):
    print(popu, bn)

17232 Gossiping
3850 C_Chat
3582 Stock
2649 NBA
2460 Baseball
2431 Lifeismoney
1777 sex
1523 HatePolitics
1411 KoreaStar
1378 MobileComm
1370 movie
1268 car
1260 Beauty
1253 e-shopping
1193 LoL
965 WomenTalk
904 Tech_Job
865 BabyMother
742 Boy-Girl
741 Japan_Travel
722 PC_Shopping
670 creditcard
606 marriage
581 joke
580 AllTogether
547 home-sale
535 NSwitch
520 marvel
485 japanavgirls
474 Steam
441 iOS
441 Tainan
435 ToS
422 MakeUp
409 PlayStation
386 Kaohsiung
383 KoreaDrama
371 Japandrama
370 KR_Entertain
353 CVS
347 PCReDive
343 TaichungBun
324 BeautySalon
319 Salary
311 CFantasy
310 StupidClown
309 basketballTW
301 FATE_GO
298 BuyTogether
296 Option
260 NBA_Film
259 HardwareSale
252 Hearthstone
249 WOW
242 BTS
241 KanColle
234 Gamesale
232 TypeMoon
228 PokeMon
224 Headphone
219 mobilesales
218 EAseries
214 AC_In
213 Aviation
212 YuanChuang
209 SportLottery
208 MuscleBeach
207 Hsinchu
203 biker
196 GetMarry
194 KoreanPop
194 forsale
187 Food
185 TaiwanDrama
183 China-Drama
183 Zast

In [21]:
# Collect the scraped data into a DataFrame, one row per board.
# NOTE(review): the column key "popularty" is misspelled; it is kept
# unchanged here in case later cells refer to the column by that name.
df = pd.DataFrame(data=dict(boardNames=boardNames, popularty=popularities))

In [22]:
# Show the resulting DataFrame.
df

Unnamed: 0,boardNames,popularty
0,Gossiping,17232
1,C_Chat,3850
2,Stock,3582
3,NBA,2649
4,Baseball,2460
5,Lifeismoney,2431
6,sex,1777
7,HatePolitics,1523
8,KoreaStar,1411
9,MobileComm,1378
