## Lab2 目標：抓取大安區公所的逐三小時預報
### 有些欄位帶有 colspan 屬性（一格橫跨多個時段），必須先依 colspan 展開才能塞進 dataframe

In [1]:
import sys
import requests
from bs4 import BeautifulSoup

In [2]:
# Three-hourly forecast page to scrape (Daan District, per the notebook title).
url = 'http://www.cwb.gov.tw/V7/forecast/town368/3Hr/6300300.htm'
# Always pass a timeout: requests waits indefinitely by default, so a stalled
# connection would otherwise hang the notebook. The value is in seconds.
req = requests.get(url, timeout=10)
req.encoding = 'utf-8'  # set explicitly; otherwise req.text is decoded with the wrong encoding

In [3]:
# Show the HTTP status code of the response (200 means the page was fetched successfully).
req.status_code

200

### 如果要用類似 scrapy shell debug 的功能，uncomment 下方的 code block

In [4]:
# Optional: wrap the fetched page in a scrapy TextResponse so it can be explored
# the same way as inside `scrapy shell`. Uncomment the three lines below to use it.
#from scrapy.http import TextResponse
#response = TextResponse(req.url, body=req.text, encoding='utf-8') # deliberately named `response`, matching the variable scrapy shell exposes
#soup = BeautifulSoup(response.text, 'html.parser')

### 使用 requests 得到的 html 內容，用 bs4 parse 所需資訊

In [5]:
# Parse the downloaded HTML with the built-in parser, then collect every
# table row; later cells address the rows by position (trs[0], trs[1], ...).
soup = BeautifulSoup(markup=req.text, features='html.parser')
trs = soup.find_all(name='tr')

In [6]:
import pandas as pd

# Output schema of the forecast table. The position of each name matters:
# a later cell fills columns by integer position (df.iloc[:, i]), pairing
# table row i with column i.
columns = [
    'record_t',  # 0: forecast timestamp (date plus hour)
    'weekday',   # 1: day of the week
    'wx',        # 2: weather description, taken from the icon alt text
    't',         # 3: presumably temperature - TODO confirm against the page
    'at',        # 4: presumably apparent temperature - TODO confirm
    'beaufort',  # 5: presumably wind force on the Beaufort scale - TODO confirm
    'wind_dir',  # 6: wind direction
    'rh',        # 7: presumably relative humidity - TODO confirm
    'pop',       # 8: probability of precipitation
    'ci',        # 9: presumably comfort index - TODO confirm
]

# Start from an empty frame that already carries the column layout;
# the following cells populate it one column at a time.
df = pd.DataFrame(columns=columns)

## 以下各 list 的 len() == 2 或 3（預報橫跨的日期數），先整理 48 小時內的 timestamp

In [7]:
# Collect per-day information from the header row of the forecast table.
# Each header cell stands for one calendar day and may span several
# three-hour columns. Three parallel lists are built (length 2 or 3):
#   colspans - number of three-hour columns each day covers (kept as strings)
#   dates    - year-month-day string for each day
#   days     - single-character weekday for each day
import re
import datetime

# Read the clock once so all year values come from the same instant
# (separate now() calls could straddle midnight on New Year's Eve).
now = datetime.datetime.now()
# The page prints only month and day, so the year is supplied here:
# year_s[k] is the calendar year k days from now, which handles a forecast
# window that crosses into the next year.
# NOTE(review): this assumes the k-th header cell is "today + k days" - confirm.
year_s = ["%d" % (now + datetime.timedelta(days=offset)).year for offset in range(3)]

colspans = []
dates = []
days = []
# trs[0] is the date header row; its first <td> is skipped, as in the original loop.
for k, td in enumerate(trs[0].find_all('td')[1:]):
    # A cell without a colspan attribute covers exactly one column.
    colspans.append(td.get('colspan', "1"))
    # Pick out the weekday character. No '|' inside the character class:
    # there it is a literal pipe, not alternation.
    days.append(re.findall('[一二三四五六日]', td.text)[0])
    # The first two digit groups in the cell text are used as month and day.
    month_n_date = re.findall(r'\d+', td.text)
    dates.append(year_s[k] + '-' + month_n_date[0] + '-' + month_n_date[1])

## 以下各 list 的 len() == 17（每個預報時段一筆）；整理後的 dataframe 共 17 筆資料、10 個欄位（不含 index）
### 範例

| pk | record_t(0) | weekday(1) | wx(2) | t(3) | at(4) | beaufort(5) | wind_dir(6) | rh(7) | pop(8) | ci(9) | 
| :--: | :--: | :--: | :--: | :--: | :--: | :--: | :--: | :--: | :--: | :--: |
| 1 | 2017-08-15 09:00 | 二 | 晴天 | 33 | 38 | 2 | 偏西風 | 78% | 0% | 悶熱 |

### 處理第 0:1 行

In [8]:
# Build the first two dataframe columns: one timestamp and one weekday per
# three-hour forecast column.
record_ts = []
weekdays = []
# trs[1] holds the hour labels, one <span> per three-hour column.
# find_all is used instead of the deprecated findAll alias, matching the parse cell.
hours = trs[1].find_all('span')

# Each day covers int(colspan) consecutive hour labels, so walk the hour
# labels with a running index k while iterating over the days.
k = 0
for date, day, span in zip(dates, days, colspans):
    for _ in range(int(span)):
        record_ts.append(date + 'T' + hours[k].text)
        weekdays.append(day)
        k += 1

df['record_t'] = record_ts
df['weekday'] = weekdays

### 處理第 2 行

In [9]:
# Weather description column: trs[2] holds one icon per three-hour column and
# the description is the icon's alt text.
# find_all is used instead of the deprecated findAll alias, matching the parse cell.
wxs = [img.attrs['alt'] for img in trs[2].find_all('img')]

df['wx'] = wxs

### 處理第 3:9 行，除了第 8 行

In [10]:
# Fill dataframe columns 3-9 straight from table rows 3-9: row i of the table
# feeds column i of df. Row 8 (precipitation probability) is skipped because
# its cells may carry a colspan and have to be expanded separately.
for i in range(3, 10):
    # Compare by value: `is not` tests object identity, which works for small
    # ints only through CPython caching and raises a SyntaxWarning on 3.8+.
    if i == 8:
        continue
    # Drop the first <td> of the row, then take the text of each remaining cell.
    vals = [td.text for td in trs[i].find_all('td')[1:]]
    df.iloc[:, i] = vals

### 處理第 8 行

In [11]:
# Probability of precipitation: a cell in trs[8] may span several three-hour
# columns, so its text is repeated once per spanned column.
pops = []
# The first <td> of the row is skipped, as in the original loop.
# find_all is used instead of the deprecated findAll alias, matching the parse cell.
for td in trs[8].find_all('td')[1:]:
    # A cell without a colspan attribute covers exactly one column.
    rep = int(td.get('colspan', 1))
    pops.extend([td.text] * rep)

df['pop'] = pops

### 呈現

In [12]:
# Display the assembled forecast table.
df

Unnamed: 0,record_t,weekday,wx,t,at,beaufort,wind_dir,rh,pop,ci
0,2017-08-24 18:00,四,多雲,32,37,2,西北風,80%,10%,悶熱
1,2017-08-24 21:00,四,多雲,31,36,<=1,偏南風,85%,10%,悶熱
2,2017-08-25 00:00,五,多雲,30,35,<=1,西北風,89%,20%,悶熱
3,2017-08-25 03:00,五,多雲,29,35,<=1,偏南風,92%,20%,悶熱
4,2017-08-25 06:00,五,晴天,28,33,<=1,偏東風,88%,20%,悶熱
5,2017-08-25 09:00,五,晴天,32,36,2,偏東風,68%,20%,悶熱
6,2017-08-25 12:00,五,午後短暫雷陣雨,35,42,2,偏東風,80%,30%,悶熱
7,2017-08-25 15:00,五,午後短暫雷陣雨,34,41,<=1,偏東風,80%,30%,悶熱
8,2017-08-25 18:00,五,晴天,32,37,<=1,偏南風,72%,10%,悶熱
9,2017-08-25 21:00,五,多雲,31,36,<=1,西南風,87%,10%,悶熱


### 存檔

In [13]:
import os

# Write the forecast table to CSV without the dataframe index.
out_csv = './csv/daan_3hr.csv'
# to_csv does not create missing parent directories, so make sure the target
# folder exists; exist_ok keeps re-runs of this cell from failing.
os.makedirs(os.path.dirname(out_csv), exist_ok=True)
df.to_csv(out_csv, sep=',', encoding='utf-8', index=False)