# Craw TXF

In [94]:
import requests, zipfile, io, datetime, time

In [90]:
def craw(date, timeout=30):
    """Download the TAIFEX daily zip for ``date`` and unpack it to disk.

    The archive is fetched with a GET request and every member is extracted
    into ``data/txf_daily_csv``. Connection failures and timeouts are printed
    and retried indefinitely; a body that is not a valid zip file is printed
    and skipped (the original author treats this as "no data for that day").

    Args:
        date: Object exposing ``year``, ``month`` and ``day`` (e.g. a
            ``datetime.date``) that selects which daily archive to fetch.
        timeout: Seconds to wait for the server before the attempt is
            abandoned and retried.

    Returns:
        The ``requests`` response of the last attempt, so a caller can
        inspect or reuse it without downloading again.
    """
    url = (
        'https://www.taifex.com.tw/DailyDownload/DailyDownloadCSV/'
        'Daily_{:04d}_{:02d}_{:02d}.zip'
    ).format(date.year, date.month, date.day)

    # Keep trying until the server answers.
    while True:
        # Pause before every attempt, including the first, to throttle requests.
        time.sleep(5)
        try:
            response = requests.get(url, timeout=timeout)
            break
        except (requests.exceptions.ConnectionError,
                requests.exceptions.Timeout,
                ConnectionRefusedError) as e:
            # requests wraps socket-level failures in its own exception types,
            # so the builtin ConnectionRefusedError alone would never match.
            print(date, e)

    try:
        # response.content is raw bytes; wrap it in BytesIO so zipfile can
        # read the archive straight from memory.
        with zipfile.ZipFile(io.BytesIO(response.content)) as z:
            z.extractall('data/txf_daily_csv')
    except zipfile.BadZipFile as e:
        # Not a zip archive; presumably no data was published for this date.
        print(date, e)
    else:
        print(date, 'successed.')
    return response

In [93]:
def main():
    """Crawl the daily archives for today and the 29 days before it."""
    # The original author notes that the exchange keeps only about 30 days of
    # data, so the look-back window should not exceed 30.
    # Read the current date once so every offset is relative to the same day.
    today = datetime.date.today()
    # A plain loop is used because craw() is called for its side effects;
    # the responses are not needed here and are not kept in memory.
    for offset in range(30):
        craw(today - datetime.timedelta(offset))
    print('Finished.')

In [92]:
# Start the crawl only when this file is executed directly, not when imported.
if __name__ == '__main__':
    main()

2018-09-17 successed.
2018-09-16 File is not a zip file
2018-09-15 File is not a zip file
2018-09-14 successed.
2018-09-13 successed.
2018-09-12 successed.
2018-09-11 successed.
2018-09-10 successed.
2018-09-09 File is not a zip file
2018-09-08 File is not a zip file
2018-09-07 successed.
2018-09-06 successed.
2018-09-05 successed.
2018-09-04 successed.
2018-09-03 successed.
2018-09-02 File is not a zip file
2018-09-01 File is not a zip file
2018-08-31 successed.
2018-08-30 successed.
2018-08-29 successed.
2018-08-28 successed.
2018-08-27 successed.
2018-08-26 File is not a zip file
2018-08-25 File is not a zip file
2018-08-24 successed.
2018-08-23 successed.
2018-08-22 successed.
2018-08-21 successed.
2018-08-20 successed.
2018-08-19 File is not a zip file


到這邊, 這個爬蟲程式就可以抓對應的zip檔, 並將之解壓縮到指定目錄.  
為了能夠在命令列視窗中執行, 我們把這些程式碼移植到`CrawTXF.py`中.  

使用內建的`argparse`套件, 這能夠幫助我們傳入參數.  
在程式一開始先`import argparse`, 再新增如下的程式碼:

```python
parser = argparse.ArgumentParser()
parser.add_argument('-n', dest='n_days')
args = parser.parse_args()
```

透過上面的程式碼就可以在命令列視窗中帶入`-n`的參數.  
接著修改`main()`如下:  

```python
def main():
    if args.n_days == None:
        n_days = 30
    else:
        n_days = int(args.n_days)
    days = [datetime.date.today() - datetime.timedelta(i) for i in range(n_days)]
    [craw(date) for date in days]
    print('Finished.')
```

接著我們就可以在命令列中輸入以下命令來執行程式:  
`python3 CrawTXF.py -n 1`  
成功執行之後, 就可以用排程每天抓取當天的資料; 若有缺漏, 也可以不帶`-n`參數手動執行, 回補最近30天的資料.