# 下載 ETC M06A 資料
<a href="http://www.freeway.gov.tw/UserFiles/File/TIMCCC/TDCS%E4%BD%BF%E7%94%A8%E6%89%8B%E5%86%8A(tanfb)v3.0-1.pdf">國道高速公路電子收費交通資料蒐集支援系統(Traffic Data Collection System,TDCS)使用手冊</a>



In [None]:
from urllib.request import urlopen, urlretrieve
import tqdm

# 基本的資料

In [1]:
# Base URL of the historical M06A data directory
data_baseurl = "http://tisvcloud.freeway.gov.tw/history/TDCS/M06A/"

# Formatter for the name of a daily archive; call with year=, month=, day=
filename_format = "M06A_{year:04d}{month:02d}{day:02d}.tar.gz".format

# Formatter for the path of an hourly csv file; call with year=, month=, day=, hour=
csv_format = (
    "M06A/{year:04d}{month:02d}{day:02d}/{hour:02d}/"
    "TDCS_M06A_{year:04d}{month:02d}{day:02d}_{hour:02d}0000.csv"
).format

In [63]:
# Download helpers.
# With ipywidgets installed, tqdm.tqdm can be swapped for tqdm.tqdm_notebook
# to get a progress bar that looks more at home in a notebook.


def download_req(req, filename):
    """Stream an open HTTP response into a local file, showing a progress bar.

    Args:
        req: Open response object (as returned by ``urlopen``); its
            ``getheader`` and ``read`` methods are used.
        filename: Path of the file to write. Also used as the bar's label.
    """
    # The server may not send Content-Length (e.g. chunked transfer).
    # tqdm accepts total=None and then shows a running byte count only.
    content_length = req.getheader("Content-Length")
    total = int(content_length) if content_length is not None else None
    # Progress bar settings: byte units with automatic K/M/G scaling
    tqdm_conf = dict(total=total, desc=filename, unit='B', unit_scale=True)
    with tqdm.tqdm(**tqdm_conf) as pbar, open(filename, 'wb') as f:
        # Read 8192 bytes at a time until read() returns b"" (end of stream)
        for data in iter(lambda: req.read(8192), b""):
            # f.write returns the number of bytes written; advance the bar by it
            pbar.update(f.write(data))
                
def download_M06A(year, month, day):
    """Download the M06A archive for one date, saved under the archive's own name.

    Args:
        year, month, day: Integers identifying the date of the archive.
    """
    # The archive name is derived from the date and doubles as the local file name
    archive_name = filename_format(year=year, month=month, day=day)
    archive_url = data_baseurl + archive_name
    # Open the connection and hand the response to the streaming helper
    with urlopen(archive_url) as response:
        download_req(response, archive_name)


In [64]:
# Example: fetch the archive for 2016-12-18
download_M06A(year=2016, month=12, day=18)

M06A_20161218.tar.gz: 100%|██████████| 154M/154M [04:44<00:00, 540KB/s]  


In [16]:
# urlretrieve can do the download as well.
# The hook below is adapted from the tqdm examples.
filename = filename_format(year=2015, month=6, day=26)
with tqdm.tqdm(desc=filename, unit='B', unit_scale=True) as pbar:
    def tqdmhook(b, bsize, tsize):
        """Report hook for urlretrieve.

        Args:
            b: Number of blocks transferred so far.
            bsize: Size of one block in bytes.
            tsize: Total size in bytes, or -1 when unknown.
        """
        if tsize != -1:
            pbar.total = tsize
        # Advance the bar to the absolute position b * bsize. Using pbar.n
        # avoids a separate "last block" counter, which at cell/module level
        # cannot be declared nonlocal (that is a SyntaxError).
        pbar.update(b * bsize - pbar.n)
    urlretrieve(data_baseurl+filename, filename=filename, reporthook=tqdmhook)

M06A_20150626.tar.gz: 142MB [03:16, 722KB/s]                             


反過來由檔名找日期，可以用 regexp 或者 datetime

In [33]:
import re
# Raw string so that \d reaches the regex engine untouched; the dots are
# escaped so they match a literal "." rather than any character.
m = re.match(r"M06A_(\d{4})(\d\d)(\d\d)\.tar\.gz", "M06A_20170103.tar.gz")
# The three groups are the year, month and day as strings
m.groups()

('2017', '01', '03')

In [34]:
import datetime
# strptime parses the date straight out of the file name: the fixed parts of
# the name are written as literal text in the format string around %Y%m%d.
datetime.datetime.strptime("M06A_20170103.tar.gz", "M06A_%Y%m%d.tar.gz")

datetime.datetime(2017, 1, 3, 0, 0)

抓所有的壓縮檔案

In [44]:
# Parse the directory listing with BeautifulSoup4
from bs4 import BeautifulSoup

# Fetch the directory index page
with urlopen(data_baseurl) as req:
    data = req.read()
# Parse the index page
soup = BeautifulSoup(data, "html.parser")
# Collect the href of every <a> tag that has one (the set drops duplicates)
files = {x.attrs['href'] for x in soup.find_all('a') if 'href' in x.attrs}

# Keep only hrefs that begin with M06A_YYYYMMDD.tar.gz and extract the date.
# A plain startswith("M06A_") / endswith(".tar.gz") filter would select the
# archives too, but the regex captures year, month and day at the same time.
# Raw string so \d is not read as a string escape; dots escaped to match ".".
re_M06A_tgz = re.compile(r"M06A_(\d{4})(\d\d)(\d\d)\.tar\.gz")
files = (re_M06A_tgz.match(x) for x in files)
# Non-matching hrefs yield None and are dropped here
files = [x.groups() for x in files if x]
files[:10]

[('2016', '06', '12'),
 ('2015', '02', '20'),
 ('2016', '10', '31'),
 ('2016', '07', '07'),
 ('2015', '01', '18'),
 ('2016', '09', '01'),
 ('2016', '08', '17'),
 ('2015', '12', '05'),
 ('2015', '10', '19'),
 ('2017', '01', '04')]

In [17]:
# 結合上面來抓所有的資料
for y,m,d in files:
    download_M06A(int(y), int(m), int(d))

M06A_20161219.tar.gz: 100%|██████████| 136M/136M [00:11<00:00, 11.4MB/s]
M06A_20150419.tar.gz: 100%|██████████| 132M/132M [00:11<00:00, 11.6MB/s]
M06A_20161016.tar.gz: 100%|██████████| 151M/151M [00:15<00:00, 9.97MB/s]
M06A_20160914.tar.gz: 100%|██████████| 123M/123M [00:10<00:00, 11.6MB/s]
M06A_20151209.tar.gz: 100%|██████████| 126M/126M [00:11<00:00, 11.4MB/s]
M06A_20150724.tar.gz: 100%|██████████| 142M/142M [00:12<00:00, 11.1MB/s]
M06A_20150212.tar.gz: 100%|██████████| 135M/135M [00:11<00:00, 11.6MB/s]
M06A_20151024.tar.gz: 100%|██████████| 145M/145M [00:15<00:00, 9.60MB/s]
M06A_20160316.tar.gz: 100%|██████████| 130M/130M [00:12<00:00, 10.7MB/s]
M06A_20160516.tar.gz: 100%|██████████| 131M/131M [00:13<00:00, 9.67MB/s]
M06A_20161018.tar.gz: 100%|██████████| 134M/134M [00:11<00:00, 11.5MB/s]
M06A_20150627.tar.gz: 100%|██████████| 148M/148M [00:13<00:00, 11.4MB/s]
M06A_20160125.tar.gz: 100%|██████████| 136M/136M [00:14<00:00, 9.26MB/s]
M06A_20160206.tar.gz: 100%|██████████| 141M/141M [0

### 將 .tar.gz 重新打包成 .tar.xz

In [62]:
import glob
import lzma
import gzip
import os
import os.path

# Create the output directory "xz"; exist_ok=True makes this a no-op if it already exists
os.makedirs("xz", exist_ok=True)

def repack(filename):
    """Recompress a gzip file into the ``xz`` directory as an xz file.

    The output keeps the input's base name with the trailing ``gz`` replaced
    by ``xz``. If that output file already exists, the input is skipped.

    Args:
        filename: Path of the source file; must end with ``gz``.
    """
    # The source name must end with "gz"
    assert filename.endswith("gz")
    # Size of the compressed input, used as the progress bar total
    length = os.path.getsize(filename)
    # Output name is derived from the argument itself, not from a global
    # loop variable of the caller.
    xzfn = os.path.join("xz/", os.path.split(filename)[-1][:-2] + "xz")
    # Never overwrite an existing output file
    if os.path.isfile(xzfn):
        print("skip", filename)
        return
    # Write to a temporary name first so that an interrupted run cannot leave
    # a truncated .xz that the check above would later treat as finished.
    partial = xzfn + ".part"
    # Open both files and the progress bar; the lzma preset may be 0-9
    with gzip.open(filename, 'r') as gzfile, \
         lzma.open(partial, "w", preset=1) as xzfile, \
         tqdm.tqdm(total=length, desc=filename, unit='B', unit_scale=True) as pbar:
        # Decompress from the .gz in chunks of up to 1 MiB
        for data in iter(lambda: gzfile.read(1024*1024), b""):
            # Recompress the chunk into the .xz
            xzfile.write(data)
            # Progress is measured by the read position in the compressed
            # input, which is what `length` refers to.
            pbar.update(gzfile.fileobj.tell() - pbar.n)
    # Only a fully written file gets the final name
    os.replace(partial, xzfn)

# Find the files matching M06A_201612*.gz in the current directory and recompress them one by one
for f in glob.glob("M06A_201612*.gz"):
    repack(f)

M06A_20161214.tar.gz: 100%|██████████| 134M/134M [00:33<00:00, 4.03MB/s] 
M06A_20161215.tar.gz: 100%|██████████| 137M/137M [00:34<00:00, 4.04MB/s] 
M06A_20161212.tar.gz: 100%|██████████| 140M/140M [00:35<00:00, 3.98MB/s] 
M06A_20161213.tar.gz: 100%|██████████| 135M/135M [00:33<00:00, 3.98MB/s] 
M06A_20161217.tar.gz: 100%|██████████| 154M/154M [00:38<00:00, 3.95MB/s] 
