# 附錄 -- 下載資料

In [1]:
import pandas as pd
import numpy as np
import requests as rq
import zipfile as zp
import os
import io
import pathlib

---
## Youbike 相關

### Youbike OD 資料

- 時間範圍： 2021 到 2023， 01 到 12 月，所有起訖資料
- 資料格式
    - json
        - 所有月份資料 .zip 網址
            - 該月份起迄資料 .csv

政府開放資料網站下載到的資料為 json 格式，裡面有個個月份的資料網址，格式為 zip

In [5]:

YB_OB_COLLECTION_URL = "https://quality.data.gov.tw/dq_download_json.php?nid=150635&md5_url=e51f13f6f3f14df40f17c175161be3cc"

all_youbike_od_collection_list:list[dict] = \
    rq.get(YB_OB_COLLECTION_URL).json()


In [12]:
[d['fileURL'] for d in all_youbike_od_collection_list]

['https://tcgbusfs.blob.core.windows.net/dotapp/youbike_second_ticket_opendata/2023/2023-02/202302_YouBike2.0票證刷卡資料.zip',
 'https://tcgbusfs.blob.core.windows.net/dotapp/youbike_second_ticket_opendata/2023/2023-01/202201_YouBike2.0票證刷卡資料.zip',
 'https://tcgbusfs.blob.core.windows.net/dotapp/youbike_second_ticket_opendata/2022/2022-12/202212_YouBike2.0票證刷卡資料.zip',
 'https://tcgbusfs.blob.core.windows.net/dotapp/youbike_second_ticket_opendata/2022/2022-11/202211_YouBike2.0票證刷卡資料.zip',
 'https://tcgbusfs.blob.core.windows.net/dotapp/youbike_second_ticket_opendata/2022/2022-10/202210_YouBike2.0票證刷卡資料.zip',
 'https://tcgbusfs.blob.core.windows.net/dotapp/youbike_second_ticket_opendata/2022/2022-09/202209_YouBike2.0票證刷卡資料.zip',
 'https://tcgbusfs.blob.core.windows.net/dotapp/youbike_second_ticket_opendata/2022/2022-08/202208_YouBike2.0票證刷卡資料.zip',
 'https://tcgbusfs.blob.core.windows.net/dotapp/youbike_second_ticket_opendata/2022/2022-07/202207_YouBike2.0票證刷卡資料.zip',
 'https://tcgbusfs.blob.

In [2]:
youbike_data_path = pathlib.Path('../DATA') / "Youbike"

In [23]:
all_youbike_csv_files = [f for f in youbike_data_path.glob("*.csv")]
youbike_OD_combined = pd.concat(
    [pd.read_csv(f) for f in all_youbike_csv_files]
)


In [28]:
youbike_OD_combined.to_parquet(youbike_data_path / "yb.pq")

In [63]:
all_df = pd.read_parquet(youbike_data_path / "yb.pq")

In [65]:
#remove na
all_df = all_df.dropna()

In [61]:
for name in all_df.sort_values('rent_station')['rent_station'].unique():
    print(name)

六福公園
崇仰公園(公舘路255巷)
東新國小
瑠公公園
糖廍文化園區
nan


In [55]:
have_special_char = all_df.rent_station.str.contains(r'[\.\?]', na=True)

for name in all_df[have_special_char]['rent_station'].unique():
    print(name)

In [75]:
strang_station_name_map = {
"糖?文化園區": "糖廍文化園區",
"?公公園": "瑠公公園",
"東新國小.": "東新國小",
"六福公園.": "六福公園",
"崇仰公園(公?路255巷)": "崇仰公園(公舘路255巷)",
"瓦?溝(福真里)" : "瓦磘溝(福真里)",
"瓦?截流站" : "瓦磘截流站",
"?寮公園" : "獇寮公園",
}

In [76]:
all_df.rent_station = all_df.rent_station.replace(strang_station_name_map)
all_df.return_station = all_df.return_station.replace(strang_station_name_map)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_df.rent_station = all_df.rent_station.replace(strang_station_name_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_df.return_station = all_df.return_station.replace(strang_station_name_map)


In [77]:

have_special_char = (all_df.rent_station.str.contains(r'[\.\?]', na=True)) | (all_df.return_station.str.contains(r'[\.\?]', na=True))

for name in all_df[have_special_char]['rent_station'].unique():
    print(name)

for name in all_df[have_special_char]['return_station'].unique():
    print(name)

In [78]:
all_df[have_special_char]

Unnamed: 0,rent_time,rent_station,return_time,return_station,rent,infodate


In [79]:
all_df.to_parquet(youbike_data_path / "yb.pq")

### Youbike 座標以及可用車位

In [2]:
YB_STATION_INFO_URL = "https://tcgbusfs.blob.core.windows.net/dotapp/youbike/v2/youbike_immediate.json"

yb_station_info = rq.get(YB_STATION_INFO_URL).json()

In [10]:
yb_station_df = pd.DataFrame(yb_station_info)
yb_station_df.sna = yb_station_df.sna.str.replace("YouBike2.0_", '')

  yb_station_df.sna = yb_station_df.sna.str.replace("YouBike2.0_", '')


In [11]:
yb_station_df

Unnamed: 0,sno,sna,tot,sbi,sarea,mday,lat,lng,ar,sareaen,snaen,aren,bemp,act,srcUpdateTime,updateTime,infoTime,infoDate
0,500101001,捷運科技大樓站,28,1,大安區,2023-09-03 17:31:05,25.02605,121.54360,復興南路二段235號前,Daan Dist.,YouBike2.0_MRT Technology Bldg. Sta.,No.235， Sec. 2， Fuxing S. Rd.,27,1,2023-09-03 17:31:39,2023-09-03 17:31:51,2023-09-03 17:31:05,2023-09-03
1,500101002,復興南路二段273號前,21,5,大安區,2023-09-03 17:31:05,25.02565,121.54357,復興南路二段273號西側,Daan Dist.,YouBike2.0_No.273， Sec. 2， Fuxing S. Rd.,No.273， Sec. 2， Fuxing S. Rd. (West),16,1,2023-09-03 17:31:39,2023-09-03 17:31:51,2023-09-03 17:31:05,2023-09-03
2,500101003,國北教大實小東側門,16,2,大安區,2023-09-03 17:31:05,25.02429,121.54124,和平東路二段96巷7號,Daan Dist.,YouBike2.0_NTUE Experiment Elementary School (...,No. 7， Ln. 96， Sec. 2， Heping E. Rd,14,1,2023-09-03 17:31:39,2023-09-03 17:31:51,2023-09-03 17:31:05,2023-09-03
3,500101004,和平公園東側,11,11,大安區,2023-09-03 17:31:05,25.02351,121.54282,和平東路二段118巷33號,Daan Dist.,YouBike2.0_Heping Park (East),No. 33， Ln. 118， Sec. 2， Heping E. Rd,0,1,2023-09-03 17:31:39,2023-09-03 17:31:51,2023-09-03 17:31:05,2023-09-03
4,500101005,辛亥復興路口西北側,16,16,大安區,2023-09-03 17:31:05,25.02153,121.54299,復興南路二段368號,Daan Dist.,YouBike2.0_Xinhai Fuxing Rd. Intersection (Nor...,No. 368， Sec. 2， Fuxing S. Rd.,0,1,2023-09-03 17:31:39,2023-09-03 17:31:51,2023-09-03 17:31:05,2023-09-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,500119087,臺大總圖書館西南側,30,23,臺大公館校區,2023-09-03 17:31:14,25.01690,121.54031,臺大圖書館西南側,NTU Dist,YouBike2.0_NTU Main Library(Southwest),NTU Main Library(Southwest),7,1,2023-09-03 17:31:39,2023-09-03 17:31:51,2023-09-03 17:31:14,2023-09-03
1305,500119088,臺大黑森林西側,20,5,臺大公館校區,2023-09-03 17:31:14,25.01995,121.54347,臺大霖澤館南側,NTU Dist,YouBike2.0_NTU Black Forest(West),NTU Tsai Lecture Hall(South),15,1,2023-09-03 17:31:39,2023-09-03 17:31:51,2023-09-03 17:31:14,2023-09-03
1306,500119089,臺大獸醫館南側,24,5,臺大公館校區,2023-09-03 17:31:14,25.01791,121.54242,臺大獸醫系館南側,NTU Dist,YouBike2.0_NTU Dept. of Veterinary Medicine(So...,NTU Dept. of Veterinary Medicine(South),19,1,2023-09-03 17:31:39,2023-09-03 17:31:51,2023-09-03 17:31:14,2023-09-03
1307,500119090,臺大新體育館東南側,40,19,臺大公館校區,2023-09-03 17:31:14,25.02112,121.53591,臺大體育館東側,NTU Dist,YouBike2.0_NTU Sports Center(Southeast),NTU Sports Center(East),21,1,2023-09-03 17:31:39,2023-09-03 17:31:51,2023-09-03 17:31:14,2023-09-03


In [71]:
yb_station_df.to_csv("../DATA/Youbike/TempYoubikeInfo.csv")

看看有多少站，在 OD 但不在 info裏面

In [74]:
yb_all_od = pd.read_parquet("../DATA/Youbike/yb.pq")
stations_in_OD = set(yb_all_od['rent_station'])
stations_in_info = set(yb_station_df['sna'])

In [75]:
no_info_stations = stations_in_OD - stations_in_info
no_info_stations

{'3樓客服中心',
 'YB_GTS測試',
 '一號船渠景觀橋',
 '七賢洛陽街口',
 '七賢自強路口東北側',
 '三民公園南側',
 '三民國小',
 '中國醫藥大學附設醫院臺北分�',
 '中山五福路口西北側',
 '中山青年路口西北側',
 '中林測試站',
 '中正和平路口西北側',
 '中正瑞源路口西南側',
 '中正高工(汕頭街口)',
 '中興中正路口',
 '中華復興路口(西南側)',
 '中華興中路口(東南角)',
 '信義服務中心',
 '僑安地下停車場(2號出口)東南�',
 '元氣大鎮社區',
 '凹子底森林公園(明誠三路側)',
 '凹子底森林公園(龍德路側)',
 '博愛重立路口南側',
 '台北市政府',
 '台北流行音樂中心(南港路)',
 '台北流行音樂中心(市民大道)',
 '台北醫學大學',
 '台鋁廣場',
 '和興街26巷口',
 '四維行政中心(四維三路)',
 '堯山延慶街口',
 '大裕昌富街口',
 '天母國中',
 '天母東路8巷/忠誠路二段154巷�',
 '安康路34巷24弄',
 '崇光女中',
 '延平北平路口',
 '建國同愛街口西北側',
 '捷運中央公園站(3號出口)',
 '捷運信義國小站(5號出口)',
 '捷運凹子底站(4號出口)',
 '捷運北投站(1號出口)(北投路側',
 '捷運台北小巨蛋站(2號出口)',
 '捷運善導寺站(3號出口)(忠孝東',
 '捷運善導寺站(3號出口)(忠孝路',
 '捷運市議會站(2號出口)',
 '捷運後驛站(1號出口北側)',
 '捷運後驛站(3號出口)',
 '捷運技擊館站(1號出口)',
 '捷運獅甲站(4號出口)',
 '捷運生態園區站(2號出口)',
 '捷運科技大樓站(台北教育大學',
 '捷運美麗島站(10號出口)',
 '捷運美麗島站(2號出口)',
 '捷運美麗島站(5號出口)',
 '捷運萬芳醫院站(興隆路三段115',
 '捷運鳳山站(1號出口)',
 '捷運鹽埕埔站(1號出口)',
 '文山兒童四公園',
 '新光自強路口',
 '新生南路一段113號',
 '新生高架停車場(林森北路107巷',
 '新興公園(昆陽街60巷)',
 '新興高中',
 '星雲金湖街口',
 '時代大道中華路口(西北側)',
 '板橋真武廟',


... 清一下吧@@

- 台北 -> 臺北
- 捷運台北小巨蛋 -> 捷運小巨蛋
- 有`(` 但沒有`)`
- 中國醫藥大學附設醫院臺北分� -> 中國醫藥大學附設醫院臺北分院
- 僑安地下停車場(2號出口)東南� -> 僑安地下停車場(2號出口)東南側
-  '捷運善導寺站(3號出口)(忠孝東',
- '捷運善導寺站(3號出口)(忠孝路',

先找最近的

In [51]:
import difflib

def get_closest_station(what_station):
    try:
        return difflib.get_close_matches(what_station, stations_in_info, 1, 0.6)[0]
    except:
        return None


best_match = {what_station: get_closest_station(what_station) for what_station in no_info_stations }


In [52]:
best_match

{'中林測試站': None,
 '羅斯福路三段333巷9號旁': '羅斯福路三段311號前',
 '獅甲社區(興仁國中棒球場)': None,
 '捷運台北小巨蛋站(2號出口)': '捷運小巨蛋站(2號出口)',
 '臺大卓越維修中心': None,
 '科工館車站': None,
 '新生高架停車場(林森北路107巷': '新生高架停車場(林森北路107巷口)',
 '蘆洲維修中心': None,
 '僑安地下停車場(2號出口)東南�': '僑安地下停車場(2號出口)東南側',
 '時代大道中華路口(西北側)': None,
 '石牌區民活動中心': '黎忠區民活動中心',
 '臺北市立大學(忠誠路二段207巷': '臺北市立大學(忠誠路二段207巷)',
 '捷運鹽埕埔站(1號出口)': '捷運關渡站(1號出口)',
 '中華復興路口(西南側)': '市民復興路口(東南側)',
 '台北流行音樂中心(南港路)': '臺北流行音樂中心(南港路)',
 '中正和平路口西北側': '建國和平路口西北側',
 '天母東路8巷/忠誠路二段154巷�': '天母東路8巷/忠誠路二段154巷口',
 '捷運萬芳醫院站(興隆路三段115': '捷運萬芳醫院站(興隆路三段115巷)',
 '捷運善導寺站(3號出口)(忠孝路': '捷運善導寺站(3號出口)(忠孝東路側)',
 '高雄女中': None,
 '高雄國際會議中心': None,
 '河東路建國三路口': '和平東建國南路口',
 '三民公園南側': '三民公園',
 '捷運信義國小站(5號出口)': '捷運小巨蛋站(5號出口)',
 '台北流行音樂中心(市民大道)': '臺北流行音樂中心(市民大道)',
 '台北醫學大學': '臺北醫學大學',
 '星雲金湖街口': '金湖星雲街口',
 '裕誠富民路口東北側': '民權光復路口(東北側)',
 '和興街26巷口': '和興路26巷口',
 '臺北魚市': '臺北花市',
 '高雄科技大學東北側': None,
 '濱海臨海路口(西南側)': '新生民族路口(西南側)',
 '捷運後驛站(3號出口)': '捷運後山埤站(3號出口)',
 '捷運美麗島站(10號出口)': '捷運關渡站(1號出口)',
 '銘傳大學(中山北路五段280巷口': '銘傳大學(中山北路五段2

先輸出，人工編輯

In [53]:
import yaml

with open('../DATA/Youbike/temp_change_name.yaml', 'w') as f:
    yaml.dump(best_match, f, encoding='utf-8', allow_unicode=True)

In [20]:
yb_all_od[yb_all_od.rent_station.isin(no_info_stations)]

Unnamed: 0,rent_time,rent_station,return_time,return_station,rent,infodate
49,2021-07-30 09:00:00,僑安地下停車場(2號出口)東南�,2021-07-30 09:00:00,信義延吉街口,00:04:48,2021-07-30
209,2021-07-30 18:00:00,羅斯福路三段333巷9號旁,2021-07-30 18:00:00,臺大椰林小舖,00:04:19,2021-07-30
238,2021-07-30 01:00:00,僑安地下停車場(2號出口)東南�,2021-07-30 05:00:00,景勤二號公園,04:22:58,2021-07-30
479,2021-07-07 18:00:00,僑安地下停車場(2號出口)東南�,2021-07-07 18:00:00,臨江街夜市(通化街101巷口),00:13:00,2021-07-07
487,2021-07-07 18:00:00,辛亥基隆路口,2021-07-07 18:00:00,和平臥龍街口,00:10:39,2021-07-07
...,...,...,...,...,...,...
250999,2021-04-21 12:00:00,臺大仰萃樓東南側,2021-04-21 12:00:00,臺大總圖書館西南側,00:06:03,2021-04-21
251006,2021-04-21 12:00:00,羅斯福路三段333巷9號旁,2021-04-21 12:00:00,臺大鹿鳴堂東側,00:05:44,2021-04-21
251011,2021-04-21 12:00:00,臺大仰萃樓東南側,2021-04-21 12:00:00,臺大樂學館東側,00:06:08,2021-04-21
251019,2021-04-21 12:00:00,臺大仰萃樓東南側,2021-04-21 12:00:00,臺大管理學院二館北側,00:09:30,2021-04-21


In [65]:
query_text = "辛亥路五段73巷口"
yb_all_od[yb_all_od.rent_station.isin(no_info_stations)].query(f"rent_station.str.contains('{query_text}')")#.query("rent_time < '2021-07-31'")

Unnamed: 0,rent_time,rent_station,return_time,return_station,rent,infodate
128419,2022-10-23 06:00:00,辛亥路五段73巷口,2022-10-23 06:00:00,捷運辛亥站,00:31:12,2022-10-23
128420,2022-10-23 07:00:00,辛亥路五段73巷口,2022-10-23 08:00:00,捷運大坪林站(1號出口),00:17:03,2022-10-23
128421,2022-10-23 19:00:00,辛亥路五段73巷口,2022-10-23 19:00:00,捷運辛亥站,00:02:42,2022-10-23
128422,2022-10-23 21:00:00,辛亥路五段73巷口,2022-10-23 21:00:00,辛亥路五段73巷口,00:06:52,2022-10-23
128423,2022-10-23 22:00:00,辛亥路五段73巷口,2022-10-23 23:00:00,木柵光輝路口,00:06:23,2022-10-23
...,...,...,...,...,...,...
2576872,2023-01-19 06:00:00,辛亥路五段73巷口,2023-01-19 07:00:00,捷運辛亥站,00:04:55,2023-01-19
2576873,2023-01-19 07:00:00,辛亥路五段73巷口,2023-01-19 07:00:00,捷運辛亥站,00:04:57,2023-01-19
2576874,2023-01-19 07:00:00,辛亥路五段73巷口,2023-01-19 07:00:00,捷運萬芳醫院站(興隆路三段115,00:05:09,2023-01-19
2576875,2023-01-19 07:00:00,辛亥路五段73巷口,2023-01-19 07:00:00,捷運辛亥站,00:06:54,2023-01-19


有一些站點被移除，但其實很直得探討。我將手動增加資訊
- 辛亥路五段73巷口
- 機場捷運台北車站(6號出口)

In [3]:
import yaml
with open('../DATA/Youbike/temp_change_name＿changed.yaml', 'r') as f:
    manual_mapping_stations = yaml.safe_load(f)

In [4]:
manual_mapping_stations

{'中國醫藥大學附設醫院臺北分�': '中國醫藥大學附設醫院臺北分院',
 '僑安地下停車場(2號出口)東南�': '僑安地下停車場(2號出口)東南側',
 '台北市政府': '臺北市政府',
 '台北流行音樂中心(南港路)': '臺北流行音樂中心(南港路)',
 '台北流行音樂中心(市民大道)': '臺北流行音樂中心(市民大道)',
 '台北醫學大學': '臺北醫學大學',
 '和興街26巷口': '和興路26巷口',
 '天母東路8巷/忠誠路二段154巷�': '天母東路8巷/忠誠路二段154巷口',
 '安康路34巷24弄': '安康路32巷24弄',
 '捷運北投站(1號出口)(北投路側': '捷運北投站(1號出口)(北投路側)',
 '捷運台北小巨蛋站(2號出口)': '捷運小巨蛋站(2號出口)',
 '捷運善導寺站(3號出口)(忠孝東': '捷運善導寺站(3號出口)(忠孝東路側)',
 '捷運善導寺站(3號出口)(忠孝路': '捷運善導寺站(3號出口)(忠孝東路側)',
 '捷運科技大樓站(台北教育大學': '捷運科技大樓站(台北教育大學)',
 '捷運萬芳醫院站(興隆路三段115': '捷運萬芳醫院站(興隆路三段115巷)',
 '新生高架停車場(林森北路107巷': '新生高架停車場(林森北路107巷口)',
 '新興公園(昆陽街60巷)': '新新公園(昆陽街60巷)',
 '星雲金湖街口': '金湖星雲街口',
 '民善新湖二路口(家樂福內湖店': '民善新湖二路口(家樂福內湖店)',
 '民權東龍江路口': '民權龍江路口',
 '環南市場': '環南綜合市場',
 '臺北市立大學(忠誠路二段207巷': '臺北市立大學(忠誠路二段207巷)',
 '臺大仰萃樓東南側': '臺大禮賢樓東南側',
 '萬華國中': '萬華國中_1',
 '辛亥基隆路口': '臺大土木研究大樓前',
 '銘傳大學(中山北路五段280巷口': '銘傳大學(中山北路五段280巷口)',
 '糖?文化園區': '糖廍文化園區',
 '?公公園': '瑠公公園',
 '東新國小.': '東新國小',
 '六福公園.': '六福公園',
 '崇仰公園(公?路255巷)': '崇仰公園(公舘路255巷)',
 '瓦?溝(福真里)': '瓦磘溝(福真里)',
 '瓦?截流站':

In [80]:
yb_all_od.rent_station = yb_all_od.rent_station.replace(manual_mapping_stations)
yb_all_od.return_station = yb_all_od.return_station.replace(manual_mapping_stations)

In [81]:
query_text = "辛亥路五段73巷口"
yb_all_od[yb_all_od.rent_station.isin(no_info_stations)].query(f"rent_station.str.contains('{query_text}')")#.query("rent_time < '2021-07-31'")

Unnamed: 0,rent_time,rent_station,return_time,return_station,rent,infodate
128419,2022-10-23 06:00:00,辛亥路五段73巷口,2022-10-23 06:00:00,捷運辛亥站,00:31:12,2022-10-23
128420,2022-10-23 07:00:00,辛亥路五段73巷口,2022-10-23 08:00:00,捷運大坪林站(1號出口),00:17:03,2022-10-23
128421,2022-10-23 19:00:00,辛亥路五段73巷口,2022-10-23 19:00:00,捷運辛亥站,00:02:42,2022-10-23
128422,2022-10-23 21:00:00,辛亥路五段73巷口,2022-10-23 21:00:00,辛亥路五段73巷口,00:06:52,2022-10-23
128423,2022-10-23 22:00:00,辛亥路五段73巷口,2022-10-23 23:00:00,木柵光輝路口,00:06:23,2022-10-23
...,...,...,...,...,...,...
2576872,2023-01-19 06:00:00,辛亥路五段73巷口,2023-01-19 07:00:00,捷運辛亥站,00:04:55,2023-01-19
2576873,2023-01-19 07:00:00,辛亥路五段73巷口,2023-01-19 07:00:00,捷運辛亥站,00:04:57,2023-01-19
2576874,2023-01-19 07:00:00,辛亥路五段73巷口,2023-01-19 07:00:00,捷運萬芳醫院站(興隆路三段115巷),00:05:09,2023-01-19
2576875,2023-01-19 07:00:00,辛亥路五段73巷口,2023-01-19 07:00:00,捷運辛亥站,00:06:54,2023-01-19


## 手動增加一些已移除之站點後，重新 merge

In [85]:
yb_station_info = (pd.read_csv('../DATA/Youbike/TempYoubikeInfo.csv')[["sna", "tot", "lat", "lng", 'ar']]
                   .rename(columns = {
                       "sna":"name", 
                       "tot": "total",
                       "ar": "address"}))

In [86]:
yb_station_info

Unnamed: 0,name,total,lat,lng,address
0,捷運科技大樓站,28,25.02605,121.54360,復興南路二段235號前
1,復興南路二段273號前,21,25.02565,121.54357,復興南路二段273號西側
2,國北教大實小東側門,16,25.02429,121.54124,和平東路二段96巷7號
3,和平公園東側,11,25.02351,121.54282,和平東路二段118巷33號
4,辛亥復興路口西北側,16,25.02153,121.54299,復興南路二段368號
...,...,...,...,...,...
1306,臺大獸醫館南側,24,25.01791,121.54242,臺大獸醫系館南側
1307,臺大新體育館東南側,40,25.02112,121.53591,臺大體育館東側
1308,臺大明達館北側(員工宿舍),18,25.01816,121.54469,明達館北側前空地
1309,辛亥路五段73巷口,23,24.99818,121.55312,已移除。手動增加資訊


In [88]:
yb_station_info.to_parquet("../DATA/Youbike/yb_info.pq")

注意！先不急著把站點不存在的OD刪掉，因為可能起沒有在台北，但迄在台北。這樣會低估還車使用率

# 7-11 座標

In [9]:
import requests as rq
import pandas as pd
from xml.etree import ElementTree

In [18]:
all_taipei_district = [
"松山區", "信義區", "大安區", "中山區", "中正區",
"大同區", "萬華區"," 文山區", "南港區", "內湖區", "士林區", "北投區",]

SEVEN_ELEVEN_API_URL = "https://emap.pcsc.com.tw/EMapSDK.aspx"

def get_dict_from_tree_element(tree: ElementTree.Element):

    def fix_lat_lng(lat_lng_str:str)->str:
        return float(lat_lng_str)/1e6

    return {
        'store_name': tree.find("POIName").text,
        'lat': fix_lat_lng(tree.find("Y").text),     # 2512312
       'lng': fix_lat_lng(tree.find("X").text),       #121353245
    }

def get_all_711_in_dicstrict(district:str) -> pd.DataFrame:
    try:
        data = {
            'commandid': 'SearchStore',
            'city': '台北市',
            'town': district,
        }
        res_xml = rq.post(SEVEN_ELEVEN_API_URL, data = data)
        res_dict = ElementTree.fromstring(res_xml.content)
        all_711_locations = res_dict.findall("GeoPosition")
        return [get_dict_from_tree_element(tree) for tree in all_711_locations]
    except Exception as e:
        print(f"Error in {district}. {e}")

In [21]:
all_711_coord_df = pd.concat(
    [pd.DataFrame(
        get_all_711_in_dicstrict(district)
    )
        for district in all_taipei_district
    ]
)

In [25]:
all_711_coord_df.to_parquet("../DATA/711/all_711_coord.pq", index=False)

In [26]:
pd.read_parquet("../DATA/711/all_711_coord.pq")

Unnamed: 0,store_name,lat,lng
0,上弘,25.056391,121.548287
1,小巨蛋,25.050944,121.549433
2,中崙,25.048396,121.552737
3,北體,25.050888,121.552850
4,台場,25.048086,121.551158
...,...,...,...
898,懷得,25.114096,121.519656
899,關渡,25.121540,121.467483
900,關渡站,25.125037,121.467181
901,鐏賢,25.117453,121.506854


# 人口學變量

各里
- 平均人口
- 收入
    - 平均
    - Q1,2,3,4
- 人口比例
    - 高中生比例 15~18
    - 大學生比例 18~24
    - 24~40
    - 40~65
    - 65~75
    - 75+

In [28]:
import pandas as pd
from pandas_ods_reader import read_ods

In [46]:
tpe_demographics = read_ods("../DATA/Demographic/111臺北市每年人口數依性別及年齡分.ods")
tpe_demographics.columns = tpe_demographics.iloc[0,:]
tpe_demographics.drop(index = 0, axis=0, inplace=True)

In [47]:
tpe_demographics

Unnamed: 0,行政區,里別,性別,總計,合計_0~4歲,0歲,1歲,2歲,3歲,4歲,...,92歲,93歲,94歲,合計_95~99歲,95歲,96歲,97歲,98歲,99歲,100歲以上
1,總計,總計,計,2480681.0,87542.0,13411.0,16694.0,17530.0,19392.0,20515.0,...,3512.0,2979.0,2409.0,5574.0,1882.0,1453.0,1009.0,730.0,500.0,1078.0
2,總計,總計,男,1177022.0,45147.0,6867.0,8572.0,9131.0,9937.0,10640.0,...,1557.0,1323.0,1064.0,2485.0,843.0,643.0,479.0,303.0,217.0,538.0
3,總計,總計,女,1303659.0,42395.0,6544.0,8122.0,8399.0,9455.0,9875.0,...,1955.0,1656.0,1345.0,3089.0,1039.0,810.0,530.0,427.0,283.0,540.0
4,松山區,松山區,計,189939.0,6562.0,955.0,1250.0,1298.0,1459.0,1600.0,...,299.0,263.0,215.0,564.0,211.0,133.0,90.0,74.0,56.0,125.0
5,松山區,松山區,男,88635.0,3432.0,499.0,637.0,704.0,759.0,833.0,...,130.0,114.0,81.0,261.0,92.0,68.0,44.0,31.0,26.0,52.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1403,北投區,大屯里,男,665.0,31.0,6.0,4.0,8.0,5.0,8.0,...,2.0,1.0,0.0,3.0,1.0,1.0,0.0,1.0,0.0,0.0
1404,北投區,大屯里,女,567.0,17.0,5.0,3.0,7.0,1.0,1.0,...,2.0,1.0,1.0,5.0,0.0,3.0,2.0,0.0,0.0,0.0
1405,北投區,湖田里,計,871.0,25.0,5.0,3.0,4.0,6.0,7.0,...,3.0,3.0,1.0,3.0,1.0,1.0,0.0,0.0,1.0,0.0
1406,北投區,湖田里,男,479.0,11.0,1.0,2.0,2.0,3.0,3.0,...,1.0,1.0,0.0,3.0,1.0,1.0,0.0,0.0,1.0,0.0


## 按里統計

In [None]:
def get_aggregate_li(df:pd.DataFrame)->pd.DataFrame:

    is_li_mask = df['里別'].str.contains("里")
    is_agg_gender = df['性別'] == "計"
    return df[(is_li_mask & is_agg_gender)].drop("性別", axis=1)

get_aggregate_li(tpe_demographics)

In [None]:
# 移除合計
def remove_age_interval_level(df:pd.DataFrame)->pd.DataFrame:
    col_with_int_str = df.columns.str.contains("合計")
    return df.loc[:, ~col_with_int_str]

remove_age_interval_level(tpe_demographics)

In [None]:
# to proportion
def to_proportion(df:pd.DataFrame)-> pd.DataFrame:
    # all columns after "總計"
    df = df.copy()
    index_of_total = df.columns.get_loc("總計")
    total_pop = df.iloc[:, index_of_total]
    columns_need_prop = df.columns[index_of_total+1:]
    for column in columns_need_prop:
        df[column] = df[column] / total_pop * 100
    return df

to_proportion(get_aggregate_li(tpe_demographics))

## 根據想要的年齡分層

In [110]:
from typing import Callable


age_boundary = [15, 18, 24, 40, 65, 75]
"""
    0~15, 16~18, 19~~24, 25~40, 41~65, 66~75, 76~+
"""

def agg_by_age(age_bound:list[int]) -> Callable[[pd.DataFrame], pd.DataFrame]:
    lower_bounds = [0] + [age+1 for age in age_bound]
    age_bound = age_bound + [100]    ## 200 years old as upper: 76~200

    bound_tuples = [(l, u + 1) for l,u in zip(lower_bounds, age_bound)]
    def p(df:pd.DataFrame) -> pd.DataFrame:
        df = df.copy()
        for age_tuple in bound_tuples:
            column_names = [f"{age}歲" + ("以上" if age == 100 else "") for age in range(*age_tuple)]
            df[f'age_{age_tuple[0]}_{age_tuple[1]-1}'] = \
                df.loc[:, column_names].sum(axis=1)
        return df
    return p


def drop_chinese_age(df:pd.DataFrame) -> pd.DataFrame:
    return df.loc[:, ~df.columns.str.contains("歲")]

In [111]:
agg_by_age([2,4])(tpe_demographics)

Unnamed: 0,行政區,里別,性別,總計,合計_0~4歲,0歲,1歲,2歲,3歲,4歲,...,合計_95~99歲,95歲,96歲,97歲,98歲,99歲,100歲以上,age_0_2,age_3_4,age_5_100
1,總計,總計,計,2480681.0,87542.0,13411.0,16694.0,17530.0,19392.0,20515.0,...,5574.0,1882.0,1453.0,1009.0,730.0,500.0,1078.0,47635.0,39907.0,2393139.0
2,總計,總計,男,1177022.0,45147.0,6867.0,8572.0,9131.0,9937.0,10640.0,...,2485.0,843.0,643.0,479.0,303.0,217.0,538.0,24570.0,20577.0,1131875.0
3,總計,總計,女,1303659.0,42395.0,6544.0,8122.0,8399.0,9455.0,9875.0,...,3089.0,1039.0,810.0,530.0,427.0,283.0,540.0,23065.0,19330.0,1261264.0
4,松山區,松山區,計,189939.0,6562.0,955.0,1250.0,1298.0,1459.0,1600.0,...,564.0,211.0,133.0,90.0,74.0,56.0,125.0,3503.0,3059.0,183377.0
5,松山區,松山區,男,88635.0,3432.0,499.0,637.0,704.0,759.0,833.0,...,261.0,92.0,68.0,44.0,31.0,26.0,52.0,1840.0,1592.0,85203.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1403,北投區,大屯里,男,665.0,31.0,6.0,4.0,8.0,5.0,8.0,...,3.0,1.0,1.0,0.0,1.0,0.0,0.0,18.0,13.0,634.0
1404,北投區,大屯里,女,567.0,17.0,5.0,3.0,7.0,1.0,1.0,...,5.0,0.0,3.0,2.0,0.0,0.0,0.0,15.0,2.0,550.0
1405,北投區,湖田里,計,871.0,25.0,5.0,3.0,4.0,6.0,7.0,...,3.0,1.0,1.0,0.0,0.0,1.0,0.0,12.0,13.0,846.0
1406,北投區,湖田里,男,479.0,11.0,1.0,2.0,2.0,3.0,3.0,...,3.0,1.0,1.0,0.0,0.0,1.0,0.0,5.0,6.0,468.0


把全部 pipe 在一起

In [113]:
tpe_demographics_tidy = (tpe_demographics
 .pipe(get_aggregate_li)
 .pipe(remove_age_interval_level)
 .pipe(to_proportion)
 .pipe(agg_by_age(age_boundary))
 .pipe(drop_chinese_age)
 )
tpe_demographics_tidy

Unnamed: 0,行政區,里別,總計,age_0_15,age_16_18,age_19_24,age_25_40,age_41_65,age_66_75,age_76_100
7,松山區,莊敬里,5045.0,11.912785,2.418236,5.748266,20.118930,38.691774,13.280476,7.829534
10,松山區,東榮里,7799.0,17.348378,2.295166,5.064752,17.027824,36.145660,13.437620,8.680600
13,松山區,三民里,6380.0,12.351097,2.147335,5.564263,17.789969,38.605016,13.620690,9.921630
16,松山區,新益里,4326.0,11.534905,2.265372,4.808137,20.850670,37.586685,14.193250,8.760980
19,松山區,富錦里,4942.0,14.225010,2.185350,4.917038,19.607446,36.806961,14.467827,7.790368
...,...,...,...,...,...,...,...,...,...,...
1393,北投區,關渡里,10982.0,11.946822,2.522309,5.554544,22.272810,37.606993,12.793662,7.302859
1396,北投區,泉源里,2267.0,7.190119,2.514336,6.793119,21.614468,40.979268,13.056903,7.851787
1399,北投區,湖山里,1492.0,6.300268,2.077748,4.423592,19.235925,41.085791,14.946381,11.930295
1402,北投區,大屯里,1232.0,11.444805,1.948052,6.250000,20.535714,37.662338,12.662338,9.496753


In [114]:
tpe_demographics_tidy.to_parquet("../DATA/Demographic/demographic.pq", index=False)

In [115]:
pd.read_parquet("../DATA/Demographic/demographic.pq")

Unnamed: 0,行政區,里別,總計,age_0_15,age_16_18,age_19_24,age_25_40,age_41_65,age_66_75,age_76_100
0,松山區,莊敬里,5045.0,11.912785,2.418236,5.748266,20.118930,38.691774,13.280476,7.829534
1,松山區,東榮里,7799.0,17.348378,2.295166,5.064752,17.027824,36.145660,13.437620,8.680600
2,松山區,三民里,6380.0,12.351097,2.147335,5.564263,17.789969,38.605016,13.620690,9.921630
3,松山區,新益里,4326.0,11.534905,2.265372,4.808137,20.850670,37.586685,14.193250,8.760980
4,松山區,富錦里,4942.0,14.225010,2.185350,4.917038,19.607446,36.806961,14.467827,7.790368
...,...,...,...,...,...,...,...,...,...,...
451,北投區,關渡里,10982.0,11.946822,2.522309,5.554544,22.272810,37.606993,12.793662,7.302859
452,北投區,泉源里,2267.0,7.190119,2.514336,6.793119,21.614468,40.979268,13.056903,7.851787
453,北投區,湖山里,1492.0,6.300268,2.077748,4.423592,19.235925,41.085791,14.946381,11.930295
454,北投區,大屯里,1232.0,11.444805,1.948052,6.250000,20.535714,37.662338,12.662338,9.496753


## 財政資料

In [1]:
import pandas as pd

income_by_li = pd.read_csv("../DATA/Demographic/109 所得.csv")
income_by_li

Unnamed: 0,鄉鎮市區,村里,納稅單位(戶),綜合所得總額,平均數,中位數,第一分位數,第三分位數,標準差,變異係數
0,臺北市松山區,中崙里,1438,1892464,1316,727,298,1560,3655.30,277.75
1,臺北市松山區,自強里,3063,3779338,1234,602,250,1421,2806.64,227.47
2,臺北市松山區,鵬程里,1844,1948080,1056,532,246,1246,1640.97,155.33
3,臺北市松山區,東榮里,2864,4331587,1512,723,287,1758,3365.64,222.53
4,臺北市松山區,介壽里,1545,1976673,1279,694,272,1526,1884.48,147.29
...,...,...,...,...,...,...,...,...,...,...
477,臺北市信義區,三張里,3104,3377225,1088,542,251,1178,2872.58,264.02
478,臺北市信義區,其他,205,410987,2005,585,262,2145,3283.38,163.77
479,臺北市信義區,合計,73112,75191086,1028,544,243,1166,2444.32,237.67
480,臺北市其他,其他,149,177043,1188,431,243,1306,2181.57,183.60


In [3]:
"""
Cell generated by Data Wrangler.
"""
def keep_useful_income_data(income_by_li):
    # Filter rows based on column: '村里'
    income_by_li = income_by_li[income_by_li['村里'] != "其他"]
    # Filter rows based on column: '村里'
    income_by_li = income_by_li[income_by_li['村里'] != "合計"]
    # Drop column: '鄉鎮市區'
    income_by_li = income_by_li.drop(columns=['鄉鎮市區', "標準差", "變異係數"])
    return income_by_li

income_by_li_clean = keep_useful_income_data(income_by_li.copy())
income_by_li_clean.head()

Unnamed: 0,村里,納稅單位(戶),綜合所得總額,平均數,中位數,第一分位數,第三分位數
0,中崙里,1438,1892464,1316,727,298,1560
1,自強里,3063,3779338,1234,602,250,1421
2,鵬程里,1844,1948080,1056,532,246,1246
3,東榮里,2864,4331587,1512,723,287,1758
4,介壽里,1545,1976673,1279,694,272,1526


In [6]:
df_demographic = pd.read_parquet("../DATA/Demographic/demographic.pq")
df_demographic = df_demographic.merge(income_by_li_clean, left_on="里別", right_on="村里", )

df_demographic

Unnamed: 0,行政區,里別,總計,age_0_15,age_16_18,age_19_24,age_25_40,age_41_65,age_66_75,age_76_100,村里,納稅單位(戶),綜合所得總額,平均數,中位數,第一分位數,第三分位數
0,松山區,莊敬里,5045.0,11.912785,2.418236,5.748266,20.118930,38.691774,13.280476,7.829534,莊敬里,1613,1360919,844,543,242,1054
1,松山區,東榮里,7799.0,17.348378,2.295166,5.064752,17.027824,36.145660,13.437620,8.680600,東榮里,2864,4331587,1512,723,287,1758
2,松山區,三民里,6380.0,12.351097,2.147335,5.564263,17.789969,38.605016,13.620690,9.921630,三民里,2438,2717151,1115,625,260,1320
3,松山區,新益里,4326.0,11.534905,2.265372,4.808137,20.850670,37.586685,14.193250,8.760980,新益里,1634,1501434,919,538,231,1181
4,松山區,富錦里,4942.0,14.225010,2.185350,4.917038,19.607446,36.806961,14.467827,7.790368,富錦里,1810,2023802,1118,580,262,1346
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
451,北投區,關渡里,10982.0,11.946822,2.522309,5.554544,22.272810,37.606993,12.793662,7.302859,關渡里,3638,3125376,859,482,218,1007
452,北投區,泉源里,2267.0,7.190119,2.514336,6.793119,21.614468,40.979268,13.056903,7.851787,泉源里,672,481998,717,395,190,724
453,北投區,湖山里,1492.0,6.300268,2.077748,4.423592,19.235925,41.085791,14.946381,11.930295,湖山里,480,464206,967,452,216,980
454,北投區,大屯里,1232.0,11.444805,1.948052,6.250000,20.535714,37.662338,12.662338,9.496753,大屯里,359,318595,887,457,237,997


In [22]:
import geopandas as gpd
from shapely.geometry import Polygon, LineString, Point

In [None]:


li_geodata:gpd.GeoDataFrame = gpd.read_file("../DATA/Demographic/OFiles_80aff061-baea-4ab2-a7b6-d02f9c9f8060/VILLAGE_NLSC_121_1120317.shp", encoding = "utf-8")

台灣用 EPSG:3826
TWD97 / TM2 zone 121

In [23]:
tpe_li = li_geodata[li_geodata["COUNTYNAME"]== "臺北市"][["TOWNNAME", "geometry"]]
tpe_li

Unnamed: 0,TOWNNAME,geometry
121,南港區,"POLYGON ((309803.669 2772329.942, 309896.039 2..."
122,南港區,"POLYGON ((310038.796 2772556.157, 310098.384 2..."
123,內湖區,"POLYGON ((311631.369 2773130.775, 311633.200 2..."
124,內湖區,"POLYGON ((310899.967 2773360.726, 310900.238 2..."
4391,中山區,"POLYGON ((303958.754 2771642.927, 304161.596 2..."
...,...,...
7401,北投區,"POLYGON ((302332.607 2784355.092, 302365.510 2..."
7402,士林區,"POLYGON ((308170.928 2784399.922, 308194.655 2..."
7403,北投區,"POLYGON ((300710.056 2785069.337, 300738.995 2..."
7404,士林區,"POLYGON ((308542.775 2787541.572, 308555.720 2..."


In [28]:
tpe_li.to_crs(4326).distance(Point(121.59274,25.05797))
## https://stackoverflow.com/questions/72073417/userwarning-geometry-is-in-a-geographic-crs-results-from-buffer-are-likely-i


  tpe_li.to_crs(4326).distance(Point(121.59274,25.05797))


121     0.000000
122     0.000001
123     0.004044
124     0.001242
4391    0.056268
          ...   
7401    0.107304
7402    0.064265
7403    0.123898
7404    0.070823
7405    0.105612
Length: 456, dtype: float64

# 捷運站

In [5]:
import pandas as pd
import numpy as np
import requests as rq
from io import StringIO

將 2022 全部合在一起，再加總

In [7]:
all_mrt_od_df = pd.read_csv("../DATA/MRT/臺北捷運每日分時各站OD流量統計資料.csv")
od_mrt_2022 = all_mrt_od_df[all_mrt_od_df['年月'].astype(int) // 100 == 2022]

mrt_dfs = []
for _, row in od_mrt_2022.iterrows():
    month = row['年月']
    url = row['URL']

    res = rq.get(url)
    with open(f"../DATA/MRT/raw/{month}", 'wb') as f:
        f.write(res.content)
        




In [20]:
all_mrt_od_df = pd.concat([pd.read_csv(f"../DATA/MRT/raw/2022{month:02d}", encoding = "utf-8") for month in range(1,13)])


In [30]:
agg_enter_station_mrt = all_mrt_od_df.groupby(by=["進站", "時段"])[["人次"]].sum().sort_values("人次", ascending=False).apply(np.log10)
agg_exit_station_mrt = all_mrt_od_df.groupby(by=["出站", "時段"])[["人次"]].sum().sort_values("人次", ascending=False).apply(np.log10)

agg_enter_station_mrt.to_parquet("../DATA/MRT/agg_enter_station_station_mrt.pq")
agg_exit_station_mrt.to_parquet("../DATA/MRT/agg_exit_station_station_mrt.pq")

In [26]:
all_mrt_od_df['時段'].unique()

array([ 0,  1,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
       21, 22, 23,  5,  2,  3,  4], dtype=int64)

In [24]:
od_data[od_data["進站"].str.contains("板橋")]

Unnamed: 0,日期,時段,進站,出站,人次
7973,2021-01-01,0,BL板橋,松山機場,0
7974,2021-01-01,0,BL板橋,中山國中,3
7975,2021-01-01,0,BL板橋,南京復興,1
7976,2021-01-01,0,BL板橋,忠孝復興,2
7977,2021-01-01,0,BL板橋,大安,0
...,...,...,...,...,...
9204288,2021-01-31,23,Y板橋,板新,5
9204289,2021-01-31,23,Y板橋,Y板橋,1
9204290,2021-01-31,23,Y板橋,新埔民生,7
9204291,2021-01-31,23,Y板橋,幸福,26


In [6]:
agg_enter_station_mrt = pd.read_parquet("../DATA/MRT/agg_enter_station_station_mrt.pq")
agg_exit_station_mrt = pd.read_parquet("../DATA/MRT/agg_exit_station_station_mrt.pq")

In [7]:
agg_enter_station_mrt.query("時段 == 3").head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,人次
進站,時段,Unnamed: 2_level_1
市政府,3,3.516932
台北101/世貿,3,3.392521
台北車站,3,3.342817
西門,3,3.178977
忠孝敦化,3,3.174351
象山,3,3.044148
忠孝復興,3,3.030195
信義安和,3,3.0
國父紀念館,3,2.991669
中山,3,2.720159


捷運站出口座標

In [8]:
mrt_coord = pd.read_csv("../DATA/MRT/臺北捷運車站出入口座標.csv", encoding="big5")
mrt_coord

Unnamed: 0,項次,出入口名稱,出入口編號,經度,緯度,是否為無障礙用
0,1,頂埔站出口1,1,121.418336,24.959327,是
1,2,頂埔站出口2,2,121.418357,24.958947,否
2,3,頂埔站出口3,3,121.419280,24.959503,是
3,4,頂埔站出口4,4,121.419969,24.960255,是
4,5,松山機場站出口1,1,121.552043,25.063631,是
...,...,...,...,...,...,...
382,383,板橋站出口5,5,121.464312,25.015502,是
383,384,新埔民生站出口,0,121.466839,25.026125,是
384,385,幸福站出口1,1,121.460146,25.050126,是
385,386,幸福站出口2,2,121.460236,25.049759,否


In [9]:
def remove_mrt_exit_string(df:pd.DataFrame) -> pd.DataFrame:

    df['station'] = df['出入口名稱'].apply(lambda x: x.split('站出口')[0].strip())
    # 处理例外情况：台北車站出口1 对应台北車站
    df.loc[df['出入口名稱'].str.contains('台北車站'), 'station'] = '台北車站'
    return df

def rename_mrt_columns(df:pd.DataFrame) -> pd.DataFrame:
    extract_cols =["station", "出入口編號", "經度","緯度"]
    rename_cols = ["station", "No", "lng", "lat"]

    df_p = df[extract_cols]
    return df_p.rename(columns = {a:b for a, b in zip(extract_cols, rename_cols)})


mrt_coord_clean = rename_mrt_columns(
    remove_mrt_exit_string(mrt_coord))

In [11]:
mrt_coord_clean.to_parquet("../DATA/MRT/mrt_coord.pq")