# 參考文獻

*   [氣象資料開放平台](https://opendata.cwb.gov.tw/index)
*   [農情調查](https://data.coa.gov.tw/Query/ServiceDetail.aspx?id=038)
*   [農情預測](https://data.coa.gov.tw/Query/ServiceTransDetail.aspx?id=4P84xEv6hd22)
*   CWB-CA3A0F5C-455E-436A-9FB7-530DA1A044E7

# Note

統計資料中，咖啡的期作每年都只有一期（全年）；5-6年生；每年收成月份不定。
產量資料最早從民國97年開始；氣候（天氣）資料應該要從民國92年開始蒐集。

In [11]:
from lxml import etree
from scipy.spatial import distance
import matplotlib.pyplot as plt
import requests
import numpy as np
import pandas as pd
import tensorflow as tf
import urllib,dask,lxml,re,os,time,io
from IPython.display import display, HTML
from pathos.helpers import cpu_count
from dask.diagnostics import ProgressBar

# 蒐集資料及預處理資料

## 產量資料

In [12]:
def get_yield_data_df(crop_code=337, start_year=97, end_year=108, timeout=60):
  """Download township-level crop yield records and return them as one table.

  One request per year is issued (in parallel through dask) to the COA
  open-data ``TownCropData.aspx`` endpoint, and the JSON record lists of all
  years are concatenated.

  Args:
    crop_code: value sent as the ``cropCode`` query parameter. The default
      337 is the code the notebook uses (its output rows are 咖啡, coffee).
    start_year: first year to request, in the calendar the API uses
      (the year column is offset by 1911 below, i.e. ROC years).
    end_year: last year to request, inclusive.
    timeout: seconds to wait for each HTTP response.

  Returns:
    pandas.DataFrame with the columns returned by the API plus
    ``yield_year`` (``年度`` + 1911, as int) and ``yield_year_str``
    (the same value as text).

  Raises:
    requests.HTTPError: if any of the yearly requests returns an error status.
  """
  url_template = "https://data.coa.gov.tw/Service/OpenData/FromM/TownCropData.aspx?cropCode={}&year={}"
  pending_requests = [
      dask.delayed(requests.get)(url_template.format(crop_code, year), timeout=timeout)
      for year in range(start_year, end_year+1)
  ]
  responses = dask.compute(*pending_requests)
  records = list()
  for response in responses:
    # Fail loudly on an HTTP error instead of trying to decode an error page.
    response.raise_for_status()
    response.encoding = 'UTF-8'
    records.extend(response.json())
  yield_data_df = pd.DataFrame(records)
  yield_data_df['年度'] = yield_data_df['年度'].astype(int)
  # Convert the API's year numbering to Gregorian years (e.g. 97 -> 2008).
  yield_data_df['yield_year'] = yield_data_df['年度']+1911
  yield_data_df['yield_year_str'] = yield_data_df['yield_year'].astype(str)
  return yield_data_df

In [23]:
# Fetch the yield table and give the two location columns English names.
yield_data_df = get_yield_data_df()
yield_data_df = yield_data_df.rename(columns={"縣市": "city", "鄉鎮": "district"})
# Replace 台 with 臺 in both location columns (the other tables shown in this
# notebook spell place names with 臺). A similar replacement of the look-alike
# character 巿 by 市 was tried earlier and is intentionally not applied.
for location_column in ('city', 'district'):
  yield_data_df[location_column] = yield_data_df[location_column].str.replace(pat="台", repl="臺")
display(yield_data_df)

Unnamed: 0,年度,期作,city,district,作物代碼,作物,收穫面積(公頃),種植面積(公頃),每公頃收穫量(公斤),收量(公斤),yield_year,yield_year_str
0,97,全年,臺北市,文山區,337,咖啡,0.10,0.10,5000.0,500.0,2008,2008
1,97,全年,新北市,石碇區,337,咖啡,1.00,1.00,50.0,50.0,2008,2008
2,97,全年,新北市,石門區,337,咖啡,0.00,1.20,0.0,0.0,2008,2008
3,97,全年,新北市,三峽區,337,咖啡,0.00,0.80,0.0,0.0,2008,2008
4,97,全年,桃園市,龍潭區,337,咖啡,0.20,1.20,800.0,160.0,2008,2008
...,...,...,...,...,...,...,...,...,...,...,...,...
1537,108,全年,臺東縣,達仁鄉,337,咖啡,31.28,31.28,1100.0,34408.0,2019,2019
1538,108,全年,臺東縣,卑南鄉,337,咖啡,17.92,17.92,1100.0,19712.0,2019,2019
1539,108,全年,臺東縣,東河鄉,337,咖啡,11.86,12.31,790.0,9369.0,2019,2019
1540,108,全年,臺東縣,池上鄉,337,咖啡,9.18,9.18,900.0,8262.0,2019,2019


### 找出產量資料中資料完整（所有年度都有）的鄉鎮市區

In [14]:
# Per (city, district): mean harvest and number of yearly records, with the
# districts having the most records (then the largest mean) listed first.
district_yield_stats = (
    yield_data_df
    .groupby(['city','district'])['收量(公斤)']
    .agg(['mean', 'count'])
    .sort_values(by=['count','mean'], ascending=False)
    .reset_index(drop=False)
)
# A district counts as "complete" when it has as many records as the
# best-covered district.
most_records = district_yield_stats['count'].max()
completed_cases_stations_df = district_yield_stats[district_yield_stats['count']==most_records]
display(yield_data_df.head(n=5))
display(completed_cases_stations_df)
# Only the identifying columns are kept for later use.
completed_cases_stations_df = completed_cases_stations_df[['city','district']]

Unnamed: 0,年度,期作,city,district,作物代碼,作物,收穫面積(公頃),種植面積(公頃),每公頃收穫量(公斤),收量(公斤),yield_year,yield_year_str
0,97,全年,臺北市,文山區,337,咖啡,0.1,0.1,5000.0,500.0,2008,2008
1,97,全年,新北市,石碇區,337,咖啡,1.0,1.0,50.0,50.0,2008,2008
2,97,全年,新北市,石門區,337,咖啡,0.0,1.2,0.0,0.0,2008,2008
3,97,全年,新北市,三峽區,337,咖啡,0.0,0.8,0.0,0.0,2008,2008
4,97,全年,桃園市,龍潭區,337,咖啡,0.2,1.2,800.0,160.0,2008,2008


Unnamed: 0,city,district,mean,count
0,屏東縣,三地門鄉,76045.166667,12
1,雲林縣,古坑鄉,64502.916667,12
2,南投縣,國姓鄉,62808.583333,12
3,臺南市,東山區,60991.833333,12
4,嘉義縣,阿里山鄉,52203.666667,12
...,...,...,...,...
66,屏東縣,屏東市,128.666667,12
67,臺北市,文山區,121.166667,12
68,宜蘭縣,員山鄉,56.583333,12
69,南投縣,集集鎮,45.750000,12


### 各鄉鎮市區地理中心資訊

In [19]:
# Download the table of geographic centre points, one row per district.
district_center_response = requests.get("http://quality.data.gov.tw/dq_download_csv.php?nid=25489&md5_url=a85eb04185242dd82bf3db71756eb6da")
# Stop here on an HTTP error rather than parsing an error page as CSV.
district_center_response.raise_for_status()
district_center_response.encoding = 'UTF-8'
# The text is already decoded, so read_csv needs no encoding argument.
district_center_df = pd.read_csv(io.StringIO(district_center_response.text))
# 行政區名 holds city and district joined together (e.g. 臺北市中正區).
# City: the first two characters followed by 縣 or 市.
district_center_df['city'] = district_center_df['行政區名'].str.extract(r"(.{2}[縣市])")
# District: the CJK run after the city that ends in 鄉/鎮/市/區.
district_center_df['district'] = district_center_df['行政區名'].str.extract(r".{2}[縣市]([\u4E00-\u9FFF]{1,5}?[鄉鎮市區]{1,2})")
# NOTE(review): earlier experiments with the look-alike character 巿 (vs 市)
# are disabled; the lookup below checks that 彰化市 is spelled with 市.

display(district_center_df)
district_center_df[district_center_df['district']=='彰化市']

Unnamed: 0,行政區名,_x0033_碼郵遞區號,中心點經度,中心點緯度,TGOS_URL,city,district
0,臺北市中正區,100,121.519884,25.032405,http://tgos.nat.gov.tw/tgos/Web/MetaData/TGOS_...,臺北市,中正區
1,臺北市大同區,103,121.513042,25.063424,http://tgos.nat.gov.tw/tgos/Web/MetaData/TGOS_...,臺北市,大同區
2,臺北市中山區,104,121.538160,25.069699,http://tgos.nat.gov.tw/tgos/Web/MetaData/TGOS_...,臺北市,中山區
3,臺北市松山區,105,121.557588,25.059991,http://tgos.nat.gov.tw/tgos/Web/MetaData/TGOS_...,臺北市,松山區
4,臺北市大安區,106,121.543445,25.026770,http://tgos.nat.gov.tw/tgos/Web/MetaData/TGOS_...,臺北市,大安區
...,...,...,...,...,...,...,...
366,花蓮縣瑞穗鄉,978,121.407347,23.515612,http://tgos.nat.gov.tw/tgos/Web/MetaData/TGOS_...,花蓮縣,瑞穗鄉
367,花蓮縣萬榮鄉,979,121.318953,23.727726,http://tgos.nat.gov.tw/tgos/Web/MetaData/TGOS_...,花蓮縣,萬榮鄉
368,花蓮縣玉里鎮,981,121.360448,23.371436,http://tgos.nat.gov.tw/tgos/Web/MetaData/TGOS_...,花蓮縣,玉里鎮
369,花蓮縣卓溪鄉,982,121.180422,23.390629,http://tgos.nat.gov.tw/tgos/Web/MetaData/TGOS_...,花蓮縣,卓溪鄉


Unnamed: 0,行政區名,_x0033_碼郵遞區號,中心點經度,中心點緯度,TGOS_URL,city,district
141,彰化縣彰化市,500,120.569421,24.075329,http://tgos.nat.gov.tw/tgos/Web/MetaData/TGOS_...,彰化縣,彰化市


### 各氣象站基本資料

In [16]:
def _station_table_to_df(station_etree, div_id, status):
  """Parse the table inside ``<div id=div_id>`` into a DataFrame tagged with *status*."""
  table_element = station_etree.xpath("//div[@id='{}']/table".format(div_id))[0]
  # Hand read_html a file-like object; passing literal HTML is deprecated in
  # recent pandas releases.
  table_df = pd.read_html(io.BytesIO(etree.tostring(table_element)))[0]
  table_df['status'] = status
  return table_df

def get_weather_station_attrs_df():
  """Download the CWB station list and return existing and revoked stations.

  The page ``state.htm`` contains one table under ``div#existing_station``
  and one under ``div#revoked_station``; both are parsed and stacked.

  Returns:
    pandas.DataFrame with the page's columns (站號/站名/城市 renamed to
    ``stationid``/``stationname``/``city``) plus:
      * ``status``: '現存' for existing stations, '已撤站' for revoked ones
      * ``district``: first run of CJK characters in 地址 ending in 鄉/鎮/市/區
      * ``begin_year``: the digits before the first '/' of 資料起始日期, as int

  Raises:
    requests.HTTPError: if the page request returns an error status.
    IndexError: if one of the two expected tables is missing from the page.
  """
  station_page = requests.get("https://e-service.cwb.gov.tw/wdps/obs/state.htm")
  station_page.raise_for_status()
  station_page.encoding = 'UTF-8'
  page_text = station_page.text
  # Drop the leading <meta ...> ... </style> section before parsing. re.match
  # only matches at the very start of the text; when the page does not start
  # that way nothing is removed (previously this raised TypeError).
  head_match = re.match(pattern=r"(<meta{1}[\w\d\s\W\D\S]+</style>{1})", string=page_text)
  if head_match is not None:
    page_text = page_text.replace(head_match[0], "")
  weather_station_etree = etree.HTML(page_text)
  weather_station_existing_df = _station_table_to_df(weather_station_etree, 'existing_station', "現存")
  weather_station_revoked_df = _station_table_to_df(weather_station_etree, 'revoked_station', '已撤站')
  weather_station_attr_df = (
      pd.concat([weather_station_existing_df, weather_station_revoked_df])
      .reset_index(drop=True)
      .rename(columns={'站號':'stationid','站名':'stationname','城市':'city'})
  )
  weather_station_attr_df['district'] = weather_station_attr_df['地址'].str.extract(r"([\u4E00-\u9FFF]{1,5}?[鄉鎮市區]{1,2}){1}?")
  # Raw string: "\d" and "\/" are regex escapes, not Python string escapes.
  weather_station_attr_df['begin_year'] = weather_station_attr_df['資料起始日期'].str.extract(r"(\d+)\/{1}").astype(int)
  return weather_station_attr_df

# Keep only stations that still exist and whose records start in 2003 or
# earlier, listed by city and district in descending order.
all_stations_df = get_weather_station_attrs_df()
still_operating = all_stations_df['status']=='現存'
started_by_2003 = all_stations_df['begin_year']<=2003
weather_station_attr_df_preferred = all_stations_df[still_operating & started_by_2003].sort_values(by=['city','district'],ascending=False)
display(weather_station_attr_df_preferred)
# An inner merge with completed_cases_stations_df was tried here and is left disabled.

Unnamed: 0,stationid,stationname,海拔高度(m),經度,緯度,city,地址,資料起始日期,撤站日期,備註,原站號,新站號,status,district,begin_year
591,C1V160,達卡努瓦,1040.0,120.705319,23.279811,高雄市,那瑪夏區達卡努瓦里,1992/01/21,,原名為民生(達卡努瓦)，於2015/10/13更名為達卡努瓦。本站於2015/10/13起進...,,,現存,那瑪夏區,1992
383,C0V310,美濃,46.0,120.519153,22.898742,高雄市,美濃區福安里中山路二段204巷65號(美濃區福安國小後方),1992/01/22,,本站於 2013/6/15 進行儀器汰換，並於2013/8/1完成驗收。,,,現存,美濃區,1992
381,C0V250,甲仙,298.0,120.591758,23.080106,高雄市,甲仙區(甲仙國民中學操場後方小山丘上),1992/01/21,,本站於2013/3/13進行儀器汰換，並於2013/4/18完成驗收。,,,現存,甲仙區,1992
386,C0V370,古亭坑,74.0,120.401947,22.893178,高雄市,田寮區古亭里,1992/05/01,,本站因儀器汰換，自2013/8/31起暫停資料提供，並於2013/12/1完成驗收。,,,現存,田寮區,1992
599,C1V390,尖山,60.0,120.367789,22.813153,高雄市,燕巢區尖山里尖山巷11之1號(燕巢國小尖山分校),1992/01/22,,本站於2013/9/3進行儀器汰換，並於2013/12/1完成驗收。,,,現存,燕巢區,1992
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
509,C1H900,清流,934.0,120.962997,24.080775,南投縣,仁愛鄉互助村,1991/12/12,,本站於2015/7/24起進行儀器汰換，並於2015/12/31完成驗收。,,,現存,仁愛鄉,1991
514,C1I020,萬大,1120.0,121.132261,23.979817,南投縣,仁愛鄉親愛村萬大水庫宿舍旁,1992/05/12,,本站於2015/8/11起進行儀器汰換，並於2015/12/31完成驗收。,,,現存,仁愛鄉,1992
515,C1I030,武界,948.0,121.052483,23.913225,南投縣,仁愛鄉法治村,1992/05/03,,本站於2015/8/14起進行儀器汰換，並於2015/12/31完成驗收。,,,現存,仁愛鄉,1992
533,C1I320,奧萬大,1275.0,121.178556,23.952956,南投縣,仁愛鄉奧萬大森林遊樂區內(員工宿舍),2002/05/15,,短距離移位。本站於2015/8/19起進行儀器汰換，並於2015/12/31完成驗收。,,,現存,仁愛鄉,2002


## 從產量資料找出相對應的氣象測站
要將產量資料和測站觀測資料配對串連

In [7]:
# Weather data sources consulted for this step:
#   one year of observations, CWB surface stations: https://opendata.cwb.gov.tw/dataset/climate/C-B0024-002
#   monthly weather, past 9 years, CWB surface stations: https://opendata.cwb.gov.tw/dataset/climate/C-B0026-002
#   daily rainfall, past 9 years, CWB surface stations: https://opendata.cwb.gov.tw/dataset/climate/C-B0025-002
#   API explorer: https://opendata.cwb.gov.tw/dist/opendata-swagger.html?urls.primaryName=openAPI#/
#   history query site: https://e-service.cwb.gov.tw/HistoryDataQuery/
#   view-source:https://e-service.cwb.gov.tw/HistoryDataQuery/QueryDataController.do?command=viewMain

def _expand_months_years(station_df, months, years):
  """Repeat every row of *station_df* once per month, then once per year (as text)."""
  expanded = pd.concat([station_df.assign(month=m) for m in months]).reset_index(drop=True)
  expanded = pd.concat([expanded.assign(year=str(y)) for y in years]).reset_index(drop=True)
  return expanded

# Results of the matching, filled in by the loop below.
#
# stationinfos: one dict per station-month to download for yield rows that
#   have a station in the same district, e.g.
#     {'stationid': 'C0A920', 'stationname': '富貴角', 'city': '新北市',
#      'district': '石門區', 'month': '07', 'year': '2008', 'yield_year': 2008}
#   (plus 'matchingtype': 'samedistrict').
# succeed_match_stations: row labels of yield rows with a same-district station.
# succeed_match_stations_df: declared for later use; not filled in this cell.
# failed_match_stations: row labels of yield rows without a same-district station.
# stationsinfos_mismatched: like stationinfos, for rows matched to the nearest
#   station instead.
# mismatched_stations_df: one single-row frame per nearest-station match,
#   holding the chosen station next to the yield row it serves.
# duplicated_match_stations: row labels of yield rows whose district has more
#   than one station.
stationinfos = list()
succeed_match_stations = list()
succeed_match_stations_df = list()
failed_match_stations = list()
stationsinfos_mismatched = list()
mismatched_stations_df = list()
duplicated_match_stations = list()
paddedmonths = [str(m).rjust(2, '0') for m in range(1,13)]

# Station coordinates do not change between rows: build them once, on a copy,
# so that weather_station_attr_df_preferred itself is never modified.
station_coords_df = weather_station_attr_df_preferred.copy()
station_coords_df['combined_axis'] = station_coords_df.loc[:,['經度','緯度']].values.tolist()

# Iterate over the actual row labels, since rows are selected with .loc.
for yield_data_df_n in yield_data_df.index:
  temp_basis_df = yield_data_df.loc[[yield_data_df_n],['city','district','yield_year']].reset_index(drop=True)
  print("yield_data_df_n={} and temp_basis_df is:".format(yield_data_df_n))
  basis_record = temp_basis_df.to_dict('records')[0]
  # Weather is collected for the yield year and the five years before it.
  targetfetchyears = list(range(basis_record['yield_year']-5, basis_record['yield_year']+1))
  temp_basis_df_2 = temp_basis_df.drop(columns=['yield_year']).reset_index(drop=True)
  # Stations located in the same city and district as the yield row.
  needed_station_df = pd.merge(left=weather_station_attr_df_preferred,right=temp_basis_df_2,how='inner')
  if len(needed_station_df.index)>0:
    # At least one station in the district: schedule its monthly pages.
    succeed_match_stations.append(yield_data_df_n)
    print("needed_station_df found")
    if len(needed_station_df.index)>1: duplicated_match_stations.append(yield_data_df_n)
    needed_station_df_simplified = needed_station_df.loc[:,['stationid','stationname','city','district']]
    needed_station_df_simplified = _expand_months_years(needed_station_df_simplified, paddedmonths, targetfetchyears)
    needed_station_df_simplified['matchingtype'] = 'samedistrict'
    stationinfos.extend(
        [{**nd,**basis_record} for nd in needed_station_df_simplified.to_dict('records')]
        )
    continue

  # No station in the district: fall back to the station nearest to the
  # district's centre point.
  print("needed_station_df not found")
  failed_match_stations.append(yield_data_df_n)
  axis = pd.merge(left=district_center_df,right=temp_basis_df_2,how='inner')
  if len(axis.index)==0:
    # The district is missing from district_center_df (e.g. a spelling
    # mismatch). Show the offending row and stop, as the original cell did.
    display(temp_basis_df_2)
    display(axis)
    print("no district centre found for this yield row; stopping")
    break
  axis = axis.loc[:,['中心點經度','中心點緯度']].to_dict('records')[0]
  axis_np = list(axis.values())
  # Plain Euclidean distance on (longitude, latitude) in degrees; used only
  # to rank stations.
  station_distances = station_coords_df['combined_axis'].apply(distance.euclidean, v=axis_np)
  nearest_station_axis = (
      station_coords_df.assign(distance=station_distances)
      .sort_values(by=['distance'], ascending=True)
      .reset_index(drop=True)
      .loc[[0],:]
      .reset_index(drop=True)
  )
  needed_station_df_simplified = temp_basis_df.rename(columns={'city':'yieldcity','district':'yielddistrict'}).reset_index(drop=True)
  needed_station_df_simplified = pd.concat([nearest_station_axis, needed_station_df_simplified], axis=1).reset_index(drop=True)
  mismatched_stations_df.append(needed_station_df_simplified)
  needed_station_df_simplified = _expand_months_years(needed_station_df_simplified, paddedmonths, targetfetchyears)
  needed_station_df_simplified['matchingtype'] = 'nearest'
  # NOTE(review): 'matchingtype' and the yield city/district are not among the
  # exported columns, so these records carry only the station's own location.
  needed_station_df_simplified = needed_station_df_simplified.loc[:,['stationid','stationname','city','district','month','year','yield_year']].to_dict('records')
  stationsinfos_mismatched.extend(needed_station_df_simplified)

=1253 and temp_basis_df is:
needed_station_df not found
yield_data_df_n=1254 and temp_basis_df is:
needed_station_df found
yield_data_df_n=1255 and temp_basis_df is:
needed_station_df not found
yield_data_df_n=1256 and temp_basis_df is:
needed_station_df not found
yield_data_df_n=1257 and temp_basis_df is:
needed_station_df found
yield_data_df_n=1258 and temp_basis_df is:
needed_station_df found
yield_data_df_n=1259 and temp_basis_df is:
needed_station_df not found
yield_data_df_n=1260 and temp_basis_df is:
needed_station_df found
yield_data_df_n=1261 and temp_basis_df is:
needed_station_df found
yield_data_df_n=1262 and temp_basis_df is:
needed_station_df not found
yield_data_df_n=1263 and temp_basis_df is:
needed_station_df not found
yield_data_df_n=1264 and temp_basis_df is:
needed_station_df not found
yield_data_df_n=1265 and temp_basis_df is:
needed_station_df not found
yield_data_df_n=1266 and temp_basis_df is:
needed_station_df not found
yield_data_df_n=1267 and temp_basis_df is

### 各氣象站觀測資料

In [8]:
def get_station_monthly_data_retrieve(stationinfo, timeout=60):
  """Download one station's monthly observation page.

  Args:
    stationinfo: dict with at least 'stationid', 'stationname', 'year' and
      'month' (e.g. {'stationid':'466850','stationname':'五分山雷達站',
      'year':'2021','month':'01'}). It is not modified.
    timeout: seconds to wait for the HTTP response.

  Returns:
    A two-item list ``[page_text, info]`` where ``info`` is a copy of
    *stationinfo* with 'urlencoded_name' added. When anything goes wrong
    ``page_text`` is the literal string "retrieving error".
  """
  station_info_message = list(map(str,stationinfo.values()))
  # Work on a copy so the caller's dict is left untouched.
  stationinfo = dict(stationinfo)
  try:
    # The name is percent-encoded and then every '%' is encoded again as
    # '%25', i.e. the query carries a doubly encoded station name.
    stationinfo['urlencoded_name'] = urllib.parse.quote(stationinfo['stationname'], safe='~@#$&()*!+=:;,.?/\'', encoding='UTF-8').replace("%","%25")
    weather_data_url = "https://e-service.cwb.gov.tw/HistoryDataQuery/MonthDataController.do?command=viewMain&station={}&datepicker={}-{}&stname={}".format(stationinfo['stationid'], stationinfo['year'], stationinfo['month'], stationinfo['urlencoded_name'])
    print("processing "+" ".join(station_info_message+[weather_data_url] ))
    weather_data_raw = requests.get(weather_data_url, timeout=timeout)
    # Treat HTTP error statuses as failed downloads instead of page content.
    weather_data_raw.raise_for_status()
    weather_data_raw.encoding = 'UTF-8'
    return [weather_data_raw.text,stationinfo]
  except Exception as e:
    # Deliberately broad: this runs as one of many parallel tasks, and a
    # single failure must not abort the whole batch.
    print("retrieving error at "+" ".join(station_info_message)+" for "+str(e))
    return ["retrieving error",stationinfo]

def get_station_monthly_data_parse(stationinfo):
  """Turn the result of get_station_monthly_data_retrieve into a table.

  Args:
    stationinfo: two-item sequence ``[page_text, info]`` where ``info`` is
      the station dict whose items are added as constant columns.

  Returns:
    pandas.DataFrame with one row per table row of the page, or a string
    starting with "retrieving error at" / "parsing error at" followed by the
    station details when the download or the parsing failed.
  """
  retrieved_text = stationinfo[0]
  station_attrs = stationinfo[1]
  # Join once up front: the message is needed by both error paths, and a bare
  # map() iterator would be empty on its second use.
  station_info_message = " ".join(map(str, station_attrs.values()))
  if retrieved_text=="retrieving error":
    # Nothing was downloaded, so there is nothing to parse.
    return "retrieving error at "+station_info_message
  try:
    weather_data_etree = etree.HTML(retrieved_text)
    # Column order on the page: observation day, station pressure (hPa),
    # sea-level pressure (hPa), max station pressure (hPa) and its time, min
    # station pressure (hPa) and its time, temperature (℃), max temperature
    # and its time, min temperature and its time, dew point (℃), relative
    # humidity (%), min relative humidity and its time, wind speed (m/s),
    # wind direction (360 degree), max gust (m/s), its direction and time,
    # precipitation (mm), precipitation hours, max 10-minute precipitation
    # and its start time, max 60-minute precipitation and its start time,
    # sunshine hours, sunshine rate (%), global solar radiation (MJ/㎡),
    # visibility (km), class-A pan evaporation (mm), daily max UV index and
    # its time, total cloud amount (0~10).
    # Rows after the first two of table#MyTable are kept.
    table_rows = weather_data_etree.xpath("//table[@id='MyTable']//tr[position()>2]")
    table_html = "<table>"+"".join(etree.tostring(tr, encoding='unicode') for tr in table_rows)+"</table>"
    # read_html gets a file-like object; literal HTML strings are deprecated
    # in recent pandas releases.
    weather_data_df = pd.read_html(io.StringIO(table_html))[0]
    for key,value in station_attrs.items(): weather_data_df[key] = value
  except Exception as e:
    # Best effort: report the failure as a string so one bad page does not
    # abort the whole batch.
    weather_data_df = "parsing error at "+station_info_message+" for "+str(e)
    print(weather_data_df)
  return weather_data_df

## 抓氣象觀測資料
花時間很長

In [9]:
# Smoke test: download and parse the first scheduled station-month, with the
# year forced to 2004.
sample_request = {**stationinfos[0], 'year': '2004'}
sample_page = get_station_monthly_data_retrieve(sample_request)
get_station_monthly_data_parse(sample_page)

processing C0A920 富貴角 新北市 石門區 01 2004 2008 https://e-service.cwb.gov.tw/HistoryDataQuery/MonthDataController.do?command=viewMain&station=C0A920&datepicker=2004-01&stname=%25E5%25AF%258C%25E8%25B2%25B4%25E8%25A7%2592


Unnamed: 0,ObsTime,StnPres,SeaPres,StnPresMax,StnPresMaxTime,StnPresMin,StnPresMinTime,Temperature,T Max,T Max Time,...,UVI Max Time,Cloud Amount,stationid,stationname,city,district,month,year,yield_year,urlencoded_name
0,1,995.4,...,/,...,/,...,15.5,/,...,...,...,...,C0A920,富貴角,新北市,石門區,1,2004,2008,%25E5%25AF%258C%25E8%25B2%25B4%25E8%25A7%2592
1,2,996.4,...,/,...,/,...,15.0,/,...,...,...,...,C0A920,富貴角,新北市,石門區,1,2004,2008,%25E5%25AF%258C%25E8%25B2%25B4%25E8%25A7%2592
2,3,996.7,...,/,...,/,...,15.8,/,...,...,...,...,C0A920,富貴角,新北市,石門區,1,2004,2008,%25E5%25AF%258C%25E8%25B2%25B4%25E8%25A7%2592
3,4,996.3,...,/,...,/,...,18.6,/,...,...,...,...,C0A920,富貴角,新北市,石門區,1,2004,2008,%25E5%25AF%258C%25E8%25B2%25B4%25E8%25A7%2592
4,5,995.4,...,/,...,/,...,20.0,/,...,...,...,...,C0A920,富貴角,新北市,石門區,1,2004,2008,%25E5%25AF%258C%25E8%25B2%25B4%25E8%25A7%2592
5,6,996.3,...,/,...,/,...,17.4,/,...,...,...,...,C0A920,富貴角,新北市,石門區,1,2004,2008,%25E5%25AF%258C%25E8%25B2%25B4%25E8%25A7%2592
6,7,998.9,...,/,...,/,...,16.2,/,...,...,...,...,C0A920,富貴角,新北市,石門區,1,2004,2008,%25E5%25AF%258C%25E8%25B2%25B4%25E8%25A7%2592
7,8,1000.3,...,/,...,/,...,15.8,/,...,...,...,...,C0A920,富貴角,新北市,石門區,1,2004,2008,%25E5%25AF%258C%25E8%25B2%25B4%25E8%25A7%2592
8,9,999.2,...,/,...,/,...,16.9,/,...,...,...,...,C0A920,富貴角,新北市,石門區,1,2004,2008,%25E5%25AF%258C%25E8%25B2%25B4%25E8%25A7%2592
9,10,995.9,...,/,...,/,...,18.1,/,...,...,...,...,C0A920,富貴角,新北市,石門區,1,2004,2008,%25E5%25AF%258C%25E8%25B2%25B4%25E8%25A7%2592


In [10]:
# Time the whole download-and-parse run.
start_time = time.time()
overall_stationinfos = stationinfos+stationsinfos_mismatched
# yield_year is dropped before de-duplicating, so a station-month needed by
# several yield years is requested only once.
unique_stationinfos = (
    pd.DataFrame(overall_stationinfos)
    .drop(columns=['yield_year'])
    .drop_duplicates()
    .reset_index(drop=True)
    .to_dict('records')
)
needscheduler = "threads"
# Only the first 101 station-months are scheduled in this run.
retrieve_tasks = [dask.delayed(get_station_monthly_data_retrieve)(stationinfo) for stationinfo in unique_stationinfos[0:101]]
# Stage 1: download the pages with 30 workers (original note here: 204).
with dask.config.set(num_workers=30, scheduler=needscheduler), ProgressBar():
    station_monthly_data_retrieved = dask.compute(*retrieve_tasks)
# Stage 2: parse the downloaded pages with 600 workers.
with dask.config.set(num_workers=600, scheduler=needscheduler), ProgressBar():
    parse_tasks = [dask.delayed(get_station_monthly_data_parse)(smd) for smd in station_monthly_data_retrieved]
    station_monthly_data = dask.compute(*parse_tasks)
needtime = time.time() - start_time
print("--- %s seconds ---" % (needtime))
# Timings recorded from earlier runs (seconds, download workers, scheduler):
#   110.41256833076477  12 workers processes
#   110.30883741378784  12 workers threads
#   35.71750545501709   60 workers processes
#   26.521867990493774  60 workers threads
# With 300 jobs (seconds, download workers, parse workers):
#   269.1925961971283   60 workers threads, 600 workers threads
#   104.53193426132202  80 workers threads, 600 workers threads
#   104.78928446769714  84 workers threads, 600 workers threads
#   103.09000277519226  96 workers threads, 600 workers threads
#   84.36256146430969   108 workers threads, 600 workers threads
#   83.18865132331848   120 workers threads, 600 workers threads
#   81.80956053733826   132 workers threads, 600 workers threads
#   65.56309628486633   156 workers threads, 600 workers threads
#   65.40118169784546   180 workers threads, 600 workers threads

ery/MonthDataController.do?command=viewMain&station=C0A920&datepicker=2006-09&stname=%25E5%25AF%258C%25E8%25B2%25B4%25E8%25A7%2592
processing C0A920 富貴角 新北市 石門區 08 2007 https://e-service.cwb.gov.tw/HistoryDataQuery/MonthDataController.do?command=viewMain&station=C0A920&datepicker=2007-08&stname=%25E5%25AF%258C%25E8%25B2%25B4%25E8%25A7%2592
processing C0A920 富貴角 新北市 石門區 01 2006 https://e-service.cwb.gov.tw/HistoryDataQuery/MonthDataController.do?command=viewMain&station=C0A920&datepicker=2006-01&stname=%25E5%25AF%258C%25E8%25B2%25B4%25E8%25A7%2592
processing C0A920 富貴角 新北市 石門區 08 2008 https://e-service.cwb.gov.tw/HistoryDataQuery/MonthDataController.do?command=viewMain&station=C0A920&datepicker=2008-08&stname=%25E5%25AF%258C%25E8%25B2%25B4%25E8%25A7%2592
processing C1I250 鯉潭 南投縣 埔里鎮 06 2003 https://e-service.cwb.gov.tw/HistoryDataQuery/MonthDataController.do?command=viewMain&station=C1I250&datepicker=2003-06&stname=%25E9%25AF%2589%25E6%25BD%25AD
processing C1I260 北坑 南投縣 埔里鎮 09 2003 http

In [None]:
# Show the yield rows for which no same-district station was found.
unmatched_view_columns = ['city','district','yield_year']
yield_data_df.loc[failed_match_stations, unmatched_view_columns]

In [None]:
# Failed downloads/parses are returned as error strings; only real tables can
# be concatenated.
station_monthly_frames = [smd for smd in station_monthly_data if isinstance(smd, pd.DataFrame)]
if not station_monthly_frames:
    raise ValueError("no station month was downloaded and parsed successfully")

# Map the first columns of a parsed table, in order, to the page's Chinese
# column titles (used only for display below).
station_monthly_data_col_definition = dict(zip(
    list(station_monthly_frames[0]),
    [
        "觀測時間(day)",
        "測站氣壓(hPa)",
        "海平面氣壓(hPa)",
        "測站最高氣壓(hPa)",
        "測站最高氣壓時間(LST)",
        "測站最低氣壓(hPa)",
        "測站最低氣壓時間(LST)",
        "氣溫(℃)",
        "最高氣溫(℃)",
        "最高氣溫時間(LST)",
        "最低氣溫(℃)",
        "最低氣溫時間(LST)",
        "露點溫度(℃)",
        "相對溼度(%)",
        "最小相對溼度(%)",
        "最小相對溼度時間(LST)",
        "風速(m/s)",
        "風向(360degree)",
        "最大陣風(m/s)",
        "最大陣風風向(360degree)",
        "最大陣風風速時間(LST)",
        "降水量(mm)",
        "降水時數(hour)",
        "最大十分鐘降水量(mm)",
        "最大十分鐘降水量起始時間(LST)",
        "最大六十分鐘降水量(mm)",
        "最大六十分鐘降水量起始時間(LST)",
        "日照時數(hour)",
        "日照率(%)",
        "全天空日射量(MJ/㎡)",
        "能見度(km)",
        "A型蒸發量(mm)",
        "日最高紫外線指數",
        "日最高紫外線指數時間(LST)",
        "總雲量(0~10)",
    ],
))
station_monthly_data_all_directly_matched = pd.concat(station_monthly_frames).reset_index(drop=True)
# Later cells refer to the combined table by this shorter name, which was
# never defined before.
station_monthly_data_all = station_monthly_data_all_directly_matched
display(station_monthly_data_all_directly_matched.rename(columns=station_monthly_data_col_definition).head(n=5))
station_monthly_data_all_directly_matched.to_csv('station_monthly_data_all_directly_matched.csv')

In [None]:
# Build the model inputs: for every yield row that has a same-district
# station, x is the list of daily weather rows of its station-months and y is
# the yield per hectare.
model_input_x = list()
model_input_y = list()
weather_feature_columns = ['StnPres','Temperature','WS','WD','Precp','SunShine']
yield_key_columns = ['city','district','yield_year']
# Built once; stationinfos has many entries per yield row, so it cannot be
# indexed with yield-row labels and is matched on the yield row's keys instead.
stationinfos_df = pd.DataFrame(stationinfos).drop_duplicates().reset_index(drop=True)
for succeed_match_station in succeed_match_stations:
  yield_key = yield_data_df.loc[[succeed_match_station], yield_key_columns]
  # Station-months scheduled for this yield row's city, district and year.
  station_months = pd.merge(left=stationinfos_df, right=yield_key, how='inner')
  # Attach the downloaded observations (joined on the shared columns).
  t = pd.merge(
      left=station_months,
      right=station_monthly_data_all_directly_matched,
      how='inner'
  )
  model_input_x.append(t.loc[:, weather_feature_columns].values.tolist())
  model_input_y.append(yield_data_df.loc[succeed_match_station,'每公頃收穫量(公斤)'])

In [None]:
# Inspect the row index of the yield table.
yield_data_df.index

In [None]:
# Print every sample: its position, its target value, then its weather rows.
for sample_number, weather_rows in enumerate(model_input_x):
  print("I is "+str(sample_number))
  print(model_input_y[sample_number])
  print(weather_rows)
