In [1]:
import pandas as pd
import numpy as np
import re
import os
import glob

In [2]:
# Weather stations: every entry directly under the raw-data folder is taken as
# one station name.
# os.path.basename is used rather than splitting on "\\" so the cell also works
# when glob returns "/"-separated paths; sorted() makes the order reproducible
# instead of depending on the file system's listing order.
stations = sorted(
    os.path.basename(station_dir)
    for station_dir in glob.glob("../data/Agricultural_Weather_Datas/*")
)
stations

['七股研究中心',
 '凍頂茶改',
 '口湖工作站',
 '嘉義分場',
 '嘉義農試',
 '四湖植物園',
 '埔里分場',
 '恆春工作站',
 '恆春畜試',
 '旗南農改',
 '東港工作站',
 '林試六龜中心',
 '林試扇平站',
 '林試畢祿溪站',
 '溪口農場',
 '畜試所',
 '種苗繁殖',
 '義竹分場',
 '臺中農改',
 '臺南蘭花園區',
 '臺南農改',
 '臺大內茅埔',
 '臺大和社',
 '臺大溪頭',
 '臺大竹山',
 '臺大雲林校區',
 '臺西水試所',
 '蓮華池',
 '農業試驗所',
 '雲林分場',
 '高雄農改',
 '魚池茶改',
 '鳳山農試']

In [3]:
# Show the current calendar year; csv_to_df compares its `year` argument
# against this same expression to decide how missing values are filled.
pd.Timestamp.now().year

2021

In [4]:
def csv_to_df(csv, year):
    """Flatten one station CSV (a month-by-day grid) into a daily two-column frame.

    The file is read without a header and transposed; column labels 0 and 32
    and row label 0 are then dropped (presumably header / summary cells --
    verify against the raw files), leaving a table that is indexed
    positionally as [month - 1, day - 1].

    Parameters
    ----------
    csv : str
        Path of the CSV file. The output column is named after the file's base
        name with its first four characters (the caller globs on ``{year}*``,
        so presumably the year) and everything from the first "." removed.
    year : int
        Year whose calendar days are looked up in the table.

    Returns
    -------
    pandas.DataFrame
        Column "日期" holds every date of ``year``; the second column holds the
        cell found for that date. Cells that do not start with a digit are
        treated as missing: for the current year every missing value becomes 0,
        for any other year it is forward-filled from the previous day (a
        missing value on the first day therefore stays NaN).

    Side effects
    ------------
    Assigns the module-level name ``file_name``.
    """
    # Still published as a global because the caller's error message reads it.
    global file_name
    # Split on either separator so the label does not depend on the platform
    # that produced the path, and always take the last path component.
    base_name = re.split(r"[\\/]", csv)[-1]
    file_name = base_name.split(".")[0][4:]

    # Blank cells become the string "0" so they pass the digit test below.
    table = pd.read_csv(csv, header=None).T.fillna("0")
    table = table.drop(columns=[0, 32]).drop(index=0)

    # Every calendar day of the requested year.
    whole_year_date = pd.date_range(f"{year}-01-01", f"{year}-12-31")

    # NOTE(review): a value beginning with "-" fails the digit test and is
    # treated as missing -- confirm this is intended for sub-zero readings.
    output = []
    for date in whole_year_date:
        data = table.iloc[date.month - 1, date.day - 1]
        # str() keeps the check from raising if a cell was parsed as a number.
        output.append(data if re.match(r"\d", str(data)) else np.nan)

    df_col = pd.DataFrame({"日期": whole_year_date, f"{file_name}": output})

    if year == pd.Timestamp.now().year:
        return df_col.fillna(0)
    # DataFrame.ffill() replaces the deprecated fillna(method="ffill").
    return df_col.ffill()


In [5]:
def merged_dataframe_to_csv(target_stations, start_year, end_year):
    """Merge each station's per-file daily series into one CSV per year.

    For every station and every year in ``start_year``..``end_year``
    (inclusive), all files matching
    ``../data/Agricultural_Weather_Datas/{station}/{year}*`` are converted with
    ``csv_to_df`` and left-joined on "日期" onto a frame holding every date of
    that year. The result is written to
    ``./cleaned data/Agricultural_Weather_Datas(merged)/{station}/{year}_{station}.csv``.

    Parameters
    ----------
    target_stations : iterable of str
        Station directory names to process.
    start_year, end_year : int
        First and last year to process, both inclusive.

    Notes
    -----
    A file that fails to convert or merge is reported on stdout and skipped;
    the yearly CSV is still written with whatever columns succeeded.
    """
    for station in target_stations:
        path = f"./cleaned data/Agricultural_Weather_Datas(merged)/{station}"
        os.makedirs(path, exist_ok=True)

        for year in range(start_year, end_year + 1):
            whole_year = pd.date_range(f"{year}-01-01", f"{year}-12-31")
            df_output = pd.DataFrame({"日期": whole_year})

            # Sorted so the column order of the output does not depend on the
            # order in which the file system happens to list the files.
            pattern = f"../data/Agricultural_Weather_Datas/{station}/{year}*"
            for csv in sorted(glob.glob(pattern)):
                try:
                    df_col = csv_to_df(csv, year)
                    df_output = pd.merge(df_output, df_col, on="日期", how="left")
                except Exception as exc:
                    # Build the label from the path itself: the global set by
                    # csv_to_df may be missing or left over from an earlier
                    # file when the failure happens before it is assigned.
                    label = os.path.basename(csv).split(".")[0][4:]
                    print(f"{station}-{year}-{label}: Something got wrong! ({exc!r})")

            df_output.to_csv(f"{path}/{year}_{station}.csv", index=False)

            print(f"{station}-{year}: Done" + "="*20)

In [6]:
# Trial run: process every discovered station for the years 2011 through 2021.
target_stations = stations
start_year, end_year = 2011, 2021

merged_dataframe_to_csv(
    target_stations=target_stations,
    start_year=start_year,
    end_year=end_year,
)

