In [2]:
import pandas as pd
import re
import glob

### 轉換單一農測站的單一資料 => 單一欄位(Series)

In [3]:
# Load the 2020 daily mean-temperature table ("平均氣溫") of a single station.
# header=None keeps the file's own header line as data row 0 and labels the
# columns with the integers 0..12, as the displayed head() shows.
raw_csv_path = "../data/Agricultural_Weather_Datas/凍頂茶改/2020平均氣溫(℃).csv"
df = pd.read_csv(raw_csv_path, header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,2020年 日/月,1月,2月,3月,4月,5月,6月,7月,8月,9月,10月,11月,12月
1,1,15.6,12.4,21.1,20.9,23.7,26.3,26.6,26.2,26.0,24.7,21.4,19.1
2,2,16.3,14.6,18.4,18.9,24.2,25.7,25.3,25.8,26.4,25.1,21.8,19.4
3,3,17.0,15.4,19.3,19.5,24.4,25.8,24.9,24.0,25.0,25.3,20.8,16.7
4,4,17.1,14.8,19.1,18.9,25.3,26.5,25.9,24.8,26.1,25.6,21.0,16.9


In [4]:
# Remove the rows and the column that do not hold daily readings:
#   - row 0 is the file's own header line (month labels),
#   - row 32 is presumably a trailing summary row -- TODO confirm against the CSV,
#   - column 0 only carries the day-of-month labels.
# Afterwards row positions 0..30 are days 1..31 and column positions 0..11
# are January..December.
df = df.drop(index=[0, 32])
df = df.drop(columns=0)
df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12
1,15.6,12.4,21.1,20.9,23.7,26.3,26.6,26.2,26.0,24.7,21.4,19.1
2,16.3,14.6,18.4,18.9,24.2,25.7,25.3,25.8,26.4,25.1,21.8,19.4
3,17.0,15.4,19.3,19.5,24.4,25.8,24.9,24.0,25.0,25.3,20.8,16.7
4,17.1,14.8,19.1,18.9,25.3,26.5,25.9,24.8,26.1,25.6,21.0,16.9
5,17.3,15.6,14.7,16.7,25.8,26.5,25.8,25.4,25.5,24.5,22.2,18.2
6,17.2,16.3,17.7,17.3,25.8,25.2,26.3,27.3,24.3,24.0,22.1,19.1
7,17.9,16.5,20.5,17.8,26.3,24.5,27.4,26.8,24.9,22.7,22.8,20.0
8,17.6,14.9,20.0,18.7,26.5,23.4,26.5,26.4,25.2,22.3,19.5,17.8
9,17.5,13.7,22.8,19.8,26.2,24.1,26.6,26.7,25.9,23.5,18.6,18.0
10,17.3,15.6,18.5,21.0,26.7,25.8,27.1,25.4,24.8,23.9,18.4,19.9


## 方法一
依照資料順序與2020每日做結合

優點:
- 快速

缺點:
- 若某個月實際存在的日期中出現缺值（例如 1~30 日之間）=> dropna 會連同該筆一起刪掉，之後的資料全部往前位移，整個順序就會亂掉，牛頭不對馬嘴

In [6]:
# Method 1: flatten the table month by month (column by column) into one list.
# dropna() is relied on to discard the day slots a month does not have
# (e.g. day 31 of a 30-day month), so this is only valid when no real
# observation is missing: any other NaN would be dropped as well and every
# later value would shift onto the wrong date.
# Note: the values are still strings at this point, as the output below shows.
output = [
    value
    for month_col in df.columns
    for value in df[month_col].dropna().tolist()
]

output[:5]

['15.6', '16.3', '17.0', '17.1', '17.3']

In [7]:
# Every calendar day of 2020, one entry per day (leap year: 366 entries).
time_2020 = pd.date_range(start="2020-01-01", end="2020-12-31", freq="D")

In [22]:
# Build the Method 1 frame: pair each 2020 calendar day with the values
# flattened month by month.
#
# The values are recomputed here instead of reading the shared `output` list,
# because the Method 2 cell rebinds `output`. If this cell is executed after
# Method 2 (the saved execution counts suggest it was), reusing `output` would
# silently build this frame from Method 2's values and make the later
# Method 1 vs Method 2 comparison trivially true.
#
# NOTE(review): the source file name says mean temperature ("平均氣溫"), but the
# column is labelled "最高溫度(℃)". The label is kept because later cells refer
# to it by name -- confirm which label is intended.
method1_values = [
    value
    for month_col in df.columns
    for value in df[month_col].dropna().tolist()
]
df_2020_tmp1 = pd.DataFrame({"日期": time_2020,
                             "最高溫度(℃)": method1_values})
df_2020_tmp1

Unnamed: 0,日期,最高溫度(℃)
0,2020-01-01,15.6
1,2020-01-02,16.3
2,2020-01-03,17.0
3,2020-01-04,17.1
4,2020-01-05,17.3
...,...,...
361,2020-12-27,18.1
362,2020-12-28,19.4
363,2020-12-29,20.3
364,2020-12-30,12.6


## 方法2
日期(2020-month-day)以 `df.iloc[day-1, month-1]`（列＝日、欄＝月）去匹配，如此一來可以確保每個值都完美匹配且避開缺值，但效率較差

In [9]:
# Every calendar day of 2020 (366 entries, leap year); the same range that
# Method 1 uses.
time_2020 = pd.date_range(start="2020-01-01", end="2020-12-31", freq="D")

In [14]:
# Render every timestamp as text ("YYYY-MM-DD HH:MM:SS") so that the month and
# the day can be sliced out by character position in the next step.
days = list(map(str, time_2020))
days[:5]

['2020-01-01 00:00:00',
 '2020-01-02 00:00:00',
 '2020-01-03 00:00:00',
 '2020-01-04 00:00:00',
 '2020-01-05 00:00:00']

In [15]:
# Turn each date string into a (month, day) pair of ints: characters 5-6 hold
# the month and characters 8-9 the day. The pairs are used to look values up
# in df.
data = [(int(stamp[5:7]), int(stamp[8:10])) for stamp in days]

In [20]:
# Method 2: fetch each date's value by position. With the label row and label
# column already dropped, row position = day - 1 and column position =
# month - 1, so day slots that do not exist in a month are never visited.
output = [df.iloc[day - 1, mon - 1] for mon, day in data]
output[:5]

['15.6', '16.3', '17.0', '17.1', '17.3']

In [23]:
# Method 2 frame: one row per 2020 calendar day with the value looked up by
# position.
# NOTE(review): the source file name says mean temperature ("平均氣溫") while the
# column is labelled "最高溫度(℃)" -- confirm which label is intended.
method2_columns = {"日期": time_2020, "最高溫度(℃)": output}
df_2020_tmp2 = pd.DataFrame(method2_columns)

df_2020_tmp2

Unnamed: 0,日期,最高溫度(℃)
0,2020-01-01,15.6
1,2020-01-02,16.3
2,2020-01-03,17.0
3,2020-01-04,17.1
4,2020-01-05,17.3
...,...,...
361,2020-12-27,18.1
362,2020-12-28,19.4
363,2020-12-29,20.3
364,2020-12-30,12.6


#### 檢驗資料 => 方法一&方法二產出的值是否一樣?

In [24]:
# Line the two results up by date. The value column exists in both frames, so
# it comes out with the _x (Method 1) and _y (Method 2) suffixes.
test = df_2020_tmp1.merge(df_2020_tmp2, how="left", on="日期")
test

Unnamed: 0,日期,最高溫度(℃)_x,最高溫度(℃)_y
0,2020-01-01,15.6,15.6
1,2020-01-02,16.3,16.3
2,2020-01-03,17.0,17.0
3,2020-01-04,17.1,17.1
4,2020-01-05,17.3,17.3
...,...,...,...
361,2020-12-27,18.1,18.1
362,2020-12-28,19.4,19.4
363,2020-12-29,20.3,20.3
364,2020-12-30,12.6,12.6


In [27]:
# Row-wise flag: True where Method 1 and Method 2 produced the same value.
test["相等與否"] = test["最高溫度(℃)_x"].eq(test["最高溫度(℃)_y"])

In [30]:
# Distinct values of the flag; a lone True means both methods agree on every date.
pd.unique(test["相等與否"])

array([ True])

### 合併單一測站多份資料(延續上面的單份資料轉換，依據年份統合各欄位成一個dataframe)

將各測站每年資料統合成一個DATAFRAME，因為年份越久，資料完整度越差，因此先做2018~2020即可