In [2]:
import json
import re

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole
# session; the library default for this option is 'warn'.
pd.options.mode.chained_assignment = None

## data cleaning

In [3]:
# Load the raw fruit-production CSV and show it.
df = pd.read_csv(
    filepath_or_buffer="../data/產量_臺灣地區果品生產概況(2011~2019).csv",
    encoding="utf8",
)
df

Unnamed: 0,年度,地區別,果品類別,種植株數,收穫株數,種植面積_公頃,收穫面積_公頃,每株平均產量_公斤,每公頃平均產量_公斤,產量_公噸
0,2019,新北市,香蕉,108619,102262,74.99,71.01,13.74,19793.48,1405.535
1,2019,台北市,香蕉,12830,12830,10.36,10.36,12.55,15543.15,161.027
2,2019,桃園市,香蕉,44420,43940,57.11,56.47,18.85,14666.65,828.226
3,2019,台中市,香蕉,574129,560455,629.71,614.39,15.54,14178.52,8711.141
4,2019,台南市,香蕉,1481610,1476420,1062.65,1057.46,16.26,22703.58,24008.13
...,...,...,...,...,...,...,...,...,...,...
7066,2011,基隆市,其他水果,327,327,1.01,1.01,10,3237.62,3.27
7067,2011,新竹市,其他水果,2538,2538,1.84,1.84,10.56,14560.87,26.792
7068,2011,嘉義市,其他水果,25711,25711,38.43,38.43,16.66,11147.46,428.397
7069,2011,金門縣,其他水果,0,0,0,0,0,0,0


### 保留香蕉資料

In [4]:
# Keep only the rows whose fruit category (果品類別) is banana (香蕉).
df_banana = df[df["果品類別"].eq("香蕉")]
df_banana

Unnamed: 0,年度,地區別,果品類別,種植株數,收穫株數,種植面積_公頃,收穫面積_公頃,每株平均產量_公斤,每公頃平均產量_公斤,產量_公噸
0,2019,新北市,香蕉,108619,102262,74.99,71.01,13.74,19793.48,1405.535
1,2019,台北市,香蕉,12830,12830,10.36,10.36,12.55,15543.15,161.027
2,2019,桃園市,香蕉,44420,43940,57.11,56.47,18.85,14666.65,828.226
3,2019,台中市,香蕉,574129,560455,629.71,614.39,15.54,14178.52,8711.141
4,2019,台南市,香蕉,1481610,1476420,1062.65,1057.46,16.26,22703.58,24008.13
...,...,...,...,...,...,...,...,...,...,...
6318,2011,基隆市,香蕉,3786,3786,3.57,3.57,9.22,9782.07,34.922
6319,2011,新竹市,香蕉,8802,8802,5.64,5.64,14.38,22448.58,126.61
6320,2011,嘉義市,香蕉,79710,79710,53.14,53.14,12.05,18081.6,960.856
6321,2011,金門縣,香蕉,0,0,0,0,0,0,0


### 去除不必要地區別["臺灣省","福建省"]

In [5]:
# Drop the rows whose region (地區別) is one of the two province-level entries,
# keeping every other region.
df_banana = df_banana.loc[~df_banana["地區別"].isin(["臺灣省", "福建省"]), :]
df_banana

Unnamed: 0,年度,地區別,果品類別,種植株數,收穫株數,種植面積_公頃,收穫面積_公頃,每株平均產量_公斤,每公頃平均產量_公斤,產量_公噸
0,2019,新北市,香蕉,108619,102262,74.99,71.01,13.74,19793.48,1405.535
1,2019,台北市,香蕉,12830,12830,10.36,10.36,12.55,15543.15,161.027
2,2019,桃園市,香蕉,44420,43940,57.11,56.47,18.85,14666.65,828.226
3,2019,台中市,香蕉,574129,560455,629.71,614.39,15.54,14178.52,8711.141
4,2019,台南市,香蕉,1481610,1476420,1062.65,1057.46,16.26,22703.58,24008.13
...,...,...,...,...,...,...,...,...,...,...
6318,2011,基隆市,香蕉,3786,3786,3.57,3.57,9.22,9782.07,34.922
6319,2011,新竹市,香蕉,8802,8802,5.64,5.64,14.38,22448.58,126.61
6320,2011,嘉義市,香蕉,79710,79710,53.14,53.14,12.05,18081.6,960.856
6321,2011,金門縣,香蕉,0,0,0,0,0,0,0


In [29]:
# Cells that hold exactly "-" become 0 so the columns can be converted to numbers.
df_banana = df_banana.replace(to_replace="-", value=0)

In [30]:
# Measurement columns that must be numeric for the later aggregation.
target_cols = [
    '種植株數',
    '收穫株數',
    '種植面積_公頃',
    '收穫面積_公頃',
    '每株平均產量_公斤',
    '每公頃平均產量_公斤',
    '產量_公噸',
]
# Convert column by column; pd.to_numeric raises if a value cannot be parsed.
df_banana[target_cols] = df_banana[target_cols].apply(pd.to_numeric, axis=0)
df_banana

Unnamed: 0,年度,地區別,果品類別,種植株數,收穫株數,種植面積_公頃,收穫面積_公頃,每株平均產量_公斤,每公頃平均產量_公斤,產量_公噸
0,2019,新北市,香蕉,108619,102262,74.99,71.01,13.74,19793.48,1405.535
1,2019,台北市,香蕉,12830,12830,10.36,10.36,12.55,15543.15,161.027
2,2019,桃園市,香蕉,44420,43940,57.11,56.47,18.85,14666.65,828.226
3,2019,台中市,香蕉,574129,560455,629.71,614.39,15.54,14178.52,8711.141
4,2019,台南市,香蕉,1481610,1476420,1062.65,1057.46,16.26,22703.58,24008.130
...,...,...,...,...,...,...,...,...,...,...
6318,2011,基隆市,香蕉,3786,3786,3.57,3.57,9.22,9782.07,34.922
6319,2011,新竹市,香蕉,8802,8802,5.64,5.64,14.38,22448.58,126.610
6320,2011,嘉義市,香蕉,79710,79710,53.14,53.14,12.05,18081.60,960.856
6321,2011,金門縣,香蕉,0,0,0.00,0.00,0.00,0.00,0.000


In [33]:
# Round production (metric tons) to two decimals, one value at a time with
# Python's built-in round().
df_banana["產量_公噸"] = df_banana["產量_公噸"].apply(round, ndigits=2)


In [34]:
# Inspect the banana frame after the production column has been rounded.
df_banana

Unnamed: 0,年度,地區別,果品類別,種植株數,收穫株數,種植面積_公頃,收穫面積_公頃,每株平均產量_公斤,每公頃平均產量_公斤,產量_公噸
0,2019,新北市,香蕉,108619,102262,74.99,71.01,13.74,19793.48,1405.54
1,2019,台北市,香蕉,12830,12830,10.36,10.36,12.55,15543.15,161.03
2,2019,桃園市,香蕉,44420,43940,57.11,56.47,18.85,14666.65,828.23
3,2019,台中市,香蕉,574129,560455,629.71,614.39,15.54,14178.52,8711.14
4,2019,台南市,香蕉,1481610,1476420,1062.65,1057.46,16.26,22703.58,24008.13
...,...,...,...,...,...,...,...,...,...,...
6318,2011,基隆市,香蕉,3786,3786,3.57,3.57,9.22,9782.07,34.92
6319,2011,新竹市,香蕉,8802,8802,5.64,5.64,14.38,22448.58,126.61
6320,2011,嘉義市,香蕉,79710,79710,53.14,53.14,12.05,18081.60,960.86
6321,2011,金門縣,香蕉,0,0,0.00,0.00,0.00,0.00,0.00


In [35]:
# Persist the cleaned banana table; the row index is not written to the file.
df_banana.to_csv(
    path_or_buf="cleaned data/產量_臺灣香蕉生產概況(2011~2019).csv",
    index=False,
)

### 選取欲使用欄位

In [None]:
# Columns kept for the all-fruit analysis: year, region, fruit category,
# planted area (ha), harvested area (ha) and production (metric tons).
wanted_col = ['年度', '地區別', '果品類別', '種植面積_公頃', '收穫面積_公頃', '產量_公噸']

# Take an explicit copy: the following cells assign into this frame in place,
# and an independent copy makes those writes unambiguous instead of relying on
# the globally silenced SettingWithCopy warning.
df = df[wanted_col].copy()

### 去除含有字串的資料，將其以0取代，完成後將target_col轉成numeric資料型態

In [None]:
# Numeric measurement columns of the all-fruit table.
target_col = ['種植面積_公頃', '收穫面積_公頃', '產量_公噸']

# Rows whose harvested-area cell contains a text marker (an uppercase letter or
# "-") get all three measurement columns set to 0.
# The .str accessor only exists on text columns, so the step is skipped when the
# column is already numeric; this lets the cell be re-run after a successful
# conversion without raising AttributeError.
if not pd.api.types.is_numeric_dtype(df["收穫面積_公頃"]):
    df.loc[(df["收穫面積_公頃"].str.contains(r"[A-Z-]", na=False)), target_col] = 0

# Convert the three columns to numeric dtypes; raises if any value is still unparseable.
df[target_col] = df[target_col].apply(pd.to_numeric)

In [None]:
# Show the dtype of every column in df.
df.dtypes

### 視覺化在地圖上(2011~2019變化)

In [None]:
# Yearly totals of the banana table. numeric_only=True keeps the text columns
# (region, fruit category) out of the sum; on pandas >= 2 they would otherwise
# be concatenated into long strings.
df_banana.groupby("年度").sum(numeric_only=True)

#### 總產量趨勢圖

In [None]:
# x: the years 2011..2019 (the upper bound of arange is exclusive).
x = np.arange(2011, 2020)
# y: per-year totals of the numeric banana columns. numeric_only=True stops the
# text columns from being string-concatenated on pandas >= 2.
y = df_banana.groupby("年度").sum(numeric_only=True)

In [None]:
# Yearly production (dashed line, left axis) against planted / harvested area
# (paired bars, right axis).
fig, ax1 = plt.subplots(figsize=(8,6))

# Offset of each bar from the year tick; each bar is 2 * width wide so the two
# bars of a year sit side by side, centred on the tick.
width = 0.2
ax1.set_xticks(x)
ax1.plot(x, y["產量_公噸"], "--", marker="o", color="black", label="production")
ax1.set_xlabel("Year")
ax1.set_ylabel("Production(ton)", color="black")
ax1.tick_params(axis="y", labelcolor="black")

# Second y-axis sharing the same x-axis, used for the area bars.
ax2 = ax1.twinx()
ax2.bar(x-width, y["種植面積_公頃"], width=2*width, alpha=0.4, label="planted area")
ax2.bar(x+width, y["收穫面積_公頃"], width=2*width, alpha=0.4, label="harvested area")
ax2.set_ylabel("Area(hectare)", color="blue")
ax2.tick_params(axis="y", labelcolor="blue")

# A legend built from one axis only lists that axis' artists, so collect the
# handles of both axes to include the production line next to the bars.
line_handles, line_labels = ax1.get_legend_handles_labels()
bar_handles, bar_labels = ax2.get_legend_handles_labels()
ax2.legend(line_handles + bar_handles, line_labels + bar_labels)

fig.tight_layout()
plt.show()

## 依照年度、果品類別、地區別進行groupby

In [None]:
# Sum per (year, fruit category, region), then keep the three measurement columns.
grouped_df = (
    df
    .groupby(["年度", "果品類別", "地區別"])
    .sum()
    .loc[:, ['種植面積_公頃', '收穫面積_公頃', '產量_公噸']]
)

In [None]:
# Show the grouped totals.
grouped_df

### 取得2019年度全國香蕉生產資料

In [None]:
# Banana rows for 2019. The year is compared as text so the filter matches
# whether read_csv parsed 年度 as integers or left it as strings; comparing an
# integer column with the string "2019" would silently select no rows.
merger_data = df.loc[(df["年度"].astype(str) == "2019") & (df["果品類別"] == "香蕉")]

In [None]:
# Region, fruit category and production only, largest production first.
df_2019banana = (
    merger_data
    .loc[:, ["地區別", "果品類別", "產量_公噸"]]
    .sort_values("產量_公噸", ascending=False)
)

In [None]:
# Rewrite every "台" in the region names as "臺" (plain text substitution).
# presumably this matches the spelling used in the county shapefile's COUNTYNAME — verify
df_2019banana["地區別"] = df_2019banana["地區別"].str.replace("台", "臺", regex=False)

In [None]:
# Relabel the three columns positionally; COUNTYNAME is the key used later to
# merge with the county boundary table.
df_2019banana = df_2019banana.set_axis(["COUNTYNAME", "fruits", "production"], axis="columns")

In [None]:
# Renumber the rows 0..n-1 in their current order, discarding the old index.
df_2019banana = df_2019banana.reset_index(drop=True)

In [None]:
# Show the 2019 banana table after renaming and re-indexing.
df_2019banana

## 透過GeoPandas將上面的結果顯示於地圖

### 從政府資料開放平台取得台灣縣市行政圖的邊界&經緯度

[台灣縣市地圖](https://data.gov.tw/dataset/7441)

In [None]:
# Load the county boundary shapefile, decoding its attribute text as UTF-8.
city_shp = gpd.read_file(
    "./mapdata202008310842/COUNTY_MOI_1090820.shp",
    encoding="utf-8",
)

In [None]:
# Show the loaded county boundary table.
city_shp

### 台灣地圖

In [None]:
# Base map: draw every shape in city_shp in a flat grey on a 10x10 figure.
fig,ax = plt.subplots(figsize = (10, 10))
city_shp.plot(ax=ax, color="grey") # alternative considered: cmap="RdBu"

# Crop the view to x in [117, 123] and y in [21, 26].
# NOTE(review): presumably longitude / latitude in degrees — confirm against the shapefile's CRS.
ax.set_xlim(117,123)
ax.set_ylim(21,26)

### 將df_2019banana資料與city_shp進行merge

In [None]:
# Left join on COUNTYNAME: every row of the boundary table is kept, and rows
# without a matching banana record get missing values in the joined columns.
whole_data = pd.merge(
    left=city_shp,
    right=df_2019banana,
    how="left",
    on="COUNTYNAME",
)

In [None]:
# Show the merged table (boundary rows with banana production attached).
whole_data

### Visualize the result on map

In [None]:
# Choropleth of 2019 banana production per county.
fig, ax = plt.subplots(figsize=(10, 10))
whole_data.plot(
    ax=ax,
    column="production",
    cmap="YlGn",
    # Draw a colour bar so the shading can be read as a quantity.
    legend=True,
    legend_kwds={"label": "Production (ton)"},
    # Rows with no production value are left undrawn by default and would show
    # the background colour; paint them light grey so they stay visible.
    missing_kwds={"color": "lightgrey"},
)

ax.set_xlim(118,123)
ax.set_ylim(21,26)
ax.set_title("2019 Taiwan banana production overview", size = 20)
ax.set_facecolor("skyblue")

### 後續問題
- 如何加上顏色條?
- 排行榜?