In [1]:
#import geopandas as gpd
import json
import re
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
pd.options.mode.chained_assignment = None  # default='warn'

## 資料清洗

In [2]:
# Load the raw fruit-production CSV (Taiwan, 2011~2019) as UTF-8 text.
# NOTE(review): the rendered output shows an "Unnamed: 0" column, which suggests
# the file was saved with its index — confirm whether that column is needed.
df = pd.read_csv(
    filepath_or_buffer="../data/產量_臺灣地區果品生產概況(2011~2019).csv",
    encoding="utf8",
)
# Show the loaded table as the cell output.
df

Unnamed: 0,年度,地區別,果品類別,種植株數,收穫株數,種植面積_公頃,收穫面積_公頃,每株平均產量_公斤,每公頃平均產量_公斤,產量_公噸
0,2019,新北市,香蕉,108619,102262,74.99,71.01,13.74,19793.48,1405.535
1,2019,台北市,香蕉,12830,12830,10.36,10.36,12.55,15543.15,161.027
2,2019,桃園市,香蕉,44420,43940,57.11,56.47,18.85,14666.65,828.226
3,2019,台中市,香蕉,574129,560455,629.71,614.39,15.54,14178.52,8711.141
4,2019,台南市,香蕉,1481610,1476420,1062.65,1057.46,16.26,22703.58,24008.13
...,...,...,...,...,...,...,...,...,...,...
7066,2011,基隆市,其他水果,327,327,1.01,1.01,10,3237.62,3.27
7067,2011,新竹市,其他水果,2538,2538,1.84,1.84,10.56,14560.87,26.792
7068,2011,嘉義市,其他水果,25711,25711,38.43,38.43,16.66,11147.46,428.397
7069,2011,金門縣,其他水果,0,0,0,0,0,0,0


### 保留香蕉資料

In [3]:
# Keep only the rows whose fruit category (果品類別) is banana (香蕉).
df_banana = df[df["果品類別"].eq("香蕉")]
# Show the filtered table as the cell output.
df_banana

Unnamed: 0,年度,地區別,果品類別,種植株數,收穫株數,種植面積_公頃,收穫面積_公頃,每株平均產量_公斤,每公頃平均產量_公斤,產量_公噸
0,2019,新北市,香蕉,108619,102262,74.99,71.01,13.74,19793.48,1405.535
1,2019,台北市,香蕉,12830,12830,10.36,10.36,12.55,15543.15,161.027
2,2019,桃園市,香蕉,44420,43940,57.11,56.47,18.85,14666.65,828.226
3,2019,台中市,香蕉,574129,560455,629.71,614.39,15.54,14178.52,8711.141
4,2019,台南市,香蕉,1481610,1476420,1062.65,1057.46,16.26,22703.58,24008.13
...,...,...,...,...,...,...,...,...,...,...
6318,2011,基隆市,香蕉,3786,3786,3.57,3.57,9.22,9782.07,34.922
6319,2011,新竹市,香蕉,8802,8802,5.64,5.64,14.38,22448.58,126.61
6320,2011,嘉義市,香蕉,79710,79710,53.14,53.14,12.05,18081.6,960.856
6321,2011,金門縣,香蕉,0,0,0,0,0,0,0


### 去除不必要地區別["臺灣省","福建省"]

In [4]:
# Drop the rows whose region (地區別) is one of the two province-level entries.
# NOTE(review): the city names visible in the output use 台 (e.g. 台北市) while
# this list uses 臺 — confirm the data really spells the province as 臺灣省,
# otherwise that entry filters nothing.
df_banana = df_banana.loc[
    lambda frame: ~frame["地區別"].isin(["臺灣省", "福建省"])
]
# Show the remaining rows as the cell output.
df_banana

Unnamed: 0,年度,地區別,果品類別,種植株數,收穫株數,種植面積_公頃,收穫面積_公頃,每株平均產量_公斤,每公頃平均產量_公斤,產量_公噸
0,2019,新北市,香蕉,108619,102262,74.99,71.01,13.74,19793.48,1405.535
1,2019,台北市,香蕉,12830,12830,10.36,10.36,12.55,15543.15,161.027
2,2019,桃園市,香蕉,44420,43940,57.11,56.47,18.85,14666.65,828.226
3,2019,台中市,香蕉,574129,560455,629.71,614.39,15.54,14178.52,8711.141
4,2019,台南市,香蕉,1481610,1476420,1062.65,1057.46,16.26,22703.58,24008.13
...,...,...,...,...,...,...,...,...,...,...
6318,2011,基隆市,香蕉,3786,3786,3.57,3.57,9.22,9782.07,34.922
6319,2011,新竹市,香蕉,8802,8802,5.64,5.64,14.38,22448.58,126.61
6320,2011,嘉義市,香蕉,79710,79710,53.14,53.14,12.05,18081.6,960.856
6321,2011,金門縣,香蕉,0,0,0,0,0,0,0


In [5]:
# Replace every cell that is exactly "-" with 0, across all columns.
df_banana = df_banana.replace(to_replace="-", value=0)

In [6]:
# Measurement columns that are converted to numeric dtypes below.
target_cols = [
    '種植株數',
    '收穫株數',
    '種植面積_公頃',
    '收穫面積_公頃',
    '每株平均產量_公斤',
    '每公頃平均產量_公斤',
    '產量_公噸',
]

# Convert each listed column with pd.to_numeric, then write them back together.
numeric_values = df_banana[target_cols].apply(pd.to_numeric, axis=0)
df_banana[target_cols] = numeric_values


def _round_to_2dp(value):
    """Round a single value to two decimal places with the built-in round."""
    return round(value, 2)


# Round the production column (產量_公噸) element by element.
df_banana["產量_公噸"] = df_banana["產量_公噸"].map(_round_to_2dp)
# Show the converted table as the cell output.
df_banana

Unnamed: 0,年度,地區別,果品類別,種植株數,收穫株數,種植面積_公頃,收穫面積_公頃,每株平均產量_公斤,每公頃平均產量_公斤,產量_公噸
0,2019,新北市,香蕉,108619,102262,74.99,71.01,13.74,19793.48,1405.54
1,2019,台北市,香蕉,12830,12830,10.36,10.36,12.55,15543.15,161.03
2,2019,桃園市,香蕉,44420,43940,57.11,56.47,18.85,14666.65,828.23
3,2019,台中市,香蕉,574129,560455,629.71,614.39,15.54,14178.52,8711.14
4,2019,台南市,香蕉,1481610,1476420,1062.65,1057.46,16.26,22703.58,24008.13
...,...,...,...,...,...,...,...,...,...,...
6318,2011,基隆市,香蕉,3786,3786,3.57,3.57,9.22,9782.07,34.92
6319,2011,新竹市,香蕉,8802,8802,5.64,5.64,14.38,22448.58,126.61
6320,2011,嘉義市,香蕉,79710,79710,53.14,53.14,12.05,18081.60,960.86
6321,2011,金門縣,香蕉,0,0,0.00,0.00,0.00,0.00,0.00


In [7]:
# Save the cleaned banana table into the "cleaned data" folder (index omitted).
output_path = Path("cleaned data/產量_臺灣香蕉生產概況(2011~2019).csv")
# to_csv does not create missing folders and fails if the target directory is
# absent (e.g. on a fresh checkout), so make sure it exists before writing.
output_path.parent.mkdir(parents=True, exist_ok=True)
df_banana.to_csv(output_path, index=False)