In [2]:
import re
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from selenium.webdriver import Chrome
from selenium.webdriver.support.ui import Select
import time

### Create function to run selenium and retrieve soup from HTML

<font size=3 color=Blue>**參數說明**</font>
> specific_day: 查詢日期（單日 or 開始日期），格式如 "110/06/04"<br>
> period_mode: 查詢特定日期到今日為止的期間模式，預設為False<br>
> fruit_name: 水果名稱（須為 fruit_dict 的鍵，例如 "鳳梨"）

In [3]:
# Maps a fruit name to the option value selected in the site's product list.
# For pineapple, the Golden Diamond (金鑽) variety accounts for most of the trade.
fruit_dict = {
    "香蕉": "A1",
    "鳳梨": "B2",
}

In [7]:
def All_Market_Prices(specific_day, fruit_name, period_mode=False):
    """Drive the amis.afa.gov.tw fruit price query page and return it as soup.

    Opens the daily fruit transaction page in Chrome, types the query date,
    picks the first market-range option and the requested product, submits
    the query and parses the resulting page source.

    Parameters
    ----------
    specific_day : str
        Date typed into the date box, e.g. "110/06/04". In period mode this
        is the start date of the queried period.
    fruit_name : str
        Key of the module-level ``fruit_dict``; the mapped value is the option
        value selected in the product list.
    period_mode : bool, default False
        When truthy, the second date-scope radio button (period) is clicked
        so the query covers a period instead of a single day.

    Returns
    -------
    bs4.BeautifulSoup
        The page source after the query, parsed with ``html.parser``.

    Raises
    ------
    KeyError
        If ``fruit_name`` is not a key of ``fruit_dict``. Raised before any
        browser is started.
    """
    # Resolve the product code up front so a bad fruit name fails fast
    # instead of after a browser window has already been opened.
    fruit_code = fruit_dict[fruit_name]

    url = "https://amis.afa.gov.tw/fruit/FruitProdDayTransInfo.aspx"
    driver = Chrome("../../chromedriver")
    try:
        driver.get(url)

        ## Date input: drop the read-only attribute, clear the existing value, then type the date
        driver.execute_script("$('input[id=ctl00_contentPlaceHolder_txtSTransDate]').removeAttr('readonly')")
        driver.find_element_by_id('ctl00_contentPlaceHolder_txtSTransDate').clear()
        driver.find_element_by_id('ctl00_contentPlaceHolder_txtSTransDate').send_keys(specific_day)

        ## Period mode: switch the date scope radio button to "period"
        if period_mode:
            driver.find_element_by_xpath("//*[@id='ctl00_contentPlaceHolder_ucDateScope_rblDateScope_1']").click()

        ## Market selection: the picker is rendered inside the first iframe
        driver.find_element_by_xpath("//*[@id='ctl00_contentPlaceHolder_txtMarket']").click()
        iframe = driver.find_elements_by_tag_name("iframe")[0]
        driver.switch_to.frame(iframe)
        radio_target = driver.find_element_by_xpath("//*[@id='radlMarketRange_0']")
        radio_target.click()
        time.sleep(2)
        # The next element lives in the main document, so leave the iframe
        # explicitly rather than relying on the driver resetting the context.
        driver.switch_to.default_content()

        ## Product selection: again inside the first iframe of the main document
        driver.find_element_by_xpath("//*[@id='ctl00_contentPlaceHolder_txtProduct']").click()
        iframe = driver.find_elements_by_tag_name("iframe")[0]
        driver.switch_to.frame(iframe)
        select = Select(driver.find_element_by_name('lstProduct'))
        select.select_by_value(fruit_code)
        driver.find_element_by_xpath("//*[@id='btnConfirm']").click()
        time.sleep(2)
        driver.switch_to.default_content()

        ## Submit the query and give the page time to render the result
        driver.find_element_by_xpath("//*[@id='ctl00_contentPlaceHolder_btnQuery']").click()
        time.sleep(2)

        ## Parse the rendered page
        return BeautifulSoup(driver.page_source, "html.parser")
    finally:
        # quit() ends the whole WebDriver session (browser and driver process),
        # and runs even when one of the steps above raises.
        driver.quit()

In [8]:
# Query pineapple (鳳梨) prices in period mode starting from 110/06/04,
# then display the parsed page.
soup = All_Market_Prices(
    specific_day="110/06/04",
    fruit_name="鳳梨",
    period_mode=True,
)
soup

<html xmlns="http://www.w3.org/1999/xhtml"><head><title>
	農產品批發市場交易行情站
</title><meta content="no-cache" http-equiv="Pragma"/><meta content="no-cache, no-store, must-revalidate" http-equiv="Cache-Control"/><meta content="0" http-equiv="Expires"/><meta content="IE=7.0000" http-equiv="X-UA-Compatible"/><meta content="width=device-width; initial-scale=1.0" name="viewport"/><meta content="MSHTML 11.00.9600.17344" name="GENERATOR"/>
<script src="../Scripts/jquery-1.11.3.min.js" type="text/javascript"></script>
<script src="../Scripts/jquery-ui-1.11.4/jquery-ui.min.js" type="text/javascript"></script>
<link href="../Scripts/jquery-ui-1.11.4/jquery-ui.min.css" rel="stylesheet" type="text/css"/>
<script defer="defer" src="../Scripts/amis_utils.js" type="text/javascript"></script>
<script defer="defer" src="../Scripts/amis_calendar.js?token=20210220152500" type="text/javascript"></script>
<link href="../css/style_01.css" rel="stylesheet" type="text/css"/><link href="../css/reset.css" rel="styles

### Raw data processing

In [9]:
# Take the text of the first table with border='1', drop every space,
# and split it into one token per line.
raw_data = soup.select("table[border='1']")[0].text.replace(" ", "").split("\n")
# Keep only the non-empty tokens.
data = [token for token in raw_data if token]
data

['日期',
 '市場',
 '產品',
 '上價',
 '中價',
 '下價',
 '平均價(元/公斤)',
 '跟前一交易日比較%',
 '交易量(公斤)',
 '跟前一交易日比較%',
 '小計',
 '\xa0',
 '\xa0',
 '\xa0',
 '17.2',
 '\xa0',
 '12,438,032',
 '\xa0',
 '110/06/04',
 '104台北二',
 'B2鳳梨金鑽鳳梨',
 '35.1',
 '19.7',
 '9.9',
 '20.8',
 '-15',
 '36,593',
 '+62',
 '110/06/05',
 '104台北二',
 'B2鳳梨金鑽鳳梨',
 '33.6',
 '18.6',
 '10.4',
 '20.0',
 '-4',
 '24,927',
 '-32',
 '110/06/06',
 '104台北二',
 'B2鳳梨金鑽鳳梨',
 '34.8',
 '18.6',
 '10.2',
 '20.2',
 '+1',
 '21,739',
 '-13',
 '110/06/08',
 '104台北二',
 'B2鳳梨金鑽鳳梨',
 '36.7',
 '21.3',
 '11.0',
 '22.3',
 '+10',
 '20,115',
 '-7',
 '110/06/09',
 '104台北二',
 'B2鳳梨金鑽鳳梨',
 '40.7',
 '23.8',
 '10.9',
 '24.6',
 '+10',
 '21,845',
 '+9',
 '110/06/10',
 '104台北二',
 'B2鳳梨金鑽鳳梨',
 '36.4',
 '24.4',
 '12.0',
 '24.3',
 '-1',
 '22,865',
 '+5',
 '110/06/11',
 '104台北二',
 'B2鳳梨金鑽鳳梨',
 '33.6',
 '23.8',
 '12.9',
 '23.6',
 '-3',
 '24,449',
 '+7',
 '110/06/12',
 '104台北二',
 'B2鳳梨金鑽鳳梨',
 '35.2',
 '22.1',
 '12.9',
 '22.9',
 '-3',
 '22,626',
 '-7',
 '110/06/13',
 '104台北二',
 'B2鳳梨

In [10]:
# Column names: the first 10 tokens of the flattened table are the header cells.
# Note that the label '跟前一交易日比較%' occurs twice among them, so a frame
# built from these names carries a duplicated column label.
cols = data[:10]
cols

['日期',
 '市場',
 '產品',
 '上價',
 '中價',
 '下價',
 '平均價(元/公斤)',
 '跟前一交易日比較%',
 '交易量(公斤)',
 '跟前一交易日比較%']

In [11]:
# Table data: skip the 10 header tokens plus the 8 tokens of the '小計'
# (subtotal) row that follows them, leaving only the per-day market rows.
table = data[18:]
table

['110/06/04',
 '104台北二',
 'B2鳳梨金鑽鳳梨',
 '35.1',
 '19.7',
 '9.9',
 '20.8',
 '-15',
 '36,593',
 '+62',
 '110/06/05',
 '104台北二',
 'B2鳳梨金鑽鳳梨',
 '33.6',
 '18.6',
 '10.4',
 '20.0',
 '-4',
 '24,927',
 '-32',
 '110/06/06',
 '104台北二',
 'B2鳳梨金鑽鳳梨',
 '34.8',
 '18.6',
 '10.2',
 '20.2',
 '+1',
 '21,739',
 '-13',
 '110/06/08',
 '104台北二',
 'B2鳳梨金鑽鳳梨',
 '36.7',
 '21.3',
 '11.0',
 '22.3',
 '+10',
 '20,115',
 '-7',
 '110/06/09',
 '104台北二',
 'B2鳳梨金鑽鳳梨',
 '40.7',
 '23.8',
 '10.9',
 '24.6',
 '+10',
 '21,845',
 '+9',
 '110/06/10',
 '104台北二',
 'B2鳳梨金鑽鳳梨',
 '36.4',
 '24.4',
 '12.0',
 '24.3',
 '-1',
 '22,865',
 '+5',
 '110/06/11',
 '104台北二',
 'B2鳳梨金鑽鳳梨',
 '33.6',
 '23.8',
 '12.9',
 '23.6',
 '-3',
 '24,449',
 '+7',
 '110/06/12',
 '104台北二',
 'B2鳳梨金鑽鳳梨',
 '35.2',
 '22.1',
 '12.9',
 '22.9',
 '-3',
 '22,626',
 '-7',
 '110/06/13',
 '104台北二',
 'B2鳳梨金鑽鳳梨',
 '32.8',
 '19.4',
 '10.5',
 '20.3',
 '-11',
 '22,967',
 '+2',
 '110/06/14',
 '104台北二',
 'B2鳳梨金鑽鳳梨',
 '35.3',
 '19.0',
 '10.6',
 '20.6',
 '+1',
 '17,488',
 '-24',
 '1

In [12]:
# Split the flat token list into one record per table row, so that every
# day/market entry becomes its own small list inside the big list.
ROW_WIDTH = 10  # number of fields in one table row (same as the number of header cells)

# A length that is not a multiple of the row width means tokens went missing
# while flattening the table; fail here instead of producing a short last row.
if len(table) % ROW_WIDTH != 0:
    raise ValueError(
        f"table has {len(table)} tokens, which is not a multiple of {ROW_WIDTH}"
    )

output = [table[start:start + ROW_WIDTH] for start in range(0, len(table), ROW_WIDTH)]
output

[['110/06/04',
  '104台北二',
  'B2鳳梨金鑽鳳梨',
  '35.1',
  '19.7',
  '9.9',
  '20.8',
  '-15',
  '36,593',
  '+62'],
 ['110/06/05',
  '104台北二',
  'B2鳳梨金鑽鳳梨',
  '33.6',
  '18.6',
  '10.4',
  '20.0',
  '-4',
  '24,927',
  '-32'],
 ['110/06/06',
  '104台北二',
  'B2鳳梨金鑽鳳梨',
  '34.8',
  '18.6',
  '10.2',
  '20.2',
  '+1',
  '21,739',
  '-13'],
 ['110/06/08',
  '104台北二',
  'B2鳳梨金鑽鳳梨',
  '36.7',
  '21.3',
  '11.0',
  '22.3',
  '+10',
  '20,115',
  '-7'],
 ['110/06/09',
  '104台北二',
  'B2鳳梨金鑽鳳梨',
  '40.7',
  '23.8',
  '10.9',
  '24.6',
  '+10',
  '21,845',
  '+9'],
 ['110/06/10',
  '104台北二',
  'B2鳳梨金鑽鳳梨',
  '36.4',
  '24.4',
  '12.0',
  '24.3',
  '-1',
  '22,865',
  '+5'],
 ['110/06/11',
  '104台北二',
  'B2鳳梨金鑽鳳梨',
  '33.6',
  '23.8',
  '12.9',
  '23.6',
  '-3',
  '24,449',
  '+7'],
 ['110/06/12',
  '104台北二',
  'B2鳳梨金鑽鳳梨',
  '35.2',
  '22.1',
  '12.9',
  '22.9',
  '-3',
  '22,626',
  '-7'],
 ['110/06/13',
  '104台北二',
  'B2鳳梨金鑽鳳梨',
  '32.8',
  '19.4',
  '10.5',
  '20.3',
  '-11',
  '22,967',
  '+2'],
 ['1

### 轉換成dataframe & 資料清洗

In [13]:
# Build the frame from all records in a single constructor call. Appending
# rows one by one with df.loc[i] reallocates the frame on every insert and
# gets slow as the number of rows grows.
df = pd.DataFrame(output, columns=cols)

df

Unnamed: 0,日期,市場,產品,上價,中價,下價,平均價(元/公斤),跟前一交易日比較%,交易量(公斤),跟前一交易日比較%.1
0,110/06/04,104台北二,B2鳳梨金鑽鳳梨,35.1,19.7,9.9,20.8,-15,36593,+62
1,110/06/05,104台北二,B2鳳梨金鑽鳳梨,33.6,18.6,10.4,20.0,-4,24927,-32
2,110/06/06,104台北二,B2鳳梨金鑽鳳梨,34.8,18.6,10.2,20.2,+1,21739,-13
3,110/06/08,104台北二,B2鳳梨金鑽鳳梨,36.7,21.3,11.0,22.3,+10,20115,-7
4,110/06/09,104台北二,B2鳳梨金鑽鳳梨,40.7,23.8,10.9,24.6,+10,21845,+9
...,...,...,...,...,...,...,...,...,...,...
776,110/08/15,930台東市,B2鳳梨金鑽鳳梨,25.0,15.2,13.0,15.4,-17,1208,-32
777,110/08/17,930台東市,B2鳳梨金鑽鳳梨,23.0,16.7,11.0,16.1,+5,3539,+193
778,110/08/18,930台東市,B2鳳梨金鑽鳳梨,25.0,17.3,12.0,18.5,+15,2105,-41
779,110/08/19,930台東市,B2鳳梨金鑽鳳梨,25.0,16.4,14.0,15.9,-14,3236,+54


In [14]:
# Strip the thousands separators and convert the whole volume column in one
# vectorized call. astype(str) keeps this cell re-runnable after the column
# has already been converted to numbers.
df["交易量(公斤)"] = pd.to_numeric(
    df["交易量(公斤)"].astype(str).str.replace(",", "", regex=False)
)

In [15]:
# Convert the four price columns from text to numbers; DataFrame.apply calls
# pd.to_numeric once per selected column.
target_cols = ["上價", "中價", "下價", "平均價(元/公斤)"]
df[target_cols] = df[target_cols].apply(pd.to_numeric)

In [16]:
# Display the frame again after the numeric conversions.
df

Unnamed: 0,日期,市場,產品,上價,中價,下價,平均價(元/公斤),跟前一交易日比較%,交易量(公斤),跟前一交易日比較%.1
0,110/06/04,104台北二,B2鳳梨金鑽鳳梨,35.1,19.7,9.9,20.8,-15,36593,+62
1,110/06/05,104台北二,B2鳳梨金鑽鳳梨,33.6,18.6,10.4,20.0,-4,24927,-32
2,110/06/06,104台北二,B2鳳梨金鑽鳳梨,34.8,18.6,10.2,20.2,+1,21739,-13
3,110/06/08,104台北二,B2鳳梨金鑽鳳梨,36.7,21.3,11.0,22.3,+10,20115,-7
4,110/06/09,104台北二,B2鳳梨金鑽鳳梨,40.7,23.8,10.9,24.6,+10,21845,+9
...,...,...,...,...,...,...,...,...,...,...
776,110/08/15,930台東市,B2鳳梨金鑽鳳梨,25.0,15.2,13.0,15.4,-17,1208,-32
777,110/08/17,930台東市,B2鳳梨金鑽鳳梨,23.0,16.7,11.0,16.1,+5,3539,+193
778,110/08/18,930台東市,B2鳳梨金鑽鳳梨,25.0,17.3,12.0,18.5,+15,2105,-41
779,110/08/19,930台東市,B2鳳梨金鑽鳳梨,25.0,16.4,14.0,15.9,-14,3236,+54


In [17]:
# Check the column dtypes: prices should now be float64 and the volume int64,
# while the two '跟前一交易日比較%' columns are still object (text).
df.dtypes

日期            object
市場            object
產品            object
上價           float64
中價           float64
下價           float64
平均價(元/公斤)    float64
跟前一交易日比較%     object
交易量(公斤)        int64
跟前一交易日比較%     object
dtype: object

In [18]:
# List the column labels; '跟前一交易日比較%' appears twice.
df.columns

Index(['日期', '市場', '產品', '上價', '中價', '下價', '平均價(元/公斤)', '跟前一交易日比較%', '交易量(公斤)',
       '跟前一交易日比較%'],
      dtype='object')

In [19]:
# Aggregation applied per market: mean of each price column, sum of the volume.
# The functions are given by name because recent pandas versions warn when
# NumPy callables such as np.mean / np.sum are passed to groupby().agg();
# the string names produce the same result on old and new versions.
fun_dicts = {
    '上價': 'mean',
    '中價': 'mean',
    '下價': 'mean',
    '平均價(元/公斤)': 'mean',
    '交易量(公斤)': 'sum'
}

In [21]:
# Aggregate per market ('市場') using the per-column functions in fun_dicts,
# then display the summary table.
df_agg = df.groupby("市場").agg(fun_dicts)
df_agg

Unnamed: 0_level_0,上價,中價,下價,平均價(元/公斤),交易量(公斤)
市場,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
104台北二,38.313846,22.907692,11.690769,23.749231,959177
109台北一,38.936923,23.196923,12.84,24.275385,1607176
220板橋區,15.006154,11.938462,9.289231,12.023077,458860
241三重區,25.276923,15.073846,11.590769,16.412308,2023690
260宜蘭市,39.6,25.538462,15.753846,25.538462,206250
338桃　農,14.090625,10.13125,6.914062,10.276562,97374
400台中市,22.478462,14.233846,7.963077,17.363077,4341813
420豐原區,22.324242,15.662121,9.637879,15.792424,595336
600嘉義市,20.070423,12.661972,8.267606,12.661972,324200
800高雄市,27.318182,20.171212,15.575758,20.681818,386498
