In [150]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from time import sleep
from tqdm import tqdm
import pandas as pd
import re


# Thai month name -> zero-padded month number.
# July is listed under two spellings ('กรกฎาคม' and the variant 'กรกฏาคม').
_THAI_MONTH_NUMBERS = {
    'มกราคม': '01',
    'กุมภาพันธ์': '02',
    'มีนาคม': '03',
    'เมษายน': '04',
    'พฤษภาคม': '05',
    'มิถุนายน': '06',
    'กรกฎาคม': '07',
    'กรกฏาคม': '07',
    'สิงหาคม': '08',
    'กันยายน': '09',
    'ตุลาคม': '10',
    'พฤศจิกายน': '11',
    'ธันวาคม': '12',
}


def convert_date_thai(date): # "22 กันยายน  2565" -> 2022-09-22
    """Convert a Thai "day month-name year" date string to ``YYYY-MM-DD``.

    The year is shifted by -543 (Buddhist era to Common era) and the day is
    zero-padded to two digits.

    Parameters
    ----------
    date : str
        Three whitespace-separated tokens: day, Thai month name, year.
        Any run of whitespace (spaces, tabs, non-breaking spaces) separates
        the tokens, and leading/trailing whitespace is ignored.

    Returns
    -------
    str
        The date formatted as ``'YYYY-MM-DD'``.

    Raises
    ------
    ValueError
        If ``date`` does not contain exactly three tokens, or the year token
        is not an integer.
    KeyError
        If the month token is not a known Thai month name.
    """
    # str.split() with no argument splits on any Unicode whitespace and drops
    # empty leading/trailing fields, so stray padding cannot break the unpack.
    day, month, year = date.split()
    year = int(year) - 543
    return f'{year}-{_THAI_MONTH_NUMBERS[month]}-{day.zfill(2)}'


In [122]:
## address of the page to scrape
url = 'http://www.raot.co.th/rubber2012/menu5.php'

## start a Chrome session and load that page
driver = webdriver.Chrome()
driver.get(url)

In [154]:
## column labels: the date followed by six value columns
## (two of the labels occur twice, so the frame has duplicate column names)
header = [
    'date',
    'ยางแผ่นดิบ',
    'น้ำยางสด',
    'ยางแผ่นดิบ',
    'ยางแผ่นรมควัน ชั้น3',
    'น้ำยางสด',
    'FOB',
]

## empty frame with those columns; the scraping loop appends one row at a time
df = pd.DataFrame(columns=header)

In [168]:
def _parse_price(cell):
    """Return the value held in one table cell.

    The cell text is stripped of surrounding whitespace and of any leading or
    trailing '?' / 'q' characters. Blank and '-' placeholders are returned
    unchanged as strings; any other text is converted with ``float`` (which
    raises ``ValueError`` if the text is not numeric).
    """
    value = cell.text.strip().strip('?q')
    if value not in ['', '-']:
        value = float(value)
    return value


## target table has no id, so use CSS selector instead, but structures are not same, e.g.
## #ewt_main_structure > tbody > tr:nth-child(2) > td > div > center > table:nth-child(23) -- 2553
## #ewt_main_structure > tbody > tr:nth-child(2) > td > div > center > table:nth-child(25) -- 2544
## -> select every table under that <center> and take the last one
css_selector = "#ewt_main_structure > tbody > tr:nth-child(2) > td > div > center > table"

## years are passed to the page as-is (convert_date_thai later subtracts 543)
for year in tqdm(range(2565, 2566)):
    ## range(1, 11) covers months 1..10 only
    for month in range(1, 11):
        ## set month and year in the left selectbox
        ## target selectboxes have id "select" and "year"; set them via JavaScript
        driver.execute_script(f"document.getElementById('select').value={month};")
        driver.execute_script(f"document.getElementById('year').value={year};")

        ## click the GO button, then give the page a fixed 3 s to load
        driver.find_element(By.NAME, 'Submit').click()
        sleep(3)

        ## convert to BeautifulSoup
        ## this page DOES NOT use utf-8 encoding
        ## NOTE(review): no parser is named, so BeautifulSoup picks whichever is installed
        soup = BeautifulSoup(driver.page_source.encode('TIS-620'))
        table = soup.select(css_selector)[-1].find('tbody')

        ## <tr> of target table MAY contain inner tables, especially later years' structure,
        ## so take only direct children (recursive=False), not grandchildren
        table_rows = table.find_all('tr', recursive=False)

        ## first two <tr> are header, final <tr> is the mean -> skip all three
        for row in table_rows[2:-1]:
            ## first column = date
            date = row.find('td').text.strip()
            data_in_row = [convert_date_thai(date)]

            inner_tables = row.find_all('table')
            if inner_tables:
                ## later years: each value sits in its own inner table
                cells = [inner_table.find('td') for inner_table in inner_tables]
            else:
                ## plain <tr><td> layout: every <td> after the date cell is a value
                cells = row.find_all('td')[1:]

            data_in_row.extend(_parse_price(cell) for cell in cells)
            df.loc[len(df)] = data_in_row

        ## browser back
        driver.back()

100%|██████████| 1/1 [00:38<00:00, 38.20s/it]


In [169]:
## show the frame filled by the scraping loop
## NOTE(review): the saved output below starts in 2001, while the loop cell as saved
## only requests year 2565 -- presumably the loop was re-run with other year ranges
## against the same `df`; confirm before relying on a fresh Restart & Run All
df


Unnamed: 0,date,ยางแผ่นดิบ,น้ำยางสด,ยางแผ่นดิบ.1,ยางแผ่นรมควัน ชั้น3,น้ำยางสด.1,FOB
0,2001-01-03,22.3,20.0,23.55,24.26,,26.7
1,2001-01-04,22.3,19.5,23.25,24.3,,26.25
2,2001-01-05,22.3,19.25,23.26,24.16,,26.25
3,2001-01-08,22.3,19.25,23.45,24.2,,25.85
4,2001-01-09,22.2,19.25,23.26,24.26,,25.9
...,...,...,...,...,...,...,...
5255,2022-10-25,46.35,47.1,-,50.88,-,60.4
5256,2022-10-26,46.25,47.3,-,50.1,50.1,60.0
5257,2022-10-27,45.8,47.3,0.0,49.67,,59.85
5258,2022-10-28,45.6,47.3,0.0,49.37,,59.6


In [170]:
## write the collected rows to CSV (utf8, without the index column)
df.to_csv(
    '../data/rubber_all.csv',
    encoding='utf8',
    index=False,
)