In [1]:
from scrape_school_url import *
import pandas as pd
from bs4 import BeautifulSoup
import re
from os import path, makedirs

In [2]:
# Index object used below to enumerate the known school ids.
sdi = SchoolDataIndex()

In [3]:
# Materialise the ids from the index into a list so they can be picked by position.
school_ids = list(sdi.school_ids())

In [13]:
# Topic name -> last path segment of that topic's scraping URL
# (everything up to and including the final '/' is stripped).
interested_urls = dict(
  (topic, re.sub('^.*/', '', full_url))
  for topic, full_url in SCRAPING_URLS.items()
)

def pref_param_dict(soup, url_map=None):
  """Collect the query string of each topic link found in a parsed page.

  Args:
    soup: Parsed document; every element returned by ``soup.find_all('a')``
      is inspected.
    url_map: Optional mapping of topic name -> URL fragment to look for
      inside each ``href``. Defaults to the module-level ``interested_urls``.

  Returns:
    dict mapping topic name -> the part of the matching ``href`` after its
    last ``?`` (the whole ``href`` when it contains no ``?``). When several
    anchors match the same topic, the last one wins.
  """
  if url_map is None:
    url_map = interested_urls
  param_dict = {}
  for anchor in soup.find_all('a'):
    # Anchors without an href attribute cannot link to a topic page; indexing
    # attrs['href'] directly would raise KeyError on them.
    href = anchor.attrs.get('href')
    if href is None:
      continue
    # First topic (in mapping order) whose URL fragment occurs in the href.
    topic = next((kurl for kurl, iurl in url_map.items() if iurl in href), None)
    if topic is not None:
      param_dict[topic] = re.sub(r'^.*\?', '', href)
  return param_dict

In [22]:
# Load the 'general' entry for one sample school and parse it into `soup`.
# NOTE(review): 490 is a hard-coded sample position in school_ids — presumably picked ad hoc; confirm.
sd = load_school_data(school_ids[490], only=['general'])
soup = load_soup(sd['general'])
# Load again for a fixed school id, passing the per-topic query strings
# extracted from the anchors of the parsed page.
# NOTE(review): the id '1020080005' is hard-coded rather than taken from
# school_ids[490] — confirm both refer to the same school.
sd = load_school_data('1020080005', pref_param_dict(soup))
sd

{'general': 'html/2022-09/general/1020080005.html',
 'student': 'html/2022-09/student/1020080005.html',
 'staff': 'html/2022-09/staff/1020080005.html',
 'computer_internet': 'html/2022-09/computer_internet/1020080005.html',
 'building': 'html/2022-09/building/1020080005.html',
 'durable_goods': 'html/2022-09/durable_goods/1020080005.html'}

In [7]:
# Matches any run of colons and/or whitespace at the end of a string;
# _clean_text substitutes it with '' to trim table labels.
trim_re = r'[\:\s]*$'

In [8]:
# Collects one dict per parsed table row; reset here so re-running the cell starts clean.
about_school = []
def _clean_text(string: str) -> str:
  string = string.strip()
  string = re.sub(trim_re, '', string)
  string = re.sub('\s+',' ', string)
  return string

# Locate the details table: the <table width="521"> nested inside the first
# <table> of the page. Fail with a clear message if the layout is different.
outer_table = soup.find('table')
table = None
if outer_table is not None:
  table = outer_table.find('table', attrs={'width': '521'})
if table is None:
  raise ValueError("could not find the nested <table width='521'> with the school details")

for table_row in table.find_all('tr'):
  cells = table_row.find_all('td')

  if not cells:
    # A row without any <td> has nothing to record; indexing cells[0]
    # below would raise IndexError.
    continue

  if len(cells) == 2:
    # Two cells are treated as a (label, value) pair.
    key_name = _clean_text(cells[0].text)
    value = _clean_text(cells[1].text)

    # When the value cell contains links, keep them as text/href records
    # instead of the flattened text. Anchors with empty text are dropped.
    links = cells[1].find_all('a')
    if links:
      value = [{
        'text': a_tag.text,
        'href': a_tag.get('href')}  # None when the anchor has no href
        for a_tag in links if a_tag.text]

    about_school.append({
        'key': key_name,
        'value': value
      })
  else:
    # Any other cell count: record only the text of the first cell.
    about_school.append({'value': _clean_text(cells[0].text)})
about_school

[{'key': 'รหัสโรงเรียน 10 หลัก', 'value': '1020080005'},
 {'key': 'รหัส Smis 8 หลัก', 'value': '20010029'},
 {'key': 'รหัส Obec 6 หลัก', 'value': '080005'},
 {'key': 'ชื่อสถานศึกษา(ไทย)', 'value': 'วัดศรีพโลทัย'},
 {'key': 'ชื่อสถานศึกษา(อังกฤษ)', 'value': 'WATSRIPALOTHAI'},
 {'key': 'ที่อยู่', 'value': 'หมู่ที่ 1 บ้านหนองไม้แดง'},
 {'key': 'ตำบล', 'value': 'หนองไม้แดง'},
 {'key': 'อำเภอ', 'value': 'เมืองชลบุรี'},
 {'key': 'จังหวัด', 'value': 'ชลบุรี'},
 {'key': 'รหัสไปรษณีย์', 'value': '20000'},
 {'key': 'โทรศัพท์', 'value': '038146124'},
 {'key': 'โทรสาร', 'value': '038146124'},
 {'key': 'ระดับที่เปิดสอน', 'value': 'อนุบาล-ประถมศึกษา'},
 {'key': 'วัน-เดือน-ปี ก่อตั้ง', 'value': '16 พฤษภาคม 2503'},
 {'key': 'อีเมล์', 'value': 'splt.school@gmail.com'},
 {'key': 'เว็บไซต์',
  'value': [{'text': 'เว็บไซต์โรงเรียน (สารสนเทศ)',
    'href': '../web/?School_ID=1020080005'}]},
 {'key': 'เครือข่ายพัฒนาคุณภาพการศึกษา', 'value': ''},
 {'key': 'องค์กรปกครองส่วนท้องถิ่น', 'value': 'หนองไม้แดง'},
 

In [9]:
# The first node of the parsed document is expected to be a string containing
# "url: <address>" followed by a newline.
# NOTE(review): presumably a comment written when the page was saved — confirm.
# next() avoids relying on a leaked loop variable, which would be undefined
# (or stale from an earlier run) when the document has no children.
comment = next(iter(soup.children), None)
url_match = re.search('url: (.*)\n', comment) if comment is not None else None
if url_match is None:
  raise ValueError("no 'url: ...' line found in the first node of the document")
url = url_match.group(1)
# Everything up to and including the last '/' of the page URL; relative
# image paths are appended to this.
parent_url = re.sub('[^/]*$', '', url)

In [10]:
# Matches either everything up to and including the last '/' or a '?' with
# everything after it; substituting '' leaves only the file name of an image
# URL, as the expression below demonstrates.
image_file_image_re = r'(.*/|\?.*)'
re.sub(image_file_image_re,'','https://data.bopp-obec.info/emis/pic_school/1020080005.jpg?ivonb=42940')

'1020080005.jpg'

In [21]:
# Same call as the second load in the sample-school cell earlier in the notebook;
# shown here only for its displayed result.
# NOTE(review): looks like a leftover duplicate — confirm whether this cell is still needed.
load_school_data('1020080005', pref_param_dict(soup))

{'general': 'html/2022-09/general/1020080005.html',
 'student': 'html/2022-09/student/1020080005.html',
 'staff': 'html/2022-09/staff/1020080005.html',
 'computer_internet': 'html/2022-09/computer_internet/1020080005.html',
 'building': 'html/2022-09/building/1020080005.html',
 'durable_goods': 'html/2022-09/durable_goods/1020080005.html'}

In [15]:
# test_image_file_name = ['1020080005_0.jpg',
# '1020080005_30.jpg',
# '1020080005_390.jpg',
# '1020080005_39.jpg',
# '1020080005_(10).jpg',
# '1020080005_(10).jpg',
# '1020080005_10.jpg',
# ]

# image_file_extention_re = r'\.[^\.]*$'
# [re.sub(image_file_extention_re, '', imname) for imname in test_image_file_name]

In [16]:
# Sanity check for the query-string extraction used in pref_param_dict:
# everything up to and including the last '?' is removed.
url_test = 'https://data.bopp-obec.info/emis/schooldata-view_student.php?School_ID=1041680824&Area_CODE2=410001'
# Raw string: '\?' in a plain literal is an invalid escape sequence.
re.sub(r'^.*\?', '', url_test)

'School_ID=1041680824&Area_CODE2=410001'

In [17]:
def download_image(image_url: str, timeout: float = 30.0):
  """Download an image into the current directory unless it already exists.

  The local file name is derived from the URL by removing everything that
  ``image_file_image_re`` matches (the path prefix and the query string).

  Args:
    image_url: Address of the image to fetch.
    timeout: Seconds to wait for the server before giving up.

  Raises:
    requests.HTTPError: If the server answers with an error status, so an
      error page is never stored under the image's name.
  """
  image_file_name = re.sub(image_file_image_re, '', image_url)
  image_file_path = image_file_name
  # Check before downloading: an existing file needs no network request.
  if is_path_existed(image_file_path):
    return
  response = requests.get(image_url, timeout=timeout)
  response.raise_for_status()
  with open(image_file_path, 'wb') as im_file:
    im_file.write(response.content)

In [18]:
# Download the principal's photo and the school logo: look at leaf <div>s
# (those containing no nested <div>) whose text carries one of the two labels.
for div in soup.find_all('div'):
  if div.find('div'):
    continue
  # NOTE(review): dir_name is assigned but never used afterwards —
  # download_image only receives the URL. Presumably meant as a target
  # sub-directory; confirm.
  if 'ผู้อำนวยการโรงเรียน' in div.text:
    dir_name = 'principal'
  elif 'ตราสัญลักษณ์' in div.text:
    dir_name = 'logo'
  else:
    continue
  # A labelled div may have no <img> (or an <img> without src); skip it
  # instead of failing with AttributeError/KeyError.
  img = div.find('img')
  image_src = img.get('src') if img is not None else None
  if not image_src:
    continue
  download_image(parent_url+image_src)