# Photo Scraping of Best Educational Organization Website

Teng-Jui Lin

Created: 2022-09-05

Edited: 2022-09-05

Objective: Scrape photos from the news articles on the [Best Educational Organization website](https://www.bestsch.cn/best/public/news), including articles from Best International Primary School and Kinglee High School.

Assumption: The project root directory contains an `output` folder, which in turn contains a `Primary School` folder and a `KHS` folder.

Note that the website is slow to open, so the code takes a long time to run. BeautifulSoup is not used because the images are loaded by JavaScript; instead, each page is rendered with Selenium in a headless Chrome browser.

In [1]:
import os
import urllib
import urllib.parse
import urllib.request

import requests
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
def get_imgs(base, conditions):
    """Download every photo of one news article into its own output folder.

    The article page is opened in a headless Chrome browser, its title and
    date line are read, and each element with class ``News_Img`` is saved as
    ``<output folder>/<pubid> <time> <title> <index>.jpg``.

    Parameters
    ----------
    base : str
        Article base URL; the URL-encoded ``conditions`` are appended to it.
    conditions : dict
        Query parameters of the article page. Must contain ``'pubid'``
        (publication number) and ``'sid'`` (school id: 7 = Best primary
        school, 14 = Kinglee high school).

    Returns
    -------
    None
        Nothing is saved when the page has no ``h4`` title or no images.

    Raises
    ------
    ValueError
        If ``conditions['sid']`` is neither 7 nor 14.
    """
    # get full url
    URL = base + urllib.parse.urlencode(conditions)

    # publication number
    pubid = conditions['pubid']
    # school id: 7=best primary, 14=kinglee high
    sid = conditions['sid']

    # create headless chrome browser
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options
    )
    try:
        # open url
        driver.get(URL)

        # get publication title
        # if article does not exist, it will be sent to the main page that does not have h4
        try:
            title = driver.find_element(By.TAG_NAME, 'h4').text.replace('/', '|')
        except NoSuchElementException:
            return

        # get time and output path
        # NOTE(review): the second matching div is assumed to hold
        # "<school name> / <date time>" -- confirm against the live page.
        timestamp = driver.find_elements(By.XPATH, 'html/body/div/div/section/div/div/div/div')[1].text
        # str.strip removes any of the listed characters from both ends of the
        # text (it is a character set, not a prefix); ':' is then replaced
        # because the value becomes part of a folder and file name.
        if sid == 7:
            timestamp = timestamp.strip('贝斯特外语小学 /').replace(':', '-')
            output_path = f'output/Primary School/{pubid} {timestamp} {title}'
        elif sid == 14:
            timestamp = timestamp.strip('勤礼外语中学 / ').replace(':', '-')
            output_path = f'output/KHS/{pubid} {timestamp} {title}'
        else:
            raise ValueError

        # collect the image urls while the page is still open
        img_urls = [
            img_element.get_property('src')
            for img_element in driver.find_elements(By.CLASS_NAME, 'News_Img')
        ]
    finally:
        # always end the browser session, even on an early return or an error,
        # so that repeated calls do not leave browsers behind
        driver.quit()

    # articles without images get no folder
    if not img_urls:
        return

    # make output path
    os.makedirs(output_path, exist_ok=True)

    # save images
    for i, img_url in enumerate(img_urls):
        urllib.request.urlretrieve(img_url, f'{output_path}/{pubid} {timestamp} {title} {i}.jpg')

In [3]:
def get_pubids(base, conditions):
    """Return the publication ids listed on one page of the article list.

    The list page is opened in a headless Chrome browser and the ``auto-id``
    attribute of every table row matched by the XPath below is collected.

    Parameters
    ----------
    base : str
        Article-list base URL; the URL-encoded ``conditions`` are appended
        to it.
    conditions : dict
        Query parameters of the list page.

    Returns
    -------
    list
        One ``auto-id`` attribute value per matched row, in page order.
        NOTE(review): a row without that attribute presumably yields ``None``
        -- confirm against the Selenium ``get_attribute`` documentation.
    """
    # get full url
    URL = base + urllib.parse.urlencode(conditions)

    # create headless chrome browser
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options
    )
    try:
        # open url
        driver.get(URL)

        # get pubids
        article_elements = driver.find_elements(By.XPATH, 'html/body/div/section/div/div/div/table/tbody/tr')
        return [
            article_element.get_attribute('auto-id')
            for article_element in article_elements
        ]
    finally:
        # always end the browser session, even if loading or parsing fails
        driver.quit()

In [4]:
# Base URLs; each ends with '?' so URL-encoded query parameters can be appended directly.
ARTICLE_LIST_BASE_URL = 'https://www.bestsch.cn/best/public/news?'
ARTICLE_BASE_URL = 'https://www.bestsch.cn/best/public/news/single?'
# Original misspelled name, kept as an alias so cells that already use it keep working.
ARITCLE_BASE_URL = ARTICLE_BASE_URL

# School ids: passed as `dpid` for the article list and as `sid` for a single article.
PRIMARY_SCHOOL_SID = 7  # Best primary school
HIGH_SCHOOL_SID = 14  # Kinglee high school

In [None]:
# Crawl the primary-school article list from its last page (171) back to page 1
# and download the photos of every article listed on each page.
for article_list_page in reversed(range(1, 172)):
    article_list_conditions = {
        'page': article_list_page,
        'dpid': PRIMARY_SCHOOL_SID,
        'kw': '',
    }
    article_pubids = get_pubids(ARTICLE_LIST_BASE_URL, article_list_conditions)
    for article_pubid in article_pubids:
        article_conditions = {
            'pubid': article_pubid,
            'pubtp': 'news',
            'mid': 1,
            'sid': PRIMARY_SCHOOL_SID,
        }
        get_imgs(ARITCLE_BASE_URL, article_conditions)

In [None]:
# Crawl the high-school article list from its last page (64) back to page 1
# and download the photos of every article listed on each page.
for article_list_page in reversed(range(1, 65)):
    article_list_conditions = {
        'page': article_list_page,
        'dpid': HIGH_SCHOOL_SID,
        'kw': '',
    }
    article_pubids = get_pubids(ARTICLE_LIST_BASE_URL, article_list_conditions)
    for article_pubid in article_pubids:
        article_conditions = {
            'pubid': article_pubid,
            'pubtp': 'news',
            'mid': 1,
            'sid': HIGH_SCHOOL_SID,
        }
        get_imgs(ARITCLE_BASE_URL, article_conditions)