# フロアマップ画像スクレイピングプログラム
- 設定ファイルはlib/scraping_configurations.json  
- images/にフロアマップ画像を保存  
- 今のところ伊勢丹, 三越, 岩田屋, 丸井今井の各店舗のフロアマップ画像を取得できるようにしてあります  
- 画像ファイル名は, `<テナントID>-<建物(館)名>-<階情報>.<拡張子>`  
    例. `1-北館-5階.jpeg`

### Usage 
- Shift+Enterで選択中のセル内のコードを実行 ([詳しくはこちら](https://jupyter-notebook.readthedocs.io/en/stable/notebook.html))

In [1]:
import os
import re
import json
import codecs
import urllib3
import requests
import numpy as np
import urllib.parse
from PIL import Image
from tqdm import tqdm
from io import BytesIO
from bs4 import BeautifulSoup
# Turn off urllib3's warnings for the whole session. Presumably this hides the
# warning raised for each request made with verify=False further down in this
# notebook -- confirm that silencing it is intended.
urllib3.disable_warnings()

In [2]:
# Directory that receives the downloaded floor-map images.
image_save_dir = './images'
# exist_ok=True makes re-running the cell a no-op and avoids the race between
# checking for the directory and creating it.
os.makedirs(image_save_dir, exist_ok=True)

# Scraping settings, looked up by department-store key in the scraping cell.
# Read inside a `with` block so the file handle is closed as soon as the JSON
# has been parsed instead of staying open for the life of the kernel.
with open('./scraping_configurations.json', 'r', encoding='utf-8') as config_file:
    scraping_configurations = json.load(config_file)

In [3]:
# Download the floor-map images of one department-store chain into image_save_dir.
# Flow: floor-guide page of each building -> every linked map page -> the map
# image found on that page, saved as <tenant id>-<building name>-<floor>.<ext>.
depart_name = "isetan" # or mitsukoshi or iwataya or maruiimai
scraping_configuration = scraping_configurations[depart_name]

# These settings belong to the chain, not to a building, so read them once
# up front instead of on every pass through the building loop.
floor_guide_url_template = scraping_configuration["floor_guide_url"]
map_page_selector = scraping_configuration["map_page_selector"]
map_image_selector = scraping_configuration["map_image_selector"]
floor_selector = scraping_configuration["floor_selector"]
tenants = scraping_configuration["tenants"]

for tenant in tqdm(tenants):
    tenant_id = tenant["id"]
    tenant_name = tenant["name"]
    buildings = tenant["buildings"]
    for building in buildings:
        building_suffix = building["suffix"]
        building_name = building["name"]
        floor_guide_url = floor_guide_url_template.format(tenant_name=tenant_name, building_suffix=building_suffix)

        # The HTML pages are fetched with certificate verification disabled.
        floor_guide = requests.get(floor_guide_url, verify=False)
        floor_guide_soup = BeautifulSoup(floor_guide.content, 'lxml')
        map_page_elements = floor_guide_soup.select(map_page_selector)
        for map_page_element in map_page_elements:
            # Resolve the link against the page it came from so relative hrefs work.
            map_page_url = urllib.parse.urljoin(floor_guide_url, map_page_element['href'])
            map_page = requests.get(map_page_url, verify=False)
            map_page_soup = BeautifulSoup(map_page.content, 'lxml')

            # select_one returns None when nothing matches; fail with the page
            # URL and selector rather than an opaque AttributeError.
            floor_element = map_page_soup.select_one(floor_selector)
            if floor_element is None:
                raise Exception("No element matches floor_selector '{}' on {}".format(floor_selector, map_page_url))
            # Strip '/' (it would act as a path separator in the file name) and
            # the building name, which is already a separate file-name part.
            floor_text = floor_element.text.replace('/','').replace(building_name, '')

            image_element = map_page_soup.select_one(map_image_selector)
            if image_element is None:
                raise Exception("No element matches map_image_selector '{}' on {}".format(map_image_selector, map_page_url))
            image_uri = urllib.parse.urljoin(map_page_url, image_element['src'])

            # NOTE(review): unlike the page requests above, this request keeps
            # certificate verification enabled -- confirm that is intended.
            response = requests.get(image_uri, stream=True)
            response_status_code = response.status_code
            if response_status_code != 200:
                # status_code is an int, so it has to be converted before it
                # can be appended to the message.
                raise Exception("HTTP status: " + str(response_status_code))

            # A missing header is reported through the same Content-Type error
            # as a non-image one instead of surfacing as a KeyError.
            response_content_type = response.headers.get("content-type", "")
            # Drop parameters such as "; charset=binary" so that only the bare
            # "type/subtype" decides the check and the file extension.
            media_type = response_content_type.split(';')[0].strip().lower()
            if not media_type.startswith('image/'):
                raise Exception("Content-Type: " + response_content_type)
            image_ext = media_type.split('/')[1]

            image = Image.open(BytesIO(response.content))
            # format() instead of '+' so a numeric tenant id in the JSON config
            # does not break the file-name construction.
            image_file_name = '{}-{}-{}.{}'.format(tenant_id, building_name, floor_text, image_ext)
            image_save_path = os.path.join(image_save_dir, image_file_name)
            image.save(image_save_path)

100%|██████████| 6/6 [01:20<00:00, 13.49s/it]
