In [1]:
import urllib
import urllib.request
from bs4 import BeautifulSoup
import os
import tqdm
import pandas as pd
import re
import pykka
import time

In [2]:
# Base URL of the collections site; relative object links are joined onto it.
root_url = 'https://collections.mfa.org'
# Directory (relative to the working directory) where downloaded images are saved.
img_dir = 'dataset/true/images/'

In [3]:
# Index of the last browse page to crawl. Hard-coded snapshot value —
# TODO confirm against the live site before re-running the crawl.
LAST_PAGE_NUM = 36116
# Browse pages are numbered from 1; this is the full work list for the crawlers.
page_nums = list(range(1, LAST_PAGE_NUM + 1))

In [4]:
def download_paint_grids(grid_soup):
    """Fetch one object's detail page, download its image, and build a record.

    Parameters
    ----------
    grid_soup : bs4 element
        One ``grid-item-inner`` element from a browse page. It is expected to
        contain a ``primaryMedia`` element wrapping a link whose ``href``
        starts with ``/objects/<digits>``.

    Returns
    -------
    tuple or None
        ``(paint_url, object_id, details)``. ``object_id`` is ``None`` when the
        detail page exposes no downloadable image; ``details`` is the HTML of
        the ``item-details-inner`` element as a string, or ``None`` when that
        element is absent. ``None`` is returned instead of a tuple when the
        grid item has no usable object link or the detail page cannot be
        fetched, so callers can skip the item.
    """
    # Locate the link to the detail page. A malformed grid item is skipped
    # rather than raising, so one bad item does not abort the whole page.
    media = grid_soup.find(class_='primaryMedia')
    link = media.find('a') if media is not None else None
    href = link.get('href') if link is not None else None
    match = re.match(r'/objects/(\d+)', href) if href else None
    if match is None:
        print('no object link found in grid item')
        return None
    # group(0) keeps only the '/objects/<id>' prefix of the href.
    paint_url = urllib.parse.urljoin(root_url, match.group(0))

    try:
        with urllib.request.urlopen(paint_url) as response:
            paint_html = response.read()
    except Exception as e:
        print(e)
        return None
    # Name the parser explicitly, as the browse-page parsing does, so the
    # result does not depend on which optional parsers are installed.
    paint_soup = BeautifulSoup(paint_html, 'html.parser')

    # Image: the object ID is the second-to-last path segment of the image URL.
    object_id = None
    download_tag = paint_soup.find(class_="download-image")
    img_tag = download_tag.img if download_tag is not None else None
    img_src = img_tag.get('src') if img_tag is not None else None
    if img_src:
        # Drop anything after the first ';' in the src attribute.
        img_rel = img_src.split(';')[0]
        img_abs = urllib.parse.urljoin(paint_url, img_rel)
        object_id = img_abs.split('/')[-2]
        img_path = os.path.join(img_dir, '{}.jpg'.format(object_id))
        try:
            # The target directory may not exist yet on a fresh checkout.
            os.makedirs(img_dir, exist_ok=True)
            urllib.request.urlretrieve(img_abs, img_path)
        except Exception as e:
            # Best effort: the record is still returned if the download fails.
            print(e)

    # Details: find() returns None when the element is missing; str(None)
    # would silently store the text 'None', so test for it explicitly.
    details_tag = paint_soup.find(class_='item-details-inner')
    details = str(details_tag) if details_tag is not None else None
    return paint_url, object_id, details

In [5]:
def download_browse_page(page_num):
    """Crawl one browse page and return the records of every object on it.

    Each ``grid-item-inner`` element of the page is handed to
    ``download_paint_grids``; items for which that call returns ``None``
    are left out of the result.
    """
    listing_url = 'https://collections.mfa.org/objects/images?page={}'.format(page_num)
    with urllib.request.urlopen(listing_url) as resp:
        listing_html = resp.read()
    listing_soup = BeautifulSoup(listing_html, 'html.parser')
    cells = listing_soup.find_all(class_='grid-item-inner')
    # Process the cells in page order, keeping only the successful ones.
    return [rec for rec in map(download_paint_grids, cells) if rec is not None]

# Two-stage crawl

## Stage 1: collect object IDs from the browse pages

In [6]:
def get_object_id(grid_soup):
    """Return the numeric object ID (as a string) linked from a grid item.

    The ID is the digit run captured from the ``/objects/<digits>`` prefix of
    the href on the link inside the item's ``primaryMedia`` element.
    """
    anchor = grid_soup.find(class_='primaryMedia').find('a')
    id_match = re.match(r'/objects/(\d+)', anchor['href'])
    return id_match.group(1)
def get_page_object_ids(page_num):
    """Return the object IDs of all grid items on one browse page, in page order."""
    listing_url = 'https://collections.mfa.org/objects/images?page={}'.format(page_num)
    with urllib.request.urlopen(listing_url) as resp:
        listing_html = resp.read()
    listing_soup = BeautifulSoup(listing_html, 'html.parser')
    cells = listing_soup.find_all(class_='grid-item-inner')
    return list(map(get_object_id, cells))

In [7]:
# Visit every browse page in order (tqdm shows progress) and flatten the
# per-page ID lists into a single list.
object_ids = [
    oid
    for page in tqdm.tqdm(page_nums)
    for oid in get_page_object_ids(page)
]

  0%|                                                                 | 0/36116 [00:00<?, ?it/s]

URLError: <urlopen error [WinError 10061] No connection could be made because the target machine actively refused it>

# Parallel crawl with Pykka actors

In [7]:
class PageKeeper(pykka.ThreadingActor):
    """Actor that hands out page numbers from ``page_nums`` one per message."""
    def __init__(self):
        super().__init__()
        # Lazy iterator over the work list, shared by all requests to this actor.
        self.pages = self.get_pages()
    def get_pages(self):
        """Yield each page number, wrapped in tqdm so progress is displayed."""
        yield from tqdm.tqdm(page_nums)
    def on_receive(self, message):
        """Reply with the next page number, or ``None`` once all are handed out.

        The content of ``message`` is ignored.
        """
        return next(self.pages, None)

In [8]:
# Unique sentinel message: asks ResultKeeper to reply with what it has collected.
GetRecord = object()
class ResultKeeper(pykka.ThreadingActor):
    """Actor that accumulates crawl records sent by the crawlers."""
    def __init__(self):
        super().__init__()
        self.records = []
    def on_receive(self, records):
        """Store a batch of records, or reply with all stored records.

        Any message other than the ``GetRecord`` sentinel is treated as an
        iterable of records and appended; the reply is then ``None``.
        """
        if records is not GetRecord:
            self.records.extend(records)
            return None
        return self.records

In [9]:
# Start the two shared actors; PageCrawler refers to these module-level names.
page_keeper = PageKeeper.start()
result_keeper = ResultKeeper.start()

In [10]:
class PageCrawler(pykka.ThreadingActor):
    """Worker actor that crawls browse pages until no pages are left.

    Uses the module-level ``page_keeper`` and ``result_keeper`` actor
    references and the ``download_browse_page`` function.
    """

    def on_receive(self, _):
        """Run the crawl loop; the content of the triggering message is ignored.

        Repeatedly asks ``page_keeper`` for a page number, downloads that page,
        and forwards its records to ``result_keeper``. Returns once
        ``page_keeper`` replies ``None``.
        """
        while True:
            page_num = page_keeper.ask(None)
            if page_num is None:
                return
            try:
                records = download_browse_page(page_num)
            except Exception as e:
                # An exception escaping here would end this crawler's loop, so
                # one unreachable page would take the worker out of the pool.
                # Report it and move on to the next page instead.
                print('page {}: {}'.format(page_num, e))
                continue
            result_keeper.ask(records)

In [None]:
# Number of crawler actors to run side by side.
pc_num = 5
pcs = [PageCrawler.start() for _ in range(pc_num)]
# Any message triggers PageCrawler.on_receive, which ignores its content
# and loops until the page keeper runs out of pages.
for pc in pcs:
    pc.tell('start crawl')
# NOTE(review): stop() is presumably queued behind the crawl message and blocks
# until each crawler has finished, acting as a join — confirm against Pykka docs.
for pc in pcs:
    pc.stop()

  0%|▎                                                   | 177/36116 [18:16<69:08:54,  6.93s/it]

In [None]:
# Sending the GetRecord sentinel makes ResultKeeper reply with its accumulated list.
results = result_keeper.ask(GetRecord)
len(results)

# Assemble the results with pandas

In [None]:
import pandas as pd

In [None]:
# Split the (paint_url, object_id, details) records into three parallel tuples.
# zip(*[]) yields nothing to unpack, so an empty crawl result needs an explicit
# fallback instead of raising ValueError.
# NOTE: this rebinds `object_ids`, which the stage-1 cell also assigns.
paint_urls, object_ids, detailss = zip(*results) if results else ((), (), ())

In [None]:
# One row per crawled object; columns appear in the order given here.
df = pd.DataFrame(dict(
    paint_url=paint_urls,
    object_id=object_ids,
    details=detailss,
))
df.head()

In [None]:
# Persist the records table next to the images directory for later processing.
df.to_pickle('dataset/true/records.pkl')