## Scrape search results

In [None]:
!{'python3 -m pip install -U dataflows'}

In [None]:
import requests, time, json
from dataflows import Flow, checkpoint, printer

# Search endpoint for one page of results. The query string fixes type=items,
# Culture=he-IL and a URL-encoded HebArtDomainName refiner; only {page} varies.
PAGE_URL_TEMPLATE = 'http://www.museumsinisrael.gov.il/_Layouts/15/Tmit.SP2013.Mozionim.UI/GetSearch.ashx?type=items&Culture=he-IL&refiners=HebArtDomainName%3A%22%D7%90%D7%9E%D7%A0%D7%95%D7%AA%22&page={page}'
# Item card page; {link} is filled with the 'Link' value of a search result item.
ITEM_URL_TEMPLATE = 'http://www.museumsinisrael.gov.il/he/items/Pages/ItemCard.aspx?IdItem={link}'

# One shared HTTP session, used by get_page_items for every search request.
session = requests.Session()

def get_page_items(page, retry_num=0):
    """Fetch one page of search results and return the decoded JSON body.

    Any failure while requesting or decoding the page (connection error,
    timeout, non-JSON body, ...) is followed by a 15 second pause and another
    attempt. An attempt is made for every ``retry_num`` value up to and
    including 10.

    Args:
        page: Page number substituted into ``PAGE_URL_TEMPLATE``.
        retry_num: Number of retries already used. Callers normally leave
            this at its default of 0.

    Returns:
        The parsed JSON body of the response.

    Raises:
        Exception: When ``retry_num`` exceeds 10, either on entry or after
            the last failed attempt. The last underlying error, if any, is
            attached as the cause.
    """
    url = PAGE_URL_TEMPLATE.format(page=page)
    last_error = None
    while retry_num <= 10:
        try:
            # Bound every request: without a timeout a stalled connection
            # would block forever and the retry logic would never run.
            return session.get(url, timeout=60).json()
        except Exception as e:
            # Deliberately broad: any failure of this best-effort fetch is
            # handled the same way, by waiting and trying again.
            last_error = e
            time.sleep(15)
            retry_num += 1
            print('page {} retry_num {}'.format(page, retry_num))
    raise Exception('too many retries for page {}'.format(page)) from last_error

def get_all_page_items():
    """Yield one row per search result item, walking pages 0 to 799.

    Each page is fetched with ``get_page_items``. The response must be a
    one-element list whose single entry has the expected ``Type``. Every item
    is turned into a dict of string keys and string values, and its ``Link``
    entry is replaced by an ``item_url`` built from ``ITEM_URL_TEMPLATE``.

    The page number and the running total of yielded rows are printed for
    every page. Iteration stops at the first page that contributes no items;
    otherwise there is a 2 second pause before the next page.
    """
    num_yielded = 0
    for page in range(800):
        print(page)
        response = get_page_items(page)
        assert len(response) == 1
        assert response[0]['Type'] == 'פריטים'
        result = response[0]
        # Read but not used further in this function; a response without
        # this key still fails here.
        total_found = result['TotalFound']
        page_count = 0
        for item in result['items']:
            row = {}
            for key, value in item.items():
                row[str(key)] = str(value)
            link = row.pop('Link')
            row['item_url'] = ITEM_URL_TEMPLATE.format(link=link)
            yield row
            num_yielded += 1
            page_count += 1
        print(num_yielded)
        if page_count == 0:
            # An empty page marks the end of the result set.
            break
        time.sleep(2)

# Run the search scrape: rows from get_all_page_items() pass through a
# checkpoint named 'all_page_items' and a printer showing 1 row as HTML.
# NOTE(review): checkpoint presumably caches the stream under
# .checkpoints/all_page_items so re-runs skip the scrape - confirm.
# The trailing [1] selects the second element of process()'s result
# (presumably the run statistics) as the cell's displayed value.
Flow(
    get_all_page_items(),
    checkpoint('all_page_items'),
    printer(tablefmt='html', num_rows=1)
).process()[1]

## Download item pages

The following command takes roughly 4–5 days to run:

```
python3 musportal/download_item_pages.py
```

Inspect the data

In [None]:
from dataflows import Flow, load, printer, filter_rows

# Inspect the downloaded item pages. Each stage is followed by a printer
# that shows 1 sample row as HTML for the data at that point.
Flow(
    # All rows of the downloaded-pages datapackage.
    load('../data/musportal-item-pages/datapackage.json'),  printer(tablefmt='html', num_rows=1),
    # Presumably keeps only rows whose downloaded_status_code is not 200 - confirm filter_rows semantics.
    filter_rows(not_equals=[{"downloaded_status_code": 200}]), printer(tablefmt='html', num_rows=1),
    # Of those, presumably keeps only rows whose status code is also not 502.
    filter_rows(not_equals=[{"downloaded_status_code": 502}]), printer(tablefmt='html', num_rows=1),
).process()[1]


## Parse the item pages

In [None]:
!{'python3 -m pip install -U requests-html'}

In [None]:
from dataflows import Flow, load, printer, checkpoint, add_field
from requests_html import HTMLSession
from retrying import retry
import datetime
from datapackage import Package
import os
from requests_html import HTML
from time import sleep
import subprocess
import json

def super_strip(string):
    """Return ``string`` without surrounding whitespace and colons.

    Three passes are applied in order: outer whitespace is removed, then any
    colons at either end, then whitespace once more. The passes are not
    repeated, so a colon that only becomes outermost after the final
    whitespace pass is kept.
    """
    without_outer_space = string.strip()
    without_outer_colons = without_outer_space.strip(':')
    return without_outer_colons.strip()

# Columns added to every item row, mapped to their schema type.
FIELDS = {
    'artist_names': 'string',
    'main_image_url': 'string',
    'אורך': 'string',
    'רוחב': 'string',
    'מוזאון': 'string',
    'תחום': 'string',
    'סיווג': 'string',
    'טכניקה': 'string',
    'קוד פריט': 'string',
    'צלמים': 'string',
    'מקום': 'string',
    'תאריך': 'string',
    'קרדיט': 'string',
    'extra': 'array',
    'תקופה': 'string',
    'description': 'string',
    'image_urls': 'array',
}

# Further label names registered as string columns, in this order. A name
# that is already present in FIELDS keeps its existing type and position.
_ADDITIONAL_STRING_FIELDS = (
    'תקופה', 'תאריך סיום', 'צבע', 'גובה', 'קוטר', 'חומר', 'אוצר/ת',
    'מלות מפתח', 'זכויות יוצרים', 'מספר רישום',
    'תיאור פנים', 'עובי', 'רוחב 299', 'רוחב 204',
    'אסכולה', 'עומק', 'קוטר מקסימלי', 'בעלות', 'רוחב 313', 'תיאור גב', 'מעצב', 'מידע נוסף',
    'תולדות היוצר/אמן', 'מספר בנקודת האיסוף בוויסבאדן', 'סגנון', 'אורך 296', 'אורך 203',
    'מזמין', 'רוחב מקסימלי', 'גובה 300', 'אורך 312', 'הערות', 'רוחב 297',
)
for _field_name in _ADDITIONAL_STRING_FIELDS:
    FIELDS.setdefault(_field_name, 'string')

# Label names encountered that have no column in FIELDS.
all_extra_keys = set()
            
def set_row_field(row, k, v):
    """Store value ``v`` under label ``k`` on ``row``.

    A label that is a key of ``FIELDS`` is written directly as ``row[k]``.
    Any other label is appended to ``row['extra']`` as a ``[k, v]`` pair and
    recorded in ``all_extra_keys``.
    """
    if k not in FIELDS:
        # No dedicated column: keep the pair in the catch-all list and
        # remember the label.
        row['extra'].append([k, v])
        all_extra_keys.add(k)
        return
    row[k] = v

def scrape_item_pages():
    """Build the flow step that parses the downloaded item pages.

    The returned step expects a package with exactly one resource whose rows
    carry at least ``item_url``, ``downloaded_status_code``,
    ``downloaded_file_name``, ``SecendRow`` and ``therdRow``. It emits two
    resources:

    * ``musportal_items``: the input rows extended with the ``FIELDS``
      columns parsed from each saved HTML page.
    * ``musportal_item_descriptions``: one row per successfully parsed item
      holding its url, ``SecendRow``, ``therdRow`` and description text.

    Returns:
        A function that takes the incoming package and yields the new
        package descriptor followed by the two row iterators.
    """
    # Filled while the first resource is scraped, emitted as the second one.
    mutportal_item_descriptions = []

    def _scrape(rows):
        """Yield every input row, enriched with the fields found on its page.

        Rows whose page was not downloaded with status 200, or whose page
        contains 'Website under construction', get every ``FIELDS`` column
        set to '' (string columns) or None (all others).
        """
        for row in rows:
            row['extra'] = []
            row['image_urls'] = []
            item_url = row['item_url']
            html = None
            if row['downloaded_status_code'] == 200:
                # Saved page paths are relative to the parent directory.
                with open('../' + row['downloaded_file_name']) as f:
                    html = HTML(html=f.read())
            if not html or 'Website under construction' in html.html:
                print(f'{item_url}: missing item')
                for n, t in FIELDS.items():
                    row[n] = '' if t == 'string' else None
            else:
                item_main_pics = html.find('.ItemMainPic')
                assert len(item_main_pics) == 1
                imgs = item_main_pics[0].find('img')
                try:
                    # Collect the src of each image. The original loop read
                    # imgs[0] on every iteration, repeating the first url.
                    row['image_urls'] = [img.attrs['src'] for img in imgs]
                except Exception as e:
                    # Best effort: one bad image drops the whole list.
                    print(f'{item_url}: exception parsing main image urls: {e}')
                    row['image_urls'] = []
                articles = html.find('article.ItmeDetailsZone')
                assert len(articles) == 1
                article = articles[0]
                description = "\n".join(
                    item_description_allinfo.text
                    for item_description_allinfo
                    in article.find('.ItemDescripion .allinfo'))
                mutportal_item_descriptions.append({'item_url': row['item_url'],
                                                    'SecendRow': row['SecendRow'],
                                                    'therdRow': row['therdRow'],
                                                    'description': description})
                detail_infos = article.find('div.detailInfo')
                for detail_info in detail_infos:
                    item_label_names = detail_info.find('.itemlablename')
                    if len(item_label_names) == 0:
                        # Detail blocks without a label are skipped.
                        continue
                    assert len(item_label_names) == 1, detail_info.html
                    item_label_name = super_strip(item_label_names[0].text)
                    item_text_names = detail_info.find('.itemTextname')
                    size_tables = detail_info.find('.sizeTable')
                    ic_artist_list = detail_info.find('.ICArtiistList')
                    all_infos = detail_info.find('.allinfo')
                    # A detail block holds exactly one kind of value:
                    # plain text, a size table, an artist list, or free text.
                    if len(item_text_names) == 1:
                        assert len(size_tables) == 0 and len(ic_artist_list) == 0 and len(all_infos) == 0, detail_info.html
                        set_row_field(row, item_label_name, item_text_names[0].text.strip())
                    elif len(size_tables) == 1:
                        assert len(item_text_names) == 0 and len(ic_artist_list) == 0 and len(all_infos) == 0, detail_info.html
                        # Each table row is one measurement: label and value.
                        for tr in size_tables[0].find('tr'):
                            meas_labels = tr.find('.MeasLabel')
                            meas_values = tr.find('.MeasValue')
                            assert len(meas_labels) == 1 and len(meas_values) == 1, detail_info.html
                            meas_label = super_strip(meas_labels[0].text)
                            set_row_field(row, meas_label, super_strip(meas_values[0].text))
                    elif len(ic_artist_list) == 1:
                        assert len(size_tables) == 0 and len(item_text_names) == 0 and len(all_infos) == 0, detail_info.html
                        artist_names = [super_strip(ic_artist_name.text)
                                        for ic_artist_name
                                        in ic_artist_list[0].find('.ICArtistName')]
                        set_row_field(row, 'artist_names', ', '.join(artist_names))
                    elif len(all_infos) == 1:
                        set_row_field(row, item_label_name, all_infos[0].text.strip())
                    else:
                        raise Exception(detail_info.html)
            yield row

    def _split_description(package):
        """Yield the two-resource package, then the rows of each resource."""
        descriptor = package.pkg.descriptor
        assert len(descriptor['resources']) == 1
        # Rename the single input resource and extend its schema with FIELDS.
        descriptor['resources'][0].update(**{'name': 'musportal_items', 'path': 'musportal_items.csv',
                                             'schema': {'fields': (descriptor['resources'][0]['schema']['fields']
                                                                   + [{'name': n, 'type': t}
                                                                      for n, t in FIELDS.items()])}})
        descriptor['resources'].append({'name': 'musportal_item_descriptions', 'path': 'musportal_item_descriptions.csv',
                                        'schema': {'fields': [{'name': 'item_url', 'type': 'string'},
                                                              {'name': 'SecendRow', 'type': 'string'},
                                                              {'name': 'therdRow', 'type': 'string'},
                                                              {'name': 'description', 'type': 'string'}]}})
        yield Package(descriptor)
        for i, resource in enumerate(package):
            assert i == 0
            yield _scrape(resource)
        # Lazy generator: it yields whatever has been appended to the list by
        # the time it is consumed (assumes the first resource is consumed
        # before this one - confirm against the flow runner).
        yield (row for row in mutportal_item_descriptions)

    return _split_description

!{'rm -rf .checkpoints/all_items'}

# Parse all downloaded item pages: load the downloaded-pages datapackage,
# run the scrape_item_pages() step, pass the result through a checkpoint
# named 'all_items' and print 1 sample row as HTML.
# The trailing [1] selects the second element of process()'s result
# (presumably the run statistics) as the displayed value.
Flow(
    # Disabled alternative input stage, kept from the original:
    # checkpoint('all_page_items'),
    load('../data/musportal-item-pages/datapackage.json'),
    scrape_item_pages(),
    checkpoint('all_items'),
    printer(tablefmt='html', num_rows=1)
).process()[1]

# Labels seen on item pages that have no column in FIELDS (collected by
# set_row_field while the flow ran).
print(f'extra keys = {all_extra_keys}')