In [None]:
!{'pip install dataflows==0.0.58'}

In [2]:
from dataflows import Flow, load, printer

all_items_by_he_url = {}

def load_all_items(row):
    assert row['item_url'] not in all_items_by_he_url, 'duplicate item: {}'.format(row['item_url'])
    all_items_by_he_url[row['item_url']] = row

Flow(
    load('../data/musportal_items_with_english_titles.xlsx'),
    load_all_items
).process()

(<datapackage.package.Package at 0x7fcd45a912b0>, {})

In [4]:
all_items_by_en_url = {}

def load_en_pages(rows):
    for rownum, row in enumerate(rows):
        assert row['item_url'] not in all_items_by_en_url
        row['filename'] = '../data/musportal-item-pages-en-puppeteer/rownum{}.txt'.format(rownum)
        all_items_by_en_url[row['item_url']] = row
        yield row
    
Flow(
    load('.checkpoints/all_page_items_en/datapackage.json'),
    load_en_pages
).process()

(<datapackage.package.Package at 0x7fcd45015940>, {})

In [7]:
print('he items: {}'.format(len(all_items_by_he_url)))
print('en items: {}'.format(len(all_items_by_en_url)))

he items: 23812
en items: 23995


In [98]:
from pyquery import PyQuery as pq
from dataflows import dump_to_path

mutportal_item_descriptions = []

def super_strip(string):
    return string.strip().strip(':').strip()

FIELDS = {
  'Museum': 'string',
  'Item Type': 'string',
  'artist_names': 'string',
  'Period': 'string',
  'Domain': 'string',
  'Classification': 'string',
  'length': 'string',
  'width': 'string',
  'Technique': 'string',
  'Notes': 'string',
  'Item Code': 'string',
  'Photographers': 'array',
  'Height': 'string',
  'Depth': 'string',
  'Length': 'string',
  'Artist\\Maker history': 'string',
  'Inviter': 'string',
  'Maximal width': 'string',
  'Description (obverse)': 'string',
  'Location': 'string',
  'Date': 'string',
  'Wiesbaden collecting point number': 'string',
  'Registration No.': 'string',
  'Copyright': 'string',
  'Keywords': 'string',
  'Curator': 'string',
  'End Date': 'string',
  'Style': 'string',
  'Credit': 'string',
  'Color': 'string',
  'Ownership': 'string',
  'Thikness': 'string',
  'Maximal diameter': 'string',
  'Thickness': 'string',
  'Descirption (reverse)': 'string',
  'Width': 'string',
  'Designer': 'string',
  'Diameter': 'string',
  'Additional Information': 'string',
  'Material': 'string',
  'School': 'string',
}

SKIP_ITEM_URLS = [
    'http://www.museumsinisrael.gov.il/en/items/Pages/ItemCard.aspx?IdItem=ICMS-CAR-00143',
]

all_extra_keys = set()

def set_row_field(row, k, v):
    if k in FIELDS:
        row[k] = v
    else:
        row['extra'].append([k, v])
        all_extra_keys.add(k)


def load_rows():
    for i, row in enumerate(all_items_by_en_url.values()):
        item_url = row['item_url']
        if (item_url in SKIP_ITEM_URLS): 
            continue
        with open(row['filename']) as f:
            text = f.read()
        splitext = text.split('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
        assert splitext[1].strip() == row['item_url']
        htmltext = splitext[2].strip()
        d = pq(htmltext)
        row['image_urls'] = []
        row['extra'] = []
        for k,v in FIELDS.items():
            row[k] = [] if v == 'array' else ''
        item_main_pics = d('.ItemMainPic')
        assert len(item_main_pics) == 1
        pic = pq(item_main_pics[0])
        imgs = pq(pic.find('img'))
        try:
            for img in imgs:
                row['image_urls'].append(imgs[0].attrib['src'])                    
        except Exception as e:
            print(f'{item_url}: exception parsing main image urls: {e}')
            row['image_urls'] = []
        articles = d('article.ItmeDetailsZone')
        assert len(articles) == 1
        article = articles[0]
        description = "\n".join((item_description_allinfo.text for item_description_allinfo 
                                 in pq(article).find('.ItemDescripion .allinfo')))
        mutportal_item_descriptions.append({'item_url': row['item_url'],
                                            'SecendRow': row['SecendRow'], 
                                            'therdRow': row['therdRow'],
                                            'description': description})
        detail_infos = pq(article).find('.detailInfo')
        for detail_info in detail_infos:
            item_label_names = pq(detail_info).find('.itemlablename')
            if len(item_label_names) == 0: continue
            assert len(item_label_names) == 1, pq(detail_info).html()
            item_label_name = super_strip(pq(item_label_names[0]).text())
            item_text_names = pq(detail_info).find('.itemTextname')
            size_tables = pq(detail_info).find('.sizeTable')
            ic_artist_list = pq(detail_info).find('.ICArtiistList')
            all_infos = pq(detail_info).find('.allinfo')
            if len(item_text_names) == 1:
                assert len(size_tables) == 0 and len(ic_artist_list) == 0 and len(all_infos) == 0, pq(detail_info).html()
                assert len(item_text_names) == 1, pq(detail_info).html()
                set_row_field(row, item_label_name, pq(item_text_names[0]).text().strip())
            elif len(size_tables) == 1:
                assert len(item_text_names) == 0 and len(ic_artist_list) == 0 and len(all_infos) == 0, pq(detail_info).html()
                trs = pq(size_tables[0]).find('tr')
                for tr in trs:
                    meas_labels = pq(tr).find('.MeasLabel')
                    meas_values = pq(tr).find('.MeasValue')
                    assert len(meas_labels) == 1 and len(meas_values) == 1, pq(detail_info).html()
                    meas_label = super_strip(pq(meas_labels[0]).text())
                    set_row_field(row, meas_label, super_strip(pq(meas_values[0]).text()))
            elif len(ic_artist_list) == 1:
                assert len(size_tables) == 0 and len(item_text_names) == 0 and len(all_infos) == 0, pq(detail_info).html()
                artist_names = []
                for ic_artist_name in pq(ic_artist_list[0]).find('.ICArtistName'):
                    artist_names.append(super_strip(pq(ic_artist_name).text()))
                set_row_field(row, 'artist_names', ', '.join(artist_names))
            elif len(all_infos) == 1:
                set_row_field(row, item_label_name, pq(all_infos[0]).text().strip())
            else:
                if pq(detail_info).text().strip() != 'Artist / Creator:':
                    raise Exception(item_url + "\n" + "\n" + pq(detail_info).html() + "\n" + pq(article).html())
        yield row


Flow(
    load_rows(),
    dump_to_path('../data/parsed_item_pages_en'),
    printer(tablefmt='html', num_rows=1)
).process()

print(all_extra_keys)

#,Title (string),ThumbImageMono (string),ThumbImage (string),SecendRow (string),therdRow (string),item_url (string),filename (string),image_urls (array),extras (array),extra (array),Museum (string),Item Type (string),artist_names (string),Period (string),Domain (string),Classification (string),length (string),width (string),Technique (string),Notes (string),Item Code (string),Photographers (any),Height (string),Depth (string),Length (string),Artist\Maker history (string),Inviter (string),Maximal width (string),Description (obverse) (string),Location (string),Date (string),Wiesbaden collecting point number (string),Registration No. (string),Copyright (string),Keywords (string),Curator (string),End Date (string),Style (string),Credit (string),Color (string),Ownership (string),Thikness (string),Maximal diameter (string),Thickness (string),Descirption (reverse) (string),Width (string),Designer (string),Diameter (string),Additional Information (string),Material (string),School (string)
1,Red Sea Hamburger,,TID126344_ITEM_MAIN_PIC_452250.jpg,"צ'נצ'ל בנגה ,","Janco-Dada Museum, Ein Hod",http://www.museumsinisrael.gov.il/en/items/Pages/ItemCard.aspx?IdItem=ICMS-JCO-C.1075.2015,../data/musportal-item-pages-en-puppeteer/rownum0.txt,['http://images.museumsinisrael.gov.il/thmbn_images/45/22/thn_1024x768_TID126344_ITEM_MAIN_PIC_45225 ...,[],[],"Janco-Dada Museum, Ein Hod",Painting,צ'נצ'ל בנגה,2015.0,Art,Visual Art,20.8 cm,14.8 cm,watercolor and indian ink on paper,Chanchal Banga's humoristic language and amusing thematical combinations are expressed in the works ...,ICMS-JCO-C.1075.2015,Warhaftig Venezian,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,Untitled,,TID126340_ITEM_MAIN_PIC_551027.jpg,"Nir Dvorai ,","Janco-Dada Museum, Ein Hod",http://www.museumsinisrael.gov.il/en/items/Pages/ItemCard.aspx?IdItem=ICMS-JCO-C.1068.2015,../data/musportal-item-pages-en-puppeteer/rownum1.txt,['http://images.museumsinisrael.gov.il/thmbn_images/55/10/thn_1024x768_TID126340_ITEM_MAIN_PIC_55102 ...,,[],"Janco-Dada Museum, Ein Hod",קולאז',Nir Dvorai,2015.0,Art,Visual Art,,60 cm,,"collage of photos the artist took, then made cutouts of them, and joined in new formations, giving i ...",ICMS-JCO-C.1068.2015,[],43 cm,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
23995,Female Nude,,TID128411_ITEM_MAIN_PIC_1106257.jpg,"Stern, Friedel, Leipzig, Germany, 1917-2006 ,","The Israeli Cartoon Museum, Holon",http://www.museumsinisrael.gov.il/en/items/Pages/ItemCard.aspx?IdItem=ICMS-CAR-1003966,../data/musportal-item-pages-en-puppeteer/rownum23994.txt,['http://images.museumsinisrael.gov.il/thmbn_images/11/06/25/thn_1024x768_TID128411_ITEM_MAIN_PIC_11 ...,,[],"The Israeli Cartoon Museum, Holon",Drawing,"Stern, Friedel, Leipzig, Germany, 1917-2006",,Art,Visual Art,,,India ink on paper,,ICMS-CAR-1003966,Image produced as part of a joint project of the Harvard Library Judaica Division and the Israeli Ca ...,,,,,,,,Israel,1950 - 1970,,,"The Israeli Cartoon Museum, Holon","Nude, Portrait, Drawing",,,,"Bequest of Friedel Stern, Collection of the Israeli Cartoon Museum",,,,,,,,,,,,


{''}
