## setup

In [1]:
import pandas as pd
import numpy as np
import cv2
import os
import os.path

import time

import requests
import json
import urllib
from bs4 import BeautifulSoup
import re
import pickle

# show all dataframe
pd.set_option('display.max_colwidth', None) 
pd.set_option('display.max_columns', None)  

## params

In [2]:
dir = 'clean image/mfaboston/'
err_page = []
err_object = []

# initiate a dataframe to store data
cols = ['object number', 'title','culture', 'medium', 'classifications']
df = pd.DataFrame(columns=cols)

headers={
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
}

## REQUEST 1: Get keyword : page number

In [65]:
keywords = ['robe', 'dress', 'shirt', 'coat', 'ensemble', 'suit', 'jacket','gown']

In [67]:
keyword_page_n_dict = {}

for keyword in keywords:
    url = f"https://collections.mfa.org/search/Objects/*/{keyword}/images?filter=allClassifications%3ACostumes%3BimageExistence%3Atrue"
    r = requests.get(url)
    soup_key = BeautifulSoup(r.content,'html.parser')
    page_n = soup_key.find('span', class_="maxPages").text[2:].strip()

    keyword_page_n_dict[keyword] = page_n
    print(f'there are {page_n} pages for keyword {keyword}.')

there are 35 pages for keyword robe.
there are 116 pages for keyword dress.
there are 19 pages for keyword shirt.
there are 27 pages for keyword coat.
there are 32 pages for keyword ensemble.
there are 19 pages for keyword suit.
there are 40 pages for keyword jacket.
there are 9 pages for keyword gown.


## REQUEST 2: read each 12 obj grid page

### def get_12_obj_df(soup)

In [113]:
def get_12_obj_df(soup):
    # each page has 12 images
    # read each image
    # capture the number and url
    k = soup.find_all('div',class_="primaryMedia")

    # init a df to store 12 obj data
    cols = ['object number', 'object url', 'download number', 'download url']
    df = pd.DataFrame(columns=cols)

    # there are chances to get timeout when read image page    
    for j in range(len(k)):
        # locate object number 
        object_str = k[j].find('a').get('href')
        object_number = re.search('objects/(.*?)/', object_str).group(1)
        url_base = 'https://collections.mfa.org'
        object_url = url_base + object_str 

        # locate download_number
        download_str = k[j].find('img').get('src')
        download_number = re.search('dispatcher/(.*?)/', download_str).group(1)
        download_url = f'https://collections.mfa.org/internal/media/dispatcher/{download_number}/resize%3Aformat%3Dfull?download'

        dc = [object_number,object_url,download_number,download_url]
        df_addon = pd.DataFrame(dc).T
        df_addon.columns = cols
        df = pd.concat([df,df_addon])
        
    return df

###  execute

In [118]:
keyword_page_n_dict

{'robe': '35',
 'dress': '116',
 'shirt': '19',
 'coat': '27',
 'ensemble': '32',
 'suit': '19',
 'jacket': '40',
 'gown': '9'}

In [119]:
# init a df to store obj number and url
cols = ['object number', 'object url', 'download number', 'download url']
df = pd.DataFrame(columns=cols)
timeout_dict = {}

# read each page
for keyword in keywords:
    # get the page number for this keyword
    page_n = keyword_page_n_dict[keyword]
    pages = range(1, int(page_n)+1)

    # loop times
    count = 0
    
    while bool(pages) is True:
        count += 1
        timeout_pages = []

        if count >= 11:
            timeout_dict[keyword] = pages
            break
        else:
            for i in pages:
                try:
                    url = f"https://collections.mfa.org/search/Objects/*/{keyword}/images?filter=allClassifications%3ACostumes%3BimageExistence%3Atrue&page={i}"
                    r = requests.get(url, headers = headers)
                    soup_page = BeautifulSoup(r.content,'html.parser')
                    thisdf = get_12_obj_df(soup_page)
                    df = pd.concat([df, thisdf])
                    
                except:
                    # print out err page
                    timeout_pages.append(i)
                    print(f'{keyword} - page {i} - timeout')
        # let the timeout pages be the pages for next round
        print(f'keyword: {keyword}, round {count}, timeout page count: {len(timeout_pages)}')
        pages = timeout_pages          

    print(f'keyword {keyword} is done.')     

robe - page 6 - timeout
robe - page 7 - timeout
robe - page 29 - timeout
robe - page 30 - timeout
keyword: robe, round 1, timeout page count: 4
keyword: robe, round 2, timeout page count: 0
keyword robe is done.
dress - page 3 - timeout
dress - page 12 - timeout
dress - page 13 - timeout
dress - page 35 - timeout
dress - page 36 - timeout
dress - page 37 - timeout
dress - page 42 - timeout
dress - page 58 - timeout
dress - page 59 - timeout
dress - page 81 - timeout
dress - page 82 - timeout
dress - page 83 - timeout
dress - page 84 - timeout
dress - page 106 - timeout
dress - page 107 - timeout
dress - page 108 - timeout
dress - page 109 - timeout
dress - page 110 - timeout
keyword: dress, round 1, timeout page count: 18
dress - page 3 - timeout
dress - page 37 - timeout
dress - page 42 - timeout
keyword: dress, round 2, timeout page count: 3
dress - page 3 - timeout
dress - page 37 - timeout
dress - page 42 - timeout
keyword: dress, round 3, timeout page count: 3
dress - page 3 - tim

In [122]:
df.shape

(3447, 4)

In [123]:
df.sample(3)

Unnamed: 0,object number,object url,download number,download url
0,530079,https://collections.mfa.org/objects/530079/womans-dress;jsessionid=9340C8CB42CF9396BAAADD526A54EAB6?ctx=1e90fa57-b6ad-43dc-8922-8e1e6bf27512&idx=1011,1352408,https://collections.mfa.org/internal/media/dispatcher/1352408/resize%3Aformat%3Dfull?download
0,567928,https://collections.mfa.org/objects/567928/skirt-suit;jsessionid=501C109F01374275F84EE52A8B4CCAFF?ctx=d0d82c0e-8d3f-43d4-bc4d-f03dd0a735c4&idx=138,1352441,https://collections.mfa.org/internal/media/dispatcher/1352441/resize%3Aformat%3Dfull?download
0,561054,https://collections.mfa.org/objects/561054/dress;jsessionid=6546A7AF001FEECCD640F1D79E88769B?ctx=24db38ec-85f7-4f3e-8a7b-2d4544b613e2&idx=1065,1248210,https://collections.mfa.org/internal/media/dispatcher/1248210/resize%3Aformat%3Dfull?download


In [132]:
# save the file in case
df.to_csv('raw data/mfaboston.csv', index=False)

# --> dataframe saved here, continue working from here

# REQUEST 3: read single obj page, save info

In [30]:
# read the file and prepare it for next step
# which is adding cols
df = pd.read_csv('raw data/mfaboston.csv')

cols = ['title', 'culture', 'downloadable']
df[cols] = np.nan

In [43]:
df.shape

(3447, 7)

In [44]:
df.head(3)

Unnamed: 0,object number,object url,download number,download url,title,culture,downloadable
0,24510,https://collections.mfa.org/objects/24510/noh-costume-atsuita;jsessionid=AF89E578CDE8EF15BA693C213C7E67AE?ctx=f523b82f-d3f8-49b2-b58b-ad4b83b11e67&idx=0,757433,https://collections.mfa.org/internal/media/dispatcher/757433/resize%3Aformat%3Dfull?download,,,
1,645007,https://collections.mfa.org/objects/645007/bugaku-costume-in-5-parts-undercoat-shitagansane-vest-h;jsessionid=AF89E578CDE8EF15BA693C213C7E67AE?ctx=f523b82f-d3f8-49b2-b58b-ad4b83b11e67&idx=1,1493972,https://collections.mfa.org/internal/media/dispatcher/1493972/resize%3Aformat%3Dfull?download,,,
2,155566,https://collections.mfa.org/objects/155566/kesa-buddhist-priests-robe;jsessionid=AF89E578CDE8EF15BA693C213C7E67AE?ctx=f523b82f-d3f8-49b2-b58b-ad4b83b11e67&idx=2,1007847,https://collections.mfa.org/internal/media/dispatcher/1007847/resize%3Aformat%3Dfull?download,,,


## define function save dataframe

In [35]:
# save single object data to dataframe
def save_dataframe(row):
    """
    given obj url and download url
    requests and bs
    while downloadable nan, try except
    save data
    check if there is a download button
    if yes: save img + download label yes
    if no: download label no
    """
    obj_url = row['object url']
    obj_n = row['object number']

    newrow = row

    try:
        # get the bs for object page
        req = requests.get(obj_url, headers=headers)
        soup_obj = BeautifulSoup(req.content,'html.parser') 

        # get the block of item details
        tagobj = soup_obj.find('div',class_="item-details-inner")

        # locate the block that we need 
        tagleft = tagobj.contents[1] # which is the table on the left side of page
        
        # fill title, culture
        newrow['title'] = tagleft.contents[0].text # add title
        newrow['culture'] = soup_obj.find('div',class_="detailField cultureField")

        # check if the image is downloadable
        # by checking if there is a 'download' button
        # the image with 'download' button is believed to be in public domain
        bl = soup_obj.body.findAll(text='DOWNLOAD')

        if bool(bl) is True:
            newrow['downloadable'] = 'Y'
        else:
            newrow['downloadable'] = 'N'

    except:
        print(f'object {obj_n} got err')
        pass

    return newrow

In [45]:
df_7col = df.apply(lambda row: save_dataframe(row), axis=1)

object 120599 got err
object 120600 got err
object 121046 got err
object 121048 got err
object 8074 got err
object 19187 got err
object 19371 got err
object 19488 got err
object 20121 got err
object 20867 got err
object 67746 got err
object 68343 got err
object 68358 got err
object 68375 got err
object 24496 got err
object 15876 got err
object 15882 got err
object 16112 got err
object 106092 got err
object 642762 got err
object 642814 got err
object 646085 got err
object 675723 got err
object 695125 got err
object 536926 got err
object 80281 got err
object 80283 got err
object 46072 got err
object 46284 got err
object 46285 got err
object 46478 got err
object 94367 got err
object 94417 got err
object 94423 got err
object 95272 got err
object 95289 got err
object 401493 got err
object 353211 got err
object 353217 got err
object 353218 got err
object 551711 got err
object 551717 got err
object 551720 got err
object 551723 got err
object 551731 got err
object 555990 got err
object 555991 

In [None]:
# run this code several times, until no nan in downloadable
df_7col.loc[df_7col['downloadable'].isna() == True] = df_7col.loc[df_7col['downloadable'].isna() == True].apply(lambda row: save_dataframe(row), axis=1)
print(df_7col['downloadable'].isna().sum())

In [73]:
df_7col.isna().sum()

object number        0
object url           0
download number      0
download url         0
title                0
culture            106
downloadable         0
dtype: int64

In [61]:
# save the file in case
# df_7col.to_csv('raw data/mfaboston_7col.csv', index=False)

# --> dataframe saved here, continue working from here

In [62]:
df_7col = pd.read_csv('raw data/mfaboston_7col.csv')

In [72]:
df_7col.head(3)

Unnamed: 0,object number,object url,download number,download url,title,culture,downloadable
0,24510,https://collections.mfa.org/objects/24510/noh-costume-atsuita;jsessionid=AF89E578CDE8EF15BA693C213C7E67AE?ctx=f523b82f-d3f8-49b2-b58b-ad4b83b11e67&idx=0,757433,https://collections.mfa.org/internal/media/dispatcher/757433/resize%3Aformat%3Dfull?download,Noh costume (atsuita)Kimono,[\nJapanese\n],Y
1,645007,https://collections.mfa.org/objects/645007/bugaku-costume-in-5-parts-undercoat-shitagansane-vest-h;jsessionid=AF89E578CDE8EF15BA693C213C7E67AE?ctx=f523b82f-d3f8-49b2-b58b-ad4b83b11e67&idx=1,1493972,https://collections.mfa.org/internal/media/dispatcher/1493972/resize%3Aformat%3Dfull?download,Bugaku costume in 5 parts: undercoat (shitagansane); vest (hanpi); split skirt (hakama); helmet (torikabuto) and under robe,[\nJapanese\n],Y
2,155566,https://collections.mfa.org/objects/155566/kesa-buddhist-priests-robe;jsessionid=AF89E578CDE8EF15BA693C213C7E67AE?ctx=f523b82f-d3f8-49b2-b58b-ad4b83b11e67&idx=2,1007847,https://collections.mfa.org/internal/media/dispatcher/1007847/resize%3Aformat%3Dfull?download,Kesa (Buddhist priest's robe),[\nJapanese\n],Y


## download images

In [None]:
def download_image(dir = 'clean image/VA/', df=df):
    start = time.time()
    print(f'{df.shape[0]} images will be downloaded.')    

    name = df['systemNumber']
    url = df['image_base']
    name = list(name)
    url = list(url)

    error_id = []
    error_n = 0

    # check if the folder existed
    # if not, create the folder
    b = os.path.exists(dir)
    if b == False:
        os.mkdir(dir) 

    for n,u in zip(name,url):
        filename = dir + n + '.jpg'
        uu = u + 'full/full/0/default.jpg'
        try:
            z = urllib.request.urlopen(uu)
            output = open(filename, 'wb')
            output.write(z.read())
            output.close()
        except:
            error_id.append(n)
            print(f'error id {n}')
            error_n += 1
            print(f'{error_n} errors in total')
            continue

    end = time.time()
    print(f'{end - start} secs slipped...')

## define function download_image()

In [35]:
# check if this object can be downloaded
# if not, skip the rest process
def check_downloadable(soup2):
    # check if the image is downloadable
    # there will be a 'download' button
    # which believed to be in public domain
    bl = soup2.body.findAll(text='DOWNLOAD')
    return bool(bl)

In [84]:
# save single image
def download_image(dir, object_number, download_url, err_object):
    # get the name of the file
    # create the filepath to save the image
    filepath = dir + object_number + '.jpg'

    # save the image
    # save the object number if failed
    try:
        z = urllib.request.urlopen(download_url)
        output = open(filepath, 'wb')
        output.write(z.read())
        output.close() 
    except:
        err_object.append(object_number)
        print('save img err')

## scrape_object() = check downloadable + download image + save dataframe

In [86]:
def scrape_object(object_url, object_number, download_url, err_object, df):
    try:
        # get the bs for object page
        req2 = requests.get(object_url)
        soup2 = BeautifulSoup(req2.content,'html.parser')

        # get the block of item details
        tag = soup2.find('div',class_="item-details-inner")

        # locate the block that we need 
        tag1 = tag.contents[1] # which is the table on the left side of page

        # check if object image is downloadable
        # if not, skip
        bl = check_downloadable(soup2)
        if bl is True:
            download_image(dir, object_number, download_url, err_object)
            save_dataframe(tag1, soup2, object_number, df)

    except:
        err_object.append(object_number)
        print('read single page error')

## define function read_imgs()

In [87]:
def read_imgs(soup, err_object, df):
    # each page has 12 images
    # read each image
    k = soup.find_all('div',class_="primaryMedia")

    # there are chances to get timeout when read image page    
    for j in range(len(k)):
        # locate object number 
        object_str = k[j].find('a').get('href')
        object_number = re.search('objects/(.*?)/', object_str).group(1)
        url_base = 'https://collections.mfa.org'
        object_url = url_base + object_str 

        # locate download_number
        download_str = k[j].find('img').get('src')
        download_number = re.search('dispatcher/(.*?)/', download_str).group(1)
        download_url = f'https://collections.mfa.org/internal/media/dispatcher/{download_number}/resize%3Aformat%3Dfull?download'

        scrape_object(object_url, object_number, download_url, err_object, df)



In [104]:
keyword = 'robe'
page_n = 6
url = f"https://collections.mfa.org/search/Objects/*/{keyword}/images?filter=allClassifications%3ACostumes%3BimageExistence%3Atrue&page={page_n}"
r = requests.get(url)
soup = BeautifulSoup(r.content,'html.parser')

In [105]:
get_12_obj_df(soup)

Unnamed: 0,object number,object url,download number,download url
0,28704,https://collections.mfa.org/objects/28704/buddhist-priests-robe-kesa;jsessionid=ECE5FDBF0220EE0E56509B75BB0B9DFF?ctx=3dbfb08c-163f-47db-b00f-8e3a698829eb&idx=60,1643220,https://collections.mfa.org/internal/media/dispatcher/1643220/resize%3Aformat%3Dfull?download
0,28705,https://collections.mfa.org/objects/28705/buddhist-priests-robe-kesa;jsessionid=ECE5FDBF0220EE0E56509B75BB0B9DFF?ctx=3dbfb08c-163f-47db-b00f-8e3a698829eb&idx=61,1007836,https://collections.mfa.org/internal/media/dispatcher/1007836/resize%3Aformat%3Dfull?download
0,28706,https://collections.mfa.org/objects/28706/buddhist-priests-robe-kesa;jsessionid=ECE5FDBF0220EE0E56509B75BB0B9DFF?ctx=3dbfb08c-163f-47db-b00f-8e3a698829eb&idx=62,1007829,https://collections.mfa.org/internal/media/dispatcher/1007829/resize%3Aformat%3Dfull?download
0,6257,https://collections.mfa.org/objects/6257/strap-for-a-buddhist-priests-robe-kesa;jsessionid=ECE5FDBF0220EE0E56509B75BB0B9DFF?ctx=3dbfb08c-163f-47db-b00f-8e3a698829eb&idx=63,1024060,https://collections.mfa.org/internal/media/dispatcher/1024060/resize%3Aformat%3Dfull?download
0,6270,https://collections.mfa.org/objects/6270/buddhist-priests-robe-kesa;jsessionid=ECE5FDBF0220EE0E56509B75BB0B9DFF?ctx=3dbfb08c-163f-47db-b00f-8e3a698829eb&idx=64,1007894,https://collections.mfa.org/internal/media/dispatcher/1007894/resize%3Aformat%3Dfull?download
0,6277,https://collections.mfa.org/objects/6277/buddhist-priests-robe-kesa;jsessionid=ECE5FDBF0220EE0E56509B75BB0B9DFF?ctx=3dbfb08c-163f-47db-b00f-8e3a698829eb&idx=65,1007895,https://collections.mfa.org/internal/media/dispatcher/1007895/resize%3Aformat%3Dfull?download
0,7252,https://collections.mfa.org/objects/7252/emperors-court-robe-jifu;jsessionid=ECE5FDBF0220EE0E56509B75BB0B9DFF?ctx=3dbfb08c-163f-47db-b00f-8e3a698829eb&idx=66,1022795,https://collections.mfa.org/internal/media/dispatcher/1022795/resize%3Aformat%3Dfull?download
0,7826,https://collections.mfa.org/objects/7826/fragment-of-a-mans-court-robe-jifu;jsessionid=ECE5FDBF0220EE0E56509B75BB0B9DFF?ctx=3dbfb08c-163f-47db-b00f-8e3a698829eb&idx=67,577463,https://collections.mfa.org/internal/media/dispatcher/577463/resize%3Aformat%3Dfull?download
0,8040,https://collections.mfa.org/objects/8040/mans-semiformal-court-robe-jifu;jsessionid=ECE5FDBF0220EE0E56509B75BB0B9DFF?ctx=3dbfb08c-163f-47db-b00f-8e3a698829eb&idx=68,1022801,https://collections.mfa.org/internal/media/dispatcher/1022801/resize%3Aformat%3Dfull?download
0,8041,https://collections.mfa.org/objects/8041/mans-semiformal-court-robe-jifu;jsessionid=ECE5FDBF0220EE0E56509B75BB0B9DFF?ctx=3dbfb08c-163f-47db-b00f-8e3a698829eb&idx=69,962048,https://collections.mfa.org/internal/media/dispatcher/962048/resize%3Aformat%3Dfull?download


In [None]:
                r = requests.get(url, headers = headers)
                soup_key = BeautifulSoup(r.content,'html.parser')

                print(f'{keyword} - page {i} - {r}') # check if request 200

            except:
                # if not sucessful, save the page number
                timeout_pages.append(i)
                print(f'{keyword} - page {i} - timeout')


    # init a list to store timeout pages
    timeout_pages = []
    for i in range(25,26):
        url = f"https://collections.mfa.org/search/Objects/*/{keyword}/images?filter=allClassifications%3ACostumes%3BimageExistence%3Atrue&page={i}"

        # there are chances to get timeout
        # so try
        # and save timeout pages for iterate later
        try:
            r = requests.get(url, headers = headers)
            soup_key = BeautifulSoup(r.content,'html.parser')
            print(f'{keyword} - page {i} - {r}') # check if request 200
            read_imgs(soup_key, err_object, df)

        except:
            # if not sucessful, save the page number
            timeout_pages.append(i)
            print(f'{keyword} - page {i} - timeout')
        
    timeout_dict[keyword] = timeout_pages
    print(timeout_dict)


#save the timeout pages if needed
# with open('timeout_dict.pkl', 'wb') as f:
#     pickle.dump(timeout_dict, f)


In [91]:
df

Unnamed: 0,object number,title,culture,medium,classifications


In [92]:
err_object

['68370',
 '68372',
 '68373',
 '68377',
 '68394',
 '68395',
 '68680',
 '73666',
 '23534',
 '24170',
 '24171',
 '24172',
 '98480',
 '98560',
 '98711',
 '99161',
 '99242',
 '100201',
 '100722',
 '68394',
 '68395',
 '68680',
 '73666',
 '23534',
 '99336']

In [3]:
soup_key = BeautifulSoup(r.content,'html.parser')

#### find how many pages

In [4]:
page_n = soup_key.find('span', class_="maxPages").text[2:].strip()
print(f'there are {page_n} pages for keyword {keyword}.')

there are 116 pages for keyword dress.


#### find object number and download number

In [5]:
# this is the 12 images div info
k = soup_key.find_all('div',class_="primaryMedia")

In [6]:
len(k)

12

In [9]:
# href has the object number for info scraping
# img src has the number of download link
k[11]

<div class="primaryMedia img-wrap"><a href="/objects/537843/womans-wedding-dress;jsessionid=41E61B82497A26DB19F0FCDA0B436A3B?ctx=a73acb1e-8c32-4285-b176-9e8e1913efbd&amp;idx=719" title="Woman's wedding dress"><div class="emuseum-img-wrap width-img-wrap" style="max-height:279px;max-width:279px;"><img class="" src="/internal/media/dispatcher/1190013/resize%3Aformat%3Dthumbnail;jsessionid=41E61B82497A26DB19F0FCDA0B436A3B" title="Woman's wedding dress"/></div><!-- Default (format unspecific) block when media not available --><!-- Access is restricted --><!-- UNKNOWN mimetype --><!-- IMAGE blocks --><!-- AUDIO blocks --><!-- VIDEO blocks --><!-- ZOOMIFY block --><!-- PDF files --><!-- Media info --></a></div>

In [10]:
# find the object number
object_str = k[11].find('a').get('href')
object_str

'/objects/537843/womans-wedding-dress;jsessionid=41E61B82497A26DB19F0FCDA0B436A3B?ctx=a73acb1e-8c32-4285-b176-9e8e1913efbd&idx=719'

In [11]:
object_number = re.search('objects/(.*?)/', object_str).group(1)
object_number

'537843'

In [12]:
# url of the certain object
# for capturing info for dataframe
url_base = 'https://collections.mfa.org'
object_url = url_base + object_str
print(object_url)


https://collections.mfa.org/objects/537843/womans-wedding-dress;jsessionid=41E61B82497A26DB19F0FCDA0B436A3B?ctx=a73acb1e-8c32-4285-b176-9e8e1913efbd&idx=719


In [13]:
# find the download number of this object
# the download number is somehow different than the object number
download_str = k[11].find('img').get('src')
download_str

'/internal/media/dispatcher/1190013/resize%3Aformat%3Dthumbnail;jsessionid=41E61B82497A26DB19F0FCDA0B436A3B'

In [14]:
download_number = re.search('dispatcher/(.*?)/', download_str).group(1)
download_number

'1190013'

In [15]:
# the download url
url_base = 'https://collections.mfa.org'
download_url = f'https://collections.mfa.org/internal/media/dispatcher/{download_number}/resize%3Aformat%3Dfull?download'
print(download_url)

https://collections.mfa.org/internal/media/dispatcher/1190013/resize%3Aformat%3Dfull?download


#### check if image is downloadable

In [16]:
# get the bs for object page
req2 = requests.get(object_url)
soup2 = BeautifulSoup(req2.content,'html.parser')

In [37]:
# check if the image is downloadable
# there will be a 'download' button
# which believed to be in public domain
bl = soup2.body.findAll(text='DOWNLOAD')
if bool(bl) is True:
    print('This image is downloadable.')
else:
    print('This image cannot be downloaded.')

This image is downloadable.


#### save image

In [28]:
# get the name of the file
# create the filepath to save the image
dir = 'testing/'
filepath = dir + object_number + '.jpg'

print(filepath)

testing/537843.jpg


In [35]:
# save the image
z = urllib.request.urlopen(download_url)
output = open(filepath, 'wb')
output.write(z.read())
output.close()

#### save dataframe

In [38]:
# title
soup2.find('div',class_="detailField titleField").text

"Woman's wedding dress"

In [39]:
# year?
soup2.find('div',class_="detailField displayDateField").text.strip()

'about 1945'

In [40]:
soup2.title

<title>Woman's wedding dress – Works – Museum of Fine Arts, Boston</title>

In [41]:
soup2.find('div',class_="item-details-inner").get_text()

'Woman\'s wedding dressLabeled: II O II [Utility] (British, 1945–1950)\nEnglish\n\nabout 1945\nObject Place: EnglandMedium/Technique\nAcetate satin weaveDimensions\nCenter back: 188 cm (74 in.)\r\nCenter front: 135.9 x 74.9 cm (53 1/2 CF x 29 1/2 in. Waist measurement)\r\nBust: 38 1/2"Credit Line\nGift of Jean S. and Frederic A. SharfAccession Number2010.1381NOT ON VIEWCollectionsEurope, Textiles and Fashion ArtsClassificationsCostumesDescriptionWhite satin long sleeve wedding dress with white leaf patterning. Square neckline with padded shoulders. Waistline at natural waist with narrow pleating along waistband, self belt that ties in back. Long skirt with short train. Buttons down center back. Metal zipper on right side. Interior label at neckline: "II O II".ProvenanceBy 2010, with Leslie Verrinder, Tin Tin Collectables, London; 2010, sold by Verrinder to Frederic Sharf, Chestnut Hill, MA; 2010, year-end gift of Sharf to the MFA. (Accession date: January 26, 2011)'

In [18]:
# get the block of item details
tag = soup2.find('div',class_="item-details-inner")
tag

<div class="item-details-inner"><!-- LEFT COLUMN --><div class="col-lg-6"><div class="detailField titleField"><h2>Woman's wedding dress</h2><!--Woman's wedding dress--></div><div class="detailField peopleField">Labeled: <a href="/advancedsearch/Objects/peopleSearch%3AII%20O%20II%20%5BUtility%5D;jsessionid=9145A8B9C886A9D04F94F89AEBD17CEB">II O II [Utility]</a> (British, 1945–1950)</div><div class="detailField cultureField">
English
</div><div class="detailField displayDateField">
about 1945
</div><div class="detailField objectGeographyField">Object Place: England<br/><hr class="detailLine"/></div><div class="detailField mediumField"><span class="detailFieldLabel topLabel">Medium/Technique</span><span class="detailFieldValue">
Acetate satin weave<br/></span></div><div class="detailField dimensionsField != null"><span class="detailFieldLabel topLabel">Dimensions</span><span class="detailFieldValue">
Center back: 188 cm (74 in.)
<br/>
Center front: 135.9 x 74.9 cm (53 1/2 CF x 29 1/2 in. 

In [19]:
# locate the block that we need
tag1 = tag.contents[1]
tag1

<div class="col-lg-6"><div class="detailField titleField"><h2>Woman's wedding dress</h2><!--Woman's wedding dress--></div><div class="detailField peopleField">Labeled: <a href="/advancedsearch/Objects/peopleSearch%3AII%20O%20II%20%5BUtility%5D;jsessionid=9145A8B9C886A9D04F94F89AEBD17CEB">II O II [Utility]</a> (British, 1945–1950)</div><div class="detailField cultureField">
English
</div><div class="detailField displayDateField">
about 1945
</div><div class="detailField objectGeographyField">Object Place: England<br/><hr class="detailLine"/></div><div class="detailField mediumField"><span class="detailFieldLabel topLabel">Medium/Technique</span><span class="detailFieldValue">
Acetate satin weave<br/></span></div><div class="detailField dimensionsField != null"><span class="detailFieldLabel topLabel">Dimensions</span><span class="detailFieldValue">
Center back: 188 cm (74 in.)
<br/>
Center front: 135.9 x 74.9 cm (53 1/2 CF x 29 1/2 in. Waist measurement)
<br/>
Bust: 38 1/2"<br/></span></

In [45]:
# n of features
len(tag1)

12

In [47]:
tag1.contents[1]['class'][1]

'peopleField'

In [54]:
tag1.contents[2]['class'][1]

'cultureField'

In [20]:
# initiate df index
detailField = []
detailField.append('object number')
detailField.append('title')
for i in range(1,len(tag1)):
    detailField.append(tag1.contents[i]['class'][1][:-5])

df = pd.DataFrame(columns=detailField)
df

Unnamed: 0,object number,title,people,culture,displayDate,objectGeography,medium,dimensions,creditline,invnoline,onview,collectionTerms,classifications


In [22]:
print(detailField)

['object number', 'title', 'people', 'culture', 'displayDate', 'objectGeography', 'medium', 'dimensions', 'creditline', 'invnoline', 'onview', 'collectionTerms', 'classifications']


In [59]:
# get object info
detailContent = []

# add object number for record
detailContent.append(object_number)

# the title is special, add it first
detailContent.append(tag1.contents[0].text)

for i in range(1,len(tag1)):
    detailContent.append(tag1.contents[i].contents[-1].text.strip().replace('\n',''))

print(detailContent)

['537843', "Woman's wedding dress", '(British, 1945–1950)', 'English', 'about 1945', '', 'Acetate satin weave', 'Center back: 188 cm (74 in.)\rCenter front: 135.9 x 74.9 cm (53 1/2 CF x 29 1/2 in. Waist measurement)\rBust: 38 1/2"', 'Gift of Jean S. and Frederic A. Sharf', '2010.1381', 'NOT ON VIEW', 'Europe, Textiles and Fashion Arts', 'Costumes']


In [61]:
df_addon = pd.DataFrame(detailContent).T
df_addon.columns = detailField
df_addon

Unnamed: 0,object number,title,people,culture,displayDate,objectGeography,medium,dimensions,creditline,invnoline,onview,collectionTerms,classifications
0,537843,Woman's wedding dress,"(British, 1945–1950)",English,about 1945,,Acetate satin weave,"Center back: 188 cm (74 in.)\rCenter front: 135.9 x 74.9 cm (53 1/2 CF x 29 1/2 in. Waist measurement)\rBust: 38 1/2""",Gift of Jean S. and Frederic A. Sharf,2010.1381,NOT ON VIEW,"Europe, Textiles and Fashion Arts",Costumes


In [62]:
df = pd.concat([df,df_addon])

In [63]:
df

Unnamed: 0,object number,title,people,culture,displayDate,objectGeography,medium,dimensions,creditline,invnoline,onview,collectionTerms,classifications
0,537843,Woman's wedding dress,"(British, 1945–1950)",English,about 1945,,Acetate satin weave,"Center back: 188 cm (74 in.)\rCenter front: 135.9 x 74.9 cm (53 1/2 CF x 29 1/2 in. Waist measurement)\rBust: 38 1/2""",Gift of Jean S. and Frederic A. Sharf,2010.1381,NOT ON VIEW,"Europe, Textiles and Fashion Arts",Costumes


## define functions

## def func save_image()

In [None]:
def download_img(object_url,dir='testing/'):
    start = time.time()

    name = object_number
    url = object_url

    filename = dir + str(n) + '.jpg'

    try:    
        z = urllib.request.urlopen(url)
        output = open(filename, 'wb')
        output.write(z.read())
        output.close()

    name = list(name)
    url = list(url)

    error_id = []
    error_n = 0

    for n,u in zip(name,url):
        
        except:
            error_id.append(n)
            print(f'error id {n}')
            error_n += 1
            print(f'{error_n} errors in total')
            continue


    end = time.time()
    print(f'{end - start} secs slipped...')

In [6]:
x = soup.find_all('img')

In [7]:
t = list(x)

In [9]:
len(t)

13

In [8]:
# on one page there are 12 thumbnail images
# of the costumes we want
# the first one is unrelated, we should get rid of it
t[1:]

[<img class="" src="/internal/media/dispatcher/1544048/resize%3Aformat%3Dthumbnail;jsessionid=1A1A5B7352C444E0A2FB8651FF5CD4B7" title="Pinned dress and silver pin"/>,
 <img class="" src="/internal/media/dispatcher/1577155/resize%3Aformat%3Dthumbnail;jsessionid=1A1A5B7352C444E0A2FB8651FF5CD4B7" title="Look 7: Dress, pants, and ankle boots"/>,
 <img class="" src="/internal/media/dispatcher/1544173/resize%3Aformat%3Dthumbnail;jsessionid=1A1A5B7352C444E0A2FB8651FF5CD4B7" title="Woman's ensemble comprising dress, tights, shoes, and spats"/>,
 <img class="" src="/internal/media/dispatcher/1577158/resize%3Aformat%3Dthumbnail;jsessionid=1A1A5B7352C444E0A2FB8651FF5CD4B7" title="Woman's ensemble comprising dress, hood, and boots"/>,
 <img class="" src="/internal/media/dispatcher/1544063/resize%3Aformat%3Dthumbnail;jsessionid=1A1A5B7352C444E0A2FB8651FF5CD4B7" title="Sculpted dress and shoulder harness"/>,
 <img class="" src="/internal/media/dispatcher/1596815/resize%3Aformat%3Dthumbnail;jsessioni

In [10]:
# we need to locate the numbers after despather
# it's the object name we need for downloading
for link in t[1:]:
    found = re.search('dispatcher/(.*?)/',link.get('src')).group(1)
    print(found)
    # print(link.get('src'))

1544048
1577155
1544173
1577158
1544063
1596815
1634455
1634458
1588174
1590035
1590036
1590038


In [66]:
# the download ref number is different than object number
object_num = '815857'
download_page = f'https://collections.mfa.org/download/{object_num}'
download_page

'https://collections.mfa.org/download/815857'

In [68]:
# get the full size download link from the download page
url = download_page
req = requests.get(url)
soup = BeautifulSoup(req.content,'html.parser')
soup.find_all('a','target'="_blank")


# <a target="_blank" href="https://collections.mfa.org/internal/media/dispatcher/757433/resize%3Aformat%3Dfull?download"><button class="emuseum-button " type="button">
# Download
# </button></a>

SyntaxError: expression cannot contain assignment, perhaps you meant "=="? (2056816795.py, line 5)

In [64]:

download_link = f'https://collections.mfa.org/internal/media/dispatcher/{object_num}/resize%3Aformat%3Dfull?download'

print(download_link)

https://collections.mfa.org/internal/media/dispatcher/815857/resize%3Aformat%3Dfull?download


In [65]:
z = urllib.request.urlopen(download_link)
output = open('test.jpg', 'wb')
output.write(z.read())
output.close()