## setup

In [1]:
import ast
import json
import os
import os.path
import time
import urllib
import urllib.request

import cv2
import pandas as pd
import requests
from bs4 import BeautifulSoup

# show all dataframe
pd.set_option('display.max_colwidth', None) 
pd.set_option('display.max_columns', None)  

## V&A

guidelines here:<br>
https://api.vam.ac.uk/docs#/Archives/archive_search_v2_archives_search_get<br>
https://developers.vam.ac.uk/guide/v2/images/introduction.html#images-top

In [2]:
# checkout a topic
url = "https://api.vam.ac.uk/v2/objects/search?q_object_name=robe&page_size=100&images=true&images_exist=true"
r = requests.get(url)
r

<Response [200]>

In [3]:
r.json().keys()

dict_keys(['info', 'records', 'clusters'])

In [4]:
r.json()['info'] # the 'pages' count could be used to iterate over the result pages

{'version': '2.0',
 'record_count': 5582,
 'record_count_exact': True,
 'parameters': {},
 'page_size': 100,
 'pages': 56,
 'page': 1,
 'image_count': 6261}

## get the dataframe for selected topics

### prototype to scrape a certain topic

In [5]:
# continue above

# get the column names of the records returned by the request above;
# they are used to initialise the full dataframe
records_list = r.json()['records']
records_df = pd.DataFrame.from_dict(records_list)
records_df_columns = records_df.columns

# initiate an empty dataframe (columns only) to be filled with the scraped info
df = pd.DataFrame(columns=records_df_columns)

In [6]:
df # check if initiated

Unnamed: 0,systemNumber,accessionNumber,objectType,_currentLocation,_primaryTitle,_primaryMaker,_primaryImageId,_primaryDate,_primaryPlace,_warningTypes,_images


In [7]:
keyword = 'dress'

In [8]:
# first request for this keyword; its 'info' block is read for the paging metadata
urlpath = f'https://api.vam.ac.uk/v2/objects/search?q_object_name={keyword}&page_size=100&images=true&images_exist=true'
rr = requests.get(urlpath)

# 'pages' is the number of result pages, 'record_count' the total number of records
info_dict = rr.json()['info'] 
pages_n = info_dict['pages']
record_n = info_dict['record_count']
print(f'{pages_n} pages for keyword {keyword}')
print(f'{record_n} records for keyword {keyword}')

74 pages for keyword dress
7321 records for keyword dress


In [9]:
# start timer
start = time.time()

# Collect every page first and concatenate once at the end: calling pd.concat
# inside the loop re-copies all previously fetched rows on every iteration.
page_frames = []
for p in range(1,pages_n+1):
    urlpath = f'https://api.vam.ac.uk/v2/objects/search?q_object_name={keyword}&page_size=100&images=true&images_exist=true&page={p}'
    # a timeout stops a stalled connection from hanging the notebook forever
    rr = requests.get(urlpath, timeout=30)
    # fail loudly on an HTTP error instead of trying to parse an error body
    rr.raise_for_status()
    page_frames.append(pd.DataFrame.from_dict(rr.json()['records']))
df = pd.concat([df, *page_frames])

# timer stops
end = time.time()
print(f'{end-start} seconds passed..')

47.67231202125549 seconds passed..


In [10]:
df.head(1)

Unnamed: 0,systemNumber,accessionNumber,objectType,_currentLocation,_primaryTitle,_primaryMaker,_primaryImageId,_primaryDate,_primaryPlace,_warningTypes,_images
0,O63107,T.237-2001,Dress,"{'id': 'THES50141', 'displayName': 'In store', 'type': 'storage', 'site': 'BH', 'onDisplay': False, 'detail': {'free': '', 'case': '', 'shelf': '', 'box': ''}}",Dress,"{'name': 'Tam, Vivienne', 'association': 'Maker, Designer'}",2020MU3350,1999,USA,[],"{'_primary_thumbnail': 'https://framemark.vam.ac.uk/collections/2020MU3350/full/!100,100/0/default.jpg', '_iiif_image_base_url': 'https://framemark.vam.ac.uk/collections/2020MU3350/', '_iiif_presentation_url': None, 'imageResolution': 'low'}"


### define the function

In [39]:
def scrape_dataframe(keyword='robe', session=None):
    """Fetch every V&A search result for ``keyword`` into one dataframe.

    The V&A search API is paged, so the function requests page 1 to learn how
    many pages exist and then walks the remaining pages.

    Parameters
    ----------
    keyword : str
        Value sent as the ``q_object_name`` query parameter.
    session : object, optional
        Anything exposing ``get(url, params=..., timeout=...)`` like a
        ``requests.Session``. Defaults to the ``requests`` module itself.
        Passing a session allows connection reuse across pages.

    Returns
    -------
    pandas.DataFrame
        One row per record, pages stacked in order. The per-page index is
        kept as-is (it restarts at 0 on every page), matching the frames
        produced by ``pd.DataFrame.from_dict`` for each page.

    Raises
    ------
    Whatever ``raise_for_status()`` of the response raises on an HTTP error
    (``requests.HTTPError`` when the default client is used).
    """
    # start timer
    start = time.time()

    http = requests if session is None else session
    search_url = 'https://api.vam.ac.uk/v2/objects/search'
    # Passing the query as params lets the HTTP client URL-encode the keyword
    # (spaces, '&', ...) instead of pasting it raw into the URL.
    query = {
        'q_object_name': keyword,
        'page_size': 100,
        'images': 'true',
        'images_exist': 'true',
    }

    def fetch_page(page):
        """Return the decoded JSON body of one result page."""
        # timeout: a stalled connection must not hang the notebook forever
        response = http.get(search_url, params={**query, 'page': page}, timeout=30)
        # fail loudly rather than trying to parse an error body
        response.raise_for_status()
        return response.json()

    # Page 1 gives both the paging metadata and the first batch of records,
    # so it is fetched exactly once.
    first_page = fetch_page(1)
    info_dict = first_page['info']
    pages_n = info_dict['pages'] # for iterate
    record_n = info_dict['record_count']
    print(f'we found {record_n} records for keyword {keyword}')

    # Collect the per-page frames and concatenate once at the end; growing a
    # dataframe with pd.concat inside the loop copies all rows on every pass.
    frames = [pd.DataFrame.from_dict(first_page['records'])]
    for p in range(2, pages_n + 1):
        frames.append(pd.DataFrame.from_dict(fetch_page(p)['records']))
    df = pd.concat(frames)

    # timer stops
    end = time.time()
    print(f'{end-start} seconds for scraping {keyword}..')

    return df

In [40]:
# test the function
df_test = scrape_dataframe('scarf')
df_test.head(1)

we found 782 records for keyword scarf
5.681551933288574 seconds for scraping scarf..


Unnamed: 0,systemNumber,accessionNumber,objectType,_currentLocation,_primaryTitle,_primaryMaker,_primaryImageId,_primaryDate,_primaryPlace,_warningTypes,_images
0,O102827,T.127-1997,Scarf,"{'id': 'THES50141', 'displayName': 'In store', 'type': 'storage', 'site': 'BH', 'onDisplay': False, 'detail': {'free': '', 'case': '', 'shelf': '', 'box': ''}}",Scarf on Scarf,"{'name': 'Earley, Rebecca', 'association': 'designer'}",2006AP6466,1996,England,[],"{'_primary_thumbnail': 'https://framemark.vam.ac.uk/collections/2006AP6466/full/!100,100/0/default.jpg', '_iiif_image_base_url': 'https://framemark.vam.ac.uk/collections/2006AP6466/', '_iiif_presentation_url': None, 'imageResolution': 'low'}"


In [41]:
# keep only the first three rows of the test frame
df_test = df_test.head(3)

### scrape

In [15]:
# initiate an empty dataframe
url = "https://api.vam.ac.uk/v2/objects/search?q_object_name=robe&page_size=100&images=true&images_exist=true"
r = requests.get(url)

# get the column names of the returned records;
# they are used to initialise the full dataframe
records_list = r.json()['records']
records_df = pd.DataFrame.from_dict(records_list)
records_df_columns = records_df.columns

# initiate an empty dataframe (columns only) to be filled with the scraped info
df = pd.DataFrame(columns=records_df_columns)

In [16]:
#check
df

Unnamed: 0,systemNumber,accessionNumber,objectType,_currentLocation,_primaryTitle,_primaryMaker,_primaryImageId,_primaryDate,_primaryPlace,_warningTypes,_images


In [17]:
# hand picked some keywords
# checked the keywords on VA website

keywords = ['costume', 'robe', 'dress', 'shirt','ensemble','jacket','coat','suit','trousers']

In [18]:
# Scrape every keyword first, then concatenate once: growing df with
# pd.concat inside the loop re-copies all accumulated rows on each pass.
keyword_frames = [scrape_dataframe(k) for k in keywords]
df = pd.concat([df, *keyword_frames])

we found 16896 records for keyword costume
65.27326703071594 seconds for scraping costume..
we found 5582 records for keyword robe
35.6450834274292 seconds for scraping robe..
we found 7321 records for keyword dress
48.430097341537476 seconds for scraping dress..
we found 917 records for keyword shirt
6.8267552852630615 seconds for scraping shirt..
we found 399 records for keyword ensemble
3.2287628650665283 seconds for scraping ensemble..
we found 835 records for keyword jacket
6.3819639682769775 seconds for scraping jacket..
we found 763 records for keyword coat
5.62924599647522 seconds for scraping coat..
we found 979 records for keyword suit
7.155668497085571 seconds for scraping suit..
we found 360 records for keyword trousers
3.063270330429077 seconds for scraping trousers..


In [19]:
df.shape

(27156, 11)

In [20]:
# the same object can be returned for several keywords: keep one row per image id
df = df.drop_duplicates(subset='_primaryImageId')

In [21]:
df.to_csv(r'scrapped_data/VA_uncleaned.csv', index = False)

In [2]:
df = pd.read_csv('scrapped_data/VA_uncleaned.csv') 

In [3]:
df.shape

(26595, 11)

## filter more before download

### hand-pick certain object types

After a download test, we found many unrelated images.

we could use 'object type' to filter.<br>
use the VA website search to determine if the object type needs to be removed.

In [4]:
df['objectType'] = df['objectType'].map(lambda x: str(x).lower())

In [5]:
df['objectType'].nunique()

584

In [6]:
df['objectType'].unique()

array(['costume', 'theatre costume', 'film costume', 'costume design',
       'ballet costume', 'opera costume', 'dance costume',
       'puppet costume', 'stage costume', 'fancy dress costume',
       'circus costume', 'costume plate', 'clown costume',
       'rock and pop costume', 'jacket', 'print', 'drawing',
       'fashion plate', 'nan', "doll's outfit", 'ensemble', 'bodice',
       'theatre design', 'figurine', 'oil painting', 'design',
       'aquatint etching', 'statuette', 'photograph', 'poster', 'figure',
       'relief', 'screenprint', 'fashion design', 'costume sketch',
       'costume doll', 'masks (costume)', 'cabaret costume',
       'costume accessory', 'marionette costume', 'walking costume',
       'bathing costume', 'national costume', 'golf costume',
       'costume designs', 'costume dolls', 'usher costume',
       'performance costume', 'costume print', 'tv costume',
       "man's costume", "acrobat's costume", 'theatrical costume',
       'costume doll ', 'robe'

In [15]:
df['objectType'].value_counts().head(60)

costume design                7379
fashion design                5388
dress fabric                  3774
theatre costume               1723
dress                         1141
evening dress                  638
jacket                         552
shirt                          437
coat                           379
t-shirt                        338
print                          334
ensemble                       332
trousers                       276
suit                           233
costume                        207
robe                           188
skirt suit                     175
drawing                        159
photograph                     140
wedding dress                  134
dress panel                    130
dance costume                  115
fashion plate                   94
trouser suit                    68
dress ornament                  65
day dress                       62
dress trimming                  61
book jacket                     61
costume doll        

In [16]:
sel = ['costume design', 'fashion design','dress fabric','photograph',
        'print','drawing','dress panel','costume doll',
        'dress trimming','dress-making sample','book jacket','painting',
        'fancy dress costume design','watercolour','etching','costume sketch',
        'oil painting','costume plate','theatre costume']

In [17]:
# Keep only the rows whose object type is NOT in the hand-picked exclusion
# list. A boolean mask filters by value, so it stays correct even when the
# index contains repeated labels (df.drop(index_labels) would then remove
# every row sharing a label). .copy() gives an independent frame so the
# column assignments further down do not write into a slice.
df = df[~df['objectType'].isin(sel)].copy()

In [18]:
df.shape

(7096, 11)

In [24]:
# easy check an object
sysnumber = 'O1682165'
df[df['systemNumber']==sysnumber]

Unnamed: 0,systemNumber,accessionNumber,objectType,_currentLocation,_primaryTitle,_primaryMaker,_primaryImageId,_primaryDate,_primaryPlace,_warningTypes,_images
3,O1682165,S.8:1 to 5-2022,costume,"{'id': 'THES49929', 'displayName': 'Theatre & Performance, Room 104', 'type': 'Display', 'site': 'VA', 'onDisplay': True, 'detail': {'free': '', 'case': 'PL1', 'shelf': 'EXP', 'box': 'FIG1'}}",costume,"{'name': 'Slade, Gabriella', 'association': 'designer'}",2022NG6373,2020,New York,[],"{'_primary_thumbnail': 'https://framemark.vam.ac.uk/collections/2022NG6373/full/!100,100/0/default.jpg', '_iiif_image_base_url': 'https://framemark.vam.ac.uk/collections/2022NG6373/', '_iiif_presentation_url': 'https://iiif.vam.ac.uk/collections/O1682165/manifest.json', 'imageResolution': 'high'}"


## image download

### extract base url

In [25]:
# '_images' holds text here (the values are passed to a parser), so turn each
# value back into a dict. ast.literal_eval only accepts Python literals and
# never executes code, unlike eval on file contents.
df['image_value'] = df['_images'].map(ast.literal_eval)

In [26]:
# replace each parsed dict by a view of its values
df['image_value'] = df['image_value'].map(dict.values)

In [27]:
# Look the IIIF base URL up by its key rather than by position: list(x)[1]
# silently returns the wrong field if the key order of the record changes.
# '_images' may hold dicts or their text form, so text is parsed first.
df['image_base'] = df['_images'].map(
    lambda x: (ast.literal_eval(x) if isinstance(x, str) else x)['_iiif_image_base_url']
)

In [28]:
df.head(1)

Unnamed: 0,systemNumber,accessionNumber,objectType,_currentLocation,_primaryTitle,_primaryMaker,_primaryImageId,_primaryDate,_primaryPlace,_warningTypes,_images,image_value,image_base
0,O1314778,S.1658-2015,costume,"{'id': 'THES49318', 'displayName': 'In store', 'type': 'storage', 'site': 'BH', 'onDisplay': False, 'detail': {'free': '', 'case': '', 'shelf': '', 'box': ''}}",Costume,"{'name': 'Strassner, Joe', 'association': 'designers'}",2015HV9075,1936,Great Britain,[],"{'_primary_thumbnail': 'https://framemark.vam.ac.uk/collections/2015HV9075/full/!100,100/0/default.jpg', '_iiif_image_base_url': 'https://framemark.vam.ac.uk/collections/2015HV9075/', '_iiif_presentation_url': None, 'imageResolution': 'low'}","(https://framemark.vam.ac.uk/collections/2015HV9075/full/!100,100/0/default.jpg, https://framemark.vam.ac.uk/collections/2015HV9075/, None, low)",https://framemark.vam.ac.uk/collections/2015HV9075/


In [44]:
# assign() returns a new frame, so nothing is written into the .head() slice
# (avoids SettingWithCopyWarning) and re-running the cell is harmless.
# The base URL is read by key instead of by its position among the values.
df_test = df_test.assign(
    image_value=df_test['_images'].map(lambda x: x.values()),
    image_base=df_test['_images'].map(lambda x: x['_iiif_image_base_url']),
)

### prototype

In [None]:
test_sample = df.sample(1,random_state = 2333)
test_sample.columns

Index(['systemNumber', 'accessionNumber', 'objectType', '_currentLocation',
       '_primaryTitle', '_primaryMaker', '_primaryImageId', '_primaryDate',
      dtype='object')

In [None]:
image_url_dict = test_sample['_images'].to_dict()

In [None]:
k = list(image_url_dict.keys())
k=k[0]

In [None]:
image_base_url = image_url_dict[k]['_iiif_image_base_url']

In [None]:
image_base_url

'https://framemark.vam.ac.uk/collections/2011EP4395/'

In [None]:
image_download_url = image_base_url + 'full/full/0/default.jpg'
image_download_url

'https://framemark.vam.ac.uk/collections/2011EP4395/full/full/0/default.jpg'

In [None]:
# get the name of the file
# create the filepath to save the image
image_dir = 'clean image/VA/'  # not called `dir`: that would shadow the builtin for the whole notebook
# test_sample has exactly one row; take the scalar directly instead of
# formatting the Series to text with to_string()
systemNumber = test_sample['systemNumber'].iloc[0]
filepath = image_dir + systemNumber + '.jpg'

print(filepath)

clean image/VA/O1174504.jpg


In [None]:
# save the image
# Read the whole response before opening the target file, so a failed
# download does not leave an empty file behind; the context managers close
# the HTTP response and the file even when an error occurs.
with urllib.request.urlopen(image_download_url) as z:
    image_bytes = z.read()
with open(filepath, 'wb') as output:
    output.write(image_bytes)

In [None]:
df_test['image_base']

0    https://framemark.vam.ac.uk/collections/2006AP6466/
1    https://framemark.vam.ac.uk/collections/2019LM0853/
2    https://framemark.vam.ac.uk/collections/2019LM1011/
Name: image_base, dtype: object

### define the function

In [31]:
def download_image(dir = 'clean image/VA/', df=None):
    """Download the full-size image of every row of ``df`` into ``dir``.

    For each row the image URL is ``image_base + 'full/full/0/default.jpg'``
    and the file is saved as ``<systemNumber>.jpg`` inside ``dir``.

    Parameters
    ----------
    dir : str
        Target folder; it is created (including missing parent folders) when
        it does not exist. A trailing slash is optional.
    df : pandas.DataFrame, optional
        Frame with the columns ``systemNumber`` and ``image_base``. When
        omitted, the module-level ``df`` is looked up at call time (the old
        ``df=df`` default froze whatever ``df`` was when the function was
        defined, and failed to define the function if no ``df`` existed yet).

    Returns
    -------
    list
        The ``systemNumber`` values whose download failed, in order, so the
        caller can inspect or retry them. Empty when everything succeeded.

    Raises
    ------
    ValueError
        If ``df`` is omitted and no module-level ``df`` exists.
    """
    if df is None:
        df = globals().get('df')
        if df is None:
            raise ValueError('download_image needs a dataframe: pass df=...')

    start = time.time()
    print(f'{df.shape[0]} images will be downloaded.')

    # create the folder if it does not exist yet; makedirs also creates
    # missing parents such as 'clean image/' for 'clean image/VA_sel/'
    os.makedirs(dir, exist_ok=True)

    error_id = []
    for n, u in zip(df['systemNumber'], df['image_base']):
        filename = os.path.join(dir, f'{n}.jpg')
        uu = f'{u}full/full/0/default.jpg'
        try:
            # Read the complete response before touching the disk so a failed
            # download never leaves an empty or truncated .jpg behind.
            with urllib.request.urlopen(uu, timeout=60) as z:
                image_bytes = z.read()
            with open(filename, 'wb') as output:
                output.write(image_bytes)
        except Exception:
            # Best effort: record the failure and move on to the next image.
            # `Exception` (not a bare except) lets Ctrl-C / kernel interrupt
            # stop a multi-hour download.
            error_id.append(n)
            print(f'error id {n}')
            print(f'{len(error_id)} errors in total')

    end = time.time()
    print(f'{end - start} secs slipped...')
    return error_id

In [45]:
# test
download_image('vatest/', df_test)

3 images will be downloaded.
3.106887102127075 secs slipped...


### scrape

In [33]:
df.shape

(7096, 13)

In [32]:
download_image('clean image/VA_sel/', df)

7096 images will be downloaded.
9284.307857513428 secs slipped...


In [34]:
df.to_csv(r'scrapped_data/VA_all_sel.csv', index = False)

In [35]:
df = pd.read_csv('scrapped_data/VA_all_sel.csv') 