In [36]:
from pattern import web
import requests
from bs4 import BeautifulSoup
import doctest
import re
import pandas as pd
import sys
# Python 2 only: make UTF-8 the interpreter-wide default encoding so that the
# implicit str()/unicode conversions performed on scraped page text (which
# contains non-ASCII characters such as Greek inscriptions) do not fail.
# sys.stdout is saved first and put back afterwards so that output keeps going
# to the stream that was active before reload(sys) was called.
# NOTE(review): relies on reload(sys) re-exposing sys.setdefaultencoding; this
# idiom does not exist on Python 3 -- confirm the target interpreter.
stdout = sys.stdout
reload(sys)
sys.setdefaultencoding('utf-8')
sys.stdout = stdout

In [2]:
def getPages(url):
    '''
    Function
    --------
    Works out the first and last page numbers of a British Museum
    collection search.

    Parameters
    ----------
    url: str
        Address of the first results page of the query

    Returns
    -------
    A tuple (first_page, last_page). first_page is always 1; last_page is
    read from the paging control of the results page, or is 1 when the
    page has no paging control.

    Example
    -------
    For "http://www.britishmuseum.org/research/collection_online/search.aspx?searchText=augustus"
    it would return (1, 474)

    Doctests
    --------
    #Regular Case
    >>> getPages('http://www.britishmuseum.org/research/collection_online/search.aspx?searchText=augustus')
    (1, 474)

    #Only one results page
    >>> getPages('http://www.britishmuseum.org/research/collection_online/search.aspx?searchText=1855,0512.40')
    (1, 1)
    '''
    #download the results page and parse it into a web.Element tree
    dom = web.Element(requests.get(url).text)

    #no paging control on the page: everything fits on a single page
    paging = dom.by_class('colSearchPaging')
    if not paging:
        return (1, 1)

    #the last page number is the content of the first child of the
    #fourth-from-last entry of the paging control
    last_entry = paging[0].children[-4]
    return (1, int(last_entry.children[0].content))
    
#doctest.testmod()

In [3]:
def getLinks(url):
    '''
    Function
    --------
    Collects the urls of every object listed on one results page

    Parameter
    ---------
    url: str
        A search results page of the British Museum website

    Returns
    -------
    A list of str, one absolute object-page url per object found, in the
    order the rows appear (within a row: 'noImage' entries first, then
    'image' entries)

    Doctest
    -------
    >>> getLinks('http://www.britishmuseum.org/research/collection_online/search.aspx?searchText=1855,0512.40')
    [u'http://www.britishmuseum.org/research/collection_online/collection_object_details.aspx?objectId=1128207&partId=1&searchText=1855%2c0512.40&page=1', u'http://www.britishmuseum.org/research/collection_online/collection_object_details.aspx?objectId=3629209&partId=1&searchText=1855%2c0512.40&page=1']
    '''
    base_url = 'http://www.britishmuseum.org/research/collection_online/'

    #download the results page and parse it into a web.Element tree
    dom = web.Element(requests.get(url).text)

    #for every entry of every result row: keep the last path segment of its
    #href, drop that segment's first five characters, and prefix the base url
    return [
        base_url + entry.attr['href'].split('/')[-1][5:]
        for row in dom.by_class('grid_12 alpha row colResults')
        for entry in row.by_class('noImage') + row.by_class('image')
    ]

# Run every doctest defined so far in this session. The examples call
# getPages/getLinks on real urls, so they perform live HTTP requests and their
# expected values depend on the current content of the website.
doctest.testmod()

TestResults(failed=0, attempted=3)

In [37]:
def _parseWeight(dimensions):
    '''Returns the number (as a string) of the last 'Weight: ...' entry, or None if there is none.'''
    weight = None
    for dim in dimensions:
        split_dim = dim.split(': ')
        #only exact 'Weight' labels that actually carry a value
        if len(split_dim) < 2 or split_dim[0] != 'Weight':
            continue
        #accept whole numbers as well as decimals
        numbers = re.findall(r'\d+(?:\.\d+)?', split_dim[1])
        if numbers:
            weight = numbers[0]
    return weight


def _parseInscriptions(tokens):
    '''Serialises inscription label/value tokens as 'Label:value;' runs, one '|'-separated run per inscription.'''
    finished = []   #completed inscriptions
    current = ''    #inscription currently being assembled
    count = len(tokens)
    i = 0

    while i < count:
        #skip anything that is not an 'Inscription <something>' label
        if 'Inscription ' not in tokens[i]:
            i += 1
            continue

        #'Inscription Type' marks the beginning of a new inscription
        if tokens[i] == 'Inscription Type' and current != '':
            finished.append(current)
            current = ''

        descriptor = str(tokens[i])
        i += 1

        #a label at the very end has no value; record it empty instead of
        #indexing past the end of the token list
        if i >= count:
            current += descriptor + ':' + ';'
            break

        #the token right after a label is always taken as (the start of) its value
        description = str(tokens[i])
        i += 1

        #every following token up to the next label belongs to the same value
        while i < count and 'Inscription ' not in tokens[i]:
            description += ' ' + tokens[i]
            i += 1

        current += descriptor + ':' + description + ';'

    return '|'.join(finished + [current])


def getDetails(url):
    '''
    Function
    --------
    Reads the data off of the given British Museum item page and puts it into a dictionary

    Parameter
    ---------
    url: str
        The url of the British Museum item page

    Returns
    -------
    A dictionary with two keys:
    'result':  dictionary of the fields read from the page. It always
               contains 'url'. Text fields are stored as a single string in
               which every piece of text is followed by one space, list
               fields ('Subjects', 'Authority', 'Associated names') as a
               list of strings, 'Weight (g)' as the number found in the
               'Weight' entry of 'Dimensions', and 'Inscriptions' as
               'Label:value;' runs with '|' between inscriptions.
    'skipped': the url if the page could not be downloaded or has no
               'objectDetails' section, else ''
    '''
    result = {'url': url}
    skipped = ''
    #create list of descriptors
    wanted_strs = ['Museum number', 'Denomination', 'Description', 'State',
                   'Culture/period', 'Date', 'Materials',
                   'Production place', 'Curator\'s comments', 'Bibliography',
                   'Object type', 'Weight']
    wanted_lists = ['Subjects', 'Authority', 'Associated names']

    #get page and convert to web.Element object; a failed download is
    #reported through 'skipped' rather than aborting the caller
    try:
        html = requests.get(url).text
    except requests.RequestException:
        return {'result': result, 'skipped': url}
    dom = web.Element(html)

    #gets portion of page containing description
    try:
        details = dom.by_class('objectDetails')[0].children
    except Exception:
        #section missing (or page not parseable): nothing to read
        details = []
        skipped = url

    #iterate over and populate the result dictionary
    for detail in details:
        detail = list(BeautifulSoup(str(detail), 'html.parser').stripped_strings)

        #a usable entry is a label followed by at least one value
        if len(detail) < 2:
            continue

        desc = detail[0]
        values = detail[1:]

        if desc in wanted_strs:
            #trailing space after every piece is kept on purpose (existing output format)
            result[desc] = ''.join(value + ' ' for value in values)

        elif desc in wanted_lists:
            result[desc] = list(values)

        elif desc == 'Dimensions':
            weight = _parseWeight(values)
            if weight is not None:
                result['Weight (g)'] = weight

        elif desc == 'Inscriptions':
            result[desc] = _parseInscriptions(values)

    return {'result': result, 'skipped': skipped}
              
              
# Smoke test: fetch one item page (live HTTP request) and display the parsed record
getDetails('http://www.britishmuseum.org/research/collection_online/collection_object_details.aspx?objectId=3457746&partId=1&searchText=augustus&lookup-people=e.g.+Hokusai%2c+Ramesses&people=&lookup-place=e.g.+India%2c+Shanghai%2c+Thebes&place=&from=bc&fromDate=44&to=ad&toDate=14&lookup-object=coins&object=&lookup-subject=e.g.+farming%2c+New+Testament&subject=&lookup-matcult=e.g.+Choson+Dynasty%2c+Ptolemaic&matcult=&lookup-technique=e.g.+carved%2c+celadon-glazed&technique=&lookup-school=e.g.+French%2c+Mughal+Style&school=&lookup-material=e.g.+canvas%2c+porcelain%2c+silk&material=&lookup-ethname=e.g.+Hmong%2c+Maori%2c+Tai&ethname=&lookup-ware=e.g.+Imari+ware%2c+Qingbai+ware&ware=&lookup-escape=e.g.+cylinder%2c+gravity%2c+lever&escape=&lookup-bibliography=&bibliography=&citation=&museumno=&catalogueOnly=&view=&page=6')

{'result': {u'Associated names': [u'Portrait of: Augustus (Octavian)'],
  u'Authority': [u'Ruler:  Augustus (Octavian)'],
  u'Bibliography': u'BMC Greek (Galatia) 3.p243 RPC1 4452/1 ',
  u'Culture/period': u'Roman Provincial ',
  u'Date': u'27 BC-14 ',
  u'Description': u'Copper alloy coin.(obverse) Bare head of Augustus, right. (reverse) Goddess seated, left; in right hand poppy-head; left hand on sceptre; at feet forepart of sphinx, wearing modius. ',
  u'Inscriptions': 'Inscription Type:inscription;Inscription Script:Greek;Inscription Position:reverse;Inscription Language:Greek;Inscription Content:\xce\x93\xce\x91\xce\x92\xce\x91\xce\x9b\xce\x95\xce\xa9\xce\x9d;Inscription Transliteration:GABALEON;|Inscription Type:inscription;Inscription Script:Greek;Inscription Position:reverse;Inscription Language:Greek;Inscription Content:\xce\x94\xce\x9c;Inscription Transliteration:DM;Inscription Comment:left field;|Inscription Type:inscription;Inscription Script:Greek;Inscription Position:reve

In [38]:
# Search query: text "augustus", object lookup "coins", dates from 44 BC to AD 14
url = 'http://www.britishmuseum.org/research/collection_online/search.aspx?searchText=augustus&lookup-people=e.g.+Hokusai%2C+Ramesses&people=&lookup-place=e.g.+India%2C+Shanghai%2C+Thebes&place=&from=bc&fromDate=44&to=ad&toDate=14&lookup-object=coins&object=&lookup-subject=e.g.+farming%2C+New+Testament&subject=&lookup-matcult=e.g.+Choson+Dynasty%2C+Ptolemaic&matcult=&lookup-technique=e.g.+carved%2C+celadon-glazed&technique=&lookup-school=e.g.+French%2C+Mughal+Style&school=&lookup-material=e.g.+canvas%2C+porcelain%2C+silk&material=&lookup-ethname=e.g.+Hmong%2C+Maori%2C+Tai&ethname=&lookup-ware=e.g.+Imari+ware%2C+Qingbai+ware&ware=&lookup-escape=e.g.+cylinder%2C+gravity%2C+lever&escape=&lookup-bibliography=&bibliography=&citation=&museumno=&catalogueOnly=&view='
# Leave the page parameter open; the page number is appended to it by later code
url = url + '&page='

In [40]:
# Upper bound on how often a page that could not be read is fetched again
MAX_RETRIES = 3

# Page range of the query, then the object links of every results page
pages = getPages(url)
links = [link
         for page_number in range(1, pages[1] + 1)
         for link in getLinks(url + str(page_number))]

# Fetch every object page. A page reported as skipped is retried a bounded
# number of times; success is judged by the 'skipped' field (the 'result'
# dictionary is never empty because it always holds the url). Exactly one
# record is stored per link: if every attempt fails, the record holds only
# the url, so the failure stays visible in the exported data.
details = []
for link in links:
    datum = getDetails(link)
    attempts = 0
    while datum['skipped'] and attempts < MAX_RETRIES:
        datum = getDetails(datum['skipped'])
        attempts += 1
    details.append(datum['result'])
            

In [41]:
# Sanity check: number of records collected
len(details)

3295

In [42]:
# One row per collected record; the columns are the keys found across the
# record dictionaries
df = pd.DataFrame(details)
# Written as UTF-8 because the records contain non-ASCII text
df.to_csv('AugustusCoins_44BC-14AD.csv', encoding='utf-8')