# MoMA Online Archive Scrape

![img](img/banner.png)

This project uses the `urllib` and `BeautifulSoup` libraries to scrape data from the Museum of Modern Art website.

In [153]:
import urllib
import re
from bs4 import BeautifulSoup

def build_url(term, page=1):
    """Build the URL of an artist's page on the MoMA website.

    Args:
        term: Value placed in the ``/artists/<term>`` path segment (this
            notebook passes the artist's numeric MoMA id).
        page: Value of the ``page`` query parameter. Defaults to 1, which
            reproduces the URL that was previously hard-coded.

    Returns:
        The URL as a string, always with ``locale=en``.
    """
    return f'https://www.moma.org/artists/{term}?locale=en&page={page}'

def get_url_response(url):
    """Fetch *url* and return the raw response body as bytes.

    Errors raised by ``urllib.request.urlopen`` (for example
    ``urllib.error.URLError``) are not caught here.
    """
    # A bare ``import urllib`` does not load the ``request`` submodule, so
    # ``urllib.request`` only resolves if some other module happened to import
    # it first. Import it explicitly so the function works in a fresh
    # interpreter.
    import urllib.request

    req = urllib.request.Request(url)
    with urllib.request.urlopen(req) as response:
        return response.read()

def build_a_soup(query):
    """Fetch the MoMA page for *query* and parse it with BeautifulSoup.

    The page URL comes from ``build_url`` and the HTML from
    ``get_url_response``; parsing uses the built-in ``html.parser`` backend.
    """
    page_url = build_url(query)
    html = get_url_response(page_url)
    return BeautifulSoup(html, 'html.parser')

In [154]:
# Each artist's archive has an associated ID.
# Create a dictionary that maps each artist's name to their ID
# on the MoMA website.
# Keyed by lower-case artist name; each record repeats that name next to
# the artist's id, so every name is written only once here.
artist_ids = {
    name: {"name": name, "id": moma_id}
    for name, moma_id in (
        ("le corbusier", 3426),
        ("mies van der rohe", 7166),
        ("antoni gaudi", 2096),
    )
}

In [155]:
# Create a BeautifulSoup class for each artist
# One page fetch per artist, issued in the order the names are listed.
le_corbusier, mies_van_der_rohe, antoni_gaudi = (
    build_a_soup(artist_ids[artist_name]['id'])
    for artist_name in ('le corbusier', 'mies van der rohe', 'antoni gaudi')
)

## Structure of Response Data

The HTML section we want to extract data from has the following structure:

```
<li> - class "grid-item--work"
└── <a> tag - link to artwork page
    ├── <div> 
    │   └── <picture> tag
    │       └── <img> tag
    └── <div> with artwork data
        └── <h3>
            ├── <span> - artist name
            ├── <span> - artwork name
            └── <span> - artwork date
```

The goal is to pull the artwork data inside the `h3` tag.

In [157]:
# Find out how many artworks are in MoMA's online collection
# for each artist
# le_corbusier_works = le_corbusier.find_all('li','grid-item--work')

def extract_year(string):
    """Search *string* for its first run of four digits.

    Returns the ``re.Match`` object for that run, or ``None`` when the text
    contains no four consecutive digits. Callers read the year text with
    ``.group(0)``.
    """
    year_pattern = re.compile(r'\d{4}')
    return year_pattern.search(string)

def validate_string(data):
    """Return the cleaned text of a tag, or ``None`` if it has none.

    Args:
        data: A tag-like object exposing ``find(name)`` and ``.string``
            (a BeautifulSoup ``<span>`` in this notebook).

    Returns:
        The text with surrounding whitespace removed, or ``None`` when the
        selected ``.string`` is ``None``.
    """
    # Some titles are wrapped in an <em> tag inside the span. Look it up once
    # and prefer its text when present.
    em_tag = data.find("em")
    text = em_tag.string if em_tag is not None else data.string
    # Strip in both branches so <em>-wrapped titles are cleaned the same way
    # as plain ones.
    return text.strip() if text is not None else None

def get_artwork_data(soup_object):
    """Extract (artist, title, year) for each artwork on an artist page.

    Args:
        soup_object: Parsed page; every ``<li>`` with class
            ``grid-item--work`` in it is treated as one artwork.

    Returns:
        A list of ``(artist, title, year)`` tuples in page order. ``year`` is
        the first four-digit run found in the date text, or ``None`` when the
        date text is missing or contains no such run.
    """
    works = []
    for work in soup_object.find_all('li', 'grid-item--work'):
        # The first three spans hold artist name, artwork name, artwork date.
        spans = work.find_all("span")
        if len(spans) < 3:
            # Unexpected markup: the fields cannot be told apart, so skip
            # this item instead of raising IndexError.
            continue

        artist, title, date_text = (validate_string(span) for span in spans[:3])
        # The date text may be absent or lack a four-digit run; record None
        # for the year rather than failing on ``.group``.
        match = extract_year(date_text) if date_text else None
        works.append((artist, title, match.group(0) if match else None))
    return works

# Extract the (artist, title, year) tuples from each artist's parsed page.
c_dates = get_artwork_data(le_corbusier)
m_dates = get_artwork_data(mies_van_der_rohe)
a_dates = get_artwork_data(antoni_gaudi)

artworks = [c_dates, m_dates, a_dates]

# print() is called only for its side effect, so use a plain loop rather than
# a list comprehension that builds (and echoes) a throwaway list of None.
for work in artworks:
    print(work, '\n')

[('Le Corbusier (Charles-Édouard Jeanneret)', 'Bol, Pipes, et Papiers Enroulés', '1919'), ('Le Corbusier (Charles-Édouard Jeanneret)', 'Still Life', '1920'), ('Le Corbusier (Charles-Édouard Jeanneret)', "L'Esprit Nouveau letterhead (Letter to Naum Gabo from Paul Dermée)", '1920'), ('Le Corbusier (Charles-Édouard Jeanneret)', 'Still Life with Objects', '1923'), ('Alfred Roth, Le Corbusier (Charles-Édouard Jeanneret)', 'Bed Frame', '1927'), ('Le Corbusier (Charles-Édouard Jeanneret), Pierre Jeanneret', 'Les Terrasses, Villa Stein-de-Monzie, Garches, France', '1926'), ('Le Corbusier (Charles-Édouard Jeanneret), Pierre Jeanneret, Charlotte Perriand', 'Chaise Longue (LC/4)', '1928'), ('Le Corbusier (Charles-Édouard Jeanneret), Pierre Jeanneret, Charlotte Perriand', 'Armchair with a Tilting Back (Siège à Dossier Basculant)', '1928'), ('Le Corbusier (Charles-Édouard Jeanneret)', 'Urban projects for Montevideo and São Paulo, Brazil. Aerial perspectives', '1929'), ('Le Corbusier (Charles-Édouar

[None, None, None]