# Scrape for Artists and Representative Image

Scrape for the top 100 most influential artists of all time. Select a painting that
is most representative of their known art and save to a folder. 
Reference the images for image transfers

## Scrape for top 101 artists of all time

In [1]:
# import libraries
import numpy as np
import pandas as pd
import os
from bs4 import BeautifulSoup
import pymongo
from splinter import Browser
import urllib.request
import time

In [2]:
# # launch chrome driver
executable_path = {'executable_path': '/Users/prettyvo/Desktop/chromedriver'}
browser = Browser('chrome', **executable_path, headless=True)

In [3]:
# visit the site
url = "http://www.theartwolf.com/articles/most-important-painters.htm"
browser.visit(url)
html=browser.html
art = BeautifulSoup(html, 'html.parser')

In [4]:
# scrape for the artists anmes 
artists = art.find('div', class_='noticiacentro')
artists_info = artists.find_all('p')
artists_name = [x.strong for x in artists_info]

# artists_names = [x.text for x in artists_name]
artists_name.pop(0) # remove first in list -> intro paragraph with no artist
artists_name = [x.text for x in artists_name] # pull text from tag

# manipulate arists list to search in wikiart
artists_search = [x.lower() for x in artists_name]
artists_search = [x.replace(' ', '-') for x in artists_search]

# put artists names into a df
df = pd.DataFrame(artists_name, columns=["artists"])
df['artists_search_name'] = artists_search
df.head()

Unnamed: 0,artists,artists_search_name
0,PABLO PICASSO,pablo-picasso
1,GIOTTO DI BONDONE,giotto-di-bondone
2,LEONARDO DA VINCI,leonardo-da-vinci
3,PAUL CÉZANNE,paul-cézanne
4,REMBRANDT VAN RIJN,rembrandt-van-rijn


## Scrape for images

In [5]:
# loop over and pull first image for artist
artist_image_found = []
art_pieces = []
counter  = 0
for name in artists_search:
    # visit the site for each artist
    url_paintings = f'https://www.wikiart.org/en/{name}'
    browser.visit(url_paintings)
    time.sleep(1) # wait for the page to load
    html3=browser.html
    search_artist = BeautifulSoup(html3, 'html.parser')
    
    try:
        # get the image of the artwork
        images = search_artist.find('div', class_='masonry-content')
        paintings = images.find_all('img')
        paintings = [x['src'] for x in paintings]
        
        # get the title of the art work    
        title = search_artist.find_all('div', class_='title-block')
        painting_titles = [x.text.strip() for x in title] # clean data
        painting_titles = [x.split('\n')[0].strip() for x in painting_titles] # save only the title
        art_pieces.append(painting_titles[0])
        artist_image_found.append(name)
        
        # save image
        save_image = os.path.join("images", f'{name}_{painting_titles[0]}.jpg')
        urllib.request.urlretrieve(paintings[0], save_image) #save the image from the url
        print(f'{counter} {name} image saved')
        counter +=1
    except:
        print(f'{name} error')
        pass

['pablo-picasso']
0 pablo-picasso image saved
['pablo-picasso', 'giotto-di-bondone']
1 giotto-di-bondone image saved
['pablo-picasso', 'giotto-di-bondone', 'leonardo-da-vinci']
2 leonardo-da-vinci image saved
paul-cézanne error
['pablo-picasso', 'giotto-di-bondone', 'leonardo-da-vinci', 'rembrandt-van-rijn']
3 rembrandt-van-rijn image saved
diego-velázquez error
['pablo-picasso', 'giotto-di-bondone', 'leonardo-da-vinci', 'rembrandt-van-rijn', 'wassily-kandinsky']
4 wassily-kandinsky image saved
['pablo-picasso', 'giotto-di-bondone', 'leonardo-da-vinci', 'rembrandt-van-rijn', 'wassily-kandinsky', 'claude-monet']
5 claude-monet image saved
['pablo-picasso', 'giotto-di-bondone', 'leonardo-da-vinci', 'rembrandt-van-rijn', 'wassily-kandinsky', 'claude-monet', 'caravaggio']
6 caravaggio image saved
['pablo-picasso', 'giotto-di-bondone', 'leonardo-da-vinci', 'rembrandt-van-rijn', 'wassily-kandinsky', 'claude-monet', 'caravaggio', 'joseph-mallord-william-turner']
7 joseph-mallord-william-turne

26 paul-klee image saved
francis-bacon error
gustav-klimt- error
eugène-delacroix error
['pablo-picasso', 'giotto-di-bondone', 'leonardo-da-vinci', 'rembrandt-van-rijn', 'wassily-kandinsky', 'claude-monet', 'caravaggio', 'joseph-mallord-william-turner', 'jackson-pollock', 'michelangelo-buonarroti', 'paul-gauguin', 'francisco-de-goya', 'vincent-van-gogh', 'mark-rothko', 'raphael', 'jean-michel-basquiat', 'edvard-munch', 'titian', 'piet-mondrian', 'piero-della-francesca', 'peter-paul-rubens', 'andy-warhol', 'tommaso-masaccio', 'marc-chagall', 'gustave-courbet', 'nicolas-poussin', 'paul-klee', 'paolo-uccello']
27 paolo-uccello image saved
['pablo-picasso', 'giotto-di-bondone', 'leonardo-da-vinci', 'rembrandt-van-rijn', 'wassily-kandinsky', 'claude-monet', 'caravaggio', 'joseph-mallord-william-turner', 'jackson-pollock', 'michelangelo-buonarroti', 'paul-gauguin', 'francisco-de-goya', 'vincent-van-gogh', 'mark-rothko', 'raphael', 'jean-michel-basquiat', 'edvard-munch', 'titian', 'piet-mondr

38 jean-antoine-watteau image saved
salvador-dalí error
max-ernst- error
['pablo-picasso', 'giotto-di-bondone', 'leonardo-da-vinci', 'rembrandt-van-rijn', 'wassily-kandinsky', 'claude-monet', 'caravaggio', 'joseph-mallord-william-turner', 'jackson-pollock', 'michelangelo-buonarroti', 'paul-gauguin', 'francisco-de-goya', 'vincent-van-gogh', 'mark-rothko', 'raphael', 'jean-michel-basquiat', 'edvard-munch', 'titian', 'piet-mondrian', 'piero-della-francesca', 'peter-paul-rubens', 'andy-warhol', 'tommaso-masaccio', 'marc-chagall', 'gustave-courbet', 'nicolas-poussin', 'paul-klee', 'paolo-uccello', 'william-blake', 'andrea-mantegna', 'jan-vermeer', 'el-greco', 'caspar-david-friedrich', 'winslow-homer', 'marcel-duchamp', 'giorgione', 'frida-kahlo', 'hans-holbein-the-younger', 'fra-angelico', 'jean-antoine-watteau', 'tintoretto']
39 tintoretto image saved
-jasper-johns error
['pablo-picasso', 'giotto-di-bondone', 'leonardo-da-vinci', 'rembrandt-van-rijn', 'wassily-kandinsky', 'claude-monet', '

48 edward-hopper image saved
['pablo-picasso', 'giotto-di-bondone', 'leonardo-da-vinci', 'rembrandt-van-rijn', 'wassily-kandinsky', 'claude-monet', 'caravaggio', 'joseph-mallord-william-turner', 'jackson-pollock', 'michelangelo-buonarroti', 'paul-gauguin', 'francisco-de-goya', 'vincent-van-gogh', 'mark-rothko', 'raphael', 'jean-michel-basquiat', 'edvard-munch', 'titian', 'piet-mondrian', 'piero-della-francesca', 'peter-paul-rubens', 'andy-warhol', 'tommaso-masaccio', 'marc-chagall', 'gustave-courbet', 'nicolas-poussin', 'paul-klee', 'paolo-uccello', 'william-blake', 'andrea-mantegna', 'jan-vermeer', 'el-greco', 'caspar-david-friedrich', 'winslow-homer', 'marcel-duchamp', 'giorgione', 'frida-kahlo', 'hans-holbein-the-younger', 'fra-angelico', 'jean-antoine-watteau', 'tintoretto', 'sandro-botticelli', 'umberto-boccioni', 'joachim-patinir', 'john-constable', 'jacques-louis-david', 'pieter-bruegel-the-elder', 'simone-martini', 'frederic-edwin-church', 'edward-hopper', 'lucio-fontana']
49 l

56 hans-memling image saved
['pablo-picasso', 'giotto-di-bondone', 'leonardo-da-vinci', 'rembrandt-van-rijn', 'wassily-kandinsky', 'claude-monet', 'caravaggio', 'joseph-mallord-william-turner', 'jackson-pollock', 'michelangelo-buonarroti', 'paul-gauguin', 'francisco-de-goya', 'vincent-van-gogh', 'mark-rothko', 'raphael', 'jean-michel-basquiat', 'edvard-munch', 'titian', 'piet-mondrian', 'piero-della-francesca', 'peter-paul-rubens', 'andy-warhol', 'tommaso-masaccio', 'marc-chagall', 'gustave-courbet', 'nicolas-poussin', 'paul-klee', 'paolo-uccello', 'william-blake', 'andrea-mantegna', 'jan-vermeer', 'el-greco', 'caspar-david-friedrich', 'winslow-homer', 'marcel-duchamp', 'giorgione', 'frida-kahlo', 'hans-holbein-the-younger', 'fra-angelico', 'jean-antoine-watteau', 'tintoretto', 'sandro-botticelli', 'umberto-boccioni', 'joachim-patinir', 'john-constable', 'jacques-louis-david', 'pieter-bruegel-the-elder', 'simone-martini', 'frederic-edwin-church', 'edward-hopper', 'lucio-fontana', 'fran

63 egon-schiele image saved
['pablo-picasso', 'giotto-di-bondone', 'leonardo-da-vinci', 'rembrandt-van-rijn', 'wassily-kandinsky', 'claude-monet', 'caravaggio', 'joseph-mallord-william-turner', 'jackson-pollock', 'michelangelo-buonarroti', 'paul-gauguin', 'francisco-de-goya', 'vincent-van-gogh', 'mark-rothko', 'raphael', 'jean-michel-basquiat', 'edvard-munch', 'titian', 'piet-mondrian', 'piero-della-francesca', 'peter-paul-rubens', 'andy-warhol', 'tommaso-masaccio', 'marc-chagall', 'gustave-courbet', 'nicolas-poussin', 'paul-klee', 'paolo-uccello', 'william-blake', 'andrea-mantegna', 'jan-vermeer', 'el-greco', 'caspar-david-friedrich', 'winslow-homer', 'marcel-duchamp', 'giorgione', 'frida-kahlo', 'hans-holbein-the-younger', 'fra-angelico', 'jean-antoine-watteau', 'tintoretto', 'sandro-botticelli', 'umberto-boccioni', 'joachim-patinir', 'john-constable', 'jacques-louis-david', 'pieter-bruegel-the-elder', 'simone-martini', 'frederic-edwin-church', 'edward-hopper', 'lucio-fontana', 'fran

In [6]:
# close browser
browser.quit()

In [8]:
# save artists with images to a df
df_artists_images = pd.DataFrame(artist_image_found, columns = ['Artists'])
df_artists_images['Artist_Pieces'] = art_pieces

# display dataframe
df_artists_images.head()

Unnamed: 0,Artists,Artist_Pieces
0,pablo-picasso,Child with dove
1,giotto-di-bondone,"The Trial by Fire, St. Francis offers to walk ..."
2,leonardo-da-vinci,The Madonna of the Carnation
3,rembrandt-van-rijn,The Storm on the Sea of Galilee
4,wassily-kandinsky,Blue rider


In [9]:
# save the csv
saved = df_artists_images.to_csv("artist_and_art_titles.csv")