# Scrape for Artists and Representative Image

Scrape for the top 100 most influential artists of all time. Select a painting that
is most representative of their known art and save to a folder. 
Reference the images for image transfers

## Scrape for top 101 artists of all time

In [1]:
# import libraries
import numpy as np
import pandas as pd
import os
from bs4 import BeautifulSoup
import pymongo
from splinter import Browser
import urllib.request
import time

In [14]:
# # launch chrome driver
executable_path = {'executable_path': '/Users/prettyvo/Desktop/chromedriver'}
browser = Browser('chrome', **executable_path, headless=True)

In [3]:
# visit the site
url = "http://www.theartwolf.com/articles/most-important-painters.htm"
browser.visit(url)
html=browser.html
art = BeautifulSoup(html, 'html.parser')

In [4]:
# scrape for the artists anmes 
artists = art.find('div', class_='noticiacentro')
artists_info = artists.find_all('p')
artists_name = [x.strong for x in artists_info]

# artists_names = [x.text for x in artists_name]
artists_name.pop(0) # remove first in list -> intro paragraph with no artist
artists_name = [x.text for x in artists_name] # pull text from tag

# manipulate arists list to search in wikiart
artists_search = [x.lower() for x in artists_name]
artists_search = [x.replace(' ', '-') for x in artists_search]

# put artists names into a df
df = pd.DataFrame(artists_name, columns=["artists"])
df['artists_search_name'] = artists_search
df.head()

Unnamed: 0,artists,artists_search_name
0,PABLO PICASSO,pablo-picasso
1,GIOTTO DI BONDONE,giotto-di-bondone
2,LEONARDO DA VINCI,leonardo-da-vinci
3,PAUL CÉZANNE,paul-cézanne
4,REMBRANDT VAN RIJN,rembrandt-van-rijn


## Scrape for images

In [15]:
# loop over and pull first image for artist
artist_image_found = []
art_pieces = []
art_work = []
counter  = 0
for name in artists_search:
    # visit the site for each artist
    url_paintings = f'https://www.wikiart.org/en/{name}'
    browser.visit(url_paintings)
    time.sleep(1) # wait for the page to load
    html3=browser.html
    search_artist = BeautifulSoup(html3, 'html.parser')
    
    try:
        # get the image of the artwork
        images = search_artist.find('div', class_='masonry-content')
        paintings = images.find_all('img')
        paintings = [x['src'] for x in paintings]
        
        # get the title of the art work    
        title = search_artist.find_all('div', class_='title-block')
        painting_titles = [x.text.strip() for x in title] # clean data
        painting_titles = [x.split('\n')[0].strip() for x in painting_titles] # save only the title
        art_pieces.append(painting_titles[0])
        art_work.append(paintings[0])
        artist_image_found.append(name)
        
        # save image
        save_image = os.path.join("images", f'{name}_{painting_titles[0]}.jpg')
        urllib.request.urlretrieve(paintings[0], save_image) #save the image from the url
        
        print(f'{counter} {name} image saved')
        print(f'paini')
        counter +=1
    except:
        print(f'{name} error')
        pass

0 pablo-picasso image saved
paini
1 giotto-di-bondone image saved
paini
2 leonardo-da-vinci image saved
paini
paul-cézanne error
3 rembrandt-van-rijn image saved
paini
diego-velázquez error
4 wassily-kandinsky image saved
paini
5 claude-monet image saved
paini
6 caravaggio image saved
paini
7 joseph-mallord-william-turner image saved
paini
-jan-van-eyck error
albrecht-dürer error
8 jackson-pollock image saved
paini
9 michelangelo-buonarroti image saved
paini
10 paul-gauguin image saved
paini
11 francisco-de-goya image saved
paini
12 vincent-van-gogh image saved
paini
édouard-manet error
13 mark-rothko image saved
paini
henri-matisse- error
14 raphael image saved
paini
15 jean-michel-basquiat image saved
paini
16 edvard-munch image saved
paini
17 titian image saved
paini
18 piet-mondrian image saved
paini
19 piero-della-francesca image saved
paini
20 peter-paul-rubens image saved
paini
21 andy-warhol image saved
paini
joan-miró error
22 tommaso-masaccio image saved
paini
23 marc-chagall

In [16]:
# close browser
browser.quit()

In [18]:
# save artists with images to a df
df_artists_images = pd.DataFrame(artist_image_found, columns = ['Artists'])
df_artists_images['Piece_Title'] = art_pieces
df_artists_images['Art_Piece'] = art_work
print(len(art_pieces))

# display dataframe
df_artists_images.head()

72


Unnamed: 0,Artists,Piece_Title,Art_Piece
0,pablo-picasso,Child with dove,https://uploads4.wikiart.org/images/pablo-pica...
1,giotto-di-bondone,"The Trial by Fire, St. Francis offers to walk ...",https://uploads6.wikiart.org/images/giotto/the...
2,leonardo-da-vinci,The Madonna of the Carnation,https://uploads7.wikiart.org/images/leonardo-d...
3,rembrandt-van-rijn,The Storm on the Sea of Galilee,https://uploads4.wikiart.org/images/rembrandt/...
4,wassily-kandinsky,Blue rider,https://uploads3.wikiart.org/images/wassily-ka...


In [19]:
# save the csv
saved = df_artists_images.to_csv("artist_and_art_titles.csv")