# Scraping WikiArt by Selenium

### Author: Yangyu Wang
### Date: Jan 18, 2025

## Generate artist names from existing dataset

In [1]:
import pandas as pd
import csv
import unicodedata

def remove_accents(input_str):
    """
    Convert Spanish and other accented characters to their English equivalent.
    """
    return unicodedata.normalize('NFKD', input_str).encode('ASCII', 'ignore').decode('utf-8')

file_path = "wikiart_expanded/wikiart_expanded.csv"
data = pd.read_csv(file_path)

unique_artists = data['Artist'].dropna().unique()

print(unique_artists[:10])

['Ancient Egypt' 'Ancient Greek Painting and Sculpture'
 'Ancient Greek Pottery' 'Apelles' 'Fayum portrait' 'Cricorps'
 'Teddy Cobeña' 'Thiago Boecan' 'Edward Burne-Jones' 'Orthodox Icons']


In [46]:
unique_artists[:5]

array(['Ancient Egypt', 'Ancient Greek Painting and Sculpture',
       'Ancient Greek Pottery', 'Apelles', 'Fayum portrait'], dtype=object)

## Open firefox

In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Firefox()

The geckodriver version (0.34.0) detected in PATH at /usr/local/bin/geckodriver might not be compatible with the detected firefox version (134.0.1); currently, geckodriver 0.35.0 is recommended for firefox 134.*, so it is advised to delete the driver in PATH and retry


## Functions to extract artist information and artworks contents

In [41]:
def extract_artist_info(artist_name, url_name = None):

    if url_name:
        artist = url_name
    else:   
        artist = remove_accents(artist_name).lower().replace('.', '').replace(' ', '-')

    artist_info = {"name": artist_name, "url_name": artist}

    url_artist = f"https://www.wikiart.org/en/{artist}"
    #url_artworks = f"https://www.wikiart.org/en/{artist}/all-works/text-list"

    driver.get(url_artist)
    
    try:
        WebDriverWait(driver, 3).until(
            EC.presence_of_element_located((By.TAG_NAME, "h3"))
        )
    except:
        print("Error (404):", artist_name)
        return artist_name

    try:
        birth_date_element = driver.find_element(By.CSS_SELECTOR, 'span[itemprop="birthDate"]')
        artist_info["Birth Date"] = birth_date_element.text.strip()
    except:
        artist_info["Birth Date"] = None

    try:
        death_date_element = driver.find_element(By.CSS_SELECTOR, 'span[itemprop="deathDate"]')
        artist_info["Death Date"] = death_date_element.text.strip()
    except:
        artist_info["Death Date"] = None

    try:
        birth_place_element = driver.find_element(By.CSS_SELECTOR, 'span[itemprop="birthPlace"]')
        artist_info["Birth place"] = birth_place_element.text.strip()
    except:
        artist_info["Birth place"] = None

    try:
        death_place_element = driver.find_element(By.CSS_SELECTOR, 'span[itemprop="deathPlace"]')
        artist_info["Death place"] = death_place_element.text.strip()
    except:
        artist_info["Death place"] = None

    li_elements = driver.find_elements(By.TAG_NAME, "li")

    for element in li_elements:
        try:
            field_name = element.find_element(By.TAG_NAME, "s").text.strip().replace(":", "")
        except:
            continue
            
        try:
            value_elements = element.find_elements(By.XPATH, ".//span/a | .//a")
            value_texts = [v.text.strip() for v in value_elements if v.text.strip()]

            if value_texts:
                artist_info[field_name] = "|".join(value_texts)
        except:
            artist_info[field_name] = None

    try:
        wiki_element = driver.find_element(By.CSS_SELECTOR, ".truncated-link a.truncate.external")
        artist_info["Wikipedia"] = wiki_element.get_attribute("href")
    except:
        artist_info["Wikipedia"] = None

    return(artist_info)


In [29]:
def extract_artist_work(artist_name, url_name = None):

    if url_name:
        artist = url_name
    else:   
        artist = remove_accents(artist_name).lower().replace('.', '').replace(' ', '-')
    
    artworks = []

    url_artworks = f"https://www.wikiart.org/en/{artist}/all-works/text-list"

    driver.get(url_artworks)

    painting_elements = driver.find_elements(By.CSS_SELECTOR, ".painting-list-text-row")

    for element in painting_elements:
        try:
            artwork_element = element.find_element(By.TAG_NAME, "a")
            artwork_name = artwork_element.text.strip()
            artwork_link = artwork_element.get_attribute("href")

            try:
                year_element = element.find_element(By.TAG_NAME, "span")
                artwork_year = year_element.text.strip().replace(",", "")
            except:
                artwork_year = None

            # 存储数据
            artworks.append({
                "Artist_name": artist,
                "Artwork Name": artwork_name,
                "Year": artwork_year,
                "Link": artwork_link
            })
        except:
            continue

    return artworks

## Extract artist information and artworks contents

In [5]:
from tqdm import tqdm 
import time
import random


In [None]:

artist_data = []
not_exist = []
artworks_data = []

for artist in tqdm(unique_artists, desc="Processing Artists", unit="artist"):
    artist_info = extract_artist_info(artist)
    if isinstance(artist_info, dict):
        artist_data.append(artist_info)
        time.sleep(random.randint(3, 5))
        artworks_data = artworks_data + extract_artist_work(artist)
        time.sleep(random.randint(1, 3))
    else:
        not_exist.append(artist_info)


Processing Artists:   2%|▏         | 50/2174 [08:57<5:53:57, 10.00s/artist]

Error (404): Paweł Kluza


Processing Artists:   6%|▌         | 122/2174 [20:54<5:32:42,  9.73s/artist]

Error (404): [ a y s h ]


Processing Artists:   6%|▌         | 134/2174 [22:55<6:31:44, 11.52s/artist]

Error (404): Stanisław Szukalski


Processing Artists:  13%|█▎        | 277/2174 [47:32<8:11:03, 15.53s/artist]

Error (404): Vahram Gayfedjian


Processing Artists:  14%|█▎        | 298/2174 [51:22<5:03:09,  9.70s/artist]

Error (404): Roderic O'Conor


Processing Artists:  15%|█▍        | 325/2174 [57:04<6:36:03, 12.85s/artist]

Error (404): Oleksa Novakivskyi


Processing Artists:  17%|█▋        | 372/2174 [1:05:57<5:07:58, 10.25s/artist]

Error (404): A.Y. Jackson


Processing Artists:  18%|█▊        | 397/2174 [1:10:20<4:33:44,  9.24s/artist]

Error (404): Frank O'Meara


Processing Artists:  19%|█▉        | 421/2174 [1:14:37<4:43:46,  9.71s/artist]

Error (404): Stanisław Wyspiański


Processing Artists:  26%|██▌       | 555/2174 [1:38:06<5:03:38, 11.25s/artist] 

Error (404): Soltan Soltanlı


Processing Artists:  26%|██▋       | 576/2174 [1:41:31<4:13:29,  9.52s/artist]

Error (404): Marevna (Marie Vorobieff)


Processing Artists:  27%|██▋       | 578/2174 [1:41:53<4:29:17, 10.12s/artist]

Error (404): Olusola David, Ayibiowu


Processing Artists:  27%|██▋       | 592/2174 [1:44:10<4:38:42, 10.57s/artist]

Error (404): Georgia O'Keeffe


Processing Artists:  28%|██▊       | 600/2174 [1:45:34<4:44:53, 10.86s/artist]

Error (404): Stanisław Ignacy Witkiewicz


Processing Artists:  30%|███       | 654/2174 [1:53:41<3:30:57,  8.33s/artist]

Error (404): Yuri Zlotnikov


Processing Artists:  30%|███       | 660/2174 [1:54:37<3:56:08,  9.36s/artist]

Error (404): M.F. Husain


Processing Artists:  31%|███▏      | 680/2174 [1:57:42<3:34:43,  8.62s/artist]

Error (404): Stig Brøgger


Processing Artists:  32%|███▏      | 692/2174 [1:59:35<3:54:55,  9.51s/artist]

Error (404): Georges Troubat


Processing Artists:  32%|███▏      | 695/2174 [2:00:02<3:41:07,  8.97s/artist]

Error (404): di Mauro


Processing Artists:  32%|███▏      | 696/2174 [2:00:11<3:37:18,  8.82s/artist]

Error (404): Goran Despotovski


Processing Artists:  32%|███▏      | 699/2174 [2:00:41<3:49:18,  9.33s/artist]

Error (404): Babak-Matveev


Processing Artists:  36%|███▌      | 783/2174 [2:14:13<3:34:41,  9.26s/artist]

Error (404): Włodzimierz Zakrzewski


Processing Artists:  36%|███▌      | 784/2174 [2:14:24<3:43:54,  9.67s/artist]

Error (404): Vajiha Samadova


Processing Artists:  38%|███▊      | 829/2174 [2:21:43<3:19:53,  8.92s/artist]

Error (404): Vlady


Processing Artists:  40%|███▉      | 867/2174 [2:28:42<3:33:02,  9.78s/artist]

Error (404): Rose O'Neill


Processing Artists:  42%|████▏     | 906/2174 [2:35:10<3:12:11,  9.09s/artist]

Error (404): Carl Holsøe


Processing Artists:  43%|████▎     | 940/2174 [2:40:57<3:06:12,  9.05s/artist]

Error (404): Chaim Goldberg


Processing Artists:  44%|████▍     | 967/2174 [2:45:28<3:24:08, 10.15s/artist]

Error (404): YiFei  Chen


Processing Artists:  45%|████▌     | 982/2174 [2:47:49<2:57:21,  8.93s/artist]

Error (404): Aleksander Belyaev


Processing Artists:  45%|████▌     | 984/2174 [2:48:09<3:04:41,  9.31s/artist]

Error (404): Goran Vojinovic


Processing Artists:  46%|████▌     | 994/2174 [2:49:48<3:24:08, 10.38s/artist]

Error (404): Raben Fernan


Processing Artists:  52%|█████▏    | 1123/2174 [3:10:45<2:53:41,  9.92s/artist]

Error (404): Enrique Silvestre


Processing Artists:  54%|█████▍    | 1177/2174 [3:19:15<2:21:07,  8.49s/artist]

Error (404): George Grosz


Processing Artists:  57%|█████▋    | 1245/2174 [3:29:54<2:29:48,  9.68s/artist]

Error (404): Yun Hyong–keun


Processing Artists:  58%|█████▊    | 1252/2174 [3:31:01<2:19:13,  9.06s/artist]

Error (404): Ilse D'Hollander


Processing Artists:  60%|█████▉    | 1298/2174 [3:38:09<2:19:27,  9.55s/artist]

Error (404): Robert De Niro, Sr.


Processing Artists:  60%|██████    | 1306/2174 [3:39:23<2:01:48,  8.42s/artist]

Error (404): Zani Corrado


Processing Artists:  65%|██████▍   | 1404/2174 [3:54:11<1:53:40,  8.86s/artist]

Error (404): Hifa Cybe


Processing Artists:  65%|██████▍   | 1411/2174 [3:55:17<2:07:49, 10.05s/artist]

Error (404): Petro Kholodny (Elder)


Processing Artists:  71%|███████   | 1539/2174 [4:14:17<1:36:52,  9.15s/artist]

Error (404): Christian Attersee


Processing Artists:  77%|███████▋  | 1674/2174 [4:34:19<1:20:15,  9.63s/artist]

Error (404): Allan D'Arcangelo


Processing Artists:  77%|███████▋  | 1676/2174 [4:34:38<1:16:56,  9.27s/artist]

Error (404): Yuriy Khymych


Processing Artists:  78%|███████▊  | 1694/2174 [4:37:18<1:09:15,  8.66s/artist]

Error (404): Varnette Honeywood


Processing Artists:  80%|███████▉  | 1733/2174 [4:43:09<1:04:14,  8.74s/artist]

Error (404): Iain Baxter&


Processing Artists:  81%|████████  | 1759/2174 [4:47:01<56:13,  8.13s/artist]  

Error (404): Bernd and Hilla Becher


Processing Artists:  83%|████████▎ | 1799/2174 [4:52:57<49:50,  7.97s/artist]  

Error (404): Robert Ryman


Processing Artists:  86%|████████▌ | 1875/2174 [5:04:15<46:53,  9.41s/artist]

Error (404): Jorge Pardo


Processing Artists:  88%|████████▊ | 1923/2174 [5:11:39<39:51,  9.53s/artist]

Error (404): Georg Miciú


Processing Artists:  90%|█████████ | 1961/2174 [5:17:29<35:54, 10.11s/artist]

Error (404): Laolu Senbanjo


Processing Artists:  91%|█████████▏| 1988/2174 [5:21:30<26:50,  8.66s/artist]

Error (404): Miss.Tic


Processing Artists:  94%|█████████▍| 2050/2174 [5:30:28<19:48,  9.58s/artist]

Error (404): Luis Álvarez Roure


Processing Artists:  96%|█████████▌| 2083/2174 [5:35:38<13:40,  9.02s/artist]

Error (404): Shin Yoon-bok


Processing Artists:  98%|█████████▊| 2130/2174 [5:43:30<07:24, 10.11s/artist]

## Save into csv files

In [11]:
def extract_all_keys(dict_list):
    """
    Extract all unique keys from a list of dictionaries.
    Ensures that different dictionaries with different keys are fully accounted for.
    """
    ordered_keys = []
    seen_keys = set()
    
    for dictionary in dict_list:
        for key in dictionary.keys():
            if key not in seen_keys:
                ordered_keys.append(key)
                seen_keys.add(key)
                
    return ordered_keys

In [52]:
all_keys_artist = extract_all_keys(artist_data)

with open("artist_data/artist_data_new.csv", mode='w', encoding='utf-8', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=all_keys_artist)

    writer.writeheader()

    for artist in tqdm(artist_data, desc="Saving artist info", unit="artist"):
        writer.writerow({key: artist.get(key, "") for key in all_keys_artist})

Saving artist info: 100%|██████████| 878/878 [00:00<00:00, 30261.64artist/s]


In [None]:
all_keys_artworks = extract_all_keys(artworks_data)

with open("artist_data/artist_artwork.csv", mode='w', encoding='utf-8', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=all_keys_artworks)

    writer.writeheader()

    for artist in tqdm(artworks_data, desc="Saving artwork info", unit="artwork"):
        writer.writerow({key: artist.get(key, "") for key in all_keys_artworks})

Saving artwork info: 100%|██████████| 80693/80693 [00:00<00:00, 91055.27artwork/s]


## Re-scraping for not found items

In [None]:
with open("to_search.csv", mode='w', encoding='utf-8', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=["name", "url"])

    writer.writeheader()

    for artist in not_exist:
        writer.writerow({"name": artist})


Code the urls by human

In [42]:
with open("to_search_completed.csv", mode='r', encoding='utf-8', newline='') as file:
    reader = csv.reader(file)
    lines = [(row[0], row[1]) for row in reader][1:]


artist_data_notfound = []
not_exist_notfound = []
artworks_data_notfound = []

for artist, url in tqdm(lines, desc="Processing Artists", unit="artist"):
    artist_info = extract_artist_info(artist, url)
    if isinstance(artist_info, dict):
        artist_data_notfound.append(artist_info)
        time.sleep(1)
        artworks_data_notfound = artworks_data_notfound + extract_artist_work(artist, url)
        time.sleep(1)
    else:
        not_exist_notfound.append(artist_info)

Processing Artists: 100%|██████████| 4/4 [00:18<00:00,  4.66s/artist]


In [44]:
all_keys_artist = extract_all_keys(artist_data_notfound)

with open("artist_data/artist_data_notfound.csv", mode='w', encoding='utf-8', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=all_keys_artist)

    writer.writeheader()

    for artist in tqdm(artist_data_notfound, desc="Saving artist info", unit="artist"):
        writer.writerow({key: artist.get(key, "") for key in all_keys_artist})

#deleted after merging

Saving artist info: 100%|██████████| 4/4 [00:00<00:00, 10149.56artist/s]


In [43]:
all_keys_artworks = extract_all_keys(artworks_data_notfound)

with open("artist_data/artist_artwork_notfound.csv", mode='w', encoding='utf-8', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=all_keys_artworks)

    writer.writeheader()

    for artist in tqdm(artworks_data_notfound, desc="Saving artwork info", unit="artwork"):
        writer.writerow({key: artist.get(key, "") for key in all_keys_artworks})

#deleted after merging

Saving artwork info: 100%|██████████| 40/40 [00:00<00:00, 56660.64artwork/s]
