# Scraping artworks from WikiArt with Selenium

### Author: Yangyu Wang
### Date: Jan 18, 2025

In [1]:
import pandas as pd
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests

In [2]:
# Start a Firefox WebDriver session. `driver` is a module-level handle that
# extract_artworkinfo() reads as a global, so this cell must run before any scraping.
driver = webdriver.Firefox()

The geckodriver version (0.34.0) detected in PATH at /usr/local/bin/geckodriver might not be compatible with the detected firefox version (134.0.1); currently, geckodriver 0.35.0 is recommended for firefox 134.*, so it is advised to delete the driver in PATH and retry


In [3]:
def extract_artworkinfo(url):
    """Scrape the metadata of one artwork page using the module-level `driver`.

    Parameters
    ----------
    url : str
        Address of the artwork page to load.

    Returns
    -------
    dict or str
        On success, a dict that always contains "url", "Create Date",
        "Create Location", "image_url" and "tags", plus one entry per labelled
        `<li>` found inside the page's `<article>` (for example "Style",
        "Genre", "Media"). Multi-valued fields are joined with "|".
        If no `<h3>` element appears within 3 seconds the page is treated as
        missing: a message is printed and `url` itself (a plain string) is
        returned so the caller can tell the two outcomes apart.

    Raises
    ------
    Exception
        Errors from `driver.get` or from locating the `<article>` element are
        not handled here and propagate to the caller. `KeyboardInterrupt` is
        never swallowed, so the caller can stop a long scraping run.
    """
    driver.get(url)

    artwork_info = {"url": url}

    # The page is considered loaded once any <h3> is present in the DOM.
    try:
        WebDriverWait(driver, 3).until(
            EC.presence_of_element_located((By.TAG_NAME, "h3"))
        )
    except Exception:
        # `except Exception` (not a bare except) so that Ctrl+C still propagates.
        print("Error (404):", url)
        return url

    try:
        created_date_element = driver.find_element(By.CSS_SELECTOR, 'span[itemprop="dateCreated"]')
        artwork_info["Create Date"] = created_date_element.text.strip()
    except Exception:
        artwork_info["Create Date"] = None

    try:
        created_place_element = driver.find_element(By.CSS_SELECTOR, 'span[itemprop="locationCreated"]')
        artwork_info["Create Location"] = created_place_element.text.strip()
    except Exception:
        artwork_info["Create Location"] = None

    article = driver.find_element(By.XPATH, "//article")
    li_elements = article.find_elements(By.TAG_NAME, "li")

    for element in li_elements:
        # The field label lives in an <s> tag; list items without one are skipped.
        try:
            field_name = element.find_element(By.TAG_NAME, "s").text.strip().replace(":", "")
        except Exception:
            continue

        try:
            if field_name in ["Media", "Style", "Genre"]:
                value_elements = element.find_elements(By.XPATH, ".//a")
            else:
                value_elements = element.find_elements(By.XPATH, ".//span/a | .//a | .//span")
            value_texts = [v.text.strip() for v in value_elements if v.text.strip()]

            if value_texts:
                # dict.fromkeys removes duplicates while keeping page order, so the
                # joined value is identical from run to run (a set is not ordered).
                artwork_info[field_name] = "|".join(dict.fromkeys(value_texts))
        except Exception:
            artwork_info[field_name] = None

    # The image is looked up once per page, outside the <li> loop, so that
    # "image_url" is always present even when the page has no labelled <li>.
    try:
        image_element = driver.find_element(By.XPATH, "//img[@itemprop='image']")
        artwork_info["image_url"] = image_element.get_attribute("src")
    except Exception:
        artwork_info["image_url"] = None

    try:
        tags_elements = driver.find_elements(By.CSS_SELECTOR, ".tags-cheaps__item a.tags-cheaps__item__ref")
        artwork_info["tags"] = "|".join([tag.text.strip() for tag in tags_elements])
    except Exception:
        artwork_info["tags"] = None

    return artwork_info


In [4]:
# Load the artist/artwork table from disk. Its "Link" column is what the
# following cell turns into the list of artwork page URLs.
artworks = pd.read_csv("artist_data/artist_artwork.csv")

In [5]:
# Every artwork page URL, taken from the "Link" column; show the first as a sanity check.
links = artworks["Link"].tolist()
links[0]

'https://www.wikiart.org/en/ancient-greek-pottery/attic-middle-geometric-amphora-from-kerameikos--800'

In [15]:
import os

# Gather the URLs already saved by earlier scraping runs so they can be skipped.
url_found = []
for file in sorted(os.listdir("artwork_data")):
    if "revised_artwork_data" in file:
        # Only the "url" column is needed, so the other columns are not loaded.
        saved_batch = pd.read_csv(os.path.join("artwork_data", file), usecols=["url"])
        url_found += saved_batch["url"].tolist()

# Remaining work: links with no saved record yet. Duplicates are dropped and the
# order of `links` is kept, so the list is the same on every kernel restart
# (a set difference would come back in a different order each run).
already_scraped = set(url_found)
url_unfound = [link for link in dict.fromkeys(links) if link not in already_scraped]

In [16]:
# Number of links that are not yet present in any saved "revised_artwork_data" file.
len(url_unfound)

104438

In [10]:
from tqdm import tqdm 
import time
import random

In [11]:
def extract_all_keys(dict_list):
    """
    Return the union of the keys of every dictionary in `dict_list`.

    Each key appears once, in the order in which it is first encountered, so
    records with differing key sets can share one complete header.
    """
    first_seen = {}
    for record in dict_list:
        # Updating with an existing key keeps its original position in the dict.
        first_seen.update(dict.fromkeys(record.keys()))
    return list(first_seen)

In [20]:
# Index used in the name of the next batch file written by the scraping loop
# (artwork_data/revised_artwork_data_{count}.csv); the loop increments it after each save.
# NOTE(review): 7 is set by hand — presumably batches with lower indices already
# exist on disk; confirm before re-running so no existing file is overwritten.
count = 7
count

7

In [14]:
# Number of processed URLs (successes and failures alike) between two CSV checkpoints.
BATCH_SIZE = 10000


def _save_artwork_batch(records, batch_index):
    """Write one batch of scraped artwork dicts to a numbered CSV file.

    The header is the union of all keys found in `records`; a record that lacks
    a column gets an empty string in that column.
    """
    fieldnames = extract_all_keys(records)

    with open(f"artwork_data/revised_artwork_data_{batch_index}.csv", mode='w', encoding='utf-8', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()

        for record in tqdm(records, desc="Saving artist info", unit="artist"):
            writer.writerow({key: record.get(key, "") for key in fieldnames})


artwork_data = []   # successfully scraped records not yet written to disk
notfound = []       # URLs that timed out or raised while being scraped
n = 0               # URLs processed so far in this run

for url in tqdm(url_unfound, desc="Processing Artworks", unit="artwork"):
    try:
        artwork_info = extract_artworkinfo(url)
        # extract_artworkinfo returns the url string itself when the page did not load.
        if not isinstance(artwork_info, str):
            artwork_data.append(artwork_info)
        else:
            notfound.append(artwork_info)

    except KeyboardInterrupt:
        print("stopped")
        break
    except Exception:
        notfound.append(url)

    n += 1

    if n % BATCH_SIZE == 0:
        _save_artwork_batch(artwork_data, count)
        count += 1
        artwork_data = []

        # NOTE(review): this stops the run after the first full batch, yet the
        # recorded output shows earlier runs going past it — confirm the limit is intended.
        if n // BATCH_SIZE == 1:
            break

# Flush whatever was collected since the last checkpoint (Ctrl+C or end of the
# URL list); otherwise up to BATCH_SIZE - 1 scraped records would never reach disk.
if artwork_data:
    _save_artwork_batch(artwork_data, count)
    count += 1
    artwork_data = []
        

Processing Artworks:   2%|▏         | 2084/134413 [1:00:30<58:05:41,  1.58s/artwork]

Error (404): https://www.wikiart.org/en/katsushika-hokusai/shimomeguro


Processing Artworks:   2%|▏         | 2912/134413 [1:24:47<53:28:05,  1.46s/artwork]

Error (404): https://www.wikiart.org/en/mikhail-larionov/summer-1912


Processing Artworks:   2%|▏         | 2985/134413 [1:26:50<53:48:21,  1.47s/artwork]

Error (404): https://www.wikiart.org/en/tsuguharu-foujita/portrait-of-a-young-woman-in-profile


Processing Artworks:   3%|▎         | 3392/134413 [1:38:40<96:22:12,  2.65s/artwork]

Error (404): https://www.wikiart.org/en/zinaida-serebriakova/ballet-washroom-1924


Processing Artworks:   4%|▍         | 5088/134413 [2:28:14<89:11:44,  2.48s/artwork] 

Error (404): https://www.wikiart.org/en/victor-meirelles/a-passagem-de-humait-1886


Processing Artworks:   5%|▌         | 6769/134413 [3:17:37<57:32:42,  1.62s/artwork]

Error (404): https://www.wikiart.org/en/morris-louis/saraband-1959


Processing Artworks:   5%|▌         | 7146/134413 [3:29:36<97:08:19,  2.75s/artwork] 

Error (404): https://www.wikiart.org/en/salvador-dali/soft-self-portrait-with-fried-bacon


Processing Artworks:   6%|▌         | 8256/134413 [4:02:52<57:25:21,  1.64s/artwork]

Error (404): https://www.wikiart.org/en/alfred-freddy-krupa/old-town-ozalj-2012


Processing Artworks:   7%|▋         | 9686/134413 [4:46:37<88:16:18,  2.55s/artwork]

Error (404): https://www.wikiart.org/en/taki-183/tag-1970-1


Saving artist info: 100%|██████████| 9991/9991 [00:02<00:00, 3646.88artist/s]rtwork]
Processing Artworks:   9%|▉         | 11887/134413 [6:01:02<95:20:44,  2.80s/artwork] 

Error (404): https://www.wikiart.org/en/zinaida-serebriakova/peasant-girl-1906


Processing Artworks:  10%|▉         | 12971/134413 [6:35:39<85:36:25,  2.54s/artwork]

Error (404): https://www.wikiart.org/en/allan-mccollum/144-plaster-surrogates-no-1


Processing Artworks:  10%|▉         | 13379/134413 [6:48:43<97:45:03,  2.91s/artwork] 

Error (404): https://www.wikiart.org/en/ralph-fasanella/american-tragedy-1964


Processing Artworks:  10%|█         | 13976/134413 [7:06:28<84:50:33,  2.54s/artwork]

Error (404): https://www.wikiart.org/en/laolu-nyc/esu


Processing Artworks:  11%|█         | 14862/134413 [7:31:59<50:13:28,  1.51s/artwork]

Error (404): https://www.wikiart.org/en/pablo-picasso/house-in-a-garden-1908


Processing Artworks:  11%|█         | 15070/134413 [7:38:02<83:22:09,  2.51s/artwork]

Error (404): https://www.wikiart.org/en/joaquim-rodrigo/sereia


Processing Artworks:  12%|█▏        | 16503/134413 [8:20:21<77:00:35,  2.35s/artwork] 

Error (404): https://www.wikiart.org/en/jules-breton/rainbow-1883


Processing Artworks:  13%|█▎        | 17693/134413 [8:58:12<81:47:35,  2.52s/artwork]

Error (404): https://www.wikiart.org/en/thomas-jones/ariccia-buildings-on-the-edge-of-the-town-1777


Processing Artworks:  14%|█▍        | 19404/134413 [9:51:03<56:07:15,  1.76s/artwork]

Error (404): https://www.wikiart.org/en/antonio-bueno/double-self-portrait-1944


Saving artist info: 100%|██████████| 9991/9991 [00:01<00:00, 9007.43artist/s] artwork]
Processing Artworks:  15%|█▌        | 20352/134413 [10:20:10<50:01:34,  1.58s/artwork]

Error (404): https://www.wikiart.org/en/arturo-souto/dance-class-1932


Processing Artworks:  16%|█▌        | 21775/134413 [11:03:05<77:59:17,  2.49s/artwork]

Error (404): https://www.wikiart.org/en/pierre-auguste-renoir/sailboats-at-argenteuil


Processing Artworks:  16%|█▋        | 21916/134413 [11:07:26<82:27:16,  2.64s/artwork]

Error (404): https://www.wikiart.org/en/robert-smithson/glue-pour-1969


Processing Artworks:  17%|█▋        | 22735/134413 [11:33:21<50:38:23,  1.63s/artwork]

Error (404): https://www.wikiart.org/en/carlos-almaraz/early-hawaiians-1983


Processing Artworks:  17%|█▋        | 23056/134413 [11:43:01<77:54:29,  2.52s/artwork]

Error (404): https://www.wikiart.org/en/alexei-harlamoff/a-neapolitan-girl


Processing Artworks:  19%|█▉        | 25245/134413 [12:51:47<81:41:35,  2.69s/artwork]

Error (404): https://www.wikiart.org/en/john-miller/porth-kidney-beach


Processing Artworks:  19%|█▉        | 25562/134413 [13:02:23<78:57:32,  2.61s/artwork]

Error (404): https://www.wikiart.org/en/adolf-dietrich/sechs-meerschweinchen-im-stall-1934


Saving artist info: 100%|██████████| 9993/9993 [00:01<00:00, 9856.65artist/s] artwork]
Processing Artworks:  22%|██▏       | 30187/134413 [15:28:20<78:41:09,  2.72s/artwork]

Error (404): https://www.wikiart.org/en/pablo-picasso/bearded-man-1962


Processing Artworks:  23%|██▎       | 30432/134413 [15:36:12<53:18:51,  1.85s/artwork]

stopped



