# Scraping artworks on wikiart by Selenium

### Author: Yangyu Wang
### Date: Jan 18, 2025

In [1]:
import pandas as pd
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests

In [2]:
driver = webdriver.Firefox()

The geckodriver version (0.34.0) detected in PATH at /usr/local/bin/geckodriver might not be compatible with the detected firefox version (134.0.1); currently, geckodriver 0.35.0 is recommended for firefox 134.*, so it is advised to delete the driver in PATH and retry


In [3]:
def extract_artworkinfo(url):

    driver.get(url)

    artwork_info = {"url": url}
    
    try:
        WebDriverWait(driver, 3).until(
            EC.presence_of_element_located((By.TAG_NAME, "h3"))
        )
    except:
        print("Error (404):", url)
        return url

    try:
        birth_date_element = driver.find_element(By.CSS_SELECTOR, 'span[itemprop="dateCreated"]')
        artwork_info["Create Date"] = birth_date_element.text.strip()
    except:
        artwork_info["Create Date"] = None

    try:
        birth_place_element = driver.find_element(By.CSS_SELECTOR, 'span[itemprop="locationCreated"]')
        artwork_info["Create Location"] = birth_place_element.text.strip()
    except:
        artwork_info["Create Location"] = None


    article = driver.find_element(By.XPATH, "//article")
    li_elements = article.find_elements(By.TAG_NAME, "li")

    for element in li_elements:
        try:
            field_name = element.find_element(By.TAG_NAME, "s").text.strip().replace(":", "")
        except:
            continue
            
        try:
            if field_name in ["Media", "Style", "Genre"]:
                value_elements = element.find_elements(By.XPATH, ".//a")
            else:
                value_elements = element.find_elements(By.XPATH, ".//span/a | .//a | .//span")
            value_texts = [v.text.strip() for v in value_elements if v.text.strip()]

            if value_texts:
                artwork_info[field_name] = "|".join(list(set(value_texts)))
        except:
            artwork_info[field_name] = None

        try:
            image_element = driver.find_element(By.XPATH, "//img[@itemprop='image']")
            image_url = image_element.get_attribute("src")
            artwork_info["image_url"] = image_url
            #artwork_info["image_n"] = n

            #image_data = requests.get(image_url).content
            #with open(f"artwork_picture/{n}.jpg", "wb") as file:
            #    file.write(image_data)
        except:
            artwork_info["image_url"] = None
            #artwork_info["image_n"] = None

    try:
        tags_elements = driver.find_elements(By.CSS_SELECTOR, ".tags-cheaps__item a.tags-cheaps__item__ref")
        tags = "|".join([tag.text.strip() for tag in tags_elements])
        artwork_info["tags"] = tags
    except:
        artwork_info["tags"] = None


    return artwork_info 


In [4]:
artworks = pd.read_csv("artist_data/artist_artwork.csv")

In [5]:
links = list(artworks["Link"])
links[0]

'https://www.wikiart.org/en/ancient-greek-pottery/attic-middle-geometric-amphora-from-kerameikos--800'

In [6]:
import os

url_found = []
for file in os.listdir("artwork_data"):
    if "revised_artwork_data" in file:
        artwork_data = pd.read_csv("artwork_data/" + file)
        url_found += list(artwork_data["url"])
        #artwork_data["Style"] = ["|".join(style.split("|")[0].split(", "))  if str(style) != "nan" else None for style in list(artwork_data["Style"])]
        #artwork_data["Genre"] = ["|".join(genre.split("|")[0].split(", "))  if str(genre) != "nan" else None for genre in list(artwork_data["Genre"])]
        #artwork_data["Media"] = ["|".join(media.split("|")[0].split(", "))  if str(media) != "nan" else None for media in list(artwork_data["Media"])]
        #artwork_data.to_csv("revised_" + file)

url_unfound = list(set(links) - set(url_found))

In [7]:
len(url_unfound)

104438

In [8]:
from tqdm import tqdm 
import time
import random

In [9]:
def extract_all_keys(dict_list):
    """
    Extract all unique keys from a list of dictionaries.
    Ensures that different dictionaries with different keys are fully accounted for.
    """
    ordered_keys = []
    seen_keys = set()
    
    for dictionary in dict_list:
        for key in dictionary.keys():
            if key not in seen_keys:
                ordered_keys.append(key)
                seen_keys.add(key)
                
    return ordered_keys

In [10]:
count = 7
count

7

In [11]:
len(url_unfound[(count-7)*10000:(count-6)*10000])
print(url_unfound[(count-7)*10000:(count-6)*10000][0])
print(url_unfound[(count-7)*10000:(count-6)*10000][9999])

https://www.wikiart.org/en/paul-gauguin/tree-in-the-farm-yard-1874
https://www.wikiart.org/en/william-adolphe-bouguereau/the-return-of-the-market-1869


In [12]:
artwork_data = []
notfound = []
n = 0

for url in tqdm(url_unfound[(count-7)*10000:(count-6)*10000], desc="Processing Artworks", unit="artwork"):
    try:
        artwork_info = extract_artworkinfo(url)
        if not isinstance(artwork_info, str):
            artwork_data.append(artwork_info)
        else:
            notfound.append((artwork_info))

    except KeyboardInterrupt:
        print("stopped")
        break
    except Exception:
        notfound.append((url))
    
all_keys_artist = extract_all_keys(artwork_data)

with open(f"artwork_data/revised_artwork_data_{count}.csv", mode='w', encoding='utf-8', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=all_keys_artist)

    writer.writeheader()

    for artist in tqdm(artwork_data, desc="Saving artist info", unit="artist"):
        writer.writerow({key: artist.get(key, "") for key in all_keys_artist})

artwork_data = []

Processing Artworks:   0%|          | 0/10000 [00:00<?, ?artwork/s]

Processing Artworks:  52%|█████▏    | 5169/10000 [3:14:18<4:47:48,  3.57s/artwork]

Error (404): https://www.wikiart.org/en/fyodor-vasilyev/pond-at-the-sunset


Processing Artworks:  97%|█████████▋| 9683/10000 [6:03:16<13:56,  2.64s/artwork]  

Error (404): https://www.wikiart.org/en/jacques-louis-david/the-death-of-marat-1793


Processing Artworks: 100%|██████████| 10000/10000 [6:15:13<00:00,  2.25s/artwork]
Saving artist info: 100%|██████████| 9998/9998 [00:00<00:00, 50631.71artist/s]
