# Scraping artworks on wikiart by Selenium

### Author: Yangyu Wang
### Date: Jan 18, 2025

In [1]:
import pandas as pd
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests

In [2]:
driver = webdriver.Firefox()

The geckodriver version (0.34.0) detected in PATH at /usr/local/bin/geckodriver might not be compatible with the detected firefox version (134.0.2); currently, geckodriver 0.35.0 is recommended for firefox 134.*, so it is advised to delete the driver in PATH and retry


In [3]:
def extract_artworkinfo(url):

    driver.get(url)

    artwork_info = {"url": url}
    
    try:
        WebDriverWait(driver, 3).until(
            EC.presence_of_element_located((By.TAG_NAME, "h3"))
        )
    except:
        print("Error (404):", url)
        return url

    try:
        birth_date_element = driver.find_element(By.CSS_SELECTOR, 'span[itemprop="dateCreated"]')
        artwork_info["Create Date"] = birth_date_element.text.strip()
    except:
        artwork_info["Create Date"] = None

    try:
        birth_place_element = driver.find_element(By.CSS_SELECTOR, 'span[itemprop="locationCreated"]')
        artwork_info["Create Location"] = birth_place_element.text.strip()
    except:
        artwork_info["Create Location"] = None


    article = driver.find_element(By.XPATH, "//article")
    li_elements = article.find_elements(By.TAG_NAME, "li")

    for element in li_elements:
        try:
            field_name = element.find_element(By.TAG_NAME, "s").text.strip().replace(":", "")
        except:
            continue
            
        try:
            if field_name in ["Media", "Style", "Genre"]:
                value_elements = element.find_elements(By.XPATH, ".//a")
            else:
                value_elements = element.find_elements(By.XPATH, ".//span/a | .//a | .//span")
            value_texts = [v.text.strip() for v in value_elements if v.text.strip()]

            if value_texts:
                artwork_info[field_name] = "|".join(list(set(value_texts)))
        except:
            artwork_info[field_name] = None

        try:
            image_element = driver.find_element(By.XPATH, "//img[@itemprop='image']")
            image_url = image_element.get_attribute("src")
            artwork_info["image_url"] = image_url
            #artwork_info["image_n"] = n

            #image_data = requests.get(image_url).content
            #with open(f"artwork_picture/{n}.jpg", "wb") as file:
            #    file.write(image_data)
        except:
            artwork_info["image_url"] = None
            #artwork_info["image_n"] = None

    try:
        tags_elements = driver.find_elements(By.CSS_SELECTOR, ".tags-cheaps__item a.tags-cheaps__item__ref")
        tags = "|".join([tag.text.strip() for tag in tags_elements])
        artwork_info["tags"] = tags
    except:
        artwork_info["tags"] = None


    return artwork_info 


In [2]:
artworks = pd.read_csv("artist_data/artist_artwork.csv")

In [3]:
links = list(artworks["Link"])
links[0]

'https://www.wikiart.org/en/ancient-greek-pottery/attic-middle-geometric-amphora-from-kerameikos--800'

In [4]:
import os

url_found = []
for file in os.listdir("artwork_data"):
    if "revised_artwork_data" in file:
        artwork_data = pd.read_csv("artwork_data/" + file)
        url_found += list(artwork_data["url"])
        print(len(set(url_found)), len(set(list(artwork_data["url"]))))
        #artwork_data["Style"] = ["|".join(style.split("|")[0].split(", "))  if str(style) != "nan" else None for style in list(artwork_data["Style"])]
        #artwork_data["Genre"] = ["|".join(genre.split("|")[0].split(", "))  if str(genre) != "nan" else None for genre in list(artwork_data["Genre"])]
        #artwork_data["Media"] = ["|".join(media.split("|")[0].split(", "))  if str(media) != "nan" else None for media in list(artwork_data["Media"])]
        #artwork_data.to_csv("revised_" + file)

url_unfound = list(set(links) - set(url_found))

9992 9992
18484 9996
28477 9993
35536 9997
41422 9995
50486 9995
58708 9973
66058 9995
71126 9998
75349 9994
85324 9975
85327 3
95313 9986
105302 9989
114294 8992
116676 2382
126668 9992
136659 9991
143310 7461
147663 7330
157654 9991
164360 9998
174353 9993


In [5]:
len(url_unfound)

2

In [8]:
from tqdm import tqdm 
import time
import random

In [10]:
def extract_all_keys(dict_list):
    """
    Extract all unique keys from a list of dictionaries.
    Ensures that different dictionaries with different keys are fully accounted for.
    """
    ordered_keys = []
    seen_keys = set()
    
    for dictionary in dict_list:
        for key in dictionary.keys():
            if key not in seen_keys:
                ordered_keys.append(key)
                seen_keys.add(key)
                
    return ordered_keys

In [18]:
count = 23
count

23

In [19]:
artwork_data = []
notfound = []
n = 0

for url in tqdm(url_unfound, desc="Processing Artworks", unit="artwork"):
    try:
        artwork_info = extract_artworkinfo(url)
        if not isinstance(artwork_info, str):
            artwork_data.append(artwork_info)
        else:
            notfound.append((artwork_info))

    except KeyboardInterrupt:
        print("stopped")
        break
    except Exception:
        notfound.append((url))
    
all_keys_artist = extract_all_keys(artwork_data)

with open(f"artwork_data/revised_artwork_data_{count}.csv", mode='w', encoding='utf-8', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=all_keys_artist)

    writer.writeheader()

    for artist in tqdm(artwork_data, desc="Saving artist info", unit="artist"):
        writer.writerow({key: artist.get(key, "") for key in all_keys_artist})

artwork_data = []

Processing Artworks: 100%|██████████| 2/2 [00:05<00:00,  2.67s/artwork]


Error (404): https://www.wikiart.org/en/jacques-louis-david/the-death-of-marat-1793


Saving artist info: 0artist [00:00, ?artist/s]


In [6]:

df_list = [pd.read_csv("artwork_data/" + file) for file in os.listdir("artwork_data")]
merged_df = pd.concat(df_list, ignore_index=True, join="outer")  # Use 'inner' for only common columns



In [26]:
merged_df.drop_duplicates().to_csv("artwork_data/artwork_data_all.csv", index=False)

Unnamed: 0,url,Create Date,Create Location,Date,image_url,Style,Genre,Media,tags,Location,Series,Period,Theme,Share
0,https://www.wikiart.org/en/paul-emile-chabas/l...,1905,,1905|c.1905; France,https://uploads2.wikiart.org/00396/images/paul...,Art Deco,nude painting (nu)|figurative,oil|canvas,female-nude,,,,,
1,https://www.wikiart.org/en/thomas-dewing/the-d...,1886,,1886,https://uploads4.wikiart.org/00114/images/thom...,Romanticism,genre painting,,Mythology,,,,,
2,https://www.wikiart.org/en/john-mclaughlin/num...,1964,,1964,https://uploads0.wikiart.org/images/john-mclau...,Minimalism,abstract,,,,,,,
3,https://www.wikiart.org/en/hans-bellmer/the-wo...,1948,,1948,https://uploads5.wikiart.org/images/hans-bellm...,Surrealism,sketch and study,,,,,,,
4,https://www.wikiart.org/en/eyvind-earle/sierra...,1988,,1988; United States|1988,https://uploads2.wikiart.org/images/eyvind-ear...,Magic Realism,landscape,,cliffs-and-rocks|Natural landscape|Mountain|Sk...,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206006,https://www.wikiart.org/en/robert-smithson/bro...,1971,,1971,https://uploads5.wikiart.org/images/robert-smi...,Environmental (Land) Art,installation,,Water resources|Water|Reservoir|Circle,,,,,
206007,https://www.wikiart.org/en/hafiz-osman/unknown...,,,,https://uploads6.wikiart.org/images/hafiz-osma...,Ottoman Period,calligraphy,,Font|Text,,,,,
206008,https://www.wikiart.org/en/maxime-maufra/passi...,1898,,1898; France|1898,https://uploads4.wikiart.org/images/maxime-mau...,Post-Impressionism,marina,oil|canvas,boats-and-ships|seas-and-oceans|Galway hooker|...,Private Collection,,,,
206009,https://www.wikiart.org/en/camille-corot/orphe...,1865,,1865,https://uploads1.wikiart.org/images/camille-co...,Romanticism,mythological painting,oil|canvas,Greek-and-Roman-Mythology|Orpheus-and-Eurydice...,"Kimbell Art Museum, Fort Worth, TX, US",,,"Other cityscapes, landscapes and seascapes",
