In [2]:
import pandas as pd

artemis = pd.read_csv('/Users/sunnyyu/Desktop/Cap4Art/data/artemis_dataset_release_v0.csv')

# get the unique value of paintings
artemis


Unnamed: 0,art_style,painting,emotion,utterance,repetition
0,Post_Impressionism,vincent-van-gogh_portrait-of-madame-ginoux-l-a...,something else,"She seems very happy in the picture, and you w...",10
1,Post_Impressionism,vincent-van-gogh_portrait-of-madame-ginoux-l-a...,sadness,This woman has really knotty hands which makes...,10
2,Post_Impressionism,vincent-van-gogh_portrait-of-madame-ginoux-l-a...,something else,"When looking at this woman, I am filled with c...",10
3,Post_Impressionism,vincent-van-gogh_portrait-of-madame-ginoux-l-a...,contentment,"A woman looking at ease, peaceful, and satisfi...",10
4,Post_Impressionism,vincent-van-gogh_portrait-of-madame-ginoux-l-a...,awe,She looks like a lady from that past that migh...,10
...,...,...,...,...,...
454679,Cubism,gino-severini_a-dancer-1,awe,the collection and collage of different colors...,48
454680,Romanticism,ivan-aivazovsky_sea-at-night-1861,awe,The peaceful reflections of the moonlight on t...,8
454681,Romanticism,ivan-aivazovsky_sea-at-night-1861,excitement,I can imagine the sailors resting this peacefu...,8
454682,Romanticism,ivan-aivazovsky_sea-at-night-1861,contentment,The steep mountains and the moonlight provide ...,8


In [3]:
links = pd.read_csv('/Users/sunnyyu/Desktop/Cap4Art/data/wikiart_scraped.csv')
links

Unnamed: 0,Style,Artwork,Artist,Date,Link
0,Early-Dynastic,Narmer Palette,Ancient Egypt,3050 BC,https://uploads3.wikiart.org/00265/images/anci...
1,Early-Dynastic,Box Inlay with a Geometric Pattern,Ancient Egypt,3100-2900 BC,https://uploads2.wikiart.org/00244/images/anci...
2,Old-Kingdom,Khafre Enthroned,Ancient Egypt,2570 BC,https://uploads2.wikiart.org/00305/images/anci...
3,Middle-Kingdom,Stele of the Serpent King (Stela of Djet),Ancient Egypt,3000 BC,https://uploads7.wikiart.org/00305/images/anci...
4,Middle-Kingdom,"Laden Donkeys and Ploughing, Tomb of Djar",Ancient Egypt,2060-2010 BC,https://uploads8.wikiart.org/00244/images/anci...
...,...,...,...,...,...
124165,Street-Photography,Portrait of the corn stalk,Alfred Freddy Krupa,2019,https://uploads5.wikiart.org/00241/images/alfr...
124166,Street-Photography,The other side of life,Alfred Freddy Krupa,2019,https://uploads7.wikiart.org/00241/images/alfr...
124167,Street-Photography,The bonfire during construction,Alfred Freddy Krupa,2019,https://uploads7.wikiart.org/00242/images/alfr...
124168,Street-Photography,Limpidity,Alfred Freddy Krupa,2019,https://uploads7.wikiart.org/00248/images/alfr...


In [4]:
import re

# Make a copy to work on
df_artemis_processed = artemis.copy()

# Function to process Artemis painting string
def parse_painting_string(painting_str):
    # Split into artist and artwork-year part
    parts = painting_str.split('_', 1)
    if len(parts) != 2:
        return None, None, None

    artist_raw, artwork_year_raw = parts
    # Reformat artist: dashes to spaces, title case
    artist = artist_raw.replace('-', ' ').title()

    # Extract year if exists (assumes last hyphen and 4 digits)
    match = re.search(r'-(\d{4})$', artwork_year_raw)
    if match:
        year = match.group(1)
        artwork_raw = artwork_year_raw[:match.start()]
    else:
        year = None
        artwork_raw = artwork_year_raw

    # Reformat artwork: dashes to spaces, title case
    artwork = artwork_raw.replace('-', ' ').title()

    return artist, artwork, year

# Apply to Artemis dataset
parsed = df_artemis_processed['painting'].apply(parse_painting_string)
df_artemis_processed[['artist_parsed', 'artwork_parsed', 'year_parsed']] = pd.DataFrame(parsed.tolist(), index=df_artemis_processed.index)

# Show sample
df_artemis_processed[['painting', 'artist_parsed', 'artwork_parsed', 'year_parsed']].head(5)


Unnamed: 0,painting,artist_parsed,artwork_parsed,year_parsed
0,vincent-van-gogh_portrait-of-madame-ginoux-l-a...,Vincent Van Gogh,Portrait Of Madame Ginoux L Arlesienne,1890
1,vincent-van-gogh_portrait-of-madame-ginoux-l-a...,Vincent Van Gogh,Portrait Of Madame Ginoux L Arlesienne,1890
2,vincent-van-gogh_portrait-of-madame-ginoux-l-a...,Vincent Van Gogh,Portrait Of Madame Ginoux L Arlesienne,1890
3,vincent-van-gogh_portrait-of-madame-ginoux-l-a...,Vincent Van Gogh,Portrait Of Madame Ginoux L Arlesienne,1890
4,vincent-van-gogh_portrait-of-madame-ginoux-l-a...,Vincent Van Gogh,Portrait Of Madame Ginoux L Arlesienne,1890


In [5]:
# Full matching code

# 1. Prepare Artemis keys for matching
df_artemis_processed['artist_key'] = df_artemis_processed['artist_parsed'].str.lower().str.strip()
df_artemis_processed['artwork_key'] = df_artemis_processed['artwork_parsed'].str.lower().str.strip()

# 2. Prepare WikiArt keys for matching
links['artist_key'] = links['Artist'].str.lower().str.strip()
links['artwork_key'] = links['Artwork'].str.lower().str.strip()

# 3. Create a dictionary from WikiArt for fast lookup
wikiart_lookup = {(row['artist_key'], row['artwork_key']): row['Link'] for _, row in links.iterrows()}

# 4. Function to find matching link
def find_link(row):
    key = (row['artist_key'], row['artwork_key'])
    return wikiart_lookup.get(key, None)  # return None if not found

# 5. Apply matching
df_artemis_processed['link'] = df_artemis_processed.apply(find_link, axis=1)

# 6. Show result
match_rate = df_artemis_processed['link'].notnull().mean()
matches_preview = df_artemis_processed[['painting', 'artist_parsed', 'artwork_parsed', 'link']].head(10)




In [7]:
# drop the columns where link is na
df_artemis_processed = df_artemis_processed.dropna(subset=['link'])
df_artemis_processed.to_csv('/Users/sunnyyu/Desktop/Cap4Art/data/artemis_images_with_links.csv', index=False)

scraping code

In [10]:
import os
import requests
from tqdm import tqdm

# Directory to save images
save_dir = './images'
os.makedirs(save_dir, exist_ok=True)

# Use only the rows where a link is available
df_to_scrape = df_artemis_processed[df_artemis_processed['link'].notnull()]

# Function to download and save an image
def download_image(row):
    url = row['link']
    painting_name = row['painting']
    
    # Make a safe filename (remove problematic characters)
    safe_filename = painting_name.replace('/', '_').replace('\\', '_') + '.jpg'
    # add a new column in the df that's the filename
    df_artemis_processed.loc[df_artemis_processed['painting'] == painting_name, 'filename'] = safe_filename
    save_path = os.path.join(save_dir, safe_filename)
    
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # raise an error for bad status codes
        with open(save_path, 'wb') as f:
            f.write(response.content)
        return True
    except Exception as e:
        # You can log e somewhere if you want to debug failed downloads
        return False

# Scrape images with a progress bar
success_count = 0
for _, row in tqdm(df_to_scrape.iterrows(), total=len(df_to_scrape), desc="Downloading images"):
    success = download_image(row)
    if success:
        success_count += 1

print(f"Successfully downloaded {success_count} out of {len(df_to_scrape)} images.")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_artemis_processed.loc[df_artemis_processed['painting'] == painting_name, 'filename'] = safe_filename
Downloading images:   0%|          | 29/157867 [00:22<33:45:43,  1.30it/s]


KeyboardInterrupt: 

In [9]:
import os
import requests
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# Directory to save images
save_dir = './images'
os.makedirs(save_dir, exist_ok=True)

# Only use rows with a link
df_to_scrape = df_artemis_processed[df_artemis_processed['link'].notnull()]

# Function to download and save an image
def download_image(row):
    url = row['link']
    painting_name = row['painting']
    
    # Safe filename
    safe_filename = painting_name.replace('/', '_').replace('\\', '_') + '.jpg'
    df_artemis_processed.loc[df_artemis_processed['painting'] == painting_name, 'filename'] = safe_filename
    save_path = os.path.join(save_dir, safe_filename)
    
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        with open(save_path, 'wb') as f:
            f.write(response.content)
        return True
    except Exception:
        return False

# Prepare rows for easy parallel execution
rows = [row for _, row in df_to_scrape.iterrows()]

# Parallel download with live tqdm
success_count = 0
max_workers = 50  # Number of parallel threads

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = [executor.submit(download_image, row) for row in rows]
    with tqdm(total=len(futures), desc="Downloading images") as pbar:
        for future in as_completed(futures):
            if future.result():
                success_count += 1
            pbar.update(1)  # Update tqdm for every finished download

print(f"✅ Successfully downloaded {success_count} out of {len(rows)} images.")


Downloading images: 100%|██████████| 157867/157867 [5:03:48<00:00,  8.66it/s]       

✅ Successfully downloaded 103016 out of 157867 images.



