In [1]:
# Attach Google Drive to the Colab VM's filesystem.
from google.colab import drive
drive.mount('/content/drive')

# TODO: Set this to the Drive-relative folder that holds the project files,
# e.g. 'cs231n/assignments/assignment2/'
FOLDERNAME = 'cs231n/project/WikiArt'
assert FOLDERNAME is not None, "[!] Enter the foldername."

# With Drive mounted, add the project folder to sys.path so the Python
# interpreter of the Colab VM can load python files from within it.
import sys
sys.path.append(f'/content/drive/My Drive/{FOLDERNAME}')

Mounted at /content/drive


In [2]:
from re import S
import pandas as pd
import os
import requests
from time import sleep
from urllib.parse import urlparse


# Folder on the mounted Drive that holds the input CSV files.
BASE_DIR = "/content/drive/MyDrive/cs231n/project/WikiArt"

# 1. Load the WikiArt scrape and the raw Artemis release, in that order
wiki_df, artemis_df0 = (
    pd.read_csv(os.path.join(BASE_DIR, csv_name))
    for csv_name in ("wikiart_scraped.csv", 'artemis_dataset_release_v0.csv')
)

# 2. Keep only the Artemis rows whose 'repetition' value is at least 5
#    (.copy() so later column assignments do not touch artemis_df0)
has_enough_repetitions = artemis_df0['repetition'].ge(5)
artemis_df = artemis_df0.loc[has_enough_repetitions].copy()

In [3]:
import pandas as pd
import os
from urllib.parse import urlparse


# 3. Helper functions to parse and build token keys
def parse_wiki_link(url):
    """
    Split a WikiArt image URL into its artist folder and painting name.

    Given a WikiArt link like:
      https://uploads3.wikiart.org/images/vincent-van-gogh/l-arlesienne-portrait-of-madame-ginoux-1890.jpg
    Returns (artist_folder, painting_name_no_ext), both lowercase.

    Returns (None, None) when the link cannot be parsed:
      * `url` is not a string (e.g. a NaN coming from an empty CSV cell),
      * `url` is malformed enough that urlparse raises ValueError,
      * the URL path has fewer than two segments.
    This mirrors parse_artemis_painting, so unparseable rows can be removed
    with dropna() instead of crashing the whole column parse.
    """
    # urlparse() raises on non-str input such as float NaN or None.
    if not isinstance(url, str):
        return None, None
    try:
        path = urlparse(url).path
    except ValueError:
        # e.g. "Invalid IPv6 URL" for an unbalanced '[' in the host part
        return None, None

    parts = path.strip('/').split('/')
    if len(parts) < 2:
        return None, None

    # The last two path segments are "<artist-folder>/<file-name>".
    artist_folder, filename = parts[-2], parts[-1]
    painting_name = os.path.splitext(filename)[0]
    return artist_folder.lower(), painting_name.lower()

def parse_artemis_painting(p):
    """
    Split an Artemis 'painting' identifier at its first underscore.

    Example input:
      vincent-van-gogh_portrait-of-madame-ginoux-l-arlesienne-1890
    Returns (artist_part, painting_part), both lowercase, or (None, None)
    when `p` is not a string or contains no underscore.
    """
    if isinstance(p, str):
        # partition() splits at the first '_' only; an empty separator
        # means the string had no underscore at all.
        artist_part, separator, painting_part = p.partition('_')
        if separator:
            return artist_part.lower(), painting_part.lower()
    return None, None

def token_key(s):
    """
    Build an order-insensitive key from a dash-separated name.

    The string is split on '-', empty pieces are discarded, the remaining
    tokens are sorted alphabetically and joined with single spaces, so two
    names made of the same tokens in a different order get the same key.
    """
    pieces = list(filter(None, s.split('-')))
    pieces.sort()
    return ' '.join(pieces)

# 4. Parse both DataFrames into (artist, painting) columns.
#    Each parser returns a 2-tuple, so map() the parser over the column and
#    build the two new columns with a single DataFrame construction. This
#    avoids creating one pd.Series object per row, which is what
#    `.apply(lambda x: pd.Series(...))` does and which is very slow on
#    large frames. Rows that fail to parse hold None in both columns.
wiki_df[['wiki_artist', 'wiki_painting']] = pd.DataFrame(
    wiki_df['Link'].map(parse_wiki_link).tolist(),
    index=wiki_df.index,
    columns=['wiki_artist', 'wiki_painting'],
)
artemis_df[['artemis_artist', 'artemis_painting']] = pd.DataFrame(
    artemis_df['painting'].map(parse_artemis_painting).tolist(),
    index=artemis_df.index,
    columns=['artemis_artist', 'artemis_painting'],
)

# 5. Drop any rows where parsing failed
wiki_filtered = wiki_df.dropna(subset=['wiki_artist', 'wiki_painting']).copy()
artemis_filtered = artemis_df.dropna(subset=['artemis_artist', 'artemis_painting']).copy()

# 6. Build token_key for painting names (order-insensitive match key)
wiki_filtered['wiki_key'] = wiki_filtered['wiki_painting'].map(token_key)
artemis_filtered['artemis_key'] = artemis_filtered['artemis_painting'].map(token_key)

# 7. Deduplicate wiki_filtered by (wiki_artist, wiki_key); the first row of
#    each group is kept, and only the columns needed for the join survive
wiki_unique = wiki_filtered.drop_duplicates(subset=['wiki_artist', 'wiki_key'])[
    ['wiki_artist', 'wiki_key', 'Link']
]

# 8. Deduplicate artemis_filtered by (artemis_artist, artemis_key)
artemis_unique = artemis_filtered.drop_duplicates(subset=['artemis_artist', 'artemis_key']).copy()

# 9. Merge artemis_unique with wiki_unique on (artist, key).
#    Both sides were just deduplicated on their join keys, so the join must
#    be one-to-one; `validate` raises pandas.errors.MergeError if it is not.
merged = pd.merge(
    artemis_unique,
    wiki_unique,
    left_on=['artemis_artist', 'artemis_key'],
    right_on=['wiki_artist', 'wiki_key'],
    how='inner',
    validate='one_to_one',
)

# 10a. Extract unique URLs and assign each a unique integer index
unique_links = (
    merged[['Link']]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={'Link': 'url'})
)
unique_links['url_id'] = unique_links.index  # 0, 1, 2, …

# 10b. Merge url_id back onto merged via the URL
#      (this also adds a 'url' column that duplicates 'Link')
merged = merged.merge(
    unique_links,
    left_on='Link',
    right_on='url',
    how='left',
    validate='many_to_one',
)

# 11. Now merge back to artemis_filtered to keep all original rows
#     (emotion, art_style, etc.). `merged` has one row per (artist, key),
#     so every Artemis row picks up at most one url_id / Link.
artemis_merged = pd.merge(
    artemis_filtered,
    merged[['url_id', 'Link', 'wiki_artist', 'wiki_key']],
    left_on=['artemis_artist', 'artemis_key'],
    right_on=['wiki_artist', 'wiki_key'],
    how='inner',
    validate='many_to_one',
)

# Sanity report on the per-painting join: the three numbers are equal when
# every row of `merged` has its own URL and its own url_id.
print("Number of rows in merged:", len(merged))
print("Number of unique URLs:", merged['Link'].nunique())
print("Number of unique url_ids:", merged['url_id'].nunique())

# 12 + 13. Drop rows that are duplicated across every column, then append a
#          combined "<art_style>_<emotion>" label as a new last column
artemis_unique_rows = artemis_merged.drop_duplicates().assign(
    style_emotion=lambda frame: frame['art_style'] + '_' + frame['emotion']
)

# 14. Write the labelled rows to a CSV next to the inputs (no index column)
output_path = os.path.join(BASE_DIR, 'artemis_with_style_emotion.csv')
print("Saving to CSV...")
artemis_unique_rows.to_csv(output_path, index=False)
print("Done Saving.")

#  Distribution of emotion alone.
#  rename_axis(...).reset_index(name=...) names both output columns
#  explicitly. The previous `.reset_index().rename(columns={'index': ...})`
#  relied on value_counts() naming its index column 'index'; on pandas >= 2.0
#  that column is already named after the source column and the counts column
#  is called 'count', so the rename put 'url_count' on the label column.
#  NOTE: value_counts() counts rows of artemis_unique_rows, not distinct
#  URLs, so a total here can exceed the number of unique URLs.
if 'emotion' in artemis_unique_rows.columns:
    counts_per_emotion = (
        artemis_unique_rows['emotion']
        .value_counts()
        .rename_axis('emotion')
        .reset_index(name='url_count')
    )
    print("\nURLs per emotion label:")
    print(counts_per_emotion.to_string(index=False))
else:
    print("\nNo 'emotion' column found in the unique set.")

# 15. Distribution of the new combined style_emotion column
#     (same explicit column naming as above: labels in 'style_emotion',
#     row counts in 'count')
counts_per_style_emotion = (
    artemis_unique_rows['style_emotion']
    .value_counts()
    .rename_axis('style_emotion')
    .reset_index(name='count')
)
print("\nURLs per (style x emotion) combined label:")
print(counts_per_style_emotion.to_string(index=False))

# 16. Print summary of sizes for every stage of the pipeline
print(f"\nwiki_df rows:                          {wiki_df.shape[0]}  cols: {wiki_df.shape[1]}")
# The "original" line reports artemis_df0, the frame as read from the CSV;
# artemis_df has already been filtered to repetition >= 5.
print(f"artemis_df (original) rows:           {artemis_df0.shape[0]}  cols: {artemis_df0.shape[1]}")
# artemis_filtered = the repetition >= 5 rows whose 'painting' value parsed
print(f"artemis_df (repetition >= 5) rows:    {artemis_filtered.shape[0]}  cols: {artemis_filtered.shape[1]}")
print(f"artemis_unique (one per painting) rows: {artemis_unique.shape[0]}  cols: {artemis_unique.shape[1]}")
print(f"merged rows (one per painting):       {merged.shape[0]}  cols: {merged.shape[1]}")
print(f"artemis_unique_rows (fully unique)    {artemis_unique_rows.shape[0]}  cols: {artemis_unique_rows.shape[1]}")
print(f"Unique paintings in Artemis (rep>=5): {artemis_filtered['painting'].nunique()}")
print(f"Unique paintings in merged:           {merged['painting'].nunique()}")

Number of rows in merged: 43640
Number of unique URLs: 43640
Number of unique url_ids: 43640
Saving to CSV...
Done Saving.

URLs per emotion label:
     url_count  count
   contentment  61222
           awe  38982
something else  32172
     amusement  28961
       sadness  25896
          fear  23557
    excitement  20981
       disgust  13827
         anger   4462

URLs per (style x emotion) combined label:
                                    count  count
                Impressionism_contentment   8850
           Post_Impressionism_contentment   6729
                      Realism_contentment   5764
                       Rococo_contentment   3333
                Expressionism_contentment   3270
                    Symbolism_contentment   3061
           Art_Nouveau_Modern_contentment   2999
                        Impressionism_awe   2989
                            Symbolism_awe   2965
                 Northern_Renaissance_awe   2958
                  Romanticism_contentment   2886


In [4]:
# Count how many rows of `merged` carry each url_id. As the last expression
# of the cell, the resulting Series is displayed by the notebook; a count of
# 1 everywhere means no url_id is shared between rows.
merged['url_id'].value_counts()

Unnamed: 0_level_0,count
url_id,Unnamed: 1_level_1
43639,1
0,1
1,1
2,1
3,1
...,...
12,1
11,1
10,1
9,1


In [5]:
import os
import requests
from urllib.parse import urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm  # for a progress bar

# 1. Image output folder under BASE_DIR; created on first run, reused after
SAVE_DIR = os.path.join(BASE_DIR, 'artemis_images')
os.makedirs(SAVE_DIR, exist_ok=True)

# 2. Build the download work list: one (url_id, url) tuple per row of
#    `merged`, which must already have the 'url_id' and 'Link' columns
url_ids = merged['url_id'].tolist()
links = merged['Link'].tolist()
urls_to_download = list(zip(url_ids, links))

def download_one(item, retries=3, backoff=1.0):
    """
    Downloads a single (url_id, url) tuple into SAVE_DIR.

    Saves the file as "<url_id><ext>" (e.g. "0.jpg"), where <ext> is taken
    from the URL path. The body is first written to "<name>.part" and then
    moved into place with os.replace, so a failed write never leaves a
    truncated image under the final name.

    Parameters
    ----------
    item : tuple
        (url_id, url) pair.
    retries : int, optional
        Maximum number of attempts (at least one is always made). Failed
        attempts are retried, except HTTP 4xx responses other than 429,
        which are returned as failures immediately.
    backoff : float, optional
        Base delay in seconds; the wait before attempt k+1 is backoff * k.

    Returns
    -------
    None on success, or (url_id, url, error_msg) on failure, where error_msg
    is the text of the last exception. Exceptions are never propagated, so
    the function can be submitted to an executor as-is.
    """
    # Local import keeps this cell independent of imports made in other cells.
    from time import sleep

    url_id, url = item
    ext = os.path.splitext(urlparse(url).path)[1]  # e.g. ".jpg" or ".png"
    save_path = os.path.join(SAVE_DIR, f"{url_id}{ext}")
    tmp_path = f"{save_path}.part"

    attempts = max(1, int(retries))
    error_msg = ""
    for attempt in range(1, attempts + 1):
        try:
            r = requests.get(url, timeout=10)
            r.raise_for_status()
            with open(tmp_path, 'wb') as f:
                f.write(r.content)
            os.replace(tmp_path, save_path)
            return None
        except Exception as e:
            # Best-effort download: record the error instead of raising.
            error_msg = str(e)
            # Compare against None explicitly: a requests Response object is
            # falsy when its status is an error.
            response = getattr(e, 'response', None)
            status = getattr(response, 'status_code', None)
            client_error = status is not None and 400 <= status < 500 and status != 429

        if client_error or attempt == attempts:
            break
        sleep(backoff * attempt)

    # Remove a partial temp file left behind by a failed write, if any.
    try:
        os.remove(tmp_path)
    except OSError:
        pass  # nothing to clean up (the temp file was never created)
    return (url_id, url, error_msg)

# 3. Fan the downloads out over a thread pool and collect the failures
MAX_WORKERS = 20
failed = []

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
    # One future per (url_id, url) item, keyed back to the item it handles
    futures = {pool.submit(download_one, item): item for item in urls_to_download}
    # as_completed yields futures as they finish, so the bar tracks real progress
    progress = tqdm(as_completed(futures), total=len(futures), desc="Downloading")
    outcomes = (done.result() for done in progress)
    # download_one returns None on success and an error tuple on failure
    failed.extend(outcome for outcome in outcomes if outcome is not None)

# 4. Report how many downloads succeeded and show a few failures
total = len(urls_to_download)
success = total - len(failed)
print(f"\nTotal attempted: {total}")
print(f"Downloaded successfully: {success}")
print(f"Failed downloads: {len(failed)}")
if failed:
    print("Some failure examples:", failed[:5])

Downloading: 100%|██████████| 43640/43640 [38:41<00:00, 18.80it/s]


Total attempted: 43640
Downloaded successfully: 43608
Failed downloads: 32
Some failure examples: [(4394, 'https://uploads6.wikiart.org/images/sam-francis/untitled-1953-1.jpg', '520 Server Error: <none> for url: https://uploads6.wikiart.org/images/sam-francis/untitled-1953-1.jpg'), (6802, 'https://uploads2.wikiart.org/images/egon-schiele/self-portrait-with-hand-to-cheek-1910.jpg', '520 Server Error: <none> for url: https://uploads2.wikiart.org/images/egon-schiele/self-portrait-with-hand-to-cheek-1910.jpg'), (8535, 'https://uploads4.wikiart.org/images/forrest-bess/it-fits-1955.jpg', '520 Server Error: <none> for url: https://uploads4.wikiart.org/images/forrest-bess/it-fits-1955.jpg'), (11494, 'https://uploads0.wikiart.org/images/paul-gauguin/breton-woman-1886.jpg', '520 Server Error: <none> for url: https://uploads0.wikiart.org/images/paul-gauguin/breton-woman-1886.jpg'), (12068, 'https://uploads3.wikiart.org/images/rembrandt/the-shepards-and-the-family-1644.jpg', '520 Server Error: <n


