In [2]:
import csv
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
# Load the author page links (the 'author_link' column) into a plain list.
author_table = pd.read_csv('./data/data_ukiyo-e_authors.csv')
authors = author_table['author_link'].tolist()
print(len(authors))

In [None]:
def crawl_artist_page(artist_url):
    """Yield one record per artwork thumbnail found on an artist's listing pages.

    Each page is fetched with ``requests`` and parsed with BeautifulSoup. Every
    ``<div class="img">`` contributes a record, and the link inside
    ``<span class="next">`` is followed until a page has none (or points at a
    page that was already visited).

    Args:
        artist_url: URL of the first listing page to crawl.

    Yields:
        dict: ``{'Link': ..., 'Title': ..., 'Artist': ...}``. ``Link`` is
        resolved against the URL of the page it was found on; ``Title`` and
        ``Artist`` are ``None`` when the markup does not provide them.
        Thumbnails without an ``<a class="img">`` carrying an ``href`` are
        skipped.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'
    }

    # Remember visited pages so a "next" link that loops back cannot spin forever.
    seen_pages = set()
    while artist_url and artist_url not in seen_pages:
        seen_pages.add(artist_url)
        # Without a timeout a stalled connection would block the crawl indefinitely.
        response = requests.get(artist_url, headers=headers, timeout=30)
        soup = BeautifulSoup(response.content, 'html.parser')

        for img_div in soup.find_all('div', class_='img'):
            image_anchor = img_div.find('a', class_='img')
            href = image_anchor.get('href') if image_anchor is not None else None
            if not href:
                # Nothing to link to; skip instead of aborting the whole crawl.
                continue
            artist_anchor = img_div.find('a', class_='artist')
            yield {
                'Link': urljoin(artist_url, href),
                'Title': image_anchor.get('title'),
                'Artist': artist_anchor.get_text(strip=True) if artist_anchor is not None else None
            }

        next_page = soup.find('span', class_='next')
        next_anchor = next_page.find('a') if next_page is not None else None
        next_href = next_anchor.get('href') if next_anchor is not None else None
        # urljoin leaves absolute URLs untouched and resolves relative ones.
        artist_url = urljoin(artist_url, next_href) if next_href else None

In [None]:
# Crawl every author's listing pages and stream the artwork links into a CSV.
fieldnames = ['Link', 'Title', 'Artist']

with open('./data/ukiyo-e_artworks.csv', mode='w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # Report progress against the real number of authors instead of a fixed 150,
    # so the message stays correct when the input CSV changes.
    total_authors = len(authors)
    for position, artist_url in enumerate(authors, start=1):
        for artwork_data in crawl_artist_page(artist_url):
            writer.writerow(artwork_data)
        print(f'{position}/{total_authors}, Finished crawling {artist_url}')


In [15]:
def extract_artwork_details(artwork_url):
    """Fetch one artwork page and yield a single record of its metadata.

    The page is downloaded with ``requests`` and parsed with BeautifulSoup.
    Each field is looked up independently, so missing markup for one field
    does not affect the others.

    Args:
        artwork_url: URL of the artwork detail page.

    Yields:
        dict: exactly one mapping with the keys ``'Image URL'``, ``'Artist'``,
        ``'Title'``, ``'Date'``, ``'Details'``, ``'Source'``, ``'Description'``
        and ``'Similar Prints'``. A field is ``None`` when its markup is
        missing; ``'Similar Prints'`` is a list of hrefs, or ``None`` if any
        ``<div class="img">`` lacks a linked anchor.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'
    }

    # Without a timeout a stalled connection would block the crawl indefinitely.
    response = requests.get(artwork_url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    def _or_none(lookup):
        """Run ``lookup``; return None when the expected markup is absent."""
        try:
            return lookup()
        # A missing tag shows up as AttributeError/TypeError on None and a
        # missing attribute as KeyError. Anything else (KeyboardInterrupt,
        # SystemExit, real bugs) must propagate rather than be swallowed.
        except (AttributeError, KeyError, TypeError):
            return None

    def _description():
        """Text of the row that follows the 'Description:' label row."""
        label = soup.find('strong', string='Description:')
        label_row = label.find_parent('p', class_='row')
        text_row = label_row.find_next_sibling('p', class_='row')
        return text_row.find('span', class_='col-xs-9').get_text(strip=True)

    def _similar_prints():
        """Href of the first anchor inside every ``<div class="img">``."""
        return [
            img_div.find('a')['href']
            for img_div in soup.find_all('div', class_='img')
        ]

    yield {
        # Scaled preview; per the original author's note the large image is
        # the href of the <a> inside the same 'imageholder' div.
        'Image URL': _or_none(lambda: soup.find('div', class_='imageholder').find('img')['src']),
        'Artist': _or_none(lambda: soup.find('p', class_='row artist').find('a').get_text(strip=True)),
        'Title': _or_none(lambda: soup.find('p', class_='row title').find('span').get_text(strip=True)),
        'Date': _or_none(lambda: soup.find('p', class_='row date').find('span').get_text(strip=True)),
        'Details': _or_none(lambda: soup.find('p', class_='row details').find('a')['href']),
        'Source': _or_none(lambda: soup.find('p', class_='row source').find('a')['href']),
        'Description': _or_none(_description),
        'Similar Prints': _or_none(_similar_prints)
    }


In [16]:
# Load the artwork page URLs (the 'Link' column written by the crawl above).
artwork_table = pd.read_csv('./data/ukiyo-e_artworks.csv')
artworks = artwork_table['Link'].tolist()
print(len(artworks))

177985


In [17]:
# Visit every artwork page and write its extracted record to the details CSV.
fieldnames = [
    'Image URL',
    'Artist',
    'Title',
    'Date',
    'Details',
    'Source',
    'Description',
    'Similar Prints',
]

details_path = './data/ukiyo-e_artworks_details.csv'
with open(details_path, mode='w', newline='', encoding='utf-8') as details_file:
    details_writer = csv.DictWriter(details_file, fieldnames=fieldnames)
    details_writer.writeheader()

    for i, artwork_url in enumerate(artworks):
        # The extractor is a generator; write whatever records it produces.
        details_writer.writerows(extract_artwork_details(artwork_url))

        # Only report progress once every 5000 artworks.
        if (i+1) % 5000 != 0:
            continue
        print(f'{i+1}/{len(artworks)}, Finished crawling {artwork_url}')


5000/177985, Finished crawling https://ukiyo-e.org/image/ritsumei/Z0166-287
10000/177985, Finished crawling https://ukiyo-e.org/image/mfa/sc157376
15000/177985, Finished crawling https://ukiyo-e.org/image/artelino/43707g1
20000/177985, Finished crawling https://ukiyo-e.org/image/bm/AN00515121_001_l
25000/177985, Finished crawling https://ukiyo-e.org/image/waseda/100-6828
30000/177985, Finished crawling https://ukiyo-e.org/image/metro/N280-002
35000/177985, Finished crawling https://ukiyo-e.org/image/waseda/118-0087
40000/177985, Finished crawling https://ukiyo-e.org/image/waseda/002-0352
45000/177985, Finished crawling https://ukiyo-e.org/image/bm/AN00799763_001_l
50000/177985, Finished crawling https://ukiyo-e.org/image/mak/11486-52
55000/177985, Finished crawling https://ukiyo-e.org/image/jaodb/Kunisada_1_Utagawa-Ukiyo_e_Comparison_of_Genji-CH5_Wakamurasaki-00030017-020302-F06
60000/177985, Finished crawling https://ukiyo-e.org/image/artelino/11335g1
65000/177985, Finished crawling h