In [22]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import csv

In [5]:
# Load the author index and keep only its 'author_link' column as a plain list.
authors = pd.read_csv('./data/data_ukiyo-e_authors.csv')['author_link'].tolist()
print(len(authors))

150


In [29]:
def crawl_artist_page(artist_url, timeout=30):
    """Yield one record per artwork tile found on an artist's listing pages.

    Starting at ``artist_url``, each page is downloaded and parsed, every
    ``<div class="img">`` tile is turned into a record, and the link inside
    ``<span class="next">`` is followed until there is none left.

    Args:
        artist_url: URL of the first listing page for one artist.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection raises instead of blocking the crawl forever.

    Yields:
        dict with the keys ``'Link'``, ``'Title'`` and ``'Artist'``.
        ``'Title'`` / ``'Artist'`` are ``None`` when the tile lacks them.
        Tiles without an ``a.img`` anchor or without an ``href`` are skipped.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            4xx/5xx response (an error page is never parsed as results).
    """
    # Function-scope import keeps this cell self-contained.
    from urllib.parse import urljoin

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'
    }

    # Remember fetched pages so a "next" link that points back at an
    # already-seen page cannot trap the generator in an endless loop.
    visited = set()

    while artist_url and artist_url not in visited:
        visited.add(artist_url)

        response = requests.get(artist_url, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        for img_div in soup.find_all('div', class_='img'):
            img_anchor = img_div.find('a', class_='img')
            if img_anchor is None or not img_anchor.get('href'):
                # Not an artwork tile (or a malformed one): nothing to record.
                continue
            artist_anchor = img_div.find('a', class_='artist')

            yield {
                # urljoin leaves absolute URLs untouched and resolves relative ones.
                'Link': urljoin(artist_url, img_anchor['href']),
                'Title': img_anchor.get('title'),
                'Artist': (
                    artist_anchor.get_text(strip=True)
                    if artist_anchor is not None
                    else None
                ),
            }

        next_span = soup.find('span', class_='next')
        next_anchor = next_span.find('a') if next_span is not None else None
        next_href = next_anchor.get('href') if next_anchor is not None else None
        # A missing/empty href ends the pagination via the loop condition.
        artist_url = urljoin(artist_url, next_href) if next_href else None

In [31]:
# Crawl every artist's listing pages and stream the artwork rows to CSV.
fieldnames = ['Link', 'Title', 'Artist']
# Artists whose crawl hit a network/HTTP error; kept so they can be retried
# instead of aborting the whole run (the file is opened in 'w' mode, so a
# restart would throw away everything written so far).
failed_artists = []

with open('./data/ukiyo-e_artworks.csv', mode='w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # Derive the total from the data rather than hard-coding it.
    total = len(authors)
    for i, artist_url in enumerate(authors, start=1):
        try:
            for artwork_data in crawl_artist_page(artist_url):
                writer.writerow(artwork_data)
        except requests.RequestException as exc:
            # Rows already written for this artist stay in the file; the
            # artist is recorded so the gap is visible and can be re-crawled.
            failed_artists.append(artist_url)
            print(f'{i}/{total}, FAILED crawling {artist_url}: {exc}')
            continue
        print(f'{i}/{total}, Finished crawling {artist_url}')

if failed_artists:
    print(f'{len(failed_artists)} artist(s) failed; see failed_artists')


1/150, Finished crawling https://ukiyo-e.org/artist/okumura-masanobu
2/150, Finished crawling https://ukiyo-e.org/artist/hishikawa-moronobu
3/150, Finished crawling https://ukiyo-e.org/artist/torii-kiyomasu-ii
4/150, Finished crawling https://ukiyo-e.org/artist/nishikawa-sukenobu
5/150, Finished crawling https://ukiyo-e.org/artist/torii-kiyonobu-ii
6/150, Finished crawling https://ukiyo-e.org/artist/torii-kiyomasu-i
7/150, Finished crawling https://ukiyo-e.org/artist/nishimura-shigenaga
8/150, Finished crawling https://ukiyo-e.org/artist/torii-kiyonobu-i
9/150, Finished crawling https://ukiyo-e.org/artist/torii-kiyotada-i
10/150, Finished crawling https://ukiyo-e.org/artist/okumura-toshinobu
11/150, Finished crawling https://ukiyo-e.org/artist/katsukawa-shunsho
12/150, Finished crawling https://ukiyo-e.org/artist/suzuki-harunobu
13/150, Finished crawling https://ukiyo-e.org/artist/isoda-koryusai
14/150, Finished crawling https://ukiyo-e.org/artist/katsukawa-shunko
15/150, Finished craw

In [41]:
def extract_artwork_details(artwork_url, timeout=30):
    """Download one artwork page and yield a single record describing it.

    This is a generator that yields exactly one dict, so callers iterate
    over it. Each field is extracted independently: when the element that
    holds a field is absent from the page, that field is ``None`` and the
    remaining fields are still filled in.

    Args:
        artwork_url: URL of the artwork page to fetch.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection raises instead of blocking the crawl forever.

    Yields:
        dict with the keys ``'Image URL'``, ``'Artist'``, ``'Artist Link'``,
        ``'Title'``, ``'Date'``, ``'Details'``, ``'Source'`` and
        ``'Similar Prints'`` (a list of hrefs, empty when there are none).

    Raises:
        requests.RequestException: on connection errors or timeouts.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'
    }

    response = requests.get(artwork_url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Explicit None checks replace the former bare ``except:`` clauses, which
    # also swallowed KeyboardInterrupt/SystemExit while a page was parsed.
    def child_of(parent_name, parent_class, child_name):
        """Return the first ``child_name`` tag inside the matching parent, or None."""
        parent = soup.find(parent_name, class_=parent_class)
        return None if parent is None else parent.find(child_name)

    def attr_of(tag, attr_name):
        """Return ``tag[attr_name]``, or None if the tag or attribute is missing."""
        return None if tag is None else tag.get(attr_name)

    def text_of(tag):
        """Return the stripped text of ``tag``, or None if the tag is missing."""
        return None if tag is None else tag.get_text(strip=True)

    # Looked up once and reused for both the artist name and the artist link.
    artist_anchor = child_of('p', 'row artist', 'a')

    # A tile without a usable link is skipped on its own rather than
    # discarding every other similar print found on the page.
    similar_prints = []
    for img_div in soup.find_all('div', class_='img'):
        href = attr_of(img_div.find('a'), 'href')
        if href is not None:
            similar_prints.append(href)

    yield {
        'Image URL': attr_of(child_of('div', 'imageholder', 'img'), 'src'),
        'Artist': text_of(artist_anchor),
        'Artist Link': attr_of(artist_anchor, 'href'),
        'Title': text_of(child_of('p', 'row title', 'span')),
        'Date': text_of(child_of('p', 'row date', 'span')),
        'Details': attr_of(child_of('p', 'row details', 'a'), 'href'),
        'Source': attr_of(child_of('p', 'row source', 'a'), 'href'),
        'Similar Prints': similar_prints
    }


In [37]:
# Reload the artwork index CSV and keep only its 'Link' column as a plain list.
artworks = pd.read_csv('./data/ukiyo-e_artworks.csv')['Link'].tolist()
print(len(artworks))

177985


In [42]:
# Fetch the detail page of every artwork and stream one CSV row per artwork.
fieldnames = ['Image URL', 'Artist', 'Artist Link', 'Title', 'Date', 'Details', 'Source', 'Similar Prints']
# Artwork URLs whose download failed; kept so they can be retried later
# instead of aborting a run of len(artworks) requests on one bad response.
failed_artworks = []

with open('./data/ukiyo-e_artworks_details.csv', mode='w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    total = len(artworks)
    for i, artwork_url in enumerate(artworks, start=1):
        try:
            for artwork_data in extract_artwork_details(artwork_url):
                writer.writerow(artwork_data)
        except requests.RequestException:
            failed_artworks.append(artwork_url)
            # The rows carry no artwork URL, so row order is the only link
            # back to `artworks`; an all-empty placeholder row (DictWriter
            # fills missing keys with '') keeps row i aligned with entry i.
            writer.writerow({})

        if i % 1000 == 0:
            print(f'{i}/{total}, Finished crawling {artwork_url}')

if failed_artworks:
    print(f'{len(failed_artworks)} artwork(s) failed; see failed_artworks')


1000/177985, Finished crawling https://ukiyo-e.org/image/artelino/16827g1
2000/177985, Finished crawling https://ukiyo-e.org/image/aic/89960_448127
3000/177985, Finished crawling https://ukiyo-e.org/image/met/DP134639


KeyboardInterrupt: 