# Downloading the dataset

## The dataset

https://www.wga.hu/ provides approximately 45,000 photos or scans of (mostly Western) works of art. 

## Disclaimer

The dataset is available via a web interface. The website itself makes no claim as to its general policy on the automated download of these images for machine learning purposes, so individual permission should be obtained before running this script.

## Imports

Please refer to the [README.md](README.md) for proper installation of the dependencies listed here.

In [None]:
# We use gevent to significantly speed up the download by running it in parallel
import gevent
from gevent import monkey
monkey.patch_all()

from tqdm import tqdm_notebook
import requests
from io import BytesIO
from zipfile import ZipFile
import os
import csv
import math
from shutil import copyfile
import random
import PIL
from IPython.display import clear_output, Image, display
import glob

## Settings

- `source`: This should not be changed unless the file has been moved on the server.
- `dir`: Name of the directory to download to.
- `cc`: Number of images to download concurrently. 
- `search_dicts`: The search terms for creating the subset. All search terms from the web interface in all combinations are possible. One dictionary = one class.
- `name`: Name of the subset to be created.

In [2]:
# URL of the zipped catalog; it is fetched with requests and opened as a ZIP archive.
source = 'https://www.wga.hu/database/download/data_txt.zip'
# Directory the downloaded image files are written to.
# NOTE(review): this name shadows the builtin dir(); it is left as is because
# other code in this file reads it.
dir = 'wga'
# Chunk size for the download: this many jobs are spawned and awaited at a time.
cc = 250
# One dictionary per class. Keys are catalog column names; a catalog row belongs
# to the class when every entry matches (the catalog value is lowercased first).
search_dicts = [{'FORM': 'painting', 'TYPE': 'portrait'},
                {'FORM': 'painting', 'TYPE': 'landscape'},
                {'FORM': 'painting', 'TYPE': 'still-life'}]
# Joined to `dir` with '-' to form the name of the subset directory.
name = 'portrait-landscape-stilllife'

## Helper functions

In [3]:
# Save remote image file to disk
def save(url, timeout=60):
    """Download one image and write it into the download directory.

    Args:
        url: A ``[remote_url, local_filename]`` pair. The image is written to
            ``dir + '/' + local_filename``.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the surrounding batch forever.

    Raises:
        requests.HTTPError: If the server answers with an error status.
            Nothing is written in that case, so an error page is never stored
            under an image file name.
    """
    response = requests.get(url[0], timeout=timeout)
    # Check before touching the disk: the body of a 4xx/5xx answer is not an image.
    response.raise_for_status()
    file = dir + '/' + url[1]
    with open(file, 'wb') as f:
        f.write(response.content)

# Yield a list as chunks of size n
def chunks(l, n):
    """Generate consecutive slices of ``l``, each holding at most ``n`` items.

    The last slice is shorter when ``len(l)`` is not a multiple of ``n``.
    """
    starts = range(0, len(l), n)
    for start in starts:
        stop = start + n
        yield l[start:stop]

def show_image(img, fmt='jpeg'):
    """Encode an image array in memory and display it in the notebook.

    Args:
        img: Array passed to ``PIL.Image.fromarray``.
        fmt: Image format name handed to PIL when encoding.
    """
    buffer = BytesIO()
    pil_image = PIL.Image.fromarray(img)
    pil_image.save(buffer, fmt)
    encoded = buffer.getvalue()
    display(Image(data=encoded))
    
def equalize_classes(classes):
    """Truncate every class to the size of the smallest class, in place.

    Args:
        classes: Mapping of class key to a list of items. Each list is
            replaced by its first ``m`` items, where ``m`` is the length of
            the shortest list. An empty mapping is left untouched.

    Returns:
        None. ``classes`` is modified in place.
    """
    # min() of an empty sequence raises ValueError; nothing to equalize anyway.
    if not classes:
        return
    smallest = min(len(items) for items in classes.values())
    # Only existing keys are reassigned, so iterating the dict here is safe.
    for key in classes:
        classes[key] = classes[key][:smallest]
    
def split_classes(subdir, classes, split_percent):
    """Copy each class's files into ``train`` and ``val`` directory trees.

    The result is ``subdir/train/<class>/`` and ``subdir/val/<class>/``, with
    the copies renamed to ``0.jpg``, ``1.jpg``, ... inside each directory.

    Args:
        subdir: Root directory of the subset; created if it does not exist.
        classes: Mapping of class key to a list of source file paths. The key
            is converted with ``str()`` to form the directory name.
        split_percent: Fraction (0..1) of each class that goes to ``val``. The
            validation count is rounded down, and the validation files are
            taken from the end of the list.
    """
    traindir = os.path.join(subdir, 'train')
    valdir = os.path.join(subdir, 'val')
    # exist_ok avoids the check-then-create race and makes re-runs harmless.
    os.makedirs(traindir, exist_ok=True)
    os.makedirs(valdir, exist_ok=True)

    for classname, classfiles in classes.items():
        classname = str(classname)
        train_target = os.path.join(traindir, classname)
        val_target = os.path.join(valdir, classname)
        os.makedirs(train_target, exist_ok=True)
        os.makedirs(val_target, exist_ok=True)

        # Index of the first validation file.
        split = len(classfiles) - math.floor(len(classfiles) * split_percent)
        trainfiles = classfiles[:split]
        valfiles = classfiles[split:]

        for n, file in enumerate(trainfiles):
            copyfile(file, os.path.join(train_target, '%s.jpg' % n))
        for n, file in enumerate(valfiles):
            copyfile(file, os.path.join(val_target, '%s.jpg' % n))

## Download the complete dataset

This downloads the complete dataset concurrently.

In [None]:
# Create dictionaries to hold database information and image file locations.
header_row = ['AUTHOR', 'BORN-DIED', 'TITLE', 'DATE', 'TECHNIQUE', 'LOCATION', 'URL', 'FORM', 'TYPE', 'SCHOOL', 'TIMEFRAME']
header_dict = {v:k for k,v in enumerate(header_row)}
# Copy before extending: a plain assignment would alias header_dict, so the two
# local-only columns would end up in the remote catalog's column map as well.
header_dict_local = dict(header_dict)
header_dict_local.update({'JPGURL': 11, 'FILE': 12})

# Create directory to hold image files
os.makedirs(dir, exist_ok=True)

# Read remote zip file content into memory
response = requests.get(source)
# Surface an HTTP failure directly instead of a confusing zip error further down
response.raise_for_status()
zipfile = ZipFile(BytesIO(response.content))

# There is only one file in the archive
infile = zipfile.namelist()[0]

# Encoding is ISO-8859, with some special characters replaced with '?' in the original file!
with zipfile.open(infile) as catalog_file:
    iterator = [line.decode('ISO-8859-1') for line in catalog_file]

# [jpg_url, jpg_name] pairs, in catalog order
urls = []

# NOTE(review): despite its name, the file is written with the platform's default
# text encoding because no `encoding` is passed. The cell that reads it back does
# the same, so change both together if this is ever pinned to UTF-8.
with open('catalog_local_utf8.csv', 'w', newline='') as outfile:
    writer = csv.writer(outfile, delimiter=';', quotechar='"')
    reader = csv.reader(iterator, delimiter=';', quotechar='"')

    for row in reader:

        # Skip blank lines (they would fail on row[6]) and the header
        if not row or row == header_row:
            continue

        html_url = row[6]
        # Derive the image URL from the page URL by plain string replacement:
        # '.html' becomes '.jpg', then every remaining 'html' becomes 'art'.
        jpg_url = html_url.replace('.html', '.jpg').replace('html', 'art')
        # Flatten the path below /art/ into a file name and drop its first two characters
        jpg_name = jpg_url.replace('https://www.wga.hu/art/', '').replace('/', '-')[2:]

        # The two appended columns are the ones indexed by 'JPGURL' and 'FILE'
        row += [jpg_url, jpg_name]
        urls.append([jpg_url, jpg_name])
        writer.writerow(row)

# Running count of URLs handed to the downloader (not read elsewhere in the visible code)
done = 0
total = math.ceil(len(urls)/cc)
# Tqdm needs total provided for generators
for chunk in tqdm_notebook(chunks(urls, cc), total=total):
    # Start one greenlet per URL and wait for the whole chunk before continuing
    jobs = [gevent.spawn(save, url) for url in chunk]
    gevent.wait(jobs)
    done += len(chunk)

## Create subset of dataset according to selected classes

In [None]:
subdir = dir + '-' + name
os.makedirs(subdir, exist_ok=True)
datadir = dir
os.makedirs(datadir, exist_ok=True)

classes = {} # Files for each class, keyed by the class's position in search_dicts

# Read the catalog once instead of once per class. newline='' is what the csv
# module asks for, and blank rows are dropped so indexing below cannot fail on them.
with open('catalog_local_utf8.csv', 'r', newline='') as catalog:
    reader = csv.reader(catalog, delimiter=';', quotechar='"')
    rows = [row for row in reader if row]

skipped = 0 # Catalog matches whose image file is not on disk

for c, search_dict in enumerate(search_dicts):

    # Column index -> wanted value. Both sides are lowercased, so a search term
    # written with capitals still matches.
    wanted = {header_dict_local[k]: str(v).lower() for k, v in search_dict.items()}

    classes[c] = []
    for row in rows:
        if all(row[column].lower() == value for column, value in wanted.items()):
            file = datadir + '/' + row[header_dict_local['FILE']]
            # A download that failed leaves no file behind; copying it later
            # would abort the split halfway through, so leave it out here.
            if os.path.exists(file):
                classes[c].append(file)
            else:
                skipped += 1

    random.shuffle(classes[c])

if skipped:
    print('Skipped %d catalog entries without a downloaded file' % skipped)

equalize_classes(classes)
split_classes(subdir, classes, 0.1)