# 1 Split dataset into train, val & test

This script separates the selected artwork into train, validation and test sets in a stratified manner.

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Read the table of selected artwork from disk.
source_csv = 'balanced_256.csv'
img_details = pd.read_csv(source_csv)

In [3]:
# Preview the first five rows of the loaded table.
img_details.head(n=5)

Unnamed: 0,filename,artist
0,44067.jpg,Albert Bierstadt
1,59090.jpg,Albert Bierstadt
2,6401.jpg,Albert Bierstadt
3,73846.jpg,Albert Bierstadt
4,96907.jpg,Albert Bierstadt


In [4]:
# Split the dataset into ~80% train, ~10% validation and ~10% test, stratified by artist.
SEED = 20181125            # shared by both splits so the partition is reproducible
TEST_FRACTION = 0.1015625  # fraction of all rows held out as the test set

train_valid, test = train_test_split(
    img_details,
    test_size=TEST_FRACTION,
    stratify=img_details["artist"],
    random_state=SEED,
)

# Request exactly as many validation rows as test rows. An integer test_size is an
# absolute sample count, so this no longer relies on a hand-truncated fraction of
# the remaining rows rounding to the right number.
train, valid = train_test_split(
    train_valid,
    test_size=len(test),
    stratify=train_valid["artist"],
    random_state=SEED,
)

# The three subsets together must account for every input row.
assert len(train) + len(valid) + len(test) == len(img_details)

print("Number of train classes: {:d}".format(len(pd.unique(train["artist"]))))
print("Number of valid classes: {:d}".format(len(pd.unique(valid["artist"]))))
print("Number of test classes: {:d}".format(len(pd.unique(test["artist"]))))
print("Total number of training samples: {:d}".format(len(train)))
print("Total number of validation samples: {:d}".format(len(valid)))
print("Total number of test samples: {:d}".format(len(test)))

Number of train classes: 69
Number of valid classes: 69
Number of test classes: 69
Total number of training samples: 14076
Total number of validation samples: 1794
Total number of test samples: 1794


In [8]:
# Add column 'set' to each dataframe with set description.
# The frames produced by the split are selections taken from the loaded table, so
# assigning a column to them in place can raise pandas' SettingWithCopyWarning.
# DataFrame.assign returns a new frame instead, and re-running this cell is harmless.
train = train.assign(set='train')
valid = valid.assign(set='valid')
test = test.assign(set='test')

In [9]:
# Stack the three labelled subsets (train first, then valid, then test) into one table
# and report how many distinct artists, distinct split labels and rows it holds.
split_frames = [train, valid, test]
selected_paintings_256_split = pd.concat(split_frames)

n_classes = len(selected_paintings_256_split["artist"].unique())
n_splits = len(selected_paintings_256_split["set"].unique())
n_samples = len(selected_paintings_256_split)

print("Number of classes: {:d}".format(n_classes))
print("Number of splits: {:d}".format(n_splits))
print("Total number of samples: {:d}".format(n_samples))

Number of classes: 69
Number of splits: 3
Total number of samples: 17664


In [10]:
# Preview the first five rows of the combined table.
selected_paintings_256_split.head(n=5)

Unnamed: 0,filename,artist,set
13761,39175.jpg,Peter Paul Rubens,train
3213,36949.jpg,Egon Schiele,train
11290,46722.jpg,Martiros Saryan,train
6205,9206.jpg,Gustave Loiseau,train
13685,80386.jpg,Peter Paul Rubens,train


In [11]:
# Write the combined table to disk; the row index is not written to the file.
output_csv = "balanced_256_split.csv"
selected_paintings_256_split.to_csv(output_csv, index=False)