# 0 Select artworks

This notebook takes `all_data_info.csv` from the Painter by Numbers dataset, keeps only artists with at least 256 works, and samples 256 works from each of them to build a balanced selection that is saved as `balanced_256.csv`.

The dataset can be downloaded here:
https://www.kaggle.com/c/painter-by-numbers/data

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
# Load the Painter by Numbers metadata table. The path is relative, so the CSV
# must be in the notebook's working directory.
all_data_info = pd.read_csv('all_data_info.csv')

In [3]:
# Rename new_filename to filename for convenience.
all_data_info = all_data_info.rename(columns={'new_filename': 'filename'})

In [4]:
# Preview the first rows to check the loaded columns, including the renamed `filename`.
all_data_info.head()

Unnamed: 0,artist,date,genre,pixelsx,pixelsy,size_bytes,source,style,title,artist_group,in_train,filename
0,Barnett Newman,1955.0,abstract,15530.0,6911.0,9201912.0,wikiart,Color Field Painting,Uriel,train_only,True,102257.jpg
1,Barnett Newman,1950.0,abstract,14559.0,6866.0,8867532.0,wikiart,Color Field Painting,Vir Heroicus Sublimis,train_only,True,75232.jpg
2,kiri nichol,2013.0,,9003.0,9004.0,1756681.0,,Neoplasticism,,test_only,False,32145.jpg
3,kiri nichol,2013.0,,9003.0,9004.0,1942046.0,,Neoplasticism,,test_only,False,20304.jpg
4,kiri nichol,2013.0,,9003.0,9004.0,1526212.0,,Neoplasticism,,test_only,False,836.jpg


In [5]:
# Reduce the metadata to the two columns needed downstream: artist and filename.
all_data_artist = all_data_info.loc[:, ['artist', 'filename']]

In [6]:
# Confirm that only the artist and filename columns remain.
all_data_artist.head()

Unnamed: 0,artist,filename
0,Barnett Newman,102257.jpg
1,Barnett Newman,75232.jpg
2,kiri nichol,32145.jpg
3,kiri nichol,20304.jpg
4,kiri nichol,836.jpg


In [7]:
# Count the number of artworks for each artist; the result is a Series indexed
# by artist name whose values are the counts.
count_artworks = all_data_artist['artist'].value_counts()

In [8]:
# Turn the counts Series into a two-column DataFrame (artist names, counts)
# by promoting its index to a regular column.
counted_artworks = count_artworks.to_frame().reset_index()

In [9]:
# Name the columns by position. After reset_index() the first column holds the
# artist names and the second the counts, but their labels depend on the pandas
# version ('index'/'artist' before 2.0, 'artist'/'count' from 2.0 on), so a
# label-based rename is not portable. Positional assignment is also safe to
# re-run, and it raises if the frame does not have exactly two columns.
counted_artworks.columns = ['artist', 'number']

In [10]:
# Inspect the per-artist counts; the expected columns are `artist` and `number`.
counted_artworks.head()

Unnamed: 0,artist,number
0,Zdislav Beksinski,500
1,John Singer Sargent,500
2,Ilya Repin,500
3,Ivan Aivazovsky,500
4,Rembrandt,500


In [11]:
# Attach each artist's total number of artworks to every (artist, filename) row.
# The join key is spelled out instead of relying on "all shared column names",
# and validate= makes pandas raise if an artist appears more than once in
# counted_artworks (which would silently duplicate painting rows).
paintings_numbers = pd.merge(
    all_data_artist,
    counted_artworks,
    on='artist',
    how='inner',
    validate='many_to_one',
)

In [12]:
# Check the row count, dtypes and non-null counts of the merged frame.
paintings_numbers.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103250 entries, 0 to 103249
Data columns (total 3 columns):
artist      103250 non-null object
filename    103250 non-null object
number      103250 non-null int64
dtypes: int64(1), object(2)
memory usage: 3.2+ MB


In [13]:
# Every row should now carry its artist's total work count in `number`.
paintings_numbers.head()

Unnamed: 0,artist,filename,number
0,Barnett Newman,102257.jpg,47
1,Barnett Newman,75232.jpg,47
2,Barnett Newman,62252.jpg,47
3,Barnett Newman,63861.jpg,47
4,Barnett Newman,59145.jpg,47


In [14]:
# Keep only the rows whose artist has at least 256 works in the dataset.
selected_paintings_256 = paintings_numbers.loc[paintings_numbers['number'].ge(256)]

In [15]:
# Reorder the columns so filename comes first. The number column is kept at
# this stage.
df = selected_paintings_256.loc[:, ['filename', 'artist', 'number']]

In [16]:
# Preview the filtered frame; the original row index is retained after filtering.
df.head()

Unnamed: 0,filename,artist,number
364,99442.jpg,Ivan Aivazovsky,500
365,81750.jpg,Ivan Aivazovsky,500
366,82140.jpg,Ivan Aivazovsky,500
367,74871.jpg,Ivan Aivazovsky,500
368,51238.jpg,Ivan Aivazovsky,500


# Sample 256 artworks from each artist to balance the dataset

In [17]:
# Sorted array of the distinct artist names that passed the 256-works filter.
artists = np.unique(df['artist'].tolist())

In [18]:
# Draw the same number of artworks from every selected artist so that all
# classes are equally represented. Each artist is sampled independently with
# the same fixed seed, which keeps the selection reproducible across runs.
N_PER_ARTIST = 256
SAMPLE_SEED = 20181126

per_artist_samples = [
    df.loc[df['artist'] == artist].sample(n=N_PER_ARTIST, random_state=SAMPLE_SEED)
    for artist in tqdm(artists)
]

# Concatenate once at the end instead of growing the frame inside the loop;
# ignore_index=True gives the result a fresh 0..N-1 index.
selection = pd.concat(per_artist_samples, ignore_index=True)

100%|██████████| 69/69 [00:00<00:00, 260.77it/s]


In [19]:
# Each artist contributes exactly 256 rows, so this is the number of artists in the selection.
len(selection) // 256

69

In [20]:
# Preview the balanced selection; its index was reset when the samples were concatenated.
selection.head()

Unnamed: 0,filename,artist,number
0,44067.jpg,Albert Bierstadt,322
1,59090.jpg,Albert Bierstadt,322
2,6401.jpg,Albert Bierstadt,322
3,73846.jpg,Albert Bierstadt,322
4,96907.jpg,Albert Bierstadt,322


In [21]:
# Drop the helper count column, leaving filename and artist for export.
balanced_256 = selection.drop(columns='number')

In [22]:
# Final check of the two-column frame that is written to disk.
balanced_256.head()

Unnamed: 0,filename,artist
0,44067.jpg,Albert Bierstadt
1,59090.jpg,Albert Bierstadt
2,6401.jpg,Albert Bierstadt
3,73846.jpg,Albert Bierstadt
4,96907.jpg,Albert Bierstadt


In [23]:
# Save selected artworks to csv; index = False leaves the row index out of the
# file, so it holds only the filename and artist columns.
balanced_256.to_csv("balanced_256.csv", index = False)