In [1]:
import requests
import pandas as pd
import numpy as np
import time
import os,sys,os.path


API_XAPP_TOKEN = os.environ.get('API_XAPP_TOKEN')


In [2]:
# creating the offset for API calls, and creating an empty DataFrame
offset = 0
size = 500
artworks_df = pd.DataFrame(columns=('artwork_id',
                                    'title',
                                    'category',
                                    'medium',
                                    'date',
                                    'height_cm',
                                    'width_cm',
                                    'collecting_institution',
                                    'image_versions',
                                    'image_url',
                                    'image_url_is_template'))

In [3]:
# Pull all the available artworks from the public version of the Artsy API
while offset < 27000:
    params = {'size':size,
              'offset':offset,
              'xapp_token':API_XAPP_TOKEN}
    url = 'https://api.artsy.net/api/artworks?'

    response = requests.get(url,params=params).json()

    artworks = response['_embedded']['artworks']
#     artworks_ = [] 


    for artwork in artworks:
        artwork_dict = {}
        artwork_dict['artwork_id'] = artwork['id']
        artwork_dict['title'] = artwork['title']
        artwork_dict['category'] = artwork['category']
        artwork_dict['medium'] = artwork['medium']
        artwork_dict['date'] = artwork['date']
        artwork_dict['height_cm'] = artwork['dimensions']['cm']['height']
        artwork_dict['width_cm'] = artwork['dimensions']['cm']['width']
        artwork_dict['collecting_institution'] = artwork['collecting_institution']
    
        try:
            artwork_dict['image_versions'] = artwork['image_versions']
            artwork_dict['image_url'] = artwork['_links']['image']['href']
            artwork_dict['image_url_is_template'] = artwork['_links']['image']['templated']

        except:
            artwork_dict['image_versions'] = []
            artwork_dict['image_url'] = ''
            artwork_dict['image_url_is_template'] = ''
#         artworks_.append(artwork_dict)
        artworks_temp = pd.DataFrame([artwork_dict])
        artworks_df = pd.concat([artworks_df, artworks_temp], ignore_index=True)
#     finals.append(artworks_)
    offset+=size
    time.sleep(.5)

KeyError: '_embedded'

In [4]:
artworks_df.head()

Unnamed: 0,artwork_id,title,category,medium,date,height_cm,width_cm,collecting_institution,image_versions,image_url,image_url_is_template
0,4d8b92eb4eb68a1b2c000968,Der Kuss (The Kiss),Painting,Oil and gold leaf on canvas,1907-1908,180.1,180.1,"Österreichische Galerie Belvedere, Vienna","[large, larger, medium, medium_rectangle, norm...",https://d32dm0rphc51dk.cloudfront.net/NOpIAwQa...,True
1,4d8b92ee4eb68a1b2c0009ab,The Third of May,Painting,Oil on canvas,1814,266.2,345.2,"Museo Nacional del Prado, Madrid","[square, small, large_rectangle, large, tall, ...",https://d32dm0rphc51dk.cloudfront.net/m4X41Fun...,True
2,4d8b93394eb68a1b2c0010fa,The Company of Frans Banning Cocq and Willem v...,Painting,Oil on canvas,1642,363.2,437.4,"Rijksmuseum, Amsterdam","[large, large_rectangle, larger, medium, mediu...",https://d32dm0rphc51dk.cloudfront.net/IG8ZLvVm...,True
3,4d8b937c4eb68a1b2c001722,Mona Lisa,Painting,Oil on poplar,ca. 1503-1506,77.0,53.1,Musée du Louvre,"[large, large_rectangle, larger, medium, mediu...",https://d32dm0rphc51dk.cloudfront.net/5L1xjKC_...,True
4,4d8b93b04eb68a1b2c001b9d,Luncheon on the Grass (Le Déjeuner sur l'herbe),Painting,Oil on canvas,1863,208.3,264.2,"Musée d'Orsay, Paris","[large, large_rectangle, larger, medium_rectan...",https://d32dm0rphc51dk.cloudfront.net/zFA7cwdk...,True


In [5]:
# Create columns for each image_version and adding true/false if each artwork has that format represented. 
# artworks_df["large"] = artworks_df['image_versions'].apply(lambda x: "large" in x)
# artworks_df["large_rectangle"] = artworks_df['image_versions'].apply(lambda x: "large_rectangle" in x)
# artworks_df["larger"] = artworks_df['image_versions'].apply(lambda x: "larger" in x)
# artworks_df["medium_image"] = artworks_df['image_versions'].apply(lambda x: "medium" in x)
# artworks_df["medium_rectangle"] = artworks_df['image_versions'].apply(lambda x: "medium_rectangle" in x)
artworks_df["normalized"] = artworks_df['image_versions'].apply(lambda x: "normalized" in x)
# artworks_df["small"] = artworks_df['image_versions'].apply(lambda x: "small" in x)
# artworks_df["square"] = artworks_df['image_versions'].apply(lambda x: "square" in x)
# artworks_df["tall"] = artworks_df['image_versions'].apply(lambda x: "tall" in x)

In [17]:
# quick analysis of image format categories. Normalized images are highest represented with 25303.

# print(artworks_df["large"].value_counts())
# print(artworks_df["large_rectangle"].value_counts())
# print(artworks_df["larger"].value_counts())
# print(artworks_df["medium"].value_counts())
# print(artworks_df["medium_rectangle"].value_counts())
print(artworks_df["normalized"].value_counts())
# print(artworks_df["small"].value_counts())
# print(artworks_df["square"].value_counts())
# print(artworks_df["tall"].value_counts())

normalized
True     25303
False      697
Name: count, dtype: int64


In [17]:
# create a zipped CSV of artworks_df DataFrame. This contains the raw data.
compression_opts = dict(method='zip',
                        archive_name='artworks_df.csv')  
artworks_df.to_csv('artworks_df.zip', index=True,
          compression=compression_opts)

In [6]:
# Split off a copy of artworks_df DataFrame
artsy_fartsci_images = artworks_df

In [7]:
# Drop the columns that are not needed. 
artsy_fartsci_images.drop(['image_versions'],axis=1, inplace=True)

# Keep images with the 'normalized' image format only. 
artsy_fartsci_images = artsy_fartsci_images[artsy_fartsci_images['normalized'] == True]

In [8]:
# Remove unneccessary categories of images. We are keeping 2D non-photographic images. 
artsy_fartsci_images.drop(artsy_fartsci_images[artsy_fartsci_images['category'] == 'Sculpture'].index, inplace=True)
artsy_fartsci_images.drop(artsy_fartsci_images[artsy_fartsci_images['category'] == 'Photography'].index, inplace=True)
artsy_fartsci_images.drop(artsy_fartsci_images[artsy_fartsci_images['category'] == 'Design/Decorative Art'].index, inplace=True)
artsy_fartsci_images.drop(artsy_fartsci_images[artsy_fartsci_images['category'] == 'Other'].index, inplace=True)
artsy_fartsci_images.drop(artsy_fartsci_images[artsy_fartsci_images['category'] == 'Architecture'].index, inplace=True)
artsy_fartsci_images.drop(artsy_fartsci_images[artsy_fartsci_images['category'] == 'Books and Portfolios'].index, inplace=True)
artsy_fartsci_images.drop(artsy_fartsci_images[artsy_fartsci_images['category'] == 'Mixed Media'].index, inplace=True)
artsy_fartsci_images.drop(artsy_fartsci_images[artsy_fartsci_images['category'] == 'Textile Arts'].index, inplace=True)
artsy_fartsci_images.drop(artsy_fartsci_images[artsy_fartsci_images['category'] == 'Posters'].index, inplace=True)
artsy_fartsci_images.drop(artsy_fartsci_images[artsy_fartsci_images['category'] == ''].index, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  artsy_fartsci_images.drop(artsy_fartsci_images[artsy_fartsci_images['category'] == 'Sculpture'].index, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  artsy_fartsci_images.drop(artsy_fartsci_images[artsy_fartsci_images['category'] == 'Photography'].index, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  artsy_fartsci_images.drop(artsy_fartsci_images[artsy_fartsci_images['category'] == 'Design/Decorative Art'].index, inplace=True)
A value is 

In [42]:
# Reset index after removals
artsy_fartsci_images.reset_index

<bound method DataFrame.reset_index of                      artwork_id  \
0      4d8b92eb4eb68a1b2c000968   
1      4d8b92ee4eb68a1b2c0009ab   
2      4d8b93394eb68a1b2c0010fa   
3      4d8b937c4eb68a1b2c001722   
4      4d8b93b04eb68a1b2c001b9d   
...                         ...   
25995  53559d87cd530e850c000166   
25996  53559fedb202a3262000016a   
25997  5357d09c7622dd0ff400002a   
25998  5357d2cfa09a679102000045   
25999  5357d5288b3b813fcd00004a   

                                                   title  category  \
0                                    Der Kuss (The Kiss)  Painting   
1                                       The Third of May  Painting   
2      The Company of Frans Banning Cocq and Willem v...  Painting   
3                                              Mona Lisa  Painting   
4        Luncheon on the Grass (Le Déjeuner sur l'herbe)  Painting   
...                                                  ...       ...   
25995  View of Houses in Delft, Known as ‘The Litt

In [9]:
artsy_fartsci_images

Unnamed: 0,artwork_id,title,category,medium,date,height_cm,width_cm,collecting_institution,image_url,image_url_is_template,normalized
0,4d8b92eb4eb68a1b2c000968,Der Kuss (The Kiss),Painting,Oil and gold leaf on canvas,1907-1908,180.1,180.1,"Österreichische Galerie Belvedere, Vienna",https://d32dm0rphc51dk.cloudfront.net/NOpIAwQa...,True,True
1,4d8b92ee4eb68a1b2c0009ab,The Third of May,Painting,Oil on canvas,1814,266.2,345.2,"Museo Nacional del Prado, Madrid",https://d32dm0rphc51dk.cloudfront.net/m4X41Fun...,True,True
2,4d8b93394eb68a1b2c0010fa,The Company of Frans Banning Cocq and Willem v...,Painting,Oil on canvas,1642,363.2,437.4,"Rijksmuseum, Amsterdam",https://d32dm0rphc51dk.cloudfront.net/IG8ZLvVm...,True,True
3,4d8b937c4eb68a1b2c001722,Mona Lisa,Painting,Oil on poplar,ca. 1503-1506,77.0,53.1,Musée du Louvre,https://d32dm0rphc51dk.cloudfront.net/5L1xjKC_...,True,True
4,4d8b93b04eb68a1b2c001b9d,Luncheon on the Grass (Le Déjeuner sur l'herbe),Painting,Oil on canvas,1863,208.3,264.2,"Musée d'Orsay, Paris",https://d32dm0rphc51dk.cloudfront.net/zFA7cwdk...,True,True
...,...,...,...,...,...,...,...,...,...,...,...
25995,53559d87cd530e850c000166,"View of Houses in Delft, Known as ‘The Little ...",Painting,Oil on canvas,ca. 1658,54.3,44.0,,https://d32dm0rphc51dk.cloudfront.net/6sSY1BE4...,True,True
25996,53559fedb202a3262000016a,The Battle of Terheide,Painting,Ink on canvas,1657,170.0,289.0,,https://d32dm0rphc51dk.cloudfront.net/fOlaqjoa...,True,True
25997,5357d09c7622dd0ff400002a,"Isaac and Rebecca, Known as ‘The Jewish Bride’",Painting,Oil on canvas,ca. 1665 -1669,121.5,166.5,,https://d32dm0rphc51dk.cloudfront.net/T8zyOM1P...,True,True
25998,5357d2cfa09a679102000045,"Portrait of a Woman, Possibly Maria Trip",Painting,Oil on panel,1639,107.0,82.0,,https://d32dm0rphc51dk.cloudfront.net/UJxHMaBg...,True,True


In [40]:
# create a zipped CSV of artsy_fartsci_images DataFrame for use in uploading. 
compression_opts = dict(method='zip',
                        archive_name='artsy-fartsci-images.csv')  
artsy_fartsci_images.to_csv('artsy-fartsci-images.zip', index=True,
          compression=compression_opts)

In [44]:
10/3

3.3333333333333335

In [45]:
artsy_fartsci_images.to_csv('artsy-fartsci-images.csv')