In [1]:
# imports
import pandas as pd
import numpy as np
import os
from google.cloud import bigquery

In [2]:
# global params
GCP_PROJECT = os.environ.get('GCP_PROJECT')
BQ_DATASET = os.environ.get('BQ_DATASET')

Get data and filter to modern artists only

In [3]:
# get csv of artists
df = pd.read_csv('/home/mollyppl/code/molpl/artsy-fartsci/data/artists.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           50 non-null     int64 
 1   name         50 non-null     object
 2   years        50 non-null     object
 3   genre        50 non-null     object
 4   nationality  50 non-null     object
 5   bio          50 non-null     object
 6   wikipedia    50 non-null     object
 7   paintings    50 non-null     int64 
dtypes: int64(2), object(6)
memory usage: 3.2+ KB


In [5]:
# create birth year and death year cols for filtering
df['birth_year'] = df.years.apply(lambda x: int(x.replace('–','-').split(' - ')[0]))
df['death_year'] = df.years.apply(lambda x: int(x.replace('–','-').split(' - ')[1]))

In [None]:
# ignore old art
old_art_df = df[df['death_year'] < 1900]

In [6]:
# new artists only
modern_art_df = df[df['death_year'] > 1900]

In [7]:
modern_art_df.sort_values(by='name',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  modern_art_df.sort_values(by='name',inplace=True)


Artist Df

In [None]:
# get info into format to fit with the rest of the data - artists
artist_dict = {'id':[],
               'slug':[],
               'name':[],
               'gender':[],
               'birthday':[],
               'deathday':[],
               'hometown':[],
               'location':[],
               'nationality':[]
               }
for ix, row in modern_art_df.iterrows():
    artist_dict["id"].append('')
    artist_dict["slug"].append(row['name'].lower().replace(' ','-')) # name-like-this
    artist_dict["name"].append(row["name"])
    if row['name'] == 'Frida Kahlo':
        gender = 'female'
    else:
        gender = 'male'
    artist_dict["gender"].append(gender) # add this manually - but i think they're all men 
    artist_dict["birthday"].append(row['birth_year'])
    artist_dict["deathday"].append(row['death_year'])
    artist_dict["hometown"].append('') # google
    artist_dict["location"].append('') # where they worked
    artist_dict["nationality"].append(row['nationality'])

In [None]:
# get into df - ready to add to dataframe later
artist_formatted_df = pd.DataFrame(artist_dict)

Artworks DF

In [11]:
# get og dataframe 
artworks_artsy_df = pd.read_csv('bq_data.csv')

In [12]:
# get artwork info
# generate x rows where x = number of paintings for that artist

artworks_all = artworks_artsy_df.copy()
for ix, row in modern_art_df.iterrows():
    for i in range(row['paintings']):
        temp = {}
        temp['artwork_id'] = f"{row['name'].replace(' ','')}{i+1}"
        temp['title'] = f"{row['name']} {i+1}"
        temp['category'] = 'Painting'
        temp['medium'] = None
        temp['date'] = None
        temp['height_cm'] = None
        temp['width_cm'] = None
        temp['image_url_template'] = None
        temp['collecting_institution'] = None
        temp['image_url_normalized'] = None
        temp_df = pd.DataFrame(temp, index=[0])
        artworks_all = pd.concat([artworks_all,temp_df],ignore_index=True)



In [13]:
artworks_all

Unnamed: 0,artwork_id,title,category,medium,date,height_cm,width_cm,image_url_template,collecting_institution,image_url_normalized
0,4eb1c899c8004a000100deb3,Portrait of a Young Woman with a White Coif,Painting,Oil and tempera on panel,1541,,,https://d32dm0rphc51dk.cloudfront.net/liVgLUFu...,,https://d32dm0rphc51dk.cloudfront.net/liVgLUFu...
1,4f99da873314020001000700,The Alba Madonna,Painting,Oil on panel transferred to canvas,ca. 1510,,,https://d32dm0rphc51dk.cloudfront.net/lnnzsg3v...,"National Gallery of Art, Washington D.C.",https://d32dm0rphc51dk.cloudfront.net/lnnzsg3v...
2,515b1d46056351dc33001014,The Fall of Man [middle panel],Painting,Oil on hardboard transferred from panel,ca. 1535,,,https://d32dm0rphc51dk.cloudfront.net/kBOdyvyl...,"National Gallery of Art, Washington D.C.",https://d32dm0rphc51dk.cloudfront.net/kBOdyvyl...
3,515ce23e7b7057eb4c00115c,The Rule of Mars [right panel],Painting,Oil on hardboard transferred from panel,ca. 1535,,,https://d32dm0rphc51dk.cloudfront.net/hidKPTZj...,"National Gallery of Art, Washington D.C.",https://d32dm0rphc51dk.cloudfront.net/hidKPTZj...
4,515d6a15b5907b33b1004797,Madonna and Child with Saint Mary Magdalene an...,Painting,Tempera on panel transferred to canvas,ca. 1330/1340,,,https://d32dm0rphc51dk.cloudfront.net/-SeTFEk1...,"National Gallery of Art, Washington D.C.",https://d32dm0rphc51dk.cloudfront.net/-SeTFEk1...
...,...,...,...,...,...,...,...,...,...,...
10486,VasiliyKandinskiy84,Vasiliy Kandinskiy 84,Painting,,,,,,,
10487,VasiliyKandinskiy85,Vasiliy Kandinskiy 85,Painting,,,,,,,
10488,VasiliyKandinskiy86,Vasiliy Kandinskiy 86,Painting,,,,,,,
10489,VasiliyKandinskiy87,Vasiliy Kandinskiy 87,Painting,,,,,,,


In [14]:
# load into BigQuery
# get full table name
full_table_name = f'{GCP_PROJECT}.{BQ_DATASET}.image_data_balanced_new'
#source data
source_data = artworks_all
# instantiate client
client = bigquery.Client(project=GCP_PROJECT)
# truncate or append?
write_mode = 'WRITE_TRUNCATE'
# job_config
job_config = bigquery.LoadJobConfig(write_disposition=write_mode)
# job
job = client.load_table_from_dataframe(source_data,full_table_name,job_config=job_config)
# result
result = job.result()