Move data from dataframe to BigQuery

The DataFrame has already been populated with data from the API in a previous stage.

In [1]:
# imports
import os
import pandas as pd
from google.cloud import bigquery



In [2]:
# global params: target GCP project and BigQuery dataset, taken from the environment
# (each one is None when its variable is not set)
GCP_PROJECT = os.getenv('GCP_PROJECT')
BQ_DATASET = os.getenv('BQ_DATASET')

In [3]:
# read the artworks data from the local csv (created by Alex in a separate notebook)
artworks_df = pd.read_csv(filepath_or_buffer='../data/artsy-fartsci-images.csv')

In [4]:
# have a look at the data: prints each column's non-null count and dtype,
# plus the frame's index range and memory usage
artworks_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23815 entries, 0 to 23814
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              23815 non-null  int64  
 1   artwork_id              23815 non-null  object 
 2   title                   23815 non-null  object 
 3   category                22177 non-null  object 
 4   medium                  23815 non-null  bool   
 5   date                    18757 non-null  object 
 6   height_cm               11406 non-null  float64
 7   width_cm                11371 non-null  float64
 8   image_url               23815 non-null  object 
 9   collecting_institution  22005 non-null  object 
 10  image_url_is_template   23815 non-null  bool   
 11  normalized              23815 non-null  bool   
dtypes: bool(3), float64(2), int64(1), object(6)
memory usage: 1.7+ MB


In [5]:
# the url column holds a template (see the '{image_version}' placeholder filled in below),
# so rename it to say so; rebinding the name instead of mutating in place
artworks_df = artworks_df.rename(columns={'image_url': 'image_url_template'})


In [6]:
# give the 'Unnamed: 0' column that came out of the csv a descriptive name;
# rebinding the name instead of mutating in place
artworks_df = artworks_df.rename(columns={'Unnamed: 0': 'numeric_index'})

In [7]:
# drop is template col - it is expected to be True on every row, which makes it redundant.
# The membership guard keeps this cell re-runnable: on a second run the column is already
# gone and nothing happens (an unguarded drop raises KeyError when re-run).
if 'image_url_is_template' in artworks_df.columns:
    # check the "all True" premise before discarding the data, so a changed csv is noticed
    assert artworks_df['image_url_is_template'].all(), \
        'image_url_is_template is not True for every row - column is not redundant'
    artworks_df = artworks_df.drop(columns=['image_url_is_template'])

In [8]:
# drop normalized col - it is expected to be True on every row, which makes it redundant.
# The membership guard keeps this cell re-runnable: on a second run the column is already
# gone and nothing happens (an unguarded drop raises KeyError when re-run).
if 'normalized' in artworks_df.columns:
    # check the "all True" premise before discarding the data, so a changed csv is noticed
    assert artworks_df['normalized'].all(), \
        'normalized is not True for every row - column is not redundant'
    artworks_df = artworks_df.drop(columns=['normalized'])

In [9]:
# create new url col with normalized url: fill the '{image_version}' placeholder of each
# template with 'normalized'.
# Uses the vectorized .str accessor rather than a per-row lambda; regex=False makes the
# match a literal one, so the braces are never interpreted as regex syntax.
# Unlike the lambda, a missing template stays missing instead of raising AttributeError.
artworks_df['image_url_normalized'] = artworks_df['image_url_template'].str.replace(
    '{image_version}', 'normalized', regex=False
)



In [10]:
# load into BigQuery
# fail fast when the configuration is missing: os.environ.get() yields None for an unset
# variable, and the f-string below would then build the literal name 'None.None.image_data'
if not GCP_PROJECT or not BQ_DATASET:
    raise ValueError('GCP_PROJECT and BQ_DATASET environment variables must both be set')
# get full table name (project.dataset.table)
full_table_name = f'{GCP_PROJECT}.{BQ_DATASET}.image_data'
# source data
source_data = artworks_df
# instantiate client
client = bigquery.Client(project=GCP_PROJECT)
# truncate or append? Use the library constant (its value is the string 'WRITE_TRUNCATE')
# so a typo fails here rather than at the API.
write_mode = bigquery.WriteDisposition.WRITE_TRUNCATE
# job_config
job_config = bigquery.LoadJobConfig(write_disposition=write_mode)
# job: start loading the dataframe into the target table
job = client.load_table_from_dataframe(source_data, full_table_name, job_config=job_config)
# result: wait for the load job to complete
result = job.result()