Move images from URLs to a local file, then to a Cloud Storage bucket

In [9]:

# Import cell
import os
import requests
from google.cloud import storage
import pandas as pd
import shutil
from google.cloud import bigquery

In [10]:
# Notebook-wide settings, all taken from environment variables.
# A setting is None when its variable is not defined.

GCP_PROJECT = os.getenv('GCP_PROJECT')
BQ_DATASET = os.getenv('BQ_DATASET')
BUCKET_NAME = os.getenv('BUCKET_NAME')
GOOGLE_APPLICATION_CREDENTIALS = os.getenv('GOOGLE_APPLICATION_CREDENTIALS')

In [4]:
# Sanity check: display the credentials path read from the environment.
# NOTE(review): the rendered output exposes an absolute local path to the
# service-account key file; consider clearing this output before sharing.
GOOGLE_APPLICATION_CREDENTIALS

'/home/mollyppl/code/molpl/gcp/alert-passkey-392415-1db41ed4e59c.json'

In [11]:
# fully qualified id of the image table: <project>.<dataset>.image_data
full_table_name = '{}.{}.image_data'.format(GCP_PROJECT, BQ_DATASET)
# columns to fetch, in the order they appear in the result
image_columns = (
    'numeric_index',
    'artwork_id',
    'title',
    'category',
    'medium',
    'date',
    'height_cm',
    'width_cm',
    'image_url_template',
    'collecting_institution',
    'image_url_normalized',
)
# one column per line, indented to line up inside the query text
select_list = ',\n            '.join(image_columns)
# assemble the SELECT statement
query = f'''
        SELECT 
            {select_list}
        FROM {full_table_name}
        '''

In [12]:
# Fetch the image metadata table from BigQuery into a pandas DataFrame.
# create a BigQuery client bound to the configured project
client = bigquery.Client(project=GCP_PROJECT)
# submit the SQL held in `query`; returns a handle to the query job
query_job = client.query(query)
# wait for the job and obtain its result rows
result = query_job.result()
# convert the rows to a DataFrame; the upload cells below read from it
artworks_df = result.to_dataframe()

In [13]:
# Spot-check a single artwork.
# NOTE: a cell only displays its last expression, so the filtered frame on
# the next line is computed but never shown.
artworks_df[artworks_df['artwork_id'] == '532897d60bb6d6acf40002a8']
# value of the last column (image_url_normalized, per the SELECT order) for the row at position 9279
# NOTE(review): presumably 9279 is the position of the artwork filtered above — confirm
artworks_df.iloc[9279,-1]

'https://d32dm0rphc51dk.cloudfront.net/YUimsdF4bk4i59NpNfxAXw/normalized.jpg'

In [14]:
# Start from an empty local scratch folder for the downloaded images.
path = './image_temp'
already_present = os.path.exists(path)
if already_present:
    # throw away whatever a previous run left behind
    shutil.rmtree(path)
os.mkdir(path)
print("Folder deleted and recreated" if already_present else "Folder %s created!" % path)


Folder deleted and recreated


In [None]:
# upload 500 images to a separate folder (the "test/" prefix in the bucket)

# The storage client and bucket handle are the same for every image, so they
# are created once here instead of once per iteration.
storage_client = storage.Client.from_service_account_json(GOOGLE_APPLICATION_CREDENTIALS, project=GCP_PROJECT)
bucket = storage_client.get_bucket(BUCKET_NAME)

# row labels whose image could not be downloaded
failed_rows = []

for i in range(20000, 20500):
    # get url of normalized image
    url = artworks_df.loc[i, 'image_url_normalized']
    # base name shared by the local file and the blob:
    # <row label>_<numeric_index>_<artwork_id>
    image_stem = f"{i}_{artworks_df.loc[i,'numeric_index']}_{artworks_df.loc[i,'artwork_id']}"
    file_name = f"image_temp/{image_stem}.jpg"

    # Download the image. The timeout keeps one stalled connection from
    # hanging the whole run, and raise_for_status stops an HTTP error page
    # from being stored as if it were a .jpg.
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as err:
        failed_rows.append(i)
        print(f"Row {i}: download failed, skipped ({err})")
        continue

    try:
        # store the image in the local scratch file
        with open(file_name, 'wb') as f:
            f.write(response.content)

        # upload the scratch file to the bucket
        blob = bucket.blob(f'test/{image_stem}.jpg')
        with open(file_name, 'rb') as f:
            blob.upload_from_file(f)
    finally:
        # always clean up the scratch file, even when the upload raised
        if os.path.exists(file_name):
            os.remove(file_name)

if failed_rows:
    print(f"{len(failed_rows)} image(s) were not uploaded: {failed_rows}")

In [13]:
# upload all to all folder (the "all/" prefix in the bucket)
# NOTE(review): despite the heading, this range only covers rows 2730-2799.

# The storage client and bucket handle are the same for every image, so they
# are created once here instead of once per iteration.
storage_client = storage.Client.from_service_account_json(GOOGLE_APPLICATION_CREDENTIALS, project=GCP_PROJECT)
bucket = storage_client.get_bucket(BUCKET_NAME)

# row labels whose image could not be downloaded
failed_rows = []

for i in range(2730, 2800):
    # get url of normalized image
    url = artworks_df.loc[i, 'image_url_normalized']
    # base name shared by the local file and the blob:
    # <row label>_<numeric_index>_<artwork_id>
    image_stem = f"{i}_{artworks_df.loc[i,'numeric_index']}_{artworks_df.loc[i,'artwork_id']}"
    file_name = f"image_temp/{image_stem}.jpg"

    # Download the image. The timeout keeps one stalled connection from
    # hanging the whole run, and raise_for_status stops an HTTP error page
    # from being stored as if it were a .jpg.
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as err:
        failed_rows.append(i)
        print(f"Row {i}: download failed, skipped ({err})")
        continue

    try:
        # store the image in the local scratch file
        with open(file_name, 'wb') as f:
            f.write(response.content)

        # upload the scratch file to the bucket
        blob = bucket.blob(f'all/{image_stem}.jpg')
        with open(file_name, 'rb') as f:
            blob.upload_from_file(f)
    finally:
        # always clean up the scratch file, even when the upload raised
        if os.path.exists(file_name):
            os.remove(file_name)

if failed_rows:
    print(f"{len(failed_rows)} image(s) were not uploaded: {failed_rows}")