# Create, update, and manage image data in BigQuery

In [47]:
! pip install --upgrade google-cloud-bigquery google-cloud-storage jsonlines pandas

Collecting google-cloud-bigquery
  Downloading google_cloud_bigquery-2.30.1-py2.py3-none-any.whl (203 kB)
[K     |████████████████████████████████| 203 kB 5.2 MB/s eta 0:00:01
Collecting pandas
  Downloading pandas-1.3.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.3 MB)
[K     |████████████████████████████████| 11.3 MB 50.7 MB/s eta 0:00:01
Installing collected packages: pandas, google-cloud-bigquery
  Attempting uninstall: pandas
    Found existing installation: pandas 1.3.3
    Uninstalling pandas-1.3.3:
      Successfully uninstalled pandas-1.3.3
  Attempting uninstall: google-cloud-bigquery
    Found existing installation: google-cloud-bigquery 2.28.0
    Uninstalling google-cloud-bigquery-2.28.0:
      Successfully uninstalled google-cloud-bigquery-2.28.0
Successfully installed google-cloud-bigquery-2.30.1 pandas-1.3.4


Install the [Fantasy Maps library](https://github.com/telpirion/FantasyMaps) of tools.

In [71]:
! pip uninstall -y fantasy-maps
! pip install --upgrade ..

Found existing installation: fantasy-maps 0.1.0
Uninstalling fantasy-maps-0.1.0:
  Successfully uninstalled fantasy-maps-0.1.0
Processing /home/jupyter/FantasyMaps
[33m  DEPRECATION: A future pip version will change local packages to be built in-place without first copying to a temporary directory. We recommend you use --use-feature=in-tree-build to test your packages with this new behavior before it becomes the default.
   pip 21.3 will remove support for this functionality. You can find discussion regarding this at https://github.com/pypa/pip/issues/7555.[0m
Building wheels for collected packages: fantasy-maps
  Building wheel for fantasy-maps (setup.py) ... [?25ldone
[?25h  Created wheel for fantasy-maps: filename=fantasy_maps-0.1.0-py3-none-any.whl size=8839 sha256=637f7f50120c9f580df8a3d82b1775e6750f35fa29331bd5c0b8cf93de6f93a2
  Stored in directory: /tmp/pip-ephem-wheel-cache-l419h241/wheels/0b/82/e1/1ada250fdbcebff835c7af761a03d1def55d5cbbe7ed027c64
Successfully built fantas

In [2]:
## COLAB ONLY! ##
try:
    from google.colab import auth
    auth.authenticate_user()
except:
    print('Not running in Colab')

Not running in Colab


In [83]:
#@markdown Store the resource names for this notebook

bigquery_table_id = 'fantasy_maps.dataset_maps'
display_name = 'dnd-maps-bigquery'  #@param {type: "string"}
PROJECT_ID = 'video-erschmid' #@param {type: "string"}
LOCATION = 'us-central1' #@param {type: "string"}
gcs_bucket_name = 'video-erschmid' #@param {type: "string"}

In [49]:
!gcloud config get-value project
!gcloud config set project $PROJECT_ID
!gcloud config get-value project

video-erschmid


To take a quick anonymous survey, run:
  $ gcloud survey

Updated property [core/project].
video-erschmid


## Upload the original training data to BQ

Download `training_data` file from Storage.

In [42]:
# Compile a list of JSONL sources to populate table with
# ONE-TIME / FIRST TIME thing

training_data = 'DnD/map_training_data.jsonl'

In [16]:
from google.cloud import storage

storage_client = storage.Client(project=PROJECT_ID)

bucket = storage_client.bucket(gcs_bucket_name)
training_data_blob = bucket.get_blob(training_data)
blob_bytes = training_data_blob.download_as_bytes()
training_data_str = blob_bytes.decode('utf-8')

Convert training data to Pandas `DataFrame` object

In [29]:
training_data_rows = training_data_str.split('\n')
json.loads(training_data_rows[0])

{'imageGcsUri': 'gs://video-erschmid/DnD/GL_OasisCity_Rain.jpg',
 'boundingBoxAnnotations': [{'xMin': 0.020972644376899698,
   'yMin': 0.014935064935064935,
   'yMax': 0.03051948051948052,
   'xMax': 0.04285714285714286,
   'displayName': 'cell'},
  {'xMin': 0.020972644376899698,
   'yMin': 0.030086580086580085,
   'yMax': 0.04567099567099567,
   'xMax': 0.04285714285714286,
   'displayName': 'cell'},
  {'xMin': 0.020972644376899698,
   'yMin': 0.04523809523809524,
   'yMax': 0.06082251082251082,
   'xMax': 0.04285714285714286,
   'displayName': 'cell'},
  {'xMin': 0.020972644376899698,
   'yMin': 0.06038961038961039,
   'yMax': 0.07597402597402597,
   'xMax': 0.04285714285714286,
   'displayName': 'cell'},
  {'xMin': 0.020972644376899698,
   'yMin': 0.07554112554112555,
   'yMax': 0.09112554112554112,
   'xMax': 0.04285714285714286,
   'displayName': 'cell'},
  {'xMin': 0.020972644376899698,
   'yMin': 0.09069264069264069,
   'yMax': 0.10627705627705628,
   'xMax': 0.04285714285714286

In [31]:
import pandas as pd
import json

# List-row format: image_uri, training_data, source, ID 
columns = ['image_uri', 'training_data', 'source', 'ID']

training_data_list = []

for row in training_data_rows:
    try:
        row_json = json.loads(row)
        image_gcs_uri = row_json['imageGcsUri']
        image_name = image_gcs_uri.split('/')[-1]
        image_id = f"original.{image_name.lower()}"
        source = "manual"

        training_data_list.append([image_gcs_uri, row, source, image_id])
    except:
        continue
    
df = pd.DataFrame(training_data_list, columns=columns)

df.head(2)

Unnamed: 0,image_uri,training_data,source,ID
0,gs://video-erschmid/DnD/GL_OasisCity_Rain.jpg,"{""imageGcsUri"": ""gs://video-erschmid/DnD/GL_Oa...",manual,original.gl_oasiscity_rain.jpg
1,gs://video-erschmid/DnD/G_AbandonedMineEntranc...,"{""imageGcsUri"": ""gs://video-erschmid/DnD/G_Aba...",manual,original.g_abandonedmineentrance_crystal.jpg


In [35]:
from google.cloud import bigquery

bigquery_client = bigquery.Client()

job_config = job_config = bigquery.LoadJobConfig(schema=[
    bigquery.SchemaField("image_uri", "STRING"),
    bigquery.SchemaField("training_data", "STRING"),
    bigquery.SchemaField("source", "STRING"),
    bigquery.SchemaField("ID", "STRING"),
])

job = bigquery_client.load_table_from_dataframe(
    df, bigquery_table_id, job_config=job_config
)

job.result()

<google.cloud.bigquery.job.load.LoadJob at 0x7f1b1e86f9d0>

## Upload batch prediction results

In [53]:
batch_predict_prefix = "DnD-batch-predict-input"

In [61]:
blobs = storage_client.list_blobs(gcs_bucket_name,
                                  prefix=batch_predict_prefix)

output_files = []

for blob in blobs:
    blob_name = blob.name
    if blob_name.find("predictions_00001.jsonl") > -1:
        output_files.append(blob_name)
        print(blob_name)


DnD-batch-predict-input/20210930232243/output/prediction-dnd-maps-full-grid-model-online-20210913212620-2021-09-30T23:26:31.424946Z/predictions_00001.jsonl
DnD-batch-predict-input/20211005225454/output/prediction-dnd-maps-full-grid-model-online-20210913212620-2021-10-05T22:55:16.624758Z/predictions_00001.jsonl


For each batch prediction output file, load the prediction output data into memory

In [65]:
output_data = ""
for output_file in output_files:
    batch_prediction_data_blob = bucket.get_blob(output_file)
    bp_blob_bytes = batch_prediction_data_blob.download_as_bytes()
    bp_str = bp_blob_bytes.decode('utf-8')
    output_data += bp_str
    
    print(f"Processing {output_file}...")
    print(f"Length of data: {len(output_data)}")

Processing DnD-batch-predict-input/20210930232243/output/prediction-dnd-maps-full-grid-model-online-20210913212620-2021-09-30T23:26:31.424946Z/predictions_00001.jsonl...
Length of data: 543451
Processing DnD-batch-predict-input/20211005225454/output/prediction-dnd-maps-full-grid-model-online-20210913212620-2021-10-05T22:55:16.624758Z/predictions_00001.jsonl...
Length of data: 1086884


In [64]:
bp_data_rows = output_data.split('\n')
print(len(bp_data_rows))

27


In [85]:
import pandas as pd
import json
from fantasy_maps import converter

bp_data_list = []

# List-row format: image_uri, training_data, source, ID 
columns = ['image_uri', 'training_data', 'source', 'ID']

for row in bp_data_rows:
    try:
        row_json = json.loads(row)
        image_gcs_uri = row_json['instance']['content']
        image_name = image_gcs_uri.split('/')[-1]
        image_id = f"bp.{image_name.lower()}"
        source = "reddit"
        
        # Convert BP output to training inputs
        processed_row = converter.convert_batch_predictions_to_training_data(row_json, 0.4)
        
        if processed_row == None:
            continue
        else:
            bp_data_list.append([image_gcs_uri, json.dumps(processed_row), source, image_id])
    except:
        continue
    
df = pd.DataFrame(bp_data_list, columns=columns)
print(df.size)
df.head(2)

16


Unnamed: 0,image_uri,training_data,source,ID
0,gs://video-erschmid/DnD-batch-predict-input/20...,"{""imageGcsUri"": {""content"": ""gs://video-erschm...",reddit,bp.canal_city_battle_map_30x30.jpg
1,gs://video-erschmid/DnD-batch-predict-input/20...,"{""imageGcsUri"": {""content"": ""gs://video-erschm...",reddit,bp.what_do_you_mean?_did_you_pay_30_gold_for_a...


In [86]:
from google.cloud import bigquery

bigquery_client = bigquery.Client()

job_config = job_config = bigquery.LoadJobConfig(schema=[
    bigquery.SchemaField("image_uri", "STRING"),
    bigquery.SchemaField("training_data", "STRING"),
    bigquery.SchemaField("source", "STRING"),
    bigquery.SchemaField("ID", "STRING"),
])

job = bigquery_client.load_table_from_dataframe(
    df, bigquery_table_id, job_config=job_config
)

job.result()

<google.cloud.bigquery.job.load.LoadJob at 0x7f1b1ebaeed0>

## Calculate hash of files

In [89]:
import hashlib

with open("sample_data/gridded-desert-ground.jpg", "rb") as f:
    file_hash = hashlib.md5()
    chunk = f.read(8192)
    while chunk:
            file_hash.update(chunk)
            chunk = f.read(8192)

print(file_hash.digest())
print(file_hash.hexdigest())

b'\x03\x00\xe8\x92\xc4/S\xf8\x82**\x85\x1eE]\x12'
0300e892c42f53f8822a2a851e455d12


In [90]:
import hashlib

with open("sample_data/gridded-desert-ground-copy.jpg", "rb") as f:
    file_hash = hashlib.md5()
    chunk = f.read(8192)
    while chunk:
            file_hash.update(chunk)
            chunk = f.read(8192)

print(file_hash.digest())
print(file_hash.hexdigest())

b'\x03\x00\xe8\x92\xc4/S\xf8\x82**\x85\x1eE]\x12'
0300e892c42f53f8822a2a851e455d12


In [91]:
import hashlib

with open("sample_data/gridded-ruined-keep.jpg", "rb") as f:
    file_hash = hashlib.md5()
    chunk = f.read(8192)
    while chunk:
            file_hash.update(chunk)
            chunk = f.read(8192)

print(file_hash.digest())
print(file_hash.hexdigest())

b'\x8c\xd4\x16\xac\x9fB\r\xb5\x1b\x92\xf5\xb9h\xa1q\x08'
8cd416ac9f420db51b92f5b968a17108
