# Populating Firestore from GCS

In [25]:
! pip install google-cloud-firestore google-cloud-storage



This notebook adds image data to Firestore so that it can later be translated into training data.

In [13]:
# Copy the training-data manifest (a .jsonl file) from Cloud Storage into the current directory.
! gsutil cp gs://fantasy-maps/map_training_data.jsonl .

Copying gs://fantasy-maps/map_training_data.jsonl...
==> NOTE: You are downloading one or more large file(s), which would            
run significantly faster if you enabled sliced object downloads. This
feature is enabled by default but requires that compiled crcmod be
installed (see "gsutil help crcmod").

| [1 files][150.2 MiB/150.2 MiB]                                                
Operation completed over 1 objects/150.2 MiB.                                    


In [5]:
import json

# Parse the JSON Lines manifest: one JSON object per non-blank line.
# Iterating over the file handle avoids holding the whole text plus a split
# copy in memory, and skipping blank lines (instead of always dropping the
# last split item) keeps the final record even when the file does not end
# with a newline.
with open("map_training_data.jsonl", "r", encoding="utf-8") as f:
    json_data = [json.loads(line) for line in f if line.strip()]

In [12]:
# Show the first parsed record to inspect its structure.
print(json_data[0])

{'imageGcsUri': 'gs://fantasy-maps/TrainingInput/GL_OasisCity_Rain.jpg', 'boundingBoxAnnotations': [{'xMin': 0.020972644376899698, 'yMin': 0.014935064935064935, 'yMax': 0.03051948051948052, 'xMax': 0.04285714285714286, 'displayName': 'cell'}, {'xMin': 0.020972644376899698, 'yMin': 0.030086580086580085, 'yMax': 0.04567099567099567, 'xMax': 0.04285714285714286, 'displayName': 'cell'}, {'xMin': 0.020972644376899698, 'yMin': 0.04523809523809524, 'yMax': 0.06082251082251082, 'xMax': 0.04285714285714286, 'displayName': 'cell'}, {'xMin': 0.020972644376899698, 'yMin': 0.06038961038961039, 'yMax': 0.07597402597402597, 'xMax': 0.04285714285714286, 'displayName': 'cell'}, {'xMin': 0.020972644376899698, 'yMin': 0.07554112554112555, 'yMax': 0.09112554112554112, 'xMax': 0.04285714285714286, 'displayName': 'cell'}, {'xMin': 0.020972644376899698, 'yMin': 0.09069264069264069, 'yMax': 0.10627705627705628, 'xMax': 0.04285714285714286, 'displayName': 'cell'}, {'xMin': 0.020972644376899698, 'yMin': 0.10584

## Transfer training data to Firestore

The structure of the data in the JSONL file is:

```
{
    "imageGcsUri": "URI",
    "boundingBoxAnnotations": [
        {
            "displayName": "LABEL_NAME",
            "xMin": ##,
            "xMax": ##,
            "yMin": ##,
            "yMax": ##
        },
        ...
    ]
}
```

[Documentation here.](https://cloud.google.com/vertex-ai/docs/datasets/prepare-image#object-detection)

In [17]:
# Verify that we have the correct data: print the Cloud Storage URI of the first record
print(json_data[0]["imageGcsUri"])

gs://fantasy-maps/TrainingInput/GL_OasisCity_Rain.jpg


Next, let's create our working directory

In [33]:
import os

# Local scratch directory used for working copies of the images.
WORKING_IMG_PATH = "sample_data"

# exist_ok=True makes this cell safe to re-run and removes the gap between
# checking for the directory and creating it; makedirs also creates missing
# parent directories should the path ever become a nested one.
os.makedirs(WORKING_IMG_PATH, exist_ok=True)

Then, let's create a function that downloads an image from Cloud Storage and computes the SHA-1 hash of its contents. The function returns the hash as a "hexdigest" (a string representation of the hash value).

In [32]:
def convert_img_data_sha(img_dict):
    """Download an image from Cloud Storage and return the SHA-1 hex digest of its bytes.

    The image is copied with ``gsutil`` into ``WORKING_IMG_PATH`` (a
    notebook-level constant), hashed in fixed-size chunks, and the local copy
    is removed again before the function returns or raises.

    Args:
        img_dict: A training-data record; only its ``"imageGcsUri"`` key is read.

    Returns:
        The SHA-1 digest of the file contents as a hexadecimal string.

    Raises:
        KeyError: If ``img_dict`` has no ``"imageGcsUri"`` key.
        subprocess.CalledProcessError: If ``gsutil`` exits with a non-zero status.
        OSError: If ``gsutil`` cannot be started or the downloaded file cannot
            be read.
    """
    import hashlib
    import os
    import subprocess

    img_gcs_uri = img_dict["imageGcsUri"]

    # The object name is the last path component of the URI.
    file_name = img_gcs_uri.split("/")[-1]
    local_path = os.path.join(WORKING_IMG_PATH, file_name)

    # Read file into buffer
    # https://stackoverflow.com/questions/22058048/hashing-a-file-in-python
    BUF_SIZE = 65536  # 64 KiB

    try:
        # Pass an argument list (no shell), so a URI containing spaces or shell
        # metacharacters cannot be split or interpreted.
        completed = subprocess.run(
            ["gsutil", "cp", img_gcs_uri, local_path],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
        )
        # Echo gsutil's output into the cell, then fail loudly on a non-zero
        # exit status instead of going on to hash a missing or stale file.
        print(completed.stdout, end="")
        completed.check_returncode()

        sha1 = hashlib.sha1()
        with open(local_path, "rb") as f:
            for chunk in iter(lambda: f.read(BUF_SIZE), b""):
                sha1.update(chunk)
    finally:
        # Clean up the working copy whether or not download/hashing succeeded.
        if os.path.exists(local_path):
            os.remove(local_path)

    return sha1.hexdigest()

Now we can take the image data and store it in our database!

In [35]:
# One Firestore client per project, created on first use and then reused, so
# that a bulk load does not construct a new client for every image.
_FIRESTORE_CLIENTS = {}


def _get_firestore_client(project):
    """Return the cached Firestore client for ``project``, creating it if needed."""
    if project not in _FIRESTORE_CLIENTS:
        # Imported lazily: only needed when a real client has to be built.
        from google.cloud import firestore

        _FIRESTORE_CLIENTS[project] = firestore.Client(project=project)
    return _FIRESTORE_CLIENTS[project]


def store_img_data_fs(img_dict, sha_str, client=None,
                      collection_name="FantasyMaps",
                      project="fantasymaps-334622"):
    """Write one image's metadata to Firestore, using ``sha_str`` as the document ID.

    Args:
        img_dict: A training-data record with ``"imageGcsUri"`` and
            ``"boundingBoxAnnotations"`` keys.
        sha_str: ID of the document to create or overwrite.
        client: Optional object exposing ``collection(name).document(id).set(data)``
            (e.g. a ``firestore.Client``). When omitted, a cached client for
            ``project`` is used.
        collection_name: Name of the target collection.
        project: Project used to build the default client; ignored when
            ``client`` is given.

    Returns:
        None.

    Raises:
        KeyError: If ``img_dict`` lacks one of the required keys.
    """
    if client is None:
        client = _get_firestore_client(project)

    img_gcs_uri = img_dict["imageGcsUri"]

    data = {
        # The file name is the last path component of the URI.
        "filename": img_gcs_uri.split("/")[-1],
        "gcsURI": img_gcs_uri,
        "computedBBoxes": img_dict["boundingBoxAnnotations"],
        "source": "TrainingData",
        # Stored as the literal string "None", not as a null value.
        "userId": "None",
    }

    client.collection(collection_name).document(sha_str).set(data)
    

And finally, to test our functions to ensure they work!

In [47]:
# Smoke test on a single record: hash the first image, then store its metadata
# with that hash passed as the second argument.
img_dict = json_data[0]
sha_str = convert_img_data_sha(img_dict)
store_img_data_fs(img_dict, sha_str)

Copying gs://fantasy-maps/TrainingInput/GL_OasisCity_Rain.jpg...
/ [1 files][ 11.5 MiB/ 11.5 MiB]                                                
Operation completed over 1 objects/11.5 MiB.                                     


Very, very last step: iterate over all the training data and store the metadata in the DB.

In [None]:
# Hash and store every training record. A failure on one image is reported
# together with its URI (so it can be retried) and the loop moves on.
for img_dict in json_data:
    try:
        sha_str = convert_img_data_sha(img_dict)
        store_img_data_fs(img_dict, sha_str)
    # Exception rather than BaseException: KeyboardInterrupt (kernel interrupt)
    # must still be able to stop this long-running loop.
    except Exception as err:
        print(f"Failed to process {img_dict.get('imageGcsUri')}: {err!r}")
        

Copying gs://fantasy-maps/TrainingInput/G_AbandonedMineEntrance_Crystal.jpg...
/ [1 files][  2.5 MiB/  2.5 MiB]                                                
Operation completed over 1 objects/2.5 MiB.                                      
Copying gs://fantasy-maps/TrainingInput/G_AbandonedMineEntrance_Desert_Day.jpg...
/ [1 files][  2.3 MiB/  2.3 MiB]                                                
Operation completed over 1 objects/2.3 MiB.                                      
Copying gs://fantasy-maps/TrainingInput/G_AbandonedMineEntrance_Dragon_Day.jpg...
/ [0 files][    0.0 B/  3.1 MiB]                                                

## Check the results of the metadata creation

Now that (hopefully) all of the image metadata has been added to the DB, we need to do a count/list to make sure that all the data is there.

In [4]:
from google.cloud import firestore

# Open the Firestore collection that the image metadata was written to.
COLLECTION_NAME = "FantasyMaps"
client = firestore.Client(project="fantasymaps-334622")
collection = client.collection(COLLECTION_NAME)

# Tally the documents by walking the collection's full stream.
docs = collection.stream()
count = sum(1 for _ in docs)

print(count)

583


This number **should** be 616 or so ... but we're off by 33. This could mean that 33 images errored out during upload, or that the process timed out. We'll query the collection by GCS URI to see which of the images were not processed.

_Memo to me_: We need an easier way to get the size of a collection in Firestore.

In [6]:
# Collect the Cloud Storage URI of every training record, in file order.
gcs_uris = [record["imageGcsUri"] for record in json_data]

In [7]:
# Spot-check the first collected URI.
print(gcs_uris[0])

gs://fantasy-maps/TrainingInput/GL_OasisCity_Rain.jpg


In [16]:
# For every training-image URI, query the collection for documents whose
# "gcsURI" field equals it. URIs with no match are recorded as missing.
missing_imgs = []
doc_ids = []  # NOTE: holds the matched documents as yielded by stream(), not bare IDs
for uri in gcs_uris:
    matches = list(collection.where("gcsURI", "==", uri).stream())
    doc_ids.extend(matches)

    # Test for emptiness by value; `count is 0` would compare object identity
    # and raises a SyntaxWarning on Python 3.8+.
    if not matches:
        missing_imgs.append(uri)

print(len(missing_imgs))
print(len(doc_ids))

33
580


In [18]:
# List each URI for which no Firestore document was found, one per line.
for missing_uri in missing_imgs:
    print(missing_uri)

gs://fantasy-maps/TrainingInput/G_MountaintopObservatory_ArcaneCannon_DarkSun.jpg
gs://fantasy-maps/TrainingInput/G_MountaintopObservatory_ArcaneCannon_Day.jpg
gs://fantasy-maps/TrainingInput/G_MountaintopObservatory_ArcaneCannon_Green.jpg
gs://fantasy-maps/TrainingInput/G_MountaintopObservatory_ArcaneCannon_Night.jpg
gs://fantasy-maps/TrainingInput/G_MountaintopObservatory_ArcaneCannon_Purple.jpg
gs://fantasy-maps/TrainingInput/G_MountaintopObservatory_BonesOfTheGods.jpg
gs://fantasy-maps/TrainingInput/G_MountaintopObservatory_Camp_Day.jpg
gs://fantasy-maps/TrainingInput/G_MountaintopObservatory_Camp_Night.jpg
gs://fantasy-maps/TrainingInput/G_MountaintopObservatory_CosmicDragon_Day.jpg
gs://fantasy-maps/TrainingInput/G_MountaintopObservatory_CosmicDragon_Night.jpg
gs://fantasy-maps/TrainingInput/G_MountaintopObservatory_Fireflies.jpg
gs://fantasy-maps/TrainingInput/G_MountaintopObservatory_HotAirBalloon_Day.jpg
gs://fantasy-maps/TrainingInput/G_MountaintopObservatory_HotAirBalloon_Ni

In [28]:
# Keep only the training records whose image URI was reported missing.
missing_data = (record for record in json_data if record["imageGcsUri"] in missing_imgs)
missing_data_list = list(missing_data)

In [29]:
# Number of training records selected for a retry.
print(len(missing_data_list))

33


In [36]:
# Retry the records that were not found in Firestore. A failure on one image
# is reported together with its URI and the loop moves on to the next.
for img_dict in missing_data_list:
    try:
        sha_str = convert_img_data_sha(img_dict)
        store_img_data_fs(img_dict, sha_str)
    # Exception rather than BaseException: KeyboardInterrupt (kernel interrupt)
    # must still be able to stop the loop.
    except Exception as err:
        print(f"Failed to process {img_dict.get('imageGcsUri')}: {err!r}")

Copying gs://fantasy-maps/TrainingInput/G_MountaintopObservatory_ArcaneCannon_DarkSun.jpg...
/ [1 files][  6.0 MiB/  6.0 MiB]                                                
Operation completed over 1 objects/6.0 MiB.                                      
Copying gs://fantasy-maps/TrainingInput/G_MountaintopObservatory_ArcaneCannon_Day.jpg...
/ [1 files][  8.8 MiB/  8.8 MiB]                                                
Operation completed over 1 objects/8.8 MiB.                                      
Copying gs://fantasy-maps/TrainingInput/G_MountaintopObservatory_ArcaneCannon_Green.jpg...
/ [1 files][  6.4 MiB/  6.4 MiB]                                                
Operation completed over 1 objects/6.4 MiB.                                      
Copying gs://fantasy-maps/TrainingInput/G_MountaintopObservatory_ArcaneCannon_Night.jpg...
/ [1 files][  6.6 MiB/  6.6 MiB]                                                
Operation completed over 1 objects/6.6 MiB.                       