In [7]:
# Setup
!pip install --quiet --upgrade pip

# Install the dependencies.
!pip install --quiet -r gdal_req.txt

# Restart the runtime by ending the process.
exit()

In [1]:
import numpy as np
import rasterio
from google.cloud import storage
import os
import matplotlib.pyplot as plt
from serving.constants import  SCALE, BUCKET, IMG_SOURCE_PREFIX, HIST_DEST_PREFIX, NUM_BANDS, HIST_BINS_LIST, SCALE, CROP, MONTHS, IMAGE_BATCH
from serving.hist_data import recombine_image, write_histogram_to_gcs
from serving.common import list_blobs_with_prefix
from serving.data import get_varied_labels, get_labels
import io
import google.auth
from rasterio.io import MemoryFile
from osgeo import gdal
import time
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
# SETUP
# google.auth.default() returns a (credentials, project) pair; only the
# credentials are kept here.
credentials, _ = google.auth.default()

# Bucket and object-name roots, all taken from serving.constants.
bucket_name = BUCKET
directory_prefix, output_prefix = IMG_SOURCE_PREFIX, HIST_DEST_PREFIX

# Send INFO-and-above log records to hist.log, one timestamped line each.
logging.basicConfig(
    filename="hist.log",
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

In [3]:
def check_blob_prefix_exists(bucket_name, prefix, storage_client=None):
    """Return True if at least one blob in the bucket starts with ``prefix``.

    Args:
        bucket_name: Name of the GCS bucket to look in.
        prefix: Object-name prefix to test.
        storage_client: Optional client object to reuse. When omitted, a new
            ``storage.Client()`` is created for this call (the previous
            behaviour).

    Returns:
        bool: True when the listing yields at least one blob, False otherwise.
    """
    if storage_client is None:
        storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)

    # max_results=1: a single hit is enough to answer "does anything exist?".
    blobs = bucket.list_blobs(prefix=prefix, max_results=1)
    # Count "yielded anything" rather than relying on the truthiness of the
    # yielded objects.
    return any(True for _ in blobs)


def batch_check_blobs(bucket_name, prefixes, storage_client=None, max_workers=10):
    """Check many prefixes concurrently and report which ones exist.

    Args:
        bucket_name: Name of the GCS bucket to look in.
        prefixes: Iterable of object-name prefixes. Duplicates are checked
            only once.
        storage_client: Optional client shared by all checks. When omitted,
            one ``storage.Client()`` is created here and shared, instead of
            one client per prefix.
        max_workers: Size of the thread pool (default 10, as before).

    Returns:
        dict: Maps each distinct prefix to the bool returned by
        ``check_blob_prefix_exists``.

    Raises:
        Any exception raised by an individual check is re-raised by
        ``future.result()``.
    """
    # dict.fromkeys removes duplicates while keeping first-seen order.
    unique_prefixes = list(dict.fromkeys(prefixes))
    if not unique_prefixes:
        return {}

    # NOTE(review): the shared client is used from several threads at once;
    # the google-cloud-python docs describe this as safe - confirm for the
    # installed google-cloud-storage version.
    if storage_client is None:
        storage_client = storage.Client()

    results = {}
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_prefix = {
            executor.submit(
                check_blob_prefix_exists, bucket_name, prefix, storage_client
            ): prefix
            for prefix in unique_prefixes
        }
        for future in as_completed(future_to_prefix):
            results[future_to_prefix[future]] = future.result()
    return results

# Generate all prefixes
start_time = time.perf_counter()

# One (county_name, county_ansi, state_ansi, year) tuple per label row.
labels_df = get_varied_labels(*IMAGE_BATCH)
labels = list(zip(labels_df["county_name"],
                  labels_df["county_ansi"],
                  labels_df["state_ansi"],
                  labels_df["year"]))
months = MONTHS

# Build each candidate once: the keyword arguments later handed to the name
# composers, plus the path suffix shared by the image and histogram prefixes.
# Note that `fips` is the *state* code (third element of each label tuple);
# the county code is not part of the object path.
candidates = [
    (
        {"county": county.capitalize(), "state_fips": fips, "year": year, "month": month},
        f"{SCALE}/{county.capitalize()}_{fips}/{year}/{month}-{month+1}",
    )
    for county, _county_fips, fips, year in labels
    for month in months
]

# Use the same root constants as img_name_composer / blob_name_composer so the
# existence check and the processing step always look at the same objects.
prefixes = [f"{IMG_SOURCE_PREFIX}/{suffix}" for _, suffix in candidates]
prefixes_hist = [f"{HIST_DEST_PREFIX}/{suffix}" for _, suffix in candidates]

generate_prefixes = time.perf_counter()

# Batch check all prefixes
results_img = batch_check_blobs(bucket_name, prefixes)
results_hist = batch_check_blobs(bucket_name, prefixes_hist)

check_bucket = time.perf_counter()

# Generate get_input_img_params based on results:
# keep a candidate only if its image exists and its histogram does not yet.
get_input_img_params = [
    params
    for (params, _), img_prefix, hist_prefix in zip(candidates, prefixes, prefixes_hist)
    if results_img[img_prefix] and not results_hist[hist_prefix]
]

generate_valid_list = time.perf_counter()

print(f"Number of items to process: {len(get_input_img_params)}")
# Fixed-point formatting: the previous specs (":.02" / ":02") produced
# 2 significant digits or an unrounded float respectively.
print(f"""
Total run time: {(generate_valid_list - start_time)/60:.2f} minutes
Check bucket: {check_bucket - generate_prefixes:.2f} seconds
Generate list of missing histograms: {generate_valid_list - check_bucket:.3f} seconds
""")

# Existing histograms are already skipped above: an item is kept only when its histogram prefix is absent (results_hist).

Number of items to process: 556

Total run time: 0.62 minutes
Check bucket: 37.28128914599999 seconds
Generate list of missing histograms: 0.004 secods



In [4]:
def img_name_composer(county, state_fips, year, month):
    """Build the source-image object prefix for one county / year / month pair.

    The result has the form
    ``<IMG_SOURCE_PREFIX>/<SCALE>/<County>_<state_fips>/<year>/<month>-<month+1>``,
    with the county name passed through ``str.capitalize()``.
    """
    county_dir = f"{county.capitalize()}_{state_fips}"
    month_span = f"{month}-{month+1}"
    return f"{IMG_SOURCE_PREFIX}/{SCALE}/{county_dir}/{year}/{month_span}"

def blob_name_composer(county, state_fips, year, month):
    """Build the destination histogram blob name for one county / year / month pair.

    The result has the form
    ``<HIST_DEST_PREFIX>/<SCALE>/<County>_<state_fips>/<year>/<month>-<month+1>``,
    with the county name passed through ``str.capitalize()``.
    """
    county_dir = f"{county.capitalize()}_{state_fips}"
    month_span = f"{month}-{month+1}"
    return f"{HIST_DEST_PREFIX}/{SCALE}/{county_dir}/{year}/{month_span}"

# Turn every pending parameter set into its source image prefix and its
# destination histogram blob name (the two lists stay index-aligned).
images_to_process = [img_name_composer(**params) for params in get_input_img_params]
blob_names = [blob_name_composer(**params) for params in get_input_img_params]
print(len(blob_names))

# Print one sample mapping as a sanity check. Index 10 is used when available;
# a shorter work list falls back to its last entry and an empty one (e.g. a
# re-run after everything was processed) prints nothing instead of raising
# IndexError.
if blob_names:
    sample_idx = min(10, len(blob_names) - 1)
    print(images_to_process[sample_idx], blob_names[sample_idx], sep=" -> ")

556
images/60/Leelanau_26/2021/7-8 -> histograms/60/Leelanau_26/2021/7-8


In [None]:
# Usage
# Build and upload one histogram per pending image, reporting progress every
# 100 items and the total elapsed time at the end.
start_time = time.perf_counter()
count = 0
for done, (image_name, blob_name) in enumerate(zip(images_to_process, blob_names), start=1):
    recombine_image_hist = recombine_image(BUCKET, image_name, HIST_BINS_LIST, NUM_BANDS)
    write_histogram_to_gcs(recombine_image_hist, BUCKET, blob_name)
    # Advance the counter only after the write returned, so that after an
    # exception `count` still equals the number of histograms written.
    count = done
    if not count % 100:
        print(f"Created {count} histograms already")
end_time = time.perf_counter()

print(f"Finished, created {count} histograms")
print(f"Elapsed time {(end_time - start_time)/60} minutes")

Created 100 histograms already
Created 200 histograms already
Created 300 histograms already
Created 400 histograms already
Created 500 histograms already
Finished, created 556 histograms
