In [30]:
# Setup
!pip install --quiet --upgrade pip

# Install the dependencies.
!pip install --quiet -r gdal_req.txt



In [14]:
# Restart the runtime by ending the process.
# NOTE(review): presumably needed so the packages installed in the setup cell
# become importable - confirm. Execution has to be resumed manually from the
# next cell once the kernel is back.
exit()

In [1]:
import numpy as np
import rasterio
from google.cloud import storage
import os
import matplotlib.pyplot as plt
from serving.constants import  SCALE, BUCKET, IMG_SOURCE_PREFIX, HIST_DEST_PREFIX, HIST_BINS_LIST, SCALE, CROP, MONTHS, IMAGE_BATCH, hist_bins, SELECTED_BANDS, MAP_NAN, NORMALIZE, NUM_BINS, BANDS, get_bins_bands
from serving.hist_data import recombine_image, write_histogram_to_gcs
from serving.common import list_blobs_with_prefix
from serving.data import get_varied_labels, get_labels
import io
import google.auth
from rasterio.io import MemoryFile
from osgeo import gdal
import time
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from itertools import product

In [20]:
# SETUP
# Notebook-level configuration: local aliases for the constants imported from
# serving.constants, the label-selection arguments, and logging.

# Application-default Google credentials.
# NOTE(review): `credentials` is not referenced by the other visible cells.
credentials, _ = google.auth.default()

bucket_name = BUCKET
bins_list = HIST_BINS_LIST
hist_buckets = NUM_BINS

map_nan = MAP_NAN # How NaN values are handled: replace NaN by 0 (False) or mask them (True)
normalize = NORMALIZE # Divide by the 10 000 reflectance scaling, bringing values roughly into the interval [0, 1.6]

directory_prefix = IMG_SOURCE_PREFIX
output_prefix = HIST_DEST_PREFIX

# Keyword arguments forwarded to get_varied_labels(**immgs_to_check) by the
# histogram-generation cell.
immgs_to_check =  {"count_start":0,
                   "no_records":2000, # total number of possible choices is ~1900, so 2000 and above means all images
                   "ascending": False} 
months = MONTHS



# Send INFO-and-above records to hist.log, each prefixed with timestamp and level.
logging.basicConfig(filename="hist.log",level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [21]:
def img_name_composer(county, state_fips, year, month):
    """Compose the storage name (no file extension) of one source image.

    The result has the layout
    ``<IMG_SOURCE_PREFIX>/<SCALE>/<County>_<state_fips>/<year>/<month>-<month+1>``.

    Args:
        county: County name; it is passed through ``str.capitalize``.
        state_fips: State identifier appended to the county name.
        year: Year path component.
        month: First month of the span; the second one is ``month + 1``.

    Returns:
        The composed name as a string.
    """
    county_dir = f"{county.capitalize()}_{state_fips}"
    month_span = f"{month}-{month+1}"
    return f"{IMG_SOURCE_PREFIX}/{SCALE}/{county_dir}/{year}/{month_span}"

def check_blob_prefix_exists(bucket_name, prefix):
    """Tell whether an object named ``prefix`` exists in ``bucket_name``.

    Despite the parameter name, ``prefix`` is used as a complete blob name:
    the lookup is ``bucket.blob(prefix).exists()``. A new ``storage.Client``
    is created on every call.

    Args:
        bucket_name: Name of the Cloud Storage bucket to look in.
        prefix: Blob name to test.

    Returns:
        The result of ``Blob.exists()`` for that name.
    """
    client = storage.Client()
    return client.bucket(bucket_name).blob(prefix).exists()

def batch_check_blobs(bucket_name, prefixes):
    """Run ``check_blob_prefix_exists`` for many names on a 10-thread pool.

    Args:
        bucket_name: Bucket forwarded to every existence check.
        prefixes: Iterable of blob names to check.

    Returns:
        dict mapping each name in ``prefixes`` to the result of its check.
        An exception raised by any check propagates to the caller.
    """
    existence_by_prefix = {}
    with ThreadPoolExecutor(max_workers=10) as pool:
        # Remember which name every submitted future belongs to.
        pending = {}
        for name in prefixes:
            pending[pool.submit(check_blob_prefix_exists, bucket_name, name)] = name
        # Collect the answers in completion order.
        for finished in as_completed(pending):
            existence_by_prefix[pending[finished]] = finished.result()
    return existence_by_prefix


In [None]:
# Parameter grid for the histogram runs: `combinations` holds every
# (bucket count, band selector, map_nan) triple built from the lists below.

# Each entry is a list of 13 booleans that is handed to get_bins_bands as the
# band selector. NOTE(review): how the flags map to bands is defined by
# get_bins_bands - confirm there.
band_set = [
            #[False, True, True, True, True, True, True, True, True, True, False, False, False],
            [True, True, True, True, True, True, True, True, True, True, False, True, True]
           ]
# Bucket counts to generate histograms for.
bukcets_set = [60]
# NOTE(review): this rebinds `map_nan`, which the setup cell defined as the
# scalar MAP_NAN, to a list of options; the generation loop rebinds it again.
map_nan = [True,]
           #False]

# Cartesian product of the three option lists, materialised as a list of tuples.
combinations = list(product(bukcets_set, band_set, map_nan))


In [None]:
# Create a few sets of histograms
#     - vary sample size
#     - number of buckets
#     - number of bands
#
# For every entry of `combinations` this cell
#   1. lists the candidate (county, year, month) images,
#   2. checks which source images exist and which histograms are still missing,
#   3. computes and uploads the missing histograms.

for n_bins, band_selector, map_nan in combinations:
    # The mapping returned by get_bins_bands is unpacked by position.
    hist_buckets, sel_bands = get_bins_bands(n_bins, band_selector).values()

    def blob_name_composer(county, state_fips, year, month, map_nan, normalize):
        """Compose the destination name (no extension) of one histogram.

        Uses `n_bins` and `sel_bands` of the combination currently being processed.
        """
        blob_name = f"{HIST_DEST_PREFIX}/nan_map_{map_nan}/norm_{normalize}/{n_bins}_buckets_{len(sel_bands)}_bands/{SCALE}/{county.capitalize()}_{state_fips}/{year}/{month}-{month+1}"
        return blob_name

    # Generate all prefixes
    start_time = time.perf_counter()

    # NOTE(review): the labels and the image existence check do not depend on
    # the combination; they could be hoisted out of the loop if
    # get_varied_labels is deterministic - confirm before doing so.
    labels_df = get_varied_labels(**immgs_to_check)
    labels = list(zip(labels_df["county_name"],
                      labels_df["county_ansi"],
                      labels_df["state_ansi"],
                      labels_df["year"]))

    # Build every candidate once: the composer arguments plus the two blob
    # names to look up. Deriving the names from the composers keeps the
    # existence check in step with the names used for processing.
    # NOTE(review): the ".npy" suffix assumes write_histogram_to_gcs appends it
    # to the blob name it is given - confirm.
    candidates = []
    for county, _county_ansi, fips, year in labels:
        for month in months:
            params = {"county": county.capitalize(), "state_fips": fips, "year": year, "month": month}
            img_blob = f"{img_name_composer(**params)}.tif"
            hist_blob = f"{blob_name_composer(**params, map_nan=map_nan, normalize=normalize)}.npy"
            candidates.append((params, img_blob, hist_blob))

    prefixes = [img_blob for _, img_blob, _ in candidates]
    prefixes_hist = [hist_blob for _, _, hist_blob in candidates]

    generate_prefixes = time.perf_counter()

    # Batch check all prefixes
    results_img = batch_check_blobs(bucket_name, prefixes)
    results_hist = batch_check_blobs(bucket_name, prefixes_hist)

    check_bucket = time.perf_counter()

    # Keep only candidates whose image exists and whose histogram does not yet.
    get_input_img_params = [
        params
        for params, img_blob, hist_blob in candidates
        if results_img[img_blob] and not results_hist[hist_blob]
    ]

    generate_valid_list = time.perf_counter()
    print(f"Number of images matching the name pattern: {sum(results_img.values())}")
    print(f"Number of items to process: {len(get_input_img_params)}")
    # `.2f` gives fixed-point output; the previous `.02` spec fell back to
    # general notation such as "5.5e+02".
    print(f"""
    Total run time: {(generate_valid_list - start_time)/60:.2f} minutes
    Check bucket: {check_bucket - generate_prefixes:.2f} seconds
    Generate list of missing histograms: {generate_valid_list - check_bucket:.2f} seconds
    """)

    images_to_process = [img_name_composer(**params) for params in get_input_img_params]
    blob_names = [blob_name_composer(**params, map_nan = map_nan, normalize = normalize) for params in get_input_img_params]
    print(len(blob_names))
    # Show one sample mapping; guard against runs with fewer than three items.
    if blob_names:
        sample_idx = min(2, len(blob_names) - 1)
        print(images_to_process[sample_idx], blob_names[sample_idx], sep=" -> ")

    # Usage
    start_time = time.perf_counter()
    count = 0  # stays 0 when there is nothing to process
    for count, (image_name, blob_name) in enumerate(zip(images_to_process, blob_names), start=1):
        # Pass the loop's map_nan/normalize (not the MAP_NAN/NORMALIZE
        # constants) so the histogram is computed with the same settings that
        # are encoded in its destination path.
        # NOTE(review): assumes the last two positional parameters of
        # recombine_image are the NaN-handling and normalisation flags - confirm.
        recombine_image_hist = recombine_image(BUCKET, image_name, hist_buckets, sel_bands, map_nan, normalize)
        write_histogram_to_gcs(recombine_image_hist, BUCKET, blob_name)
        if count % 100 == 0:
            print(f"Created {count} histograms already")
    end_time = time.perf_counter()

    print(f"Finished, created {count} histograms")
    print(f"Elapsed time {(end_time - start_time)/60:.2f} minutes")

Number of images matching the name pattern: 20697
Number of items to process: 18254

    Total run time: 9.1 minutes
    Check bucket: 5.5e+02 seconds
    Generate list of missing histograms: 0.082 secods
    
18254
images/60/Genesee_36/2016/9-10 -> histograms/nan_map_True/norm_True/60_buckets_9_bands/60/Genesee_36/2016/9-10
Created 100 histograms already
Created 200 histograms already
Created 300 histograms already
Created 400 histograms already
Created 500 histograms already
Created 600 histograms already
Created 700 histograms already
Created 800 histograms already
Created 900 histograms already
Created 1000 histograms already
Created 1100 histograms already
Created 1200 histograms already
Created 1300 histograms already
Created 1400 histograms already
Created 1500 histograms already
Created 1600 histograms already
Created 1700 histograms already
Created 1800 histograms already
Created 1900 histograms already
Created 2000 histograms already
Created 2100 histograms already
Created 22