In [23]:
# Setup
!pip install --quiet --upgrade pip

# Install the dependencies.
!pip install --quiet -r gdal_req.txt

# Restart the runtime by ending the process.
exit()

In [1]:
import numpy as np
import rasterio
from google.cloud import storage
import os
import matplotlib.pyplot as plt
from utils.constants import  BUCKET, IMG_SOURCE_PREFIX, HIST_DEST_PREFIX, NUM_BANDS, HIST_BINS_LIST
from serving_hist.hist_data import list_blobs_with_prefix, load_tiff_from_gcs_mem, download_and_process_tiff,load_tiff_from_gcs_temp
import io
import google.auth
from rasterio.io import MemoryFile
from osgeo import gdal
import time
import logging

In [2]:
# SETUP
# Credentials come from google.auth.default(); the second element of the
# returned tuple is discarded.
credentials, _ = google.auth.default()

# Bucket and object-name prefixes are taken from the shared project constants.
bucket_name, directory_prefix, output_prefix = (
    BUCKET,
    IMG_SOURCE_PREFIX,
    HIST_DEST_PREFIX,
)

# Log records of level INFO and above go to hist.log.
logging.basicConfig(
    filename="hist.log",
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

In [3]:
# Data processing example
# Object-name prefix of the example image: the source prefix followed by the
# "Orleans", "_2017" and "_9-10_100" name parts.
image_name = f"{directory_prefix}Orleans_2017_9-10_100"


In [4]:
image_name

'images/Orleans_2017_9-10_100'

In [5]:
# Look up the blobs for the example image via the serving_hist.hist_data helper
# (presumably it matches on the object-name prefix -- confirm in the helper).
blobs = list_blobs_with_prefix(bucket_name, image_name)

In [6]:
def download_blob_into_memory(bucket_name, blob_name):
    """Fetch one Cloud Storage object and return its content.

    Args:
        bucket_name: ID of the GCS bucket, e.g. "your-bucket-name".
        blob_name: ID of the GCS object, e.g. "storage-object-name".

    Returns:
        Whatever ``Blob.download_as_bytes()`` returns for that object.
    """
    # `Bucket.blob` only builds a client-side handle; unlike `Bucket.get_blob`
    # it does not retrieve anything from Cloud Storage, and no additional data
    # is needed before the download itself.
    blob_handle = storage.Client().bucket(bucket_name).blob(blob_name)
    return blob_handle.download_as_bytes()


In [25]:
# NOTE(review): blob_name is not assigned in any cell visible in this notebook;
# it must already exist in the kernel for this cell (and the ones using it) to run.
blob_name

'images/Orleans_2017_9-10_100_0000000000-0000008192.tif'

In [24]:
# Time how long it takes to download one blob into memory as raw bytes.
start_time = time.time()

# NOTE(review): blob_name is not assigned in any cell visible here.
# `test` keeps the downloaded bytes; a later cell opens them with MemoryFile.
test = download_blob_into_memory(bucket_name, blob_name)

end_time = time.time()
execution_time = end_time - start_time
print(f"Load all bands of one image into memory: {execution_time:.4f} seconds")

Load all bands of one image into memory: 0.4145 seconds


In [11]:
# Time opening the blob as a file object with rasterio and reading only the
# band count (no pixel data is read in this cell).
start_time = time.time()

storage_client = storage.Client()
# NOTE(review): blob_name is not assigned in any cell visible here.
blob = storage_client.bucket(BUCKET).blob(blob_name)
with blob.open("rb") as f:
     with rasterio.open(f) as src:
            # Number of bands reported by the dataset.
            num_bands = src.count

end_time = time.time()
execution_time = end_time - start_time
print(f"whiel keeping file as file and reading as needed: {execution_time:.4f} seconds")            

whiel keeping file as file and reading as needed: 0.1744 seconds


In [26]:
# Time decoding a single band from the bytes downloaded earlier (`test`).
start_time = time.time()

with MemoryFile(test) as memfile:
    with memfile.open() as src:
        # Read band 11 into `array`; later cells histogram this array.
        array = src.read(11)
        
end_time = time.time()
execution_time = end_time - start_time
print(f"Convery one band into array: {execution_time:.4f} seconds")        

Convery one band into array: 11.2360 seconds


In [29]:
# Keep only the non-NaN values of the band read above.
array_clean = array[~np.isnan(array)]

array([10. ,  9. ,  9. , ..., 10. ,  9.5,  9. ], dtype=float32)

In [35]:
# Histogram the NaN-free values using the bin specification stored at index 10
# of HIST_BINS_LIST (presumably the bins for band 11, i.e. index = band - 1,
# which is how process_tiff selects bins -- confirm).
clean_hist = np.histogram(array_clean, HIST_BINS_LIST[10])
clean_hist

(array([   0,    0,    0,    0,    0,    0,    0,    0,    1,    5,  119,
         903, 1138, 3199, 3009, 5217, 8462, 1126,  776,  185,  205,   41,
           2,    1,    0,    0,    0,    0,    0,    0,    0,    0]),
 array([ 1.     ,  1.59375,  2.1875 ,  2.78125,  3.375  ,  3.96875,
         4.5625 ,  5.15625,  5.75   ,  6.34375,  6.9375 ,  7.53125,
         8.125  ,  8.71875,  9.3125 ,  9.90625, 10.5    , 11.09375,
        11.6875 , 12.28125, 12.875  , 13.46875, 14.0625 , 14.65625,
        15.25   , 15.84375, 16.4375 , 17.03125, 17.625  , 18.21875,
        18.8125 , 19.40625, 20.     ]))

In [9]:
def create_histogram_skip_nan(image, bins=256):
    """Histogram the non-NaN values of an image band.

    Args:
        image: Array-like of pixel values; NaN entries are ignored.
        bins: Bin specification forwarded unchanged to ``np.histogram``
            (an integer bin count or a sequence of edges). Defaults to 256.

    Returns:
        Tuple ``(hist, bin_edges)`` exactly as returned by ``np.histogram``
        with ``density=False``, so ``hist`` holds raw counts.
    """
    # Flatten to 1-D; np.asarray also lets plain sequences through.
    flat_image = np.asarray(image).ravel()

    # Drop NaNs but keep the values in their own dtype. Casting to uint16 here
    # truncated fractional values (9.5 became 9) and mangled negative values,
    # which moved pixels into the wrong bin whenever the edges are not integers.
    non_nan_values = flat_image[~np.isnan(flat_image)]

    hist, bin_edges = np.histogram(non_nan_values, bins=bins, density=False)

    return hist, bin_edges

In [19]:
# Time building the histogram of the single band held in `array`.
start_time = time.time()

# NOTE(review): `bins` is not assigned in any cell visible here; it must
# already exist in the kernel for this call to run.
hist_skip_nan = create_histogram_skip_nan(array,bins)

end_time = time.time()
execution_time = end_time - start_time
print(f"Create single band histogram: {execution_time:.4f} seconds")    

Create single band histogram: 0.2079 seconds


In [48]:
from concurrent.futures import ThreadPoolExecutor, as_completed
# Image prefix processed by the threaded histogram run in this cell.
image_name = r"images/Canyon_2017_5-6_100"
# image_name = r"images/Story_2018_9-10_100_.tif"
# NOTE(review): recombine_image performs its own listing, so this module-level
# `blobs` is not read by the code in this cell.
blobs = list_blobs_with_prefix(bucket_name, image_name)

def process_band(blob_name, band, bins):
    """Read one band of a raster blob and histogram its non-NaN pixels.

    Args:
        blob_name: Name of the object inside ``BUCKET`` to open.
        band: Band index handed to ``src.read`` (callers count from 1).
        bins: Sequence of bin edges. ``bins[0]`` and ``bins[-1]`` are treated
            as the expected value range for this band.

    Returns:
        Array of counts produced by ``create_histogram_skip_nan`` for the
        band, or ``len(bins) - 1`` integer zeros when the band holds no
        non-NaN pixel.
    """
    storage_client = storage.Client()
    blob = storage_client.bucket(BUCKET).blob(blob_name)

    # Only the read needs the open file; the array is processed afterwards.
    with blob.open("rb") as f:
        with rasterio.open(f) as src:
            data = src.read(band)

    # Keep the values in their own dtype: a uint16 cast would truncate
    # fractional values and hide out-of-range (e.g. negative) pixels.
    valid_data = data[~np.isnan(data)]

    # Guard first: np.max / np.min raise on an empty array.
    if valid_data.size == 0:
        # One count per bin, same integer kind as np.histogram returns.
        return np.zeros(len(bins) - 1, dtype=np.int64)

    valid_max = np.max(valid_data)
    valid_min = np.min(valid_data)

    # Both range checks are independent: a band can overflow on both sides.
    # The blob name (not a notebook global) identifies the offending file.
    if valid_max > bins[-1]:
        logging.warning(f"image: {blob_name}, band: {band}, {valid_max} value is larger than assumed possible values for this band")
    if valid_min < bins[0]:
        logging.warning(f"image: {blob_name}, band: {band}, {valid_min} value is smaller than assumed possible values for this band")

    hist, _ = create_histogram_skip_nan(valid_data, bins)
    return hist

def process_tiff(bucket_name, blob_name, max_workers=4):
    """Histogram every band of one blob in parallel and concatenate the results.

    Args:
        bucket_name: Kept for interface compatibility; currently unused because
            ``process_band`` always reads from the module-level ``BUCKET``.
        blob_name: Name of the blob whose bands are processed.
        max_workers: Size of the thread pool.

    Returns:
        One 1-D array holding the histograms of bands ``1..NUM_BANDS`` back to
        back, always in band order. A band whose processing raised is logged
        and contributes zeros, so the layout stays identical across blobs.
    """
    hist_by_band = {}

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_band = {
            executor.submit(process_band, blob_name, band, HIST_BINS_LIST[band - 1]): band
            for band in range(1, NUM_BANDS + 1)
        }

        # Futures complete in arbitrary order, so results are keyed by band.
        for future in as_completed(future_to_band):
            band = future_to_band[future]
            try:
                hist_by_band[band] = future.result()
                logging.info(f"Processed band {band} successfully")
            except Exception as exc:
                logging.exception(f'Band {band} generated an exception: {exc}')
                # Keep the slot so later bands do not shift position.
                hist_by_band[band] = np.zeros(len(HIST_BINS_LIST[band - 1]) - 1, dtype=np.int64)

    # Assemble strictly by band number. Sorting the histograms themselves
    # (by their first count) would scramble the band order.
    ordered = [np.asarray(hist_by_band[band]).ravel() for band in range(1, NUM_BANDS + 1)]
    if not ordered:
        return np.array([])
    # concatenate (rather than np.array + flatten) also copes with bands whose
    # histograms have different lengths.
    return np.concatenate(ordered)  # one long array instead of bands

def recombine_image(blob_name):
    """Sum the per-blob histograms of every blob found for an image prefix.

    Args:
        blob_name: Prefix passed to ``list_blobs_with_prefix`` to find the
            blobs that make up the image.

    Returns:
        Element-wise sum (``np.sum(..., axis=0)``) of the arrays returned by
        ``process_tiff`` for each blob found.
    """
    start_time = time.time()

    # Use BUCKET for both the listing and the processing call instead of
    # mixing it with the notebook-level `bucket_name` global.
    blobs = list_blobs_with_prefix(BUCKET, blob_name)
    hist_per_blob = [process_tiff(BUCKET, blob.name) for blob in blobs]

    if not hist_per_blob:
        logging.warning(f"No blobs found for prefix {blob_name}")

    combined_hist = np.sum(np.array(hist_per_blob), axis=0)

    execution_time = time.time() - start_time
    # The original call was missing its closing parenthesis (a SyntaxError).
    logging.info(f"Image {blob_name} has been processed in {execution_time/60:.4f} minutes")

    return combined_hist

# Usage
start_time = time.time()

recombine_image_hist = recombine_image(image_name)

# Measure this run explicitly; otherwise `execution_time` would be whatever an
# earlier cell left in the kernel (or undefined on a fresh kernel).
execution_time = time.time() - start_time

print(f"Total execution time: {execution_time/60:.4f} minutes")


(658, 624)
(658, 624)
(658, 624)
(658, 624)
(658, 624)
(658, 624)
(658, 624)
(658, 624)
(658, 624)
(658, 624)
(658, 624)
(658, 624)
(658, 624)
Total execution time: 0.0203 minuntes


In [52]:
sum(recombine_image_hist)

np.int64(295125)

In [21]:
# Time reading and histogramming a single band of one blob.
start_time = time.time()

# process_band takes the blob name, opens the blob itself and returns only the
# histogram, so it is called with the name (not an open dataset) and a single
# result is bound. Band 1 uses HIST_BINS_LIST[0], the same band-to-bins mapping
# process_tiff applies.
# NOTE(review): blob_name is not assigned in any cell visible here.
hist = process_band(blob_name, 1, HIST_BINS_LIST[0])

end_time = time.time()
execution_time = end_time - start_time
print(f"Read and process one img band: {execution_time:.4f} seconds")

Read and process one img band: 11.5494 seconds


bedzie
