In [14]:
# Setup
!pip install --quiet --upgrade pip

# Install the dependencies.
!pip install --quiet -r gdal_req.txt

# Restart the runtime by ending the process.
exit()

In [6]:
import numpy as np
import rasterio
from google.cloud import storage
import os
import matplotlib.pyplot as plt
from utils.constants import  BUCKET, IMG_SOURCE_PREFIX, HIST_DEST_PREFIX, NUM_BANDS, HIST_BINS_LIST
from serving_hist.hist_data import list_blobs_with_prefix, load_tiff_from_gcs_mem, download_and_process_tiff,load_tiff_from_gcs_temp
import io
import google.auth
from rasterio.io import MemoryFile
from osgeo import gdal
import time
import logging

In [7]:
# SETUP
# Resolve default Google credentials; the second return value is discarded.
credentials, _ = google.auth.default()

# Bucket name and object prefixes come from utils.constants.
bucket_name = BUCKET
directory_prefix = IMG_SOURCE_PREFIX
output_prefix = HIST_DEST_PREFIX

# Write INFO-and-above log records to hist.log with a timestamped format.
logging.basicConfig(
    filename="hist.log",
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

In [8]:
# Data processing example
# Build the object-name prefix used to list images: the source prefix followed
# by the "Orleans", "_2017" and "_9-10_100" name components.
image_name = directory_prefix + "Orleans" + "_2017" + "_9-10_100"


In [9]:
# Query the bucket using the prefix held in image_name. The helper comes from
# serving_hist.hist_data; presumably it returns the matching blobs — confirm.
blobs = list_blobs_with_prefix(bucket_name, image_name)

In [5]:
def download_blob_into_memory(bucket_name, blob_name):
    """Fetch a single Cloud Storage object and return its content.

    Args:
        bucket_name: ID of the GCS bucket, e.g. "your-bucket-name".
        blob_name: ID of the GCS object, e.g. "storage-object-name".

    Returns:
        Whatever ``Blob.download_as_bytes()`` returns for the object.
    """
    client = storage.Client()

    # `Bucket.blob` only builds a client-side handle (unlike `Bucket.get_blob`,
    # it retrieves nothing from Cloud Storage); the content is fetched by
    # `download_as_bytes()`.
    return client.bucket(bucket_name).blob(blob_name).download_as_bytes()


In [10]:
# Object name of the single .tif used by the timing experiments in later cells.
# Later cells (and process_band) read this notebook-level variable directly.
blob_name = r'images/Orleans_2017_9-10_100_0000000000-0000008192.tif'

In [13]:
# Time a full download of the object into memory.
start_time = time.time()

# `test` holds the value returned by blob.download_as_bytes(); it is reused by
# the MemoryFile cell further down.
test = download_blob_into_memory(bucket_name, blob_name)

end_time = time.time()
execution_time = end_time - start_time
print(f"Load all bands of one image into memory: {execution_time:.4f} seconds")

Load all bands of one image into memory: 0.1229 seconds


In [11]:
start_time = time.time()

# Open the object as a file-like stream and let rasterio read from it on demand;
# only the band count is read inside the timed section.
storage_client = storage.Client()
blob = storage_client.bucket(BUCKET).blob(blob_name)
with blob.open("rb") as f, rasterio.open(f) as src:
    num_bands = src.count

end_time = time.time()
execution_time = end_time - start_time
print(f"whiel keeping file as file and reading as needed: {execution_time:.4f} seconds")

whiel keeping file as file and reading as needed: 0.1744 seconds


In [18]:
start_time = time.time()

# Wrap the already-downloaded bytes in a rasterio MemoryFile and decode band 1
# into an array.
with MemoryFile(test) as memfile, memfile.open() as src:
    array = src.read(1)

end_time = time.time()
execution_time = end_time - start_time
print(f"Convery one band into array: {execution_time:.4f} seconds")

Convery one band into array: 11.2461 seconds


In [41]:
array.max()

np.float32(nan)

In [11]:
# Replace NaN entries with 0.0 so that max() is no longer NaN.
# Note: np.nan_to_num also replaces +/-inf with large finite numbers.
array_clean = np.nan_to_num(array, nan=0.0)
array_clean.max()

np.float32(2193.5)

In [11]:
def create_histogram_skip_nan(image, bins=256):
    """Histogram the non-NaN values of ``image`` after casting them to uint16.

    Args:
        image: NumPy array of values; NaN entries are dropped before counting.
        bins: Forwarded unchanged to ``np.histogram`` (a bin count or an
            explicit sequence of bin edges).

    Returns:
        The ``(hist, bin_edges)`` pair produced by ``np.histogram`` with
        ``density=False``.
    """
    pixels = image.flatten()
    keep = np.logical_not(np.isnan(pixels))

    # The uint16 cast drops fractional parts before the values are binned.
    non_nan_uint16 = pixels[keep].astype(np.uint16)

    return np.histogram(non_nan_uint16, bins=bins, density=False)

# The length of a 33-point linspace is 33, so `bins` is an integer bin COUNT here.
# NOTE(review): if 33 bin EDGES (i.e. 32 bins) over [1, 2200] were intended, the
# linspace array itself would have to be passed instead of its length — confirm.
bins = len(np.linspace(1, 2200, 33))

In [19]:
# Time the histogram of the single band decoded into `array` by an earlier cell.
start_time = time.time()

# hist_skip_nan is the (hist, bin_edges) tuple returned by the helper.
hist_skip_nan = create_histogram_skip_nan(array,bins)

end_time = time.time()
execution_time = end_time - start_time
print(f"Create single band histogram: {execution_time:.4f} seconds")

Create single band histogram: 0.2079 seconds


In [46]:
type(hist_skip_nan[0])

numpy.ndarray

Create single band histogram: 12.8084 seconds


In [24]:
data

array([[nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]], dtype=float32)

In [27]:
# Largest non-NaN value. np.nanmax skips NaNs in one vectorised pass, whereas the
# Python builtin max() walked the masked copy element by element.
# For an all-NaN array this yields nan (with a RuntimeWarning) instead of raising.
# NOTE(review): `data` is not assigned at notebook level in any visible cell
# (only inside process_band) — confirm where it comes from on a fresh kernel.
np.nanmax(data)

np.float32(2193.5)

In [12]:
from concurrent.futures import ThreadPoolExecutor, as_completed

def process_band(band, bins, bucket=None, blob_path=None):
    """Read one band of a raster stored in GCS and summarise its non-NaN values.

    Args:
        band: Band index passed to ``src.read`` (rasterio band indexes start at 1).
        bins: Bin specification forwarded to ``create_histogram_skip_nan`` and
            from there to ``np.histogram`` (a bin count or a sequence of edges).
        bucket: Name of the bucket to read from. Defaults to ``BUCKET``.
        blob_path: Name of the object to read. Defaults to the notebook-level
            ``blob_name`` variable, which must then have been set by an
            earlier cell.

    Returns:
        ``(band, mean, hist)`` where ``mean`` is the mean of the non-NaN values
        after the uint16 cast and ``hist`` is the histogram counts. When the
        band contains no non-NaN value, ``mean`` is NaN and ``hist`` is all
        zeros with the same length a non-empty band would produce.
    """
    if bucket is None:
        bucket = BUCKET
    if blob_path is None:
        blob_path = blob_name  # notebook-level global

    storage_client = storage.Client()
    blob = storage_client.bucket(bucket).blob(blob_path)

    # Only the read needs the remote file open; the array is processed afterwards.
    with blob.open("rb") as f, rasterio.open(f) as src:
        data = src.read(band)

    print(data.shape)
    valid_data = data[~np.isnan(data)].astype(np.uint16)

    # np.histogram accepts empty input and returns all-zero counts of the right
    # length for both an integer bin count and explicit edges, so no special
    # case is needed for the histogram itself.
    hist, _ = create_histogram_skip_nan(valid_data, bins)

    if valid_data.size > 0:
        # min/max are only defined for non-empty data, so log them here rather
        # than before the emptiness check.
        logging.info(
            f"image: {blob_path}, band: {band}, "
            f"min: {np.min(valid_data)}, max: {np.max(valid_data)}"
        )
        mean = np.sum(valid_data) / valid_data.size
    else:
        logging.warning(f"image: {blob_path}, band: {band}, no non-NaN values")
        mean = np.nan

    return band, mean, hist

def process_tiff(bucket_name, blob_name, bins, max_workers=4):
    """Process bands 1..NUM_BANDS of one raster concurrently.

    Args:
        bucket_name: Bucket holding the object; forwarded to ``process_band``.
        blob_name: Object name; forwarded to ``process_band``.
        bins: Currently unused. Each band uses its own entry of
            ``HIST_BINS_LIST`` instead. NOTE(review): confirm whether this
            argument should override the per-band bins.
        max_workers: Size of the thread pool.

    Returns:
        List of ``(band, mean, hist)`` tuples sorted by band number. Bands whose
        processing raised are reported and left out of the list.
    """
    results = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_band = {
            executor.submit(
                process_band, band, HIST_BINS_LIST[band - 1], bucket_name, blob_name
            ): band
            for band in range(1, NUM_BANDS + 1)
        }

        for future in as_completed(future_to_band):
            band = future_to_band[future]
            try:
                results.append(future.result())
                print(f"Processed band {band} successfully")
            except Exception as exc:
                # Best effort: keep going with the other bands, but record the
                # traceback in the log as well as printing the message.
                logging.exception(f"Band {band} generated an exception")
                print(f'Band {band} generated an exception: {exc}')

    return sorted(results, key=lambda x: x[0])  # Sort results by band number

# Usage
# Time the processing of every band of the object named by `blob_name`.
start_time = time.time()

bucket_name = BUCKET

# Integer bin count (33). process_tiff, as defined in this cell, does not use
# this argument: it takes each band's bins from HIST_BINS_LIST.
bins = len(np.linspace(1, 2200, 33))  

# `results` is a list of (band, mean, hist) tuples sorted by band number.
results = process_tiff(bucket_name, blob_name, bins)

end_time = time.time()
execution_time = end_time - start_time
print(f"Total execution time: {execution_time:.4f} seconds")


(8192, 8192)
Processed band 2 successfully
(8192, 8192)
Processed band 4 successfully
(8192, 8192)
Processed band 1 successfully
(8192, 8192)
Processed band 3 successfully
(8192, 8192)
Processed band 8 successfully
(8192, 8192)
Processed band 7 successfully
(8192, 8192)
Processed band 5 successfully
(8192, 8192)
Processed band 6 successfully
(8192, 8192)
Processed band 9 successfully
(8192, 8192)
Processed band 10 successfully
(8192, 8192)
Processed band 11 successfully
(8192, 8192)
Processed band 12 successfully
(8192, 8192)
Processed band 13 successfully
Total execution time: 55.0878 seconds


In [21]:
start_time = time.time()
storage_client = storage.Client()
blob = storage_client.bucket(BUCKET).blob(blob_name)
with blob.open("rb") as f, rasterio.open(f) as src:
    num_bands = src.count

# process_band is defined as process_band(band, bins) and opens the object
# itself, so it must not be handed the open dataset as a first argument.
_, _, hist = process_band(1, bins)

end_time = time.time()
execution_time = end_time - start_time
print(f"Read and process one img band: {execution_time:.4f} seconds")

Read and process one img band: 11.5494 seconds


In [22]:
hist

array([ 309, 2456, 5964, 9962, 3882, 1049,  329,  169,  125,   73,   31,
         14,    5,    1,    3,    4,    3,    0,    0,    4,    0,    0,
          0,    1,    0,    0,    0,    1,    1,    2,    0,    0,    1])

In [26]:
hist_skip_nan[0]

array([ 309, 2456, 5964, 9962, 3882, 1049,  329,  169,  125,   73,   31,
         14,    5,    1,    3,    4,    3,    0,    0,    4,    0,    0,
          0,    1,    0,    0,    0,    1,    1,    2,    0,    0,    1])

In [None]:
# Per-band summary: the mean and the total number of values counted in the histogram.
for band, mean, histogram in results:
    print(
        f"Band {band}:",
        f"  Mean: {mean}",
        f"  Histogram sum: {np.sum(histogram)}",
        sep="\n",
    )

In [13]:
results

[(1,
  np.float64(1303.4508999958998),
  array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,   632,
         13616,  9257,   631,   197,    34,     5,     7,     2,     2,
             1,     0,     1,     3,     1])),
 (2,
  np.float64(1000.4196564024766),
  array([20096,  3825,   368,    66,     6,     4,     4,     1,     1,
             3,     1,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0])),
 (3,
  np.float64(895.517487391857),
  array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    4,   37,  314, 1567, 5182,
         7983, 4704, 2364, 1051,  433,  236,  141,   97,  105,   76])),
 (4,
  np.float64(599.0790520316536),
  array([    0,     0,     0,     0,     0,   201,  4172, 10478,  4827,
          1995,   939,   50