In [1]:
# Setup
!pip install --quiet --upgrade pip

# Install the dependencies.
!pip install --quiet -r gdal_req.txt

# Restart the runtime by ending the process.
exit()

In [19]:
import io
import logging
import os
import time

import google.auth
import matplotlib.pyplot as plt
import numpy as np
import rasterio
from google.cloud import storage
from osgeo import gdal
from rasterio.io import MemoryFile

from serving_hist.hist_data import (
    download_and_process_tiff,
    list_blobs_with_prefix,
    load_tiff_from_gcs_mem,
    load_tiff_from_gcs_temp,
)
from utils.constants import BUCKET, HIST_DEST_PREFIX, IMG_SOURCE_PREFIX

In [3]:
# SETUP
# Resolve Application Default Credentials. Only the credentials object is kept;
# the second element of the returned tuple is discarded.
# NOTE(review): `credentials` is not passed to any client in the cells visible
# here (storage.Client() is always built without arguments) — confirm it is needed.
credentials, _ = google.auth.default()

# Bucket that holds the source imagery.
bucket_name = BUCKET

# Object-name prefixes taken from utils.constants.
# NOTE(review): presumably the source-image prefix and the histogram output
# prefix respectively — confirm against utils.constants.
directory_prefix = IMG_SOURCE_PREFIX
output_prefix = HIST_DEST_PREFIX


In [4]:
# Data processing example
# Build the object-name prefix used to look up one image's files:
# <directory_prefix> + "Orleans" + "_2017" + "_9-10_100".
image_name = directory_prefix + "Orleans" + "_2017" + "_9-10_100"


In [5]:
# Look up the bucket objects whose names start with the `image_name` prefix.
# NOTE(review): the return type of list_blobs_with_prefix is not visible here —
# confirm against serving_hist.hist_data. `blobs` is not used by any later cell
# visible in this notebook.
blobs = list_blobs_with_prefix(bucket_name, image_name)

In [7]:
def download_blob_into_memory(bucket_name, blob_name):
    """Fetch a Cloud Storage object and return its full content.

    Args:
        bucket_name: ID of the GCS bucket (e.g. "your-bucket-name").
        blob_name: ID of the GCS object (e.g. "storage-object-name").

    Returns:
        Whatever ``Blob.download_as_bytes()`` yields for the object, i.e. the
        object's content held entirely in memory.
    """
    client = storage.Client()

    # `Bucket.blob` only builds a client-side handle; unlike `Bucket.get_blob`
    # it does not fetch anything from Cloud Storage, which is all that is
    # needed before downloading the content.
    blob_handle = client.bucket(bucket_name).blob(blob_name)

    return blob_handle.download_as_bytes()


In [20]:
# Time the download of one GeoTIFF object from the bucket into memory.
start_time = time.time()

blob_name = r'images/Orleans_2017_9-10_100_0000000000-0000008192.tif'
# `test` holds the still-encoded file content returned by
# download_blob_into_memory; it is decoded in a later cell.
test = download_blob_into_memory(bucket_name, blob_name)

end_time = time.time()
execution_time = end_time - start_time
print(f"Load all bands of one image into memory: {execution_time:.4f} seconds")

Load all bands of one image into memory: 0.3352 seconds


In [8]:
# Display the bucket name currently in use.
bucket_name

'vgnn'

In [12]:
# Length of the downloaded object held in `test`.
len(test)

11570361

In [21]:
# Time decoding the first band of the downloaded GeoTIFF into a NumPy array.
start_time = time.time()

# MemoryFile wraps the in-memory content in `test` so rasterio can open it
# as a dataset without writing it to disk.
with MemoryFile(test) as memfile:
    with memfile.open() as src:
        # Only band 1 is read; `array` is reused by later cells.
        array = src.read(1)

end_time = time.time()
execution_time = end_time - start_time
# NOTE(review): "Convery" in the message below looks like a typo for "Convert".
print(f"Convery one band into array: {execution_time:.4f} seconds")

Convery one band into array: 11.3441 seconds


In [26]:
# Using gdal
def read_tif_from_gcs(bucket_name, blob_name):
    """Read a TIFF stored in Google Cloud Storage into a uint16 array.

    The object is opened directly through GDAL's ``/vsigs/`` virtual file
    system path ``/vsigs/<bucket_name>/<blob_name>``. Progress and failures
    are reported through the ``logging`` module (root logger), so INFO
    messages are only visible if logging is configured at that level.

    Args:
        bucket_name: Name of the GCS bucket.
        blob_name: Object name (path) inside the bucket.

    Returns:
        A 3-D ``numpy.ndarray`` of dtype ``uint16`` with the first axis of
        GDAL's result moved last (a 2-D single-band result first gains a
        leading axis of length 1), or ``None`` if GDAL raised a
        ``RuntimeError`` while opening the path.
    """
    # Make GDAL raise Python exceptions instead of returning None on failure.
    gdal.UseExceptions()

    # Construct the GCS path
    gcs_path = f'/vsigs/{bucket_name}/{blob_name}'

    logging.info("Attempting to open: %s", gcs_path)

    try:
        gdal_dataset = gdal.Open(gcs_path)
    except RuntimeError as e:
        logging.error("Error opening %s with gdal: %s", gcs_path, e)

        # Distinguish "object missing" from "object present but unreadable by
        # GDAL" to make the failure easier to diagnose.
        blob = storage.Client().bucket(bucket_name).blob(blob_name)

        if blob.exists():
            logging.info(
                "The file %s exists in the bucket, but GDAL couldn't open it.",
                blob_name,
            )
        else:
            logging.info(
                "The file %s does not exist in the bucket %s.",
                blob_name,
                bucket_name,
            )

        return None

    # Read image data.
    # NOTE(review): the cast to uint16 assumes the raster holds values that fit
    # that type; NaN/float or negative values would not survive it — confirm.
    gdal_result = gdal_dataset.ReadAsArray().astype(np.uint16)
    if gdal_result.ndim == 2:
        # Single-band result: add a leading axis so the transpose below applies.
        gdal_result = gdal_result[np.newaxis, ...]
    image_data = np.transpose(gdal_result, [1, 2, 0])

    logging.info("Successfully read image with shape: %s", image_data.shape)

    return image_data

In [None]:
# Time reading the same GeoTIFF through GDAL (read_tif_from_gcs) for comparison
# with the download + rasterio path timed in the earlier cells.
start_time = time.time()

# `gdal_array` is None if GDAL could not open the object.
gdal_array = read_tif_from_gcs(BUCKET, r'images/Orleans_2017_9-10_100_0000000000-0000008192.tif')

end_time = time.time()
execution_time = end_time - start_time
print(f"Gdal array: {execution_time:.4f} seconds")

In [28]:
# Display the array returned by read_tif_from_gcs (None if the read failed).
gdal_array

In [15]:
# Replace NaNs in the band with 0 and cast the result to uint16.
# NOTE(review): nan_to_num also rewrites +/-inf to large finite values by
# default, which the uint16 cast cannot represent — confirm the band has none.
array_clean = np.nan_to_num(array, nan=0.0).astype(np.uint16)

In [17]:
def create_histogram_skip_nan(image, bins=256, range=None):
    """Histogram of all non-NaN values in ``image``.

    Args:
        image: Array-like of any shape; it is flattened before counting.
        bins: Bin specification forwarded to ``numpy.histogram``.
        range: Optional ``(lower, upper)`` bounds forwarded to
            ``numpy.histogram``. With the default ``None`` the bins span the
            minimum to the maximum of the non-NaN values, which is what this
            function always did before (the argument used to be accepted but
            silently ignored). Values outside an explicit range are not
            counted.

    Returns:
        ``(hist, bin_edges)`` exactly as returned by ``numpy.histogram`` with
        ``density=False`` (raw counts).
    """
    # ravel() avoids the extra copy flatten() makes when the data is contiguous.
    values = np.asarray(image).ravel()

    # Drop NaNs so they neither break the range computation nor get counted.
    non_nan_values = values[~np.isnan(values)]

    hist, bin_edges = np.histogram(
        non_nan_values, bins=bins, range=range, density=False
    )

    return hist, bin_edges

# Number of points in a 33-point linspace from 1 to 2200, i.e. 33.
# NOTE(review): `bins` is not passed to create_histogram_skip_nan in the cells
# visible here, so that call uses the function's own default — confirm intent.
bins = len(np.linspace(1, 2200, 33))

In [22]:
# Time building a histogram for the single cleaned band.
start_time = time.time()

# The returned (hist, bin_edges) tuple is discarded; only the elapsed time is
# of interest here. `array_clean` is uint16, so it contains no NaNs to skip.
create_histogram_skip_nan(array_clean)

end_time = time.time()
execution_time = end_time - start_time
print(f"Create single band histogram: {execution_time:.4f} seconds")

Create single band histogram: 1.1596 seconds


In [None]:
import rasterio
import numpy as np
from google.cloud import storage
from concurrent.futures import ThreadPoolExecutor, as_completed

def process_band(src, band, bins=256, hist_range=(0, 255)):
    """Compute the mean and a histogram of one band, block by block.

    The band is read one block window at a time (``src.block_windows``) so the
    whole band never has to be held in memory. NaN values are excluded from
    both the mean and the histogram.

    Args:
        src: An open dataset exposing ``block_windows(band)`` and
            ``read(band, window=...)`` (e.g. a rasterio dataset reader).
        band: Band index passed through to ``src``.
        bins: Bin specification forwarded to ``numpy.histogram``.
        hist_range: ``(lower, upper)`` bounds forwarded to ``numpy.histogram``.
            Values outside this range are left out of the histogram but still
            contribute to the mean.

    Returns:
        ``(band, mean, combined_histogram)`` where ``mean`` is ``nan`` if the
        band has no non-NaN values, and ``combined_histogram`` is an integer
        count array (all zeros when there is no valid data).
    """
    total_sum = 0.0
    total_count = 0

    # Histogram of no data: integer zeros with the right number of bins, so the
    # result has the same dtype and shape whether or not any data was seen.
    combined_histogram, _ = np.histogram([], bins=bins, range=hist_range)

    for _, window in src.block_windows(band):
        data = src.read(band, window=window)
        valid_data = data[~np.isnan(data)]

        if valid_data.size == 0:
            continue

        # Accumulate in float64 so float32 rasters do not lose precision as
        # block sums are added together.
        total_sum += np.sum(valid_data, dtype=np.float64)
        total_count += valid_data.size

        hist, _ = np.histogram(valid_data, bins=bins, range=hist_range)
        combined_histogram = combined_histogram + hist

    mean = total_sum / total_count if total_count > 0 else np.nan
    return band, mean, combined_histogram

def process_tiff_parallel(bucket_name, blob_name, max_workers=4):
    """Compute per-band statistics of a GCS-hosted TIFF using a thread pool.

    The object is downloaded once into memory. Every band is then processed by
    ``process_band`` in its own task, and each task opens its own dataset
    handle on the in-memory file: a single dataset handle (and the single
    underlying file object it reads from) must not be used by several threads
    at the same time.

    Args:
        bucket_name: Name of the GCS bucket.
        blob_name: Object name (path) inside the bucket.
        max_workers: Maximum number of worker threads.

    Returns:
        The ``process_band`` results sorted by band number. A band whose task
        raised is reported on stdout and left out of the list.
    """
    storage_client = storage.Client()
    blob = storage_client.bucket(bucket_name).blob(blob_name)

    # One download shared by all tasks; the bytes are only read afterwards.
    tiff_bytes = blob.download_as_bytes()

    results = []
    with rasterio.io.MemoryFile(tiff_bytes) as memfile:
        with memfile.open() as src:
            num_bands = src.count

        def _process_band_isolated(band):
            # Private dataset handle per task so threads never share one.
            with memfile.open() as band_src:
                return process_band(band_src, band)

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_band = {
                executor.submit(_process_band_isolated, band): band
                for band in range(1, num_bands + 1)
            }

            for future in as_completed(future_to_band):
                band = future_to_band[future]
                try:
                    results.append(future.result())
                except Exception as exc:
                    # Best effort: report the failing band and keep the rest.
                    print(f'Band {band} generated an exception: {exc}')

    return sorted(results, key=lambda x: x[0])  # Sort results by band number

# Usage
bucket_name = "your-bucket-name"
blob_name = "path/to/your/large_image.tif"

results = process_tiff_parallel(bucket_name, blob_name)
for band, mean, histogram in results:
    print(f"Band {band}:")
    print(f"  Mean: {mean}")
    print(f"  Histogram sum: {np.sum(histogram)}")