In [5]:
import os
import numpy as np
from PIL import Image
import glob

# Define directories
landsat_dir = './landsat_model'
soilgrid_dir = './soilgrid_model'

# Function to check if image is mostly blank (contains mostly zero or very low values)
def is_blank_image(image_path, threshold=0.9999999):
    try:
        with Image.open(image_path) as img:
            img_array = np.array(img)
            # Calculate the percentage of pixels that are zero or very close to zero
            blank_pixels = np.sum(img_array < 55) / img_array.size
            return blank_pixels > threshold
    except:
        # If image can't be opened, consider it corrupted
        return True

# Get unique scene IDs from landsat directory
scene_ids = set()
for file in os.listdir(landsat_dir):
    if file.endswith('.tif'):
        # Extract the scene ID (everything before _b)
        scene_id = file.split('_b')[0]
        scene_ids.add(scene_id)

num_deleted = 0

# Process each scene
for scene_id in scene_ids:
    # Check all band variations for this scene
    landsat_files = glob.glob(os.path.join(landsat_dir, f'{scene_id}_b*.tif'))
    
    # If any of the band variations is blank/corrupted, delete all related files
    should_delete = False
    for landsat_file in landsat_files:
        if is_blank_image(landsat_file):
            should_delete = True
            break
    
    if should_delete:
        # Delete all landsat band variations
        for landsat_file in landsat_files:
            if os.path.exists(landsat_file):
                os.remove(landsat_file)
                print(f'Deleted: {landsat_file}')
        
        # Delete corresponding soilgrid files
        soilgrid_s1 = os.path.join(soilgrid_dir, f'{scene_id}_s1.tif')
        soilgrid_s2 = os.path.join(soilgrid_dir, f'{scene_id}_s2.tif')
        
        
        if os.path.exists(soilgrid_s1):
            os.remove(soilgrid_s1)
            num_deleted += 1
            print(f'Deleted: {soilgrid_s1}')
        if os.path.exists(soilgrid_s2):
            os.remove(soilgrid_s2)
            num_deleted += 1
            print(f'Deleted: {soilgrid_s2}')

        
         

print("Finished cleaning up blank/corrupted images. Deleted", num_deleted/2, "files.")

Deleted: ./landsat_model\scene1755_loc062c5637_b1.tif
Deleted: ./landsat_model\scene1755_loc062c5637_b2.tif
Deleted: ./landsat_model\scene1755_loc062c5637_b3.tif
Deleted: ./landsat_model\scene1755_loc062c5637_b4.tif
Deleted: ./landsat_model\scene1755_loc062c5637_b5.tif
Deleted: ./landsat_model\scene1755_loc062c5637_b7.tif
Deleted: ./soilgrid_model\scene1755_loc062c5637_s1.tif
Deleted: ./soilgrid_model\scene1755_loc062c5637_s2.tif
Deleted: ./landsat_model\scene0019_loc203f680d_b1.tif
Deleted: ./landsat_model\scene0019_loc203f680d_b2.tif
Deleted: ./landsat_model\scene0019_loc203f680d_b3.tif
Deleted: ./landsat_model\scene0019_loc203f680d_b4.tif
Deleted: ./landsat_model\scene0019_loc203f680d_b5.tif
Deleted: ./landsat_model\scene0019_loc203f680d_b7.tif
Deleted: ./soilgrid_model\scene0019_loc203f680d_s1.tif
Deleted: ./soilgrid_model\scene0019_loc203f680d_s2.tif
Deleted: ./landsat_model\scene0150_locf4755c8a_b1.tif
Deleted: ./landsat_model\scene0150_locf4755c8a_b2.tif
Deleted: ./landsat_model

In [6]:
from PIL import Image
import random
import numpy as np

def add_noise(img_array, noise_factor=0.1):
    noise = np.random.normal(0, noise_factor, img_array.shape)
    noisy_img = img_array + noise
    return np.clip(noisy_img, 0, 255).astype(np.uint8)

def apply_random_transforms(img):
    transforms = []
    
    # Randomly decide which transforms to apply
    if random.random() > 0.5:
        img = img.rotate(90)
        transforms.append('rotate90')
    if random.random() > 0.5:
        img = img.rotate(180)
        transforms.append('rotate180')
    if random.random() > 0.5:
        img = img.transpose(Image.FLIP_LEFT_RIGHT)
        transforms.append('flip')
    if random.random() > 0.5:
        img_array = np.array(img)
        img = Image.fromarray(add_noise(img_array))
        transforms.append('noise')
    
    return img, '_'.join(transforms) if transforms else 'original'

def augment_scene(scene_id, landsat_files, soilgrid_s1, soilgrid_s2, new_scene_id):
    # Apply same random transformations to all files in the scene
    transform_sequence = None
    
    # Process Landsat files
    for landsat_file in landsat_files:
        band = landsat_file.split('_b')[-1].split('.')[0]
        img = Image.open(landsat_file)
        
        # Apply transforms for first file or use same sequence
        if transform_sequence is None:
            img, transform_sequence = apply_random_transforms(img)
        else:
            img, _ = apply_random_transforms(img)
            
        img.save(f"{landsat_dir}/scene{new_scene_id}_loc{scene_id[-8:]}_b{band}.tif")

    # Process Soilgrid files with same transforms
    for suffix in ['s1', 's2']:
        soilgrid_file = soilgrid_s1 if suffix == 's1' else soilgrid_s2
        if os.path.exists(soilgrid_file):
            img = Image.open(soilgrid_file)
            img, _ = apply_random_transforms(img)
            img.save(f"{soilgrid_dir}/scene{new_scene_id}_loc{scene_id[-8:]}_{suffix}.tif")

# Randomly select 50% of complete scenes
complete_scenes = [scene_id for scene_id in scene_ids 
                  if len(glob.glob(os.path.join(landsat_dir, f'{scene_id}_b*.tif'))) == 6]
selected_scenes = random.sample(complete_scenes, k=len(complete_scenes)//2)

new_scene_counter = 2001
for scene_id in selected_scenes:
    landsat_files = glob.glob(os.path.join(landsat_dir, f'{scene_id}_b*.tif'))
    soilgrid_s1 = os.path.join(soilgrid_dir, f'{scene_id}_s1.tif')
    soilgrid_s2 = os.path.join(soilgrid_dir, f'{scene_id}_s2.tif')
    
    augment_scene(scene_id, landsat_files, soilgrid_s1, soilgrid_s2, new_scene_counter)
    new_scene_counter += 1

print(f"Created {(new_scene_counter-2001)} new augmented scenes")

Created 1611 new augmented scenes
