In [7]:
import os
import shutil
import pandas as pd
from pathlib import Path
import logging
import glob
import time
from functools import wraps

# Configure the root logger once for this run: INFO level and above,
# timestamped records, sent to both a log file and a stream handler.
_log_format = '%(asctime)s - %(levelname)s - %(message)s'
_log_handlers = [
    logging.FileHandler('file_organization.log'),
    logging.StreamHandler(),
]
logging.basicConfig(level=logging.INFO, format=_log_format, handlers=_log_handlers)

# Retry decorator to handle transient file access errors
def retry_on_file_lock(max_retries=3, delay=1):
    """Build a decorator that retries a call while Windows reports a locked file.

    Only ``OSError`` instances (which include ``PermissionError``) whose
    ``winerror`` is 32 (ERROR_SHARING_VIOLATION) or 1224
    (ERROR_USER_MAPPED_FILE) are retried. Any other ``OSError`` is re-raised
    unchanged on the first occurrence.

    Args:
        max_retries: Maximum number of attempts before giving up.
        delay: Seconds to sleep after each retryable failure.

    Returns:
        A decorator. The decorated function returns whatever the wrapped
        function returns on its first successful attempt.

    Raises:
        RuntimeError: From the decorated function, when every attempt failed
            with a retryable error. The last such error is attached as
            ``__cause__``.
    """
    retryable_winerrors = (32, 1224)

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            last_error = None
            for _ in range(max_retries):
                try:
                    return func(*args, **kwargs)
                except OSError as e:
                    # ``winerror`` only exists on Windows; reading it directly
                    # would raise AttributeError elsewhere and hide the real
                    # error, so fall back to None (never retryable).
                    if getattr(e, 'winerror', None) not in retryable_winerrors:
                        raise
                    last_error = e
                    logging.warning(f"File access error: {e}. Retrying in {delay} seconds...")
                    time.sleep(delay)
            raise RuntimeError(
                f"Failed after {max_retries} retries due to file access issues."
            ) from last_error
        return wrapper
    return decorator

@retry_on_file_lock(max_retries=5, delay=2)
def safe_copy(src, dst):
    """Copy ``src`` to ``dst`` with ``shutil.copy2`` and log the success.

    The call is wrapped by ``retry_on_file_lock`` with up to 5 attempts
    and a 2 second pause between them.

    Args:
        src: Path of the file to copy.
        dst: Destination path.
    """
    shutil.copy2(src, dst)
    message = f"Successfully copied file: {src} -> {dst}"
    logging.info(message)

# Copy per-location Landsat bands and Soilgrids layers into flat output
# folders, renaming them to "scene<NNNN>_loc<location_id>_<band>.tif".
# Row-level failures are logged and counted; anything that escapes the row
# loop is logged as fatal and re-raised.
try:
    # Read master locations file
    master_df = pd.read_csv('processed_data/master_locations.csv')

    # Validate required columns exist
    required_columns = ['location_id', 'soilgrids_id']
    missing_columns = [col for col in required_columns if col not in master_df.columns]
    if missing_columns:
        raise ValueError(f"Missing required columns in CSV: {missing_columns}")

    # Create output directories if they don't exist
    landsat_out_dir = Path('landsat_model')
    soilgrid_out_dir = Path('soilgrid_model')

    landsat_out_dir.mkdir(exist_ok=True)
    soilgrid_out_dir.mkdir(exist_ok=True)

    # Mapping for tracking processed files: location_id -> zero-padded scene number
    scene_mapping = {}
    scene_counter = 1
    error_count = 0

    # The Landsat source directory is the same for every row, so build the
    # path once instead of on each iteration.
    landsat_path = Path('processed_data/landsat_data/resampled')

    # Process each row in master locations file
    for idx, row in master_df.iterrows():
        try:
            location_id = row['location_id']
            soilgrids_id = row['soilgrids_id']

            if pd.isna(location_id):
                logging.warning(f"Skipping row {idx}: Missing location_id")
                continue

            # Assign sequential number if new scene
            if location_id not in scene_mapping:
                scene_mapping[location_id] = f"{scene_counter:04d}"
                scene_counter += 1

            scene_num = scene_mapping[location_id]

            # Process Landsat bands using only location_id
            if not landsat_path.exists():
                raise FileNotFoundError(f"Landsat directory not found: {landsat_path}")

            # Escape the id so glob metacharacters in it ('[', ']', '*', '?')
            # are matched literally rather than interpreted as a pattern.
            landsat_pattern = f"*{glob.escape(str(location_id))}*_SR_B*.TIF"
            landsat_files = glob.glob(str(landsat_path / landsat_pattern))

            if not landsat_files:
                logging.warning(f"No Landsat files found for location_id: {location_id}")
                # NOTE(review): this also skips the Soilgrids copy for the
                # location - confirm that is intended.
                continue

            for file in landsat_files:
                # Band number is the text between "_SR_B" and the extension.
                # Computed outside the try so the error message below always
                # refers to the file that actually failed.
                band_number = os.path.basename(file).split('_SR_B')[-1].split('.')[0]
                dst = landsat_out_dir / f"scene{scene_num}_loc{location_id}_b{band_number}.tif"
                try:
                    safe_copy(file, dst)
                except Exception as e:
                    logging.error(f"Error copying Landsat band {band_number} for scene {scene_num}: {e}")
                    error_count += 1

            # Process Soilgrids bands using location_id
            # (soilgrids_id is only checked for presence; it is not used in any path)
            if pd.isna(soilgrids_id):
                logging.warning(f"Missing soilgrids_id for location_id {location_id}")
                continue

            soilgrid_path = Path('processed_data/soilgrids_data/tifs') / f"location_{location_id}"
            if not soilgrid_path.exists():
                logging.warning(f"Soilgrids directory not found: {soilgrid_path}")
                continue

            # OCD -> s1
            ocd_src = soilgrid_path / 'ocd_0-5cm_mean.tif'
            try:
                if ocd_src.exists():
                    ocd_dst = soilgrid_out_dir / f"scene{scene_num}_loc{location_id}_s1.tif"
                    safe_copy(ocd_src, ocd_dst)
                else:
                    # Previously skipped silently; surface it in the log.
                    logging.warning(f"OCD file not found: {ocd_src}")
            except Exception as e:
                logging.error(f"Error copying OCD file for scene {scene_num}: {e}")
                error_count += 1

            # SOC -> s2
            soc_src = soilgrid_path / 'soc_0-5cm_mean.tif'
            try:
                if soc_src.exists():
                    soc_dst = soilgrid_out_dir / f"scene{scene_num}_loc{location_id}_s2.tif"
                    safe_copy(soc_src, soc_dst)
                else:
                    # Previously skipped silently; surface it in the log.
                    logging.warning(f"SOC file not found: {soc_src}")
            except Exception as e:
                logging.error(f"Error copying SOC file for scene {scene_num}: {e}")
                error_count += 1

        except Exception as e:
            # Keep going with the remaining rows; one bad row must not abort the run.
            logging.error(f"Error processing row {idx}: {e}")
            error_count += 1
            continue

    logging.info("Processing complete:")
    # scene_counter starts at 1, so the number of assigned scenes is one less.
    logging.info(f"- Processed {scene_counter-1} unique scenes")
    logging.info(f"- Encountered {error_count} errors")
    if error_count > 0:
        logging.info("Check the log file for detailed error information")

except Exception as e:
    logging.error(f"Fatal error: {e}")
    raise


2024-12-20 18:26:55,868 - INFO - Successfully copied file: processed_data\landsat_data\resampled\resampled_33d6cfcd_LC09_L2SP_041021_20240722_20240723_02_T1_SR_B1.TIF -> landsat_model\scene0003_loc33d6cfcd_b1.tif
2024-12-20 18:26:55,873 - INFO - Successfully copied file: processed_data\landsat_data\resampled\resampled_33d6cfcd_LC09_L2SP_041021_20240722_20240723_02_T1_SR_B2.TIF -> landsat_model\scene0003_loc33d6cfcd_b2.tif
2024-12-20 18:26:55,885 - INFO - Successfully copied file: processed_data\landsat_data\resampled\resampled_33d6cfcd_LC09_L2SP_041021_20240722_20240723_02_T1_SR_B3.TIF -> landsat_model\scene0003_loc33d6cfcd_b3.tif
2024-12-20 18:26:55,889 - INFO - Successfully copied file: processed_data\landsat_data\resampled\resampled_33d6cfcd_LC09_L2SP_041021_20240722_20240723_02_T1_SR_B4.TIF -> landsat_model\scene0003_loc33d6cfcd_b4.tif
2024-12-20 18:26:55,893 - INFO - Successfully copied file: processed_data\landsat_data\resampled\resampled_33d6cfcd_LC09_L2SP_041021_20240722_202407