# Satellite Image Preprocessing

This notebook handles all preprocessing steps for satellite imagery data, including:
- Data loading and validation
- Image normalization
- Data augmentation
- Feature extraction
- Data splitting

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import xarray as xr
import satpy
from datetime import datetime
import cv2
from sklearn.model_selection import train_test_split

# Seed NumPy's global random generator so the random draws below repeat across runs
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

## 1. Data Loading and Exploration

In [None]:
# Define data paths
# Directory that is searched for input images; the relative path is resolved
# against the current working directory at run time.
DATA_DIR = Path('../data')

def load_satellite_data(data_dir):
    """
    Load satellite imagery data from the specified directory

    Every ``*.tif`` file directly inside ``data_dir`` is read with
    ``cv2.imread``. The label of an image is the part of its file stem
    before the first underscore. Files that cannot be read are reported
    and skipped, so the returned images and labels always stay aligned.

    Args:
        data_dir (Path or str): Directory containing satellite image files

    Returns:
        dict: ``'images'`` and ``'labels'`` entries, each a NumPy array with
            one element per successfully loaded file
    """
    images = []
    labels = []

    # Sort the paths so the sample order (and therefore any later seeded
    # split) does not depend on the filesystem's listing order.
    for img_path in sorted(Path(data_dir).glob('*.tif')):  # Adjust file extension as needed
        try:
            img = cv2.imread(str(img_path))
        except Exception as e:
            print(f"Error loading {img_path}: {e}")
            continue

        # cv2.imread reports an unreadable file by returning None rather than
        # raising, so it has to be checked explicitly.
        if img is None:
            print(f"Error loading {img_path}: file could not be read as an image")
            continue

        images.append(img)

        # Extract label (example: filename-based labeling)
        labels.append(img_path.stem.split('_')[0])  # Adjust based on your naming convention

    # NOTE: stacking into one regular array requires all images to share a shape.
    return {
        'images': np.array(images),
        'labels': np.array(labels)
    }

# Load satellite data
satellite_data = load_satellite_data(DATA_DIR)

# Basic data exploration
print("Total images:", len(satellite_data['images']))
# Indexing the first image raises IndexError when nothing was loaded,
# so only report a shape when at least one image exists.
if len(satellite_data['images']) > 0:
    print("Image shape:", satellite_data['images'][0].shape)
else:
    print(f"No images were loaded from {DATA_DIR}")
print("Unique labels:", np.unique(satellite_data['labels']))

## 2. Data Preprocessing

In [None]:
def preprocess_images(images, target_size=(224, 224)):
    """
    Resize every image and rescale its pixel values

    Args:
        images (np.ndarray): Input images
        target_size (tuple): Size passed to ``cv2.resize`` for each image

    Returns:
        np.ndarray: The resized images, cast to float32 and divided by 255.0
    """
    def _prepare(image):
        # Resize first, then cast to float32 and divide by 255.0
        # (presumably to map 8-bit pixel values into [0, 1])
        shrunk = cv2.resize(image, target_size, interpolation=cv2.INTER_AREA)
        return shrunk.astype('float32') / 255.0

    return np.array([_prepare(image) for image in images])

def augment_images(images):
    """
    Randomly flip and rotate every image

    Each image is mirrored horizontally when a uniform draw exceeds 0.5 and
    is then rotated about its center by an angle drawn uniformly from
    [-15, 15) degrees. The output keeps the input's width and height.

    Args:
        images (np.ndarray): Input images

    Returns:
        np.ndarray: Augmented images
    """
    def _augment_one(image):
        # Draw order matters for reproducibility: flip decision first, then angle
        mirrored = cv2.flip(image, 1) if np.random.rand() > 0.5 else image
        degrees = np.random.uniform(-15, 15)

        height, width = mirrored.shape[:2]
        transform = cv2.getRotationMatrix2D((width / 2, height / 2), degrees, 1)
        return cv2.warpAffine(mirrored, transform, (width, height))

    return np.array([_augment_one(image) for image in images])

# Resize and rescale every loaded image (default target size)
processed_images = preprocess_images(images=satellite_data['images'])

# Optional: randomly flipped and rotated versions of the preprocessed images
augmented_images = augment_images(images=processed_images)

## 3. Data Splitting

In [None]:
# Hold out 30% of the samples; the remaining 70% form the training set
X_train, X_temp, y_train, y_temp = train_test_split(
    processed_images, satellite_data['labels'], test_size=0.3, random_state=42
)

# Halve the held-out portion into validation and test sets
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Report the shape of each image split
for split_title, split_images in (
    ("Training set:", X_train),
    ("Validation set:", X_val),
    ("Test set:", X_test),
):
    print(split_title, split_images.shape)

## 4. Save Preprocessed Data

In [None]:
def save_preprocessed_data(X_train, X_val, X_test, y_train, y_val, y_test, output_dir):
    """
    Write each split to its own ``.npy`` file inside ``output_dir``

    The directory (including missing parents) is created when it does not
    exist yet. A confirmation message is printed once all files are written.

    Args:
        X_train, X_val, X_test: Image datasets
        y_train, y_val, y_test: Label datasets
        output_dir (Path): Directory to save preprocessed data
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # (file name, array) pairs in write order: training, validation, test
    outputs = (
        ('X_train.npy', X_train),
        ('y_train.npy', y_train),
        ('X_val.npy', X_val),
        ('y_val.npy', y_val),
        ('X_test.npy', X_test),
        ('y_test.npy', y_test),
    )
    for file_name, array in outputs:
        np.save(target_dir / file_name, array)

    print("Preprocessed data saved successfully.")

# Write all six split arrays to the processed-data directory
save_preprocessed_data(
    X_train=X_train,
    X_val=X_val,
    X_test=X_test,
    y_train=y_train,
    y_val=y_val,
    y_test=y_test,
    output_dir='../processed_data',
)