# Satellite Image Preprocessing

This notebook handles the preprocessing steps for satellite imagery data, including:
- Data loading and validation
- Image normalization
- Data augmentation
- Feature extraction
- Data splitting

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so that later
# cells can read and write files below that path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [32]:
# Remove a stale checkout (the directory and all its contents), if one exists.
# shutil.rmtree replaces the shell call to `rm -rf`; ignore_errors=True keeps
# the "a missing directory is not an error" behaviour of `rm -f`.
# NOTE(review): the clone cell below checks the repository out under
# /content/drive/MyDrive/gabut-project, not /content - confirm this is the
# path that is meant to be cleaned.
import os
import shutil

shutil.rmtree('/content/ml-satellite', ignore_errors=True)

0

In [5]:
import os

# Change to project directory
os.chdir('/content/drive/MyDrive/gabut-project')

# Current working directory
print(os.getcwd())

# Clone the repository
!git clone https://github.com/xChoco-rmdn/ml-satellite.git

# Change to project directory
%cd ml-satellite

# Install requirements
!pip install -r requirements.txt

/content/drive/MyDrive/gabut-project
Cloning into 'ml-satellite'...
remote: Enumerating objects: 176, done.[K
remote: Counting objects: 100% (176/176), done.[K
remote: Compressing objects: 100% (132/132), done.[K
remote: Total 176 (delta 76), reused 116 (delta 39), pack-reused 0 (from 0)[K
Receiving objects: 100% (176/176), 23.68 MiB | 12.42 MiB/s, done.
Resolving deltas: 100% (76/76), done.
/content/drive/MyDrive/gabut-project/ml-satellite


In [6]:
# Sanity check: show the directory the kernel is currently working in.
print(os.getcwd())

/content/drive/MyDrive/gabut-project/ml-satellite


In [7]:
import os
import sys
import numpy as np
from pathlib import Path
from datetime import datetime

In [8]:
# Add the project root directory to the Python path so the `src` package can be
# imported. The earlier cells leave the working directory at the root of the
# cloned repository, so the current working directory is used as the root.
# The previous expression, Path("/content/ml-satellite").parent.parent,
# evaluated to "/" and did not point at the checkout.
project_root = str(Path.cwd())
if project_root not in sys.path:  # avoid stacking duplicates when re-run
    sys.path.append(project_root)

In [10]:
from src.components.data_ingestion import DataIngestion
from src.components.data_transformations import DataTransformation
from src.logger import logger
from src.exception import CustomException

## 1. Data Loading and Exploration

In [11]:
def setup_directories():
    """Ensure every directory this notebook reads from or writes to exists.

    Each path is created together with any missing parent directories; paths
    that are already present are left untouched. One line is printed per path,
    whether or not it had to be created.
    """
    required_paths = (
        '/content/drive/MyDrive/Himawari_NTB_202504',
        'data/processed',
        'data/train',
        'data/test',
        'artifacts',
        'logs',
    )
    for directory in required_paths:
        # exist_ok=True makes the call a no-op for directories that exist.
        os.makedirs(directory, exist_ok=True)
        print(f"Created directory: {directory}")

In [12]:
def reshape_data_for_sequences_xy_nonoverlap(data, sequence_length=6):
    """Create X, y pairs for sequence prediction using non-overlapping windows.

    Window ``i`` of ``X`` holds frames
    ``[i * sequence_length, (i + 1) * sequence_length)``; window ``i`` of ``y``
    holds the same range shifted forward by one frame.

    Args:
        data: Array-like of shape ``(n_frames, height, width)``.
        sequence_length: Number of frames per window; must be at least 1.

    Returns:
        Tuple ``(X, y)`` of float64 arrays, each of shape
        ``(n_samples, sequence_length, height, width, 1)`` where
        ``n_samples = (n_frames - sequence_length) // sequence_length``.
        When ``data`` has too few frames for a single window, both arrays are
        returned with ``n_samples == 0`` instead of failing.

    Raises:
        ValueError: If ``data`` is not 3-D or ``sequence_length`` is below 1.
    """
    data = np.asarray(data)
    if data.ndim != 3:
        raise ValueError(
            f"data must have shape (n_frames, height, width), got {data.shape}"
        )
    if sequence_length < 1:
        raise ValueError(
            f"sequence_length must be a positive integer, got {sequence_length}"
        )

    # Clamp at zero: floor division goes negative when there are fewer frames
    # than one window, and a negative dimension cannot be allocated.
    n_samples = max((data.shape[0] - sequence_length) // sequence_length, 0)
    n_frames_used = n_samples * sequence_length
    window_shape = (n_samples, sequence_length, data.shape[1], data.shape[2], 1)

    # Consecutive windows are contiguous along axis 0, so one reshape replaces
    # the per-window copy loop. n_frames_used + 1 never exceeds the number of
    # frames because one full window is always left unused at the end.
    X = data[:n_frames_used].reshape(window_shape).astype(np.float64)
    y = data[1:n_frames_used + 1].reshape(window_shape).astype(np.float64)
    return X, y

def center_crop(data, target_height=256, target_width=256):
    """Crop the center of each frame in the data to the target size.

    Args:
        data: Iterable of 2-D frames, e.g. an array of shape
            ``(n_frames, height, width)``.
        target_height: Number of rows to keep from each frame.
        target_width: Number of columns to keep from each frame.

    Returns:
        Array of shape ``(n_frames, target_height, target_width)``. An input
        without frames yields an empty array of shape
        ``(0, target_height, target_width)``.

    Raises:
        ValueError: If a frame is smaller than the requested crop. Without
            this check the negative start offset would be read as an index
            from the end of the axis and silently produce a wrong crop.
    """
    cropped = []
    for index, frame in enumerate(data):
        h, w = frame.shape
        if h < target_height or w < target_width:
            raise ValueError(
                f"frame {index} has shape {(h, w)}, which is smaller than the "
                f"requested crop {(target_height, target_width)}"
            )
        start_h = (h - target_height) // 2
        start_w = (w - target_width) // 2
        cropped.append(frame[start_h:start_h+target_height, start_w:start_w+target_width])
    if not cropped:
        # np.stack refuses an empty list; return a correctly shaped empty array.
        return np.empty((0, target_height, target_width), dtype=getattr(data, 'dtype', float))
    return np.stack(cropped)

## 2. Data Preprocessing

In [13]:
setup_directories()

# Initialize components
data_ingestion = DataIngestion()
data_transformation = DataTransformation()

# Get list of satellite files. os.listdir returns names in arbitrary order, so
# the list is sorted to make the file order deterministic across runs.
raw_data_path = os.path.join('/content/drive/MyDrive', 'Himawari_NTB_202504')
satellite_files = sorted(
    os.path.join(raw_data_path, f)
    for f in os.listdir(raw_data_path)
    if f.endswith('.nc')
)

# Stop here when there is nothing to ingest. The earlier version only printed a
# message (together with the `sys` module object) and let the following cells
# run on an empty list; the message also named a directory that is not the one
# being searched.
if not satellite_files:
    raise FileNotFoundError(f"No satellite (.nc) files found in {raw_data_path}")

In [14]:
# Sanity check: the relative paths used below ('data/processed', ...) resolve
# against this directory.
print(os.getcwd())

/content/drive/MyDrive/gabut-project/ml-satellite


In [15]:
# Process and ingest data: hand the list of satellite files to the ingestion
# component. Its result is used below as a mapping that provides the keys
# 'train_file_path' and 'test_file_path'.
print("Starting data ingestion...")
ingestion_result = data_ingestion.initiate_data_ingestion(satellite_files)

# Load the processed data from the files reported by the ingestion step
train_data = np.load(ingestion_result['train_file_path'])
test_data = np.load(ingestion_result['test_file_path'])
print(f"Loaded training data shape: {train_data.shape}")
print(f"Loaded test data shape: {test_data.shape}")

# Clean and normalize data (both splits go through the same two calls)
print("Cleaning and normalizing data...")
train_data = data_transformation.clean_data(train_data)
test_data = data_transformation.clean_data(test_data)

# NOTE(review): normalize_data is called separately on the train and the test
# array. If it derives its statistics from its argument, the two splits end up
# on different scales - confirm against DataTransformation.
train_data = data_transformation.normalize_data(train_data)
test_data = data_transformation.normalize_data(test_data)

# Crop data: keep the central 256x256 region of every frame
print("Cropping data...")
train_data = center_crop(train_data, 256, 256)
test_data = center_crop(test_data, 256, 256)

# Create sequences; the window length comes from the transformation config
print("Creating sequences...")
sequence_length = data_transformation.config.sequence_length

# Each call returns (X, y) where y is the X window shifted by one frame
X_train, y_train = reshape_data_for_sequences_xy_nonoverlap(train_data, sequence_length)
X_test, y_test = reshape_data_for_sequences_xy_nonoverlap(test_data, sequence_length)

# Save transformed data under data/processed (relative to the working directory)
print("Saving transformed data...")
np.save('data/processed/X_train.npy', X_train)
np.save('data/processed/y_train.npy', y_train)
np.save('data/processed/X_test.npy', X_test)
np.save('data/processed/y_test.npy', y_test)

# Summary of the array shapes that were written
print("Preprocessing completed successfully!")
print(f"Transformed data shapes:")
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")


KeyboardInterrupt: 

## 3. Data Splitting

In [None]:
# Perform the train-validation split. The test set already comes from the
# ingestion step, so only the training sequences are divided here.
#
# The previous version of this cell called train_test_split on
# `processed_images` and `satellite_data`; none of these three names is
# imported or defined anywhere in this notebook, so the cell failed with a
# NameError.
#
# The arrays are reloaded from data/processed so that re-running the cell
# always starts from the full training set. The split is a contiguous
# head/tail cut without shuffling, because consecutive windows share a frame
# (the last frame of y[i] is the first frame of X[i + 1]).
# NOTE(review): assumes the ingestion step keeps the frames in time order, so
# that the tail of the training set is its most recent part - confirm.
VAL_FRACTION = 0.15  # share of the training sequences held out for validation

X_train_full = np.load('data/processed/X_train.npy')
y_train_full = np.load('data/processed/y_train.npy')
X_test = np.load('data/processed/X_test.npy')
y_test = np.load('data/processed/y_test.npy')

n_val = int(len(X_train_full) * VAL_FRACTION)
split_idx = len(X_train_full) - n_val

X_train, X_val = X_train_full[:split_idx], X_train_full[split_idx:]
y_train, y_val = y_train_full[:split_idx], y_train_full[split_idx:]

# Every sequence must land in exactly one of the two parts.
assert len(X_train) + len(X_val) == len(X_train_full)
assert len(y_train) == len(X_train) and len(y_val) == len(X_val)

# Print dataset sizes
print("Training set:", X_train.shape)
print("Validation set:", X_val.shape)
print("Test set:", X_test.shape)

## 4. Save Preprocessed Data

In [None]:
def save_preprocessed_data(X_train, X_val, X_test, y_train, y_val, y_test, output_dir):
    """
    Write the six dataset splits to ``.npy`` files inside ``output_dir``.

    The directory is created first, including any missing parents. Each file
    is named after the split it holds (``X_train.npy``, ``y_train.npy``, ...).

    Args:
        X_train, X_val, X_test: Image datasets
        y_train, y_val, y_test: Label datasets
        output_dir (str or Path): Directory to save preprocessed data
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # Keyed by file stem; insertion order is train -> validation -> test.
    arrays_by_stem = {
        'X_train': X_train,
        'y_train': y_train,
        'X_val': X_val,
        'y_val': y_val,
        'X_test': X_test,
        'y_test': y_test,
    }
    for stem, array in arrays_by_stem.items():
        np.save(target_dir / f'{stem}.npy', array)

    print("Preprocessed data saved successfully.")

# Save preprocessed data
# The relative output_dir is resolved against the current working directory.
# NOTE(review): the earlier cells write their arrays to 'data/processed';
# confirm that '../processed_data' is the intended location for the final
# splits.
save_preprocessed_data(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    output_dir='../processed_data'
)