# Satellite Image Preprocessing

This notebook handles the preprocessing steps for satellite imagery data, including:
- Data loading and validation
- Image normalization
- Data augmentation
- Feature extraction
- Data splitting

In [None]:
# Remove the previous checkout directory and all its contents
import os
import shutil

# shutil.rmtree does the deletion in-process instead of spawning a shell;
# ignore_errors=True keeps the "rm -rf" behaviour of succeeding silently when
# the directory does not exist.
shutil.rmtree('/kaggle/working/ml-satellite', ignore_errors=True)

In [None]:
import os

# Current working directory
print(os.getcwd())

# Clone the repository
!git clone https://github.com/xChoco-rmdn/ml-satellite.git

# Install requirements
!pip install -r requirements.txt

In [76]:
# Show the current working directory
print(os.getcwd())

/kaggle/working/ml-satellite


In [78]:
# List the entries of the current working directory
print(os.listdir())

['.gitignore', 'run_workflow.py', 'notebooks', '.git', 'setup.py', 'cloud_nowcasting_workflow.ipynb', 'README.md', 'src', 'artifacts', 'requirements.txt', 'application.py']


In [77]:
# Switch into the cloned repository; later cells use paths relative to it
# (e.g. 'data/processed/...').
os.chdir('/kaggle/working/ml-satellite')

In [79]:
import os
import sys
import numpy as np
from pathlib import Path
from datetime import datetime

In [80]:
# Add the project root directory to the Python path so the `src` package can be
# imported. The checkout itself is the root; walking up with `.parent.parent`
# would register `/kaggle` instead.
project_root = str(Path("/kaggle/working/ml-satellite"))
if project_root not in sys.path:  # re-running the cell must not pile up duplicates
    sys.path.append(project_root)

In [81]:
from src.components.data_ingestion import DataIngestion
from src.components.data_transformations import DataTransformation
from src.logger import logger
from src.exception import CustomException

In [82]:
# Show the current working directory
print(os.getcwd())

/kaggle/working/ml-satellite


## 1. Data Loading and Exploration

In [83]:
def setup_directories():
    """Ensure the notebook's working folders exist and report each one.

    All paths are relative to the current working directory. Folders that are
    already present are left untouched, so the function is safe to call again.
    """
    data_folders = [f"data/{name}" for name in ('raw', 'processed', 'train', 'test')]
    required = data_folders + ['artifacts', 'logs']
    for path in required:
        # exist_ok=True: an existing folder is not an error
        os.makedirs(path, exist_ok=True)
        print(f"Created directory: {path}")

In [84]:
def reshape_data_for_sequences_xy_nonoverlap(data, sequence_length=6):
    """Create X, y pairs for sequence prediction using non-overlapping windows.

    Input window ``i`` holds frames ``[i*L, (i+1)*L)`` of ``data`` (``L`` being
    ``sequence_length``); its target holds the same window shifted one frame
    ahead, i.e. frames ``[i*L + 1, (i+1)*L + 1)``.

    Args:
        data: Array of shape ``(n_frames, height, width)``.
        sequence_length: Number of frames per window.

    Returns:
        Tuple ``(X, y)`` of float64 arrays, each of shape
        ``(n_samples, sequence_length, height, width, 1)`` with
        ``n_samples = max(0, (n_frames - sequence_length) // sequence_length)``.
        Inputs too short for a single sample yield empty arrays
        (``n_samples == 0``) instead of an error.
    """
    n_frames, height, width = data.shape[0], data.shape[1], data.shape[2]

    # Clamp at zero: for n_frames < sequence_length the floor division is
    # negative, which used to surface as numpy's "negative dimensions" error
    # while slightly longer inputs already returned empty arrays.
    n_samples = max(0, (n_frames - sequence_length) // sequence_length)

    # Frames consumed by the input windows; the targets need one frame more,
    # which the sample count above always leaves available.
    used = n_samples * sequence_length
    out_shape = (n_samples, sequence_length, height, width, 1)

    # Consecutive, non-overlapping windows are a plain reshape of the leading
    # frames. astype() copies, so the results never alias `data`, and yields
    # float64 like the original zero-initialised buffers.
    X = data[:used].astype(np.float64).reshape(out_shape)
    y = data[1:used + 1].astype(np.float64).reshape(out_shape)
    return X, y

def center_crop(data, target_height=256, target_width=256):
    """Crop the center of each frame in the data to the target size.

    Args:
        data: Iterable of 2-D frames (e.g. an array of shape
            ``(n_frames, height, width)``).
        target_height: Height of the cropped frames.
        target_width: Width of the cropped frames.

    Returns:
        Array of shape ``(n_frames, target_height, target_width)``. When the
        margin is odd, the extra row/column is left on the bottom/right side.

    Raises:
        ValueError: If ``data`` contains no frames, or if a frame is smaller
            than the requested crop in either dimension.
    """
    cropped = []
    for index, frame in enumerate(data):
        h, w = frame.shape
        # A negative offset would be read by the slice as "count from the end"
        # and silently return a wrong-sized, off-centre crop, so reject it.
        if h < target_height or w < target_width:
            raise ValueError(
                f"Frame {index} has shape {(h, w)}, which is smaller than the "
                f"requested crop {(target_height, target_width)}"
            )
        start_h = (h - target_height) // 2
        start_w = (w - target_width) // 2
        cropped.append(frame[start_h:start_h + target_height, start_w:start_w + target_width])
    if not cropped:
        raise ValueError("center_crop received no frames to crop")
    return np.stack(cropped)

## 2. Data Preprocessing

In [85]:
# Create the data/, artifacts and logs folders relative to the current working directory
setup_directories()

Created directory: data/raw
Created directory: data/processed
Created directory: data/train
Created directory: data/test
Created directory: artifacts
Created directory: logs


In [86]:
# Initialize components
data_ingestion = DataIngestion()
data_transformation = DataTransformation()

# Get list of satellite files
raw_data_path = os.path.join('/kaggle/input/himawari-ntb-202504/', 'Himawari_NTB_202504')
# os.listdir returns entries in arbitrary order; sort so the file list is the
# same on every run.
satellite_files = sorted(
    os.path.join(raw_data_path, f) for f in os.listdir(raw_data_path) if f.endswith('.nc')
)
if not satellite_files:
    # Stop here: ingestion cannot do anything useful with an empty file list,
    # and the message should name the directory that was actually searched.
    raise FileNotFoundError(f"No satellite (.nc) files found in {raw_data_path}")

In [87]:
# Show the current working directory; the relative output paths below resolve against it
print(os.getcwd())

/kaggle/working/ml-satellite


In [88]:
# Process and ingest data
# End-to-end preprocessing: ingest the raw files, load the resulting train/test
# arrays, clean + normalize + crop them, cut them into (X, y) sequences and
# save the four arrays under data/processed/.
print("Starting data ingestion...")
ingestion_result = data_ingestion.initiate_data_ingestion(satellite_files)

# Load the processed data
# ingestion_result is indexed by 'train_file_path' / 'test_file_path'; both
# values are handed to np.load.
train_data = np.load(ingestion_result['train_file_path'])
test_data = np.load(ingestion_result['test_file_path'])
print(f"Loaded training data shape: {train_data.shape}")
print(f"Loaded test data shape: {test_data.shape}")

# Clean and normalize data
# train_data / test_data are rebound at every stage, so re-running this cell
# always restarts from the freshly loaded arrays above.
print("Cleaning and normalizing data...")
train_data = data_transformation.clean_data(train_data)
test_data = data_transformation.clean_data(test_data)

# NOTE(review): train and test are normalized by separate calls. If
# normalize_data derives its statistics from the array it receives, the two
# splits end up on different scales — confirm against DataTransformation.
train_data = data_transformation.normalize_data(train_data)
test_data = data_transformation.normalize_data(test_data)

# Crop data
# Keep the central 256x256 window of every frame.
print("Cropping data...")
train_data = center_crop(train_data, 256, 256)
test_data = center_crop(test_data, 256, 256)

# Create sequences
# The window length comes from the transformation component's config.
print("Creating sequences...")
sequence_length = data_transformation.config.sequence_length

X_train, y_train = reshape_data_for_sequences_xy_nonoverlap(train_data, sequence_length)
X_test, y_test = reshape_data_for_sequences_xy_nonoverlap(test_data, sequence_length)

# Save transformed data
# Paths are relative to the current working directory.
print("Saving transformed data...")
np.save('data/processed/X_train.npy', X_train)
np.save('data/processed/y_train.npy', y_train)
np.save('data/processed/X_test.npy', X_test)
np.save('data/processed/y_test.npy', y_test)

print("Preprocessing completed successfully!")
print(f"Transformed data shapes:")
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")


Started data ingestion


Starting data ingestion...


Data ingestion completed. Files saved:
Processed files: 4199 files
Train data: data/train/train_data_20250401_0000_to_20250425_0100.npy
Test data: data/test/test_data_20250425_0110_to_20250501_0000.npy


Loaded training data shape: (3359, 271, 351)
Loaded test data shape: (840, 271, 351)
Cleaning and normalizing data...


  frame_uint8 = ((frame - frame.min()) * (255.0 / (frame.max() - frame.min()))).astype(np.uint8)


Cropping data...
Creating sequences...
Saving transformed data...
Preprocessing completed successfully!
Transformed data shapes:
X_train: (558, 6, 256, 256, 1), y_train: (558, 6, 256, 256, 1)
X_test: (139, 6, 256, 256, 1), y_test: (139, 6, 256, 256, 1)
