# Satellite Image Preprocessing

This notebook handles all preprocessing steps for satellite imagery data including:
- Data loading and validation
- Image normalization
- Data augmentation
- Feature extraction
- Data splitting

In [1]:
# Remove directory and all its contents
import os
os.system('rm -rf /kaggle/working/ml-satellite')

0

In [2]:
import os

# Current working directory
print(os.getcwd())

# Clone the repository
!git clone https://github.com/xChoco-rmdn/ml-satellite.git

/kaggle/working
Cloning into 'ml-satellite'...
remote: Enumerating objects: 231, done.[K
remote: Counting objects: 100% (231/231), done.[K
remote: Compressing objects: 100% (174/174), done.[K
remote: Total 231 (delta 114), reused 140 (delta 51), pack-reused 0 (from 0)[K
Receiving objects: 100% (231/231), 23.70 MiB | 31.94 MiB/s, done.
Resolving deltas: 100% (114/114), done.


In [3]:
os.chdir('/kaggle/working/ml-satellite')

# Install requirements
!pip install -r requirements.txt

Collecting satpy>=0.30.0 (from -r requirements.txt (line 6))
  Downloading satpy-0.56.0-py3-none-any.whl.metadata (11 kB)
Collecting donfig (from satpy>=0.30.0->-r requirements.txt (line 6))
  Downloading donfig-0.8.1.post1-py3-none-any.whl.metadata (5.0 kB)
Collecting pykdtree (from satpy>=0.30.0->-r requirements.txt (line 6))
  Downloading pykdtree-1.4.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.8 kB)
Collecting pyorbital (from satpy>=0.30.0->-r requirements.txt (line 6))
  Downloading pyorbital-1.10.1-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting pyresample>=1.24.0 (from satpy>=0.30.0->-r requirements.txt (line 6))
  Downloading pyresample-1.34.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Collecting trollimage>=1.24 (from satpy>=0.30.0->-r requirements.txt (line 6))
  Downloading trollimage-1.26.0-cp311-

In [4]:
print(os.getcwd())

/kaggle/working/ml-satellite


In [5]:
print(os.listdir())

['.git', 'cloud_nowcasting_workflow.ipynb', 'artifacts', 'src', 'notebooks', 'run_workflow.py', 'setup.py', '.gitignore', 'requirements.txt', 'README.md', 'application.py']


In [6]:
os.chdir('/kaggle/working/ml-satellite')

In [7]:
import os
import sys
import numpy as np
from pathlib import Path
from datetime import datetime

In [8]:
# Add the project root directory to Python path
project_root = str(Path("/kaggle/working/ml-satellite").parent.parent)
sys.path.append(project_root)

In [9]:
from src.components.data_ingestion import DataIngestion
from src.components.data_transformations import DataTransformation
from src.logger import logger
from src.exception import CustomException

2025-05-17 01:10:29.436700: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747444229.897615      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747444230.020577      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [10]:
print(os.getcwd())

/kaggle/working/ml-satellite


## 1. Data Loading and Exploration

In [11]:
def setup_directories():
    """Create necessary directories if they don't exist"""
    directories = [
        'data/raw',
        'data/processed',
        'data/train',
        'data/test',
        'artifacts',
        'logs'
    ]
    for directory in directories:
        os.makedirs(directory, exist_ok=True)
        print(f"Created directory: {directory}")

In [12]:
def reshape_data_for_sequences_xy_nonoverlap(data, sequence_length=6):
    """Create X, y pairs for sequence prediction using non-overlapping windows."""
    n_samples = (len(data) - sequence_length) // sequence_length
    X = np.zeros((n_samples, sequence_length, data.shape[1], data.shape[2], 1))
    y = np.zeros((n_samples, sequence_length, data.shape[1], data.shape[2], 1))
    for i in range(n_samples):
        start = i * sequence_length
        end = start + sequence_length
        X[i] = data[start:end, :, :, np.newaxis]
        y[i] = data[start+1:end+1, :, :, np.newaxis]
    return X, y

def center_crop(data, target_height=256, target_width=256):
    """Crop the center of each frame in the data to the target size."""
    cropped = []
    for frame in data:
        h, w = frame.shape
        start_h = (h - target_height) // 2
        start_w = (w - target_width) // 2
        cropped.append(frame[start_h:start_h+target_height, start_w:start_w+target_width])
    return np.stack(cropped)

## 2. Data Preprocessing

In [13]:
setup_directories()

Created directory: data/raw
Created directory: data/processed
Created directory: data/train
Created directory: data/test
Created directory: artifacts
Created directory: logs


In [14]:
# Initialize components
data_ingestion = DataIngestion()
data_transformation = DataTransformation()

# Get list of satellite files
raw_data_path = os.path.join('/kaggle/input/himawari-ntb-202504/', 'Himawari_NTB_202504')
satellite_files = [os.path.join(raw_data_path, f) for f in os.listdir(raw_data_path) if f.endswith('.nc')]
if not satellite_files:
    print("No satellite files found in data/raw directory", sys)

In [15]:
print(os.getcwd())

/kaggle/working/ml-satellite


In [16]:
# Process and ingest data
print("Starting data ingestion...")
ingestion_result = data_ingestion.initiate_data_ingestion(satellite_files)

# Load the processed data
train_data = np.load(ingestion_result['train_file_path'])
test_data = np.load(ingestion_result['test_file_path'])
print(f"Loaded training data shape: {train_data.shape}")
print(f"Loaded test data shape: {test_data.shape}")

# Clean and normalize data
print("Cleaning and normalizing data...")
train_data = data_transformation.clean_data(train_data)
test_data = data_transformation.clean_data(test_data)

train_data = data_transformation.normalize_data(train_data)
test_data = data_transformation.normalize_data(test_data)

# Crop data
print("Cropping data...")
train_data = center_crop(train_data, 256, 256)
test_data = center_crop(test_data, 256, 256)

# Create sequences
print("Creating sequences...")
sequence_length = data_transformation.config.sequence_length

X_train, y_train = reshape_data_for_sequences_xy_nonoverlap(train_data, sequence_length)
X_test, y_test = reshape_data_for_sequences_xy_nonoverlap(test_data, sequence_length)

# Save transformed data
print("Saving transformed data...")
np.save('data/processed/X_train.npy', X_train)
np.save('data/processed/y_train.npy', y_train)
np.save('data/processed/X_test.npy', X_test)
np.save('data/processed/y_test.npy', y_test)

print("Preprocessing completed successfully!")
print(f"Transformed data shapes:")
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")


Started data ingestion


Starting data ingestion...


Data ingestion completed. Files saved:
Processed files: 4199 files
Train data: data/train/train_data_20250401_0000_to_20250425_0100.npy
Test data: data/test/test_data_20250425_0110_to_20250501_0000.npy


Loaded training data shape: (3359, 271, 351)
Loaded test data shape: (840, 271, 351)
Cleaning and normalizing data...


  frame_uint8 = ((frame - frame.min()) * (255.0 / frame_range)).astype(np.uint8)


Cropping data...
Creating sequences...
Saving transformed data...
Preprocessing completed successfully!
Transformed data shapes:
X_train: (558, 6, 256, 256, 1), y_train: (558, 6, 256, 256, 1)
X_test: (139, 6, 256, 256, 1), y_test: (139, 6, 256, 256, 1)


# Model Training 

In [17]:
import os
import sys
import numpy as np
import tensorflow as tf
from pathlib import Path
import matplotlib.pyplot as plt
from datetime import datetime


from src.pipeline.train_pipeline import TrainPipeline
from src.logger import logger
from src.exception import CustomException

In [18]:
def plot_training_history(history, save_path):
    """Plot and save training history"""
    plt.figure(figsize=(12, 4))
    
    # Plot loss
    plt.subplot(1, 2, 1)
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('Model Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    
    # Plot metrics
    plt.subplot(1, 2, 2)
    plt.plot(history.history['mae'], label='Training MAE')
    plt.plot(history.history['val_mae'], label='Validation MAE')
    plt.title('Model MAE')
    plt.xlabel('Epoch')
    plt.ylabel('MAE')
    plt.legend()
    
    plt.tight_layout()
    plt.savefig(save_path)
    plt.close()

In [19]:
def main():
    try:
        logger.info("Starting training pipeline")
        os.makedirs('artifacts', exist_ok=True)
        
        # Initialize pipeline
        trainer = TrainPipeline(batch_size=4)
        
        # Setup training strategy first
        strategy = trainer.setup_training_strategy()
        
        # Load preprocessed data directly
        logger.info("Loading preprocessed data from data/processed/ ...")
        X_train = np.load('data/processed/X_train.npy')
        y_train = np.load('data/processed/y_train.npy')
        X_test = np.load('data/processed/X_test.npy')
        y_test = np.load('data/processed/y_test.npy')
        logger.info(f"Loaded X_train: {X_train.shape}, y_train: {y_train.shape}")
        logger.info(f"Loaded X_test: {X_test.shape}, y_test: {y_test.shape}")

        # Create validation split from training data
        val_split = 0.1
        val_size = int(len(X_train) * val_split)
        X_val = X_train[-val_size:]
        y_val = y_train[-val_size:]
        X_train = X_train[:-val_size]
        y_train = y_train[:-val_size]
        logger.info(f"Split: X_train: {X_train.shape}, y_train: {y_train.shape}, X_val: {X_val.shape}, y_val: {y_val.shape}")

        # Build and compile model within strategy scope
        with strategy.scope():
            model = trainer.model_trainer.build_model()
            model.compile(optimizer='adam', loss='mse', metrics=['mae'])
            
        # Callbacks
        callbacks = [
            tf.keras.callbacks.ModelCheckpoint(
                'artifacts/best_model.h5',
                save_best_only=True,
                monitor='val_loss'
            ),
            tf.keras.callbacks.EarlyStopping(
                patience=10,
                monitor='val_loss',
                restore_best_weights=True
            ),
            tf.keras.callbacks.ReduceLROnPlateau(
                factor=0.5,
                patience=5,
                monitor='val_loss',
                min_lr=1e-6
            ),
            tf.keras.callbacks.TensorBoard(
                log_dir='logs/fit',
                histogram_freq=1,
                update_freq='epoch',
                profile_batch='100,120'
            )
        ]
        
        logger.info("Training model...")
        history = model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=50,
            batch_size=4,
            callbacks=callbacks
        )
        
        logger.info("Evaluating model on test set...")
        test_metrics = model.evaluate(X_test, y_test, verbose=1)
        metrics = dict(zip(model.metrics_names, test_metrics))
        logger.info("Test Set Metrics:")
        for metric_name, value in metrics.items():
            logger.info(f"{metric_name}: {value:.4f}")
            
        logger.info("Saving final model...")
        model.save('artifacts/final_model.h5')
        logger.info("Training pipeline completed successfully!")
        
        plot_training_history(
            history,
            os.path.join('artifacts', f'training_history_{datetime.now().strftime("%Y%m%d_%H%M%S")}.png')
        )
        return metrics, history
        
    except Exception as e:
        logger.error("Error in training pipeline")
        raise CustomException(e, sys)

In [None]:
if __name__ == "__main__":
    # Set memory growth for GPU if available
    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            logger.info(f"Found {len(gpus)} GPU(s), memory growth enabled")
        except RuntimeError as e:
            logger.warning(f"Memory growth setting failed: {str(e)}")
    
    main() 

Found 2 GPU(s), memory growth enabled
Starting training pipeline
I0000 00:00:1747444536.907517      35 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1747444536.908160      35 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5
Using MirroredStrategy with 2 GPUs
Loading preprocessed data from data/processed/ ...
Loaded X_train: (558, 6, 256, 256, 1), y_train: (558, 6, 256, 256, 1)
Loaded X_test: (139, 6, 256, 256, 1), y_test: (139, 6, 256, 256, 1)
Split: X_train: (503, 6, 256, 256, 1), y_train: (503, 6, 256, 256, 1), X_val: (55, 6, 256, 256, 1), y_val: (55, 6, 256, 256, 1)
Enhanced 2D ConvLSTM Model built successfully
Input shape: (6, 256, 256, 1)
Output shape: (None, 6, 256, 256, 1)
Total parameters: 1,699,749


Epoch 1/50


I0000 00:00:1747444596.801364     126 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1747444596.801381     123 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - loss: nan - mae: nan

  if self.monitor_op(current, self.best):
  self.monitor_op = lambda a, b: np.less(a, b - self.min_delta)


[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m228s[0m 1s/step - loss: nan - mae: nan - val_loss: nan - val_mae: nan - learning_rate: 0.0010
Epoch 2/50
[1m 16/126[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m2:09[0m 1s/step - loss: nan - mae: nan