# Satellite Image Preprocessing

This notebook handles data ingestion and preprocessing using the project's pipeline system.
It includes:
- Data ingestion from the Himawari satellite
- Data validation and cleaning
- Data transformation and normalization
- Train/Test split

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import glob

# Add project root to path
sys.path.append('..')

# Import project modules
from src.components.data_ingestion import DataIngestion
from src.components.data_transformations import DataTransformation
from src.logger import logger
from src.exception import CustomException

# Log the start of the preprocessing run
logger.info("Starting data preprocessing pipeline")

## 1. Data Ingestion

In [None]:
def perform_data_ingestion(start_time, end_time):
    """
    Fetch Himawari satellite data for a time window and run the ingestion step

    Args:
        start_time (datetime): Start of data collection period
        end_time (datetime): End of data collection period

    Returns:
        dict: Result of DataIngestion.initiate_data_ingestion (paths to the
            ingested data files)

    Raises:
        CustomException: Wraps any exception raised while fetching or ingesting
    """
    try:
        ingestor = DataIngestion()
        logger.info(f"Fetching Himawari data from {start_time} to {end_time}")

        # Download the raw files, then hand them straight to the ingestion step
        result = ingestor.initiate_data_ingestion(
            ingestor.get_himawari_data(start_time, end_time)
        )

        logger.info("Data ingestion completed successfully")
        return result

    except Exception as e:
        logger.error(f"Data ingestion failed: {e}")
        raise CustomException(e, sys)

# Example collection window for the Himawari download
start_time = datetime(year=2025, month=4, day=1, hour=0, minute=0)  # Example start time
end_time = datetime(year=2025, month=5, day=1, hour=0, minute=0)    # Example end time

# Run ingestion over the window defined above
ingestion_result = perform_data_ingestion(start_time, end_time)

# Report where the ingested splits were written
print("Data ingestion completed:")
print(f"Train data saved at: {ingestion_result['train_file_path']}")
print(f"Test data saved at: {ingestion_result['test_file_path']}")

## 2. Data Transformation

In [None]:
def perform_data_transformation(train_file_path, test_file_path):
    """
    Load the ingested arrays and run them through the transformation step

    Args:
        train_file_path (str): Path to training data file
        test_file_path (str): Path to test data file

    Returns:
        tuple: ((X_train, y_train), (X_test, y_test)) as produced by
            DataTransformation.initiate_data_transformation

    Raises:
        CustomException: Wraps any exception raised while loading or transforming
    """
    try:
        transformer = DataTransformation()
        logger.info("Starting data transformation")

        # Read both ingested files from disk (training file first)
        raw_train = np.load(train_file_path)
        raw_test = np.load(test_file_path)

        # The transformer returns one (inputs, targets) pair per split
        train_pair, test_pair = transformer.initiate_data_transformation(
            raw_train, raw_test
        )
        X_train, y_train = train_pair
        X_test, y_test = test_pair

        logger.info("Data transformation completed successfully")
        return (X_train, y_train), (X_test, y_test)

    except Exception as e:
        logger.error(f"Data transformation failed: {e}")
        raise CustomException(e, sys)

# Transform the train/test files produced by the ingestion step
(X_train, y_train), (X_test, y_test) = perform_data_transformation(
    train_file_path=ingestion_result['train_file_path'],
    test_file_path=ingestion_result['test_file_path'],
)

# Report the shape of every resulting array
print("Data transformation completed:")
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

## 3. Data Visualization

In [None]:
def plot_sample_images(X, y, num_samples=3):
    """
    Visualize sample input sequences and corresponding targets

    Each plotted sample gets one row: the left panel shows channel 0 of the
    last time step of the input sequence, the right panel shows channel 0 of
    the first target frame.

    Args:
        X (np.ndarray): Input sequences, indexed as X[sample, step, :, :, channel]
        y (np.ndarray): Target sequences, indexed as y[sample, step, :, :, channel]
        num_samples (int): Maximum number of samples to plot; capped at the
            number of samples available in both X and y

    Raises:
        CustomException: If plotting fails, including when there is no
            sample to plot (num_samples < 1 or empty X / y)
    """
    try:
        # Never ask for more rows than there are samples to index
        n_rows = min(num_samples, len(X), len(y))
        if n_rows < 1:
            raise ValueError(
                f"No samples to plot (num_samples={num_samples}, "
                f"len(X)={len(X)}, len(y)={len(y)})"
            )

        # squeeze=False keeps `axes` 2-D even for a single row, so the
        # axes[i, j] indexing below also works when n_rows == 1
        _, axes = plt.subplots(
            n_rows, 2, figsize=(12, 4 * n_rows), squeeze=False
        )

        for i in range(n_rows):
            # Plot input sequence (last time step)
            axes[i, 0].imshow(X[i, -1, :, :, 0], cmap='viridis')
            axes[i, 0].set_title(f'Input Sequence (t={i})')
            axes[i, 0].axis('off')

            # Plot target (first target frame)
            axes[i, 1].imshow(y[i, 0, :, :, 0], cmap='viridis')
            axes[i, 1].set_title(f'Target (t={i+1})')
            axes[i, 1].axis('off')

        plt.tight_layout()
        plt.show()

        logger.info("Sample images plotted successfully")

    except Exception as e:
        logger.error(f"Image plotting failed: {e}")
        raise CustomException(e, sys)

# Show a few input/target pairs from the training split
plot_sample_images(X=X_train, y=y_train)

## 4. Save Preprocessed Data

In [None]:
def save_preprocessed_data(X_train, y_train, X_test, y_test, output_dir='artifacts'):
    """
    Write the four preprocessed arrays to .npy files in one directory

    Args:
        X_train, y_train (np.ndarray): Training data
        X_test, y_test (np.ndarray): Test data
        output_dir (str): Directory to save preprocessed data; created if missing

    Raises:
        CustomException: Wraps any exception raised while creating the
            directory or writing the files
    """
    try:
        os.makedirs(output_dir, exist_ok=True)

        # File name -> array, written in this order
        outputs = (
            ('X_train.npy', X_train),
            ('y_train.npy', y_train),
            ('X_test.npy', X_test),
            ('y_test.npy', y_test),
        )
        for filename, array in outputs:
            np.save(os.path.join(output_dir, filename), array)

        # Record the destination in the log and echo it to the notebook output
        logger.info(f"Preprocessed data saved in {output_dir} directory")
        print(f"Preprocessed data saved in {output_dir} directory")

    except Exception as e:
        logger.error(f"Failed to save preprocessed data: {e}")
        raise CustomException(e, sys)

# Persist the transformed splits to the default output directory
save_preprocessed_data(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)