
## Introduction to Fast Data Cleaning Approach
In data science, efficient data cleaning is crucial for handling large datasets, especially in fields like astronomy where data volumes can be immense. The traditional approach using masked arrays allows for flexibility in masking invalid or missing data, but it can introduce performance bottlenecks, particularly with large-scale datasets.

In this notebook, I adopt a more performant approach by using NaN (Not a Number) values for masking. This method leverages the optimized operations available in NumPy and Pandas, which handle NaN values efficiently and allow for faster computations than traditional masked arrays. By replacing masked elements with NaNs, I streamline data processing and improve the speed of operations like filtering, aggregating, and statistical analysis.

Additionally, I keep the data two-dimensional while processing. This makes it slightly harder to reason about dimensions during data processing, but ensures better data locality when processing the large files of this challenge.

This code is inspired by the work of others in this competition. Enormous thanks go to @AmbrosM and @GordonYip.

Code licensed under ["Rejoice of Fun License (ROFL)"](https://lmy.medium.com/rofl-an-open-source-license-that-promotes-fun-in-coding-620388502891). Attribution appreciated :-)

In [None]:
import pickle
from pathlib import Path
from tqdm import tqdm

import numpy as np
import pandas as pd
import scipy.stats
from astropy.stats import sigma_clip

import matplotlib.pyplot as plt

In [None]:
# Load the competition metadata tables.

# The two per-planet tables share the same layout: one row per planet,
# keyed by the 'planet_id' column.
train_adc_info, train_labels = (
    pd.read_csv(csv_path, index_col='planet_id')
    for csv_path in (
        '/kaggle/input/ariel-data-challenge-2024/train_adc_info.csv',
        '/kaggle/input/ariel-data-challenge-2024/train_labels.csv',
    )
)

# These two tables are read with their default index.
axis_info = pd.read_parquet('/kaggle/input/ariel-data-challenge-2024/axis_info.parquet')
wavelengths = pd.read_csv('/kaggle/input/ariel-data-challenge-2024/wavelengths.csv')

In [None]:
%%writefile preprocess_fast.py


def ADC_convert(signal, gain, offset):
    """Rescale raw ADC values in place: divide by ``gain``, then add ``offset``.

    Parameters
    signal: floating-point array of raw detector values; overwritten with the result
    gain: ADC gain the signal is divided by
    offset: ADC offset added after the division

    Returns
    the same ``signal`` array, modified in place
    """
    # Writing into `out=signal` avoids allocating a second full-size array.
    np.divide(signal, gain, out=signal)
    np.add(signal, offset, out=signal)
    return signal

def mask_hot_dead(signal, dead, dark):
    """Overwrite hot and dead pixel columns of ``signal`` with NaN, in place.

    A pixel is treated as hot when its ``dark`` value is rejected by a
    5-sigma clip (at most 5 iterations), and as dead when its ``dead``
    entry equals 1.0.

    Parameters
    signal: 2-D floating-point array with one column per pixel; modified in place
    dead: dead-pixel map; flattened, it must have one entry per signal column
    dark: dark frame; flattened, it must have one entry per signal column

    Returns
    the same ``signal`` array, with every flagged column set to NaN
    """
    # getmaskarray always yields a full boolean array, even when the masked
    # array carries no explicit mask.
    clipped_dark = sigma_clip(dark, sigma=5, maxiters=5)
    hot_mask = np.ma.getmaskarray(clipped_dark).reshape((-1,))
    dead_mask = (dead == 1.0).reshape((-1,))

    # np.nan (lowercase): the np.NaN alias was removed in NumPy 2.0.
    signal[:, hot_mask | dead_mask] = np.nan
    return signal

def clean_dark(signal, dark, dt):
    """Subtract the dark current scaled by each row's integration time, in place.

    Parameters
    signal: 2-D floating-point array; row ``k`` is reduced by ``dt[k] * dark``
    dark: dark frame, broadcastable against one row of ``signal``
    dt: 1-D array with one integration time per row of ``signal``

    Returns
    the same ``signal`` array, modified in place
    """
    # Turn dt into a column so it broadcasts across the pixel axis of `dark`.
    per_row_dark = dt[:, None] * dark
    np.subtract(signal, per_row_dark, out=signal)
    return signal

def clean_flat(signal, flat):
    """Return ``signal`` divided element-wise by the flat field.

    Parameters
    signal: array of detector values (left unmodified)
    flat: flat-field array, broadcastable against ``signal``

    Returns
    a new array holding ``signal / flat``
    """
    flat_corrected = np.divide(signal, flat)
    return flat_corrected

def apply_linear_corr(linear_corr,signal):
    """Apply a per-pixel polynomial correction to ``signal``, in place.

    Column ``i`` of ``signal`` is replaced by the polynomial
    ``c[0] + c[1]*x + c[2]*x**2 + ...`` evaluated at its own values, where
    ``c = linear_corr[:, i]`` (lowest-order coefficient first).

    Parameters
    linear_corr: 2-D array of coefficients, shape (n_coefficients, n_columns)
    signal: 2-D array, shape (n_rows, n_columns); overwritten with the result

    Returns
    the same ``signal`` array, modified in place
    """
    # Horner's scheme over all columns at once, highest order first. This is
    # the same arithmetic np.poly1d performs per column, without a Python loop
    # over columns. A floating accumulator is used even if `signal` is not.
    acc_dtype = np.result_type(signal, linear_corr, np.float64)
    corrected = np.zeros(signal.shape, dtype=acc_dtype)
    for coeffs in linear_corr[::-1]:
        corrected *= signal
        corrected += coeffs

    # Write back so callers holding a reference to `signal` see the result.
    signal[...] = corrected
    return signal

def bin_obs(signal ,binning):
    """Average consecutive groups of ``binning`` rows of ``signal``.

    Trailing rows that do not fill a complete group are dropped.

    Parameters
    signal: 2-D array whose rows are averaged in groups
    binning: number of consecutive rows averaged into one output row

    Returns
    float64 array of shape (signal.shape[0] // binning, signal.shape[1])
    """
    n_bins = signal.shape[0] // binning
    binned = np.zeros((n_bins, signal.shape[1]))
    for bin_idx, start in enumerate(range(0, n_bins * binning, binning)):
        window = signal[start:start + binning, :]
        binned[bin_idx, :] = np.mean(window, axis=0)
    return binned

def airs_preprocess(dataset, adc_info, axis_info, planet_ids):
    """Read the AIRS-CH0 files for all planet_ids and extract the time series.

    Parameters
    dataset: 'train' or 'test'; selects the sub-directory all files are read from
    adc_info: metadata dataframe indexed by planet_id, either train_adc_info or
        test_adc_info; supplies 'AIRS-CH0_adc_gain' and 'AIRS-CH0_adc_offset'
    axis_info: axis info; the 'AIRS-CH0-integration_time' column is used
    planet_ids: sequence of planet ids

    Returns
    float32 array of shape (len(planet_ids), 5625//binning, 356):
    planet x time bin x wavelength column

    """
    binning = 60
    data_dir = f'/kaggle/input/ariel-data-challenge-2024/{dataset}'

    dt_airs = axis_info['AIRS-CH0-integration_time'].dropna().values

    # planet x time-binned data x wavelength
    a_raw = np.full((len(planet_ids), 5625//binning, 356), np.nan, dtype=np.float32)

    for i, planet_id in enumerate(tqdm(planet_ids)):
        planet_dir = f'{data_dir}/{planet_id}'
        calib_dir = f'{planet_dir}/AIRS-CH0_calibration'

        signal = pd.read_parquet(f'{planet_dir}/AIRS-CH0_signal.parquet').values.astype(np.float64)
        flat = pd.read_parquet(f'{calib_dir}/flat.parquet').values.astype(np.float64).reshape((1, -1))
        dark = pd.read_parquet(f'{calib_dir}/dark.parquet').values.astype(np.float64).reshape((1, -1))
        dead = pd.read_parquet(f'{calib_dir}/dead.parquet').values.astype(np.float64).reshape((1, -1))
        # Read from the requested dataset (this path used to be hard-coded to 'train').
        linear_corr = pd.read_parquet(f'{calib_dir}/linear_corr.parquet').values.astype(np.float64).reshape(6,-1)

        # Look gain/offset up by planet id rather than by loop position, so any
        # subset or ordering of planet_ids gets its own calibration values.
        gain = adc_info.loc[planet_id, 'AIRS-CH0_adc_gain']
        offset = adc_info.loc[planet_id, 'AIRS-CH0_adc_offset']
        signal = ADC_convert(signal, gain, offset)

        signal = mask_hot_dead(signal, dead, dark) # with NaN, avoiding use of masked arrays

        signal = apply_linear_corr(linear_corr,signal) # operating on non-masked *much* faster

        signal = clean_dark(signal, dark, dt_airs)

        signal = signal[1::2, :] - signal[0::2, :] # Correlated double sampling
        signal = bin_obs(signal, binning)

        signal = clean_flat(signal, flat)

        # mean over pixels, preserve wavelength dimension
        signal = signal.reshape((signal.shape[0], 32, 356))
        signal = np.nanmean(signal, axis=1) # nanmean because we're masking with nan

        a_raw[i] = signal
    return a_raw

def fgs_preprocess(dataset, adc_info, axis_info, planet_ids):
    """Read the FGS1 files for all planet_ids and extract the time series.

    Parameters
    dataset: 'train' or 'test'; selects the sub-directory all files are read from
    adc_info: metadata dataframe indexed by planet_id, either train_adc_info or
        test_adc_info; supplies 'FGS1_adc_gain' and 'FGS1_adc_offset'
    axis_info: axis info (not used here; kept so the signature matches airs_preprocess)
    planet_ids: sequence of planet ids

    Returns
    float32 array of shape (len(planet_ids), 67500//binning): planet x time bin

    """
    binning = 720
    data_dir = f'/kaggle/input/ariel-data-challenge-2024/{dataset}'

    # planet x time-binned data
    fgs_raw = np.full((len(planet_ids), 67500//binning), np.nan, dtype=np.float32)

    for i, planet_id in enumerate(tqdm(planet_ids)):
        planet_dir = f'{data_dir}/{planet_id}'
        calib_dir = f'{planet_dir}/FGS1_calibration'

        signal = pd.read_parquet(f'{planet_dir}/FGS1_signal.parquet').values.astype(np.float64)
        flat = pd.read_parquet(f'{calib_dir}/flat.parquet').values.astype(np.float64).reshape((1, -1))
        dark = pd.read_parquet(f'{calib_dir}/dark.parquet').values.astype(np.float64).reshape((1, -1))
        dead = pd.read_parquet(f'{calib_dir}/dead.parquet').values.astype(np.float64).reshape((1, -1))
        # Read from the requested dataset (this path used to be hard-coded to 'train').
        linear_corr = pd.read_parquet(f'{calib_dir}/linear_corr.parquet').values.astype(np.float64).reshape(6,-1)

        # Use the adc_info argument (not the notebook-global train_adc_info) and
        # look values up by planet id, so test data and arbitrary subsets work.
        FGS1_gain = adc_info.loc[planet_id, 'FGS1_adc_gain']
        FGS1_offset = adc_info.loc[planet_id, 'FGS1_adc_offset']

        signal = ADC_convert(signal, FGS1_gain, FGS1_offset)

        signal = mask_hot_dead(signal, dead, dark) # with NaN, avoiding use of masked arrays

        signal = apply_linear_corr(linear_corr,signal) # operating on non-masked *much* faster

        # NOTE(review): dt is the same constant for every frame, so the dark
        # term is identical in all rows and cancels in the frame difference
        # below - confirm whether alternating exposure times were intended.
        dt_fgs1 = np.ones(len(signal))*0.1
        signal = clean_dark(signal, dark, dt_fgs1)

        signal = signal[1::2, :] - signal[0::2, :] # Correlated double sampling
        signal = bin_obs(signal, binning)

        signal = clean_flat(signal, flat)

        # mean over pixels
        signal = np.nanmean(signal, axis=1) # nanmean because we're masking with nan

        fgs_raw[i] = signal
    return fgs_raw
    

In [None]:
%%time
# Load the functions written to preprocess_fast.py into this notebook's
# namespace. The file has no imports of its own, so it relies on the names
# imported above (np, pd, tqdm, sigma_clip). The `with` block closes the file
# handle, which a bare open(...).read() would leave open.
with open('preprocess_fast.py', 'r', encoding='utf-8') as source_file:
    exec(source_file.read())

# Preprocess the first five training planets from each instrument.
print('AIRS Data:')
airs_raw_train = airs_preprocess('train', train_adc_info, axis_info, train_labels.index[0:5])
print('FGS Data:')
fgs_raw_train = fgs_preprocess('train', train_adc_info, axis_info, train_labels.index[0:5])

## Obligatory Light Curve Plots

Just to make sure we didn't break something too badly... 🙃

In [None]:
# Plot each FGS light curve divided by its own mean, so all curves sit near 1.
for planet_idx in range(5):
    light_curve = fgs_raw_train[planet_idx, :]
    normalized_curve = light_curve / np.mean(light_curve)
    plt.plot(normalized_curve)

In [None]:
# Plot each AIRS light curve: average over the last axis for every time bin,
# then divide by the planet's overall mean so all curves sit near 1.
for planet_idx in range(5):
    planet_data = airs_raw_train[planet_idx, :, :]
    per_bin_mean = np.mean(planet_data, axis=1)
    overall_mean = np.mean(planet_data)
    plt.plot(per_bin_mean / overall_mean)