References

1. [Why Did Calibration Lead to a Lower Public Score When Combining Two Kaggle Notebooks?](https://www.kaggle.com/competitions/ariel-data-challenge-2024/discussion/530472)

2. [Fork of NeurIPS Ariel 2024 - Starter 5be123](https://www.kaggle.com/code/regisvargas/fork-of-neurips-ariel-2024-starter-5be123)

3. [NeurIPS Ariel 2024 - Starter withdifferentparametr](https://www.kaggle.com/code/bingyuniu/neurips-ariel-2024-starter-withdifferentparametr)

4. [[UPDATE]Calibrating and Binning Astronomical Data](https://www.kaggle.com/code/gordonyip/update-calibrating-and-binning-astronomical-data)

5. [[UPDATE]Calibrating and Binning Astronomical Data (copy)](https://www.kaggle.com/code/aaronjday/update-calibrating-and-binning-astronomical-data)

# Initialization

This competition seems to require a strong scientific background, and I ran into a lot of confusion during the EDA process. Therefore, I just built a simple starter for future coding.

## Load Library

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import polars as pl
import numpy as np
import torch
# import torch.nn as nn
# import torch.optim as optim
# from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import pickle
import time
import os
import pickle
import seaborn as sns
import scipy.stats
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_squared_error

In [None]:
# Load the competition metadata tables. Every path is derived from PATH so the
# dataset location is configured in exactly one place.
PATH = "/kaggle/input/ariel-data-challenge-2024"
# Per-planet ADC gain/offset columns, indexed by planet_id.
train_adc_info = pd.read_csv(os.path.join(PATH, 'train_adc_info.csv'),
                             index_col='planet_id')
# Regression targets, indexed by planet_id.
train_labels = pd.read_csv(os.path.join(PATH, 'train_labels.csv'),
                           index_col='planet_id')
# Its column names are reused as the prediction column names of a submission.
wavelengths = pd.read_csv(os.path.join(PATH, 'wavelengths.csv'))
axis_info = pd.read_parquet(os.path.join(PATH, 'axis_info.parquet'))

In [None]:
# Spot-check a single ADC gain value (AIRS-CH0, planet 785834).
train_adc_info.loc[785834, 'AIRS-CH0_adc_gain']

# Pre-Processing
## Load Functions

In [None]:
import pywt
def wavelet_denoising(data, wavelet = 'db10', sigma=None):
    """Denoise a signal by soft-thresholding its wavelet detail coefficients.

    The threshold is the universal one, ``sigma * sqrt(2 * log(n))`` with n the
    number of samples along the last axis; it is not a SURE threshold.

    Parameters:
    data: array-like signal; it is copied first, so read-only inputs are accepted
    wavelet: wavelet name handed to pywt.wavedec / pywt.waverec
    sigma: noise level; when None it is estimated from the finest-level detail
        coefficients as median(|d|) / 0.6745 (MAD estimator)

    Return value:
    ndarray with the same number of samples along the last axis as `data`
    """
    # Copy the array to avoid the "read-only" error on immutable inputs.
    data = np.array(data, copy=True)
    n_samples = data.shape[-1]

    # Decompose the signal using the discrete wavelet transform.
    coeffs = pywt.wavedec(data, wavelet)

    # Estimate the noise level if sigma is not provided.
    if sigma is None:
        sigma = np.median(np.abs(coeffs[-1])) / 0.6745

    threshold = sigma * np.sqrt(2 * np.log(n_samples))

    # coeffs[0] holds the approximation (the smooth baseline of the signal).
    # Shrinking it would bias the whole reconstruction towards zero, so only
    # the detail coefficients are thresholded.
    new_coeffs = [coeffs[0]]
    new_coeffs.extend(pywt.threshold(c, threshold, mode='soft') for c in coeffs[1:])

    # waverec can return one extra sample for odd-length inputs; trim it so the
    # output lines up with the input.
    return pywt.waverec(new_coeffs, wavelet)[..., :n_samples]

In [None]:
%%writefile utils.py
import pandas as pd
import polars as pl
import numpy as np
from tqdm import tqdm
import pickle
PATH = "/kaggle/input/ariel-data-challenge-2024"
cut_inf, cut_sup = 39, 321
def load_signal_data(planet_id, dataset, instrument, img_size):
    """Load one planet's signal file and reduce it to a 1-D net-signal series.

    Relies on names defined outside this function: PATH, cut_inf, cut_sup,
    ADC_convert, and train_adc_info / test_adc_info.

    Parameters:
    planet_id: id used for the directory name and for the ADC gain/offset lookup
    dataset: 'train' uses train_adc_info; any other value uses test_adc_info
    instrument: 'AIRS-CH0' (rows reshaped to 32 x 356) or anything else (32 x 32)
    img_size: unused; kept so existing callers keep working

    Return value:
    1-D float64 array: the mean of every odd-indexed row minus the mean of the
    even-indexed row that precedes it
    """
    file_path = f'{PATH}/{dataset}/{planet_id}/{instrument}_signal.parquet'
    signal = pd.read_parquet(file_path)

    is_airs = instrument == "AIRS-CH0"
    image_shape = (32, 356) if is_airs else (32, 32)
    signal = signal.values.astype(np.float64).reshape((signal.shape[0], *image_shape))

    # Same conversion for both datasets; only the lookup table differs.
    adc_info = train_adc_info if dataset == 'train' else test_adc_info
    gain = adc_info[instrument + '_adc_gain'].loc[planet_id]
    offset = adc_info[instrument + '_adc_offset'].loc[planet_id]
    signal = ADC_convert(signal, gain, offset)

    if is_airs:
        # Keep only columns cut_inf..cut_sup-1 of the last axis.
        signal = signal[:, :, cut_inf:cut_sup]

    # Collapse each image to a single mean value.
    signal = signal.reshape(signal.shape[0], signal.shape[1] * signal.shape[2])
    mean_signal = signal.mean(axis=1)

    # Pair consecutive rows: odd-indexed minus even-indexed.
    # Experiments left disabled: L2-normalising mean_signal, and passing the
    # result through wavelet_denoising.
    return mean_signal[1::2] - mean_signal[0::2]
def ADC_convert(signal, gain, offset):
    """Return `signal` as float64, divided by `gain` and shifted by `offset`.

    The input array is left untouched; a new array is returned.
    """
    as_float = signal.astype(np.float64)
    scaled = as_float / gain
    return scaled + offset
def read_and_preprocess(dataset, planet_ids, instrument = "AIRS-CH0"):
    """Read the files for all planet_ids and extract the time series.

    Parameters
    dataset: 'train' or 'test'
    planet_ids: sized collection of planet ids (its length fixes the row count)
    instrument: the instrument of observation, 'AIRS-CH0' or 'FGS1', default to 'AIRS-CH0'

    Returns
    float32 ndarray with one row per planet_id and 67500 values per row for
    FGS1 or 5625 for AIRS-CH0
    """
    img_size = 1024 if instrument == "FGS1" else 32*356
    column_num = 67500 if instrument == 'FGS1' else 5625
    # Pre-fill with NaN so a row that was never written is easy to spot.
    raw_signals = np.full((len(planet_ids), column_num), np.nan, dtype=np.float32)
    # tqdm wraps the ids directly: no throwaway list, and the total is still
    # known from len(planet_ids).
    for i, planet_id in enumerate(tqdm(planet_ids)):
        raw_signals[i] = load_signal_data(planet_id, dataset, instrument, img_size)
    return raw_signals
def feature_engineering(f_raw, a_raw, adc_info, window_size=50, step_size=15):
    """Assemble the per-planet feature table from both raw time-series arrays.

    Parameters:
    f_raw: ndarray of shape (n_planets, 67500)
    a_raw: ndarray of shape (n_planets, 5625)
    adc_info: DataFrame with one row per planet; after reset_index() its
        columns at positions 1 to 5 are appended unchanged at the end
    window_size: int, size of the sliding window for time-series statistics
    step_size: int, step size for the sliding window

    Return value:
    df: DataFrame of shape (n_planets, several features)
    """

    def summarise(raw, e0, e1, e2, e3):
        # Columns before e0 and from e3 on form the unobscured baseline,
        # e1:e2 is the obscured stretch, e0:e1 and e2:e3 the half-obscured ones.
        unobscured = (raw[:, :e0].mean(axis=1) + raw[:, e3:].mean(axis=1)) / 2
        obscured = raw[:, e1:e2].mean(axis=1)
        half_obscured1 = raw[:, e0:e1].mean(axis=1)
        half_obscured2 = raw[:, e2:e3].mean(axis=1)
        as_frame = pd.DataFrame(raw)
        return {
            'relative_reduction': (unobscured - obscured) / unobscured,
            'signal_to_noise': unobscured / raw.std(axis=1),
            'variance': raw.var(axis=1),
            'skewness': as_frame.skew(axis=1).values,
            'kurtosis': as_frame.kurtosis(axis=1).values,
            'half_reduction1': (unobscured - half_obscured1) / unobscured,
            'half_reduction2': (unobscured - half_obscured2) / unobscured,
        }

    def windowed_stats(data):
        # For each window, in order: mean, std, min, max (one column each).
        starts = range(0, data.shape[1] - window_size + 1, step_size)
        rows = [
            stat(data[:, start:start + window_size], axis=1)
            for start in starts
            for stat in (np.mean, np.std, np.min, np.max)
        ]
        if not rows:
            # No window fits: keep the row count, contribute no columns.
            return np.empty((data.shape[0], 0))
        return np.vstack(rows).T

    f_stats = summarise(f_raw, 20500, 23500, 44000, 47000)
    a_stats = summarise(a_raw, 1708, 1958, 3666, 3916)

    f_sliding_features = windowed_stats(f_raw)
    a_sliding_features = windowed_stats(a_raw)
    print(f'f_sliding_features.shape: {f_sliding_features.shape}')
    print(f'a_sliding_features.shape: {a_sliding_features.shape}')

    # Column order: the five summary statistics for f then a, followed by the
    # half-reduction pairs for f then a.
    labelled_stats = (('f', f_stats), ('a', a_stats))
    summary_columns = {}
    for stat_names in (
        ('relative_reduction', 'signal_to_noise', 'variance', 'skewness', 'kurtosis'),
        ('half_reduction1', 'half_reduction2'),
    ):
        for prefix, stats in labelled_stats:
            for stat_name in stat_names:
                summary_columns[f'{prefix}_{stat_name}'] = stats[stat_name]

    pieces = [pd.DataFrame(summary_columns)]
    for prefix, sliding in (('f', f_sliding_features), ('a', a_sliding_features)):
        if sliding.size > 0:
            slide_names = [f'{prefix}_slide_{i}' for i in range(sliding.shape[1])]
            pieces.append(pd.DataFrame(sliding, columns=slide_names))

    # Positional join with the ADC metadata: the index is reset so rows align
    # by position rather than by planet_id.
    pieces.append(adc_info.reset_index().iloc[:, 1:6])

    return pd.concat(pieces, axis=1)

In [None]:
%%writefile -a utils.py

def postprocessing(pred_array, index, sigma_pred):
    """Combine predictions and their uncertainties into a submission frame.

    Parameters:
    pred_array: ndarray of shape (n_samples, 283); negative values are clipped to 0
    index: pandas.Index of length n_samples with name 'planet_id'
    sigma_pred: float (one value for every cell) or array of shape (n_samples, 283)

    Return value:
    df: DataFrame of shape (n_samples, 566) with planet_id as index; the first
        283 columns are named after wavelengths.columns, the rest sigma_1..sigma_283
    """
    mean_part = pd.DataFrame(pred_array.clip(0, None), index=index, columns=wavelengths.columns)
    sigma_names = [f"sigma_{i}" for i in range(1, 284)]
    sigma_part = pd.DataFrame(sigma_pred, index=index, columns=sigma_names)
    return pd.concat([mean_part, sigma_part], axis=1)

class ParticipantVisibleError(Exception):
    """Raised by competition_score when a submission fails validation."""

def competition_score(
        solution: pd.DataFrame,
        submission: pd.DataFrame,
        naive_mean: float,
        naive_sigma: float,
        sigma_true: float,
        row_id_column_name='planet_id',
    ) -> float:
    '''
    This is a Gaussian Log Likelihood based metric. For a submission, which contains the predicted mean (x_hat) and
    standard deviation (x_hat_std), we calculate the Gaussian Log-likelihood (GLL) value to the provided ground truth (x).
    We treat each pair of x_hat, x_hat_std as a 1D gaussian, meaning there will be 283 1D gaussian distributions,
    hence 283 values for each test spectrum, the GLL value for one spectrum is the sum of all of them.

    The score is (GLL_pred - GLL_mean) / (GLL_true - GLL_mean), clipped to [0, 1].

    Inputs:
        - solution: Ground Truth spectra (from test set), including the row id column
            - shape: (nsamples, n_wavelengths + 1)
        - submission: Predicted spectra and errors (from participants), including the row id column
            - shape: (nsamples, n_wavelengths*2 + 1)
        naive_mean: (float) mean from the train set.
        naive_sigma: (float) standard deviation from the train set.
        sigma_true: (float) essentially sets the scale of the outputs.
        row_id_column_name: name of the id column removed from both frames before scoring.

    Returns:
        float in [0, 1].

    Raises:
        ParticipantVisibleError: non-numeric column, negative value, or wrong column count in the submission.
        KeyError: the row id column is missing from either frame.

    Neither input frame is modified.
    '''

    # Drop the id column on new frames so the caller's objects stay intact and
    # can be scored again.
    solution = solution.drop(columns=[row_id_column_name])
    submission = submission.drop(columns=[row_id_column_name])

    # Validate dtypes first: taking min() over a non-numeric column can fail
    # with an unrelated error before the intended message is raised.
    for col in submission.columns:
        if not pd.api.types.is_numeric_dtype(submission[col]):
            raise ParticipantVisibleError(f'Submission column {col} must be a number')
    if submission.min().min() < 0:
        raise ParticipantVisibleError('Negative values in the submission')

    n_wavelengths = len(solution.columns)
    if len(submission.columns) != n_wavelengths*2:
        raise ParticipantVisibleError('Wrong number of columns in the submission')

    # First half of the columns: predicted means; second half: predicted sigmas.
    y_pred = submission.iloc[:, :n_wavelengths].values
    # Set a non-zero minimum sigma pred to prevent division by zero errors.
    sigma_pred = np.clip(submission.iloc[:, n_wavelengths:].values, a_min=10**-15, a_max=None)
    y_true = solution.values

    GLL_pred = np.sum(scipy.stats.norm.logpdf(y_true, loc=y_pred, scale=sigma_pred))
    # Upper reference: perfect means with sigma_true everywhere.
    GLL_true = np.sum(scipy.stats.norm.logpdf(y_true, loc=y_true, scale=sigma_true * np.ones_like(y_true)))
    # Lower reference: constant naive_mean / naive_sigma everywhere.
    GLL_mean = np.sum(scipy.stats.norm.logpdf(y_true, loc=naive_mean * np.ones_like(y_true), scale=naive_sigma * np.ones_like(y_true)))

    submit_score = (GLL_pred - GLL_mean)/(GLL_true - GLL_mean)
    return float(np.clip(submit_score, 0.0, 1.0))

In [None]:
# Run utils.py inside the notebook namespace: its functions read notebook-level
# globals such as train_adc_info, so a plain import would not see them.
# The context manager makes sure the file handle is closed.
with open('utils.py', 'r') as utils_file:
    exec(utils_file.read())

## Load Data

In [None]:
%%time
# Use the precomputed FGS1 array when the helper dataset is attached;
# otherwise rebuild it from the raw files and cache it in the working directory.
f_cache_path = "/kaggle/input/adc24-intro-training/f_raw_train.pickle"
if not os.path.exists(f_cache_path):
    f_raw_train = read_and_preprocess('train', train_labels.index, 'FGS1')
    with open('f_raw_train.pickle', 'wb') as cache_file:
        pickle.dump(f_raw_train, cache_file)
else:
    f_raw_train = np.load(f_cache_path, allow_pickle=True)

In [None]:
%%time
# Use the precomputed AIRS-CH0 array when the helper dataset is attached;
# otherwise rebuild it from the raw files and cache it in the working directory.
a_cache_path = "/kaggle/input/adc24-intro-training/a_raw_train.pickle"
if not os.path.exists(a_cache_path):
    a_raw_train = read_and_preprocess('train', train_labels.index)
    with open('a_raw_train.pickle', 'wb') as cache_file:
        pickle.dump(a_raw_train, cache_file)
else:
    a_raw_train = np.load(a_cache_path, allow_pickle=True)

## Feature Engineering

In [None]:
%%time
# Build the training feature table from the FGS1 and AIRS-CH0 arrays plus the
# ADC metadata (rows are joined by position, not by planet_id).
train = feature_engineering(f_raw_train, a_raw_train, train_adc_info)

In [None]:
# Preview the first rows of the feature table.
train.head()

In [None]:
# Drop the last column, i.e. the final ADC-info column that feature_engineering
# appends (presumably `star` — confirm against the column order of train_adc_info.csv).
train = train.iloc[:,:-1]

## Data Plot

In [None]:
# Mean curve over all planets for each instrument. The gray lines mark the four
# boundaries that feature_engineering uses; the AIRS-CH0 positions are the FGS1
# ones rescaled by 11250 / 135000.
fgs_boundaries = [20500, 23500, 44000, 47000]
airs_boundaries = [boundary * 11250 // 135000 for boundary in fgs_boundaries]
mean_curve_plots = (
    ('FGS1: Overall mean', f_raw_train, fgs_boundaries),
    ('AIRS-CH0: Overall mean', a_raw_train, airs_boundaries),
)
for plot_title, raw_values, boundaries in mean_curve_plots:
    plt.figure(figsize=(6, 2))
    plt.plot(raw_values.mean(axis=0))
    for time_step in boundaries:
        plt.axvline(time_step, color='gray')
    plt.xlabel('time step')
    plt.title(plot_title)
    plt.show()

In [None]:
# Scatter the AIRS-CH0 relative reduction against the first target column,
# with one colour per star taken from the default matplotlib colour cycle.
color_array = np.array(plt.rcParams['axes.prop_cycle'].by_key()['color'])
star_colors = color_array[train_adc_info['star']]
plt.scatter(train['a_relative_reduction'], train_labels['wl_1'],
            s=15, alpha=0.5, c=star_colors)
plt.xlabel('relative signal reduction when planet is in front')
plt.ylabel('target')
plt.title('Correlation between relative signal reduction and target')
# An equal aspect ratio (plt.gca().set_aspect('equal')) is left disabled.

# Proxy markers so the legend shows one entry per star (stars 0 and 1 only).
legend_handles = []
for star in range(2):
    star_color = color_array[star]
    legend_handles.append(
        plt.Line2D([0], [0], label=f'star {star}', marker='o', markersize=3,
                   markeredgecolor=star_color, markerfacecolor=star_color, linestyle='')
    )

plt.legend(handles=legend_handles)
plt.show()

# Model
## Ridge Model

In [None]:
# Ridge regression with a negligible penalty (alpha=1e-12).
model = Ridge(alpha=1e-12)

# Out-of-fold predictions using cross_val_predict's default splitting.
oof_pred = cross_val_predict(model, train, train_labels)

print(f"# R2 score: {r2_score(train_labels, oof_pred):.4f}")
# The `squared=False` argument of mean_squared_error is deprecated and has been
# removed from newer scikit-learn releases. Taking the square root of the
# per-target MSE and averaging gives the same number on every version.
per_target_mse = mean_squared_error(train_labels, oof_pred, multioutput='raw_values')
sigma_pred = np.sqrt(per_target_mse).mean()
print(f"# Root mean squared error: {sigma_pred:.7f}")

In [None]:
# Package the out-of-fold predictions like a submission, using the single
# OOF RMSE as the sigma for every cell.
oof_df = postprocessing(oof_pred, train_adc_info.index, sigma_pred)
display(oof_df)

# reset_index() turns planet_id into a regular column, which is the row id
# column competition_score expects; copies keep the originals untouched.
solution_frame = train_labels.copy().reset_index()
submission_frame = oof_df.copy().reset_index()
label_values = train_labels.values
gll_score = competition_score(
    solution_frame,
    submission_frame,
    naive_mean=label_values.mean(),
    naive_sigma=label_values.std(),
    sigma_true=0.000003,
)
print(f"# Estimated competition score: {gll_score:.4f}")

In [None]:
# Refit on the full training set, then persist the model and the OOF sigma.
model.fit(train, train_labels)
artifacts = (
    ('model.pickle', model),
    ('sigma_pred.pickle', sigma_pred),
)
for artifact_path, artifact in artifacts:
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)

## Inference

In [None]:
# Load the data
test_adc_info = pd.read_csv('/kaggle/input/ariel-data-challenge-2024/test_adc_info.csv',
                           index_col='planet_id')
sample_submission = pd.read_csv('/kaggle/input/ariel-data-challenge-2024/sample_submission.csv',
                                index_col='planet_id')
f_raw_test = read_and_preprocess('test', sample_submission.index, 'FGS1')
a_raw_test = read_and_preprocess('test', sample_submission.index)
test = feature_engineering(f_raw_test, a_raw_test, test_adc_info)
# Drop the last column so the test features match the training features.
test = test.iloc[: , :-1]

# Load the model
with open('model.pickle', 'rb') as f:
    model = pickle.load(f)
# NOTE(review): sigma_pred is restored here, but the submission below uses
# fixed per-star constants instead of this value.
with open('sigma_pred.pickle', 'rb') as f:
    sigma_pred = pickle.load(f)

# Predict
test_pred = model.predict(test)

# Package into submission file
# One sigma per planet, chosen by star (0.0001555 when star <= 1, else 0.00085),
# then repeated across all 283 output columns.
star_sigma = np.where(test_adc_info[['star']] <= 1, 0.0001555, 0.00085)
sub_df = postprocessing(test_pred,
                        test_adc_info.index,
                        sigma_pred=np.tile(star_sigma, (1, 283)))
display(sub_df)
sub_df.to_csv('submission.csv')