In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from pandas.plotting import autocorrelation_plot
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

from scipy.interpolate import griddata
from tqdm import tqdm
import os
import shutil
import time
import warnings
warnings.filterwarnings('ignore', module='statsmodels')

from statsmodels.tsa.stattools import adfuller, kpss
from scipy.signal import butter, filtfilt
from scipy.fft import fft, fftfreq
import pywt

import pandas.api.types
import scipy.stats

import glob
import joblib
import gc 

# Task Understanding
The task of this competition is to extract the atmospheric spectrum for each observation, together with an **estimate of its level of uncertainty**. (Performing this detrending process to extract ***atmospheric spectra*** and their ***associated error bars*** from raw observational data is a crucial and common prerequisite step for any modern astronomical instrument before the data can undergo scientific analysis.)

# Data Understanding

## train_labels.csv
Ground truth spectra. (Ground truth spectroscopy refers to known spectral data for a particular object, such as a planet or a star. These data are usually derived from observations from high-precision instruments or validated model calculations.)

In [None]:
# Load the ground-truth spectra and index the rows by planet_id.
train_labels = pd.read_csv("/kaggle/input/ariel-data-challenge-2024/train_labels.csv").set_index("planet_id")
print(train_labels.shape)
# Preview the first five rows.
train_labels.head(5)

In [None]:
plt.figure(figsize=(10, 4))

# Overlay the spectra of the first five rows of train_labels, one line per planet.
for row in range(5):
    plt.plot(train_labels.columns, train_labels.iloc[row], label=f'id_{train_labels.index[row]}')

plt.title('Spectral Data of Samples')
plt.xlabel('Wavelength')
plt.ylabel('Value')
plt.legend()
plt.grid(True)

# Label only every 40th column so the x axis stays readable.
tick_positions = range(0, len(train_labels.columns), 40)
tick_labels = [train_labels.columns[position] for position in tick_positions]
plt.xticks(ticks=tick_positions, labels=tick_labels, rotation=45)

plt.show()

We can see that the spectrum of each planet fluctuates in a small range, which means that the spectrum can be used to distinguish each planet. Fluctuations are due to some atmospheric phenomenon, and planets with close spectral curves may be at similar distances from their stars. 

*So I think the mean of all the spectra can also be used as a simple marker to distinguish planets.*

In [None]:
# NOTE(review): DataFrame.corr() computes pairwise correlations between *columns*.
# train_labels is indexed by planet_id, so the columns are the spectrum bins and this
# matrix is bin-vs-bin (computed across planets), not planet-vs-planet.
correlation_matrix = train_labels.corr()

plt.figure(figsize=(5, 4))
# fmt only affects cell annotations, which are disabled here (annot=False).
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap in Samples')
plt.show()

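Note that `train_labels.corr()` correlates the columns of the table, i.e. the wavelength bins, across all planets. Strong correlations in this heatmap therefore mean that the spectrum values at different wavelengths rise and fall together from planet to planet, which suggests that many planets have ground truth spectra of similar shape. (To compare planets with each other directly, the correlation would have to be computed on the transposed table.)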

In [None]:
# for i in range(train_labels.shape[1]):
#     plt.figure(figsize=(10, 6))
#     autocorrelation_plot(train_labels.iloc[:, i])
#     plt.title(f'Autocorrelation Plot for Feature {i+1}')
#     plt.show()

In [None]:
# Mean over the columns (axis=1): one value per row of train_labels, i.e. per planet_id.
mean_values = train_labels.mean(axis=1)

# Largest and smallest of these per-planet means.
print("max_wl_mean",np.max(mean_values), "min_wl_mean", np.min(mean_values))

# mean_df = pd.DataFrame({
#     'planet_id': data_subset.index,
#     'mean_gts': mean_values
# }).set_index('planet_id')

# mean_df

In [None]:
# Plot the distribution of the per-planet mean spectrum value (mean_values) as a histogram
plt.figure(figsize=(12, 2))
plt.hist(mean_values, bins=20, color='skyblue', edgecolor='black')
plt.title('Distribution of Mean')
plt.xlabel('Mean')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

## wavelengths.csv
The wavelength grid for each ground truth spectrum in the dataset.

In [None]:
# Load the wavelength grid of the ground-truth spectra and display it.
wavelengths = pd.read_csv("/kaggle/input/ariel-data-challenge-2024/wavelengths.csv")
wavelengths

Comparing the ground truth spectrum with the wavelength grid does not make sense per se. What really matters is:
* The ground truth spectra of different planets are compared under a unified wavelength grid to understand their spectral characteristics.
* The observed spectra and ground truth spectra of the same planet were compared under a unified wavelength grid to verify the accuracy and validity of the observed data.

## [train/test]_adc_info.csv
Contains analog-to-digital (ADC) conversion parameters (gain and offset) for restoring the original dynamic range of the data. Also includes a star column identifying which star was used for that planet's simulation. 

*To convert analog data into digital signals, ADCs apply specific gain and offset parameters. These parameters can restore the data to its original dynamic range, which is very important to ensure the accuracy and consistency of the data:*
* **Gain**: Adjusts the amplitude of a signal, usually by a multiplicative factor.
* **Offset**: The base point at which the signal is adjusted, usually by an additive factor.

*The original observations are corrected using gains and offsets to recover their original dynamic range:*
$\text{Corrected Value} = \text{Raw Value} \times \text{Gain} + \text{Offset}$

In [None]:
# Load the ADC gain/offset parameters (and the star identifier) and index the rows by planet_id.
train_adc_info = pd.read_csv("/kaggle/input/ariel-data-challenge-2024/train_adc_info.csv").set_index("planet_id")
print(train_adc_info.shape)
# Preview the first five rows.
train_adc_info.head(5)

In [None]:
# Number of rows (planets) per value of the `star` column, most frequent value first.
plt.figure(figsize=(10, 4))
sns.countplot(data=train_adc_info, x='star', order=train_adc_info['star'].value_counts().index)
# Give the figure a title and render it explicitly so the cell does not end on an Axes repr.
plt.title('Number of Planets per Star')
plt.show()

* **star**: The column is a binary distribution of 0 and 1, meaning that only two stars were used in the simulation to generate the data. 0 and 1 are the identifiers of two different stars.

Other details:
1. **FGS1** refers to the first channel of Ariel's Fine Guidance System (FGS). The FGS is used to precisely locate and track the target star to ensure a stable and accurate pointing of the telescope. In this dataset FGS1 provides a time series of 32x32 images.
1. **AIRS-CH0** refers to Channel 0 of the Ariel InfraRed Spectrometer (AIRS). It should not be confused with NASA's Atmospheric Infrared Sounder, an Earth-observing instrument that shares the acronym. AIRS-CH0 records the infrared light dispersed by wavelength; in this dataset it provides a time series of 32x356 images.

In [None]:
# data_subset = train_adc_info.head(5).drop(["star"],axis=1)
# data_subset = pd.concat([data_subset, mean_df], axis=1)

# fig, axes = plt.subplots(nrows=1, ncols=data_subset.shape[1], figsize=(14, 4))
# fig.subplots_adjust(wspace=0.5)

# for idx, col in enumerate(data_subset.columns):
#     sns.barplot(x=data_subset.index, y=data_subset[col], ax=axes[idx])
#     axes[idx].set_title({col})
#     axes[idx].set_xlabel('planet_id')
#     axes[idx].set_ylabel(col)
#     axes[idx].grid(True, linestyle='--', linewidth=0.5)
#     axes[idx].set_xticklabels(axes[idx].get_xticklabels(), rotation=90)

# plt.show()

In [None]:
# sorted_data = pd.DataFrame()

# for col in data_subset.columns:
#     sorted_series = data_subset[[col]].copy()
#     sorted_series['abs_value'] = sorted_series[col].abs()
#     sorted_series = sorted_series.sort_values(by='abs_value', ascending=False)
#     sorted_series = sorted_series.drop(columns='abs_value')
#     sorted_data[col] = sorted_series.index

# sorted_data

## axis_info.parquet
Axis information for both instruments.

* **AIRS-CH0-axis0-h**: The value of axis 0 of the AIRS-CH0 instrument in units of h (hours), i.e. the time axis of the AIRS-CH0 frames.
* **AIRS-CH0-axis2-um**: The value of axis 2 of the AIRS-CH0 instrument in μm (micrometers), i.e. the wavelength axis of the spectrum.
* **AIRS-CH0-integration_time**: The integration time of the AIRS-CH0 instrument at a certain point in time, that is, the length of time for each measurement.
* **FGS1-axis0-h**: The value of axis 0 of the FGS1 instrument in units of h (hours), i.e. the time axis of the FGS1 frames.

*--Idea--*
* Data registration: This axis information can be used to register image data with actual physical measurements, such as correlating spectral data with wavelength, time, and other information.
* Data analysis: Knowing the measurement conditions at different points in time, such as integration time, can help analyze data quality and instrument performance.
* Correction: This information may be used to correct image data, especially when NaN values appear in some columns of the data, which can be completed or adjusted using information from other columns.

In [None]:
# Load the axis information for both instruments.
axis_info = pd.read_parquet('/kaggle/input/ariel-data-challenge-2024/axis_info.parquet')
print(axis_info.shape)
# Preview the first five rows.
axis_info.head(5)

In [None]:
# 2x2 grid: one scatter panel per axis_info column, plotted against the row index.
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(10, 6))

columns = ['AIRS-CH0-axis0-h', 'AIRS-CH0-axis2-um', 'AIRS-CH0-integration_time', 'FGS1-axis0-h']
# axs.flat walks the 2x2 axes array in row-major order, pairing each panel with one column.
for ax, col in zip(axs.flat, columns):
    ax.scatter(axis_info.index, axis_info[col], s=1)  
    ax.set_title(col)
    #ax.set_xlabel('Index')
    #ax.set_ylabel(col)
    ax.grid(True)

plt.tight_layout()
plt.show()

## [train/test]/[planet_id]/FGS1_signal.parquet
Signal data from the FGS1 instrument. Each file contains 135,000 rows of images at 0.1 second time steps. Each 32x32 image has been flattened into 1024 columns. You can un-flatten the data with numpy.reshape(135000, 32, 32). Similar to AIRS-CH0, the data is generated in uint16. To restore its original dynamic range you must multiply the data by the matching **gain** value from **[train/test]_adc_info.csv** and then add the **offset** value, also from **[train/test]_adc_info.csv**.

In [None]:
# Load the FGS1 signal of planet 785834 (one flattened image per row).
train_785834_FGS = pd.read_parquet(f'/kaggle/input/ariel-data-challenge-2024/train/785834/FGS1_signal.parquet')
print(train_785834_FGS.shape)
# Preview the first five rows.
train_785834_FGS.head(5)

In [None]:
signal_array = train_785834_FGS.values

# Un-flatten every row of 1024 values into a 32x32 image -> (time step, image row, image column).
reshaped_signal_data = signal_array.reshape(135000, 32, 32)

# Show the frame at time step 0. Gain and offset have not been applied at this point,
# so the values are the stored ones.
data = reshaped_signal_data[0]
plt.matshow(data, cmap="coolwarm")
plt.colorbar(shrink=0.8, aspect=20, pad=0.1)
plt.title('Restored Signal Image at 0th Time Step')
plt.show()

In [None]:
# Layout of train_adc_info as noted by the author (presumably its printed shape and columns):
#(673, 5)
#FGS1_adc_offset	FGS1_adc_gain	AIRS-CH0_adc_offset	AIRS-CH0_adc_gain	
#star planet_id	

planet_id = 785834

# ADC parameters of the FGS1 instrument for this planet.
gain = train_adc_info.loc[planet_id, 'FGS1_adc_gain']
offset = train_adc_info.loc[planet_id, 'FGS1_adc_offset']

# Restore the dynamic range: multiply by the gain, then add the offset (broadcast over all frames).
original_signal_data = reshaped_signal_data * gain + offset

print(original_signal_data.shape)

In [None]:
# Show the frame at time step 0 of the gain/offset-restored data. The original cell
# indexed reshaped_signal_data here, which re-plotted the un-restored image under a
# "Restored" title.
data = original_signal_data[0]
plt.matshow(data, cmap="coolwarm")
plt.colorbar(shrink=0.8, aspect=20, pad=0.1)
plt.title('Restored Original Signal Image at 0th Time Step')
plt.show()

In [None]:
# Largest value of the frame currently held in `data`, and its (row, column) position.
max_value = np.max(data)
# argmax returns an index into the flattened frame; unravel_index converts it back to 2D.
max_position = np.unravel_index(np.argmax(data), data.shape)
print(max_value, max_position)

In [None]:
slice_index = 0  # Select the first time slice

# Select the 2D frame at that time step (un-restored values from reshaped_signal_data)
Z = reshaped_signal_data[slice_index, :, :]

# Pixel coordinate grids matching Z
X = np.arange(Z.shape[1])  # image column index
Y = np.arange(Z.shape[0])  # image row index
X, Y = np.meshgrid(X, Y)  # Creating a grid

# Draw the frame as a 3D surface: height and colour both encode the pixel value
fig = plt.figure(figsize=(12, 8), dpi=80)
ax = fig.add_subplot(111, projection='3d')
surf = ax.plot_surface(X, Y, Z, cmap='coolwarm')

# Add a colorbar
cbar = fig.colorbar(surf, ax=ax, shrink=0.8, aspect=20, pad=0.1)

ax.set_xlabel('Length')
ax.set_ylabel('Value')
ax.set_zlabel('Signal')
ax.set_title(f'Slice at Time Index {slice_index}')

plt.show()

In [None]:
slice_index = 5000  # Select the time slice at index 5000

# Select the 2D frame at that time step (un-restored values from reshaped_signal_data)
Z = reshaped_signal_data[slice_index, :, :]

# Pixel coordinate grids matching Z
X = np.arange(Z.shape[1])  # image column index
Y = np.arange(Z.shape[0])  # image row index
X, Y = np.meshgrid(X, Y)  # Creating a grid

# Draw the frame as a 3D surface: height and colour both encode the pixel value
fig = plt.figure(figsize=(12, 8), dpi=80)
ax = fig.add_subplot(111, projection='3d')
surf = ax.plot_surface(X, Y, Z, cmap='coolwarm')

# Add a colorbar
cbar = fig.colorbar(surf, ax=ax, shrink=0.8, aspect=20, pad=0.1)

ax.set_xlabel('Length')
ax.set_ylabel('Value')
ax.set_zlabel('Signal')
ax.set_title(f'Slice at Time Index {slice_index}')

plt.show()

From the 3D figure, we can see that the FGS1 signal is tapered in each time slice. **Assuming that all planets are distributed in this way (to be verified later)**, we extract peaks as features of the training data.

In [None]:
# NOTE(review): indexing the first axis selects the frames at time steps 0, 1 and 2
# (three 32x32 images), not the time / row / column axes that the names suggest.
# `time` also rebinds the name of the `time` module imported at the top of the notebook,
# and `value` is overwritten by the `for key, value in results.items()` loops further down.
# Confirm whether these variables are needed at all.
time = reshaped_signal_data[0]
value = reshaped_signal_data[1]
length = reshaped_signal_data[2]

In [None]:
# Average over the time axis (axis 0) -> a single mean 32x32 image.
length_value_matrix_avg = np.mean(reshaped_signal_data, axis=0)

plt.matshow(length_value_matrix_avg, cmap='coolwarm', origin='lower', fignum=1)
plt.xlabel('Length')
plt.ylabel('Value')
plt.title('Length vs Value')
plt.colorbar()
plt.show()

In [None]:
# Average over the image rows (axis 1) -> array of shape (time step, image column).
length_time_matrix_avg = np.mean(reshaped_signal_data, axis=1)

# Transpose so that time runs along the x axis.
plt.matshow(length_time_matrix_avg.T, cmap='coolwarm', origin='lower', aspect='auto')
plt.xlabel('Time')
plt.ylabel('Length')
plt.title('Time vs Length')
plt.colorbar()
plt.show()

In [None]:
# Average over the image columns (axis 2) -> array of shape (time step, image row).
value_time_matrix_avg = np.mean(reshaped_signal_data, axis=2)
# Transpose so that time runs along the x axis.
plt.matshow(value_time_matrix_avg.T, cmap='coolwarm', origin='lower', aspect='auto')
plt.xlabel('Time')
plt.ylabel('Value')
plt.title('Time vs Value')
plt.colorbar()
plt.show()

In [None]:
# Brightest pixel of every frame: maximum over both image axes -> one value per time step.
max_values = np.max(reshaped_signal_data, axis=(1, 2))

print(max_values, "shape:", max_values.shape)
plt.figure(figsize=(12, 2))
plt.plot(max_values)
plt.xlabel("Time")
plt.ylabel('Maximum Value')
plt.show()

haha, that's a lot like a bunch of white noise. To deal with these timing signals later, I first prepare a stationarity test.

* **ADF test**: Used to test whether the time series is stationary. If the p-value is less than the significance level alpha, then the time series is considered stationary.
* **KPSS test**: Used to test whether the time series is stationary. If the p-value is greater than the significance level alpha, then the time series is considered stationary.
* **Stationarity judgment**: If the results of both the ADF test and the KPSS test indicate that the time series is stationary, then the comprehensive judgment is that the time series is stationary.

In [None]:
def check_stationarity(data, alpha=0.05):
    """
    Check the stationarity of a time series with the ADF and KPSS tests.

    Parameters:
    data: pandas Series or numpy array, the time series data.
    alpha: significance level, default is 0.05.

    Returns:
    dict: A dictionary with three string entries:
        'ADF Result'  - statistic, p-value, critical values and verdict of the ADF test,
        'KPSS Result' - the same for the KPSS test; the p-value line carries a note
                        when statsmodels reported that the p-value was clipped,
        'Overall Stationarity' - 'Stationary' only if both tests agree
                        (ADF p-value < alpha and KPSS p-value > alpha), else 'Non-stationary'.
    """

    # ADF test: the series is judged stationary when the p-value is below alpha.
    adf_result = adfuller(data)
    adf_statistic = adf_result[0]
    adf_p_value = adf_result[1]
    adf_critical_values = adf_result[4]

    adf_result_str = (
        f'ADF Statistic: {adf_statistic}\n'
        f'p-value: {adf_p_value}\n'
        f'Critical Values: {adf_critical_values}\n'
        f'ADF Test: {"Stationary" if adf_p_value < alpha else "Non-stationary"}'
    )

    # KPSS test: record warnings instead of printing them so they can be inspected below.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter('always')
        kpss_result = kpss(data, regression='c')

    kpss_statistic = kpss_result[0]
    kpss_p_value = kpss_result[1]
    kpss_critical_values = kpss_result[3]

    # 'InterpolationWarning' is the *category* of the warning, not part of its message,
    # so it has to be matched on the category name. The message itself says whether the
    # true p-value is smaller or greater than the clipped value that was returned.
    p_value_note = ''
    for warning in caught:
        if warning.category.__name__ == 'InterpolationWarning':
            direction = 'greater' if 'greater' in str(warning.message) else 'smaller'
            p_value_note = f' (Warning: actual p-value is {direction} than this)'
            break

    kpss_result_str = (
        f'KPSS Statistic: {kpss_statistic}\n'
        f'p-value: {kpss_p_value}{p_value_note}\n'
        f'Critical Values: {kpss_critical_values}\n'
        f'KPSS Test: {"Stationary" if kpss_p_value > alpha else "Non-stationary"}'
    )

    # Stationarity judgement: both tests have to point to a stationary series.
    is_stationary = (
        adf_p_value < alpha and kpss_p_value > alpha
    )

    return {
        'ADF Result': adf_result_str,
        'KPSS Result': kpss_result_str,
        'Overall Stationarity': 'Stationary' if is_stationary else 'Non-stationary'
    }

In [None]:
# Check stationarity
# Run both tests on the per-frame maxima and print each section of the report.
results = check_stationarity(max_values)
for key, value in results.items():
    print(f'{key}:\n{value}\n')

Although the ADF test indicates that the time series is stationary, the KPSS test indicates that the time series is non-stationary. Since the KPSS test is particularly **sensitive to low-frequency trends**, it is more likely to reject the stationarity hypothesis in the presence of a trend or nonstationarity component. Let's take a look at the signal peaks at odd and even times separately.

In [None]:
# Split the peak series into the odd-indexed and the even-indexed time steps.
max_values_odd = max_values[1::2]
max_values_even = max_values[0::2]

plt.figure(figsize=(12, 4))

# Plot each half against its original time indices so both share one time axis.
plt.plot(range(1, len(max_values), 2), max_values_odd, label='Odd Time Indices', color='blue')

plt.plot(range(0, len(max_values), 2), max_values_even, label='Even Time Indices', color='green')

plt.title('Signal Peaks at Odd and Even Time Indices')
plt.xlabel('Time')
plt.ylabel('Maximum Value')

plt.legend()
plt.show()

In [None]:
# Check stationarity
# Same check, restricted to the odd-indexed time steps.
results = check_stationarity(max_values_odd)
for key, value in results.items():
    print(f'{key}:\n{value}\n')

In [None]:
# Check stationarity
# Same check, restricted to the even-indexed time steps.
results = check_stationarity(max_values_even)
for key, value in results.items():
    print(f'{key}:\n{value}\n')

It still looks like white noise, and I don't know how to extract features from it yet, so I will try **filtering**. Because we have selected the peak values, I want to remove the high-frequency noise and keep the low-frequency component, that is, use a **low-pass filter**.

In [None]:
def butter_lowpass(cutoff, fs, order=5):
    """Design a digital Butterworth low-pass filter.

    Parameters:
    - cutoff: cutoff frequency, in the same unit as fs.
    - fs: sampling frequency; the cutoff is normalized by the Nyquist frequency (fs / 2).
    - order: filter order (default is 5).

    Returns:
    - (b, a): numerator and denominator coefficients as returned by scipy.signal.butter.
    """
    # scipy expects the critical frequency as a fraction of the Nyquist frequency.
    normalized_cutoff = cutoff / (0.5 * fs)
    return butter(order, normalized_cutoff, btype='low', analog=False)

def lowpass_filter(data, cutoff, fs, order=5):
    """Low-pass filter `data` with a Butterworth filter applied forward and backward.

    Parameters:
    - data: the input signal (1D array or list).
    - cutoff: cutoff frequency, in the same unit as fs.
    - fs: sampling frequency.
    - order: order of the Butterworth design (default is 5).

    Returns:
    - The filtered signal returned by scipy.signal.filtfilt.
    """
    numerator, denominator = butter_lowpass(cutoff, fs, order=order)
    return filtfilt(numerator, denominator, data)

In [None]:
# NOTE(review): the FGS1 description in this notebook states 0.1 second time steps, which
# would correspond to fs = 10 Hz (and half of that for the odd/even sub-series). With
# fs = 1000 and cutoff = 1 the filter keeps only frequencies below 0.002 of the Nyquist
# frequency. Confirm that this is the intended cutoff before interpreting it in Hz.
fs = 1000.0  # Frequency of sampling
cutoff = 1.0  # Frequency of cutoff
filtered_data_odd = lowpass_filter(max_values_odd, cutoff, fs)
filtered_data_even = lowpass_filter(max_values_even, cutoff, fs)
filtered_data_all = lowpass_filter(max_values, cutoff, fs)

In [None]:
# Low-pass filtered peaks of the odd-indexed time steps.
plt.figure(figsize=(12, 2))
plt.plot(filtered_data_odd, label='Odd Time Indices', color='blue')
plt.xlabel("Time")
plt.ylabel('Maximum Value')
plt.legend()
plt.show()

# Low-pass filtered peaks of the even-indexed time steps
# (the legend label previously read 'Odd Time Indices' by mistake).
plt.figure(figsize=(12, 2))
plt.plot(filtered_data_even, label='Even Time Indices', color='green')
plt.xlabel("Time")
plt.ylabel('Maximum Value')
plt.legend()
plt.show()

# Low-pass filtered peaks of the full series.
plt.figure(figsize=(12, 2))
plt.plot(filtered_data_all, label='All Time Indices', color='red')
plt.xlabel("Time")
plt.ylabel('Maximum Value')
plt.legend()
plt.show()

In [None]:
# Check stationarity
# Repeat the check on the low-pass filtered full series.
results = check_stationarity(filtered_data_all)
for key, value in results.items():
    print(f'{key}:\n{value}\n')

In [None]:
# Check stationarity
# Repeat the check on the filtered series with the first 40000 time steps dropped.
results = check_stationarity(filtered_data_all[40000:])
for key, value in results.items():
    print(f'{key}:\n{value}\n')

haha, the fluctuation before 40000 steps is bound to make the data difficult to smooth, even after we delete this part still cannot pass KPSS. However, it is worth noting that the **follow-up work should look at more data to see if there is a similar shake before 40000 steps**. If it is widespread, it may be due to systematic errors caused by some climatic and equipment problems. Things seem to be getting better. Let me try the **Fourier Transform** and **Wavelet Transform**.

**Fourier Transform**
* Application: Fourier transform is suitable for analyzing stationary signals with fixed frequency components. It converts time domain signals into frequency domain signals to provide spectral information of the whole time series.
* Limitations: For non-stationary signals, the Fourier transform is less effective because it fails to provide the local characteristics of the signal in time. Even if the frequency components in the signal change over time, the Fourier transform cannot detect these changes.

**Wavelet Transform**
* Application: Wavelet transform is suitable for analyzing signals with local non-stationary characteristics. It can provide multi-resolution analysis of the signal in the time-frequency domain and is able to capture the signal variation in time and frequency simultaneously.
* Advantages: Wavelet transform is particularly suitable for dealing with non-stationary signals because it provides information on time-frequency localization and can detect transient features and changes in the signal.

Passing the ADF test but not the KPSS test indicates that your time series may contain some complex, time-varying characteristics or trends. The Fourier transform may not be sufficient to reveal these characteristics over time, since it assumes that the signal is stationary, and it may be more appropriate to use the wavelet transform to analyze your time series. **Different levels can be tried in the future and the best level can be decided by analyzing the validity of the results.**

In [None]:
def perform_wavelet_transform(data, wavelet='db1'):
    """
    Decompose the given data with a multi-level discrete wavelet transform and
    plot every coefficient array in its own panel.

    Parameters:
    - data: The input time series data (1D array or list).
    - wavelet: The type of wavelet to use (default is 'db1' for Daubechies wavelet).

    The panel titles number the arrays in the order pywt.wavedec returns them,
    starting at 0. Nothing is returned; the figure is shown.
    """
    # No level is given, so pywt chooses the decomposition depth itself.
    coefficient_arrays = pywt.wavedec(data, wavelet)
    panel_count = len(coefficient_arrays)

    # One stacked panel per coefficient array.
    plt.figure(figsize=(12, 18))
    for panel, values in enumerate(coefficient_arrays, start=1):
        plt.subplot(panel_count, 1, panel)
        plt.plot(values)
        plt.title(f'Wavelet Coefficient Level {panel - 1}')

    plt.tight_layout()
    plt.show()

In [None]:
# Wavelet decomposition of the raw per-frame maxima.
perform_wavelet_transform(max_values)

In [None]:
# Wavelet decomposition of the low-pass filtered maxima.
perform_wavelet_transform(filtered_data_all)

Here I want to try the wavelet coefficient level 6 for the original data first. (Note: the code below currently uses `level=8`.)

In [None]:
def get_wavelet_coefficients_at_level(data, wavelet='db1', level=8):
    """
    Perform a wavelet decomposition of the given depth and return the last
    coefficient array of the result.

    Parameters:
    - data: The input time series data (1D array or list).
    - wavelet: The type of wavelet to use (default is 'db1' for Daubechies wavelet).
    - level: The number of decomposition levels (default is 8).

    Returns:
    - level_coeffs: The last entry of the list returned by pywt.wavedec. With the
      ordering [cA_n, cD_n, cD_(n-1), ..., cD_1] this is cD_1, the detail
      coefficients of the finest scale.
    """
    # `level` has to be passed by keyword: the third positional parameter of
    # pywt.wavedec is `mode`, so a positional 8 was read as a padding-mode code
    # and the requested decomposition depth was ignored.
    coeffs = pywt.wavedec(data, wavelet, level=level)

    # The coefficients are returned in order: [cA_n, cD_n, cD_(n-1), ..., cD_1]
    # where cA_n is the approximation coefficients at level n
    # and cD_i are the detail coefficients at level i.
    # Index -1 therefore selects cD_1; index 0 would be the approximation cA_n.
    level_coeffs = coeffs[-1]

    return level_coeffs

In [None]:
# Wavelet coefficients of the raw and of the low-pass filtered peak series.
w_data = get_wavelet_coefficients_at_level(max_values, wavelet='db1', level=8)
wf_data = get_wavelet_coefficients_at_level(filtered_data_all, wavelet='db1', level=8)

plt.figure(figsize=(12, 2))

# NOTE(review): w_data and wf_data are coefficient arrays, not samples on the original
# time grid, so their x positions (array index) and amplitudes are presumably not
# directly comparable with filtered_data_all on this shared axis. Confirm.
plt.plot(w_data, label='Wave', color='grey')
plt.plot(filtered_data_all, label='Filter', color='red')
plt.plot(wf_data, label='Wave+Filter', color='orange')

plt.xlabel("Time")
plt.ylabel('Maximum Value')
plt.legend()
plt.show()

The wavelet transform is not so obvious at the moment, so I'll just use the filter data.

In [None]:
# # Calculate the Fourier transform
# fs = 1000.0  # Sampling frequency (assuming 100 Hz)
# T = 100.0 / fs  # Period of sampling
# n = len(max_values_odd)  # Length of signal
# yf = fft(max_values_odd)
# xf = fftfreq(n, T)[:n//2]

# plt.figure(figsize=(12, 6))
# plt.plot(xf, 2.0/n * np.abs(yf[:n//2]))
# plt.title('Fourier Transform of max_values_odd')
# plt.xlabel('Frequency (Hz)')
# plt.ylabel('Amplitude')
# plt.grid()
# plt.show()

## [train/test]/[planet_id]/AIRS-CH0_signal.parquet
Signal data from the AIRS-CH0 instrument. Each file contains 11,250 rows of images captured at constant time steps (see the axis_info.parquet file for details of the time steps). Each 32 x 356 image has been flattened into 11392 columns. You can un-flatten the data with numpy.reshape(11250, 32, 356). The instruments generate data as uint16. To restore the full dynamic range you must multiply the data by the matching **gain** value from **[train/test]_adc_info.csv** and then add the **offset** value, also from **[train/test]_adc_info.csv**.

In [None]:
# Load the AIRS-CH0 signal of planet 785834 (one flattened image per row).
train_785834_AIRS = pd.read_parquet('/kaggle/input/ariel-data-challenge-2024/train/785834/AIRS-CH0_signal.parquet')
print(train_785834_AIRS.shape)
# Preview the first five rows.
train_785834_AIRS.head(5)

In [None]:
# NOTE: signal_array, reshaped_signal_data and data are re-bound here, replacing the
# FGS1 arrays of the same names defined earlier in the notebook.
signal_array = train_785834_AIRS.values

# Un-flatten every row of 11392 values into a 32x356 image -> (time step, image row, image column).
reshaped_signal_data = signal_array.reshape(11250, 32, 356)

# Show the frame at time step 0. Gain and offset have not been applied at this point.
data = reshaped_signal_data[0]
plt.matshow(data, cmap="coolwarm")
plt.colorbar()
plt.title('Restored Signal Image at 0th Time Step')
plt.show()

In [None]:
# Layout of train_adc_info as noted by the author (presumably its printed shape and columns):
#(673, 5)
#FGS1_adc_offset	FGS1_adc_gain	AIRS-CH0_adc_offset	AIRS-CH0_adc_gain	
#star planet_id	

planet_id = 785834

# ADC parameters of the AIRS-CH0 instrument for this planet.
gain = train_adc_info.loc[planet_id, 'AIRS-CH0_adc_gain']
offset = train_adc_info.loc[planet_id, 'AIRS-CH0_adc_offset']

# Restore the dynamic range: multiply by the gain, then add the offset (broadcast over all frames).
original_signal_data = reshaped_signal_data * gain + offset

print(original_signal_data.shape)

In [None]:
# Show the frame at time step 0 of the gain/offset-restored data. The original cell
# indexed reshaped_signal_data here, which re-plotted the un-restored image under a
# "Restored" title.
data = original_signal_data[0]
plt.matshow(data, cmap="coolwarm")
plt.colorbar()
plt.title('Restored Original Signal Image at 0th Time Step')
plt.show()

In [None]:
slice_index = 0  # Select the first time slice

# Select the 2D frame at that time step (un-restored values from reshaped_signal_data)
Z = reshaped_signal_data[slice_index, :, :]

# Pixel coordinate grids matching Z
X = np.arange(Z.shape[1])  # image column index
Y = np.arange(Z.shape[0])  # image row index
X, Y = np.meshgrid(X, Y)  

# Draw the frame as a 3D surface: height and colour both encode the pixel value
fig = plt.figure(figsize=(12, 8), dpi=80)
ax = fig.add_subplot(111, projection='3d')
surf = ax.plot_surface(X, Y, Z, cmap='coolwarm')

# Add a colorbar
cbar = fig.colorbar(surf, ax=ax, shrink=0.8, aspect=20, pad=0.1)

ax.set_xlabel('Length')
ax.set_ylabel('Value')
ax.set_zlabel('Signal')
ax.set_title(f'Slice at Time Index {slice_index}')

plt.show()

In [None]:
# NOTE(review): indexing the first axis selects the frames at time steps 0, 1 and 2
# (three 32x356 images), not the time / row / column axes that the names suggest.
# `time` also rebinds the name of the `time` module imported at the top of the notebook.
# Confirm whether these variables are needed at all.
time = reshaped_signal_data[0]
value = reshaped_signal_data[1]
length = reshaped_signal_data[2]

In [None]:
# Average over the time axis (axis 0) -> a single mean 32x356 image.
length_value_matrix_avg = np.mean(reshaped_signal_data, axis=0)

plt.matshow(length_value_matrix_avg, cmap='coolwarm', origin='lower', fignum=1)
plt.xlabel('Length')
plt.ylabel('Value')
plt.title('Length vs Value')
plt.colorbar()
plt.show()

In [None]:
# Average over the image rows (axis 1) -> array of shape (time step, image column).
length_time_matrix_avg = np.mean(reshaped_signal_data, axis=1)

# Transpose so that time runs along the x axis.
plt.matshow(length_time_matrix_avg.T, cmap='coolwarm', origin='lower', aspect='auto')
plt.xlabel('Time')
plt.ylabel('Length')
plt.title('Time vs Length')
plt.colorbar()
plt.show()

In [None]:
# Average over the image columns (axis 2) -> array of shape (time step, image row).
value_time_matrix_avg = np.mean(reshaped_signal_data, axis=2)
# Transpose so that time runs along the x axis.
plt.matshow(value_time_matrix_avg.T, cmap='coolwarm', origin='lower', aspect='auto')
plt.xlabel('Time')
plt.ylabel('Value')
plt.title('Time vs Value')
plt.colorbar()
plt.show()

## [train/test]/[planet_id]/[AIRS-CH0/FGS1]_calibration/dark.parquet
Dark frames are exposures taken with the shutter closed, capturing the thermal noise and bias level of the sensor. These are used to subtract the dark current from science images.

*--Details--*
* **Thermal noise correction**: The dark frame records the thermal noise of the sensor in the absence of light. These noises are generated by the electronic components of the sensor when exposed to ambient temperature. Thermal noise in the scientific image needs to be subtracted from the observed image to improve the accuracy of the data.
* **Bias correction**: The dark frame also captures the sensor's bias level (that is, the signal baseline under zero light conditions). In actual observations, this bias signal will have an impact on the image, so it needs to be subtracted from the scientific image to eliminate the effect of baseline drift on the data.
* **Dark frame correction**: Using read dark frame data to correct scientific images. This typically involves subtracting dark frame data from scientific images to remove thermal noise and biased signals.

In [None]:
# Load the AIRS-CH0 dark-frame calibration of planet 785834.
train_785834_dark = pd.read_parquet('/kaggle/input/ariel-data-challenge-2024/train/785834/AIRS-CH0_calibration/dark.parquet')
print(train_785834_dark.shape)
train_785834_dark.head(5)

In [None]:
# Show the dark frame as an image (note: `data` is re-bound again here).
data = train_785834_dark.values  
plt.matshow(data, cmap="coolwarm")
plt.colorbar()
plt.show()

## [train/test]/[planet_id]/[AIRS-CH0/FGS1]_calibration/dead.parquet
Identifies dead or hot pixels on the sensor. Dead pixels do not respond to light, while hot pixels consistently produce high signal levels regardless of incoming light.

*--Details--*
* **Dead Pixels**: are pixels that do not respond to light. In the image, these pixels usually appear as constant values, which may be zero or a fixed value. They cannot capture any actual optical signal when collecting data, thus affecting the quality of the image.
* **Hot Pixels**: produce unusually high signal levels, even in the absence of light. These pixels are usually caused by electronic noise or malfunction of the sensor. Hot pixels appear as bright spots in images and may interfere with data analysis and scientific measurements.
* **Image correction**: In the actual image processing process, you can use this data to correct scientific images. For example, for dead pixels, the value can be replaced by the average of neighboring pixels; For hot pixels, an appropriate algorithm can be used to reduce their impact.

In [None]:
# Load the AIRS-CH0 dead/hot pixel map of planet 785834.
train_785834_dead = pd.read_parquet('/kaggle/input/ariel-data-challenge-2024/train/785834/AIRS-CH0_calibration/dead.parquet')
print(train_785834_dead.shape)
train_785834_dead.head(5)

In [None]:
# Show the pixel map as an image; the two-tone "binary" colormap suits a flag-like map.
data = train_785834_dead.values  
plt.matshow(data, cmap="binary")
plt.colorbar()
plt.show()

## [train/test]/[planet_id]/[AIRS-CH0/FGS1]_calibration/flat.parquet
Flat field frames are created by imaging a uniformly illuminated surface. They are used to correct for variations in pixel-to-pixel sensitivity and optical system irregularities.

--Details--
* Uneven image brightness caused by sensors or optical components.
* **The flat-field frame**: records the relative sensitivity of each pixel and the irregularity of the optical system.
* **Image Correction**: Each pixel value of the scientific image was divided by the normalized flat field frame value at the corresponding position.

In [None]:
# Load the AIRS-CH0 flat-field calibration of planet 785834.
train_785834_flat = pd.read_parquet('/kaggle/input/ariel-data-challenge-2024/train/785834/AIRS-CH0_calibration/flat.parquet')
print(train_785834_flat.shape)
train_785834_flat.head(5)

In [None]:
# Show the flat-field frame as an image.
data = train_785834_flat.values  
plt.matshow(data, cmap="coolwarm")
plt.colorbar()
plt.show()

## [train/test]/[planet_id]/[AIRS-CH0/FGS1]_calibration/linear_corr.parquet
Information about the linearity correction of the sensor. The response of the pixels in the detector becomes less linear as they fill with electrons, approaching the point of saturation, where the pixel can no longer collect additional electrons and its response to light becomes flat. For an accurate estimate of the signal, the instrument's response as a function of the received charge is calibrated, and the correction is calculated using a polynomial of degree n. This polynomial allows for the conversion of the number of electrons collected/measured by the pixel into the number of electrons that the detector would have generated with a linear response.

*--Details--*
* $P(x)=a_0+a_1x+a_2x^2+\dots+a_nx^n$ where $x$ is the number of electrons measured by the pixel, $P(x)$ is the corrected number of electrons, and $a_0,\dots,a_n$ are the calibrated polynomial coefficients.
* **Image Correction**: For each pixel in the scientific image, the nonlinear response is converted to a linear response using the calibrated polynomial parameters corrected.

In [None]:
# Load the AIRS-CH0 linearity-correction calibration of planet 785834.
train_785834_linear_corr = pd.read_parquet('/kaggle/input/ariel-data-challenge-2024/train/785834/AIRS-CH0_calibration/linear_corr.parquet')
print(train_785834_linear_corr.shape)
train_785834_linear_corr.head(5)

In [None]:
# Render the linearity-correction table as a heatmap.
data = train_785834_linear_corr.values
plt.matshow(data, cmap="coolwarm")
plt.colorbar()
# Title the figure so it can be told apart from the other calibration heatmaps.
plt.title("AIRS-CH0 linearity correction (planet 785834)")
plt.show()

## [train/test]/[planet_id]/[AIRS-CH0/FGS1]_calibration/read.parquet
Read noise frames capture the electronic noise introduced during the readout process of the sensor. This noise is present even when no light falls on the detector.

*--Details--*
* **Objective**: By recording read-noise frames, the noise characteristics of the sensor can be understood and corrected for in the actual scientific images.
* **Methods**: In scientific image processing, the noise signal in the read noise frame can be subtracted from the actual image, so as to improve the signal-to-noise ratio (SNR) of the image signal.

In [None]:
# Load the AIRS-CH0 read-noise calibration frame of planet 785834,
# then report its shape and preview the first five rows.
train_785834_read = pd.read_parquet(
    path='/kaggle/input/ariel-data-challenge-2024/train/785834/AIRS-CH0_calibration/read.parquet'
)
print(train_785834_read.shape)
train_785834_read.head()  # head() defaults to five rows

In [None]:
# Render the read-noise frame as a heatmap (one cell per detector value).
data = train_785834_read.values
plt.matshow(data, cmap="coolwarm")
plt.colorbar()
# Title the figure so it can be told apart from the other calibration heatmaps.
plt.title("AIRS-CH0 read noise (planet 785834)")
plt.show()

# Data processing: extract the signal sequence from the two-dimensional image

As a first experiment, only **FGS1_signal** is processed. This approach certainly discards a lot of useful information, because each two-dimensional image is reduced to a single value per time step. Don't worry: this is just a starting point for experimentation. The steps are:

1.  ***Data directory structure and file reading***:
The data is stored in the /kaggle/input/ariel-data-challenge-2024/train folder.
The name of each subfolder is the ID of the planet, and the subfolder contains a file named FGS1_signal.parquet, which stores the signal data for the planet.
1.  ***Reshaping the data***:
Read the FGS1_signal.parquet file and load its values as signal_array.
Reshape signal_array into a 3D array, reshaped_signal_data, with the shape (135000, 32, 32). Here, 135000 is the number of time steps and 32x32 is the two-dimensional spatial dimension of the signal.
1. ***Maximum value extraction***:
The maximum value of each time step is extracted from reshaped_signal_data to obtain a one-dimensional array max_values with the shape (135000,). This operation is to calculate the maximum value on the (1, 2) axis, that is, to calculate the maximum value on the spatial dimension.
1. ***Filtering of noise***:
The max_values are filtered for noise using a low-pass filter. The design parameters of low-pass filter include cutoff frequency, sampling frequency fs and filter order.
Define the lowpass_filter function lowpass_filter(data, cutoff, fs, order=5) and enter max_values into the function to obtain the filtered data filtered_data_all.
1. ***Store filtered data***:
Create a new DataFrame output_df indexed by the ID of the planet, with each row corresponding to the filtered_data_all of the planet.
Iterate through the folders of all planets, perform the above steps, and store the filtered_data_all for each planet into output_df.
1. ***Save the result***:
Save the final DataFrame output_df to the /kaggle/working/ folder for later use and analysis.

In [None]:
# # try  
####################### train_labels = train_labels.head(5)       

In [None]:
%%time
# Define file paths and parameters
input_dir = '/kaggle/input/ariel-data-challenge-2024/train'

# One output row per planet listed in train_labels (planet_id is its index).
planet_ids = train_labels.index
# Number of FGS1 time steps per planet, i.e. the length of every output row.
num_columns = 135000
print("Build:")
# Allocate the result as float64 up front. Without an explicit dtype the
# empty frame is object-typed, so every stored sample becomes a boxed Python
# object: several times the memory and much slower row assignment.
output_df = pd.DataFrame(index=planet_ids, columns=range(num_columns), dtype=float)

# Parameters for low-pass filter, passed to lowpass_filter(data, cutoff, fs).
# NOTE(review): units are not stated anywhere in this notebook; presumably
# both are in the same frequency unit — confirm against lowpass_filter.
cutoff = 1.0
fs = 1000.0

In [None]:
####################### output_df = output_df.head(5)     
####################### print(output_df.shape)        

In [None]:
%%time
# For every planet: load its FGS1 signal, reduce each 32x32 frame to its
# brightest pixel, low-pass filter the resulting series and store it as the
# planet's row in output_df.
print("Start:")
missing_planet_ids = []  # planets whose FGS1_signal.parquet was not found
for planet_id in tqdm(planet_ids):
    planet_dir = os.path.join(input_dir, str(planet_id))
    signal_file = os.path.join(planet_dir, 'FGS1_signal.parquet')

    if not os.path.isfile(signal_file):
        # Remember the gap instead of skipping silently: the planet's row in
        # output_df stays NaN and the reader should know about it.
        missing_planet_ids.append(planet_id)
        continue

    # Step 1: Load and reshape the data to (time step, 32, 32).
    # num_columns is the same frame count output_df was allocated with, so
    # the series length always matches the row length.
    signal_data = pd.read_parquet(signal_file)
    signal_array = signal_data.values
    reshaped_signal_data = signal_array.reshape(num_columns, 32, 32)

    # Step 2: Extract the maximum value of every frame (over both spatial axes)
    max_values = np.max(reshaped_signal_data, axis=(1, 2))

    # Step 3: Filter the noise
    filtered_data_all = lowpass_filter(max_values, cutoff, fs)

    # Step 4: Store the filtered data in the DataFrame
    output_df.loc[planet_id] = filtered_data_all

if missing_planet_ids:
    print(f'Missing FGS1_signal.parquet for {len(missing_planet_ids)} planet(s), '
          f'rows left empty: {missing_planet_ids[:10]}')

# planet_id deliberately stays as the index (no reset_index), so rows remain
# addressable by planet.

# Display the resulting DataFrame
print(output_df.shape)
output_df.head(5)

In [None]:
# Persist the filtered series as CSV; the relative path resolves against the
# notebook's working directory.
# NOTE(review): index=False means the planet_id index is not written, so the
# file only keeps the row order — confirm that downstream readers expect this.
output_file_path = 'FGS1_time_max_filtered.csv'
output_df.to_csv(path_or_buf=output_file_path, index=False)
print(f'Saved the output DataFrame to {output_file_path}')

In [None]:
# Overlay the filtered series of the first (at most) five planets.
fig, ax = plt.subplots(figsize=(12, 4))
for row_label, row_values in output_df.head(5).iterrows():
    ax.plot(output_df.columns, row_values, label=f'Row {row_label}')

ax.legend(title='Index')
ax.set_title('Line Plots of the First Five Rows')
ax.set_xlabel('Columns')
ax.set_ylabel('Values')
ax.grid(True)
plt.show()

In [None]:
# Forward-fill missing values and force both frames to float.
# .ffill() replaces the deprecated fillna(method='ffill') with identical
# results. With the default axis it fills down each column, so a planet whose
# row was left empty by the extraction loop inherits the values of the planet
# in the row above it.
output_df = output_df.ffill().astype(float)
train_labels = train_labels.ffill().astype(float)

# Data processing: extract time series features

In [None]:
def extract_features(df):
    """Summarise every row of ``df`` with a fixed set of scalar statistics.

    Each row is treated as one numeric series. The result has the same index
    as ``df`` and one column per statistic, in this order: ``mean``, ``std``,
    ``max``, ``min``, ``median``, ``skewness``, ``kurtosis``, ``iqr``, ``cv``,
    ``rms``, ``energy``, ``zero_crossing_rate``, ``first_quartile``,
    ``third_quartile``, ``autocorrelation``.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame; rows are series, columns are consecutive samples.

    Returns
    -------
    pandas.DataFrame
        One row of features per input row.

    Notes
    -----
    ``zero_crossing_rate`` keeps its historical column name but holds a
    *count*: the number of adjacent sample pairs whose product is negative.
    Pairs that contain an exact zero are therefore not counted.
    """

    def _coefficient_of_variation(row):
        # Population std over mean; defined as 0 when the mean is exactly 0.
        row_mean = np.mean(row)
        return np.std(row) / row_mean if row_mean != 0 else 0

    def _sign_change_count(row):
        # Work on the raw ndarray. Multiplying two shifted pandas slices
        # aligns them on their labels, which pairs every sample with itself
        # rather than with its neighbour and always yields zero crossings.
        samples = row.to_numpy(dtype=float)
        return int(np.count_nonzero(samples[:-1] * samples[1:] < 0))

    features = pd.DataFrame(index=df.index)

    features['mean'] = df.mean(axis=1)
    features['std'] = df.std(axis=1)
    features['max'] = df.max(axis=1)
    features['min'] = df.min(axis=1)
    features['median'] = df.median(axis=1)
    features['skewness'] = df.skew(axis=1)
    features['kurtosis'] = df.kurtosis(axis=1)

    # Compute the quartiles once and reuse them for the IQR and the two
    # quartile columns.
    first_quartile = df.apply(lambda x: np.percentile(x, 25), axis=1)
    third_quartile = df.apply(lambda x: np.percentile(x, 75), axis=1)

    features['iqr'] = third_quartile - first_quartile
    features['cv'] = df.apply(_coefficient_of_variation, axis=1)
    features['rms'] = df.apply(lambda x: np.sqrt(np.mean(np.square(x))), axis=1)
    features['energy'] = df.apply(lambda x: np.sum(np.square(x)), axis=1)

    features['zero_crossing_rate'] = df.apply(_sign_change_count, axis=1)

    features['first_quartile'] = first_quartile
    features['third_quartile'] = third_quartile

    # Lag-1 autocorrelation of each row.
    features['autocorrelation'] = df.apply(lambda x: x.autocorr(lag=1), axis=1)

    return features

In [None]:
# Compute the per-row summary features of output_df; the resulting DataFrame
# is the cell's displayed output (it is not assigned to a variable here).
extract_features(output_df)