In [None]:
import itertools
import os
import random
from functools import partial

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats
from scipy.optimize import minimize
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_predict
from tqdm.auto import tqdm

In [None]:
import cupy as cp
import polars as pl
from astropy.stats import sigma_clip


def ADC_convert(signal, gain, offset):
    """Rescale raw detector counts with a gain and an offset, then floor at zero.

    The division and the addition are in-place operations, so the array passed
    in as ``signal`` is modified by this call. The value returned is the result
    of ``signal.clip(0)``.

    Args:
        signal: Array of raw values; updated in place.
        gain: Divisor applied to every element.
        offset: Value added to every element after the division.

    Returns:
        ``signal`` after rescaling, with every negative element replaced by 0.
    """
    # In-place on purpose of equivalence: keeps the input buffer and its dtype.
    signal /= gain
    signal += offset
    non_negative = signal.clip(0)
    return non_negative


def mask_hot_dead(signal, dead, dark):
    """Overwrite the columns of ``signal`` that belong to hot or dead pixels with NaN.

    A pixel counts as hot when astropy's ``sigma_clip`` (``sigma=5``,
    ``maxiters=5``) masks its value in ``dark``; it counts as dead when its
    entry in ``dead`` equals ``1.0``. Both masks are flattened to one dimension
    and used to index the second axis of ``signal``.

    Args:
        signal: 2-D array whose second axis is indexed by flattened pixel;
            modified in place.
        dead: Dead-pixel map; entries equal to ``1.0`` mark dead pixels.
        dark: Dark frame. ``dark.get()`` is what gets handed to ``sigma_clip``
            (for a cupy array this is the host copy).

    Returns:
        The same ``signal`` object, with the flagged columns set to NaN.
    """
    clipped_dark = sigma_clip(dark.get(), sigma=5, maxiters=5)
    hot_mask = cp.asarray(clipped_dark.mask.reshape((-1,)))
    dead_mask = cp.asarray((dead == 1.0).reshape((-1,)))

    # Hot pixels first, then dead pixels; a pixel may be in both masks.
    for bad_pixels in (hot_mask, dead_mask):
        signal[:, bad_pixels] = cp.nan
    return signal


def clean_dark(signal, dark, dt):
    """Subtract the dark frame, scaled per row by ``dt``, from ``signal``.

    Args:
        signal: 2-D array; modified in place.
        dark: Dark frame that broadcasts against one row of ``signal``.
        dt: 1-D array with one scale factor per row of ``signal``.

    Returns:
        The same ``signal`` object after the in-place subtraction.
    """
    # dt becomes a column vector so each row gets its own multiple of the dark frame.
    signal -= dt[:, cp.newaxis] * dark
    return signal


def clean_flat(signal, flat):
    """Divide ``signal`` element-wise by ``flat``.

    Unlike the in-place helpers in this file, the division creates a new array
    and leaves the input untouched.

    Args:
        signal: Array to correct.
        flat: Flat-field values that broadcast against ``signal``.

    Returns:
        A new array holding ``signal / flat``.
    """
    return signal / flat


def apply_linear_corr(linear_corr, signal):
    """Evaluate a per-pixel polynomial on ``signal`` using Horner's scheme.

    Row ``k`` of ``linear_corr`` holds the coefficient of ``signal**k`` for
    every pixel, so the result is
    ``sum(linear_corr[k] * signal**k for k in range(linear_corr.shape[0]))``.

    The polynomial order is taken from the number of coefficient rows instead
    of being fixed at 5. For the 6-row layout the computation is unchanged;
    other row counts no longer raise ``IndexError`` (fewer rows) or silently
    drop the high-order terms (more rows).

    Args:
        linear_corr: 2-D array of shape ``(n_coefficients, n_pixels)``, lowest
            power first.
        signal: Array whose last axis has ``n_pixels`` entries.

    Returns:
        The polynomial evaluated element-wise; ``signal`` is not modified.
    """
    highest_power = linear_corr.shape[0] - 1
    # Horner: start from the leading coefficient and fold in the lower ones.
    result = linear_corr[highest_power, :]
    for power in range(highest_power - 1, -1, -1):
        result = result * signal + linear_corr[power, :]
    return result


def bin_obs(signal, binning):
    """Average consecutive groups of ``binning`` rows of a 2-D array.

    Rows left over after the last complete group are dropped. The output
    buffer comes from ``cp.zeros`` with its default dtype, independent of the
    dtype of ``signal``.

    Args:
        signal: 2-D array of shape ``(n_rows, n_columns)``.
        binning: Number of consecutive rows averaged into one output row.

    Returns:
        Array of shape ``(n_rows // binning, n_columns)`` of per-group means.
    """
    n_bins = signal.shape[0] // binning
    signal_binned = cp.zeros((n_bins, signal.shape[1]))
    for bin_idx in range(n_bins):
        first_row = bin_idx * binning
        chunk = signal[first_row : first_row + binning, :]
        signal_binned[bin_idx, :] = cp.mean(chunk, axis=0)
    return signal_binned


def load_signal_data(planet_id, dataset, instrument, adc_info, step_size=30):
    """Load one instrument's frames for a planet and reduce them to a binned 2-D array.

    Processing order (each step feeds the next):

    1. read the signal parquet and the ``flat``, ``dark``, ``dead`` and
       ``linear_corr`` calibration parquets as float32 cupy arrays;
    2. ``ADC_convert`` with the planet's gain/offset from ``adc_info``;
    3. ``clean_dark`` with a per-frame ``dt`` vector;
    4. ``apply_linear_corr``;
    5. ``mask_hot_dead``;
    6. ``clean_flat``;
    7. reshape every frame to ``img_size`` and NaN-mean over the first image axis;
    8. subtract each even-indexed frame from the odd-indexed frame that follows it;
    9. ``bin_obs`` over the resulting rows.

    Reads the module-level names ``PATH`` and ``airs_dt_info``, which must be
    defined before the first call.

    Args:
        planet_id: Row label in ``adc_info`` and directory name under
            ``PATH/dataset``.
        dataset: Sub-directory of ``PATH`` to read from.
        instrument: ``"AIRS-CH0"`` selects the 32x356 image layout; any other
            value is handled with the 32x32 layout.
        adc_info: DataFrame indexed by planet id that has the columns
            ``{instrument}_adc_gain`` and ``{instrument}_adc_offset``.
        step_size: Rows averaged per output row for ``"AIRS-CH0"``; other
            instruments use ``step_size * 12``.

    Returns:
        2-D array with ``img_size[1]`` columns and one row per bin.
    """
    is_airs = instrument == "AIRS-CH0"
    if is_airs:
        binning, img_size = step_size, (32, 356)
    else:
        # (original author note on this value: "4642" -- meaning not documented)
        binning, img_size = step_size * 12, (32, 32)

    planet_adc = adc_info.loc[planet_id]
    gain = planet_adc[f"{instrument}_adc_gain"]
    offset = planet_adc[f"{instrument}_adc_offset"]

    signal_file = f"{PATH}/{dataset}/{planet_id}/{instrument}_signal.parquet"
    calibration_file = f"{PATH}/{dataset}/{planet_id}/{instrument}_calibration"

    def _read_float32(parquet_path):
        # One parquet file as a host-side float32 matrix.
        return pl.read_parquet(parquet_path).cast(pl.Float32).to_numpy()

    signal = cp.array(_read_float32(signal_file))
    # flat / dark / dead are flattened to a single row so they broadcast over frames.
    flat = cp.array(_read_float32(f"{calibration_file}/flat.parquet").reshape((1, -1)))
    dark = cp.array(_read_float32(f"{calibration_file}/dark.parquet").reshape((1, -1)))
    dead = cp.array(_read_float32(f"{calibration_file}/dead.parquet").reshape((1, -1)))
    # Six coefficient rows, one column per flattened pixel.
    linear_corr = cp.array(_read_float32(f"{calibration_file}/linear_corr.parquet").reshape(6, -1))

    # Per-frame factor for the dark subtraction; odd-indexed frames get a larger value.
    if is_airs:
        dt = cp.array(airs_dt_info)
        dt[1::2] += 0.1
    else:
        dt = cp.ones(len(signal)) * 0.1
        dt[1::2] += 4.5

    signal = ADC_convert(signal, gain, offset)
    signal = clean_dark(signal, dark, dt)
    signal = apply_linear_corr(linear_corr, signal)
    signal = mask_hot_dead(signal, dead, dark)
    signal = clean_flat(signal, flat)

    n_frames = signal.shape[0]
    images = signal.reshape((n_frames, img_size[0], img_size[1]))
    # NaN-aware mean so the pixels masked above do not contribute.
    profile = cp.nanmean(images, axis=1)
    paired_difference = profile[1::2] - profile[0::2]
    return bin_obs(paired_difference, binning)


# Root directory of the competition data; load_signal_data builds every file
# path from it. NOTE: the value already ends with "/", so the f-strings that
# append "/..." produce a doubled slash (e.g. ".../ariel-data-challenge-2024//train_adc_info.csv").
PATH = "/kaggle/input/ariel-data-challenge-2024/"
# ADC tables indexed by planet_id; load_signal_data reads the
# "<instrument>_adc_gain" and "<instrument>_adc_offset" columns from them.
train_adc_info = pd.read_csv(f"{PATH}/train_adc_info.csv", index_col="planet_id")
test_adc_info = pd.read_csv(f"{PATH}/test_adc_info.csv", index_col="planet_id")
# Read with polars, then converted to pandas for the column access below.
axis_info = pl.read_parquet(f"{PATH}/axis_info.parquet").to_pandas()
# Non-null entries of the "AIRS-CH0-integration_time" column as a NumPy array;
# load_signal_data copies them into its per-frame `dt` vector for AIRS-CH0.
airs_dt_info = axis_info["AIRS-CH0-integration_time"].dropna().to_numpy()

In [None]:
def generate_signal(mode, adc_info):
    """Preprocess every planet in ``adc_info`` and stack the results.

    For each planet id in ``adc_info.index`` the FGS1 array and the AIRS-CH0
    array returned by ``load_signal_data`` are joined along the last axis
    (FGS1 columns first). The per-planet arrays are then stacked along a new
    leading axis, which requires them all to have the same shape.

    Args:
        mode: Dataset sub-directory forwarded to ``load_signal_data``.
        adc_info: DataFrame indexed by planet id; also forwarded to
            ``load_signal_data``.

    Returns:
        The stacked array, one entry per planet in index order.
    """
    instruments = ("FGS1", "AIRS-CH0")
    per_planet = [
        np.concatenate(
            tuple(load_signal_data(planet_id, mode, instrument, adc_info) for instrument in instruments),
            axis=-1,
        )
        for planet_id in tqdm(adc_info.index)
    ]
    return np.stack(per_planet)

In [None]:
# Run the preprocessing for every planet listed in each ADC table. The first
# argument is the dataset sub-directory that load_signal_data reads from.
train_signal = generate_signal("train", train_adc_info)
test_signal = generate_signal("test", test_adc_info)

In [None]:
# Write the preprocessed arrays to .npy files in the current working directory.
# NOTE(review): the per-planet arrays are created with cupy inside
# load_signal_data and only pass through np.concatenate / np.stack, so these
# are presumably still cupy arrays here -- confirm np.save handles them as
# intended (or convert explicitly) in the target environment.
np.save("train_signal.npy", train_signal)
np.save("test_signal.npy", test_signal)