## Import The Libraries

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import polars as pl
import numpy as np
import torch

In [None]:
from tqdm import tqdm
import pickle
import time
import os
import pickle
import seaborn as sns
import scipy.stats
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_squared_error

## Load & Read The Data

In [None]:
# Root directory of the competition dataset; every input path in this cell is
# built from it so the location only has to be changed in one place.
PATH = "/kaggle/input/ariel-data-challenge-2024"

# Per-planet tables, indexed by planet_id.
train_adc_info = pd.read_csv(f'{PATH}/train_adc_info.csv', index_col='planet_id')
train_labels = pd.read_csv(f'{PATH}/train_labels.csv', index_col='planet_id')

# The column names of this table are reused as the prediction column names
# when a submission frame is assembled in postprocessing().
wavelengths = pd.read_csv(f'{PATH}/wavelengths.csv')

In [None]:
%%writefile process.py
def load_signal_data(planet_id, dataset, instrument, img_size):
    """Load one planet's signal file and reduce it to a 1-D net signal.

    Reads ``{PATH}/{dataset}/{planet_id}/{instrument}_signal.parquet``,
    flattens it, averages every consecutive run of ``img_size`` values, and
    returns the difference between each odd-indexed average and the
    even-indexed average that precedes it.
    """
    parquet_file = f'{PATH}/{dataset}/{planet_id}/{instrument}_signal.parquet'
    flat_values = pl.read_parquet(parquet_file).to_numpy().flatten()

    # One mean per chunk of img_size consecutive values.
    chunk_means = flat_values.reshape(-1, img_size).mean(axis=1)

    # Pair the chunks up: (odd-indexed chunk) minus (even-indexed chunk).
    odd_chunks = chunk_means[1::2]
    even_chunks = chunk_means[0::2]
    return odd_chunks - even_chunks

def read_and_preprocess(dataset, planet_ids, instrument="AIRS-CH0"):
    """Load the net signal of every planet into one float32 matrix.

    Args:
        dataset: Sub-directory of the data root to read from (the notebook
            passes ``'train'`` or ``'test'``).
        planet_ids: Sized iterable of planet identifiers; one output row each,
            in iteration order.
        instrument: ``"FGS1"`` or anything else, which is treated as AIRS-CH0.

    Returns:
        Array of shape ``(len(planet_ids), 67500)`` for FGS1 and
        ``(len(planet_ids), 5625)`` otherwise.
    """
    if instrument == "FGS1":
        img_size, signal_length = 1024, 67500
    else:
        img_size, signal_length = 32 * 356, 5625

    n_planets = len(planet_ids)
    raw_data = np.empty((n_planets, signal_length), dtype=np.float32)

    # Fill the pre-allocated matrix row by row, with a progress bar.
    progress = tqdm(enumerate(planet_ids), total=n_planets)
    for row, planet_id in progress:
        raw_data[row] = load_signal_data(planet_id, dataset, instrument, img_size)

    return raw_data

def feature_engineering(f_raw, a_raw, adc_info):
    """Build the per-planet feature table from the two raw signal matrices.

    Args:
        f_raw: 2-D array, one row per planet (the notebook passes the FGS1
            array). Columns 23500:44000 are used as the obscured window and
            the first/last 20500 columns as the unobscured windows.
        a_raw: 2-D array, one row per planet (the notebook passes the AIRS-CH0
            array). Columns 1958:3666 are the obscured window and the
            first/last 1708 columns the unobscured windows.
        adc_info: DataFrame with one row per planet, in the same row order as
            the arrays. Its index is discarded and the columns at positions
            1..5 are appended to the features.

    Returns:
        DataFrame with five ``f_*`` columns, five ``a_*`` columns (relative
        reduction, signal to noise, variance, skewness, kurtosis) followed by
        the selected ``adc_info`` columns.
    """
    def calculate_stats(data):
        # Row-wise moments. Only the statistics that end up in the feature
        # table are computed, and the DataFrame wrapper is built once.
        frame = pd.DataFrame(data)
        return {
            'variance': data.var(axis=1),
            'skewness': frame.skew(axis=1).values,
            'kurtosis': frame.kurtosis(axis=1).values,
        }

    def calculate_reduction(raw_data, obscured_range, unobscured_range):
        # Relative drop of the obscured-window mean with respect to the
        # unobscured level, plus the unobscured level divided by the row's
        # overall standard deviation.
        obscured_mean = raw_data[:, obscured_range].mean(axis=1)
        leading_mean = raw_data[:, :unobscured_range].mean(axis=1)
        trailing_mean = raw_data[:, -unobscured_range:].mean(axis=1)
        unobscured_mean = (leading_mean + trailing_mean) / 2
        relative_reduction = (unobscured_mean - obscured_mean) / unobscured_mean
        signal_to_noise = unobscured_mean / raw_data.std(axis=1)
        return relative_reduction, signal_to_noise

    f_reduction, f_signal_to_noise = calculate_reduction(f_raw, slice(23500, 44000), 20500)
    a_reduction, a_signal_to_noise = calculate_reduction(a_raw, slice(1958, 3666), 1708)

    f_stats = calculate_stats(f_raw)
    a_stats = calculate_stats(a_raw)

    # Combine all the features into a single DataFrame
    features = pd.DataFrame({
        'f_relative_reduction': f_reduction,
        'f_signal_to_noise': f_signal_to_noise,
        'f_variance': f_stats['variance'],
        'f_skewness': f_stats['skewness'],
        'f_kurtosis': f_stats['kurtosis'],
        'a_relative_reduction': a_reduction,
        'a_signal_to_noise': a_signal_to_noise,
        'a_variance': a_stats['variance'],
        'a_skewness': a_stats['skewness'],
        'a_kurtosis': a_stats['kurtosis'],
    })

    # Append the adc_info columns positionally (rows are matched by order, not
    # by planet id, because the index is dropped).
    # NOTE(review): the slice starts at 1, so the first adc_info column is
    # skipped - confirm that this is intended.
    adc_columns = adc_info.reset_index(drop=True).iloc[:, 1:6]
    features = pd.concat([features, adc_columns], axis=1)

    return features


In [None]:
%%writefile -a process.py
def postprocessing(pred_array, index, sigma_pred):
    """Assemble predictions and uncertainties into one submission-shaped frame.

    Args:
        pred_array: 2-D array of predictions with one column per column of the
            global ``wavelengths`` table. Negative values are clipped to 0.
        index: Row index for the result (the notebook passes planet ids).
        sigma_pred: Uncertainty values, either a scalar that is broadcast to
            every cell or an array with the same shape as ``pred_array``.

    Returns:
        DataFrame holding the prediction columns (named after
        ``wavelengths.columns``) followed by ``sigma_1`` ... ``sigma_N``.
    """
    target_columns = wavelengths.columns

    # One sigma column per prediction column. The count is derived from the
    # same source as the prediction names so the two halves always match.
    sigma_columns = [f"sigma_{i}" for i in range(1, len(target_columns) + 1)]

    pred_df = pd.DataFrame(pred_array.clip(0, None), index=index, columns=target_columns)
    sigma_df = pd.DataFrame(sigma_pred, index=index, columns=sigma_columns)

    # Predictions first, sigmas second: competition_score() splits the
    # submission columns in half in exactly this order.
    return pd.concat([pred_df, sigma_df], axis=1)

In [None]:
%%writefile -a process.py
class ParticipantVisibleError(Exception):
    """Raised by competition_score() when a submission fails validation."""

In [None]:
%%writefile -a process.py
def competition_score(solution, submission, naive_mean, naive_sigma, sigma_true, row_id_column_name='planet_id'):
    """Score a submission with a normalised Gaussian log-likelihood (GLL).

    The first half of the submission's value columns is read as predicted
    means and the second half as predicted standard deviations. The summed
    log-likelihood of the true values under those Gaussians is rescaled so
    that the naive baseline maps to 0 and the ideal prediction maps to 1.

    Args:
        solution: DataFrame with ``row_id_column_name`` plus one column per
            target.
        submission: DataFrame with ``row_id_column_name`` plus twice as many
            value columns as ``solution`` (predictions, then sigmas).
        naive_mean: Location of the baseline Gaussian.
        naive_sigma: Scale of the baseline Gaussian.
        sigma_true: Scale used for the ideal case, where the prediction equals
            the truth.
        row_id_column_name: Name of the id column removed from both frames.

    Returns:
        ``(GLL_pred - GLL_mean) / (GLL_true - GLL_mean)`` clipped to [0, 1],
        as a Python float.

    Raises:
        ParticipantVisibleError: If a submission column is not numeric, a
            submission value is negative, or the column count is wrong.
    """
    # Remove the row ID column from the solution and submission DataFrames
    solution = solution.drop(columns=[row_id_column_name])
    submission = submission.drop(columns=[row_id_column_name])

    # Validate dtypes before comparing against 0: `submission < 0` raises a
    # bare TypeError on non-numeric columns, which would hide the intended
    # participant-facing message.
    if not all(pd.api.types.is_numeric_dtype(submission[col]) for col in submission.columns):
        raise ParticipantVisibleError('All submission columns must be numeric')
    if (submission < 0).any().any():
        raise ParticipantVisibleError('Negative values in the submission')

    # Validate the number of columns in the submission
    n_wavelengths = solution.shape[1]
    if submission.shape[1] != n_wavelengths * 2:
        raise ParticipantVisibleError('Incorrect number of columns in the submission')

    # Split the submission into predicted means and predicted sigmas. Sigmas
    # are floored at 1e-15 so the Gaussian scale is never zero.
    y_pred = submission.iloc[:, :n_wavelengths].values
    sigma_pred = np.clip(submission.iloc[:, n_wavelengths:].values, a_min=1e-15, a_max=None)
    y_true = solution.values

    # Summed Gaussian log-likelihoods of the truth under: the submission,
    # the ideal prediction, and the naive baseline.
    GLL_pred = np.sum(scipy.stats.norm.logpdf(y_true, loc=y_pred, scale=sigma_pred))
    GLL_true = np.sum(scipy.stats.norm.logpdf(y_true, loc=y_true, scale=sigma_true))
    GLL_mean = np.sum(scipy.stats.norm.logpdf(y_true, loc=naive_mean, scale=naive_sigma))

    # Rescale so the baseline scores 0 and the ideal prediction scores 1.
    submit_score = (GLL_pred - GLL_mean) / (GLL_true - GLL_mean)
    return float(np.clip(submit_score, 0.0, 1.0))

In [None]:
# Run the helper definitions that the %%writefile cells saved to process.py
# in this notebook's namespace. The file is opened in a `with` block so the
# handle is closed once the source has been read.
with open('process.py', 'r') as source_file:
    exec(source_file.read())

In [None]:
%%time
# Use the precomputed FGS1 array when the cache dataset is attached;
# otherwise rebuild it from the raw files and cache it in the working dir.
# NOTE(review): loading with allow_pickle=True unpickles the file, which can
# execute arbitrary code - only attach cache datasets you trust.
if not os.path.exists("/kaggle/input/adc24-intro-training/f_raw_train.pickle"):
    f_raw_train = read_and_preprocess('train', train_labels.index, 'FGS1')
    with open('f_raw_train.pickle', 'wb') as f:
        pickle.dump(f_raw_train, f)
else:
    f_raw_train = np.load('/kaggle/input/adc24-intro-training/f_raw_train.pickle', allow_pickle=True)

In [None]:
%%time
# Use the precomputed AIRS-CH0 array when the cache dataset is attached;
# otherwise rebuild it from the raw files and cache it in the working dir.
# NOTE(review): loading with allow_pickle=True unpickles the file, which can
# execute arbitrary code - only attach cache datasets you trust.
if not os.path.exists("/kaggle/input/adc24-intro-training/a_raw_train.pickle"):
    a_raw_train = read_and_preprocess('train', train_labels.index)
    with open('a_raw_train.pickle', 'wb') as f:
        pickle.dump(a_raw_train, f)
else:
    a_raw_train = np.load('/kaggle/input/adc24-intro-training/a_raw_train.pickle', allow_pickle=True)

In [None]:
%%time
# Build the training feature table from the FGS1 and AIRS-CH0 arrays and the
# ADC metadata (rows are matched by position, not by planet id).
train = feature_engineering(f_raw_train, a_raw_train, train_adc_info)

In [None]:
# Preview the first rows of the feature table.
train.head()

In [None]:
# Drop the last feature column.
# NOTE(review): presumably this is the `star` column appended from adc_info -
# confirm. The cell is not idempotent: every re-run removes one more column.
train = train.iloc[:,:-1]

In [None]:
# Scatter the AIRS relative signal reduction against the first target column,
# coloured by the `star` value of each planet.
color_array = np.array(plt.rcParams['axes.prop_cycle'].by_key()['color'])

plt.scatter(
    train.a_relative_reduction,
    train_labels.wl_1,
    c=color_array[train_adc_info.star],
    s=15,
    alpha=0.5,
)
plt.title('Correlation between relative signal reduction and target')
plt.xlabel('relative signal reduction when planet is in front')
plt.ylabel('target')

# Proxy markers for the legend: one entry per star value (0 and 1), using
# the same colours as the scatter points.
points = [
    plt.Line2D(
        [0], [0],
        label=f'star {i}',
        linestyle='',
        marker='o',
        markersize=3,
        markeredgecolor=color_array[i],
        markerfacecolor=color_array[i],
    )
    for i in range(2)
]
plt.legend(handles=points)

plt.show()

## Build & Train the Model

In [None]:
from xgboost import XGBRegressor

# Instantiate the XGBoost Regressor model
model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    random_state=42
)

# Generate out-of-fold predictions using 5-fold cross-validation
oof_pred = cross_val_predict(model, train, train_labels, cv=5)

# Calculate and print the R2 score
print(f"# R2 score: {r2_score(train_labels, oof_pred):.4f}")

# Root mean squared error, averaged over the target columns. The result is
# reused below as the predicted uncertainty (sigma) of every prediction.
# `mean_squared_error(..., squared=False)` is deprecated in scikit-learn 1.4
# and removed in 1.6, so the per-target RMSEs are computed explicitly and then
# averaged, which is what `squared=False` returned for multi-output targets.
sigma_pred = np.sqrt(
    mean_squared_error(train_labels, oof_pred, multioutput='raw_values')
).mean()
print(f"# Root mean squared error: {sigma_pred:.7f}")

In [None]:
# Out-of-fold prediction versus ground truth for one target column
# (positional index `col`), drawn with equal axis scaling.
col = 1
plt.scatter(
    oof_pred[:, col],
    train_labels.iloc[:, col],
    s=15,
    c='lightgreen',
)
plt.title('Comparing y_true and y_pred')
plt.xlabel('y_pred')
plt.ylabel('y_true')
plt.gca().set_aspect('equal')
plt.show()

In [None]:
# Shape the out-of-fold predictions like a submission, using the single
# cross-validated RMSE as the sigma of every cell.
oof_df = postprocessing(oof_pred, train_adc_info.index, sigma_pred)
display(oof_df)

# Score the out-of-fold frame against the labels. The naive baseline is the
# global mean and standard deviation over all label values.
# reset_index() already returns a new frame, so no extra copy is needed.
gll_score = competition_score(
    solution=train_labels.reset_index(),
    submission=oof_df.reset_index(),
    naive_mean=train_labels.values.mean(),
    naive_sigma=train_labels.values.std(),
    sigma_true=0.000003,
)
print(f"# Estimated competition score: {gll_score:.4f}")

In [None]:
# Refit on the full training set, then persist the fitted model and the
# cross-validated sigma so the inference cell can reload both from disk.
model.fit(train, train_labels)

for artifact_path, artifact in (('model.pickle', model), ('sigma_pred.pickle', sigma_pred)):
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

## Predict on the Test Set & Write the Submission

In [None]:
# Load the test metadata and the sample submission (used for its planet ids).
test_adc_info = pd.read_csv(f'{PATH}/test_adc_info.csv', index_col='planet_id')
sample_submission = pd.read_csv(f'{PATH}/sample_submission.csv', index_col='planet_id')

# Build the test features the same way as the training features, including
# dropping the last column.
# NOTE(review): the raw arrays follow sample_submission.index while the ADC
# columns and the output index follow test_adc_info - this assumes both files
# list the planets in the same order; confirm.
f_raw_test = read_and_preprocess('test', sample_submission.index, 'FGS1')
a_raw_test = read_and_preprocess('test', sample_submission.index)
test = feature_engineering(f_raw_test, a_raw_test, test_adc_info)
test = test.iloc[:, :-1]

# Reload the artifacts saved by the training cell (files written by this
# notebook itself).
with open('model.pickle', 'rb') as f:
    model = pickle.load(f)
with open('sigma_pred.pickle', 'rb') as f:
    sigma_pred = pickle.load(f)

# Predict
test_pred = model.predict(test)

# Per-planet sigma: the cross-validated RMSE where `star` is <= 1, a fixed
# 0.001 otherwise, repeated across one column per predicted target (the width
# is taken from the prediction array instead of a hard-coded count).
planet_sigma = np.where(test_adc_info[['star']] <= 1, sigma_pred, 0.001)
test_sigma = np.tile(planet_sigma, (1, test_pred.shape[1]))

# Package into submission file
sub_df = postprocessing(test_pred, test_adc_info.index, sigma_pred=test_sigma)
display(sub_df)
sub_df.to_csv('submission.csv')