# Mean Submission

In [None]:
import pandas as pd
import warnings
# Suppress all Python warnings so the cell outputs stay readable.
warnings.filterwarnings("ignore")

# Training labels, read from the competition's Kaggle input directory.
train = pd.read_csv("/kaggle/input/ariel-data-challenge-2024/train_labels.csv")

In [None]:
# Display the labels table.
train

In [None]:
# Summary statistics of the label columns.
train.describe()

In [None]:
# Sample submission file; it is used below as the template that gets filled in.
sub = pd.read_csv("/kaggle/input/ariel-data-challenge-2024/sample_submission.csv")

In [None]:
# Display the template before it is modified.
sub

In [None]:
# Names of the label columns: every column whose name contains 'wl_'.
target_columns = list(filter(lambda name: 'wl_' in name, train.columns))
len(target_columns)

In [None]:
# Column-wise mean and standard deviation of the targets over all rows of
# the training labels.
wl_mean, wl_std = train[target_columns].mean(), train[target_columns].std()

In [None]:
# Predict the training mean of each target column for every submission row.
sub[target_columns] = wl_mean.values
# Use the training standard deviation of each target as its predicted sigma;
# the sigma column names are the target names with 'wl_' swapped for 'sigma_'.
sub[[x.replace('wl_','sigma_') for x in target_columns]] = wl_std.values

In [None]:
# Inspect the filled-in submission.
sub

In [None]:
# Write the submission file without the DataFrame index.
sub.to_csv("submission.csv", index=False)

# Local CV

In [None]:
import matplotlib.pyplot as plt

# Illustrate the split as a single stacked horizontal bar. Each entry is
# (segment width, colour, caption); segments are laid out left to right.
fig, ax = plt.subplots(figsize=(20, 1))
bar_start = 0
for bar_width, bar_color, bar_label in (
    (200, 'blue', 'Train (200 - 20%)'),
    (467, 'grey', 'Train & Test (467 - 46.7%)'),
    (333, 'red', 'Test (333 - 33.3%)'),
):
    ax.barh('Split', bar_width, left=bar_start, color=bar_color)
    # Centre the caption inside its own segment.
    ax.text(bar_start + bar_width / 2, 'Split', bar_label,
            ha='center', va='center', color='white')
    bar_start += bar_width
# Hide the axes frame so only the bar itself is drawn.
for side in ('top', 'right', 'left', 'bottom'):
    ax.spines[side].set_visible(False)
plt.show()

In [None]:
# Sizes of the three parts of the simulated split, as shares of 657 rows.
# NOTE(review): 657 is hard-coded - confirm it matches len(train).
dev_len = int(657 * 0.2)
comm_len = int(657 * 0.467)
# The last part takes whatever remains, so the three sizes sum to 657.
cv_len = 657 - dev_len - comm_len
dev_len, comm_len, cv_len

In [None]:
import random

# Reload the labels so this cell can be run on its own.
train = pd.read_csv("/kaggle/input/ariel-data-challenge-2024/train_labels.csv")

# Row positions used only for fitting.
# NOTE: no seed is set in this cell, so the split differs between runs.
dev_index = random.sample(range(0, len(train)), dev_len)
rem_index = list(set(range(0, len(train))) - set(dev_index))

# Row positions shared by the fitting and the evaluation set. Sized with the
# planned comm_len so the split follows the computed lengths instead of an
# unrelated 40% share of the rows.
comm_index = random.sample(rem_index, comm_len)

# Row positions used only for evaluation: everything not drawn above.
cv_index = list(set(range(0, len(train))) - set(dev_index) - set(comm_index))
len(dev_index), len(rem_index), len(cv_index)

In [None]:
dev = train.iloc[dev_index+comm_index].copy().reset_index(drop=True)
cv = train.iloc[cv_index+comm_index].copy().reset_index(drop=True)
cv_sub = cv.copy()

In [None]:
dev

In [None]:
cv

In [None]:
dev_wl_mean = dev[target_columns].mean()
dev_wl_std = dev[target_columns].std()

In [None]:
cv_sub[target_columns] = dev_wl_mean.values

In [None]:
# Sigma column names are the target names with 'wl_' swapped for 'sigma_'.
sigma_target_columns = [x.replace('wl_','sigma_') for x in target_columns]
# Predicted uncertainty of each target is the dev standard deviation of that
# same target, mirroring how the real submission uses wl_std. The previous
# dev_wl_std.std() collapsed the per-column values into one scalar (the
# spread of the standard deviations) and wrote it into every sigma column.
cv_sub[sigma_target_columns] = dev_wl_std.values

In [None]:
import numpy as np
import pandas as pd
import pandas.api.types
import scipy.stats


class ParticipantVisibleError(Exception):
    '''Error raised by score() when a submission fails validation.'''
    pass


def score(
        solution: pd.DataFrame,
        submission: pd.DataFrame,
        row_id_column_name: str,
        naive_mean: float,
        naive_sigma: float,
        sigma_true: float
    ) -> float:
    '''
    This is a Gaussian Log Likelihood based metric. For a submission, which contains the predicted mean (x_hat) and
    standard deviation (x_hat_std), we calculate the Gaussian Log-likelihood (GLL) value to the provided ground truth (x).
    We treat each pair of x_hat, x_hat_std as a 1D gaussian, meaning there will be 283 1D gaussian distributions,
    hence 283 values for each test spectrum, the GLL value for one spectrum is the sum of all of them.

    The GLL of the submission is rescaled between the GLL of a naive constant prediction (score 0) and the GLL of
    a perfect prediction with uncertainty sigma_true (score 1), then clipped to [0, 1].

    Inputs:
        - solution: Ground Truth spectra (from test set)
            - shape: (nsamples, n_wavelengths)
        - submission: Predicted spectra and errors (from participants)
            - shape: (nsamples, n_wavelengths*2), means first, sigmas second
        row_id_column_name: (str) id column; when present in solution it is ignored in both frames.
        naive_mean: (float) mean from the train set.
        naive_sigma: (float) standard deviation from the train set.
        sigma_true: (float) essentially sets the scale of the outputs.

    Returns:
        (float) the rescaled score, clipped to the range [0.0, 1.0].

    Raises:
        ParticipantVisibleError: if a submission column is not numeric, if the submission contains
            negative values, or if it does not have exactly n_wavelengths*2 columns.
        KeyError: if the id column is present in solution but missing from submission.
    '''

    # Drop the id column on new frames instead of deleting it in place, so the
    # DataFrames owned by the caller are left untouched.
    if row_id_column_name in solution:
        solution = solution.drop(columns=[row_id_column_name])
        submission = submission.drop(columns=[row_id_column_name])

    # Check dtypes before comparing values: comparing a non-numeric column
    # against 0 could fail with an unrelated TypeError instead of the
    # intended participant-facing message.
    for col in submission.columns:
        if not pandas.api.types.is_numeric_dtype(submission[col]):
            raise ParticipantVisibleError(f'Submission column {col} must be a number')
    if submission.min().min() < 0:
        raise ParticipantVisibleError('Negative values in the submission')

    n_wavelengths = len(solution.columns)
    if len(submission.columns) != n_wavelengths*2:
        raise ParticipantVisibleError('Wrong number of columns in the submission')

    # First half of the columns holds the predicted means, second half the sigmas.
    y_pred = submission.iloc[:, :n_wavelengths].values
    # Set a non-zero minimum sigma pred to prevent division by zero errors.
    sigma_pred = np.clip(submission.iloc[:, n_wavelengths:].values, a_min=10**-15, a_max=None)
    y_true = solution.values

    # Log-likelihood of the truth under: the submission, a perfect prediction
    # with uncertainty sigma_true, and the naive constant prediction.
    GLL_pred = np.sum(scipy.stats.norm.logpdf(y_true, loc=y_pred, scale=sigma_pred))
    GLL_true = np.sum(scipy.stats.norm.logpdf(y_true, loc=y_true, scale=sigma_true * np.ones_like(y_true)))
    GLL_mean = np.sum(scipy.stats.norm.logpdf(y_true, loc=naive_mean * np.ones_like(y_true), scale=naive_sigma * np.ones_like(y_true)))

    submit_score = (GLL_pred - GLL_mean)/(GLL_true - GLL_mean)
    return float(np.clip(submit_score, 0.0, 1.0))

In [None]:
# Count only the target columns. len(cv.columns) would also count every
# non-target column of cv (presumably the 'planet_id' row id), overstating
# the number of wavelengths.
n_wavelengths = len(target_columns)
n_wavelengths

In [None]:
# Sanity check of the frame sizes before scoring.
dev.shape, cv.shape, cv_sub.shape

In [None]:
# Evaluation rows with their true target values.
cv

In [None]:
# Same rows with the predicted means and sigmas filled in.
cv_sub

In [None]:
# Score copies of the frames so cv and cv_sub stay intact for later cells.
score_cv = cv.copy()
score_cv_sub = cv_sub.copy()
score(
    score_cv,
    score_cv_sub,
    'planet_id',
    # Mean of the per-column dev means, used as the constant naive prediction.
    naive_mean=dev_wl_mean.mean(),
    # Standard deviation of all dev target values, matching the metric's
    # definition of naive_sigma. The previous dev_wl_std.std() measured the
    # spread of the per-column standard deviations instead.
    naive_sigma=float(dev[target_columns].to_numpy().std()),
    # NOTE(review): sigma_true is left as it was - confirm the intended value.
    sigma_true=dev_wl_std.std(),
)