<div style="text-align:center; margin: 10px 0;">
    <h2>✨ If you found this notebook insightful, please like! ✨</h2>

</div>


## Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

## EDA

In [None]:
# Load the competition's CSV metadata tables. The reads are independent of each other.
wavelengths = pd.read_csv('/kaggle/input/ariel-data-challenge-2024/wavelengths.csv')
train_labels = pd.read_csv('/kaggle/input/ariel-data-challenge-2024/train_labels.csv')
test_adc_info = pd.read_csv('/kaggle/input/ariel-data-challenge-2024/test_adc_info.csv')
train_adc_info = pd.read_csv('/kaggle/input/ariel-data-challenge-2024/train_adc_info.csv')

# Cell output: first rows of the training ADC info table.
train_adc_info.head()

In [None]:
# Preview the first rows of the test ADC info table.
test_adc_info.head()

In [None]:
# Preview the first rows of the training labels table.
train_labels.head()

In [None]:
# Preview the first rows of the wavelengths table.
wavelengths.head()

In [None]:
# axis_info is stored as parquet (unlike the CSV tables above); load it and preview the first rows.
axis_info = pd.read_parquet('/kaggle/input/ariel-data-challenge-2024/axis_info.parquet')
axis_info.head()

In [None]:
# Summary statistics for the numeric columns of the training ADC info table.
train_adc_info.describe()

In [None]:
# Column dtypes, non-null counts and memory usage of the training ADC info table.
train_adc_info.info()

In [None]:
# Number of missing values per column in the training ADC info table.
train_adc_info.isnull().sum()

In [None]:
# Side-by-side histograms (with a KDE overlay) of the FGS1 ADC gain and offset columns.
sns.set(style="whitegrid")
fig, ax = plt.subplots(1, 2, figsize=(14, 7))

# One entry per panel: (column, bar colour, panel title, x-axis label).
panel_specs = [
    ('FGS1_adc_gain', 'dodgerblue', 'Distribution of Gain', 'Gain'),
    ('FGS1_adc_offset', 'seagreen', 'Distribution of Offset', 'Offset'),
]
for panel_ax, (column, bar_color, panel_title, x_label) in zip(ax, panel_specs):
    sns.histplot(train_adc_info[column], bins=30, color=bar_color, kde=True, ax=panel_ax)
    panel_ax.set_title(panel_title, fontsize=14)
    panel_ax.set_xlabel(x_label, fontsize=12)
    panel_ax.set_ylabel('Frequency', fontsize=12)

plt.tight_layout()
plt.show()

In [None]:
def restore_dynamic_range(signal, gain, offset):
    """Apply the linear ADC transform ``signal * gain + offset``.

    Parameters
    ----------
    signal : scalar, array or DataFrame
        Values to rescale; anything supporting ``*`` and ``+`` with ``gain``/``offset``.
    gain : scalar
        Multiplicative factor.
    offset : scalar
        Additive term applied after scaling.

    Returns
    -------
    Same kind of object as ``signal`` holding the rescaled values.
    """
    scaled = signal * gain
    return scaled + offset

In [None]:
# Load the AIRS-CH0 signal of one example training planet ...
signal_data = pd.read_parquet('/kaggle/input/ariel-data-challenge-2024/train/2633183716/AIRS-CH0_signal.parquet')
# ... and select the ADC info row(s) belonging to that same planet_id.
adc_info = train_adc_info[train_adc_info['planet_id'] == 2633183716]

In [None]:
# `signal_data` was read from the AIRS-CH0 signal file, so it has to be restored with the
# AIRS-CH0 gain/offset columns; the FGS1_* columns belong to the other (FGS1) channel.
restored_signal = restore_dynamic_range(
    signal_data,
    adc_info['AIRS-CH0_adc_gain'].values[0],
    adc_info['AIRS-CH0_adc_offset'].values[0],
)

In [None]:
# Render the first row of the restored signal as a 32 x 356 image.
first_frame = restored_signal.iloc[0].values.reshape(32, 356)
fig, ax = plt.subplots()
frame_image = ax.imshow(first_frame, cmap='viridis')
fig.colorbar(frame_image, ax=ax)
ax.set_title('Restored Signal Frame')
plt.show()

## Modelling

In [None]:
class ExoplanetDataset(Dataset):
    """Map-style dataset that pairs signal samples with their label rows.

    Parameters
    ----------
    signal_data : indexable sequence (e.g. ``numpy.ndarray``)
        Sample ``i`` is ``signal_data[i]``.
    labels : indexable sequence (e.g. ``numpy.ndarray``)
        Target ``i`` is ``labels[i]``; must have the same length as ``signal_data``.

    Raises
    ------
    ValueError
        If ``signal_data`` and ``labels`` do not have the same length.
    """

    def __init__(self, signal_data, labels):
        # Fail early: a mismatch would otherwise surface as an IndexError in the middle
        # of an epoch (labels too short) or as silently ignored labels (labels too long).
        if len(signal_data) != len(labels):
            raise ValueError(
                f"signal_data and labels must have the same length, "
                f"got {len(signal_data)} and {len(labels)}"
            )
        self.signal_data = signal_data
        self.labels = labels

    def __len__(self):
        """Return the number of (signal, label) pairs."""
        return len(self.signal_data)

    def __getitem__(self, idx):
        """Return the ``(signal, label)`` pair stored at position ``idx``."""
        return self.signal_data[idx], self.labels[idx]


In [None]:
def prepare_data(planet_id):
    """Load one training planet's AIRS-CH0 signal and restore its dynamic range.

    Parameters
    ----------
    planet_id : int
        Identifier used both as the sub-directory name under ``train/`` and as the
        lookup key in the ``planet_id`` column of ``train_adc_info``.

    Returns
    -------
    pandas.DataFrame
        The signal table after ``restore_dynamic_range`` has been applied.

    Raises
    ------
    KeyError
        If ``planet_id`` has no row in ``train_adc_info``.
    """
    adc_info = train_adc_info[train_adc_info['planet_id'] == planet_id]
    if adc_info.empty:
        # Without this guard the `.values[0]` lookups below fail with an opaque IndexError.
        raise KeyError(f'planet_id {planet_id} not found in train_adc_info')

    signal_data = pd.read_parquet(f'/kaggle/input/ariel-data-challenge-2024/train/{planet_id}/AIRS-CH0_signal.parquet')
    # The file read above is the AIRS-CH0 signal, so use the AIRS-CH0 ADC columns
    # rather than the FGS1 ones, which belong to the other channel.
    restored_signal = restore_dynamic_range(
        signal_data,
        adc_info['AIRS-CH0_adc_gain'].values[0],
        adc_info['AIRS-CH0_adc_offset'].values[0],
    )
    return restored_signal


In [None]:
# Run the preparation helper on a single example training planet.
planet_id = 2633183716
restored_signal = prepare_data(planet_id)


In [None]:
# Compare the number of rows in the restored signal with the number of label rows.
print(f"Length of restored_signal: {len(restored_signal)}")
print(f"Length of train_labels: {len(train_labels)}")

In [None]:
# Keep both row counts; the next cell uses them to truncate the longer table.
signal_length = len(restored_signal)
label_length = train_labels.shape[0]


In [None]:
# Truncate whichever of the two tables is longer so that both end up with the same
# number of rows; the shorter one is left untouched.
common_length = min(signal_length, label_length)
if signal_length != common_length:
    restored_signal = restored_signal[:common_length]
if label_length != common_length:
    train_labels = train_labels.iloc[:common_length]

print(f"Adjusted length of restored_signal: {len(restored_signal)}")
print(f"Adjusted length of train_labels: {len(train_labels)}")


In [None]:
# Each signal row is reshaped to a single-channel 32 x 356 image; the first label column
# is skipped via `.iloc[:, 1:]` (presumably the planet_id column - TODO confirm).
# NOTE(review): `restored_signal` comes from a single planet, while `train_labels` rows
# look like one row per planet - confirm that pairing them row-by-row is intended.
train_dataset = ExoplanetDataset(restored_signal.values.reshape(-1, 1, 32, 356), train_labels.iloc[:, 1:].values)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)


#### I will build a more complex model in future updates.

In [None]:
class ExoplanetModel(nn.Module):
    """Small CNN regressor: two 3x3 convolutions followed by two linear layers.

    The input is expected as ``(batch, 1, 32, 356)``. Both convolutions use
    ``padding=1`` with a 3x3 kernel, so the spatial size is preserved and the
    flattened feature vector has ``64 * 32 * 356`` entries. The output has
    283 values per sample (one per wavelength).
    """

    def __init__(self):
        super().__init__()
        # Feature extractor: 1 -> 32 -> 64 channels, spatial size unchanged.
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)

        # Number of features after flattening the (64, 32, 356) activation map.
        self.conv_output_size = 64 * 32 * 356

        # Regression head.
        self.fc1 = nn.Linear(self.conv_output_size, 128)
        self.fc2 = nn.Linear(128, 283)  # 283 wavelengths

    def forward(self, x):
        """Return a ``(batch, 283)`` tensor of predictions for input batch ``x``."""
        activations = torch.relu(self.conv1(x))
        activations = torch.relu(self.conv2(activations))
        # Collapse everything except the batch dimension.
        flattened = torch.flatten(activations, start_dim=1)
        hidden = torch.relu(self.fc1(flattened))
        return self.fc2(hidden)


In [None]:
# Training setup: a freshly initialised model, mean-squared-error loss, and Adam
# (learning rate 1e-3) over all model parameters.
model = ExoplanetModel()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [None]:
# Training loop, intentionally left disabled. It trains for `num_epochs` epochs over
# `train_loader`, prints the mean batch loss per epoch, and finally saves the weights
# to 'exoplanet_model.pth'. Uncomment to retrain.
# num_epochs = 10
# for epoch in range(num_epochs):
#     model.train()
#     total_loss = 0
#     for x, y in train_loader:
#         x, y = x.float(), y.float()
#         optimizer.zero_grad()
#         outputs = model(x)
#         loss = criterion(outputs, y)
#         loss.backward()
#         optimizer.step()
#         total_loss += loss.item()
#     print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader)}')

# torch.save(model.state_dict(), 'exoplanet_model.pth')

In [None]:
# Rebuild the architecture and load the pre-trained weights into it.
model = ExoplanetModel()
model_path = '/kaggle/input/exoplanet-model/pytorch/default/1/exoplanet_model.pth'
# map_location='cpu' lets the checkpoint load on a CPU-only kernel even if it was saved
# from a GPU; the rest of this notebook runs inference on the CPU.
# NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
state_dict = torch.load(model_path, map_location='cpu')
model.load_state_dict(state_dict)
# Switch to inference mode (also displays the module summary as the cell output).
model.eval()

In [None]:
def prepare_test_data(planet_id):
    """Load one test planet's AIRS-CH0 signal and restore its dynamic range.

    Parameters
    ----------
    planet_id : int
        Identifier used both as the sub-directory name under ``test/`` and as the
        lookup key in the ``planet_id`` column of ``test_adc_info``.

    Returns
    -------
    pandas.DataFrame
        The signal table after ``restore_dynamic_range`` has been applied.

    Raises
    ------
    KeyError
        If ``planet_id`` has no row in ``test_adc_info``.
    """
    adc_info = test_adc_info[test_adc_info['planet_id'] == planet_id]
    if adc_info.empty:
        # Without this guard the `.values[0]` lookups below fail with an opaque IndexError.
        raise KeyError(f'planet_id {planet_id} not found in test_adc_info')

    signal_data = pd.read_parquet(f'/kaggle/input/ariel-data-challenge-2024/test/{planet_id}/AIRS-CH0_signal.parquet')
    # The file read above is the AIRS-CH0 signal, so use the AIRS-CH0 ADC columns
    # rather than the FGS1 ones, which belong to the other channel.
    restored_signal = restore_dynamic_range(
        signal_data,
        adc_info['AIRS-CH0_adc_gain'].values[0],
        adc_info['AIRS-CH0_adc_offset'].values[0],
    )
    return restored_signal

# Restore the signal of one test planet and display the resulting table as the cell output.
planet_id_test = 499191466 
restored_signal_test = prepare_test_data(planet_id_test)
restored_signal_test

In [None]:
# Re-run the preparation so this cell does not depend on the preview cell above.
planet_id_test = 499191466  
restored_signal_test = prepare_test_data(planet_id_test)

# From here on `restored_signal_test` is a NumPy array of single-channel 32 x 356 images,
# no longer a DataFrame.
restored_signal_test = restored_signal_test.values.reshape(-1, 1, 32, 356)

# The test split has no targets here, so all-zero placeholder labels (283 per sample) are
# used; shuffle=False keeps the samples in their original order.
test_dataset = ExoplanetDataset(restored_signal_test, np.zeros((restored_signal_test.shape[0], 283)))  # Dummy labels
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
def evaluate_model(model, data_loader, sigma=1e-5):
    """Sum the Gaussian log-likelihood of the loader's labels under the model's predictions.

    For every element the term
    ``-0.5 * (log(2*pi) + log(sigma**2) + (y_true - y_pred)**2 / sigma**2)``
    is computed and all terms are summed over every batch.

    Parameters
    ----------
    model : torch.nn.Module
        Model to evaluate; it is switched to eval mode.
    data_loader : iterable of ``(x, y)`` batches
        ``x`` is fed to the model and ``y`` is used as the ground truth. The labels were
        previously discarded and replaced by zeros, so a loader that carries all-zero
        placeholder labels gives the same score as before.
    sigma : float, optional
        Constant predictive standard deviation assumed for every output
        (default ``1e-5``, the value that used to be hard-coded).

    Returns
    -------
    float
        Total log-likelihood over all elements of all batches.
    """
    model.eval()
    total_gll = 0.0
    log_two_pi = np.log(2 * np.pi)
    with torch.no_grad():
        for x, y in data_loader:
            # float64 avoids precision loss when squared residuals are divided by sigma**2.
            outputs = model(x.float()).detach().cpu().numpy().astype(np.float64)
            y_true = torch.as_tensor(y).detach().cpu().numpy().astype(np.float64)
            sigma_user = np.full_like(outputs, sigma)
            gll = -0.5 * (log_two_pi + np.log(sigma_user**2) + ((y_true - outputs)**2 / sigma_user**2))
            total_gll += float(gll.sum())
    return total_gll

In [None]:
# NOTE(review): `test_loader` carries all-zero placeholder labels, so this number is not a
# real score against ground truth - confirm how it is meant to be interpreted.
gll_value = evaluate_model(model, test_loader)
print(f'Gaussian Log-Likelihood: {gll_value}')

In [None]:
def plot_spectra(y_true, y_pred, wavelengths):
    """Plot a predicted spectrum, optionally together with the true one.

    Parameters
    ----------
    y_true : array-like or None
        True spectrum, drawn as a dashed blue line; skipped when ``None``.
    y_pred : array-like
        Predicted spectrum, drawn as a solid red line.
    wavelengths : array-like
        X-axis values shared by both curves.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    if y_true is not None:
        ax.plot(wavelengths, y_true, label='True Spectrum', linestyle='--', color='blue')
    ax.plot(wavelengths, y_pred, label='Predicted Spectrum', linestyle='-', color='red')
    ax.set_xlabel('Wavelength')
    ax.set_ylabel('Intensity')
    ax.set_title('Exoplanet Atmospheric Spectrum')
    ax.legend()
    plt.show()

In [None]:
# Plot the predicted spectrum for the first sample of the first test batch.
x, _ = next(iter(test_loader))
x = x.float()
with torch.no_grad():  # inference only, no autograd graph needed
    outputs = model(x)
y_pred = outputs.detach().cpu().numpy()

# The x-axis is the integer output index. It gets its own name so that the `wavelengths`
# table loaded from wavelengths.csv earlier in the notebook is not overwritten.
wavelength_index = np.arange(y_pred.shape[1])
plot_spectra(None, y_pred[0], wavelength_index)