# Ariel ML Challenge Baseline

Notebook presenting the baseline model for the [Ariel ML challenge 2021](https://www.ariel-datachallenge.space/).

In [None]:
# General imports
import os
import numpy as np
import matplotlib.pylab as plt
import torch
    
from pathlib import Path

In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

## Data access

In [None]:
# Locations of the challenge data on disk.
# NOTE: these are absolute, machine-specific paths; point them at your own
# copy of the data before running the notebook.

# Directory passed to the dataset as the training light-curve location.
lc_train_path = (
    "/Users/mario/data/ariel_ml_2021/home 3/ucapats/Scratch/"
    "ml_data_challenge/training_set/noisy_train"
)
# Directory passed to the dataset as the training parameter (target) location.
params_train_path = (
    "/Users/mario/data/ariel_ml_2021/home/ucapats/Scratch/"
    "ml_data_challenge/training_set/params_train"
)
# Directory holding the light curves used to build the submission.
lc_test_path = (
    "/Users/mario/data/ariel_ml_2021/home 2/ucapats/Scratch/"
    "ml_data_challenge/test_set/noisy_test"
)

# Import Dataset class 
from utils import ArielMLDataset

### Let's plot a random spectral light curve

In [None]:

# Pick one training example at random and draw all of its light curves in a
# single figure, shifting each curve vertically so they do not overlap.
dataset = ArielMLDataset(lc_train_path, params_train_path, shuffle=True)

idx = np.random.randint(len(dataset))
item = dataset[idx]

# One vertical offset per row of item['lc'] (presumably one row per
# wavelength channel -- TODO confirm against ArielMLDataset).
n_curves = item['lc'].shape[0]
offsets = np.linspace(-0.05, 0.05, n_curves)

# Transpose so that each column is one curve; the offsets then broadcast
# across columns.
curves = item['lc'].T.detach().numpy()

f, ax = plt.subplots(figsize=(13, 9))
plt.plot(curves + offsets, label=None)

# Legend entries are the target values of this example, rounded to 4 decimals.
legend_labels = [round(x, 4) for x in item['target'].detach().numpy()]
ax.legend(legend_labels, fontsize=6, loc='lower right')


## Define simple preprocessing steps
- smoothing 
- clipping
- normalisation per wavelength
- removing ramp?

In [None]:
from utils import simple_transform

Let's include these steps in the datasets for convenience

In [None]:
# Number of examples in each split. The three labelled splits are taken as
# consecutive windows (start_ind / max_size) of the training directory.
train_size = 32
val_size = 32
test_size = 1024

# NOTE(review): every split below asks the dataset to shuffle. If
# ArielMLDataset reshuffles independently on each construction, the
# start_ind/max_size windows of train/val/test may overlap -- confirm that
# the shuffle is seeded consistently inside ArielMLDataset.

# Training
dataset_train = ArielMLDataset(lc_train_path, params_train_path, shuffle=True, start_ind=0,
                               max_size=train_size, transform=simple_transform)
# Validation
dataset_val = ArielMLDataset(lc_train_path, params_train_path, shuffle=True, start_ind=train_size,
                             max_size=val_size, transform=simple_transform)

# Testing
dataset_test = ArielMLDataset(lc_train_path, params_train_path, start_ind=train_size+val_size,
                              shuffle=True, max_size=test_size, transform=simple_transform)

# Evaluation: no targets are available here, this is only used for submission.
# Shuffling is disabled so that the rows of the saved prediction matrix follow
# the dataset's own (unshuffled) sample order and can be matched to the inputs.
dataset_eval = ArielMLDataset(lc_test_path, shuffle=False, transform=simple_transform)

Let's define the corresponding data loaders, again using PyTorch's `torch.utils.data` module

In [None]:
from torch.utils.data.dataloader import DataLoader

# Use a quarter of the training-set size as the batch size for every loader.
batch_size = int(train_size / 4)

# Only the training loader reshuffles its samples between epochs.
loader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)

# Validation, test and evaluation loaders iterate in dataset order.
loader_val, loader_test, loader_eval = (
    DataLoader(split, batch_size=batch_size)
    for split in (dataset_val, dataset_test, dataset_eval)
)

### Challenge Metric

The scoring system used for evaluation is defined here: https://www.ariel-datachallenge.space/ML/documentation/scoring

Let's define it here, with unity weights as we don't have the actual weights available.

In [None]:
from utils import ChallengeMetric
    
# Metric object built with its default arguments. According to the notebook
# text this corresponds to unity weights (not verifiable from this file).
# It is used below both as a callable and through its `.score` method.
challenge_metric = ChallengeMetric()

## Models

A constant prediction model for reference

In [None]:
def naive_1(x):
    """Constant reference model: predict 0.06 for every output.

    Args:
        x: Input tensor. The last dimension is dropped; the prediction has
            shape ``x.shape[:-1]``.

    Returns:
        A tensor filled with 0.06, created on the same device as ``x`` and
        with the same floating-point dtype, so it can be compared directly
        with targets and with the other models' outputs.
    """
    # Integer inputs cannot hold 0.06, so fall back to the default float dtype.
    dtype = x.dtype if x.is_floating_point() else torch.get_default_dtype()
    return torch.full(x.shape[:-1], 0.06, dtype=dtype, device=x.device)

The baseline model, a fully connected neural network with 2 hidden layers with ReLU activation functions.

In [None]:
from utils import Baseline
    
# Build the baseline network with its default arguments, cast its parameters
# to double precision (.double()) and move it to the selected device.
baseline = Baseline().double().to(device)

### Training the baseline

In [None]:
from torch.optim import Adam
from torch.nn import MSELoss, L1Loss

# Training objective; ChallengeMetric() or L1Loss() could be used instead.
loss_function = MSELoss()

# Adam with its default hyper-parameters over all baseline parameters.
opt = Adam(baseline.parameters())

# Per-epoch history, filled in by the training loop.
train_losses, val_losses, val_scores = [], [], []

# Best validation score seen so far; checkpoints are only written above it.
best_val_score = 0.0

In [None]:
# Train the baseline for a fixed number of epochs. After each epoch the model
# is evaluated on the validation loader, and from epoch `save_from` onwards it
# is checkpointed whenever the validation score improves on the best so far.
save_from = 10  # first epoch at which a checkpoint may be written
epochs = 60

# torch.save does not create missing directories.
os.makedirs('outputs', exist_ok=True)

for epoch in range(1, 1 + epochs):
    print(epoch)

    # ---- training pass ----
    baseline.train()
    train_loss = 0
    for item in loader_train:
        # Move the batch to the model's device (no-op if already there).
        lc = item['lc'].to(device)
        target = item['target'].to(device)
        pred = baseline(lc)
        # Argument order (target, pred) matches the ChallengeMetric calls.
        loss = loss_function(target, pred)
        opt.zero_grad()
        loss.backward()
        opt.step()
        train_loss += loss.item()
    train_loss = train_loss / len(loader_train)

    # ---- validation pass: no gradients needed ----
    baseline.eval()
    val_loss = 0
    val_score = 0
    with torch.no_grad():
        for item in loader_val:
            lc = item['lc'].to(device)
            target = item['target'].to(device)
            pred = baseline(lc)
            val_loss += loss_function(target, pred).item()
            val_score += challenge_metric.score(target, pred).item()
    # Averages over batches, not over individual samples.
    val_loss /= len(loader_val)
    val_score /= len(loader_val)

    print('Training loss', round(train_loss, 6))
    print('Val loss', round(val_loss, 6))
    print('Val score', round(val_score, 2))
    train_losses.append(train_loss)
    val_losses.append(val_loss)
    val_scores.append(val_score)

    # Keep only the best model: record the new best score so that later,
    # worse epochs do not overwrite the checkpoint.
    if epoch >= save_from and val_score > best_val_score:
        best_val_score = val_score
        torch.save(baseline, 'outputs/model_state.pt')
    

Let's look at the learning curve

In [None]:
# Training and validation loss per epoch, on a logarithmic y axis.
plt.plot(train_losses, '-o', label='Train Loss')
plt.plot(val_losses, '-o', label='Val Loss')
plt.xlabel('epochs')
# Label the axis with the name of the loss class rather than the object itself.
plt.ylabel(type(loss_function).__name__)
plt.yscale('log')
plt.legend()  # without this the line labels above are never displayed
plt.show()

# Validation challenge score per epoch.
plt.plot(val_scores, '-o', label='Val Score')
plt.xlabel('epochs')
plt.ylabel('Challenge score (unity weights)')
# Fixed y range; scores outside 5000-10000 are not visible.
plt.ylim(5000, 10000)
plt.legend()
plt.show()

## Compare models

In [None]:
# Compare the models on the first batch of the test loader only.
item = next(iter(loader_test))
target = item['target']

# Inference only: skip autograd bookkeeping, feed the model on its own device
# (no-op if the batch is already there) and bring the prediction back next to
# the targets so the metric sees tensors on one device.
with torch.no_grad():
    baseline_pred = baseline(item['lc'].to(device)).to(target.device)

preds = {'naive1': naive_1(item['lc']),
         # Targets perturbed with Gaussian noise of standard deviation 1e-3.
         'normal_1000ppm': torch.normal(target, 1e-3),
         'baseline': baseline_pred,
        }

# NOTE(review): this calls the metric object directly, whereas the training
# loop uses `challenge_metric.score` -- confirm which quantity is intended.
for name, pred in preds.items():
    print(name, f"\t{challenge_metric(target, pred).item():.2f}")

### Produce evaluation vectors
(takes a few mins to run)

In [None]:
# Reload the checkpoint written by the training loop. The file contains the
# whole pickled model object (not just a state dict), so unpickling executes
# code from the file.
# SECURITY NOTE: only load checkpoints you produced yourself.
checkpoint_path = 'outputs/model_state.pt'
try:
    # Recent PyTorch releases default to weights_only=True, which refuses to
    # unpickle a full module; map_location places it on the current device.
    baseline = torch.load(checkpoint_path, map_location=device, weights_only=False)
except TypeError:
    # Older PyTorch releases do not accept the `weights_only` keyword.
    baseline = torch.load(checkpoint_path, map_location=device)
baseline.eval()

In [None]:
%%time
import tqdm

# Run the model over the whole evaluation loader and stack the per-batch
# predictions into one NumPy array (one row per evaluation sample).
preds = []

# Feed each batch on whatever device the model's parameters live on.
model_device = next(baseline.parameters()).device

# Inference only: without no_grad every stored prediction would keep its
# autograd graph alive for the whole loop.
with torch.no_grad():
    # Iterating the loader directly lets tqdm show the total batch count.
    for item in tqdm.tqdm(loader_eval):
        # .cpu() is required before the NumPy conversion when running on GPU.
        preds.append(baseline(item['lc'].to(model_device)).cpu())

eval_pred = torch.cat(preds).numpy()

Let's quickly plot the mean results per wavelength

In [None]:
# Average the predictions over the first axis (all evaluation samples),
# leaving one mean value per output column.
mean_per_output = eval_pred.mean(axis=0)

plt.plot(mean_per_output, '-o')
plt.xlabel('wavelength')
plt.ylabel('mean prediction per wavelength')

And finally save the results as a txt file:

In [None]:
# Write the prediction matrix as plain text. Set save_path to an empty value
# to skip saving altogether.
save_path = 'outputs/baseline_evaluation.txt'
# The file is only written when the prediction matrix has exactly this shape.
expected_shape = (53900, 55)

if save_path:
    if eval_pred.shape == expected_shape:
        # np.savetxt does not create missing directories.
        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
        np.savetxt(save_path, eval_pred)
    else:
        # Say why nothing was written instead of skipping silently.
        print(f"Not saving {save_path}: eval_pred has shape {eval_pred.shape}, "
              f"expected {expected_shape}")