In [2]:
# Import required libraries
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import os

In [3]:
# Defining Components
# Rotary Positional Encoding (RoPE) module
class RoPEEncoding(nn.Module):
    """Sinusoidal encoding of (possibly irregular) time stamps.

    Each time value is multiplied by a bank of ``ceil(d_model / 2)`` inverse
    frequencies; the sines and cosines of those products are concatenated
    along the last axis.

    Args:
        d_model: Width of the encoding. The output's last dimension is
            ``2 * len(range(0, d_model, 2))``, which equals ``d_model``
            when ``d_model`` is even.
        max_len: Stored on the module; not used by the computation here.
    """
    def __init__(self, d_model, max_len):
        super().__init__()
        self.d_model = d_model
        self.max_len = max_len
        # Create frequency bands for sinusoidal encoding
        inv_freq = 1.0 / (10000 ** (torch.arange(0, d_model, 2).float() / d_model))
        # Registered as a buffer so that `module.to(device)` / `.cuda()` moves
        # it together with the parameters. `persistent=False` keeps it out of
        # the state_dict, so checkpoints saved before this change still load.
        self.register_buffer('inv_freq', inv_freq, persistent=False)

    def forward(self, time_steps):
        """Encode ``time_steps``.

        Args:
            time_steps: Tensor of time values with any shape ``(...)``.

        Returns:
            Tensor of shape ``(..., 2 * len(inv_freq))`` laid out as
            ``[sin(t * f_0), ..., sin(t * f_k), cos(t * f_0), ..., cos(t * f_k)]``.
        """
        # Generate sinusoidal position encodings
        pos = time_steps.unsqueeze(-1) * self.inv_freq
        sin_enc = torch.sin(pos)
        cos_enc = torch.cos(pos)
        return torch.cat([sin_enc, cos_enc], dim=-1)

# Neural network components
class FeedForwardNetwork(nn.Module):
    """Single linear layer that lifts ``d_in`` input features to ``d_model``."""
    def __init__(self, d_in, d_model):
        super().__init__()
        # Attribute name `mlp` determines the state_dict keys; keep it stable.
        self.mlp = nn.Linear(in_features=d_in, out_features=d_model)

    def forward(self, x):
        """Project the last axis of ``x`` from ``d_in`` to ``d_model``."""
        projected = self.mlp(x)
        return projected

class MultiHeadAttention(nn.Module):
    """Self-attention wrapper: the same tensor serves as query, key and value."""
    def __init__(self, d_model, num_heads):
        super().__init__()
        # Default nn.MultiheadAttention layout (batch_first is not set).
        self.mha = nn.MultiheadAttention(embed_dim=d_model, num_heads=num_heads)

    def forward(self, x, key_padding_mask=None):
        """Return only the attended values; the attention weights are dropped."""
        attended, _attn_weights = self.mha(x, x, x, key_padding_mask=key_padding_mask)
        return attended

class EncoderLayer(nn.Module):
    """One post-norm transformer block: self-attention, then a two-layer MLP.

    Both sub-layers are wrapped as ``LayerNorm(x + sublayer(x))``.
    """
    def __init__(self, d_model, num_heads, d_ff):
        super().__init__()
        self.mha = MultiHeadAttention(d_model, num_heads)
        # Position-wise MLP: d_model -> d_ff -> d_model with a ReLU in between
        expand = nn.Linear(d_model, d_ff)
        contract = nn.Linear(d_ff, d_model)
        self.ff = nn.Sequential(expand, nn.ReLU(), contract)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x, src_key_padding_mask=None):
        """Run the block on ``x``; the mask is forwarded to the attention call."""
        # Attention sub-layer: residual add, then normalise
        attn_out = self.mha(x, key_padding_mask=src_key_padding_mask)
        after_attn = self.norm1(x + attn_out)
        # Feed-forward sub-layer: residual add, then normalise
        ff_out = self.ff(after_attn)
        return self.norm2(after_attn + ff_out)

class Encoder(nn.Module):
    """Sequential stack of ``num_layers`` identical-shape ``EncoderLayer`` blocks."""
    def __init__(self, d_model, num_heads, d_ff, num_layers):
        super().__init__()
        blocks = [EncoderLayer(d_model, num_heads, d_ff) for _ in range(num_layers)]
        self.layers = nn.ModuleList(blocks)

    def forward(self, x, src_key_padding_mask=None):
        """Feed ``x`` through every layer in order, passing the same mask to each."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, src_key_padding_mask)
        return hidden

class Decoder(nn.Module):
    """Linear read-out mapping ``d_model`` features to ``d_out`` outputs."""
    def __init__(self, d_model, d_out):
        super().__init__()
        # Attribute name `linear` determines the state_dict keys; keep it stable.
        self.linear = nn.Linear(in_features=d_model, out_features=d_out)

    def forward(self, x):
        """Apply the read-out to the last axis of ``x``."""
        output = self.linear(x)
        return output

class Astromer(nn.Module):
    """Encoder-only transformer that reconstructs light-curve magnitudes.

    Pipeline: time encoding + linear projection of the (randomly masked)
    magnitudes -> transformer encoder -> linear read-out back to one value
    per time step.

    Args:
        d_model: Embedding width used throughout the model.
        num_heads: Attention heads per encoder layer.
        d_ff: Hidden width of each encoder layer's feed-forward block.
        num_layers: Number of encoder layers.
        lmax: Forwarded to ``RoPEEncoding`` as ``max_len``.
    """
    def __init__(self, d_model, num_heads, d_ff, num_layers, lmax):
        super().__init__()
        self.rope_encoding = RoPEEncoding(d_model, lmax)
        self.fnn = FeedForwardNetwork(1, d_model)
        self.encoder = Encoder(d_model, num_heads, d_ff, num_layers)
        self.decoder = Decoder(d_model, 1)

    def forward(self, times, magnitudes, lengths, mask_prob=0.15):
        """Reconstruct the magnitudes of a padded batch.

        Args:
            times: ``(batch, seq)`` time stamps, or a 1-D ``(seq,)`` grid that
                is shared by every sequence in the batch.
            magnitudes: ``(batch, seq)`` padded magnitudes.
            lengths: Number of valid steps per sequence (tensor or sequence
                of ints, one entry per batch element).
            mask_prob: Probability of zeroing each valid input magnitude.
                Masking is applied whenever this is > 0, regardless of
                ``self.training``; pass ``0.0`` for a deterministic pass.

        Returns:
            ``(batch, seq)`` tensor of reconstructed magnitudes.
        """
        # Keep lengths on the same device as the data so every mask built
        # from it lands where the encoder expects it.
        lengths = torch.as_tensor(lengths, device=magnitudes.device)

        # Generate positional encodings
        pe = self.rope_encoding(times)
        if pe.dim() == 2:
            # 1-D time grid: broadcast the same encoding over the batch
            pe = pe.unsqueeze(0).expand(magnitudes.size(0), -1, -1)

        # Apply masking and initial feature projection
        masked_magnitudes = self.mask_magnitudes(magnitudes, lengths, mask_prob)
        x = pe + self.fnn(masked_magnitudes.unsqueeze(-1))

        # Sequence length is read from `magnitudes` (always 2-D) rather than
        # `times`, which may be 1-D and would make `times.size(1)` raise.
        padding_mask = self.create_padding_mask(lengths, magnitudes.size(1))
        # The encoder consumes (seq, batch, d_model)
        encoded = self.encoder(x.transpose(0, 1), src_key_padding_mask=padding_mask)

        # Decode to final predictions
        reconstructed = self.decoder(encoded.transpose(0, 1)).squeeze(-1)
        return reconstructed

    def mask_magnitudes(self, magnitudes, lengths, mask_prob):
        """Return a copy of ``magnitudes`` with random valid entries set to 0.

        Each position ``j < lengths[i]`` of row ``i`` is zeroed independently
        with probability ``mask_prob``; positions at or beyond ``lengths[i]``
        are left untouched. The input tensor is not modified.
        """
        lengths = torch.as_tensor(lengths, device=magnitudes.device)
        drop = torch.rand_like(magnitudes) < mask_prob
        positions = torch.arange(magnitudes.size(1), device=magnitudes.device)
        valid = positions.unsqueeze(0) < lengths.unsqueeze(1)
        masked_magnitudes = magnitudes.clone()
        masked_magnitudes[drop & valid] = 0
        return masked_magnitudes

    def create_padding_mask(self, lengths, max_length):
        """Boolean ``(batch, max_length)`` mask, ``True`` where a step is padding.

        The mask is created directly on ``lengths``' device so the comparison
        never mixes CPU and accelerator tensors.
        """
        lengths = torch.as_tensor(lengths)
        positions = torch.arange(max_length, device=lengths.device)
        return positions.unsqueeze(0) >= lengths.unsqueeze(1)


In [4]:
# Dataset class setup
class LightCurveDataset(Dataset):
    """Light curves zero-padded (or truncated) to a fixed length.

    Args:
        times: Sequence of per-curve time arrays.
        magnitudes: Sequence of per-curve magnitude arrays, aligned with
            ``times`` (same number of curves, same length per curve).
        max_length: Length of every returned tensor.
    """
    def __init__(self, times, magnitudes, max_length):
        self.times = times
        self.magnitudes = magnitudes
        self.max_length = max_length

    def __len__(self):
        return len(self.times)

    def __getitem__(self, idx):
        """Return ``(padded_time, padded_magnitude, length)`` for curve ``idx``.

        Both tensors are float32 with shape ``(max_length,)``. ``length`` is
        the number of leading entries that hold real observations; curves
        longer than ``max_length`` are truncated to their first
        ``max_length`` points.

        Raises:
            ValueError: If the time and magnitude arrays of the curve have
                different lengths.
        """
        time = self.times[idx]
        magnitude = self.magnitudes[idx]
        if len(time) != len(magnitude):
            raise ValueError(
                f"Sample {idx}: {len(time)} time stamps but {len(magnitude)} magnitudes"
            )
        # Never report more valid steps than the padded tensors can hold
        length = min(len(time), self.max_length)

        # Pad sequences to max_length
        padded_time = torch.zeros(self.max_length)
        padded_magnitude = torch.zeros(self.max_length)

        padded_time[:length] = torch.tensor(time[:length], dtype=torch.float32)
        padded_magnitude[:length] = torch.tensor(magnitude[:length], dtype=torch.float32)

        return padded_time, padded_magnitude, length


In [5]:
# Model configuration
d_model = 128      # embedding width shared by every layer
num_heads = 4      # attention heads per encoder layer
d_ff = 256         # hidden width of the feed-forward blocks
num_layers = 3     # number of encoder layers
lmax = 100         # passed to the time encoding as its max_len

# Seed PyTorch's global RNG before any weights are created so that
# Restart Kernel -> Run All reproduces the same initialisation (and the same
# downstream random draws, e.g. DataLoader shuffling and input masking).
SEED = 42
torch.manual_seed(SEED)

# Initialize model and optimizer
model = Astromer(d_model, num_heads, d_ff, num_layers, lmax)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [6]:
# Echo the model so the notebook renders its module tree
model

Astromer(
  (rope_encoding): RoPEEncoding()
  (fnn): FeedForwardNetwork(
    (mlp): Linear(in_features=1, out_features=128, bias=True)
  )
  (encoder): Encoder(
    (layers): ModuleList(
      (0-2): 3 x EncoderLayer(
        (mha): MultiHeadAttention(
          (mha): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
          )
        )
        (ff): Sequential(
          (0): Linear(in_features=128, out_features=256, bias=True)
          (1): ReLU()
          (2): Linear(in_features=256, out_features=128, bias=True)
        )
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
  (decoder): Decoder(
    (linear): Linear(in_features=128, out_features=1, bias=True)
  )
)

In [5]:
# Data preparation: load the long-format table and group rows by light curve
file_path = '../Data/synthetic_light_curves.csv'
df = pd.read_csv(file_path)
grouped = df.groupby('sample_id')

# One array of time stamps and one of magnitudes per sample, both collected
# in the iteration order of `grouped` so that index i refers to the same curve
times = [curve['time_mjd'].values for _sample_id, curve in grouped]
magnitudes = [curve['magnitude'].values for _sample_id, curve in grouped]
# The longest curve determines the padded length
max_length = max(map(len, times))

# Hold out 20% of the curves for testing (fixed seed for a stable split)
train_times, test_times, train_mags, test_mags = train_test_split(
    times,
    magnitudes,
    test_size=0.2,
    random_state=42,
)

# Wrap both splits in padded datasets and batch them;
# only the training loader is shuffled
train_dataset = LightCurveDataset(train_times, train_mags, max_length)
test_dataset = LightCurveDataset(test_times, test_mags, max_length)
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [6]:
# Training function
def train(model, optimizer, train_loader, test_loader, num_epochs=10):
    """Training loop with validation"""
    criterion = nn.MSELoss(reduction='none')
    best_test_loss = float('inf')
    model_dir = '../Models'
    os.makedirs(model_dir, exist_ok=True)
    best_model_path = os.path.join(model_dir, 'RoPE-astromer.pth')
    
    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_loss = 0.0
        for times, magnitudes, lengths in train_loader:
            optimizer.zero_grad()
            reconstructed = model(times, magnitudes, lengths)
            loss = criterion(reconstructed, magnitudes)
            # Apply mask to compute loss only on valid timesteps
            mask = torch.arange(magnitudes.size(1)).expand(magnitudes.size(0), magnitudes.size(1)) < lengths.unsqueeze(1)
            loss = (loss * mask.float()).sum() / mask.sum()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        
        train_loss /= len(train_loader)
        
        # Validation phase
        model.eval()
        test_loss = 0.0
        with torch.no_grad():
            for times, magnitudes, lengths in test_loader:
                reconstructed = model(times, magnitudes, lengths)
                loss = criterion(reconstructed, magnitudes)
                mask = torch.arange(magnitudes.size(1)).expand(magnitudes.size(0), magnitudes.size(1)) < lengths.unsqueeze(1)
                loss = (loss * mask.float()).sum() / mask.sum()
                test_loss += loss.item()
        
        test_loss /= len(test_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}")
        
        # Save best model
        if test_loss < best_test_loss:
            best_test_loss = test_loss
            torch.save(model.state_dict(), best_model_path)
            print(f"New best model saved to {best_model_path}")
    
    print(f"Training completed. Best model saved with test loss: {best_test_loss:.4f}")

In [7]:
# Start training: 10 epochs; the checkpoint is rewritten each time the test loss improves
train(model, optimizer, train_loader, test_loader, num_epochs=10)

Epoch 1/10, Train Loss: 0.5542, Test Loss: 0.1414
New best model saved to ../Models/RoPE-astromer.pth
Epoch 2/10, Train Loss: 0.0870, Test Loss: 0.0715
New best model saved to ../Models/RoPE-astromer.pth
Epoch 3/10, Train Loss: 0.0649, Test Loss: 0.0576
New best model saved to ../Models/RoPE-astromer.pth
Epoch 4/10, Train Loss: 0.0452, Test Loss: 0.0336
New best model saved to ../Models/RoPE-astromer.pth
Epoch 5/10, Train Loss: 0.0213, Test Loss: 0.0129
New best model saved to ../Models/RoPE-astromer.pth
Epoch 6/10, Train Loss: 0.0109, Test Loss: 0.0094
New best model saved to ../Models/RoPE-astromer.pth
Epoch 7/10, Train Loss: 0.0085, Test Loss: 0.0082
New best model saved to ../Models/RoPE-astromer.pth
Epoch 8/10, Train Loss: 0.0076, Test Loss: 0.0075
New best model saved to ../Models/RoPE-astromer.pth
Epoch 9/10, Train Loss: 0.0067, Test Loss: 0.0070
New best model saved to ../Models/RoPE-astromer.pth
Epoch 10/10, Train Loss: 0.0062, Test Loss: 0.0058
New best model saved to ../Mode

In [8]:
# Test inference time
import time
import numpy as np

def test_inference_time(model, test_loader, num_runs=100, confidence_level=0.95):
    """
    Measure the average inference time per sample for a given model.

    Args:
        model: The neural network model to test.
        test_loader: DataLoader containing the test dataset.
        num_runs (int): Number of inference runs to average over (default: 100).
        confidence_level (float): Confidence level for interval calculation (default: 0.95).
    """
    # Set model to evaluation mode
    model.eval()
    # Get the device (CPU or GPU) that the model is on
    device = next(model.parameters()).device
    total_time = 0
    total_samples = 0
    all_times = []
    
    with torch.no_grad():  # Disable gradient computation for inference
        for times, magnitudes, lengths in test_loader:
            # Move input data to the same device as the model
            times, magnitudes, lengths = times.to(device), magnitudes.to(device), lengths.to(device)
            batch_size = times.size(0)
            
            # Perform a warm-up run to ensure GPU is ready
            _ = model(times, magnitudes, lengths)
            
            # Timed runs
            batch_times = []
            for _ in range(num_runs):
                start_time = time.time()
                _ = model(times, magnitudes, lengths)
                end_time = time.time()
                batch_times.extend([end_time - start_time] * batch_size)
            
            all_times.extend(batch_times)
            total_time += sum(batch_times)
            total_samples += batch_size * num_runs
    
    # Calculate overall average inference time per sample
    avg_inference_time = total_time / total_samples
    
    # Calculate confidence interval
    alpha = 1 - confidence_level
    times_array = np.array(all_times)
    ci_lower = np.percentile(times_array, alpha * 100 / 2)
    ci_upper = np.percentile(times_array, 100 - (alpha * 100 / 2))
    
    print(f"Average inference time per sample: {avg_inference_time*1000:.2f} ms")
    print(f"{confidence_level*100:.1f}% CI: [{ci_lower*1000:.2f}, {ci_upper*1000:.2f}] ms")

# Test inference time using the trained model and test data loader
# (function defaults apply: num_runs=100, confidence_level=0.95)
test_inference_time(model, test_loader)

Average inference time per sample: 18.96 ms
95.0% CI: [5.51, 31.91] ms
Memory tracking is only available for GPU devices
