### Load Data

In [2]:
import pandas as pd

In [3]:
data_df = pd.read_csv('../data/Easier Dataset.csv')

# represent each planetary system as a matrix
matrices = [
    group.drop(columns=['system_number']).to_numpy()
    for _, group in data_df.groupby('system_number')
]

### Autoencoder to learn embeddings

In [5]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
import torch.nn.functional as F### Autoencoder to learn embeddings

In [6]:
class PlanetarySystemsDataset(Dataset):
    def __init__(self, matrices):
        self.matrices = matrices

    def __len__(self):
        return len(self.matrices)

    def __getitem__(self, idx):
        return torch.tensor(self.matrices[idx], dtype=torch.float32)

def collate_fn(batch):
    # Normalize each sequence before padding
    normed = [nn.LayerNorm(seq.shape[1])(seq) for seq in batch]
    lengths = [seq.shape[0] for seq in normed]
    padded = pad_sequence(normed, batch_first=True)
    return padded, torch.tensor(lengths, dtype=torch.long)

In [14]:
class LSTMEncoder(nn.Module):
    def __init__(self, input_dim, hidden_dim=128, output_dim=20):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=2, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

        # Input normalization for length_head
        self.length_norm = nn.LayerNorm(output_dim)
        # (3-layer MLP)
        self.length_head = nn.Sequential(
            nn.Linear(output_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.LayerNorm(hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim // 2, hidden_dim // 4),
            nn.LayerNorm(hidden_dim // 4),
            nn.ReLU(),
            nn.Linear(hidden_dim // 4, 1)
        )

    def forward(self, x, lengths):
        packed = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        _, (h_n, _) = self.lstm(packed)
        embedding = self.fc(h_n[-1])
        # Normalize input to length_head
        normed_embedding = self.length_norm(embedding)
        lengths_pred = self.length_head(normed_embedding).squeeze(-1)
        return embedding, lengths_pred

    def predict_length(self, embedding):
        normed_embedding = self.length_norm(embedding)
        return self.length_head(normed_embedding).squeeze(-1)

class LSTMDecoder(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=2, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, embedding, seq_len):
        # Repeat embedding for each timestep
        repeated = embedding.unsqueeze(1).repeat(1, seq_len, 1)
        out, _ = self.lstm(repeated)
        out = self.fc(out)
        return out

In [15]:
# Prepare dataset and dataloader
dataset = PlanetarySystemsDataset(matrices)
BATCH_SIZE = 64
loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

ENCODING_SIZE = 20
LSTM_HIDDEN_SIZE = 128
input_dim = matrices[0].shape[1]

# Instantiate models
encoder = LSTMEncoder(input_dim=input_dim, hidden_dim=LSTM_HIDDEN_SIZE, output_dim=ENCODING_SIZE)
decoder = LSTMDecoder(embedding_dim=ENCODING_SIZE, hidden_dim=LSTM_HIDDEN_SIZE, output_dim=input_dim)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=1e-3)

In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder = encoder.to(device)
decoder = decoder.to(device)

EPOCHS = 10

for epoch in range(EPOCHS):
    encoder.train()
    decoder.train()
    total_loss = 0

    for padded, lengths in loader:
        padded = padded.to(device)
        # lengths stays on CPU

        optimizer.zero_grad()
        embeddings, lengths_pred = encoder(padded, lengths)
        reconstructions = decoder(embeddings, padded.shape[1])

        # Reconstruction loss
        recon_loss = criterion(reconstructions, padded)

        # Length prediction loss (MSE)
        lengths_tensor = lengths.float().to(device)
        length_loss = F.mse_loss(lengths_pred, lengths_tensor)

        loss = recon_loss + length_loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss}")

Epoch 1, Loss: 55190.12050628662
Epoch 2, Loss: 4881.064542770386
Epoch 3, Loss: 210.1327759027481
Epoch 4, Loss: 88.05743090808392
Epoch 5, Loss: 69.43953649699688
Epoch 6, Loss: 61.79268125444651
Epoch 7, Loss: 58.66616266220808
Epoch 8, Loss: 56.60544043034315
Epoch 9, Loss: 53.7027323320508
Epoch 10, Loss: 52.08891548961401


### Gaussian Mixture Model to generate new data

In [17]:
encoder.eval()
embeddings = []
with torch.no_grad():
    for mat in matrices:
        x = torch.tensor(mat, dtype=torch.float32).unsqueeze(0).to(device)
        lengths = torch.tensor([mat.shape[0]], dtype=torch.long)
        embedding, _ = encoder(x, lengths)
        embeddings.append(embedding.cpu().numpy()[0])

In [18]:
from sklearn.mixture import GaussianMixture
import numpy as np

# Stack embeddings and train GMM
embeddings_np = np.stack(embeddings)

# TODO: tune n_components
n_components = 8

gmm = GaussianMixture(n_components=n_components, covariance_type='full', random_state=42)
gmm.fit(embeddings_np)

0,1,2
,n_components,8
,covariance_type,'full'
,tol,0.001
,reg_covar,1e-06
,max_iter,100
,n_init,1
,init_params,'kmeans'
,weights_init,
,means_init,
,precisions_init,


In [19]:
# Sample a new embedding from GMM
sampled_embedding = gmm.sample(1)[0]  # shape: (1, embedding_dim)
sampled_embedding_tensor = torch.tensor(sampled_embedding, dtype=torch.float32).to(device)

# Predict sequence length from embedding
encoder.eval()
decoder.eval()
with torch.no_grad():
    length_pred = encoder.predict_length(sampled_embedding_tensor)
    seq_len = int(torch.clamp(length_pred.round(), min=1, max=50).item())
    generated = decoder(sampled_embedding_tensor, seq_len)
    generated = generated.cpu().numpy()

In [20]:
generated

array([[[-1.0259984 , -0.30524525,  1.3365245 ],
        [-1.022121  , -0.22185785,  1.2239584 ],
        [-1.0513628 ,  0.01429183,  1.0518812 ],
        [-1.1027529 ,  0.18717967,  0.9156727 ],
        [-1.0796642 ,  0.27587014,  0.8038441 ],
        [-1.0219638 ,  0.2885154 ,  0.73428375],
        [-0.9486135 ,  0.29959446,  0.64403766],
        [-0.8752753 ,  0.34453762,  0.5294443 ],
        [-0.75706416,  0.3635826 ,  0.37510365],
        [-0.76770467,  0.61112356,  0.15447551],
        [ 1.2928607 , -0.5868641 , -0.7071509 ],
        [ 1.3908334 , -0.7216451 , -0.66282797],
        [ 1.3976808 , -0.7295655 , -0.65910906],
        [ 1.4119961 , -0.7327504 , -0.6700607 ],
        [ 1.4187601 , -0.727535  , -0.6852199 ],
        [ 1.420536  , -0.71852136, -0.6983398 ],
        [ 1.4197242 , -0.7132516 , -0.7093458 ],
        [ 1.4246392 , -0.7102601 , -0.7173431 ],
        [ 1.3039205 , -0.6649518 , -0.66608053]]], dtype=float32)