# INN Applied to Molecular Data

## Loading the dataset

In [None]:
import pandas as pd

In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem, Draw
from rdkit.Chem import rdmolfiles

# Absolute location of the spectra table (machine-specific path).
spectra_csv = "/home/undergrad/2026/wcarvalh/Documents/uvsq/" + "computed_spectra.csv"
computed_spectra = pd.read_csv(spectra_csv)

# The "smiles" column, kept as its own Series.
smiles = computed_spectra["smiles"]

Displaying 5 SMILES examples:

CN(c1ccc(NC(=O)Nc2ccccc2)cc1)S(=O)(=O)c1ccc(-c2ccn(CCO)n2)s1

CC(O)CC(C)C#COC#CC(C)CC(C)O

Cc1cc([N+](=O)[O-])ccc1NC(=O)c1ccc(OCC(C)C)c(Br)c1

COC12C(COC(N)=O)C3=C(C(=O)C(C)=C(N)C3=O)N1CC1NC12

CC(C)(O)C(NC(=O)c1cnn2cc(C3CC3)cnc12)c1ccc(OC(F)(F)F)c(F)c1



In [None]:
# Spectra as plain Python lists, one list per row of computed_spectra.
# Every column after the first is taken as a spectrum value, so
# smiles_spectra[i] is the spectrum of the molecule in row i.
smiles_spectra = computed_spectra.iloc[:, 1:].values.tolist()

'\nprint("Displaying spectra for 5 SMILES:\n")\nfor i in range(5):\n    print(f"SMILES {i+1}: {smiles[i]}")\n    print("Spectra:", smiles_spectra[i])\n    print("\n")\n'

### SMILES -> SELFIES
Transform SMILES to SELFIES

### Create the EMBEDDED_SMILES <--> SPECTRA Dataset

In [None]:
import selfies as sf
from selfies import split_selfies

class SelfiesEncoder:
    """Integer codec for SELFIES strings with fixed-length padding.

    The vocabulary is every distinct token found in ``selfies_list``, sorted
    and numbered from 0; the padding token receives the next free index.
    """

    def __init__(self, selfies_list, pad_token='[PAD]', max_len=1801):
        self.pad_token = pad_token
        self.max_len = max_len

        # Collect the distinct tokens of the whole corpus in a single pass.
        vocabulary = sorted({tok for s in selfies_list for tok in split_selfies(s)})

        # token -> id, with the padding token placed after the real tokens
        self.token2idx = dict(zip(vocabulary, range(len(vocabulary))))
        self.pad_idx = len(vocabulary)
        self.token2idx[pad_token] = self.pad_idx

        # id -> token, the reverse lookup used by decode()
        self.idx2token = {idx: tok for tok, idx in self.token2idx.items()}

    def encode(self, selfies_str):
        """Return the token ids of ``selfies_str``, cut or padded to ``max_len``.

        Sequences longer than ``max_len`` are truncated; shorter ones are
        right-padded with ``pad_idx``. A token that is not in the vocabulary
        raises ``KeyError``.
        """
        ids = [self.token2idx[tok] for tok in split_selfies(selfies_str)]
        ids = ids[:self.max_len]
        # A non-positive repeat count yields an empty list, so this is a no-op
        # once the sequence already has max_len entries.
        ids.extend([self.pad_idx] * (self.max_len - len(ids)))
        return ids

    def decode(self, token_ids):
        """Return the SELFIES string for ``token_ids``, skipping padding ids."""
        pieces = []
        for idx in token_ids:
            if idx == self.pad_idx:
                continue
            pieces.append(self.idx2token[idx])
        return ''.join(pieces)


In [39]:
# Convert every SMILES string to its SELFIES representation.
selfies = [sf.encoder(smile) for smile in smiles]

# Build the vocabulary from the full corpus, then encode each molecule
# into a fixed-length list of token ids.
encoder = SelfiesEncoder(selfies)
embedded_selfies = list(map(encoder.encode, selfies))

# Sanity check on the first molecule.
first_encoded = embedded_selfies[0]
print("Example encoded selfie:", first_encoded)
print("Length:", len(first_encoded))  # should be 1801



Example encoded selfie: [64, 78, 50, 88, 25, 64, 28, 64, 28, 49, 3, 78, 64, 24, 64, 36, 78, 64, 28, 64, 28, 64, 28, 88, 24, 64, 28, 88, 94, 94, 24, 64, 36, 24, 64, 36, 64, 28, 64, 28, 49, 34, 64, 64, 28, 78, 49, 89, 64, 64, 82, 78, 40, 50, 94, 88, 34, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115

In [None]:
# Round-trip check on the first molecule: token ids -> SELFIES -> SMILES.
decoded_selfie = encoder.decode(embedded_selfies[0])
print("Example decoded SELFIE:", decoded_selfie)

roundtrip_smiles = sf.decoder(decoded_selfie)
print("Same SELFIE as a SMILE:", roundtrip_smiles)

Example decoded SELFIE: [C][N][Branch2][Ring1][=Branch2][C][=C][C][=C][Branch1][#C][N][C][=Branch1][C][=O][N][C][=C][C][=C][C][=C][Ring1][=Branch1][C][=C][Ring1][S][S][=Branch1][C][=O][=Branch1][C][=O][C][=C][C][=C][Branch1][=N][C][C][=C][N][Branch1][Ring2][C][C][O][N][=Ring1][Branch2][S][Ring1][=N]
Same SELFIE as a SMILE: CN(C1=CC=C(NC(=O)NC2=CC=CC=C2)C=C1)S(=O)(=O)C3=CC=C(C=4C=CN(CCO)N=4)S3


### Dataset

In [None]:
# Assemble the modelling table: one row per molecule, with the padded SELFIES
# token ids first (token_*) followed by that molecule's spectrum (spectra_*).

# computed_spectra comes from pd.read_csv and is already a DataFrame; this
# only gives it the name used below.
spectra_df = pd.DataFrame(computed_spectra)

# The first column is skipped; every remaining column is a spectrum value.
spectra_values = spectra_df.iloc[:, 1:].values

# zip() used to hide a length mismatch by silently dropping rows; fail loudly
# instead, because a mismatch means tokens and spectra are no longer aligned.
if len(embedded_selfies) != len(spectra_values):
    raise ValueError(
        f"row count mismatch: {len(embedded_selfies)} encoded molecules "
        f"vs {len(spectra_values)} spectra"
    )

# create column names
num_tokens = len(embedded_selfies[0])
num_spectra = spectra_values.shape[1]
columns = [f"token_{i}" for i in range(num_tokens)] + [f"spectra_{i}" for i in range(num_spectra)]

# Build the two halves as whole tables and join them column-wise. The previous
# row-by-row `embedded + list(spectra)` loop boxed every single number as a
# Python object before pandas saw it, which is very slow and memory-hungry for
# tens of thousands of rows with thousands of columns each.
token_part = pd.DataFrame(embedded_selfies, columns=columns[:num_tokens])
spectra_part = pd.DataFrame(spectra_values, columns=columns[num_tokens:])

# Both parts carry the default 0..n-1 index, so axis=1 pairs row i with row i.
df = pd.concat([token_part, spectra_part], axis=1)
# print(df.head())

   token_0  token_1  token_2  token_3  token_4  token_5  token_6  token_7  \
0       64       78       50       88       25       64       28       64   
1       64       64       49       64       82       64       64       49   
2       64       64       28       64       49       24       69       24   
3       64       82       64       64       49       50       64       82   
4       64       64       49       64       64       49       64       82   

   token_8  token_9  ...  spectra_1791  spectra_1792  spectra_1793  \
0       28       49  ...      0.000774      0.000786      0.000803   
1       64       64  ...      0.001032      0.001046      0.001065   
2       64       36  ...      0.000462      0.000465      0.000453   
3       64       49  ...      0.001045      0.001055      0.001057   
4       64       50  ...      0.000649      0.000653      0.000655   

   spectra_1794  spectra_1795  spectra_1796  spectra_1797  spectra_1798  \
0      0.000780      0.000782      0.0007

## INN setup
- Forward: $f(\text{embedding}) \rightarrow \text{spectra}$
- Inverse: $f^{-1}(\text{spectra}) \rightarrow \text{embedding}$

Implement normalization later (min-max or z-score)?

In [50]:
import numpy as np
import torch.nn as nn
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt

# Count the two column families by their name prefix.
token_columns = [name for name in df.columns if name.startswith("token_")]
spectra_columns = [name for name in df.columns if name.startswith("spectra_")]
num_tokens = len(token_columns)
num_spectra = len(spectra_columns)

# X = embedded selfies; y = spectra
# The positional slices rely on all token_* columns preceding the spectra_* ones.
features = df.iloc[:, :num_tokens]
targets = df.iloc[:, num_tokens:]
X = features.values.astype(np.float32)
y = targets.values.astype(np.float32)

print(X.shape)
print(y.shape)

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NumPy arrays -> torch tensors
X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor = (
    torch.tensor(part) for part in (X_train, y_train, X_test, y_test)
)

# Mini-batch loaders; only the training data is shuffled
batch_size = 64
train_set = TensorDataset(X_train_tensor, y_train_tensor)
test_set = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_set, batch_size=batch_size)


(85506, 1801)
(85506, 1801)


In [None]:
class INN(nn.Module):
    """Two independent MLPs mapping inputs to outputs and back.

    ``forward`` runs the encoder (``input_dim -> output_dim``) and ``inverse``
    runs the decoder (``output_dim -> input_dim``). The two networks share no
    weights, so ``inverse`` is a separately learned approximation rather than
    an exact mathematical inverse of ``forward``.

    Args:
        input_dim: Width of the vectors accepted by ``forward``.
        output_dim: Width of the vectors accepted by ``inverse``.
        hidden: Width of the two hidden layers in each MLP (default 512).
    """

    def __init__(self, input_dim, output_dim, hidden=512):
        super().__init__()
        # Encoder is created before the decoder so that parameter order and
        # seeded initialisation stay the same as in earlier versions.
        self.encoder = self._mlp(input_dim, hidden, output_dim)
        self.decoder = self._mlp(output_dim, hidden, input_dim)

    @staticmethod
    def _mlp(in_features, hidden, out_features):
        """Build a Linear-ReLU-Linear-ReLU-Linear stack."""
        return nn.Sequential(
            nn.Linear(in_features, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, out_features),
        )

    def forward(self, x):
        """Map a batch of ``input_dim`` vectors to ``output_dim`` vectors."""
        return self.encoder(x)

    def inverse(self, y):
        """Map a batch of ``output_dim`` vectors back to ``input_dim`` vectors."""
        return self.decoder(y)

# Model sized from the dataset: token ids in, spectra out.
model = INN(num_tokens, num_spectra)

# Mean-squared-error objective, optimised with Adam.
learning_rate = 1e-3
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Training loop.
# Both directions are trained: forward() against the spectra and inverse()
# against the token ids. Previously only the forward loss was optimised, so the
# decoder behind model.inverse() never received a gradient and the inverse demo
# at the end of this cell showed the output of randomly initialised weights.
n_epochs = 20
for epoch in range(n_epochs):
    model.train()
    total_loss = 0.0
    total_inverse_loss = 0.0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        output = model(batch_X)
        loss = criterion(output, batch_y)
        inverse_loss = criterion(model.inverse(batch_y), batch_X)
        # Encoder and decoder have disjoint parameters, so backpropagating the
        # sum gives each network exactly the gradient of its own loss.
        (loss + inverse_loss).backward()
        optimizer.step()
        total_loss += loss.item()
        total_inverse_loss += inverse_loss.item()
    avg_loss = total_loss / len(train_loader)
    avg_inverse_loss = total_inverse_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{n_epochs}, Loss: {avg_loss:.4f}")
    print(f"Epoch {epoch+1}/{n_epochs}, Inverse loss: {avg_inverse_loss:.4f}")

# Evaluate both directions on the held-out split.
model.eval()
with torch.no_grad():
    total_test_loss = 0.0
    total_test_inverse_loss = 0.0
    for batch_X, batch_y in test_loader:
        output = model(batch_X)
        loss = criterion(output, batch_y)
        total_test_loss += loss.item()
        total_test_inverse_loss += criterion(model.inverse(batch_y), batch_X).item()
    print(f"\nTest Loss: {total_test_loss / len(test_loader):.4f}")
    print(f"Test Inverse loss: {total_test_inverse_loss / len(test_loader):.4f}")

    # Optional: test the inverse (predict the SELFIES embedding from spectra).
    # Kept under no_grad because this is inference only.
    sample_spectra = y_test_tensor[0].unsqueeze(0)  # Add batch dim
    reconstructed_selfie = model.inverse(sample_spectra)

print("\nOriginal embedded selfie:\n", X_test_tensor[0].tolist())
print("Reconstructed selfie embedding:\n", reconstructed_selfie.squeeze().tolist())

### Build and train the INN
Using FrEIA (Framework for Easily Invertible Architectures), a library built on PyTorch

In [None]:
import torch
import torch.nn as nn
import FrEIA.framework as Ff
import FrEIA.modules as Fm

# Subnet for each block
def subnet_fc(dims_in, dims_out):
    """Return a one-hidden-layer MLP: Linear(dims_in, 512) -> ReLU -> Linear(512, dims_out)."""
    width = 512
    layers = [
        nn.Linear(dims_in, width),
        nn.ReLU(),
        nn.Linear(width, dims_out),
    ]
    return nn.Sequential(*layers)

# Simulated data
batch_size = 32
embedding_dim = 128
spectra_dim = 1801

# Random stand-ins so the example below runs on its own. These names were used
# but never defined before. Replace them with real embeddings / spectra of the
# same shapes: (batch, embedding_dim) and (batch, spectra_dim).
x_embedding = torch.randn(batch_size, embedding_dim)
y_spectra = torch.randn(batch_size, spectra_dim)

# split the data 80x20
# x = embeddings, y = spectra
# create INN to predict spectra
inn = Ff.SequenceINN(embedding_dim)

for _ in range(4):
    inn.append(Fm.AllInOneBlock,
               subnet_constructor=subnet_fc,
               permute_soft=True)

# The INN maps embedding_dim -> embedding_dim, so linear layers bridge to and
# from the spectra dimension.
proj = nn.Linear(spectra_dim, embedding_dim)
unproj = nn.Linear(embedding_dim, spectra_dim)

# === TRAINING EXAMPLE ===

# Forward: embedding → spectra
# Calling a SequenceINN returns (output, log_jac_det); only the output is used.
z, _ = inn(x_embedding)                # latent rep
output = unproj(z)                     # predicted spectra

# Inverse: spectra → embedding
z_inv = proj(y_spectra)                # compress spectra
x_reconstructed, _ = inn(z_inv, rev=True)

# === LOSS ===
mse = nn.MSELoss()
loss_forward = mse(output, y_spectra)
loss_inverse = mse(x_reconstructed, x_embedding)

print("Forward loss:", loss_forward.item())
print("Inverse loss:", loss_inverse.item())

In [None]:
# Training loop: jointly fits the INN and the two projection layers on the sum
# of the forward (embedding -> spectra) and inverse (spectra -> embedding) losses.
# x_embedding and y_spectra are expected to exist already; this cell does not
# create them.

optimizer = torch.optim.Adam(list(inn.parameters()) + list(proj.parameters()) + list(unproj.parameters()), lr=1e-3)

# Built once instead of on every iteration.
mse = nn.MSELoss()

for epoch in range(100):
    optimizer.zero_grad()

    # Calling a SequenceINN returns (output, log_jac_det); passing that tuple
    # straight into a Linear layer or MSELoss fails, so unpack the output.
    z, _ = inn(x_embedding)
    output = unproj(z)
    z_inv = proj(y_spectra)
    x_reconstructed, _ = inn(z_inv, rev=True)

    loss_fwd = mse(output, y_spectra)
    loss_inv = mse(x_reconstructed, x_embedding)
    loss = loss_fwd + loss_inv

    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        print(f"Epoch {epoch} | Forward loss: {loss_fwd.item():.4f} | Inverse loss: {loss_inv.item():.4f}")


### Invert the INN

In [None]:
# embedding_pred = inn(spectra_input, rev=True)

# compare to known embeddings
# use similarity search (MSE?)
# return top-k closest SMILES