# INN Applied to Molecular Data

## Loading the dataset

In [21]:
import random
import string
import numpy as np
import pandas as pd
import os

In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdmolfiles

# Base fragments (as SMILES) that combine_fragments() picks from.
scaffolds = [
    'c1ccccc1',   # benzene
    'C1CCCCC1',   # cyclohexane
    'CCO',        # ethanol
    'CC(=O)O',    # acetic acid
    'c1cncnc1',   # pyrimidine
]
side_chains = [
    'Cl',         # chlorine
    'Br',         # bromine
    'C(=O)O',     # carboxylic acid group
    'N',          # nitrogen (amine)
    'C#N',        # nitrile
]

def combine_fragments():
    """Return a SMILES string pairing one random scaffold with one random side chain.

    One entry is drawn from ``scaffolds`` and one from ``side_chains`` (scaffold
    first, so the random sequence is consumed in a fixed order). The two
    fragments are merged with ``Chem.CombineMols``, which places them in one
    molecule object without adding a bond between them, so the returned SMILES
    is a dot-separated pair of components (e.g. ``Cl.c1ccccc1``).

    Returns:
        str: the SMILES produced by ``Chem.MolToSmiles`` for the combined molecule.
    """
    scaffold = random.choice(scaffolds)
    chain = random.choice(side_chains)
    mols = [Chem.MolFromSmiles(fragment) for fragment in (scaffold, chain)]
    combo = Chem.CombineMols(*mols)
    return Chem.MolToSmiles(combo)

# ---- Step 1: Generate SMILES ----
# Number of molecules to simulate. The SMILES list and the spectra list must
# have the same length so they line up row-for-row in the DataFrame below.
num_molecules = 10
# Number of points in each fake spectrum.
num_peaks = 10

smiles = [combine_fragments() for _ in range(num_molecules)]
print("Generated realistic-looking SMILES:")
for s in smiles:
    print(s)

# ---- Step 2: Generate Fake Spectra Values ----
# Each spectrum is a num_peaks-dimensional vector of floats drawn uniformly
# from [0, 1) and rounded to 4 decimals.
spectra = [np.round(np.random.rand(num_peaks), 4).tolist() for _ in range(num_molecules)]

# ---- Step 3: Create DataFrame ----
# One row per molecule: the SMILES string followed by peak_1 ... peak_<num_peaks>.
df = pd.DataFrame(spectra, columns=[f"peak_{i+1}" for i in range(num_peaks)])
df.insert(0, "SMILES", smiles)

print("Sample dataframe:")
print(df.head())

# ---- Step 4: Split into Separate Variables ----
# a) List of SMILES
smiles_list = df["SMILES"].tolist()

# b) Spectra matrix (list of lists)
spectra_matrix = df.drop("SMILES", axis=1).values.tolist()

# Optional: Save to CSV
#df.to_csv("smiles_with_spectra.csv", index=False)

Generated realistic-looking SMILES:
O=CO.c1cncnc1
N.c1ccccc1
C#N.c1ccccc1
Cl.c1cncnc1
C1CCCCC1.Cl
O=CO.c1cncnc1
Cl.c1ccccc1
N.c1ccccc1
C#N.CC(=O)O
Br.C1CCCCC1
Sample dataframe:
          SMILES  peak_1  peak_2  peak_3  peak_4  peak_5  peak_6  peak_7  \
0  O=CO.c1cncnc1  0.8639  0.1717  0.3548  0.0328  0.4307  0.5677  0.7153   
1     N.c1ccccc1  0.1611  0.3108  0.9927  0.8513  0.8450  0.9543  0.2742   
2   C#N.c1ccccc1  0.7043  0.3999  0.1997  0.6645  0.8606  0.1126  0.3618   
3    Cl.c1cncnc1  0.3944  0.7102  0.4422  0.0383  0.4724  0.9982  0.6592   
4    C1CCCCC1.Cl  0.6804  0.9452  0.6019  0.5835  0.8190  0.6170  0.3440   

   peak_8  peak_9  peak_10  
0  0.3843  0.6809   0.7354  
1  0.9765  0.4783   0.1567  
2  0.3036  0.8299   0.6308  
3  0.1210  0.9135   0.0430  
4  0.2265  0.1845   0.3491  


In [None]:
# Display the DataFrame: each row is one molecule (its SMILES string), and the
# peak_* columns to the right of SMILES hold that molecule's spectrum values.
df

Unnamed: 0,SMILES,peak_1,peak_2,peak_3,peak_4,peak_5,peak_6,peak_7,peak_8,peak_9,peak_10
0,O=CO.c1cncnc1,0.8639,0.1717,0.3548,0.0328,0.4307,0.5677,0.7153,0.3843,0.6809,0.7354
1,N.c1ccccc1,0.1611,0.3108,0.9927,0.8513,0.845,0.9543,0.2742,0.9765,0.4783,0.1567
2,C#N.c1ccccc1,0.7043,0.3999,0.1997,0.6645,0.8606,0.1126,0.3618,0.3036,0.8299,0.6308
3,Cl.c1cncnc1,0.3944,0.7102,0.4422,0.0383,0.4724,0.9982,0.6592,0.121,0.9135,0.043
4,C1CCCCC1.Cl,0.6804,0.9452,0.6019,0.5835,0.819,0.617,0.344,0.2265,0.1845,0.3491
5,O=CO.c1cncnc1,0.547,0.0556,0.2653,0.5432,0.4268,0.6407,0.3826,0.5963,0.6066,0.3649
6,Cl.c1ccccc1,0.0585,0.5009,0.6757,0.5648,0.3594,0.1133,0.8446,0.0316,0.9203,0.2288
7,N.c1ccccc1,0.2873,0.3018,0.2104,0.4168,0.1951,0.2924,0.3454,0.4383,0.6658,0.0164
8,C#N.CC(=O)O,0.5916,0.7609,0.0908,0.9358,0.2114,0.8008,0.4015,0.0034,0.4794,0.3753
9,Br.C1CCCCC1,0.4401,0.4855,0.8202,0.0643,0.3519,0.689,0.8438,0.7314,0.6092,0.8587


### SMILES -> EMBEDDING -> MORGAN FINGERPRINT
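It is still TBD whether this is the appropriate type of embedding. A Morgan fingerprint is a fixed-length binary vector (1024 bits here); each bit records whether a particular circular substructure (an atom together with its neighbourhood up to the chosen radius) is present in the molecule.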


In [None]:
from rdkit import DataStructs
from rdkit.Chem import AllChem
import numpy as np

# Parameters
radius = 2     # neighbourhood radius passed to the Morgan fingerprint
n_bits = 1024  # length of each fingerprint bit vector

# Input: the list of SMILES strings in `smiles`
# e.g., smiles = ["CCO", "c1ccccc1.N", "C1CCCCC1.C#N", ...]

# NOTE(review): newer RDKit releases may flag GetMorganFingerprintAsBitVect as
# deprecated in favour of rdFingerprintGenerator — confirm against the
# installed version before switching.
embeddings = []
invalid_smiles = []
for s in smiles:
    mol = Chem.MolFromSmiles(s)
    if mol is None:
        # MolFromSmiles returns None when parsing fails. Keep an all-zero
        # placeholder so rows stay aligned with `smiles`, but remember the
        # offender so the fallback is not silent.
        invalid_smiles.append(s)
        embeddings.append([0] * n_bits)
        continue
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    arr = np.zeros((n_bits,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, arr)  # fills `arr` in place
    embeddings.append(arr.tolist())

if invalid_smiles:
    print(f"Warning: {len(invalid_smiles)} SMILES could not be parsed "
          f"and were given all-zero fingerprints: {invalid_smiles}")

# Example output
print("First SMILES embedding (first 10 bits):", embeddings[0][:10])


First SMILES embedding (first 10 bits): [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]




In [None]:
# Sanity check: print the array shape of each table, in this order:
#   embeddings -> n molecules x 1024-bit fingerprints
#   spectra    -> n molecules x n spectra values
for rows in (embeddings, spectra):
    print(np.array(rows).shape)

(10, 1024)
(10, 10)


### Create the EMBEDDED_SMILES <--> SPECTRA Dataset

In [None]:
# Column names are derived from the data so they stay correct if the
# fingerprint length or the number of spectrum points changes.
embedding_columns = [f"fp_{i}" for i in range(len(embeddings[0]))]  # one per fingerprint bit
spectra_columns = [f"spectra_{i}" for i in range(len(spectra[0]))]  # one per spectrum point

# One row per molecule: its fingerprint bits followed by its spectrum values.
dataset = pd.DataFrame(
    [e + s for e, s in zip(embeddings, spectra)],
    columns=embedding_columns + spectra_columns,
)

dataset

Unnamed: 0,fp_0,fp_1,fp_2,fp_3,fp_4,fp_5,fp_6,fp_7,fp_8,fp_9,...,spectra_0,spectra_1,spectra_2,spectra_3,spectra_4,spectra_5,spectra_6,spectra_7,spectra_8,spectra_9
0,0,0,0,0,0,0,0,0,0,0,...,0.8639,0.1717,0.3548,0.0328,0.4307,0.5677,0.7153,0.3843,0.6809,0.7354
1,0,0,0,0,0,0,0,0,0,0,...,0.1611,0.3108,0.9927,0.8513,0.845,0.9543,0.2742,0.9765,0.4783,0.1567
2,0,0,0,0,0,0,0,0,0,0,...,0.7043,0.3999,0.1997,0.6645,0.8606,0.1126,0.3618,0.3036,0.8299,0.6308
3,0,0,0,0,0,0,0,0,0,0,...,0.3944,0.7102,0.4422,0.0383,0.4724,0.9982,0.6592,0.121,0.9135,0.043
4,0,0,1,0,1,0,0,0,0,0,...,0.6804,0.9452,0.6019,0.5835,0.819,0.617,0.344,0.2265,0.1845,0.3491
5,0,0,0,0,0,0,0,0,0,0,...,0.547,0.0556,0.2653,0.5432,0.4268,0.6407,0.3826,0.5963,0.6066,0.3649
6,0,0,0,0,0,0,0,0,0,0,...,0.0585,0.5009,0.6757,0.5648,0.3594,0.1133,0.8446,0.0316,0.9203,0.2288
7,0,0,0,0,0,0,0,0,0,0,...,0.2873,0.3018,0.2104,0.4168,0.1951,0.2924,0.3454,0.4383,0.6658,0.0164
8,0,0,0,0,0,0,0,0,0,0,...,0.5916,0.7609,0.0908,0.9358,0.2114,0.8008,0.4015,0.0034,0.4794,0.3753
9,0,0,1,0,1,0,0,0,0,0,...,0.4401,0.4855,0.8202,0.0643,0.3519,0.689,0.8438,0.7314,0.6092,0.8587


## INN setup
Forward: f(embedding) -> spectra
Inverse: f^-1(spectra) -> embedding
The input and output vectors need to have the same length. Option A: pad the shorter spectra with zeros. Option B: compress the 1801-point spectra down to the shorter size with PCA or downsampling.
Open question: implement normalization later (min-max or z-score)?

### Make Input and Output Vectors Match

In [None]:
# TODO: make the embedding and spectra vectors the same size (not implemented yet)

In [None]:
pip3 install git+https://github.com/VLL-HD/FrEIA.git

### Build and train the INN
Using FrEIA (Framework for Easily Invertible Architectures), a library built on top of PyTorch

In [None]:
import torch
import torch.nn as nn
import FrEIA.framework as Ff
import FrEIA.modules as Fm

# Subnet for each block
def subnet_fc(dims_in, dims_out):
    """Build the fully connected subnetwork used inside each INN block.

    Args:
        dims_in: number of input features of the first linear layer.
        dims_out: number of output features of the last linear layer.

    Returns:
        nn.Sequential: Linear(dims_in, 512) -> ReLU -> Linear(512, dims_out).
    """
    hidden_width = 512
    layers = [
        nn.Linear(dims_in, hidden_width),
        nn.ReLU(),
        nn.Linear(hidden_width, dims_out),
    ]
    return nn.Sequential(*layers)

# Simulated data
batch_size = 32
embedding_dim = 128
spectra_dim = 1801

# Random stand-in tensors so this cell runs on its own.
# TODO: replace with the real data, split 80/20 into train/test
# x = embeddings, y = spectra
x_embedding = torch.randn(batch_size, embedding_dim)
y_spectra = torch.randn(batch_size, spectra_dim)

# create INN to predict spectra
inn = Ff.SequenceINN(embedding_dim)

for _ in range(4):
    inn.append(Fm.AllInOneBlock,
               subnet_constructor=subnet_fc,
               permute_soft=True)

# Project spectra to match embedding_dim (for inversion)
proj = nn.Linear(spectra_dim, embedding_dim)
unproj = nn.Linear(embedding_dim, spectra_dim)

# === TRAINING EXAMPLE ===
mse = nn.MSELoss()

# Forward: embedding → spectra
# SequenceINN returns (output, log_jac_det); only the output tensor is used here.
z, _ = inn(x_embedding)                # latent rep
output = unproj(z)                     # predicted spectra

# Inverse: spectra → embedding
z_inv = proj(y_spectra)                # compress spectra
x_reconstructed, _ = inn(z_inv, rev=True)

# === LOSS ===
loss_forward = mse(output, y_spectra)
loss_inverse = mse(x_reconstructed, x_embedding)

print("Forward loss:", loss_forward.item())
print("Inverse loss:", loss_inverse.item())

In [None]:
# Training loop
# Jointly optimises the INN and both projection layers on the sum of the
# forward (embedding → spectra) and inverse (spectra → embedding) MSE losses.

optimizer = torch.optim.Adam(list(inn.parameters()) + list(proj.parameters()) + list(unproj.parameters()), lr=1e-3)
mse = nn.MSELoss()  # built once; reused for both losses on every epoch

for epoch in range(100):
    optimizer.zero_grad()

    # SequenceINN returns (output, log_jac_det); only the output tensor is used.
    z, _ = inn(x_embedding)
    output = unproj(z)
    z_inv = proj(y_spectra)
    x_reconstructed, _ = inn(z_inv, rev=True)

    loss_fwd = mse(output, y_spectra)
    loss_inv = mse(x_reconstructed, x_embedding)
    loss = loss_fwd + loss_inv

    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        print(f"Epoch {epoch} | Forward loss: {loss_fwd.item():.4f} | Inverse loss: {loss_inv.item():.4f}")


### Invert the INN

In [None]:
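# TODO (not implemented yet): recover an embedding from a spectrum, e.g.
# embedding_pred, _ = inn(proj(spectra_input), rev=True)

# then compare embedding_pred to the known embeddings
# using a similarity search (MSE?)
# and return the top-k closest SMILES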