## Literature


https://greglandrum.github.io/rdkit-blog/posts/2024-05-31-scaffold-splits-and-murcko-scaffolds1.html

https://www.oloren.ai/blog/scaff-split

https://practicalcheminformatics.blogspot.com/2023/06/getting-real-with-molecular-property.html

In [None]:
import io

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
from rdkit import Chem
from rdkit.Chem import Draw, rdFMCS
from rdkit.Chem.Draw import IPythonConsole

# Plot styling: colour-blind-friendly palette first, then a larger base font
# (the style sheet must be applied before the font override).
plt.style.use("tableau-colorblind10")
plt.rcParams.update({"font.size": "16"})

# Switch the notebook-wide RDKit draw options to comic mode
Draw.SetComicMode(IPythonConsole.drawOptions)

In [None]:
# Load the library subset from its parquet file into a DataFrame.
# The path is relative to the notebook's working directory; a later cell
# reads the "smiles" column of this frame.
df = pd.read_parquet(path="../data/raw/enveda_library_subset.parquet")

In [None]:
# Parse each string-valued SMILES entry into an RDKit molecule.
# Non-string entries (e.g. missing values) are skipped, so `mols` is not
# index-aligned with `df`. Entries can still be None when RDKit cannot parse
# a SMILES string, which is why downstream code checks for None.
mols = []
for smi in df["smiles"]:
    if not isinstance(smi, str):
        continue
    mols.append(Chem.MolFromSmiles(smi))

In [None]:
# Compute a Murcko scaffold for every successfully parsed molecule.
#
# Molecules that are None are skipped, and any molecule whose decomposition
# raises is reported and skipped, so `scaffs` may be shorter than `mols` and
# is not index-aligned with it.
scaffs = []
for m in mols:
    if m is None:
        continue
    try:
        scaff = Chem.MurckoDecompose(m)
        # Best effort: try a full sanitization first; if that fails, fall back
        # to ring perception only (presumably so the scaffold still has ring
        # info for SMILES output / drawing -- TODO confirm).
        try:
            Chem.SanitizeMol(scaff)
        except Exception:
            # Deliberately `except Exception` rather than a bare `except:` so
            # KeyboardInterrupt / SystemExit can still stop a long-running loop
            # instead of being swallowed here.
            Chem.GetSymmSSSR(scaff)
    except Exception as e:
        # Report the offending molecule and carry on with the rest.
        print(f"Error processing molecule: {Chem.MolToSmiles(m)}")
        print(f"Error message: {e}")
    else:
        scaffs.append(scaff)

# Canonical SMILES per scaffold; duplicates are kept here so they can be
# counted later, while the set gives the distinct scaffolds.
scaff_smis = [Chem.MolToSmiles(x) for x in scaffs if x is not None]
unique_scaffolds = set(scaff_smis)

print(f"Number of unique scaffolds: {len(unique_scaffolds)}")

In [None]:
# --- Scaffold gallery -------------------------------------------------------
# Draw at most the first ten non-None scaffolds as a 5-per-row grid.
num_to_visualize = min(10, len(scaffs))
non_null_scaffs = [scaffold for scaffold in scaffs if scaffold is not None]
valid_scaffs = non_null_scaffs[:num_to_visualize]
grid_legends = [f"Scaffold {rank}" for rank, _ in enumerate(valid_scaffs, start=1)]

# returnPNG=False is requested so the result exposes .save() for writing
# the grid straight to disk.
img = Draw.MolsToGridImage(
    valid_scaffs,
    legends=grid_legends,
    molsPerRow=5,
    subImgSize=(200, 200),
    returnPNG=False,
)
img.save("scaffold_visualization.png")

# --- Scaffold frequency bar chart -------------------------------------------
# How often each scaffold SMILES occurs, most frequent first.
scaffold_counts = pd.Series(scaff_smis).value_counts()
top_scaffolds = scaffold_counts.head(20)

plt.figure(figsize=(12, 6))
top_scaffolds.plot(kind="bar")
plt.xlabel("Scaffold SMILES")
plt.ylabel("Frequency")
plt.title("Top 20 Scaffold Frequencies")
plt.xticks(rotation=90)
# NOTE(review): plt.tight_layout() is left disabled, as before; long rotated
# SMILES tick labels may be clipped in the saved PNG -- confirm this is intended.
plt.savefig("scaffold_distribution.png")

print("Scaffold visualization saved as 'scaffold_visualization.png'")
print("Scaffold distribution plot saved as 'scaffold_distribution.png'")