## Literature


https://greglandrum.github.io/rdkit-blog/posts/2024-05-31-scaffold-splits-and-murcko-scaffolds1.html

https://www.oloren.ai/blog/scaff-split

https://practicalcheminformatics.blogspot.com/2023/06/getting-real-with-molecular-property.html

In [None]:
import io

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
from rdkit import Chem
from rdkit.Chem import Draw, rdFMCS
from rdkit.Chem.Draw import IPythonConsole

# RDKit: switch the notebook renderer's draw options to comic mode.
notebook_draw_options = IPythonConsole.drawOptions
Draw.SetComicMode(notebook_draw_options)

# Matplotlib: select the "tableau-colorblind10" style sheet and set the
# base font size for all subsequent figures.
plt.style.use("tableau-colorblind10")
plt.rcParams.update({"font.size": "16"})

In [None]:
# Load the raw library subset from its parquet file into the working DataFrame.
RAW_LIBRARY_PATH = "../data/raw/enveda_library_subset_10percent.parquet"
df = pd.read_parquet(RAW_LIBRARY_PATH)

In [None]:
# Function to safely convert SMILES to RDKit molecule
def smiles_to_mol(smiles):
    """Parse a SMILES string into an RDKit molecule, returning None on failure.

    Parameters:
    - smiles: Value to parse. Anything that is not a ``str`` (for example a
      missing value coming from a DataFrame column) is rejected without
      calling RDKit.

    Returns:
    - The result of ``Chem.MolFromSmiles(smiles)`` for string input, or None
      when the input is not a string or the parser raised an exception.
    """
    if not isinstance(smiles, str):
        return None
    try:
        return Chem.MolFromSmiles(smiles)
    except Exception:
        # Only ordinary errors are treated as "unparseable". A bare ``except``
        # would also swallow KeyboardInterrupt/SystemExit, making a long
        # ``apply`` over the DataFrame impossible to interrupt.
        return None

# Parse every SMILES string into an RDKit molecule (None where parsing fails).
df['mol'] = df['smiles'].apply(smiles_to_mol)

# Keep a copy of the rows that could not be parsed, then drop them from the
# working DataFrame.
unparsed_rows = df['mol'].isna()
invalid_df = df[unparsed_rows].copy()
df = df.dropna(subset=['mol'])

# Report how the dataset was split.
n_valid, n_invalid = len(df), len(invalid_df)
print(f"Total molecules in original dataset: {n_valid + n_invalid}")
print(f"Valid molecules: {n_valid}")
print(f"Invalid molecules: {n_invalid}")

In [None]:

# Assuming we have already created the DataFrame 'df' with valid molecules

# Function to compute Murcko scaffold
def compute_murcko_scaffold(mol):
    """Return the Murcko scaffold of an RDKit molecule, or None on failure.

    Parameters:
    - mol: RDKit molecule to decompose, or None.

    Returns:
    - The molecule produced by ``Chem.MurckoDecompose(mol)``, or None when
      ``mol`` is None or decomposition raised. No check is made for an empty
      result, so a scaffold with no atoms is still returned as-is.

    Failures are reported on stdout together with the offending molecule's
    SMILES rather than raised, so the function can be used with ``apply``.
    """
    if mol is None:
        return None
    try:
        scaff = Chem.MurckoDecompose(mol)
        try:
            Chem.SanitizeMol(scaff)
        except Exception:
            # Best-effort fallback when sanitization fails: run ring
            # perception only (presumably so ring info is initialised for
            # later use of the scaffold). Narrowed from a bare ``except`` so
            # KeyboardInterrupt/SystemExit are no longer swallowed here.
            Chem.GetSymmSSSR(scaff)
        return scaff
    except Exception as e:
        print(f"Error processing molecule: {Chem.MolToSmiles(mol)}")
        print(f"Error message: {str(e)}")
        return None

# One scaffold per molecule (None where generation failed).
df['scaffold'] = df['mol'].apply(compute_murcko_scaffold)

# Set the failures aside, then keep only rows that have a scaffold.
no_scaffold_rows = df['scaffold'].isna()
failed_scaffold_df = df[no_scaffold_rows].copy()
df = df.dropna(subset=['scaffold'])

# SMILES string for every scaffold that was generated.
df['scaffold_smiles'] = df['scaffold'].apply(Chem.MolToSmiles)

# Summary counts.
n_with_scaffold, n_failed = len(df), len(failed_scaffold_df)
print(f"Total molecules: {n_with_scaffold + n_failed}")
print(f"Molecules with valid scaffolds: {n_with_scaffold}")
print(f"Molecules that failed scaffold generation: {n_failed}")

# Persist the failures for later inspection.
failed_scaffold_df.to_csv('failed_scaffold_molecules.csv', index=False)
print("Molecules that failed scaffold generation saved to 'failed_scaffold_molecules.csv'")

# Preview the SMILES of the first failures.
print("\nFirst few molecules that failed scaffold generation:")
print(failed_scaffold_df[['smiles']].head())

# Number of distinct scaffold SMILES among the successes.
unique_scaffolds = df['scaffold_smiles'].nunique()
print(f"Number of unique scaffolds: {unique_scaffolds}")

In [None]:
def print_column_names(df):
    """Print a header line followed by one "- <name>" line per column of ``df``."""
    lines = ["Column names of the DataFrame:"]
    lines.extend(f"- {name}" for name in df.columns)
    print("\n".join(lines))

def print_nth_row(df, n):
    """Print row ``n`` of ``df`` (positional index) as "column: value" lines.

    Parameters:
    - df: DataFrame to read from.
    - n: Zero-based row position.

    Cells in the 'mol' and 'scaffold' columns are shown as a fixed placeholder,
    and strings longer than 50 characters are cut to their first 47 characters
    plus "...". If ``n`` is negative or not less than the row count, an error
    message is printed and nothing else happens.
    """
    row_count = len(df)
    if n < 0 or n >= row_count:
        print(f"Error: Row index {n} is out of bounds. DataFrame has {row_count} rows.")
        return

    mol_columns = ('mol', 'scaffold')
    print(f"Row {n} of the DataFrame:")
    for name, cell in df.iloc[n].items():
        if name in mol_columns:
            shown = "RDKit Mol object"
        elif isinstance(cell, str) and len(cell) > 50:
            shown = cell[:47] + "..."
        else:
            shown = cell
        print(f"{name}: {shown}")

# Show the column names, then leave a gap before the row dumps.
print_column_names(df)
print("\n")

# Dump the first and second rows (positional indices 0 and 1).
# Edit the tuple to inspect other rows.
for row_position in (0, 1):
    print_nth_row(df, row_position)

In [None]:

# Order rows by scaffold SMILES (ascending) and renumber the index from 0.
df_sorted = df.sort_values(by='scaffold_smiles', ascending=True).reset_index(drop=True)

# Peek at both ends of the sorted table.
preview_columns = ['scaffold_smiles', 'smiles']
print("First few rows of the DataFrame sorted by scaffold_smiles:")
print(df_sorted[preview_columns].head())
print("\nLast few rows of the DataFrame sorted by scaffold_smiles:")
print(df_sorted[preview_columns].tail())

# Alternatives, kept for reference:
#   descending order:
#     df_sorted_desc = df.sort_values(by='scaffold_smiles', ascending=False)
#   sorting the original DataFrame in place:
#     df.sort_values(by='scaffold_smiles', ascending=True, inplace=True)
#     df.reset_index(drop=True, inplace=True)

# How many molecules share each scaffold SMILES, most frequent first.
scaffold_counts = df_sorted['scaffold_smiles'].value_counts()
print("\nTop 10 most common scaffolds:")
print(scaffold_counts.head(10))

# Number of distinct scaffold SMILES.
unique_scaffolds = df_sorted['scaffold_smiles'].nunique()
print(f"\nNumber of unique scaffolds: {unique_scaffolds}")



In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Scaffolds import MurckoScaffold
from IPython.display import display, Image, HTML
# Assuming 'df' is your DataFrame with 'smiles' and 'scaffold_smiles' columns

# Rows whose scaffold SMILES is the empty string.
# (If missing scaffolds were stored as NaN instead, select them with
#  df[df['scaffold_smiles'].isna()].)
has_empty_scaffold = df['scaffold_smiles'] == ''
empty_scaffold_df = df[has_empty_scaffold]

n_empty = len(empty_scaffold_df)
print(f"Number of molecules with empty scaffolds: {n_empty}")

# Draw up to five of those rows at random (no seed is set, so the selection
# changes between runs).
sample_empty_scaffolds = empty_scaffold_df.sample(n=min(5, n_empty))

# Function to manually generate scaffold
def generate_scaffold(smiles):
    """Return the Murcko scaffold SMILES for ``smiles`` via ``MurckoScaffold``.

    Parameters:
    - smiles: SMILES string of the molecule.

    Returns:
    - The SMILES of ``MurckoScaffold.GetScaffoldForMol`` applied to the parsed
      molecule, or the literal string "Invalid SMILES" when
      ``Chem.MolFromSmiles`` returns None.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is None:
        return "Invalid SMILES"
    return Chem.MolToSmiles(MurckoScaffold.GetScaffoldForMol(parsed))

def display_molecule_and_scaffold(smiles, scaffold_smiles):
    """Show a molecule next to its scaffold, followed by both SMILES strings.

    Parameters:
    - smiles: SMILES string of the original molecule.
    - scaffold_smiles: SMILES string of the scaffold, or the sentinel
      "Invalid SMILES", in which case only the original molecule is drawn.

    The output is a grid image, an HTML line with both SMILES strings and a
    horizontal rule, all emitted through ``display``.
    NOTE: the SMILES strings are interpolated into the HTML without escaping.
    """
    mol = Chem.MolFromSmiles(smiles)
    if scaffold_smiles != "Invalid SMILES":
        scaffold_mol = Chem.MolFromSmiles(scaffold_smiles)
    else:
        scaffold_mol = None

    # Draw one or two panels depending on whether a scaffold molecule exists.
    if scaffold_mol:
        panels, captions = [mol, scaffold_mol], ['Original', 'Scaffold']
    else:
        panels, captions = [mol], ['Original']

    img = Draw.MolsToGridImage(
        panels,
        molsPerRow=2,
        subImgSize=(300, 300),
        legends=captions
    )

    display(img)
    display(HTML(f"<b>Original SMILES:</b> {smiles}<br><b>Scaffold SMILES:</b> {scaffold_smiles}"))
    display(HTML("<hr>"))

# For each sampled molecule with an empty scaffold, recompute the scaffold
# with generate_scaffold and show the molecule next to the result.
print("Sample molecules with empty scaffolds:")
for idx, smiles in sample_empty_scaffolds['smiles'].items():
    manual_scaffold = generate_scaffold(smiles)

    display(HTML(f"<h3>Molecule {idx}</h3>"))
    display_molecule_and_scaffold(smiles, manual_scaffold)


# Print the second row (positional index 1) of the sorted DataFrame;
# change the index to inspect a different row.
print_nth_row(df_sorted, 1)

In [None]:
# Compute the Murcko scaffold SMILES of an example molecule whose SMILES has
# no ring-closure digits (i.e. it is acyclic).
# NOTE(review): the return value is neither assigned nor printed. In a notebook
# only a cell's last expression is echoed by default, so this line likely
# shows nothing - confirm whether it was meant to be printed.
MurckoScaffold.MurckoScaffoldSmiles('CCOC(=O)CC(C(=O)OCC)C(=O)OCC')


def display_molecule(smiles, size=(300, 300)):
    """
    Render a SMILES string as an image in the notebook output.

    Parameters:
    - smiles (str): SMILES string of the molecule to draw.
    - size (tuple): Image size in pixels as (width, height).

    When ``Chem.MolFromSmiles`` returns None for ``smiles``, an error message
    is printed and nothing is drawn.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is not None:
        display(Draw.MolToImage(parsed, size=size))
    else:
        print(f"Error: Could not convert SMILES to molecule: {smiles}")

# Example: draw a single molecule, labelled with its SMILES.
# The SMILES below is an acyclic triethyl ester (three C(=O)OCC groups and no
# rings) - it is not aspirin.
smiles = "CCOC(=O)CC(C(=O)OCC)C(=O)OCC"
print(f"{smiles}:")
display_molecule(smiles)


In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Draw
from IPython.display import display, Image
import io
# Unique scaffold SMILES, in the order they appear in the sorted DataFrame.
unique_scaffold_smiles = df_sorted['scaffold_smiles'].unique()

# Parse each scaffold SMILES back into an RDKit molecule, skipping missing
# values and anything RDKit cannot parse.
scaffs = [Chem.MolFromSmiles(smi) for smi in unique_scaffold_smiles if pd.notna(smi)]
scaffs = [s for s in scaffs if s is not None]

# Show at most the first 10 scaffolds.
num_to_visualize = min(10, len(scaffs))
valid_scaffs = scaffs[:num_to_visualize]

# Render the grid as PNG.
img = Draw.MolsToGridImage(
    valid_scaffs,
    molsPerRow=5,
    subImgSize=(200, 200),
    legends=[f"Scaffold {i}" for i in range(1, len(valid_scaffs) + 1)],
    returnPNG=True
)

# When rdkit.Chem.Draw.IPythonConsole has been imported, MolsToGridImage may
# return an IPython display object wrapping the PNG instead of raw bytes.
# Unwrap it so both the display call and the binary file write receive bytes;
# raw bytes have no ``data`` attribute and pass through unchanged.
png_bytes = getattr(img, "data", img)

# Display the image directly in the notebook
display(Image(png_bytes))

# Save the same PNG to disk.
with open("scaffold_visualization.png", "wb") as f:
    f.write(png_bytes)
print("Scaffold visualization saved as 'scaffold_visualization.png'")

# Print SMILES for displayed scaffolds, numbered like the grid legends.
print("\nScaffold SMILES:")
for i, mol in enumerate(valid_scaffs, 1):
    print(f"Scaffold {i}: {Chem.MolToSmiles(mol)}")

In [None]:
# Visualize up to 10 of the scaffold molecules in ``scaffs`` as a grid image.
num_to_visualize = min(10, len(scaffs))
valid_scaffs = [s for s in scaffs if s is not None][:num_to_visualize]
img = Draw.MolsToGridImage(
    valid_scaffs,
    molsPerRow=5,
    subImgSize=(200, 200),
    legends=[f"Scaffold {i+1}" for i in range(len(valid_scaffs))],
    returnPNG=False,
)

# Save the image directly
img.save("scaffold_visualization.png")

# Count how many molecules share each scaffold SMILES.
# The per-molecule scaffold SMILES live in df_sorted['scaffold_smiles'];
# the name ``scaff_smis`` used here previously is never defined in this
# notebook and raised NameError.
scaffold_counts = df_sorted['scaffold_smiles'].value_counts()

# Plot the 20 most frequent scaffolds as a bar chart.
plt.figure(figsize=(12, 6))
scaffold_counts.head(20).plot(kind="bar")
plt.title("Top 20 Scaffold Frequencies")
plt.xlabel("Scaffold SMILES")
plt.ylabel("Frequency")
plt.xticks(rotation=90)
# plt.tight_layout()
plt.savefig("scaffold_distribution.png")

print("Scaffold visualization saved as 'scaffold_visualization.png'")
print("Scaffold distribution plot saved as 'scaffold_distribution.png'")

In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from rdkit.Chem import rdFMCS
import matplotlib.pyplot as plt

# ... [Previous code remains unchanged] ...

# Visualize up to 10 of the scaffold molecules in ``scaffs`` as a grid image
# and keep their SMILES for the printed legend below.
num_to_visualize = min(10, len(scaffs))
valid_scaffs = [s for s in scaffs if s is not None][:num_to_visualize]
valid_scaff_smiles = [Chem.MolToSmiles(s) for s in valid_scaffs]

img = Draw.MolsToGridImage(
    valid_scaffs,
    molsPerRow=5,
    subImgSize=(200, 200),
    legends=[f"Scaffold {i+1}" for i in range(len(valid_scaffs))],
    returnPNG=False,
)

# Save the image directly
img.save("scaffold_visualization.png")

# Count how many molecules share each scaffold SMILES.
# The per-molecule scaffold SMILES live in df_sorted['scaffold_smiles'];
# the name ``scaff_smis`` used here previously is never defined in this
# notebook and raised NameError.
scaffold_counts = df_sorted['scaffold_smiles'].value_counts()

# Plot the 20 most frequent scaffolds. The x axis shows rank numbers (1..N)
# instead of the SMILES strings; the matching SMILES are printed below.
plt.figure(figsize=(15, 8))
top_20_scaffolds = scaffold_counts.head(20)
top_20_scaffolds.plot(kind="bar")
plt.title("Top 20 Scaffold Frequencies")
plt.xlabel("Scaffold Index")
plt.ylabel("Frequency")
plt.xticks(range(len(top_20_scaffolds)), range(1, len(top_20_scaffolds) + 1), rotation=0)
plt.tight_layout()
plt.savefig("scaffold_distribution.png", dpi=300, bbox_inches='tight')

print("Scaffold visualization saved as 'scaffold_visualization.png'")
print("Scaffold distribution plot saved as 'scaffold_distribution.png'")

# Print SMILES for visualized scaffolds (numbering matches the grid legends).
print("\nSMILES for visualized scaffolds:")
for i, smiles in enumerate(valid_scaff_smiles, 1):
    print(f"Scaffold {i}: {smiles}")

# Print SMILES for top 20 scaffolds (numbering matches the bar chart x axis).
print("\nSMILES for top 20 scaffolds by frequency:")
for i, (smiles, count) in enumerate(top_20_scaffolds.items(), 1):
    print(f"Scaffold {i}: {smiles} (Count: {count})")