## Literature


https://greglandrum.github.io/rdkit-blog/posts/2024-05-31-scaffold-splits-and-murcko-scaffolds1.html

https://www.oloren.ai/blog/scaff-split

https://practicalcheminformatics.blogspot.com/2023/06/getting-real-with-molecular-property.html

In [None]:
import io

import matplotlib.pyplot as plt
import numpy as np
import polars as pl
from PIL import Image
from rdkit import Chem
from rdkit.Chem import Draw, rdFMCS
from rdkit.Chem.Draw import IPythonConsole

# RDKit: switch the notebook drawing options to the hand-drawn "comic" look
Draw.SetComicMode(IPythonConsole.drawOptions)

# Matplotlib: colour-blind-friendly palette and a larger base font size
plt.style.use("tableau-colorblind10")
plt.rcParams.update({"font.size": "16"})

In [None]:
# Load the input parquet file into a Polars DataFrame.
# `parquet_file` is reused further down to derive the output file names.
parquet_file = "../data/raw/enveda_library_subset_1percent.parquet"
df = pl.read_parquet(parquet_file)

# Show which columns the loaded frame provides
column_list = df.columns
print(column_list)

In [None]:
# Report the row count of the loaded frame (one molecule per row is assumed)
print(f"Total molecules in original dataset: {len(df)}")



In [None]:

# Dataset size, followed by the number of distinct scaffold_smiles values
n_molecules = len(df)
print(f"Total molecules: {n_molecules}")

unique_scaffolds = df.get_column('scaffold_smiles').n_unique()
print(f"Number of unique scaffolds: {unique_scaffolds}")

In [None]:
from rdkit import Chem
import pyarrow as pa
import pyarrow.parquet as pq
from team5.data.data_split import sort_dataframe_by_scaffold, split_dataframe

# Reorder the frame with the project helper sort_dataframe_by_scaffold.
# NOTE(review): the helper is expected to order by scaffold frequency
# (descending) and then by scaffold_smiles - confirm in team5.data.data_split.
df_sorted = sort_dataframe_by_scaffold(df)

# Sanity-check the result: row count, the 90% position, and both ends
total_rows = len(df_sorted)
scaffold_column = df_sorted.select('scaffold_smiles')

print(f"Total rows: {total_rows}")
print(f"90% mark: {int(total_rows * 0.9)}")
print("\nFirst 5 rows (most common scaffolds):")
print(scaffold_column.head())
print("\nLast 5 rows (least common scaffolds):")
print(scaffold_column.tail())

In [None]:
def print_column_names(df):
    """Print a header line, then each entry of ``df.columns`` as a ``- name`` bullet.

    Args:
        df: Any object exposing an iterable ``columns`` attribute
            (a Polars DataFrame in this notebook).
    """
    print("Column names of the DataFrame:")
    for name in df.columns:
        print(f"- {name}")

def print_nth_row(df, n):
    """Print row ``n`` of ``df`` as one ``column: value`` line per column.

    An out-of-range ``n`` (negative, or not below ``len(df)``) prints an error
    message and returns without reading the row. Values in columns named
    'mol' or 'scaffold' are replaced by a fixed placeholder (they are assumed
    to hold RDKit Mol objects), and strings longer than 50 characters are cut
    to their first 47 characters followed by "...".

    Args:
        df: Frame supporting ``len()`` and ``row(n, named=True)`` returning a
            mapping of column name to value (Polars DataFrame API).
        n: Zero-based row index.
    """
    n_rows = len(df)
    if not 0 <= n < n_rows:
        print(f"Error: Row index {n} is out of bounds. DataFrame has {n_rows} rows.")
        return

    print(f"Row {n} of the DataFrame:")
    for name, cell in df.row(n, named=True).items():
        if name in ('mol', 'scaffold'):
            # Placeholder instead of the object's repr
            shown = "RDKit Mol object"
        elif isinstance(cell, str) and len(cell) > 50:
            # Keep long strings to exactly 50 printed characters
            shown = f"{cell[:47]}..."
        else:
            shown = cell
        print(f"{name}: {shown}")

# Usage: list the columns, then dump individual rows of the loaded frame
print_column_names(df)
print("\n")  # Emits two newlines (the "\n" plus print's own) as a visual gap
print_nth_row(df, 0)  # Print the first row (index 0)

# If you want to print a different row, just change the index:
print_nth_row(df, 1)  # Prints the second row (index 1)

# Sort data by scaffold SMILES

In [None]:
# Preview both ends of df_sorted (produced by sort_dataframe_by_scaffold in an
# earlier cell), showing the scaffold next to the full molecule SMILES
print("First few rows of the DataFrame sorted by scaffold_smiles:")
print(df_sorted[['scaffold_smiles', 'smiles']].head())

print("\nLast few rows of the DataFrame sorted by scaffold_smiles:")
print(df_sorted[['scaffold_smiles', 'smiles']].tail())

# Polars' Series.value_counts() returns its rows in unspecified order unless
# sort=True is passed; request descending-by-count order so that head(10)
# really is the ten most common scaffolds.
scaffold_counts = df_sorted['scaffold_smiles'].value_counts(sort=True)
print("\nTop 10 most common scaffolds:")
print(scaffold_counts.head(10))

# Number of distinct scaffold_smiles values
unique_scaffolds = df_sorted['scaffold_smiles'].n_unique()
print(f"\nNumber of unique scaffolds: {unique_scaffolds}")



# Split the data into train and test sets

In [None]:
import os
# Split the scaffold-sorted frame into a 90% part and a 10% part.
# split_dataframe (project helper) is given the fraction directly, so no
# split index has to be computed in this cell.
df_90, df_10 = split_dataframe(df_sorted, 0.9)

# Create output directory if it doesn't exist
output_dir = "../data/processed"
os.makedirs(output_dir, exist_ok=True)

# Derive both output names from the input file name without its extension
input_filename = os.path.basename(parquet_file)
input_stem = os.path.splitext(input_filename)[0]
output_90_filename = f"{input_stem}_90percent.parquet"
output_10_filename = f"{input_stem}_10percent.parquet"

# Full paths for output files
output_90_path = os.path.join(output_dir, output_90_filename)
output_10_path = os.path.join(output_dir, output_10_filename)

# Write to Parquet files
df_90.write_parquet(output_90_path)
df_10.write_parquet(output_10_path)

print(f"90% data written to '{output_90_path}' ({len(df_90)} rows)")
print(f"10% data written to '{output_10_path}' ({len(df_10)} rows)")

In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Scaffolds import MurckoScaffold
from IPython.display import display, Image, HTML
# Keep only the rows of `df` whose scaffold_smiles is the empty string
has_empty_scaffold = pl.col('scaffold_smiles') == ''
empty_scaffold_df = df.filter(has_empty_scaffold)

n_empty = len(empty_scaffold_df)
print(f"Number of molecules with empty scaffolds: {n_empty}")

# Draw at most 5 of those rows at random (fewer when fewer exist)
sample_empty_scaffolds = empty_scaffold_df.sample(n=min(5, n_empty))

def display_molecule_and_scaffold(smiles, scaffold_smiles):
    """Show a molecule, and its scaffold when one can be parsed, in the notebook.

    Renders a grid image with the molecule parsed from ``smiles`` (legend
    'Original') and, if available, the scaffold parsed from
    ``scaffold_smiles`` (legend 'Scaffold'), followed by both SMILES strings
    as HTML and a horizontal rule.

    Args:
        smiles: SMILES string of the full molecule.
        scaffold_smiles: SMILES string of the scaffold. The literal value
            "Invalid SMILES" is treated as "no scaffold" and is not parsed.
    """
    mols = [Chem.MolFromSmiles(smiles)]
    legends = ['Original']

    # Only add the scaffold panel when the value is not the sentinel and
    # RDKit returned a usable (truthy) molecule for it
    if scaffold_smiles != "Invalid SMILES":
        scaffold_mol = Chem.MolFromSmiles(scaffold_smiles)
        if scaffold_mol:
            mols.append(scaffold_mol)
            legends.append('Scaffold')

    img = Draw.MolsToGridImage(
        mols,
        molsPerRow=2,
        subImgSize=(300, 300),
        legends=legends,
    )

    display(img)
    display(HTML(f"<b>Original SMILES:</b> {smiles}<br><b>Scaffold SMILES:</b> {scaffold_smiles}"))
    display(HTML("<hr>"))

# Render every sampled row (sample_empty_scaffolds is built earlier in this cell)
print("Sample molecules with empty scaffolds:")
for row in sample_empty_scaffolds.with_row_index().iter_rows(named=True):
    
    smiles = row['smiles']
    scaffold_smiles = row['scaffold_smiles']
    
    # 'index' is the row-number column contributed by with_row_index()
    display(HTML(f"<h3>Molecule {row['index']}</h3>"))
    display_molecule_and_scaffold(smiles, scaffold_smiles)


In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Draw
from IPython.display import display, Image as IPythonImage
from PIL import Image as PILImage
import io
from collections import Counter

# df_sorted is the Polars DataFrame produced earlier; it is read here through
# its 'scaffold_smiles' column only.

# Tally how often each scaffold occurs; null entries are counted as ''
scaffold_counts = Counter(df_sorted.get_column('scaffold_smiles').fill_null(''))

# (scaffold, count) pairs, most frequent first
sorted_scaffolds = scaffold_counts.most_common()

# Turn the top scaffolds into RDKit molecules with matching legends
num_to_visualize = min(10, len(sorted_scaffolds))
valid_scaffs = []
legends = []

for rank, (smi, count) in enumerate(sorted_scaffolds[:num_to_visualize], start=1):
    is_empty = smi == ''
    # An empty scaffold cannot be drawn, so methane ('C') stands in for it
    mol = Chem.MolFromSmiles('C' if is_empty else smi)
    if mol is None:
        # Unparseable scaffold: leave it out of the grid
        continue
    label = "empty string" if is_empty else smi
    valid_scaffs.append(mol)
    legends.append(f"Rank {rank}: {label}\n(Count: {count})")

# Lay the collected molecules out two per row (leaves room for long legends);
# returnPNG=False asks RDKit for a PIL image rather than PNG data
img = Draw.MolsToGridImage(
    valid_scaffs,
    legends=legends,
    molsPerRow=2,
    subImgSize=(300, 300),
    returnPNG=False,
)

# Encode the PIL image as PNG bytes in memory
png_buffer = io.BytesIO()
img.save(png_buffer, format='PNG')
img_byte_arr = png_buffer.getvalue()

# Show the PNG inline in the notebook
display(IPythonImage(data=img_byte_arr))

# Also write the same bytes to a file in the current working directory
with open("scaffold_visualization_ranked.png", "wb") as f:
    f.write(img_byte_arr)
print("Scaffold visualization saved as 'scaffold_visualization_ranked.png'")

# Text listing of the same top scaffolds, one line per rank
print("\nTop Scaffolds by Frequency:")
for rank, (smi, count) in enumerate(sorted_scaffolds[:num_to_visualize], start=1):
    # The empty scaffold is spelled out so the line is not left blank
    shown = smi if smi != '' else "empty string"
    print(f"Rank {rank}: {shown} (Count: {count})")

In [None]:
import matplotlib.pyplot as plt
import polars as pl
from collections import Counter

# sorted_scaffolds is the list of (scaffold SMILES, count) pairs built in the
# previous cell.

# Convert scaffold counts to a DataFrame for easy plotting.
# orient="row" states explicitly that each tuple is one row; without it Polars
# has to infer the orientation from the data/schema dimensions, which emits a
# DataOrientationWarning and is ambiguous when there are exactly two tuples.
scaffold_df = pl.DataFrame(
    sorted_scaffolds,
    schema=["scaffold_smiles", "count"],
    orient="row",
)

# Select top 20 most frequent scaffolds
top_20_scaffolds = scaffold_df.sort("count", descending=True).head(20)

# Convert Polars DataFrame to Pandas for plotting
top_20_pd = top_20_scaffolds.to_pandas()

# Plot scaffold distribution: one bar per scaffold, labelled by its SMILES
plt.figure(figsize=(12, 6))
plt.bar(top_20_pd['scaffold_smiles'], top_20_pd['count'])

plt.title("Top 20 Scaffold Frequencies")
plt.xlabel("Scaffold SMILES")
plt.ylabel("Frequency")
plt.xticks(rotation=90)  # SMILES labels are long; rotate them to vertical

# Light dashed grid behind the bars
plt.grid(True, which='both', linestyle='--', linewidth=0.5)

# Save plot as an image
plt.tight_layout()  # Adjust layout for better fit
plt.savefig("scaffold_distribution.png")

print("Scaffold distribution plot saved as 'scaffold_distribution.png'")
