In [1]:
# D:\FJTCM\DeepLife\DLProject\run_fingerprint_lifespan_simplified.py

import sys
import pandas as pd
from rdkit import Chem
from rdkit.Chem import MACCSkeys
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect # For Morgan
from rdkit.Chem import RDKFingerprint # For Topological (RDKit default)
import numpy as np
from tqdm import tqdm
import os

In [2]:
# --- Fingerprint Calculation Functions (RDKit only) ---
def compute_morgan_fingerprint(smi, radius=2, n_bits=2048):
    """Return the Morgan bit-vector fingerprint of a SMILES string.

    Args:
        smi: SMILES string handed to ``Chem.MolFromSmiles``.
        radius: Radius passed to ``GetMorganFingerprintAsBitVect``.
        n_bits: Size of the bit vector (``nBits``) and of the fallback array.

    Returns:
        A 1-D integer NumPy array built from the fingerprint bits, or an
        all-zero array of length ``n_bits`` when the SMILES cannot be parsed.
    """
    molecule = Chem.MolFromSmiles(smi)
    if molecule is None:
        # Unparseable SMILES: emit a zero placeholder so callers still get
        # a vector of the expected length.
        return np.zeros(n_bits, dtype=int)

    bit_vect = GetMorganFingerprintAsBitVect(molecule, radius, nBits=n_bits)
    bits = list(bit_vect)
    return np.array(bits, dtype=int)

def compute_rdkit_topological_fingerprint(smi, n_bits=2048):
    """Return the RDKit topological fingerprint of a SMILES string.

    Args:
        smi: SMILES string handed to ``Chem.MolFromSmiles``.
        n_bits: Size passed to ``RDKFingerprint`` as ``fpSize`` and the
            length of the fallback array.

    Returns:
        A 1-D integer NumPy array built from the fingerprint bits, or an
        all-zero array of length ``n_bits`` when the SMILES cannot be parsed.
    """
    molecule = Chem.MolFromSmiles(smi)
    if molecule is None:
        # Unparseable SMILES: emit a zero placeholder of the expected length.
        return np.zeros(n_bits, dtype=int)

    bit_vect = RDKFingerprint(molecule, fpSize=n_bits)
    bits = list(bit_vect)
    return np.array(bits, dtype=int)

def compute_maccs_keys_fingerprint(smi):
    """Return the 166 MACCS keys of a SMILES string as an integer array.

    Args:
        smi: SMILES string handed to ``Chem.MolFromSmiles``.

    Returns:
        A 1-D integer NumPy array of length 166. Position ``k`` holds 1 when
        bit ``k + 1`` of the vector from ``MACCSkeys.GenMACCSKeys`` is set,
        otherwise 0. An all-zero array is returned when the SMILES cannot be
        parsed.
    """
    key_count = 166  # Number of MACCS keys kept in the output.

    molecule = Chem.MolFromSmiles(smi)
    if molecule is None:
        return np.zeros(key_count, dtype=int)

    maccs_vect = MACCSkeys.GenMACCSKeys(molecule)

    # The generated vector is indexed 1..166 here; index 0 is skipped, so
    # every key is shifted down by one in the returned array.
    flags = [1 if maccs_vect[pos] else 0 for pos in range(1, key_count + 1)]
    return np.array(flags, dtype=int)

# --- Fingerprint Generator Class (Simplified) ---
class SimplifiedFingerprintGenerator:
    """Compute fingerprints for lists of SMILES and save them as CSV files.

    Fingerprint functions are registered in ``fp_config`` under two groups:
    ``"hash"`` (Morgan, Topological) and ``"nonhash"`` (MACCS). Each group is
    written to its own CSV file with one column per bit, named
    ``<fingerprint name>_<bit index>``.
    """

    def __init__(self, morgan_radius=2, morgan_nbits=2048, rdkit_fp_nbits=2048):
        """Store the fingerprint settings and build the group registry.

        Args:
            morgan_radius: Radius forwarded to ``compute_morgan_fingerprint``.
            morgan_nbits: Bit count forwarded to ``compute_morgan_fingerprint``.
            rdkit_fp_nbits: Bit count forwarded to
                ``compute_rdkit_topological_fingerprint``.
        """
        self.morgan_radius = morgan_radius
        self.morgan_nbits = morgan_nbits
        self.rdkit_fp_nbits = rdkit_fp_nbits
        self.maccs_nbits = 166

        # Lambdas keep the lookup of the compute_* functions (and of the
        # settings on ``self``) deferred until a fingerprint is computed.
        self.fp_config = {
            "hash": {
                "Morgan": lambda smi: compute_morgan_fingerprint(smi, self.morgan_radius, self.morgan_nbits),
                "Topological": lambda smi: compute_rdkit_topological_fingerprint(smi, self.rdkit_fp_nbits),
            },
            "nonhash": {
                "MACCS": lambda smi: compute_maccs_keys_fingerprint(smi),
            }
        }
        # Column count per fingerprint; used for zero vectors and column names.
        self.fp_bit_sizes = {
            "Morgan": self.morgan_nbits,
            "Topological": self.rdkit_fp_nbits,
            "MACCS": self.maccs_nbits,
        }

    def generate_fingerprints_for_smiles_list(self, smiles_list: list, fp_compute_func, fp_name, n_bits):
        """Compute one fingerprint type for every entry of ``smiles_list``.

        Args:
            smiles_list: SMILES entries; entries for which ``pd.isna`` is true
                are replaced by an all-zero vector instead of being computed.
            fp_compute_func: Callable taking one SMILES entry and returning
                its fingerprint vector.
            fp_name: Fingerprint name, used for the progress-bar label and as
                the column-name prefix.
            n_bits: Number of columns, and length of the zero vectors.

        Returns:
            A DataFrame with one row per entry and columns
            ``{fp_name}_0`` .. ``{fp_name}_{n_bits - 1}``.
        """
        vectors = [
            np.zeros(n_bits, dtype=int) if pd.isna(smi) else fp_compute_func(smi)
            for smi in tqdm(smiles_list, desc=f"Computing {fp_name}")
        ]
        column_names = [f"{fp_name}_{i}" for i in range(n_bits)]
        return pd.DataFrame(vectors, columns=column_names)

    def _generate_and_save_group(self, group_key, label, smiles_list, output_dir, target_name):
        """Compute every fingerprint of one ``fp_config`` group and save one CSV.

        Args:
            group_key: Key into ``fp_config`` (``"hash"`` or ``"nonhash"``);
                also used as the file-name stem.
            label: Human-readable group name used in the progress messages.
            smiles_list: SMILES entries to fingerprint.
            output_dir: Existing directory the CSV is written into.
            target_name: Prefix prepended to the file name.
        """
        group_frames = []
        for fp_name, fp_func in self.fp_config[group_key].items():
            print(f"Processing {label} type: {fp_name}")
            n_bits = self.fp_bit_sizes[fp_name]
            group_frames.append(
                self.generate_fingerprints_for_smiles_list(smiles_list, fp_func, fp_name, n_bits)
            )

        if not group_frames:
            print(f"No {label} fingerprints were generated.")
            return

        # Side-by-side concatenation: one block of columns per fingerprint.
        combined_df = pd.concat(group_frames, axis=1)
        output_path = os.path.join(output_dir, f"{target_name}{group_key}.csv")
        combined_df.to_csv(output_path, index=False)
        print(f"Saved {label} fingerprints to {output_path} with shape {combined_df.shape}")

    def generate_and_save_fingerprints(self, smiles_list: list, output_dir: str, target_name: str):
        """Compute all configured fingerprints and write them to ``output_dir``.

        Creates ``output_dir`` if needed, then writes
        ``{target_name}hash.csv`` and ``{target_name}nonhash.csv`` (a file is
        skipped when its group has no fingerprints registered).

        Args:
            smiles_list: SMILES entries to fingerprint.
            output_dir: Directory receiving the CSV files.
            target_name: Prefix prepended to both file names.
        """
        os.makedirs(output_dir, exist_ok=True)
        self._generate_and_save_group("hash", "HASH", smiles_list, output_dir, target_name)
        self._generate_and_save_group("nonhash", "NON-HASH", smiles_list, output_dir, target_name)

# %%
# --- Main execution block for Lifespan data (Simplified Fingerprints) ---
def _process_split_fingerprints(
    csv_file, csv_label, data_label, output_dir, morgan_r, morgan_n, rdkit_fp_n
):
    """Generate and save the fingerprints for one data split (one CSV file).

    Args:
        csv_file: Path of the CSV to read; must contain a ``SMILES`` column.
        csv_label: File name used in the missing-column error message.
        data_label: Split name used in the "no valid SMILES" message.
        output_dir: Directory the fingerprint CSVs are written into.
        morgan_r: Morgan radius for the fingerprint generator.
        morgan_n: Morgan bit count for the fingerprint generator.
        rdkit_fp_n: Topological fingerprint bit count for the generator.

    Returns:
        ``False`` when the CSV is missing or lacks the ``SMILES`` column (the
        caller should stop), ``True`` otherwise.
    """
    if not os.path.exists(csv_file):
        print(f"ERROR: {csv_file} not found!")
        return False

    frame = pd.read_csv(csv_file)
    if "SMILES" not in frame.columns:
        print(f"ERROR: {csv_label} must contain 'SMILES' column.")
        return False

    # NOTE(review): rows with a missing SMILES are dropped here, so the saved
    # fingerprint rows no longer line up one-to-one with the CSV rows when
    # such rows exist — confirm the downstream consumer expects this.
    smiles_list = frame["SMILES"].dropna().tolist()
    if not smiles_list:
        print(f"  No valid SMILES in {data_label} data for fingerprints.")
        return True

    generator = SimplifiedFingerprintGenerator(
        morgan_radius=morgan_r, morgan_nbits=morgan_n, rdkit_fp_nbits=rdkit_fp_n
    )
    generator.generate_and_save_fingerprints(
        smiles_list=smiles_list,
        output_dir=output_dir,
        target_name="",  # The target name is already part of output_dir.
    )
    return True


def main_process_simplified_fingerprints(
    base_dir: str, # Base project directory e.g., D:\FJTCM\DeepLife\DLProject
    morgan_r: int, morgan_n: int, rdkit_fp_n: int
    ) -> None:
    """Compute and save fingerprints for ``train.csv`` and ``test.csv``.

    Both files are read from ``base_dir``. Results are written to
    ``<base_dir>/processed_fingerprints/fingerprint/<target name>/`` as
    ``hash.csv`` and ``nonhash.csv``. Processing stops (after printing an
    error) at the first split whose CSV is missing or has no ``SMILES``
    column; a split without any non-null SMILES is skipped with a message.

    Args:
        base_dir: Project directory containing ``train.csv`` and ``test.csv``.
        morgan_r: Morgan fingerprint radius.
        morgan_n: Morgan fingerprint bit count.
        rdkit_fp_n: Topological fingerprint bit count.
    """
    output_fingerprint_base = os.path.join(base_dir, "processed_fingerprints")
    fingerprint_dir = os.path.join(output_fingerprint_base, "fingerprint")

    # (CSV file name, target sub-directory, header title, message label)
    splits = (
        ("train.csv", "LifespanReg_train", "Training", "training"),
        ("test.csv", "LifespanReg_test", "Test", "test"),
    )
    for csv_name, target_name, title, data_label in splits:
        print(f"\n--- Processing {title} Data Fingerprints: {target_name} ---")
        succeeded = _process_split_fingerprints(
            csv_file=os.path.join(base_dir, csv_name),
            csv_label=csv_name,
            data_label=data_label,
            output_dir=os.path.join(fingerprint_dir, target_name),
            morgan_r=morgan_r,
            morgan_n=morgan_n,
            rdkit_fp_n=rdkit_fp_n,
        )
        if not succeeded:
            return

    print(f"\nSimplified fingerprint processing complete. Check the '{output_fingerprint_base}' directory.")

if __name__ == "__main__":
    # Fingerprint settings. Per the original note, these must match the
    # values used in the Notebook CONFIG.
    MORGAN_RADIUS_CONFIG = 2
    MORGAN_NBITS_CONFIG = 2048
    RDKIT_FP_NBITS_CONFIG = 2048

    # Project root holding train.csv / test.csv and receiving the outputs.
    PROJECT_BASE_DIR = r"D:\FJTCM\DeepLife\DLProject"

    # Arguments are positional: base_dir, morgan_r, morgan_n, rdkit_fp_n.
    main_process_simplified_fingerprints(
        PROJECT_BASE_DIR,
        MORGAN_RADIUS_CONFIG,
        MORGAN_NBITS_CONFIG,
        RDKIT_FP_NBITS_CONFIG,
    )
    # Example of the resulting files for the training split:
    # D:\FJTCM\DeepLife\DLProject\processed_fingerprints\fingerprint\LifespanReg_train\hash.csv
    # D:\FJTCM\DeepLife\DLProject\processed_fingerprints\fingerprint\LifespanReg_train\nonhash.csv


--- Processing Training Data Fingerprints: LifespanReg_train ---
Processing HASH type: Morgan


Computing Morgan: 100%|██████████| 1679/1679 [00:01<00:00, 1587.47it/s]


Processing HASH type: Topological


Computing Topological: 100%|██████████| 1679/1679 [00:02<00:00, 649.09it/s]


Saved HASH fingerprints to D:\FJTCM\DeepLife\DLProject\processed_fingerprints\fingerprint\LifespanReg_train\hash.csv with shape (1679, 4096)
Processing NON-HASH type: MACCS


Computing MACCS: 100%|██████████| 1679/1679 [00:01<00:00, 937.01it/s] 


Saved NON-HASH fingerprints to D:\FJTCM\DeepLife\DLProject\processed_fingerprints\fingerprint\LifespanReg_train\nonhash.csv with shape (1679, 166)

--- Processing Test Data Fingerprints: LifespanReg_test ---
Processing HASH type: Morgan


Computing Morgan: 100%|██████████| 21/21 [00:00<00:00, 1742.96it/s]


Processing HASH type: Topological


Computing Topological: 100%|██████████| 21/21 [00:00<00:00, 320.69it/s]


Saved HASH fingerprints to D:\FJTCM\DeepLife\DLProject\processed_fingerprints\fingerprint\LifespanReg_test\hash.csv with shape (21, 4096)
Processing NON-HASH type: MACCS


Computing MACCS: 100%|██████████| 21/21 [00:00<00:00, 432.47it/s]

Saved NON-HASH fingerprints to D:\FJTCM\DeepLife\DLProject\processed_fingerprints\fingerprint\LifespanReg_test\nonhash.csv with shape (21, 166)

Simplified fingerprint processing complete. Check the 'D:\FJTCM\DeepLife\DLProject\processed_fingerprints' directory.



