In [11]:
import sys
sys.path.append('../03-code/')
from config import PROJECT_ROOT_DIRECTORY, COLUMNS_DICT
import pandas as pd

### Obtain five topological molecular descriptors

In [3]:
import csv
from rdkit import Chem
from synthesis_feasibility_helper import calculate_topological_formability_descriptors

def generate_formability_descriptors_csv_file_from_smiles(input_csv, output_csv):
    """Compute topological formability descriptors for every SMILES in a CSV file.

    Each row of ``input_csv`` is read, its ``smiles_canonical`` value is parsed
    with RDKit, and the resulting molecule is handed to
    ``calculate_topological_formability_descriptors``. The returned descriptor
    dict is merged with the row's ``smiles_canonical`` and ``linker_position``
    values and written as one row of ``output_csv``.

    Rows whose SMILES string cannot be parsed by RDKit are skipped and reported
    through ``warnings.warn`` instead of being passed to the descriptor helper
    as ``None``.

    Args:
        input_csv: Path of the CSV file to read. It must contain the columns
            ``smiles_canonical`` and ``linker_position``.
        output_csv: Path of the CSV file to write. An existing file is
            overwritten.

    Raises:
        KeyError: If ``smiles_canonical`` or ``linker_position`` is missing
            from the input file.
        ValueError: Raised by ``csv.DictWriter`` if the descriptor helper
            returns a key that is not one of the output columns.
    """
    import warnings

    # Output columns: the two identifying columns copied from the input,
    # followed by the descriptor columns.
    fieldnames = ['smiles_canonical', 'linker_position', 'STEI', 'NumRot_tail', 'eccentricity', 'disNN', 'NumN']

    with open(input_csv, 'r') as infile, open(output_csv, 'w', newline='') as outfile:
        reader = csv.DictReader(infile)
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)

        # Write the header to the output CSV
        writer.writeheader()

        # Process each molecule (one per row) of the input CSV
        for row in reader:
            smiles_canonical = row['smiles_canonical']

            # Chem.MolFromSmiles returns None (it does not raise) when the
            # SMILES string cannot be parsed, so check before using the result.
            # NOTE(review): skipping assumes the descriptor helper cannot deal
            # with a None molecule - confirm against the helper.
            mol = Chem.MolFromSmiles(smiles_canonical)
            if mol is None:
                warnings.warn(
                    f"Skipping unparsable SMILES {smiles_canonical!r} "
                    f"(line {reader.line_num} of {input_csv})"
                )
                continue

            # Calculate the formability descriptors of the molecule
            formability_descriptors_dict = calculate_topological_formability_descriptors(mol)
            output_row = {
                'smiles_canonical': smiles_canonical,
                'linker_position': row['linker_position'],
            }

            output_row.update(formability_descriptors_dict)  # Add descriptor values to the row

            # Write the row to the output CSV
            writer.writerow(output_row)

In [7]:
# Build the descriptor CSV for molecule generations 5 and 6 only.
# The scoring loop further down reads descriptor files for generations 0-6,
# so the files for generations 0-4 must already exist on disk.
for generation_index in range(5, 7):
    file_suffix = str(generation_index) + '.csv'

    # Input: file containing the SMILES strings and linker positions
    input_csv = (
        PROJECT_ROOT_DIRECTORY
        + '01-rawdata/01-molecular-generation/fingerprints/fingerprints_generation_'
        + file_suffix
    )
    # Output: file that receives the formability descriptors
    output_csv = (
        PROJECT_ROOT_DIRECTORY
        + '01-rawdata/12-formability-score/formability_descriptors_generation_'
        + file_suffix
    )

    # Read the input CSV and write the descriptor values to the output CSV
    generate_formability_descriptors_csv_file_from_smiles(input_csv, output_csv)

### Calculate the formability score

In [12]:
# Descriptor columns handed to calculate_formability_score for each molecule.
# 'NumN' is written to the descriptor CSV files but is not part of this selection.
selected_formability_descriptors = [
    'STEI',
    'NumRot_tail',
    'eccentricity',
    'disNN',
    'linker_position',
]

In [13]:
from synthesis_feasibility_helper import calculate_formability_score
# Value forwarded as the `smearing_factor` argument of calculate_formability_score
# for every molecule that is scored.
# NOTE(review): the meaning and units of 0.15 are defined by the helper, which is
# not visible here - confirm before tuning.
smearing_factor = 0.15

In [27]:
def generate_formability_score_from_formability_descriptors(formability_descriptors_dataframe):
    """Compute a formability score record for every row of a descriptor table.

    For each row, the columns named in the module-level
    ``selected_formability_descriptors`` are collected into a dict and passed
    to ``calculate_formability_score`` together with the module-level
    ``smearing_factor``. The returned record is extended with the row's
    ``smiles_canonical`` value and the descriptor values themselves.

    Args:
        formability_descriptors_dataframe: DataFrame with a
            ``smiles_canonical`` column and one column per selected descriptor.

    Returns:
        A DataFrame with one row per input row and ``smiles_canonical`` as the
        first column, followed by the remaining record columns in the order
        pandas derives them from the per-row records. If the input has no
        rows, an empty DataFrame containing only the ``smiles_canonical``
        column is returned, because the score column names are only known
        from the output of ``calculate_formability_score``.
    """
    formability_score_list = []
    for _, row in formability_descriptors_dataframe.iterrows():
        formability_descriptors_dict = {
            key: row[key] for key in selected_formability_descriptors
        }
        formability_score = calculate_formability_score(formability_descriptors_dict, smearing_factor=smearing_factor)
        formability_score['smiles_canonical'] = row['smiles_canonical']
        formability_score.update(formability_descriptors_dict)
        formability_score_list.append(formability_score)

    # A DataFrame built from an empty list has no columns at all, so selecting
    # 'smiles_canonical' from it would raise KeyError. Return a well-formed
    # empty table instead.
    if not formability_score_list:
        return pd.DataFrame(columns=['smiles_canonical'])

    formability_score_dataframe = pd.DataFrame(formability_score_list)

    # Move the identifier column to the front; keep the other columns in order.
    cols = ['smiles_canonical'] + [col for col in formability_score_dataframe.columns if col != 'smiles_canonical']
    return formability_score_dataframe[cols]

In [28]:
# Score every molecule generation (0 through 6): read the descriptor CSV,
# compute the formability scores and store them next to the descriptors.
for generation_index in range(7):
    file_suffix = str(generation_index) + '.csv'
    input_csv = (
        PROJECT_ROOT_DIRECTORY
        + '01-rawdata/12-formability-score/formability_descriptors_generation_'
        + file_suffix
    )
    output_csv = (
        PROJECT_ROOT_DIRECTORY
        + '01-rawdata/12-formability-score/formability_score_generation_'
        + file_suffix
    )

    # Load the descriptors, score them, and write the result without the index column
    formability_descriptors_dataframe = pd.read_csv(input_csv)
    formability_score_dataframe = generate_formability_score_from_formability_descriptors(
        formability_descriptors_dataframe
    )
    formability_score_dataframe.to_csv(output_csv, index=False)
