In [1]:
import csv
import json
from urllib.parse import quote

import numpy as np
import pandas as pd
import pubchempy as pcp
import requests
import urllib3
from bs4 import BeautifulSoup
from rdkit import Chem
from rdkit.Chem import Draw
from sklearn.preprocessing import StandardScaler

from utils import get_morgan_fingerprints, get_padel_descriptors

def name_to_smiles(name):
    """Resolve a chemical name to a SMILES string using public web services.

    The NCI Cactus resolver is queried first; if it gives no usable answer,
    PubChem (through PubChemPy) is tried next. Lookups are best-effort:
    service failures are swallowed so that applying this function over a
    DataFrame column keeps going.

    Args:
        name: Chemical name or identifier. ``None``, NaN and blank strings
            are treated as "no name" and are not sent to any service.

    Returns:
        The SMILES string from the first service that answers, or ``None``
        when the name is missing or neither service resolves it.
    """
    # Missing cells read by pandas arrive as float NaN; without this guard
    # they would be formatted into the URL as the literal name "nan".
    # (NaN is the only float that is not equal to itself.)
    if name is None or (isinstance(name, float) and name != name):
        return None
    name = str(name).strip()
    if not name:
        return None

    # First attempt: Cactus. The name is a URL path segment, so characters
    # such as '/', '#', '?' or spaces must be percent-encoded.
    encoded_name = quote(name, safe="")
    url = f"https://cactus.nci.nih.gov/chemical/structure/{encoded_name}/smiles"
    try:
        response = requests.get(url, timeout=5)
        if response.status_code == 200:
            smiles = response.text.strip()
            if smiles:  # an empty body is not a usable answer; fall through
                return smiles
    except requests.RequestException:
        # Network error or timeout: fall back to PubChem.
        pass

    # Second attempt: PubChemPy.
    try:
        compounds = pcp.get_compounds(name, 'name')
        if compounds:
            return compounds[0].canonical_smiles
    except Exception:
        # Best-effort lookup: any PubChem failure means "not resolved".
        # Unlike a bare `except:`, this lets KeyboardInterrupt through.
        pass

    return None


In [4]:
# The download below is made with verify=False, so silence the
# InsecureRequestWarning it would otherwise emit.
# NOTE(review): verify=False disables TLS certificate checking; prefer
# passing a CA bundle via `verify=` if the site's certificate can be validated.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Database URL
url = "https://www.fluorophores.tugraz.at/substance/"

# Download the HTML. A timeout keeps the cell from hanging forever, and
# raise_for_status() stops an HTTP error page from being parsed as data.
response = requests.get(url, verify=False, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Locate the first table on the page (the selector may need adjusting).
table = soup.find("table")
if table is None:
    raise ValueError(f"No <table> element found at {url}")

# Column headers
headers = [th.text.strip() for th in table.find_all("th")]

# Data rows: skip the first <tr> and any row without <td> cells.
rows = []
for tr in table.find_all("tr")[1:]:
    cols = [td.text.strip() for td in tr.find_all("td")]
    if cols:
        rows.append(cols)

# Save to a CSV file
with open("fluorophores_data.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    writer.writerows(rows)

print("Datos guardados en 'fluorophores_data.csv'")


Datos guardados en 'fluorophores_data.csv'


In [13]:
df = pd.read_csv('fluorophores_data.csv')

# Convert both name columns to SMILES.
# Every lookup is a network round-trip, so each distinct name is resolved
# once and the result mapped back onto the rows. Missing cells are skipped
# and stay missing in the output column instead of being sent to the resolver.
for source_col, smiles_col in (("Substance name", "substance_smiles"),
                               ("Solvent", "solvent_smiles")):
    unique_names = df[source_col].dropna().unique()
    smiles_by_name = {name: name_to_smiles(name) for name in unique_names}
    df[smiles_col] = df[source_col].map(smiles_by_name)

df.to_csv('base_datos_externa.csv', index = False)

print(df)

                              Substance name   Solvent   pH  \
0                                   Qdot 705       NaN  7.2   
1                    SYBR Safe DNA gel stain       NaN  NaN   
2                      (CS)2Ir(µ-Cl)2Ir(CS)2     CHCl3  NaN   
3    1,1  -Diethyl-4,4  -carbocyanine iodide  Methanol  NaN   
4                      1,2-Diphenylacetylene      EtOH  NaN   
..                                       ...       ...  ...   
950                      Zinc Phthalocyanine  pyridine  NaN   
951               Zinc Tetramesitylporphyrin   toluene  NaN   
952                Zinc Tetraphenylporphyrin   toluene  NaN   
953                                 ZsGreen1       NaN  NaN   
954                                ZsYellow1       NaN  NaN   

    Excitation max (nm) Emission max (nm)  \
0                   300               702   
1              509, 284               526   
2                   NaN               587   
3              707, 651               728   
4         373, 354

Se ha hecho una limpieza manual de la base de datos (eliminación de datos cuyo SMILES no se puede obtener y de duplicados, corrección del SMILES de algunos compuestos, y se ha escogido solo una $\lambda_{max}$ para aquellos que tenían más de una). El siguiente código ha sido adaptado de Greenman et al.

In [19]:
df_val_final = pd.read_csv('bd_validacion_final.csv')

# ChemFluor FSD Molecule Fingerprint
print("Calculating ChemFluor FSD molecule fingerprints...")
df_val_final, ecdkex_features_names = get_padel_descriptors(df_val_final, descriptor_types='chemfluor')

# Solvent Morgan Fingerprints
print("Calculating solvent Morgan fingerprints...")
df_val_final, solvent_fp_cols = get_morgan_fingerprints(df_val_final, mol_or_solv='solvents', nbits=256)

# Minnesota Solvent Descriptors
# The merge has no `on=`, so pandas joins on every column the two frames share.
# NOTE(review): presumably that is only 'solvent' - confirm against the CSV.
print("Merging with Minnesota solvent descriptors...")
mn_solvent_db = pd.read_csv('data_solvents/mn_solvent_db.csv')
minnesota_desc_cols = [x for x in mn_solvent_db.columns if x!='solvent']
df_val_final = df_val_final.merge(mn_solvent_db, how='left')

# ChemFluor CGSD Solvent Descriptors
print("Merging with ChemFluor CGSD solvent descriptors...")
chemfluor_solvent_db = pd.read_csv('data_solvents/chemfluor_cgsd_solvent_db.csv')
chemfluor_solv_desc_cols = [x for x in chemfluor_solvent_db.columns if x!='solvent']
df_val_final = df_val_final.merge(chemfluor_solvent_db, how='left')


# Remove rows with NaN and standardise the features.
# DataFrame.dropna() returns a new frame, so the result must be assigned back;
# calling it without assignment leaves every NaN row in place.
# This drops rows with a NaN in ANY column; pass `subset=[...]` to dropna()
# if only some columns should be checked.
n_rows_before = len(df_val_final)
df_val_final = df_val_final.dropna().reset_index(drop=True)
print(f"Dropped {n_rows_before - len(df_val_final)} rows containing NaN")

# Scale every float64 column except the target 'peakwavs_max'.
# NOTE(review): the scaler is fitted on this validation set itself; if the
# features feed a model trained on separately scaled data, the scaler fitted
# on the training set should be reused instead - confirm.
float_columns = df_val_final.select_dtypes(include=['float64']).columns.drop('peakwavs_max', errors='ignore')
scaler = StandardScaler(with_mean = True , with_std = True)
df_val_final[float_columns] = scaler.fit_transform(df_val_final[float_columns])

df_val_final.to_csv('val_final_all_features.csv', index=False)

# Export Feature Name Lists to JSON
feature_names_dict = {'ecdkex': ecdkex_features_names,
                      'sfp': solvent_fp_cols,
                      'minnesota': minnesota_desc_cols,
                      'cgsd': chemfluor_solv_desc_cols}

feature_names_file = "feature_names.json"

# `json` must be imported in the notebook's import cell for this to run
# on a fresh kernel.
with open(feature_names_file, "w") as f:
    json.dump(feature_names_dict, f, indent=4, sort_keys=True)

Calculating ChemFluor FSD molecule fingerprints...
Calculating solvent Morgan fingerprints...
Merging with Minnesota solvent descriptors...
Merging with ChemFluor CGSD solvent descriptors...


  unique_df[col_name+str(i+1)] = np.nan


# Base de datos para predicción en chemprop

In [4]:
def write_chemprop_dataset(source_csv, target_csv):
    """Build a two-column chemprop input file from a feature CSV.

    Reads ``source_csv``, joins the 'smiles' and 'solvent' columns with a '.'
    into 'combined_smiles', keeps only that column plus the 'peakwavs_max'
    target, and writes the result to ``target_csv`` without the index.
    (Presumably 'solvent' holds the solvent SMILES - confirm against the CSV.)

    Args:
        source_csv: Path of a CSV containing 'smiles', 'solvent' and
            'peakwavs_max' columns.
        target_csv: Path of the CSV to write.

    Returns:
        The two-column DataFrame that was written.
    """
    frame = pd.read_csv(source_csv)
    frame['combined_smiles'] = frame['smiles'] + '.' + frame['solvent']
    frame = frame[['combined_smiles', 'peakwavs_max']]
    frame.to_csv(target_csv, index=False)
    return frame


# Training set
write_chemprop_dataset('scaled_df.csv', 'chemprop_data_train.csv')

# Validation set; `data` stays bound to the validation frame, which is the
# state the original copy-pasted statements left behind.
data = write_chemprop_dataset('bd_validacion_final.csv', 'chemprop_data_val.csv')
