# XGBoost y kNN con distintas representaciones para el disolvente

In [1]:
import pandas as pd
import numpy as np
import json

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

In [2]:
# Feature-group name -> list of column names (the groups are concatenated
# into the model inputs by the cells below).
with open("feature_names.json", "r") as names_file:
    feature_names_dict = json.load(names_file)

# Tables read from CSV: the dataset used for cross-validation and the
# solvent fingerprint table.
base_datos_entera = pd.read_csv('scaled_df.csv')
bd_solvent_mpnn = pd.read_csv('solvent_mpnn_fingerprints.csv')

## **Random split**

In [6]:
# Key of feature_names_dict holding the columns of each solvent representation.
solvent_to_feat_dic = {
    'minnesota_descriptors': 'minnesota',
    'morgan_fingerprint': 'sfp',
    'chemfluor': 'cgsd',
}

# One shuffled, seeded 5-fold splitter shared by every representation and model.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Zero-argument builders: calling one returns a new, unfitted estimator.
regressor_builders = {
    'xgb': lambda: XGBRegressor(
        n_estimators=700,
        learning_rate=0.1,
        max_depth=17,
        gamma=0.1,
        min_child_weight=7,
        colsample_bytree=1,
        subsample=1.0,
        objective='reg:squarederror',
        verbosity=1,
        random_state=22,
        n_jobs=-1,
    ),
    'knn': lambda: KNeighborsRegressor(
        n_neighbors=3,
        weights='distance',
        algorithm='ball_tree',
        leaf_size=30,
        p=1,
        metric='minkowski',
        n_jobs=-1,
    ),
}

for solvent_representation in ('morgan_fingerprint', 'chemfluor', 'minnesota_descriptors'):
    print(f"\n Representación disolvente: {solvent_representation}")

    # Model inputs: the 'ecdkex' columns followed by this representation's
    # solvent columns.
    solvent_features_name = solvent_to_feat_dic[solvent_representation]
    features = feature_names_dict['ecdkex'] + feature_names_dict[solvent_features_name]

    df = base_datos_entera[['smiles', 'solvent', 'peakwavs_max'] + features]
    smiles, x, y = df['smiles'], df[features], df['peakwavs_max']

    for metodo in ('xgb', 'knn'):
        print(f"\n Método: {metodo}")
        reg = regressor_builders[metodo]()

        per_fold_mae = []
        per_fold_tables = []

        for fold_idx, (train_idx, val_idx) in enumerate(cv.split(x, y)):
            # Refit on the training rows of this fold, then score the held-out rows.
            reg.fit(x.iloc[train_idx], y.iloc[train_idx])
            y_val = y.iloc[val_idx]
            y_pred = reg.predict(x.iloc[val_idx])

            mae = mean_absolute_error(y_val, y_pred)
            per_fold_mae.append(mae)
            print(f"    Fold {fold_idx}: MAE = {mae:.4f}")

            # Keep the per-sample predictions, tagged with the fold they came from.
            per_fold_tables.append(
                pd.DataFrame({
                    'smiles': smiles.iloc[val_idx].values,
                    'real': y_val.values,
                    'predicho': y_pred,
                    'fold': fold_idx,
                })
            )

        mean_mae = np.mean(per_fold_mae)
        print(f"         Media de los MAEs: {mean_mae:.4f}")

        # One CSV per (method, representation) holding every fold's predictions.
        resultados_df = pd.concat(per_fold_tables, ignore_index=True)
        filename = f'Resultados/CV/{metodo}_{solvent_representation}.csv'
        resultados_df.to_csv(filename, index=False)


 Representación disolvente: morgan_fingerprint

 Método: xgb
    Fold 0: MAE = 13.2786
    Fold 1: MAE = 13.8343
    Fold 2: MAE = 13.8481
    Fold 3: MAE = 14.1134
    Fold 4: MAE = 14.3094
         Media de los MAEs: 13.8768

 Método: knn
    Fold 0: MAE = 14.8674
    Fold 1: MAE = 15.8294
    Fold 2: MAE = 15.6037
    Fold 3: MAE = 15.4896
    Fold 4: MAE = 16.1277
         Media de los MAEs: 15.5836

 Representación disolvente: chemfluor

 Método: xgb
    Fold 0: MAE = 12.9227
    Fold 1: MAE = 13.1769
    Fold 2: MAE = 13.6703
    Fold 3: MAE = 13.6391
    Fold 4: MAE = 13.8897
         Media de los MAEs: 13.4598

 Método: knn
    Fold 0: MAE = 13.9644
    Fold 1: MAE = 14.8152
    Fold 2: MAE = 14.8786
    Fold 3: MAE = 14.5530
    Fold 4: MAE = 15.2984
         Media de los MAEs: 14.7019

 Representación disolvente: minnesota_descriptors

 Método: xgb
    Fold 0: MAE = 12.7086
    Fold 1: MAE = 13.5074
    Fold 2: MAE = 13.5730
    Fold 3: MAE = 13.7903
    Fold 4: MAE = 13.881

## **Scaffold split**

In [9]:
# Key of feature_names_dict holding the columns of each solvent representation.
# 'solvent_mpnn' has no entry in feature_names_dict (None) and is not part of
# the loop below.
solvent_to_feat_dic = {'minnesota_descriptors': 'minnesota',
                'morgan_fingerprint': 'sfp',
                'solvent_mpnn': None,
                'chemfluor': 'cgsd'}

with open("feature_names.json", "r") as f:
    feature_names_dict = json.load(f)

test_dfs = {}
train_dfs = {}

# Load the five pre-computed scaffold train/test partitions into dictionaries.
for i in range(1, 6):
    test_dfs[f'test_{i}'] = pd.read_csv(f'CV/all features/test_{i}.csv')
    train_dfs[f'train_{i}'] = pd.read_csv(f'CV/all features/train_{i}.csv')

for solvent_representation in [
    'morgan_fingerprint',
    'chemfluor',
    'minnesota_descriptors',
]:
    print(f"\n Representación disolvente: {solvent_representation}")

    # Feature selection: 'ecdkex' columns plus this representation's solvent columns.
    feat_name = solvent_to_feat_dic[solvent_representation]
    features = feature_names_dict['ecdkex'] + feature_names_dict[feat_name]

    for metodo in ['xgb', 'knn']:

        print(f"\n  Método: {metodo}")

        if metodo == 'xgb':
            # random_state / n_jobs match the random-split and final-validation
            # cells so the scaffold run is seeded like the others.
            reg = XGBRegressor(n_estimators=900,
                                learning_rate=0.1,
                                max_depth=17,
                                gamma=0.1,
                                min_child_weight=7,
                                colsample_bytree=1,
                                subsample=1.0,
                                objective='reg:squarederror',
                                verbosity=1,
                                random_state=22, n_jobs=-1)
        else:
            reg = KNeighborsRegressor(n_neighbors=3,
                                      weights='distance',
                                      algorithm='ball_tree',
                                      leaf_size=30,
                                      p=1,
                                      metric='minkowski', n_jobs=-1)

        # Reset per method so the mean below covers only this method's folds.
        fold_maes = []
        resultados = []
        for i in range(1, 6):

            train = train_dfs[f'train_{i}']
            test = test_dfs[f'test_{i}']

            x_train = train[features]
            y_train = train['peakwavs_max']
            x_test = test[features]
            y_test = test['peakwavs_max']

            smiles_val = test['smiles']

            reg.fit(x_train, y_train)
            y_pred = reg.predict(x_test)

            mae = mean_absolute_error(y_test, y_pred)
            fold_maes.append(mae)
            print(f"    Fold {i}: MAE = {mae:.4f}")

            # Per-sample predictions of this fold, tagged with the fold number.
            fold_results = pd.DataFrame({
                'smiles': smiles_val.values,
                'real': y_test.values,
                'predicho': y_pred,
                'fold': [i] * len(y_test)
            })
            resultados.append(fold_results)

        # Unweighted mean of the five per-fold MAEs.
        media = np.mean(fold_maes)
        print(f"         Media de los MAEs: {media:.4f}")

        resultados_df = pd.concat(resultados, ignore_index=True)
        nombre_archivo = f'Resultados/CV/scaffold/{metodo}_{solvent_representation}.csv'
        resultados_df.to_csv(nombre_archivo, index=False)


 Representación disolvente: morgan_fingerprint

  Método: xgb
    Fold 1: MAE = 33.5985
    Fold 2: MAE = 34.8080
    Fold 3: MAE = 31.6485
    Fold 4: MAE = 32.6392
    Fold 5: MAE = 31.0166
         Media de los MAEs: 32.7422

  Método: knn
    Fold 1: MAE = 36.4829
    Fold 2: MAE = 36.5978
    Fold 3: MAE = 36.7454
    Fold 4: MAE = 35.4593
    Fold 5: MAE = 35.1795
         Media de los MAEs: 32.7422

 Representación disolvente: chemfluor

  Método: xgb
    Fold 1: MAE = 33.6209
    Fold 2: MAE = 34.4158
    Fold 3: MAE = 31.5280
    Fold 4: MAE = 32.0761
    Fold 5: MAE = 29.9520
         Media de los MAEs: 32.7422

  Método: knn
    Fold 1: MAE = 36.3379
    Fold 2: MAE = 36.7753
    Fold 3: MAE = 36.8202
    Fold 4: MAE = 35.9212
    Fold 5: MAE = 35.0622
         Media de los MAEs: 32.7422

 Representación disolvente: minnesota_descriptors

  Método: xgb
    Fold 1: MAE = 33.5715
    Fold 2: MAE = 35.4947
    Fold 3: MAE = 32.4495
    Fold 4: MAE = 31.9748
    Fold 5: MAE = 3

## **Hay un error en la media de los MAEs impresa en la celda anterior**
***En la celda de abajo sí están bien calculados.***

In [23]:
# Recompute the scaffold-split summary from the saved per-sample predictions.
# The reported figure is the mean of the per-fold MAEs (grouping by the saved
# 'fold' column), not the MAE pooled over all rows: the two differ whenever the
# folds have different sizes.
for metodo in ['xgb', 'knn']:
    print(f"\nMétodo: {metodo}")
    for solvent_representation in [
    'morgan_fingerprint',
    'chemfluor',
    'minnesota_descriptors']:

        df_calcular_MAE = pd.read_csv(f'Resultados/CV/scaffold/{metodo}_{solvent_representation}.csv')

        # Absolute error per sample -> MAE per fold -> mean across folds.
        errores_abs = (df_calcular_MAE['real'] - df_calcular_MAE['predicho']).abs()
        maes_por_fold = errores_abs.groupby(df_calcular_MAE['fold']).mean()
        media_maes = maes_por_fold.mean()

        print(f"\n   Representación disolvente: {solvent_representation}")
        print(f'      Media de los MAEs: {media_maes:.4f} ')


Método: xgb

   Representación disolvente: morgan_fingerprint
      Media de los MAEs: 32.7423 

   Representación disolvente: chemfluor
      Media de los MAEs: 32.3187 

   Representación disolvente: minnesota_descriptors
      Media de los MAEs: 32.7765 

Método: knn

   Representación disolvente: morgan_fingerprint
      Media de los MAEs: 36.0930 

   Representación disolvente: chemfluor
      Media de los MAEs: 36.1834 

   Representación disolvente: minnesota_descriptors
      Media de los MAEs: 36.1680 


## Validación final

In [17]:
# Final validation: fit each model on the full training table and score it on
# the held-out validation table, for every solvent representation.

# Key of feature_names_dict holding the columns of each solvent representation.
# 'solvent_mpnn' has no entry in feature_names_dict (None) and is not part of
# the loop below.
solvent_to_feat_dic = {'minnesota_descriptors': 'minnesota',
                'morgan_fingerprint': 'sfp',
                'solvent_mpnn': None,
                'chemfluor': 'cgsd'}

with open("feature_names.json", "r") as f:
    feature_names_dict = json.load(f)

df_train = pd.read_csv('scaled_df.csv')
df_val = pd.read_csv('val_final_all_features.csv')

y_val = df_val['peakwavs_max']
y_train = df_train['peakwavs_max']


for solvent_representation in [
    'morgan_fingerprint',
    'chemfluor',
    'minnesota_descriptors',
]:
    print(f'\nRepresentación disolvente: {solvent_representation}')
    feat_name = solvent_to_feat_dic[solvent_representation]
    features = feature_names_dict['ecdkex'] + feature_names_dict[feat_name]

    x_train = df_train[features]
    x_val = df_val[features]
    smiles_val = df_val['smiles']

    for metodo in ['xgb', 'knn']:

        print(f"\n  Método: {metodo}")

        if metodo == 'xgb':
            reg = XGBRegressor(
                    n_estimators=700,
                    learning_rate=0.1,
                    max_depth=17,
                    gamma=0.1,
                    min_child_weight=7,
                    colsample_bytree=1,
                    subsample=1.0,
                    objective='reg:squarederror',
                    verbosity=1,
                    random_state=22, n_jobs=-1)
        else:
            reg = KNeighborsRegressor(n_neighbors=3,
                          weights='distance',
                          algorithm='ball_tree',
                          leaf_size=30,
                          p=1,
                          metric='minkowski', n_jobs=-1)

        reg.fit(x_train, y_train)
        y_pred = reg.predict(x_val)

        mae = mean_absolute_error(y_val, y_pred)

        # Spread reported next to the MAE: population standard deviation
        # (ddof=0) of the per-sample absolute errors. The original called an
        # undefined `std(...)`, which raised NameError.
        errores_abs = np.abs(y_val.values - y_pred)
        desviacion = np.std(errores_abs)

        print(f' MAE de validación final: {mae:.4f} +- {desviacion:.4f}')

        # LaTeX-ready string; the backslash is escaped so '\pm' is emitted
        # literally without an invalid-escape warning.
        print(f'${mae:.2f} \\pm {desviacion:.2f}$')

        resultados = pd.DataFrame({
                    'smiles': smiles_val.values,
                    'real': y_val.values,
                    'predicho': y_pred})
        resultados.to_csv(f'Resultados/val_final/{metodo}_{solvent_representation}.csv', index=False)


Representación disolvente: morgan_fingerprint

  Método: xgb
 MAE de validación final: 56.8654

  Método: knn
 MAE de validación final: 82.2623

Representación disolvente: chemfluor

  Método: xgb
 MAE de validación final: 59.9798

  Método: knn
 MAE de validación final: 81.8562

Representación disolvente: minnesota_descriptors

  Método: xgb
 MAE de validación final: 63.2755

  Método: knn
 MAE de validación final: 82.3758
