In [None]:
### CREATE PIPELINE TO PROCESS FEATURES E

In [12]:
import os
import numpy as np
import pandas as pd
import joblib

from feature_engineering import create_molecular_pipeline
from transformations import LogTargetTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

from main_pipeline import create_molecular_prediction_pipeline

In [None]:
class MolecularPredictor:
    """
    Class for training and making predictions with molecular property models
    """
    
    def __init__(self, datasets_config, output_dir="models", preprocessing_options=None):
        """
        Initialize the MolecularPredictor
        
        Parameters:
        -----------
        datasets_config : dict
            Configuration for datasets (if None, uses default configs)
        output_dir : str
            Directory to save models and results
        preprocessing_options : dict or None
            Additional preprocessing options (if None, uses defaults)
        """
        self.datasets_config = datasets_config
        # Default preprocessing options
        self.preprocessing_options = {
            'drop_missing_target': True,
            'drop_missing_smiles': True,
            'drop_invalid_smiles': True,
            'convert_target_to_numeric': True
        }

        # Update with custom options if provided
        if preprocessing_options is not None:
            self.preprocessing_options.update(preprocessing_options)
            
        self.output_dir = output_dir
        self.models = {}
        self.results = {}
        
        # Create output directory if it doesn't exist
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
    
    def _preprocess_data(self, data, smiles_col, target_col, verbose=1):
        """
        Preprocess the dataset by cleaning target variable and handling missing values
        
        Parameters:
        -----------
        data : pandas.DataFrame
            The dataset to preprocess
        smiles_col : str
            Name of the SMILES column
        target_col : str
            Name of the target column
        verbose : int
            Verbosity level
            
        Returns:
        --------
        pandas.DataFrame
            Preprocessed dataset
        """
        original_row_count = len(data)
        
        # Make a copy to avoid modifying the original
        data = data.copy()
        
        # Check for missing SMILES
        missing_smiles = data[smiles_col].isna().sum()
        if missing_smiles > 0:
            if verbose >= 1:
                print(f"Found {missing_smiles} rows with missing SMILES. Dropping these rows.")
            data = data.dropna(subset=[smiles_col])
        
        # Convert SMILES to string type if not already
        data[smiles_col] = data[smiles_col].astype(str)
        
        # Check for invalid SMILES (completely empty or just whitespace)
        invalid_smiles = data[data[smiles_col].str.strip() == ""].shape[0]
        if invalid_smiles > 0:
            if verbose >= 1:
                print(f"Found {invalid_smiles} rows with empty SMILES. Dropping these rows.")
            data = data[data[smiles_col].str.strip() != ""]
        
        # Handle target variable
        if target_col in data.columns:
            # Try to convert target to numeric, coercing errors to NaN
            original_target_type = data[target_col].dtype
            data[target_col] = pd.to_numeric(data[target_col], errors='coerce')
            
            # Count how many conversions failed (became NaN)
            conversion_failures = data[target_col].isna().sum()
            if conversion_failures > 0 and verbose >= 1:
                print(f"Converted target from {original_target_type} to numeric. "
                      f"{conversion_failures} values couldn't be converted and became NaN.")
            
            # Drop rows with missing target values
            missing_target = data[target_col].isna().sum()
            if missing_target > 0:
                if verbose >= 1:
                    print(f"Dropping {missing_target} rows with missing target values.")
                data = data.dropna(subset=[target_col])
        
        # Report total rows removed
        final_row_count = len(data)
        rows_removed = original_row_count - final_row_count
        if rows_removed > 0 and verbose >= 1:
            print(f"Preprocessing removed {rows_removed} rows ({rows_removed/original_row_count:.1%}). "
                  f"{final_row_count} rows remaining.")
        
        return data

    def fit(self, dataset_name=None, custom_param_grid=None, test_size=0.2, cv=5, verbose=1):
        """
        Train models for the specified dataset(s) with hyperparameter optimization
        
        Parameters:
        -----------
        dataset_name : str or None
            Name of the dataset to train on (if None, trains on all datasets)
        custom_param_grid : dict
            Custom parameter grid for grid search (if None, uses default grid)
        test_size : float
            Proportion of data to use for testing
        cv : int
            Number of cross-validation folds
        verbose : int
            Verbosity level (0-3)
        
        Returns:
        --------
        self : MolecularPredictor
            Returns self for method chaining
        """
        # Default parameter grid if none provided
        if custom_param_grid is None:
            param_grid = {
                # Feature generation parameters
                'features__features__fingerprint_pipe__fingerprints__radius': [2, 3],
                'features__features__fingerprint_pipe__fingerprints__n_bits': [1024, 2048],
                
                # Model parameters
                'regressor__regressor__n_estimators': [100, 200, 300, 400, 500],
                'regressor__regressor__max_depth': [i for i in np.arange(5, 51, 2)] + [None],
            }
        else:
            param_grid = custom_param_grid

        # Determine which datasets to train on
        datasets_to_train = [dataset_name] if dataset_name else self.datasets_config.keys()

        for ds_name in datasets_to_train:
            if ds_name not in self.datasets_config:
                raise ValueError(f"Unknown dataset: {ds_name}")
            
            if verbose >= 1:
                print(f"\nTraining model for dataset: {ds_name}")
            
            # Load dataset
            config = self.datasets_config[ds_name]
            data = pd.read_csv(config["file"])
            smiles_col = config["smiles_col"]
            target_col = config["target_col"]
            
            # Preprocess the data
            data = self._preprocess_data(data, smiles_col, target_col, verbose)
            
            # Split data
            X_train, X_test, y_train, y_test = train_test_split(
                data[smiles_col], 
                data[target_col],
                test_size=test_size,
                random_state=42
            )
            
            # Create molecular prediction pipeline
            pipeline = create_molecular_prediction_pipeline(
                regressor=RandomForestRegressor(random_state=42, n_jobs=-1),
                log_transform_target=True,
                n_jobs=-1
            )
            
            # Grid search with cross-validation
            if verbose >= 1:
                print(f"Running grid search with {cv}-fold cross-validation...")
                
            grid_search = GridSearchCV(
                pipeline,
                param_grid=param_grid,
                cv=cv,
                scoring='neg_root_mean_squared_error',
                n_jobs=-1,
                verbose=max(0, verbose-1)
            )
            
            # Train model
            grid_search.fit(X_train, y_train)

            # Get best model
            best_model = grid_search.best_estimator_
            
            # Evaluate on test set
            y_pred = best_model.predict(X_test)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            r2 = r2_score(y_test, y_pred)
            
            # Store results
            self.results[ds_name] = {
                'best_params': grid_search.best_params_,
                'test_rmse': rmse,
                'test_r2': r2,
                'cv_results': grid_search.cv_results_
            }

            # Store model
            self.models[ds_name] = best_model
            
            # Save model to disk
            model_path = os.path.join(self.output_dir, config["model_file"])
            joblib.dump(best_model, model_path)

            # Save detailed results
            results_path = os.path.join(self.output_dir, f"{ds_name}_results.txt")
            with open(results_path, 'w') as f:
                f.write(f"Dataset: {ds_name}\n")
                f.write(f"Best Parameters: {grid_search.best_params_}\n")
                f.write(f"Test RMSE: {rmse:.4f}\n")
                f.write(f"Test R²: {r2:.4f}\n\n")
                f.write("Top Grid Search CV Results:\n")
                
                # Get indices sorted by score (negative because higher is better for neg_rmse)
                sorted_indices = np.argsort(-grid_search.cv_results_['mean_test_score'])
                for i in sorted_indices[:10]:  # Top 10 configurations
                    mean = -grid_search.cv_results_['mean_test_score'][i]  # Convert back to RMSE
                    std = grid_search.cv_results_['std_test_score'][i]
                    f.write(f"Configuration: Mean RMSE={mean:.4f}, Std={std:.4f}\n")
                    for param, value in grid_search.cv_results_['params'][i].items():
                        f.write(f"  {param}: {value}\n")
                    f.write("\n")
            
            if verbose >= 1:
                print(f"Model for {ds_name} trained and saved.")
                print(f"Test RMSE: {rmse:.4f}, Test R²: {r2:.4f}")
                print(f"Model saved to {model_path}")
        
        # Print summary if training multiple datasets
        if verbose >= 1 and len(datasets_to_train) > 1:
            print("\nTraining Summary:")
            for ds_name in datasets_to_train:
                result = self.results[ds_name]
                print(f"{ds_name}: RMSE={result['test_rmse']:.4f}, R²={result['test_r2']:.4f}")
        
        return self

    def predict(self, smiles, dataset_name):
        """
        Make predictions using a trained model
        
        Parameters:
        -----------
        smiles : str or list
            SMILES string(s) to predict
        dataset_name : str
            Name of the dataset/model to use for prediction
        
        Returns:
        --------
        numpy.ndarray
            Predicted values
        """
        if dataset_name not in self.datasets_config:
            raise ValueError(f"Unknown dataset: {dataset_name}")
        
        # Convert single SMILES to list if needed
        if isinstance(smiles, str):
            smiles = [smiles]
        
        # Clean SMILES input - handle None values or empty strings
        cleaned_smiles = []
        valid_indices = []
        
        for i, s in enumerate(smiles):
            if s is not None and str(s).strip():
                cleaned_smiles.append(str(s).strip())
                valid_indices.append(i)
        
        if len(cleaned_smiles) == 0:
            return np.array([])
            
        # Check if model is already loaded
        if dataset_name in self.models:
            model = self.models[dataset_name]
        else:
            # Try to load model from disk
            model_path = os.path.join(self.output_dir, self.datasets_config[dataset_name]["model_file"])
            if not os.path.exists(model_path):
                raise FileNotFoundError(f"Model file not found: {model_path}. Please train the model first.")
            
            model = joblib.load(model_path)
            self.models[dataset_name] = model

        # Make predictions for valid SMILES
        predictions = model.predict(cleaned_smiles)
        
        # If any SMILES were invalid, create a result array with NaNs for invalid entries
        if len(valid_indices) < len(smiles):
            full_predictions = np.full(len(smiles), np.nan)
            for i, valid_idx in enumerate(valid_indices):
                full_predictions[valid_idx] = predictions[i]
            return full_predictions
        else:
            return predictions
    
    def predict_to_csv(self, smiles, dataset_name, output_file=None):
        """
        Make predictions and save to CSV file
        
        Parameters:
        -----------
        smiles : str or list
            SMILES string(s) to predict
        dataset_name : str
            Name of the dataset/model to use for prediction
        output_file : str or None
            Output file path (if None, generates a default name)
        
        Returns:
        --------
        pandas.DataFrame
            DataFrame with SMILES and predictions
        """
        # Get predictions
        predictions = self.predict(smiles, dataset_name)
        
        # Ensure smiles is a list
        if isinstance(smiles, str):
            smiles = [smiles]
        
        # Create results dataframe
        results_df = pd.DataFrame({
            'SMILES': smiles,
            'Prediction': predictions
        })
        
        # Save to CSV if output file provided
        if output_file is None:
            output_file = os.path.join(self.output_dir, f"predictions_{dataset_name}.csv")
            
        results_df.to_csv(output_file, index=False)
        print(f"Predictions saved to {output_file}")
        
        return results_df
    
    def get_results(self, dataset_name=None):
        """
        Get training results
        
        Parameters:
        -----------
        dataset_name : str or None
            Name of the dataset (if None, returns all results)
        
        Returns:
        --------
        dict
            Dictionary of results
        """
        if dataset_name is None:
            return self.results
        elif dataset_name in self.results:
            return self.results[dataset_name]
        else:
            raise ValueError(f"No results found for dataset: {dataset_name}")


In [28]:
# Create a custom configuration for your datasets
datasets_config = {
    "dataset1": {
        "file": "training_data/GCGR.csv",
        "smiles_col": "Ligand SMILES",
        "target_col": "EC50 (nM)",
        "model_file": "model_gcgr"
    },
    "dataset2": {
        "file": "training_data/GIP.csv", 
        "smiles_col": "Ligand SMILES",
        "target_col": "EC50 (nM)",
        "model_file": "model_gip"
    },
    "dataset3": {
        "file": "training_data/GLP-1R.csv",
        "smiles_col": "Ligand SMILES", 
        "target_col": "EC50 (nM)",
        "model_file": "model_glp1r"
    }
}

In [29]:
predictor = MolecularPredictor(
        datasets_config=datasets_config,
        output_dir="trained_models",
        preprocessing_options={
            'drop_missing_target': True,  #  drop rows with missing target
            'drop_missing_smiles': True,  #  drop rows with missing SMILES
            'drop_invalid_smiles': True,  # drop rows with empty/invalid SMILES
            'convert_target_to_numeric': True  # convert target to numeric
        }
    )

In [30]:
predictor.fit(dataset_name= 'dataset1',verbose=2)


Training model for dataset: dataset1
Converted target from object to numeric. 1910 values couldn't be converted and became NaN.
Dropping 1910 rows with missing target values.
Preprocessing removed 1910 rows (91.4%). 179 rows remaining.
Running grid search with 5-fold cross-validation...
Fitting 5 folds for each of 480 candidates, totalling 2400 fits


PicklingError: Could not pickle the task to send it to the workers.