# Препроцессинг

In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
import optuna
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.manifold import TSNE
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.pipeline import Pipeline
from lightgbm import LGBMRegressor
import umap
from scipy.stats import randint, uniform, loguniform
from sklearn.svm import SVR
from sklearn.inspection import permutation_importance

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def convert_to_number(val):
    if pd.isna(val):
        return np.nan
    val = str(val).replace(' ', '')  # удаляем пробелы
    # обработка значений '<число'
    if val.startswith('<'):
        num = re.findall(r'<(\d+\.?\d*)', val)
        return float(num[0]) if num else np.nan
    # обработка значений с ±
    elif '±' in val:
        nums = re.findall(r'([\d\.]+)±([\d\.]+)', val)
        if nums:
            main, uncertainty = nums[0]
            return float(main)  # берём только среднее (первое число)
        else:
            return np.nan
    # обработка значений с '/'
    elif '/' in val:
        nums = re.findall(r'([\d\.]+)/([\d\.]+)', val)
        if nums:
            num1, num2 = nums[0]
            return (float(num1) + float(num2)) / 2
        else:
            return np.nan
    # пробуем просто преобразовать в число
    else:
        try:
            return float(val)
        except:
            return np.nan

In [3]:
# Load dataset
df = pd.read_csv("datasets/for_regr_descriptors_full.csv")
df['raw_efficiency'] = df['raw_efficiency'].apply(convert_to_number)

# Load embeddings
blomap_embeddings = np.load("datasets/blomap_regr.npy")
fingerprints_embeddings = np.load("datasets/fingerprints_regr.npy")
protbert_embeddings = np.load("datasets/protbert_regr.npy")

In [4]:
import re
import pandas as pd
import numpy as np
from rdkit import Chem
from collections import Counter
from typing import Dict, Set

from rdkit.Chem import Draw
from IPython.display import display

def categorize_sequences(
    df, 
    sequence_col='sequence', 
    output_col='sequence_category', 
    patterns=None
):
    """
    Categorizes peptide sequences based on regex patterns.
    
    Parameters:
        df (pd.DataFrame): Input DataFrame
        sequence_col (str): Column containing peptide sequences
        output_col (str): Output column for categories
        patterns (dict): Dictionary of {category: regex pattern}
    
    Returns:
        pd.DataFrame: Modified DataFrame with category column
    """
    if patterns is None:
        patterns = {
            "Standard": re.compile(r"^[A-Z]+$"),
            "Enantiomer Mix": re.compile(r"^[A-Za-z]+$"),
            "Chemical Formula": re.compile(r"^[A-Za-z0-9]+-[a-z]+$"),
            "Special Structure": re.compile(r"^[A-Za-z0-9\-]+-BLGTYTQDFNXFHTFPQTAIGVGAP$"),
            "Repeating Segments": re.compile(r"^[A-Za-z0-9\(\)\-]+-NH2$"),
            "Biotinylated": re.compile(r"\(biotin\)-[a-zA-Z0-9]+"),
            "N-terminal Acylated": re.compile(r"^Ac-[A-Z]+.*$"),
            "Cyclic": re.compile(r"^Cyclo\([A-Za-z0-9]+.*\)$"),
            "Amidated": re.compile(r"^.*-NH2$"),
            "Peptide with Substitutions": re.compile(r"[A-Za-z0-9\(\)\-\[\]]+"),
            "Uptake with Stearyl": re.compile(r".*(Stearyl|Myristoyl|Lauroyl)-[A-Za-z0-9]+"),
            "Polypeptide with Repeats": re.compile(r"poly-[A-Za-z0-9]+"),
            "Hexahistidine Tagged": re.compile(r"^(His6|HHHHHH)+.*"),
            "Unknown": re.compile(r".*")
        }

    def _categorizer(sequence):
        for category, pattern in patterns.items():
            if pattern.match(str(sequence)):
                return category
        return "Unknown"

    df[output_col] = df[sequence_col].apply(_categorizer)
    return df

def standardize_sequence(sequence):
    """
    Helper function to extract features from peptide sequences.
    Returns tuple of (cleaned_sequence, features...).
    """
    features = {
        'nh3_tail': False,
        'po3_pos': False,
        'biotinylated': False,
        'acylated_n_terminal': False,
        'cyclic': False,
        'amidated': False,
        'stearyl_uptake': False,
        'hexahistidine_tagged': False,
        'modifications': []
    }

    if not isinstance(sequence, str):
        return (np.nan, *features.values())

    # Feature detection
    features['nh3_tail'] = bool(re.search(r"-NH2$", sequence))
    features['po3_pos'] = bool(re.search(r"\(PO3\)", sequence))
    features['biotinylated'] = bool(re.search(r"\(biotin\)", sequence))
    features['acylated_n_terminal'] = bool(re.search(r"^Ac-", sequence))
    features['cyclic'] = bool(re.search(r"^Cyclo\(", sequence))
    features['amidated'] = bool(re.search(r"-NH2$", sequence))
    features['stearyl_uptake'] = bool(re.search(r"(Stearyl|Myristoyl|Lauroyl)", sequence))
    features['hexahistidine_tagged'] = bool(re.search(r"^(His6|HHHHHH)+", sequence))

    # Sequence cleaning
    cleaned = re.sub(r"[^A-Za-z]", "", sequence)
    cleaned_standard = re.sub(r"[^A-Z]", "", cleaned)
    
    if cleaned_standard.isalpha() and cleaned_standard.isupper():
        features['modifications'] = [
            (char.upper(), i+1) 
            for i, char in enumerate(cleaned) 
            if char.islower()
        ]
        return (cleaned_standard, *features.values())
    
    return (np.nan, *features.values())

def add_sequence_features(
    df,
    sequence_col='sequence',
    output_cols=[
        'standard_sequence', 'nh3_tail', 'po3_pos', 'biotinylated',
        'acylated_n_terminal', 'cyclic', 'amidated', 'stearyl_uptake',
        'hexahistidine_tagged', 'modifications'
    ]
):
    """
    Adds standardized sequence and feature columns to DataFrame.
    """
    df[output_cols] = df[sequence_col].apply(
        lambda x: pd.Series(standardize_sequence(x))
    )
    return df

def validate_sequences(df, sequence_col='standard_sequence'):
    """
    Filters valid sequences using standard amino acid check.
    """
    def _is_valid(seq):
        if not isinstance(seq, str):
            return False
        return bool(re.match(r"^[ACDEFGHIKLMNPQRSTVWY]+$", seq))
    
    df[sequence_col] = df[sequence_col].apply(
        lambda s: s if _is_valid(s) else np.nan
    )
    return df

class PeptideToSmilesConverter:
    """Converts peptide sequences to SMILES with modification handling."""
    
    def __init__(self):
        self._init_maps()
        self.unrecognized_aa_counter = Counter()
        self.bad_sequences: Set[str] = set()
        
    def _init_maps(self):
        """Initialize SMILES mappings for modifications and amino acids."""
        self.modification_map = {
            "Ac-": "CC(=O)",
            "(Acp)": "CC(=O)",  
            "(biotin)": "C1[C@H]2SC(=S)N[C@H]1CCCCC(=O)",  # Correct biotin structure
            "-NH2": "N",
            "Stearyl": "CCCCCCCCCCCCCCCCCC(=O)",  # 18 carbons (C18)
            "Myristoyl": "CCCCCCCCCCCCCC(=O)",    # Correct (C14)
            "Lauroyl": "CCCCCCCCCCCC(=O)",        # Correct (C12)
            "Nspe": "N[C@H](C(C)O)C(=O)",         # Assume correct stereochemistry
            "Nbtg": "N[C@H](C(C)(C)C)C(=O)",      # N-tert-butylglycine
            "Ac": "CC(=O)",
            "Et": "OCC",                          # Ethyl ester (O-linked)
            "Npm": "N1[C@H](C(C)C)C(=O)",         # Verify context if needed
            "Nssb": "N1[C@H](C(C)CC)C(=O)",       # Verify context
            "Mpa": "SCCC(=O)",                    # Mercaptopropionyl (HS-CH2CH2CO-)
            "Cou": "C1=CC(=O)OC2=CC=CC=C12",      # Coumarin (corrected)
            "Xr": "N[C@H](C(C)C)C(=O)",           # D-valine (matches 'v' in aa_map)
            "NII": "C(C)C",                       # N-isopropyl (assumed)
            "PIC": "C1=CC=NC(=C1)CO",             # 4-picolyl (example)
            "IC": "NC1=NC(=O)NC=N1"               # Isocytosine
        }
        
        self.aa_map = {
            # Standard L/D amino acids
            'A': 'N[C@@H](C)C(=O)', 'a': 'N[C@H](C)C(=O)',
            'C': 'N[C@@H](CS)C(=O)', 'c': 'N[C@H](CS)C(=O)',
            'D': 'N[C@@H](CC(=O)O)C(=O)', 'd': 'N[C@H](CC(=O)O)C(=O)',
            'E': 'N[C@@H](CCC(=O)O)C(=O)', 'e': 'N[C@H](CCC(=O)O)C(=O)',
            'F': 'N[C@@H](CC1=CC=CC=C1)C(=O)', 'f': 'N[C@H](CC1=CC=CC=C1)C(=O)',
            'G': 'NCC(=O)', 'g': 'NCC(=O)',
            'H': 'N[C@@H](CC1=CNC=N1)C(=O)', 'h': 'N[C@H](CC1=CNC=N1)C(=O)',
            'I': 'N[C@@H](C(C)CC)C(=O)', 'i': 'N[C@H](C(C)CC)C(=O)',
            'K': 'N[C@@H](CCCCN)C(=O)', 'k': 'N[C@H](CCCCN)C(=O)',
            'L': 'N[C@@H](CC(C)C)C(=O)', 'l': 'N[C@H](CC(C)C)C(=O)',
            'M': 'N[C@@H](CCSC)C(=O)', 'm': 'N[C@H](CCSC)C(=O)',
            'N': 'N[C@@H](CC(=O)N)C(=O)', 'n': 'N[C@H](CC(=O)N)C(=O)',
            'P': 'N1[C@@H](CCC1)C(=O)', 'p': 'N1[C@H](CCC1)C(=O)',
            'Q': 'N[C@@H](CCC(=O)N)C(=O)', 'q': 'N[C@H](CCC(=O)N)C(=O)',
            'R': 'N[C@@H](CCCNC(=N)N)C(=O)', 'r': 'N[C@H](CCCNC(=N)N)C(=O)',
            'S': 'N[C@@H](CO)C(=O)', 's': 'N[C@H](CO)C(=O)',
            'T': 'N[C@@H](C(O)C)C(=O)', 't': 'N[C@H](C(O)C)C(=O)',
            'V': 'N[C@@H](C(C)C)C(=O)', 'v': 'N[C@H](C(C)C)C(=O)',
            'W': 'N[C@@H](CC1=CNC2=CC=CC=C12)C(=O)', 'w': 'N[C@H](CC1=CNC2=CC=CC=C12)C(=O)',
            'Y': 'N[C@@H](CC1=CC=C(O)C=C1)C(=O)', 'y': 'N[C@H](CC1=CC=C(O)C=C1)C(=O)',
            # Special cases
            'X': '*', '?': '*',
            'O': 'N[C@@H](CCCCN)C(=O)',  # Ornithine (corrected side chain)
            'Aib': 'NC(C)(C)C(=O)',      # Aib without chiral center
            'B': 'N[C@@H](CC(=O)N)C(=O)',  # Asn (standard 'B' ambiguity resolved to Asn)
            # Removed invalid 'b' entry to avoid conflicts
        }
        
    @staticmethod
    def expand_repeats(sequence: str) -> str:
        """Expand notation like R8 to RRRRRRRR."""
        # Use a substitution function to replace each occurrence of <letter(s)><digit>
        def repl(match):
            token = match.group(1)
            count = int(match.group(2))
            return token * count
        return re.sub(r'([A-Za-z]+)(\d+)', repl, sequence)
    
    def sequence_to_smiles(self, sequence: str) -> str:
        """Convert a raw peptide sequence to SMILES."""
        try:
            # First try RDKit's built-in conversion.
            mol = Chem.MolFromSequence(sequence)
            if mol:
                return Chem.MolToSmiles(mol)
        except Exception:
            pass
        
        # Fallback to the custom conversion.
        return self._custom_sequence_conversion(sequence)
    
    def _custom_sequence_conversion(self, sequence: str) -> str:
        """Handle non-standard sequences with modifications."""
        smiles_parts = []
        seq = self.expand_repeats(sequence)
        
        # Process modifications first.
        for mod, smi in self.modification_map.items():
            if mod in seq:
                seq = seq.replace(mod, "")
                smiles_parts.append(smi)
        
        # Remove any non-letter characters (such as dashes).
        seq = re.sub(r'[^A-Za-z]', '', seq)
        
        # Process amino acids one-by-one.
        for aa in seq:
            if aa in self.aa_map:
                smiles_parts.append(self.aa_map[aa])
            else:
                self.unrecognized_aa_counter[aa] += 1
                self.bad_sequences.add(sequence)
        
        final_smiles = "".join(smiles_parts)
        try:
            mol = Chem.MolFromSmiles(final_smiles)
            final_smiles = Chem.MolToSmiles(mol) if mol else None
            if not final_smiles:
                return None
            final_smiles = Chem.CanonSmiles(final_smiles, useChiral=True)
            return final_smiles
        except Exception:
            return None
    
    def process_dataframe(
        self, 
        df: pd.DataFrame, 
        sequence_col: str = 'sequence',
        output_col: str = 'smiles_sequence'
    ) -> pd.DataFrame:
        """Process a dataframe with peptide sequences."""
        df[output_col] = df[sequence_col].apply(self.sequence_to_smiles)
        return df
    
    def get_bad_sequences_df(self) -> pd.DataFrame:
        """Get a dataframe of problematic sequences."""
        return pd.DataFrame({
            'sequence': list(self.bad_sequences),
            'reason': 'Contains unrecognized components'
        })
    
    def report_validity(self, df: pd.DataFrame, smiles_col: str = 'smiles_sequence') -> Dict:
        """Generate a validity report."""
        valid = df[smiles_col].notna()
        return {
            'valid_percentage': valid.mean() * 100,
            'invalid_count': len(df) - valid.sum(),
            'unrecognized_aa': dict(self.unrecognized_aa_counter)
        }

def convert_sequences(df, sequence_col='sequence', smiles_col='smiles_sequence'):
    """
    Convert peptide sequences in a DataFrame to SMILES and return the modified DataFrame.
    Designed to be used with .pipe().
    """
    converter = PeptideToSmilesConverter()
    df[smiles_col] = df[sequence_col].apply(converter.sequence_to_smiles)
    return df

import re

import pandas as pd

from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.SeqUtils import molecular_weight

pattern = re.compile(r"^[ACDEFGHIKLMNPQRSTVWY]+$")

def is_valid_amino_acid_sequence(sequence):

    if isinstance(sequence, str):
        return bool(pattern.match(sequence))
    
    return False

def calculate_protein_descriptors(sequence):

    if not is_valid_amino_acid_sequence(sequence):
        return pd.Series({})

    analysis = ProteinAnalysis(sequence)

    descriptors = {
        'molecular_weight': analysis.molecular_weight(),
        'seq_length': len(sequence),
        'aromaticity': analysis.aromaticity(),
        'instability_index': analysis.instability_index(),
        'isoelectric_point': analysis.isoelectric_point(),
        'helix_fraction': analysis.secondary_structure_fraction()[0],
        'turn_fraction': analysis.secondary_structure_fraction()[1],
        'sheet_fraction': analysis.secondary_structure_fraction()[2],
        'molar_extinction_coefficient_reduced': analysis.molar_extinction_coefficient()[0],
        'molar_extinction_coefficient_oxidized': analysis.molar_extinction_coefficient()[1],
        'gravy': analysis.gravy()
    }
    
    return pd.Series(descriptors)

def add_descriptors_features(
        df: pd.DataFrame,
        sequence_column: str = 'standard_sequence',
):
    
    """
    Adds protein descriptors features to the DataFrame.

    Parameters:
    df (pd.DataFrame): The input DataFrame containing sequences.
    sequence_column (str): The name of the column containing amino acid sequences.

    Returns:
    pd.DataFrame: The DataFrame with added protein descriptors features.
    """
    
    descriptors_df = df[sequence_column].apply(calculate_protein_descriptors)
    
    # Concatenate the original DataFrame with the descriptors DataFrame
    df_with_descriptors = pd.concat([df, descriptors_df], axis=1)
    
    return df_with_descriptors



In [5]:
peptides_df = pd.read_csv("datasets/for_regr.csv", index_col=0)
peptides_df

assert peptides_df.sequence.nunique() == len(peptides_df)
assert 'Unnamed: 0' not in peptides_df.columns

peptides_df = peptides_df.drop(columns=[
    'sequence_category',
    'standard_sequence',
    'nh3_tail',
    'po3_pos',
    'biotinylated',
    'acylated_n_terminal',
    'cyclic',
    'amidated',
    'stearyl_uptake',
    'hexahistidine_tagged',
    'modifications',
    'smiles_sequence',
    'fp_path'
], errors='ignore')

clean_peptides_df = (peptides_df.copy()
    .pipe(categorize_sequences)
    .pipe(add_sequence_features)
    .pipe(validate_sequences)
)

In [6]:

descriptors_df = (
    clean_peptides_df.copy()
    .pipe(lambda df: df.dropna(subset=['standard_sequence']))
    .pipe(add_descriptors_features)
)


assert descriptors_df['gravy'].isna().sum() == 0

In [8]:
from typing import List, Tuple, Optional

import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, roc_auc_score

def preprocess_descriptors(
    df: pd.DataFrame, 
    target_column: str = 'raw_efficiency', 
    columns_to_use: Optional[List[str]] = None, 
    test_size: float = 0.2, 
    random_state: int = 42, 
    stratify: bool = True
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """
    Preprocesses the data by selecting specified columns, encoding categorical variables,
    splitting into train and test sets, and asserting no missing values.

    Parameters:
    df (pd.DataFrame): The input dataframe.
    target_column (str): The name of the target column.
    columns_to_use (list): List of columns to use. If None, default columns will be used.
    test_size (float): Proportion of the dataset to include in the test split.
    random_state (int): Random seed for reproducibility.
    stratify (bool): Whether to stratify the split based on the target column.

    Returns:
    X_train (pd.DataFrame): Training feature matrix.
    X_test (pd.DataFrame): Testing feature matrix.
    y_train (pd.Series): Training target vector.
    y_test (pd.Series): Testing target vector.
    """
    if columns_to_use is None:
        columns_to_use = [
            'seq_length', 'molecular_weight', 'nh3_tail', 'po3_pos', 
            'biotinylated', 'acylated_n_terminal', 'cyclic', 'amidated', 
            'stearyl_uptake', 'hexahistidine_tagged', 'aromaticity', 
            'instability_index', 'isoelectric_point', 'helix_fraction', 
            'turn_fraction', 'sheet_fraction', 'molar_extinction_coefficient_reduced', 
            'molar_extinction_coefficient_oxidized', 'gravy', 'uptake_type'
        ]
    
    # Select specified columns
    df_clean = df[columns_to_use + [target_column]]
    
    # Encode boolean columns
    bool_cols = df_clean.columns[df_clean.dtypes == 'bool'].tolist()
    for col in bool_cols:
        df_clean[col] = df_clean[col].astype(int)
    
    # Encode target variable
#    label_encoder = LabelEncoder()
#    df_clean[target_column] = label_encoder.fit_transform(df_clean[target_column])
    
    # Define features and target
#    X = df_clean.drop(target_column, axis=1)
#    y = df_clean[target_column]
    
    # Assert no missing values
#    assert X.isnull().sum().sum() == 0, "There are missing values in the feature matrix"
#    assert y.isnull().sum() == 0, "There are missing values in the target vector"
    
    # Split the data
#    stratify_param = y if stratify else None
#    X_train, X_test, y_train, y_test = train_test_split(
#        X, y, test_size=test_size, random_state=random_state, stratify=stratify_param
#    )

    # Print the shapes of the resulting dataframes
#    print(f"X_train shape: {X_train.shape}")
#    print(f"X_test shape: {X_test.shape}")
#    print(f"y_train shape: {y_train.shape}")
#    print(f"y_test shape: {y_test.shape}")
    
    return df_clean

In [9]:
df_clean = preprocess_descriptors(descriptors_df, random_state=42)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean[col] = df_clean[col].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean[col] = df_clean[col].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean[col] = df_clean[col].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .l

In [27]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 877 entries, 2721 to 2715
Data columns (total 21 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   seq_length                             877 non-null    float64
 1   molecular_weight                       877 non-null    float64
 2   nh3_tail                               877 non-null    int32  
 3   po3_pos                                877 non-null    int32  
 4   biotinylated                           877 non-null    int32  
 5   acylated_n_terminal                    877 non-null    int32  
 6   cyclic                                 877 non-null    int32  
 7   amidated                               877 non-null    int32  
 8   stearyl_uptake                         877 non-null    int32  
 9   hexahistidine_tagged                   877 non-null    int32  
 10  aromaticity                            877 non-null    float64
 11  instabi

In [26]:
classification_descriptors = df_clean.drop(columns=['raw_efficiency', 'uptake_type'])

In [28]:
# Select numerical features
fp_path_index = df.columns.get_loc('fp_path')
selected_features = ['raw_efficiency', 'uptake_type'] + list(df.columns[fp_path_index + 1:])

X_numerical = df[selected_features].copy()

In [29]:
# One-hot encoding for cell_line
if "cell_line" in df.columns:
    enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    cell_line_encoded = enc.fit_transform(df[['cell_line']])
    cell_line_feature_names = enc.get_feature_names_out(["cell_line"])
    X_cell_line = pd.DataFrame(cell_line_encoded, columns=cell_line_feature_names)

In [30]:
# Функция для удаления выбросов методом IQR
def remove_outliers(df, target_column):
    """
    Удаляет выбросы из числовых колонок методом межквартильного размаха (IQR).

    Аргументы:
    df — pandas DataFrame с числовыми признаками.
    target_column — название столбца с таргетом (raw_efficiency).

    Возвращает:
    Очищенный DataFrame без выбросов.
    """
    df_clean = df.copy()

    Q1 = df_clean['raw_efficiency'].quantile(0.25)
    Q3 = df_clean['raw_efficiency'].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    df_clean = df_clean[(df_clean['raw_efficiency'] >= lower_bound) & (df_clean['raw_efficiency'] <= upper_bound)]

    return df_clean

In [31]:
X_numerical = X_numerical.reset_index(drop=True)

In [32]:
classification_descriptors = classification_descriptors.reset_index(drop=True)

In [33]:
X_numerical

Unnamed: 0,raw_efficiency,uptake_type,MW,GRAVY,pI,Charge,Charge_Density,Aromaticity,Flexibility,Aliphatic_Index,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,750.0,Fluorescence intensity,3151.8362,-1.492308,11.608322,12.577710,0.483758,0.192308,1.021711,0.269231,...,0,0,0,0,0,0,0,0,12,0
1,1400.0,Fluorescence intensity,2785.1939,-1.272727,11.839377,5.637233,0.256238,0.136364,1.000826,0.272727,...,1,0,0,0,0,0,0,0,2,0
2,75.0,Relative Mean Fluorescence intensity (%),1439.6794,-3.710000,11.999968,6.608334,0.660833,0.000000,1.021405,0.100000,...,0,0,0,0,0,0,0,0,1,0
3,95.0,Relative Mean Fluorescence intensity (%),1439.6794,-3.710000,11.999968,6.608334,0.660833,0.000000,1.006857,0.100000,...,0,0,0,0,0,0,0,0,1,0
4,66.0,Relative Mean Fluorescence intensity (%),1439.6794,-3.710000,11.999968,6.608334,0.660833,0.000000,1.039310,0.100000,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
872,80.0,Cellular uptake (%),515.6103,-3.325000,11.999968,2.554897,0.638724,0.000000,,0.000000,...,0,0,0,0,0,0,0,0,2,0
873,424.0,Relative fluorescence (%),178.2095,1.050000,5.518123,-0.467050,-0.233525,0.000000,,0.000000,...,0,0,0,0,0,0,0,0,0,0
874,100.0,Relative cellular uptake (%),2600.6253,-3.215789,7.516013,0.216729,0.011407,0.000000,0.950467,0.000000,...,0,0,0,0,0,0,0,0,0,0
875,25.0,Mean Fluorescence intensity,3712.9716,-3.611538,11.999968,8.179901,0.314612,0.000000,0.967821,0.000000,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# Фильтруем датафрейм по столбцу 'uptake_type'
X_numerical_filtered = X_numerical[X_numerical['uptake_type'].isin(['Mean Fluorescence intensity', 'Fluorescence intensity'])].copy()
# X_numerical_filtered = X_numerical_filtered[X_numerical_filtered['raw_efficiency'] <= 50000]

# Удаляем выбросы из числовых признаков
X_numerical_filtered_no_outliers = remove_outliers(X_numerical_filtered, 'raw_efficiency')

# Проверяем размер до и после удаления выбросов
print(f"Размер данных ДО удаления выбросов: {X_numerical_filtered.shape}")
print(f"Размер данных ПОСЛЕ удаления выбросов: {X_numerical_filtered_no_outliers.shape}")

# Получаем индексы отфильтрованных молекул
filtered_indices = X_numerical_filtered_no_outliers.index

X_cell_line_filtered = X_cell_line.loc[filtered_indices]
classification_filtered = classification_descriptors.loc[filtered_indices]

# Фильтруем массивы эмбеддингов
blomap_embeddings_filtered = blomap_embeddings[filtered_indices]
fingerprints_embeddings_filtered = fingerprints_embeddings[filtered_indices]
protbert_embeddings_filtered = protbert_embeddings[filtered_indices]

Размер данных ДО удаления выбросов: (312, 225)
Размер данных ПОСЛЕ удаления выбросов: (268, 225)


In [35]:
target = 'raw_efficiency'
rdkit_descriptors = X_numerical_filtered_no_outliers.drop(columns=['uptake_type', target])

In [36]:
rdkit_descriptors_raw = X_numerical_filtered_no_outliers.drop(columns=['uptake_type', target])
rdkit_cols = rdkit_descriptors_raw.columns

imputer = SimpleImputer(strategy="mean")
rdkit_descriptors = pd.DataFrame(imputer.fit_transform(rdkit_descriptors), columns=rdkit_cols)

In [37]:
rdkit_descriptors = rdkit_descriptors.reset_index(drop=True)
X_cell_line_filtered = pd.DataFrame(X_cell_line_filtered).reset_index(drop=True)
classification_filtered = pd.DataFrame(classification_filtered).reset_index(drop=True)
blomap_embeddings_filtered = pd.DataFrame(blomap_embeddings_filtered).reset_index(drop=True)
blomap_embeddings_filtered.columns = [f'blomap_{i}' for i in blomap_embeddings_filtered.columns]
fingerprints_embeddings_filtered = pd.DataFrame(fingerprints_embeddings_filtered).reset_index(drop=True)
fingerprints_embeddings_filtered.columns = [f'fingerprint_{i}' for i in fingerprints_embeddings_filtered.columns]
protbert_embeddings_filtered = pd.DataFrame(protbert_embeddings_filtered).reset_index(drop=True)
protbert_embeddings_filtered.columns = [f'protbert_{i}' for i in protbert_embeddings_filtered.columns]

In [38]:
classification_filtered 

Unnamed: 0,seq_length,molecular_weight,nh3_tail,po3_pos,biotinylated,acylated_n_terminal,cyclic,amidated,stearyl_uptake,hexahistidine_tagged,aromaticity,instability_index,isoelectric_point,helix_fraction,turn_fraction,sheet_fraction,molar_extinction_coefficient_reduced,molar_extinction_coefficient_oxidized,gravy
0,26.0,3151.8362,0,0,0,0,0,0,0,0,0.192308,10.653846,11.608322,0.538462,0.230769,0.230769,0.0,0.0,-1.492308
1,22.0,2785.1939,0,0,0,0,0,0,0,0,0.136364,115.700455,11.839377,0.227273,0.181818,0.318182,1490.0,1490.0,-1.272727
2,17.0,2078.3658,0,0,0,0,0,0,0,0,0.000000,113.976471,11.824485,0.235294,0.176471,0.235294,0.0,0.0,-1.123529
3,30.0,3208.5313,0,0,0,0,0,0,0,0,0.100000,30.983333,4.139095,0.233333,0.366667,0.433333,2980.0,2980.0,-0.066667
4,26.0,2624.9800,0,0,0,0,0,0,0,0,0.000000,-2.926923,4.783081,0.576923,0.269231,0.346154,0.0,0.0,0.100000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263,3.0,424.5177,0,0,0,0,0,0,0,0,0.333333,6.666667,8.249713,0.000000,0.000000,0.333333,0.0,0.0,0.266667
264,3.0,390.5015,0,0,0,0,0,0,0,0,0.000000,6.666667,8.249713,0.000000,0.000000,0.333333,0.0,0.0,0.833333
265,1.0,89.0932,0,0,0,0,0,0,0,0,0.000000,0.000000,5.570017,1.000000,0.000000,0.000000,0.0,0.0,1.800000
266,26.0,3712.9716,1,0,0,0,0,1,0,0,0.000000,168.576923,11.999968,0.000000,0.038462,0.000000,0.0,0.0,-3.611538


In [39]:
list_of_dfs_named = {
    "rdkit": rdkit_descriptors,
    "blomap": blomap_embeddings_filtered,
    "fingerprints": fingerprints_embeddings_filtered,
    "protbert": protbert_embeddings_filtered,
}

combined_df_concat = pd.concat(list_of_dfs_named.values(), axis=1)

print("\nИнформация об объединенном датафрейме:")
combined_df_concat.info()


Информация об объединенном датафрейме:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 268 entries, 0 to 267
Columns: 4640 entries, MW to protbert_1023
dtypes: float32(1024), float64(1568), int64(2048)
memory usage: 8.4 MB


In [40]:
dfs_dict = {
    'RDKit_Descriptors': rdkit_descriptors,
    'BLOMAP_Embeddings': blomap_embeddings_filtered,
    'Fingerprints_Embeddings': fingerprints_embeddings_filtered,
    'ProtBERT_Embeddings': protbert_embeddings_filtered,
    'Combined_All_Features': combined_df_concat,
    'Classification_Descriptors': classification_filtered
}

In [89]:
X = pd.concat([rdkit_descriptors, X_cell_line_filtered], axis=1)
y = np.log1p(X_numerical_filtered_no_outliers[target]).reset_index(drop=True)

X.columns = pd.Index(X.columns).map(str)
X.columns = X.columns.to_series().reset_index(drop=True).duplicated().cumsum().astype(str) + '_' + X.columns

In [41]:
def apply_varThreshold (X, threshold=0): 

    # 1. Создаем объект VarianceThreshold
    selector = VarianceThreshold(threshold)

    # 2. Применяем селектор к числовым данным
    X_transformed_array = selector.fit_transform(X)
    selected_columns_mask = selector.get_support()
    selected_columns_names = X.columns[selected_columns_mask]

    # Создаем новый датафрейм только с выбранными столбцами
    X_filtered_var = pd.DataFrame(X_transformed_array, columns=selected_columns_names)

    print("\nДатафрейм после отсева по дисперсии (метод Scikit-learn):")
    print(X_filtered_var.head())
    print("\nОписание отфильтрованного датафрейма (метод Scikit-learn):")
    print(X_filtered_var.describe())

    return X_filtered_var

    '''# Проверка, какие столбцы были удалены sklearn методом
    all_numeric_columns = X.columns.tolist()
    removed_columns_sklearn = [col for col in all_numeric_columns if col not in selected_columns_names]
    print(f"\nСтолбцы, удаленные методом Scikit-learn (порог {selector.threshold}):")
    print(removed_columns_sklearn)'''

In [42]:
def apply_corr(X, threshold = 0.2):
    # Вычисляем корреляции каждого признака с целевой переменной
    correlations = X.apply(lambda col: col.corr(y))
    # Фильтруем признаки по модулю корреляции
    selected_features = correlations[correlations.abs() >= threshold].index
    # Оставляем только отобранные признаки в X
    X_corr = X[selected_features]
    # Выводим результат
    print("Оставленные признаки:", len(list(selected_features)), list(selected_features))

    return X_corr

In [43]:
def apply_scaler (train, test):
    train.columns = train.columns.astype(str)
    test.columns = test.columns.astype(str)

    scaler = MinMaxScaler(feature_range=(0, 1))
    train_scaled = pd.DataFrame(scaler.fit_transform(train))
    test_scaled = pd.DataFrame(scaler.transform(test))
    return train_scaled, test_scaled

In [44]:
def apply_pca (X_train, X_test, threshold=0.95):
    pca = PCA(n_components=threshold, svd_solver='full')
    train_transformed = pd.DataFrame(pca.fit_transform(X_train))
    test_transformed = pd.DataFrame(pca.transform(X_test))
    dispersion=round((sum(pca.explained_variance_ratio_)*100),2)
    number_of_comp = pca.n_components_
    print(number_of_comp,dispersion)

    return train_transformed, test_transformed

In [45]:
def apply_tsne(X_train, X_test, n_components=2, perplexity=30, n_iter=5000, random_state=42):
    # Объединяем данные
    X_combined = np.vstack((X_train, X_test))
    
    # Применяем t-SNE ко всему датасету
    tsne = TSNE(n_components=n_components, perplexity=perplexity, n_iter=n_iter, random_state=random_state)
    X_tsne = tsne.fit_transform(X_combined)
    
    # Разделяем обратно
    X_train_tsne = X_tsne[:len(X_train)]
    X_test_tsne = X_tsne[len(X_train):]
    
    return X_train_tsne, X_test_tsne

In [91]:
X_filtered_var = apply_varThreshold(X, 0)


Датафрейм после отсева по дисперсии (метод Scikit-learn):
    0_0        0_1  0_2  0_7  0_9      0_10        0_11       0_12      0_13  \
0  26.0  3151.8362  0.0  0.0  0.0  0.192308   10.653846  11.608322  0.538462   
1  22.0  2785.1939  0.0  0.0  0.0  0.136364  115.700455  11.839377  0.227273   
2  17.0  2078.3658  0.0  0.0  0.0  0.000000  113.976471  11.824485  0.235294   
3  30.0  3208.5313  0.0  0.0  0.0  0.100000   30.983333   4.139095  0.233333   
4  26.0  2624.9800  0.0  0.0  0.0  0.000000   -2.926923   4.783081  0.576923   

       0_14  ...  0_cell_line_MDA-MB-435S cells  0_cell_line_N. tabacum cells  \
0  0.230769  ...                            0.0                           0.0   
1  0.181818  ...                            0.0                           0.0   
2  0.176471  ...                            0.0                           0.0   
3  0.366667  ...                            0.0                           0.0   
4  0.269231  ...                            0.0        

In [94]:
X_train, X_test, y_train, y_test = train_test_split(X_filtered_var, y, test_size=0.2, random_state=42)

In [95]:
X_train_scaled, X_test_scaled = apply_scaler (X_train, X_test)

In [96]:
X_train_transformed, X_test_transformed = apply_pca (X_train_scaled, X_test_scaled, 0.95)

30 95.23


In [191]:
X_train_transformed, X_test_transformed = apply_tsne (X_train_scaled, X_test_scaled, perplexity=60)



In [195]:
def apply_umap(X_train: pd.DataFrame, 
                         X_test: pd.DataFrame, 
                         n_components: int = 10,
                         n_neighbors: int = 15,
                         min_dist: float = 0.1,
                         metric: str = 'euclidean',
                         random_state: int = 42):
    """
    Применяет UMAP для понижения размерности к обучающей и тестовой выборкам.

    Обучает UMAP только на обучающей выборке (X_train) и затем применяет
    обученный трансформер к обеим выборкам для предотвращения утечки данных.

    Args:
        X_train (pd.DataFrame): Обучающая выборка (признаки).
        X_test (pd.DataFrame): Тестовая выборка (признаки).
        n_components (int): Целевое количество компонент после понижения.
                           По умолчанию 10.
        n_neighbors (int): Параметр UMAP, влияющий на баланс локальной/глобальной
                           структуры. По умолчанию 15.
        min_dist (float): Параметр UMAP, влияющий на плотность точек.
                          По умолчанию 0.1.
        metric (str): Метрика расстояния для UMAP. По умолчанию 'euclidean'.
        random_state (int): Зерно для генератора случайных чисел для
                            воспроизводимости. По умолчанию 42.

    Returns:
        tuple: Кортеж из двух numpy массивов:
               - X_train_reduced (np.ndarray): Обучающая выборка после UMAP.
               - X_test_reduced (np.ndarray): Тестовая выборка после UMAP.
    """
    
    print(f"Применение UMAP с n_components={n_components}, n_neighbors={n_neighbors}, min_dist={min_dist}...")
    
    # 1. Создаем экземпляр UMAP трансформера
    # Важно: используем random_state для воспроизводимости
    umap_transformer = umap.UMAP(n_components=n_components,
                                 n_neighbors=n_neighbors,
                                 min_dist=min_dist,
                                 metric=metric,
                                 random_state=random_state,
                                 # Дополнительные опции, можно добавить
                                 # n_epochs=None, # Можно увеличить для больших датасетов
                                 # learning_rate=1.0,
                                 # init='spectral', # Или 'random'
                                )
    
    # 2. Обучаем трансформер ТОЛЬКО на обучающей выборке
    print("Обучение UMAP на X_train...")
    umap_transformer.fit(X_train)
    print("Обучение завершено.")
    
    # 3. Преобразуем обучающую выборку
    print("Преобразование X_train...")
    X_train_reduced = umap_transformer.transform(X_train)
    print(f"X_train размерность до: {X_train.shape}, после: {X_train_reduced.shape}")
    
    # 4. Преобразуем тестовую выборку, используя ТОТ ЖЕ обученный трансформер
    print("Преобразование X_test...")
    X_test_reduced = umap_transformer.transform(X_test)
    print(f"X_test размерность до: {X_test.shape}, после: {X_test_reduced.shape}")
    
    return X_train_reduced, X_test_reduced

In [196]:
X_train_umap, X_test_umap = apply_umap(X_train_scaled, X_test_scaled, n_components=30, n_neighbors=15, min_dist=0.1)

Применение UMAP с n_components=30, n_neighbors=15, min_dist=0.1...
Обучение UMAP на X_train...


  warn(


Обучение завершено.
Преобразование X_train...
X_train размерность до: (214, 4735), после: (214, 30)
Преобразование X_test...




X_test размерность до: (54, 4735), после: (54, 30)


# Облучение

In [46]:
# Определяем функцию для оценки модели
def evaluate_model(model, X_train, y_train, X_test, y_test):
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    train_r2 = r2_score(y_train, y_pred_train)
    train_mse = mean_squared_error(y_train, y_pred_train)
    test_r2 = r2_score(y_test, y_pred_test)
    test_mse = mean_squared_error(y_test, y_pred_test)

    print(f'Train MSE: {train_mse:.4f}')
    print(f'Train R2: {train_r2:.4f}')
    print(f'Test MSE: {test_mse:.4f}')
    print(f'Test R2: {test_r2:.4f}')

In [47]:
# Инициализируем модель SVR и оцениваем её
def svr_learning(X_train, y_train, X_test, y_test):
    model = SVR()
    model.fit(X_train, y_train)
    evaluate_model(model, X_train, y_train, X_test, y_test)

In [52]:
def svr_optuna(X_train, y_train, X_test, y_test):
    # Целевая функция для оптимизации
    def objective(trial):
        kernel = trial.suggest_categorical('kernel', ['rbf'])
    
        params = {
            "C": trial.suggest_float("C", 1e-1, 1e3, log=True),
            "epsilon": trial.suggest_float("epsilon", 1e-3, 1.0, log=True),
            "kernel": kernel,
        }

        # Добавляем gamma, если оно актуально для выбранного ядра
        if kernel in ['rbf', 'poly', 'sigmoid']:
            params["gamma"] = trial.suggest_categorical("gamma", ['scale', 'auto'])

        # Параметр degree актуален только для полиномиального ядра
        if kernel == 'poly':
            params["degree"] = trial.suggest_int("degree", 2, 3)

        # coef0 используется в poly и sigmoid
        if kernel in ['poly', 'sigmoid']:
            params["coef0"] = trial.suggest_float("coef0", -3.0, 3.0)

        model = SVR(**params)
        score = cross_val_score(
            model,
            X_train,
            y_train,
            cv=5,
            scoring="neg_root_mean_squared_error",
            n_jobs=-1
        )
        return score.mean()

    # Создание и оптимизация исследования
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=100)

    print("Лучшие гиперпараметры:", study.best_params)
    print("Лучшая средняя ошибка (CV):", -study.best_value)

    # Обучаем модель с лучшими гиперпараметрами
    best_model = SVR(**study.best_params)
    best_model.fit(X_train, y_train)

    # Оцениваем модель
    evaluate_model(best_model, X_train, y_train, X_test, y_test)

    '''
    result = permutation_importance(
    best_model, X_test, y_test, n_repeats=10, random_state=42, scoring='neg_root_mean_squared_error'
    )

    # Вывод важности признаков
    feature_importance = pd.DataFrame({
        'feature': X_test.columns,
        'importance_mean': result.importances_mean,
        'importance_std': result.importances_std
    }).sort_values(by='importance_mean', ascending=False)

    print(feature_importance)    
    # Сортируем по убыванию важности
    feature_importance_sorted = feature_importance.reset_index(drop=True)
    total_importance = feature_importance_sorted['importance_mean'].sum()
    feature_importance_sorted['cumulative_importance'] = feature_importance_sorted['importance_mean'].cumsum() / total_importance

    # Оставляем признаки, дающие в сумме до 80% важности
    top_features = feature_importance_sorted[feature_importance_sorted['cumulative_importance'] <= 0.8]

    # Если последний признак недобрал до 80%, добавим ещё один
    if top_features['cumulative_importance'].iloc[-1] < 0.95:
        top_features = feature_importance_sorted.iloc[:len(top_features)+1]

    print("Признаки, дающие 80% важности:")
    print(top_features[['feature', 'importance_mean', 'cumulative_importance']])

    # Формируем новые обучающие и тестовые выборки с отобранными признаками
    selected_features = top_features['feature'].tolist()
    X_train_selected = X_train[selected_features]
    X_test_selected = X_test[selected_features]

    # Переобучаем модель на отобранных признаках
    final_model = SVR(**study.best_params)
    final_model.fit(X_train_selected, y_train)

    # Финальная оценка модели
    print("\nОценка модели на отобранных признаках:")
    evaluate_model(final_model, X_train_selected, y_train, X_test_selected, y_test)
    '''

    return best_model

In [53]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

y = np.log1p(X_numerical_filtered_no_outliers[target]).reset_index(drop=True)

for name, df in dfs_dict.items():
    print(f"\n=== {name} ===")

    # 1. Объединяем все датасеты, как и раньше
    X = pd.concat([df, X_cell_line_filtered, classification_filtered], axis=1)
    X.columns = pd.Index(X.columns).map(str)

    # 2. Определяем, какие колонки к какому типу относятся
    #    ВАЖНО: Укажите здесь реальные имена колонок
    numerical_cols = df.columns.map(str).tolist()
    categorical_cols = X_cell_line_filtered.columns.map(str).tolist() + classification_filtered.columns.map(str).tolist()

    # 3. Разделяем данные ДО ЛЮБОЙ обработки, чтобы избежать утечки данных
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # 4. Создаем пайплайн для непрерывных признаков
    numerical_transformer = Pipeline(steps=[
        ('scaler', MinMaxScaler()),
        ('pca', PCA(n_components=0.95))
    ])

    # 5. Создаем ColumnTransformer для применения разных шагов к разным колонкам
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', 'passthrough', categorical_cols) # 'passthrough' означает "оставить как есть"
        ],
        sparse_threshold=0, 
        remainder='drop' # Если вдруг появятся лишние колонки, они будут удалены
    )

    # 6. Применяем предобработку
    # fit_transform на обучающей выборке
    X_train_transformed = preprocessor.fit_transform(X_train)
    # ТОЛЬКО transform на тестовой выборке (используем параметры, выученные на train)
    X_test_transformed = preprocessor.transform(X_test)

    
    # --- ДИАГНОСТИЧЕСКАЯ СТРОКА ---
    print(f"Тип данных (dtype) итогового массива: {X_train_transformed.dtype}")
    # -----------------------------

    print(f"Original shape: {X_train.shape}, Transformed shape: {X_train_transformed.shape}")

    model = svr_optuna(X_train_transformed, y_train, X_test_transformed, y_test)
    
    print(f"Original shape: {X_train.shape}, Transformed shape: {X_train_transformed.shape}")
    # Новая форма будет (кол-во PCA компонент + кол-во категориальных признаков)

    # 7. Обучаем модель на новых, правильно подготовленных данных
    model = svr_optuna(X_train_transformed, y_train, X_test_transformed, y_test)


[I 2025-07-11 01:10:03,542] A new study created in memory with name: no-name-49f6fed4-0306-403e-9ad6-fa4c6b935887



=== RDKit_Descriptors ===
Тип данных (dtype) итогового массива: float64
Original shape: (214, 337), Transformed shape: (214, 130)


[I 2025-07-11 01:10:06,669] Trial 0 finished with value: -2.1558631109207025 and parameters: {'kernel': 'rbf', 'C': 2.0690066611697606, 'epsilon': 0.024546691841990063, 'gamma': 'auto'}. Best is trial 0 with value: -2.1558631109207025.
[I 2025-07-11 01:10:08,335] Trial 1 finished with value: -2.119745671007304 and parameters: {'kernel': 'rbf', 'C': 82.5350646032687, 'epsilon': 0.03605813217416961, 'gamma': 'scale'}. Best is trial 1 with value: -2.119745671007304.
[I 2025-07-11 01:10:09,943] Trial 2 finished with value: -2.147673822090648 and parameters: {'kernel': 'rbf', 'C': 4.851665861024547, 'epsilon': 0.29004802257143314, 'gamma': 'auto'}. Best is trial 1 with value: -2.119745671007304.
[I 2025-07-11 01:10:11,396] Trial 3 finished with value: -2.1027643064035137 and parameters: {'kernel': 'rbf', 'C': 2.15103648681277, 'epsilon': 0.7655517021529361, 'gamma': 'scale'}. Best is trial 3 with value: -2.1027643064035137.
[I 2025-07-11 01:10:11,424] Trial 4 finished with value: -2.1443747

Лучшие гиперпараметры: {'kernel': 'rbf', 'C': 33.18496425271245, 'epsilon': 0.0036038272417294338, 'gamma': 'scale'}
Лучшая средняя ошибка (CV): 2.0902477522425076
Train MSE: 4.0237
Train R2: 0.1771
Test MSE: 5.6245
Test R2: -0.0191
Original shape: (214, 337), Transformed shape: (214, 130)


[I 2025-07-11 01:10:15,059] Trial 7 finished with value: -2.1287896950270637 and parameters: {'kernel': 'rbf', 'C': 0.31220218904110375, 'epsilon': 0.21166121078901703, 'gamma': 'scale'}. Best is trial 1 with value: -2.0972378360906596.
[I 2025-07-11 01:10:15,086] Trial 8 finished with value: -2.1919676289608203 and parameters: {'kernel': 'rbf', 'C': 161.05387958906437, 'epsilon': 0.008115130168800227, 'gamma': 'scale'}. Best is trial 1 with value: -2.0972378360906596.
[I 2025-07-11 01:10:15,113] Trial 9 finished with value: -2.1247926832789075 and parameters: {'kernel': 'rbf', 'C': 4.3053020318838495, 'epsilon': 0.03687506950401427, 'gamma': 'scale'}. Best is trial 1 with value: -2.0972378360906596.
[I 2025-07-11 01:10:15,149] Trial 10 finished with value: -2.0972605563242874 and parameters: {'kernel': 'rbf', 'C': 36.178463469336, 'epsilon': 0.03281410513380848, 'gamma': 'scale'}. Best is trial 1 with value: -2.0972378360906596.
[I 2025-07-11 01:10:15,185] Trial 11 finished with value

Лучшие гиперпараметры: {'kernel': 'rbf', 'C': 13.945061459104645, 'epsilon': 0.7380466318311102, 'gamma': 'scale'}
Лучшая средняя ошибка (CV): 2.0448127040750825
Train MSE: 3.9797
Train R2: 0.1861
Test MSE: 5.5035
Test R2: 0.0028

=== BLOMAP_Embeddings ===
Тип данных (dtype) итогового массива: float64
Original shape: (214, 1459), Transformed shape: (214, 154)


[I 2025-07-11 01:10:18,271] Trial 4 finished with value: -2.0919051312296997 and parameters: {'kernel': 'rbf', 'C': 34.38720360725292, 'epsilon': 0.23741134462406865, 'gamma': 'scale'}. Best is trial 4 with value: -2.0919051312296997.
[I 2025-07-11 01:10:18,298] Trial 5 finished with value: -2.136312448635984 and parameters: {'kernel': 'rbf', 'C': 113.86010683332282, 'epsilon': 0.02656994706405982, 'gamma': 'auto'}. Best is trial 4 with value: -2.0919051312296997.
[I 2025-07-11 01:10:18,327] Trial 6 finished with value: -2.1156252034929945 and parameters: {'kernel': 'rbf', 'C': 0.10943577227774974, 'epsilon': 0.5176723009011446, 'gamma': 'scale'}. Best is trial 4 with value: -2.0919051312296997.
[I 2025-07-11 01:10:18,354] Trial 7 finished with value: -2.165782037865714 and parameters: {'kernel': 'rbf', 'C': 1.0752072415166956, 'epsilon': 0.0056145463711443026, 'gamma': 'auto'}. Best is trial 4 with value: -2.0919051312296997.
[I 2025-07-11 01:10:18,381] Trial 8 finished with value: -2

Лучшие гиперпараметры: {'kernel': 'rbf', 'C': 13.896719717629253, 'epsilon': 0.7719129591224, 'gamma': 'scale'}
Лучшая средняя ошибка (CV): 2.044703858310864
Train MSE: 3.9820
Train R2: 0.1856
Test MSE: 5.5426
Test R2: -0.0042
Original shape: (214, 1459), Transformed shape: (214, 154)


[I 2025-07-11 01:10:21,798] Trial 7 finished with value: -2.149312010590073 and parameters: {'kernel': 'rbf', 'C': 2.0966892882467913, 'epsilon': 0.008621405739572579, 'gamma': 'auto'}. Best is trial 3 with value: -2.0899426779609183.
[I 2025-07-11 01:10:21,828] Trial 8 finished with value: -2.1079331843620372 and parameters: {'kernel': 'rbf', 'C': 66.33536163257233, 'epsilon': 0.029579510427900572, 'gamma': 'scale'}. Best is trial 3 with value: -2.0899426779609183.
[I 2025-07-11 01:10:21,857] Trial 9 finished with value: -2.20717870301948 and parameters: {'kernel': 'rbf', 'C': 0.13416338871485942, 'epsilon': 0.0030458649490000492, 'gamma': 'auto'}. Best is trial 3 with value: -2.0899426779609183.
[I 2025-07-11 01:10:21,897] Trial 10 finished with value: -2.088070359917884 and parameters: {'kernel': 'rbf', 'C': 24.97154709936507, 'epsilon': 0.16635534862466494, 'gamma': 'scale'}. Best is trial 10 with value: -2.088070359917884.
[I 2025-07-11 01:10:21,937] Trial 11 finished with value: 

Лучшие гиперпараметры: {'kernel': 'rbf', 'C': 12.015168748342855, 'epsilon': 0.6414738071508538, 'gamma': 'scale'}
Лучшая средняя ошибка (CV): 2.045502476479274
Train MSE: 3.9943
Train R2: 0.1831
Test MSE: 5.4609
Test R2: 0.0105

=== Fingerprints_Embeddings ===
Тип данных (dtype) итогового массива: float64
Original shape: (214, 2162), Transformed shape: (214, 149)


[I 2025-07-11 01:10:25,581] Trial 2 finished with value: -2.1019071113188317 and parameters: {'kernel': 'rbf', 'C': 14.789835768955033, 'epsilon': 0.0020408512136059617, 'gamma': 'scale'}. Best is trial 2 with value: -2.1019071113188317.
[I 2025-07-11 01:10:25,608] Trial 3 finished with value: -2.117007845018814 and parameters: {'kernel': 'rbf', 'C': 0.23806365029476162, 'epsilon': 0.9494853091495433, 'gamma': 'scale'}. Best is trial 2 with value: -2.1019071113188317.
[I 2025-07-11 01:10:25,636] Trial 4 finished with value: -2.1222253836190665 and parameters: {'kernel': 'rbf', 'C': 0.5379694554623278, 'epsilon': 0.3990183907814873, 'gamma': 'scale'}. Best is trial 2 with value: -2.1019071113188317.
[I 2025-07-11 01:10:25,662] Trial 5 finished with value: -2.1401238058930634 and parameters: {'kernel': 'rbf', 'C': 182.70061079719838, 'epsilon': 0.0019907834092226396, 'gamma': 'auto'}. Best is trial 2 with value: -2.1019071113188317.
[I 2025-07-11 01:10:25,690] Trial 6 finished with value

Лучшие гиперпараметры: {'kernel': 'rbf', 'C': 14.033570086165534, 'epsilon': 0.7490114639437018, 'gamma': 'scale'}
Лучшая средняя ошибка (CV): 2.044507173220255
Train MSE: 3.9783
Train R2: 0.1864
Test MSE: 5.5145
Test R2: 0.0009
Original shape: (214, 2162), Transformed shape: (214, 149)


[I 2025-07-11 01:10:29,704] Trial 6 finished with value: -2.1012702125172336 and parameters: {'kernel': 'rbf', 'C': 13.650571561560183, 'epsilon': 0.0018676787758209262, 'gamma': 'scale'}. Best is trial 6 with value: -2.1012702125172336.
[I 2025-07-11 01:10:29,744] Trial 7 finished with value: -2.1438347784791034 and parameters: {'kernel': 'rbf', 'C': 3.3622659899471063, 'epsilon': 0.029583557128529205, 'gamma': 'auto'}. Best is trial 6 with value: -2.1012702125172336.
[I 2025-07-11 01:10:29,775] Trial 8 finished with value: -2.098289654071325 and parameters: {'kernel': 'rbf', 'C': 44.84371410702914, 'epsilon': 0.0013263226663983898, 'gamma': 'scale'}. Best is trial 8 with value: -2.098289654071325.
[I 2025-07-11 01:10:29,815] Trial 9 finished with value: -2.1403578480739727 and parameters: {'kernel': 'rbf', 'C': 110.41564820914557, 'epsilon': 0.03147414839717265, 'gamma': 'auto'}. Best is trial 8 with value: -2.098289654071325.
[I 2025-07-11 01:10:29,868] Trial 10 finished with value:

Лучшие гиперпараметры: {'kernel': 'rbf', 'C': 9.06292476466314, 'epsilon': 0.6543879259884127, 'gamma': 'scale'}
Лучшая средняя ошибка (CV): 2.049995446759028
Train MSE: 4.0047
Train R2: 0.1810
Test MSE: 5.4616
Test R2: 0.0104

=== ProtBERT_Embeddings ===
Тип данных (dtype) итогового массива: float64
Original shape: (214, 1138), Transformed shape: (214, 115)


[I 2025-07-11 01:10:33,895] Trial 2 finished with value: -2.179009037463265 and parameters: {'kernel': 'rbf', 'C': 0.8040308582370119, 'epsilon': 0.008351995055238284, 'gamma': 'auto'}. Best is trial 0 with value: -2.1492012506278946.
[I 2025-07-11 01:10:33,924] Trial 3 finished with value: -2.1493722789554845 and parameters: {'kernel': 'rbf', 'C': 319.1602836955921, 'epsilon': 0.03203299527292695, 'gamma': 'auto'}. Best is trial 0 with value: -2.1492012506278946.
[I 2025-07-11 01:10:33,952] Trial 4 finished with value: -2.1537206395017057 and parameters: {'kernel': 'rbf', 'C': 869.5737040916554, 'epsilon': 0.37649790080895756, 'gamma': 'auto'}. Best is trial 0 with value: -2.1492012506278946.
[I 2025-07-11 01:10:33,981] Trial 5 finished with value: -2.140656137972582 and parameters: {'kernel': 'rbf', 'C': 0.6644106105845131, 'epsilon': 0.004706867677699527, 'gamma': 'scale'}. Best is trial 5 with value: -2.140656137972582.
[I 2025-07-11 01:10:34,010] Trial 6 finished with value: -2.16

Лучшие гиперпараметры: {'kernel': 'rbf', 'C': 15.81791547467756, 'epsilon': 0.7769915111145795, 'gamma': 'scale'}
Лучшая средняя ошибка (CV): 2.0447918131168548
Train MSE: 3.9758
Train R2: 0.1869
Test MSE: 5.5571
Test R2: -0.0069
Original shape: (214, 1138), Transformed shape: (214, 115)


[I 2025-07-11 01:10:38,127] Trial 6 finished with value: -2.210117191972779 and parameters: {'kernel': 'rbf', 'C': 0.1090231789145865, 'epsilon': 0.003891733109718008, 'gamma': 'auto'}. Best is trial 3 with value: -2.120594622035097.
[I 2025-07-11 01:10:38,157] Trial 7 finished with value: -2.2410724760790712 and parameters: {'kernel': 'rbf', 'C': 246.64577336161713, 'epsilon': 0.01315936867841127, 'gamma': 'scale'}. Best is trial 3 with value: -2.120594622035097.
[I 2025-07-11 01:10:38,187] Trial 8 finished with value: -2.108373427676743 and parameters: {'kernel': 'rbf', 'C': 9.02725285997454, 'epsilon': 0.005060523277137313, 'gamma': 'scale'}. Best is trial 8 with value: -2.108373427676743.
[I 2025-07-11 01:10:38,228] Trial 9 finished with value: -2.1959343816068153 and parameters: {'kernel': 'rbf', 'C': 0.3819629476666575, 'epsilon': 0.0226774450888662, 'gamma': 'auto'}. Best is trial 8 with value: -2.108373427676743.
[I 2025-07-11 01:10:38,272] Trial 10 finished with value: -2.0888

Лучшие гиперпараметры: {'kernel': 'rbf', 'C': 13.886828931303324, 'epsilon': 0.731377482690618, 'gamma': 'scale'}
Лучшая средняя ошибка (CV): 2.045112727967262
Train MSE: 3.9799
Train R2: 0.1861
Test MSE: 5.4999
Test R2: 0.0035

=== Combined_All_Features ===


[I 2025-07-11 01:10:42,668] A new study created in memory with name: no-name-e37e7d73-1e49-4267-aa52-6dda97d5d8d9
[I 2025-07-11 01:10:42,697] Trial 0 finished with value: -2.2712522013675778 and parameters: {'kernel': 'rbf', 'C': 310.19296224898994, 'epsilon': 0.0011861817309886276, 'gamma': 'scale'}. Best is trial 0 with value: -2.2712522013675778.
[I 2025-07-11 01:10:42,727] Trial 1 finished with value: -2.124724824048696 and parameters: {'kernel': 'rbf', 'C': 92.85142593120516, 'epsilon': 0.16585550538651755, 'gamma': 'scale'}. Best is trial 1 with value: -2.124724824048696.
[I 2025-07-11 01:10:42,767] Trial 2 finished with value: -2.134686054480751 and parameters: {'kernel': 'rbf', 'C': 38.210117023468456, 'epsilon': 0.0014525219727096856, 'gamma': 'auto'}. Best is trial 1 with value: -2.124724824048696.
[I 2025-07-11 01:10:42,796] Trial 3 finished with value: -2.1329722214879077 and parameters: {'kernel': 'rbf', 'C': 2.03926842420646, 'epsilon': 0.05454949679898862, 'gamma': 'scal

Тип данных (dtype) итогового массива: float64
Original shape: (214, 4754), Transformed shape: (214, 167)


[I 2025-07-11 01:10:42,896] Trial 6 finished with value: -2.1980686443128974 and parameters: {'kernel': 'rbf', 'C': 0.2818898219001492, 'epsilon': 0.009190575776863425, 'gamma': 'auto'}. Best is trial 1 with value: -2.124724824048696.
[I 2025-07-11 01:10:42,924] Trial 7 finished with value: -2.1234244609926733 and parameters: {'kernel': 'rbf', 'C': 4.848838331955128, 'epsilon': 0.014108478917819557, 'gamma': 'scale'}. Best is trial 7 with value: -2.1234244609926733.
[I 2025-07-11 01:10:42,953] Trial 8 finished with value: -2.1110651575047807 and parameters: {'kernel': 'rbf', 'C': 7.897927687166847, 'epsilon': 0.02162479750807718, 'gamma': 'scale'}. Best is trial 8 with value: -2.1110651575047807.
[I 2025-07-11 01:10:42,983] Trial 9 finished with value: -2.1795876971505814 and parameters: {'kernel': 'rbf', 'C': 0.644421864737513, 'epsilon': 0.0077077280689583656, 'gamma': 'auto'}. Best is trial 8 with value: -2.1110651575047807.
[I 2025-07-11 01:10:43,027] Trial 10 finished with value: 

Лучшие гиперпараметры: {'kernel': 'rbf', 'C': 16.269230828517127, 'epsilon': 0.7227013341260581, 'gamma': 'scale'}
Лучшая средняя ошибка (CV): 2.0452224899619607
Train MSE: 3.9746
Train R2: 0.1871
Test MSE: 5.5015
Test R2: 0.0032
Original shape: (214, 4754), Transformed shape: (214, 167)


[I 2025-07-11 01:10:47,026] Trial 5 finished with value: -2.135044222710453 and parameters: {'kernel': 'rbf', 'C': 44.542196135954725, 'epsilon': 0.036390455551644114, 'gamma': 'auto'}. Best is trial 5 with value: -2.135044222710453.
[I 2025-07-11 01:10:47,066] Trial 6 finished with value: -2.134691150687359 and parameters: {'kernel': 'rbf', 'C': 411.24515163137835, 'epsilon': 0.001844212027571584, 'gamma': 'auto'}. Best is trial 6 with value: -2.134691150687359.
[I 2025-07-11 01:10:47,106] Trial 7 finished with value: -2.1650223833764928 and parameters: {'kernel': 'rbf', 'C': 1.0754997441224892, 'epsilon': 0.01950154042636042, 'gamma': 'auto'}. Best is trial 6 with value: -2.134691150687359.
[I 2025-07-11 01:10:47,135] Trial 8 finished with value: -2.055395197484705 and parameters: {'kernel': 'rbf', 'C': 24.190400128655863, 'epsilon': 0.7267349263007997, 'gamma': 'scale'}. Best is trial 8 with value: -2.055395197484705.
[I 2025-07-11 01:10:47,164] Trial 9 finished with value: -2.12416

Лучшие гиперпараметры: {'kernel': 'rbf', 'C': 15.62546560182069, 'epsilon': 0.7627893277341583, 'gamma': 'scale'}
Лучшая средняя ошибка (CV): 2.044579909402897
Train MSE: 3.9758
Train R2: 0.1869
Test MSE: 5.5381
Test R2: -0.0034

=== Classification_Descriptors ===


ValueError: Selected columns, ['seq_length', 'molecular_weight', 'nh3_tail', 'po3_pos', 'biotinylated', 'acylated_n_terminal', 'cyclic', 'amidated', 'stearyl_uptake', 'hexahistidine_tagged', 'aromaticity', 'instability_index', 'isoelectric_point', 'helix_fraction', 'turn_fraction', 'sheet_fraction', 'molar_extinction_coefficient_reduced', 'molar_extinction_coefficient_oxidized', 'gravy'], are not unique in dataframe

In [54]:
y = np.log1p(X_numerical_filtered_no_outliers[target]).reset_index(drop=True)

for name, df in dfs_dict.items():
    print(f"\n=== {name} ===")

    X = pd.concat([df, X_cell_line_filtered, classification_filtered], axis=1)

    X.columns = pd.Index(X.columns).map(str)
    
    X_filtered_var = apply_varThreshold(X, 0)
    # X_corr = apply_corr(X_filtered_var, 0.2)
    X_train, X_test, y_train, y_test = train_test_split(X_filtered_var, y, test_size=0.2, random_state=42)

    X_train_scaled, X_test_scaled = apply_scaler (X_train, X_test)
    X_train_transformed, X_test_transformed = apply_pca (X_train_scaled, X_test_scaled, 0.95)

    model = svr_optuna(X_train_transformed, y_train, X_test_transformed, y_test)


=== RDKit_Descriptors ===

Датафрейм после отсева по дисперсии (метод Scikit-learn):
          MW     GRAVY         pI     Charge  Charge_Density  Aromaticity  \
0  3151.8362 -1.492308  11.608322  12.577710        0.483758     0.192308   
1  2785.1939 -1.272727  11.839377   5.637233        0.256238     0.136364   
2  2078.3658 -1.123529  11.824485   3.657547        0.215150     0.000000   
3  3208.5313 -0.066667   4.139095  -2.395473       -0.079849     0.100000   
4  2624.9800  0.100000   4.783081  -1.396580       -0.053715     0.000000   

   Flexibility  Aliphatic_Index  Boman_Index  Hydrophobic_AA  ...  \
0     1.021711         0.269231     1.000000             7.0  ...   
1     1.000826         0.272727     0.727273             7.0  ...   
2     0.983251         0.235294     0.647059             4.0  ...   
3     0.993229         0.366667     1.033333            13.0  ...   
4     0.999597         0.461538     1.192308            12.0  ...   

   hexahistidine_tagged  aromaticity

  sqr = np.multiply(arr, arr, out=arr, where=where)
  sqr = _ensure_numeric((avg - values) ** 2)
[I 2025-07-11 04:03:23,184] A new study created in memory with name: no-name-3bc2b6cc-8cb3-4f42-8cc1-8640db17a763


                MW       GRAVY          pI      Charge  Charge_Density  \
count   268.000000  268.000000  268.000000  268.000000      268.000000   
mean   1970.765109   -1.549609   10.064478    4.185431        0.294253   
std    1027.891708    1.349577    2.217002    4.505073        0.257694   
min      89.093200   -4.500000    4.050028   -6.131594       -0.392198   
25%    1321.481800   -2.355833    8.956109    0.652760        0.091271   
50%    1814.091950   -1.362121   10.650902    3.576524        0.284260   
75%    2333.846750   -0.684295   11.999968    5.679305        0.457545   
max    8511.835100    2.475000   11.999968   35.776307        0.944656   

       Aromaticity  Flexibility  Aliphatic_Index  Boman_Index  Hydrophobic_AA  \
count   268.000000   268.000000       268.000000   268.000000      268.000000   
mean      0.095923     0.996992         0.193904     0.681644        4.343284   
std       0.112545     0.019976         0.182269     0.336047        3.891034   
min      

[I 2025-07-11 04:03:26,025] Trial 0 finished with value: -1.707575436755468 and parameters: {'kernel': 'rbf', 'C': 82.10838809788915, 'epsilon': 0.49455525771693126, 'gamma': 'scale'}. Best is trial 0 with value: -1.707575436755468.
[I 2025-07-11 04:03:27,629] Trial 1 finished with value: -1.7964112820426923 and parameters: {'kernel': 'rbf', 'C': 0.6561278569984318, 'epsilon': 0.005809430091838878, 'gamma': 'scale'}. Best is trial 0 with value: -1.707575436755468.
[I 2025-07-11 04:03:29,361] Trial 2 finished with value: -1.8441041250088195 and parameters: {'kernel': 'rbf', 'C': 0.3850965186002394, 'epsilon': 0.0010665757021917923, 'gamma': 'scale'}. Best is trial 0 with value: -1.707575436755468.
[I 2025-07-11 04:03:30,892] Trial 3 finished with value: -1.925545679981975 and parameters: {'kernel': 'rbf', 'C': 145.38629676571236, 'epsilon': 0.0056353241528319465, 'gamma': 'auto'}. Best is trial 0 with value: -1.707575436755468.
[I 2025-07-11 04:03:30,921] Trial 4 finished with value: -1

Лучшие гиперпараметры: {'kernel': 'rbf', 'C': 14.513330681351293, 'epsilon': 0.8096752532457047, 'gamma': 'scale'}
Лучшая средняя ошибка (CV): 1.636083903207623
Train MSE: 0.6401
Train R2: 0.8691
Test MSE: 2.9408
Test R2: 0.4672

=== BLOMAP_Embeddings ===

Датафрейм после отсева по дисперсии (метод Scikit-learn):
   blomap_0  blomap_1  blomap_2  blomap_3  blomap_5  blomap_6  blomap_7  \
0      0.62      0.29       0.0     -0.06     -1.50      0.00       2.9   
1      0.62      0.29       0.0     -0.06     -2.53      0.00       3.0   
2      0.62      0.29       0.0     -0.06     -0.74      0.00       1.4   
3      0.62      0.29       0.0     -0.06     -0.90      0.46       1.6   
4      0.62      0.29       0.0     -0.06     -0.74      0.00       1.4   

   blomap_8  blomap_10  blomap_11  ...  hexahistidine_tagged  aromaticity  \
0      0.00      -1.50        0.0  ...                   0.0     0.192308   
1      1.17      -1.50        0.0  ...                   0.0     0.136364   
2  

[I 2025-07-11 04:03:35,026] A new study created in memory with name: no-name-3b389d5d-d82d-4b1b-95be-b529e7445ab6
[I 2025-07-11 04:03:35,053] Trial 0 finished with value: -1.841336247908319 and parameters: {'kernel': 'rbf', 'C': 9.732310613727213, 'epsilon': 0.42044986560286246, 'gamma': 'auto'}. Best is trial 0 with value: -1.841336247908319.
[I 2025-07-11 04:03:35,080] Trial 1 finished with value: -1.8704917000276093 and parameters: {'kernel': 'rbf', 'C': 15.9911301070454, 'epsilon': 0.12925153852122853, 'gamma': 'auto'}. Best is trial 0 with value: -1.841336247908319.
[I 2025-07-11 04:03:35,108] Trial 2 finished with value: -2.0475030153861717 and parameters: {'kernel': 'rbf', 'C': 0.14242165202744603, 'epsilon': 0.011355142303857795, 'gamma': 'scale'}. Best is trial 0 with value: -1.841336247908319.
[I 2025-07-11 04:03:35,135] Trial 3 finished with value: -2.0743981713945088 and parameters: {'kernel': 'rbf', 'C': 0.10761140443361228, 'epsilon': 0.0020787213528309667, 'gamma': 'scal

         blomap_0    blomap_1    blomap_2    blomap_3    blomap_5    blomap_6  \
count  268.000000  268.000000  268.000000  268.000000  268.000000  268.000000   
mean    -0.660112   -0.003433    1.272761    0.275821   -0.465746   -0.033284   
std      1.316615    0.166192    1.406847    0.501976    1.349090    0.169254   
min     -2.530000   -0.300000    0.000000   -0.060000   -2.530000   -0.300000   
25%     -1.500000   -0.100000    0.000000    0.000000   -1.500000   -0.100000   
50%     -0.400000    0.000000    0.200000    0.000000   -0.180000    0.000000   
75%      0.290000    0.000000    2.900000    0.000000    0.682500    0.000000   
max      1.380000    0.580000    3.000000    1.170000    1.380000    0.580000   

         blomap_7    blomap_8   blomap_10   blomap_11  ...  \
count  268.000000  268.000000  268.000000  268.000000  ...   
mean     1.161940    0.238321   -0.448470   -0.025709  ...   
std      1.361306    0.474427    1.330883    0.225169  ...   
min      0.000000   -0

[I 2025-07-11 04:03:35,229] Trial 6 finished with value: -2.0384636398661833 and parameters: {'kernel': 'rbf', 'C': 136.61245091688969, 'epsilon': 0.058590557955064244, 'gamma': 'auto'}. Best is trial 5 with value: -1.8223020741976743.
[I 2025-07-11 04:03:35,256] Trial 7 finished with value: -1.9062046408064977 and parameters: {'kernel': 'rbf', 'C': 0.5314542959910764, 'epsilon': 0.0021037058123381746, 'gamma': 'scale'}. Best is trial 5 with value: -1.8223020741976743.
[I 2025-07-11 04:03:35,284] Trial 8 finished with value: -2.1145587192181035 and parameters: {'kernel': 'rbf', 'C': 0.1922393045103043, 'epsilon': 0.25114248913351056, 'gamma': 'auto'}. Best is trial 5 with value: -1.8223020741976743.
[I 2025-07-11 04:03:35,323] Trial 9 finished with value: -2.4449695747795914 and parameters: {'kernel': 'rbf', 'C': 810.7172315527761, 'epsilon': 0.0016344309663946176, 'gamma': 'auto'}. Best is trial 5 with value: -1.8223020741976743.
[I 2025-07-11 04:03:35,361] Trial 10 finished with valu

Лучшие гиперпараметры: {'kernel': 'rbf', 'C': 2.758018257483951, 'epsilon': 0.4455042248106184, 'gamma': 'scale'}
Лучшая средняя ошибка (CV): 1.802768197359206
Train MSE: 1.1087
Train R2: 0.7733
Test MSE: 3.0734
Test R2: 0.4431

=== Fingerprints_Embeddings ===

Датафрейм после отсева по дисперсии (метод Scikit-learn):
   fingerprint_11  fingerprint_22  fingerprint_23  fingerprint_27  \
0             0.0             0.0             1.0             0.0   
1             1.0             0.0             0.0             0.0   
2             0.0             0.0             0.0             0.0   
3             0.0             0.0             0.0             0.0   
4             0.0             0.0             0.0             0.0   

   fingerprint_36  fingerprint_41  fingerprint_56  fingerprint_70  \
0             0.0             1.0             0.0             0.0   
1             0.0             0.0             1.0             0.0   
2             0.0             0.0             0.0         

[I 2025-07-11 04:03:39,289] A new study created in memory with name: no-name-5f2f63d2-8df6-4570-893f-58ec3f218e9c
[I 2025-07-11 04:03:39,327] Trial 0 finished with value: -2.041090529380013 and parameters: {'kernel': 'rbf', 'C': 678.9799930319338, 'epsilon': 0.0010738885309084566, 'gamma': 'auto'}. Best is trial 0 with value: -2.041090529380013.
[I 2025-07-11 04:03:39,356] Trial 1 finished with value: -1.747860826558791 and parameters: {'kernel': 'rbf', 'C': 4.740247224493303, 'epsilon': 0.3722669387007538, 'gamma': 'scale'}. Best is trial 1 with value: -1.747860826558791.
[I 2025-07-11 04:03:39,384] Trial 2 finished with value: -1.8450482086656348 and parameters: {'kernel': 'rbf', 'C': 0.5274422063223863, 'epsilon': 0.018223440505100042, 'gamma': 'auto'}. Best is trial 1 with value: -1.747860826558791.
[I 2025-07-11 04:03:39,412] Trial 3 finished with value: -1.9261531820070483 and parameters: {'kernel': 'rbf', 'C': 0.20586891463407503, 'epsilon': 0.0034176171247860742, 'gamma': 'scal

       fingerprint_11  fingerprint_22  fingerprint_23  fingerprint_27  \
count      268.000000      268.000000      268.000000      268.000000   
mean         0.227612        0.003731        0.007463        0.037313   
std          0.420075        0.061085        0.086225        0.189883   
min          0.000000        0.000000        0.000000        0.000000   
25%          0.000000        0.000000        0.000000        0.000000   
50%          0.000000        0.000000        0.000000        0.000000   
75%          0.000000        0.000000        0.000000        0.000000   
max          1.000000        1.000000        1.000000        1.000000   

       fingerprint_36  fingerprint_41  fingerprint_56  fingerprint_70  \
count      268.000000      268.000000      268.000000      268.000000   
mean         0.044776        0.294776        0.003731        0.003731   
std          0.207199        0.456795        0.061085        0.061085   
min          0.000000        0.000000        0.000

[I 2025-07-11 04:03:39,468] Trial 5 finished with value: -1.866335057278707 and parameters: {'kernel': 'rbf', 'C': 108.4274724463173, 'epsilon': 0.16553892432057243, 'gamma': 'scale'}. Best is trial 1 with value: -1.747860826558791.
[I 2025-07-11 04:03:39,495] Trial 6 finished with value: -1.8995604436460973 and parameters: {'kernel': 'rbf', 'C': 0.27606265462942836, 'epsilon': 0.06638366994233728, 'gamma': 'auto'}. Best is trial 1 with value: -1.747860826558791.
[I 2025-07-11 04:03:39,524] Trial 7 finished with value: -1.8706758345690893 and parameters: {'kernel': 'rbf', 'C': 71.57527817431189, 'epsilon': 0.08080597293351965, 'gamma': 'auto'}. Best is trial 1 with value: -1.747860826558791.
[I 2025-07-11 04:03:39,551] Trial 8 finished with value: -1.821798733199747 and parameters: {'kernel': 'rbf', 'C': 0.5950169237653892, 'epsilon': 0.12287274573980644, 'gamma': 'scale'}. Best is trial 1 with value: -1.747860826558791.
[I 2025-07-11 04:03:39,577] Trial 9 finished with value: -1.78724

Лучшие гиперпараметры: {'kernel': 'rbf', 'C': 5.1703686376142395, 'epsilon': 0.830599641108961, 'gamma': 'scale'}
Лучшая средняя ошибка (CV): 1.739545273108939
Train MSE: 0.8020
Train R2: 0.8360
Test MSE: 3.3700
Test R2: 0.3894

=== ProtBERT_Embeddings ===

Датафрейм после отсева по дисперсии (метод Scikit-learn):
   protbert_0  protbert_1  protbert_2  protbert_3  protbert_4  protbert_5  \
0     0.03056    0.024347    0.136336    0.032939   -0.065259   -0.135983   
1     0.03056    0.024347    0.136336    0.032939   -0.065259   -0.135983   
2     0.03056    0.024347    0.136336    0.032939   -0.065259   -0.135983   
3     0.03056    0.024347    0.136336    0.032939   -0.065259   -0.135983   
4     0.03056    0.024347    0.136336    0.032939   -0.065259   -0.135983   

   protbert_6  protbert_7  protbert_8  protbert_9  ...  hexahistidine_tagged  \
0   -0.046607    0.032417    0.011024   -0.019775  ...                   0.0   
1   -0.046607    0.032417    0.011024   -0.019775  ...       

[I 2025-07-11 04:03:44,602] A new study created in memory with name: no-name-70185588-21d4-42bc-ad14-ab6af911d8f4
[I 2025-07-11 04:03:44,630] Trial 0 finished with value: -1.874935561147216 and parameters: {'kernel': 'rbf', 'C': 416.1253045550155, 'epsilon': 0.06823651225681801, 'gamma': 'scale'}. Best is trial 0 with value: -1.874935561147216.
[I 2025-07-11 04:03:44,657] Trial 1 finished with value: -2.107505386328151 and parameters: {'kernel': 'rbf', 'C': 0.10304607699503182, 'epsilon': 0.013611427711689047, 'gamma': 'scale'}. Best is trial 0 with value: -1.874935561147216.
[I 2025-07-11 04:03:44,685] Trial 2 finished with value: -1.8706709748834391 and parameters: {'kernel': 'rbf', 'C': 7.400471781520439, 'epsilon': 0.09345210816335672, 'gamma': 'auto'}. Best is trial 2 with value: -1.8706709748834391.
[I 2025-07-11 04:03:44,713] Trial 3 finished with value: -1.8701147307663821 and parameters: {'kernel': 'rbf', 'C': 11.02220122039642, 'epsilon': 0.47087814899160646, 'gamma': 'scale'

       protbert_0  protbert_1  protbert_2  protbert_3  protbert_4  protbert_5  \
count  268.000000  268.000000  268.000000  268.000000  268.000000  268.000000   
mean     0.030560    0.024347    0.136336    0.032939   -0.065260   -0.135982   
std      0.000003    0.000007    0.000007    0.000004    0.000014    0.000008   
min      0.030527    0.024347    0.136259    0.032896   -0.065424   -0.135983   
25%      0.030560    0.024347    0.136336    0.032939   -0.065259   -0.135983   
50%      0.030560    0.024347    0.136336    0.032939   -0.065259   -0.135983   
75%      0.030560    0.024347    0.136336    0.032939   -0.065259   -0.135983   
max      0.030560    0.024433    0.136337    0.032939   -0.065259   -0.135885   

       protbert_6  protbert_7  protbert_8  protbert_9  ...  \
count  268.000000  268.000000  268.000000  268.000000  ...   
mean    -0.046606    0.032417    0.011023   -0.019774  ...   
std      0.000008    0.000007    0.000008    0.000008  ...   
min     -0.046607    0

[I 2025-07-11 04:03:44,768] Trial 5 finished with value: -1.88291219608332 and parameters: {'kernel': 'rbf', 'C': 1.9108452289881424, 'epsilon': 0.007081895879501656, 'gamma': 'auto'}. Best is trial 4 with value: -1.866496484493328.
[I 2025-07-11 04:03:44,796] Trial 6 finished with value: -1.9708489063892833 and parameters: {'kernel': 'rbf', 'C': 0.4078028069744354, 'epsilon': 0.014497456931542982, 'gamma': 'scale'}. Best is trial 4 with value: -1.866496484493328.
[I 2025-07-11 04:03:44,824] Trial 7 finished with value: -1.888812196443904 and parameters: {'kernel': 'rbf', 'C': 2.9892927540777547, 'epsilon': 0.8044269595796528, 'gamma': 'auto'}. Best is trial 4 with value: -1.866496484493328.
[I 2025-07-11 04:03:44,852] Trial 8 finished with value: -1.8674474843202702 and parameters: {'kernel': 'rbf', 'C': 13.483875694997483, 'epsilon': 0.06819729734540035, 'gamma': 'auto'}. Best is trial 4 with value: -1.866496484493328.
[I 2025-07-11 04:03:44,878] Trial 9 finished with value: -1.88998

Лучшие гиперпараметры: {'kernel': 'rbf', 'C': 90.49856975710506, 'epsilon': 0.0030676387318632125, 'gamma': 'auto'}
Лучшая средняя ошибка (CV): 1.828951937897779
Train MSE: 2.7181
Train R2: 0.4441
Test MSE: 4.2058
Test R2: 0.2380

=== Combined_All_Features ===

Датафрейм после отсева по дисперсии (метод Scikit-learn):
          MW     GRAVY         pI     Charge  Charge_Density  Aromaticity  \
0  3151.8362 -1.492308  11.608322  12.577710        0.483758     0.192308   
1  2785.1939 -1.272727  11.839377   5.637233        0.256238     0.136364   
2  2078.3658 -1.123529  11.824485   3.657547        0.215150     0.000000   
3  3208.5313 -0.066667   4.139095  -2.395473       -0.079849     0.100000   
4  2624.9800  0.100000   4.783081  -1.396580       -0.053715     0.000000   

   Flexibility  Aliphatic_Index  Boman_Index  Hydrophobic_AA  ...  \
0     1.021711         0.269231     1.000000             7.0  ...   
1     1.000826         0.272727     0.727273             7.0  ...   
2     0.98

  sqr = _ensure_numeric((avg - values) ** 2)
[I 2025-07-11 04:03:51,362] A new study created in memory with name: no-name-bc6953b5-4a91-4f44-a2fe-b9eaf641e65b
[I 2025-07-11 04:03:51,390] Trial 0 finished with value: -1.9952850145600727 and parameters: {'kernel': 'rbf', 'C': 969.0858002616169, 'epsilon': 0.001313298040481044, 'gamma': 'auto'}. Best is trial 0 with value: -1.9952850145600727.
[I 2025-07-11 04:03:51,418] Trial 1 finished with value: -1.800801485136693 and parameters: {'kernel': 'rbf', 'C': 1.329584677814765, 'epsilon': 0.7931446683957362, 'gamma': 'auto'}. Best is trial 1 with value: -1.800801485136693.
[I 2025-07-11 04:03:51,445] Trial 2 finished with value: -1.8785288820053274 and parameters: {'kernel': 'rbf', 'C': 0.3436120389931448, 'epsilon': 0.212635873806073, 'gamma': 'auto'}. Best is trial 1 with value: -1.800801485136693.


                MW       GRAVY          pI      Charge  Charge_Density  \
count   268.000000  268.000000  268.000000  268.000000      268.000000   
mean   1970.765109   -1.549609   10.064478    4.185431        0.294253   
std    1027.891708    1.349577    2.217002    4.505073        0.257694   
min      89.093200   -4.500000    4.050028   -6.131594       -0.392198   
25%    1321.481800   -2.355833    8.956109    0.652760        0.091271   
50%    1814.091950   -1.362121   10.650902    3.576524        0.284260   
75%    2333.846750   -0.684295   11.999968    5.679305        0.457545   
max    8511.835100    2.475000   11.999968   35.776307        0.944656   

       Aromaticity  Flexibility  Aliphatic_Index  Boman_Index  Hydrophobic_AA  \
count   268.000000   268.000000       268.000000   268.000000      268.000000   
mean      0.095923     0.996992         0.193904     0.681644        4.343284   
std       0.112545     0.019976         0.182269     0.336047        3.891034   
min      

[I 2025-07-11 04:03:51,473] Trial 3 finished with value: -1.7763724801526959 and parameters: {'kernel': 'rbf', 'C': 21.755916848161373, 'epsilon': 0.021676351231647826, 'gamma': 'scale'}. Best is trial 3 with value: -1.7763724801526959.
[I 2025-07-11 04:03:51,500] Trial 4 finished with value: -1.7221419567633294 and parameters: {'kernel': 'rbf', 'C': 3.9764050101870057, 'epsilon': 0.634945877275383, 'gamma': 'auto'}. Best is trial 4 with value: -1.7221419567633294.
[I 2025-07-11 04:03:51,529] Trial 5 finished with value: -1.7198962479557962 and parameters: {'kernel': 'rbf', 'C': 94.48270243583806, 'epsilon': 0.5659368698663907, 'gamma': 'auto'}. Best is trial 5 with value: -1.7198962479557962.
[I 2025-07-11 04:03:51,556] Trial 6 finished with value: -1.9462511581762474 and parameters: {'kernel': 'rbf', 'C': 0.20410915047824918, 'epsilon': 0.052213903240003606, 'gamma': 'scale'}. Best is trial 5 with value: -1.7198962479557962.
[I 2025-07-11 04:03:51,583] Trial 7 finished with value: -1

Лучшие гиперпараметры: {'kernel': 'rbf', 'C': 5.852195455925944, 'epsilon': 0.7124636070273992, 'gamma': 'auto'}
Лучшая средняя ошибка (CV): 1.7055057909427238
Train MSE: 0.6100
Train R2: 0.8753
Test MSE: 3.1907
Test R2: 0.4219

=== Classification_Descriptors ===

Датафрейм после отсева по дисперсии (метод Scikit-learn):
   seq_length  molecular_weight  nh3_tail  amidated  hexahistidine_tagged  \
0        26.0         3151.8362       0.0       0.0                   0.0   
1        22.0         2785.1939       0.0       0.0                   0.0   
2        17.0         2078.3658       0.0       0.0                   0.0   
3        30.0         3208.5313       0.0       0.0                   0.0   
4        26.0         2624.9800       0.0       0.0                   0.0   

   aromaticity  instability_index  isoelectric_point  helix_fraction  \
0     0.192308          10.653846          11.608322        0.538462   
1     0.136364         115.700455          11.839377        0.227273  

[I 2025-07-11 04:03:55,296] Trial 1 finished with value: -1.881859451304726 and parameters: {'kernel': 'rbf', 'C': 1.5805801743200374, 'epsilon': 0.2655549961211395, 'gamma': 'auto'}. Best is trial 0 with value: -1.759359332590244.
[I 2025-07-11 04:03:55,324] Trial 2 finished with value: -1.8316883021519732 and parameters: {'kernel': 'rbf', 'C': 5.929393776011413, 'epsilon': 0.015466869810083966, 'gamma': 'auto'}. Best is trial 0 with value: -1.759359332590244.
[I 2025-07-11 04:03:55,365] Trial 3 finished with value: -1.7789535967411862 and parameters: {'kernel': 'rbf', 'C': 2.1248214493800117, 'epsilon': 0.0016398277301753216, 'gamma': 'scale'}. Best is trial 0 with value: -1.759359332590244.
[I 2025-07-11 04:03:55,397] Trial 4 finished with value: -2.1384474816789143 and parameters: {'kernel': 'rbf', 'C': 0.12486835205294419, 'epsilon': 0.003586360927767154, 'gamma': 'auto'}. Best is trial 0 with value: -1.759359332590244.
[I 2025-07-11 04:03:55,426] Trial 5 finished with value: -1.8

Лучшие гиперпараметры: {'kernel': 'rbf', 'C': 60.25429793124982, 'epsilon': 0.7747748909786593, 'gamma': 'scale'}
Лучшая средняя ошибка (CV): 1.6495924428242479
Train MSE: 0.5092
Train R2: 0.8959
Test MSE: 3.8979
Test R2: 0.2938


In [233]:
model = svr_optuna(X_train_scaled, y_train, X_test_scaled, y_test)

[I 2025-04-20 15:59:23,036] A new study created in memory with name: no-name-bfbe5d22-ed17-4593-9517-276647867d58
[I 2025-04-20 15:59:23,306] Trial 0 finished with value: -1.7120523389124205 and parameters: {'C': 13.035255162984711, 'epsilon': 0.03730097116246337, 'kernel': 'rbf'}. Best is trial 0 with value: -1.7120523389124205.
[I 2025-04-20 15:59:23,474] Trial 1 finished with value: -1.7159074409419854 and parameters: {'C': 7.070461329939782, 'epsilon': 0.2556291142643974, 'kernel': 'rbf'}. Best is trial 0 with value: -1.7120523389124205.
[I 2025-04-20 15:59:23,643] Trial 2 finished with value: -1.7673078358851804 and parameters: {'C': 2.7699494220727288, 'epsilon': 0.1453293064808481, 'kernel': 'rbf'}. Best is trial 0 with value: -1.7120523389124205.
[I 2025-04-20 15:59:23,800] Trial 3 finished with value: -1.8245612590408147 and parameters: {'C': 1.0947937764206288, 'epsilon': 0.043708963986457856, 'kernel': 'rbf'}. Best is trial 0 with value: -1.7120523389124205.
[I 2025-04-20 15

Лучшие гиперпараметры: {'C': 12.862918844000914, 'epsilon': 0.6713343238724396, 'kernel': 'rbf'}
Лучшая средняя ошибка (CV): 1.6625375429926663
Train RMSE: 0.9757
Train R2: 0.8005
Test RMSE: 2.7727
Test R2: 0.4976


In [97]:
def run_model_with_optuna(model_name, X_train, y_train, X_test, y_test):

    def objective(trial):
        if model_name == "svm":
            params = {
                "C": trial.suggest_float("C", 1e-1, 100, log=True),
                "epsilon": trial.suggest_float("epsilon", 1e-3, 1.0, log=True),
                "kernel": trial.suggest_categorical("kernel", ["rbf"])
            }
            model = SVR(**params)

        elif model_name == "sgd":
            params = {
                "alpha": trial.suggest_float("alpha", 1e-5, 1e-1, log=True),
                "penalty": trial.suggest_categorical("penalty", ["l2", "elasticnet"]),
                "max_iter": 1000
            }
            model = SGDRegressor(**params)

        elif model_name == "knn":
            params = {
                "n_neighbors": trial.suggest_int("n_neighbors", 3, 20),
                "weights": trial.suggest_categorical("weights", ["uniform", "distance"]),
            }
            model = KNeighborsRegressor(**params)

        elif model_name == "dt":
            params = {
                "max_depth": trial.suggest_int("max_depth", 2, 20),
                "min_samples_split": trial.suggest_int("min_samples_split", 2, 10)
            }
            model = DecisionTreeRegressor(**params)

        elif model_name == "rf":
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 50, 300),
                "max_depth": trial.suggest_int("max_depth", 2, 20),
            }
            model = RandomForestRegressor(**params)

        elif model_name == "et":
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 50, 300),
                "max_depth": trial.suggest_int("max_depth", 2, 20),
            }
            model = ExtraTreesRegressor(**params)

        elif model_name == "xgb":
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 50, 300),
                "max_depth": trial.suggest_int("max_depth", 2, 10),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
                "subsample": trial.suggest_float("subsample", 0.5, 1.0),
                "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            }
            model = XGBRegressor(**params, objective='reg:squarederror', verbosity=0)

        elif model_name == "lgbm":
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 50, 300),
                "max_depth": trial.suggest_int("max_depth", -1, 20),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
                "num_leaves": trial.suggest_int("num_leaves", 20, 150),
                "subsample": trial.suggest_float("subsample", 0.5, 1.0),
                "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            }
            model = LGBMRegressor(**params)

        elif model_name == "mlp":
            params = {
                "hidden_layer_sizes": trial.suggest_categorical("hidden_layer_sizes", [(64,), (128,), (64, 64)]),
                "alpha": trial.suggest_float("alpha", 1e-5, 1e-1, log=True),
                "learning_rate_init": trial.suggest_float("learning_rate_init", 1e-4, 1e-2),
                "max_iter": 1000
            }
            model = MLPRegressor(**params)

        else:
            raise ValueError(f"Unknown model: {model_name}")

        # Кросс-валидация
        score = cross_val_score(
            model,
            X_train,
            y_train,
            cv=5,
            scoring="neg_root_mean_squared_error",
            n_jobs=-1
        )
        return score.mean()

    print(f"\n🔍 Оптимизация модели: {model_name}")
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=50)

    print("🏆 Лучшие гиперпараметры:", study.best_params)
    print("📉 Лучшая средняя ошибка (CV):", -study.best_value)

    # Обучаем модель с лучшими параметрами

    model_classes = {
        "svm": SVR,
        "sgd": SGDRegressor,
        "knn": KNeighborsRegressor,
        "dt": DecisionTreeRegressor,
        "rf": RandomForestRegressor,
        "et": ExtraTreesRegressor,
        "xgb": XGBRegressor,
        "lgbm": LGBMRegressor,
        "mlp": MLPRegressor
    }
    best_model = model_classes[model_name](**study.best_params)

    best_model.fit(X_train, y_train)

    # 🔍 Оценка модели
    print("📊 Оценка модели на train/test:")
    evaluate_model(best_model, X_train, y_train, X_test, y_test)


In [114]:
for model in ["lgbm"]:
    run_model_with_optuna(model, X_train_transformed, y_train, X_test_transformed, y_test)

[I 2025-04-17 02:48:15,649] A new study created in memory with name: no-name-df10ada3-c973-4a8c-8442-1a9857f52a3d
[I 2025-04-17 02:48:15,821] Trial 0 finished with value: -1.6960896423980512 and parameters: {'n_estimators': 226, 'max_depth': 19, 'learning_rate': 0.16344993638147193, 'num_leaves': 122, 'subsample': 0.5216219378836007, 'colsample_bytree': 0.6092546502178895}. Best is trial 0 with value: -1.6960896423980512.



🔍 Оптимизация модели: lgbm


[I 2025-04-17 02:48:15,939] Trial 1 finished with value: -1.7828403799838721 and parameters: {'n_estimators': 125, 'max_depth': 8, 'learning_rate': 0.23183321761462347, 'num_leaves': 36, 'subsample': 0.6167854428141397, 'colsample_bytree': 0.5474085100679746}. Best is trial 0 with value: -1.6960896423980512.
[I 2025-04-17 02:48:16,068] Trial 2 finished with value: -1.654327358506092 and parameters: {'n_estimators': 141, 'max_depth': 13, 'learning_rate': 0.12245592721145833, 'num_leaves': 89, 'subsample': 0.9229162404440199, 'colsample_bytree': 0.6833690768132368}. Best is trial 2 with value: -1.654327358506092.
[I 2025-04-17 02:48:16,162] Trial 3 finished with value: -1.7479825187568578 and parameters: {'n_estimators': 95, 'max_depth': 17, 'learning_rate': 0.2742393313169498, 'num_leaves': 44, 'subsample': 0.7478510679912089, 'colsample_bytree': 0.5495123166150259}. Best is trial 2 with value: -1.654327358506092.
[I 2025-04-17 02:48:16,312] Trial 4 finished with value: -1.6840289838842

🏆 Лучшие гиперпараметры: {'n_estimators': 152, 'max_depth': 11, 'learning_rate': 0.07759053090052767, 'num_leaves': 50, 'subsample': 0.9318155523102657, 'colsample_bytree': 0.7137131727645704}
📉 Лучшая средняя ошибка (CV): 1.6518158163223498
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000189 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2117
[LightGBM] [Info] Number of data points in the train set: 214, number of used features: 29
[LightGBM] [Info] Start training from score 4.786516
📊 Оценка модели на train/test:
Train RMSE: 0.0997
Train R2: 0.9796
Test RMSE: 3.4065
Test R2: 0.3828


In [98]:
for model in ["svm", "sgd", "knn", "dt", "rf", "et", "xgb", "lgbm", "mlp"]:
    run_model_with_optuna(model, X_train_transformed, y_train, X_test_transformed, y_test)

[I 2025-07-09 05:41:09,637] A new study created in memory with name: no-name-f818d370-7c1d-45c4-9db0-9009f4219622



🔍 Оптимизация модели: svm


[I 2025-07-09 05:41:13,535] Trial 0 finished with value: -1.8686646052059026 and parameters: {'C': 0.23566928561771353, 'epsilon': 0.0015486415716629684, 'kernel': 'rbf'}. Best is trial 0 with value: -1.8686646052059026.
[I 2025-07-09 05:41:15,184] Trial 1 finished with value: -1.7939979005093545 and parameters: {'C': 0.5231074616119769, 'epsilon': 0.004345458102411504, 'kernel': 'rbf'}. Best is trial 1 with value: -1.7939979005093545.
[I 2025-07-09 05:41:16,779] Trial 2 finished with value: -1.7887835525591584 and parameters: {'C': 5.883147587040274, 'epsilon': 0.03974938453501481, 'kernel': 'rbf'}. Best is trial 2 with value: -1.7887835525591584.
[I 2025-07-09 05:41:18,200] Trial 3 finished with value: -1.7634798433000156 and parameters: {'C': 2.729202825127747, 'epsilon': 0.18329657329767216, 'kernel': 'rbf'}. Best is trial 3 with value: -1.7634798433000156.
[I 2025-07-09 05:41:18,228] Trial 4 finished with value: -1.7949936963650663 and parameters: {'C': 18.987269168076736, 'epsilo

🏆 Лучшие гиперпараметры: {'C': 36.62408663441766, 'epsilon': 0.9418719032780994, 'kernel': 'rbf'}
📉 Лучшая средняя ошибка (CV): 1.6821246037377733
📊 Оценка модели на train/test:
Train MSE: 0.7608
Train R2: 0.8444
Test MSE: 3.9010
Test R2: 0.2932

🔍 Оптимизация модели: sgd


[I 2025-07-09 05:41:19,953] Trial 6 finished with value: -1.835256120818069 and parameters: {'alpha': 2.0554608805029356e-05, 'penalty': 'elasticnet'}. Best is trial 5 with value: -1.8348357499671342.
[I 2025-07-09 05:41:19,980] Trial 7 finished with value: -1.8355666143817686 and parameters: {'alpha': 0.00012875781447356635, 'penalty': 'elasticnet'}. Best is trial 5 with value: -1.8348357499671342.
[I 2025-07-09 05:41:20,006] Trial 8 finished with value: -1.8361068298439882 and parameters: {'alpha': 4.213413090628312e-05, 'penalty': 'l2'}. Best is trial 5 with value: -1.8348357499671342.
[I 2025-07-09 05:41:20,032] Trial 9 finished with value: -1.8390257800721137 and parameters: {'alpha': 0.01071085619699765, 'penalty': 'l2'}. Best is trial 5 with value: -1.8348357499671342.
[I 2025-07-09 05:41:20,063] Trial 10 finished with value: -1.836091672413208 and parameters: {'alpha': 0.0003034128374382909, 'penalty': 'elasticnet'}. Best is trial 5 with value: -1.8348357499671342.
[I 2025-07-0

🏆 Лучшие гиперпараметры: {'alpha': 1.6815426901230905e-05, 'penalty': 'elasticnet'}
📉 Лучшая средняя ошибка (CV): 1.8341506892994435
📊 Оценка модели на train/test:
Train MSE: 2.7507
Train R2: 0.4374
Test MSE: 3.9740
Test R2: 0.2800

🔍 Оптимизация модели: knn


[I 2025-07-09 05:41:21,558] Trial 0 finished with value: -1.7099912688140342 and parameters: {'n_neighbors': 6, 'weights': 'distance'}. Best is trial 0 with value: -1.7099912688140342.
[I 2025-07-09 05:41:21,702] Trial 1 finished with value: -1.7380600270930242 and parameters: {'n_neighbors': 13, 'weights': 'distance'}. Best is trial 0 with value: -1.7099912688140342.
[I 2025-07-09 05:41:21,848] Trial 2 finished with value: -1.8057005891294846 and parameters: {'n_neighbors': 8, 'weights': 'uniform'}. Best is trial 0 with value: -1.7099912688140342.
[I 2025-07-09 05:41:21,970] Trial 3 finished with value: -1.8592687569948076 and parameters: {'n_neighbors': 13, 'weights': 'uniform'}. Best is trial 0 with value: -1.7099912688140342.
[I 2025-07-09 05:41:21,986] Trial 4 finished with value: -1.7452554084672225 and parameters: {'n_neighbors': 19, 'weights': 'distance'}. Best is trial 0 with value: -1.7099912688140342.
[I 2025-07-09 05:41:22,002] Trial 5 finished with value: -1.70999126881403

🏆 Лучшие гиперпараметры: {'n_neighbors': 7, 'weights': 'distance'}
📉 Лучшая средняя ошибка (CV): 1.6932989829617846
📊 Оценка модели на train/test:


[I 2025-07-09 05:41:23,220] A new study created in memory with name: no-name-cb522b3a-b8d8-480f-bc5a-f12f235123ba
[I 2025-07-09 05:41:23,284] Trial 0 finished with value: -2.1688412357556652 and parameters: {'max_depth': 5, 'min_samples_split': 6}. Best is trial 0 with value: -2.1688412357556652.
[I 2025-07-09 05:41:23,344] Trial 1 finished with value: -2.2613135740043737 and parameters: {'max_depth': 9, 'min_samples_split': 3}. Best is trial 0 with value: -2.1688412357556652.
[I 2025-07-09 05:41:23,383] Trial 2 finished with value: -2.289630152368121 and parameters: {'max_depth': 7, 'min_samples_split': 10}. Best is trial 0 with value: -2.1688412357556652.


Train MSE: 0.0037
Train R2: 0.9993
Test MSE: 3.7658
Test R2: 0.3177

🔍 Оптимизация модели: dt


[I 2025-07-09 05:41:23,420] Trial 3 finished with value: -2.1701278184702515 and parameters: {'max_depth': 5, 'min_samples_split': 10}. Best is trial 0 with value: -2.1688412357556652.
[I 2025-07-09 05:41:23,436] Trial 4 finished with value: -2.097095027768707 and parameters: {'max_depth': 3, 'min_samples_split': 3}. Best is trial 4 with value: -2.097095027768707.
[I 2025-07-09 05:41:23,463] Trial 5 finished with value: -2.097095027768707 and parameters: {'max_depth': 3, 'min_samples_split': 4}. Best is trial 4 with value: -2.097095027768707.
[I 2025-07-09 05:41:23,490] Trial 6 finished with value: -2.286960136587214 and parameters: {'max_depth': 19, 'min_samples_split': 2}. Best is trial 4 with value: -2.097095027768707.
[I 2025-07-09 05:41:23,517] Trial 7 finished with value: -2.285257324319812 and parameters: {'max_depth': 15, 'min_samples_split': 10}. Best is trial 4 with value: -2.097095027768707.
[I 2025-07-09 05:41:23,542] Trial 8 finished with value: -2.243128272693842 and para

🏆 Лучшие гиперпараметры: {'max_depth': 2, 'min_samples_split': 7}
📉 Лучшая средняя ошибка (CV): 2.048327182366868
📊 Оценка модели на train/test:
Train MSE: 3.3807
Train R2: 0.3086
Test MSE: 4.5106
Test R2: 0.1827

🔍 Оптимизация модели: rf


[I 2025-07-09 05:41:26,141] Trial 0 finished with value: -1.729289385843638 and parameters: {'n_estimators': 243, 'max_depth': 20}. Best is trial 0 with value: -1.729289385843638.
[I 2025-07-09 05:41:26,671] Trial 1 finished with value: -1.7703705754378223 and parameters: {'n_estimators': 120, 'max_depth': 9}. Best is trial 0 with value: -1.729289385843638.
[I 2025-07-09 05:41:27,632] Trial 2 finished with value: -1.728468126916603 and parameters: {'n_estimators': 233, 'max_depth': 15}. Best is trial 2 with value: -1.728468126916603.
[I 2025-07-09 05:41:28,193] Trial 3 finished with value: -1.7319373050196798 and parameters: {'n_estimators': 129, 'max_depth': 14}. Best is trial 2 with value: -1.728468126916603.
[I 2025-07-09 05:41:29,082] Trial 4 finished with value: -1.7249029734205166 and parameters: {'n_estimators': 219, 'max_depth': 20}. Best is trial 4 with value: -1.7249029734205166.
[I 2025-07-09 05:41:29,310] Trial 5 finished with value: -1.7176363344526073 and parameters: {'n_

🏆 Лучшие гиперпараметры: {'n_estimators': 54, 'max_depth': 15}
📉 Лучшая средняя ошибка (CV): 1.7029324859360817


[I 2025-07-09 05:41:56,425] A new study created in memory with name: no-name-5124a91d-8047-4c05-8efe-f7aef3266796
[I 2025-07-09 05:41:56,578] Trial 0 finished with value: -1.7074564284127554 and parameters: {'n_estimators': 90, 'max_depth': 7}. Best is trial 0 with value: -1.7074564284127554.


📊 Оценка модели на train/test:
Train MSE: 0.3581
Train R2: 0.9268
Test MSE: 3.7166
Test R2: 0.3266

🔍 Оптимизация модели: et


[I 2025-07-09 05:41:57,011] Trial 1 finished with value: -1.6290352708536016 and parameters: {'n_estimators': 186, 'max_depth': 17}. Best is trial 1 with value: -1.6290352708536016.
[I 2025-07-09 05:41:57,517] Trial 2 finished with value: -1.637262311741996 and parameters: {'n_estimators': 230, 'max_depth': 13}. Best is trial 1 with value: -1.6290352708536016.
[I 2025-07-09 05:41:58,168] Trial 3 finished with value: -1.6386137810925618 and parameters: {'n_estimators': 300, 'max_depth': 13}. Best is trial 1 with value: -1.6290352708536016.
[I 2025-07-09 05:41:58,459] Trial 4 finished with value: -1.7969760338207401 and parameters: {'n_estimators': 240, 'max_depth': 4}. Best is trial 1 with value: -1.6290352708536016.
[I 2025-07-09 05:41:59,115] Trial 5 finished with value: -1.637417048279565 and parameters: {'n_estimators': 293, 'max_depth': 20}. Best is trial 1 with value: -1.6290352708536016.
[I 2025-07-09 05:41:59,245] Trial 6 finished with value: -1.6605762116405618 and parameters: 

🏆 Лучшие гиперпараметры: {'n_estimators': 173, 'max_depth': 14}
📉 Лучшая средняя ошибка (CV): 1.6132522345151152


[I 2025-07-09 05:42:15,178] A new study created in memory with name: no-name-74f83b02-c1a3-4892-904b-55b672af3821


📊 Оценка модели на train/test:
Train MSE: 0.0177
Train R2: 0.9964
Test MSE: 3.8181
Test R2: 0.3082

🔍 Оптимизация модели: xgb


[I 2025-07-09 05:42:16,453] Trial 0 finished with value: -1.754510717084828 and parameters: {'n_estimators': 114, 'max_depth': 8, 'learning_rate': 0.15156036856983432, 'subsample': 0.6858815116990705, 'colsample_bytree': 0.941080282910509}. Best is trial 0 with value: -1.754510717084828.
[I 2025-07-09 05:42:17,869] Trial 1 finished with value: -1.7686862015495515 and parameters: {'n_estimators': 263, 'max_depth': 7, 'learning_rate': 0.20188664737355874, 'subsample': 0.917386535939889, 'colsample_bytree': 0.7547012325621414}. Best is trial 0 with value: -1.754510717084828.
[I 2025-07-09 05:42:19,045] Trial 2 finished with value: -1.9443659638911328 and parameters: {'n_estimators': 52, 'max_depth': 8, 'learning_rate': 0.2196272015193242, 'subsample': 0.5192472448220917, 'colsample_bytree': 0.5225920056558466}. Best is trial 0 with value: -1.754510717084828.
[I 2025-07-09 05:42:19,488] Trial 3 finished with value: -1.772027314194244 and parameters: {'n_estimators': 58, 'max_depth': 5, 'le

🏆 Лучшие гиперпараметры: {'n_estimators': 160, 'max_depth': 4, 'learning_rate': 0.10735686584852265, 'subsample': 0.801240445353836, 'colsample_bytree': 0.5903669202410528}
📉 Лучшая средняя ошибка (CV): 1.6491912679677747


[I 2025-07-09 05:42:32,787] A new study created in memory with name: no-name-9ba3e8b7-7c42-4af8-b17e-d808b559c843


📊 Оценка модели на train/test:
Train MSE: 0.0094
Train R2: 0.9981
Test MSE: 4.4553
Test R2: 0.1928

🔍 Оптимизация модели: lgbm


[I 2025-07-09 05:42:33,386] Trial 0 finished with value: -1.8864979597342508 and parameters: {'n_estimators': 75, 'max_depth': 15, 'learning_rate': 0.02338592890821179, 'num_leaves': 113, 'subsample': 0.7410649984269623, 'colsample_bytree': 0.967248426970347}. Best is trial 0 with value: -1.8864979597342508.
[I 2025-07-09 05:42:33,843] Trial 1 finished with value: -1.751806001791661 and parameters: {'n_estimators': 178, 'max_depth': 17, 'learning_rate': 0.1093110920405383, 'num_leaves': 32, 'subsample': 0.7612651646217969, 'colsample_bytree': 0.8411074354265697}. Best is trial 1 with value: -1.751806001791661.
[I 2025-07-09 05:42:34,313] Trial 2 finished with value: -1.7278995177744652 and parameters: {'n_estimators': 177, 'max_depth': 8, 'learning_rate': 0.07914091210426902, 'num_leaves': 111, 'subsample': 0.6948713819109151, 'colsample_bytree': 0.7198357705085509}. Best is trial 2 with value: -1.7278995177744652.
[I 2025-07-09 05:42:34,679] Trial 3 finished with value: -1.72025204937

🏆 Лучшие гиперпараметры: {'n_estimators': 217, 'max_depth': 0, 'learning_rate': 0.1515581633323828, 'num_leaves': 133, 'subsample': 0.7031848750970534, 'colsample_bytree': 0.5520326667065882}
📉 Лучшая средняя ошибка (CV): 1.689419817597208
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000192 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2187
[LightGBM] [Info] Number of data points in the train set: 214, number of used features: 30
[LightGBM] [Info] Start training from score 4.786516
📊 Оценка модели на train/test:
Train MSE: 0.0081
Train R2: 0.9983
Test MSE: 4.4072
Test R2: 0.2015

🔍 Оптимизация модели: mlp


[I 2025-07-09 05:42:43,101] Trial 0 finished with value: -2.3952934311039185 and parameters: {'hidden_layer_sizes': (128,), 'alpha': 0.0001596396931336731, 'learning_rate_init': 0.003963973580379033}. Best is trial 0 with value: -2.3952934311039185.
[I 2025-07-09 05:42:44,147] Trial 1 finished with value: -1.743732536492903 and parameters: {'hidden_layer_sizes': (64, 64), 'alpha': 0.0006795385774992075, 'learning_rate_init': 0.0002035008939895783}. Best is trial 1 with value: -1.743732536492903.
[I 2025-07-09 05:42:44,622] Trial 2 finished with value: -2.318667677147823 and parameters: {'hidden_layer_sizes': (128,), 'alpha': 0.0044556320369275836, 'learning_rate_init': 0.00862543137453122}. Best is trial 1 with value: -1.743732536492903.
[I 2025-07-09 05:42:45,106] Trial 3 finished with value: -2.6239771295531193 and parameters: {'hidden_layer_sizes': (128,), 'alpha': 0.00037523831556514196, 'learning_rate_init': 0.00976876399627731}. Best is trial 1 with value: -1.743732536492903.
[I 

🏆 Лучшие гиперпараметры: {'hidden_layer_sizes': (64, 64), 'alpha': 2.474878556121419e-05, 'learning_rate_init': 0.00019736337496229997}
📉 Лучшая средняя ошибка (CV): 1.7041128376302026
📊 Оценка модели на train/test:
Train MSE: 2.9648
Train R2: 0.3937
Test MSE: 4.4028
Test R2: 0.2023




In [53]:
svr_learning(X_train_scaled, y_train, X_test_scaled, y_test)

Train RMSE: 3.7003
Train R2: 0.2432
Test RMSE: 4.5344
Test R2: 0.1784


In [24]:
y = np.log1p(X_numerical_filtered_no_outliers[target]).reset_index(drop=True)

for name, df in dfs_dict.items():
    print(f"\n=== {name} ===")

    X = pd.concat([df, X_cell_line_filtered], axis=1)

    X.columns = pd.Index(X.columns).map(str)
    
    X_filtered_var = apply_varThreshold(X, 0)
    # X_corr = apply_corr(X_filtered_var, 0.2)
    X_train, X_test, y_train, y_test = train_test_split(X_filtered_var, y, test_size=0.2, random_state=42)

    X_train_scaled, X_test_scaled = apply_scaler (X_train, X_test)

    # 1. Создаем пайплайн, объединяющий UMAP и SVR
    # Важно: UMAP должен быть первым шагом
    pipeline = Pipeline([
        ('umap', umap.UMAP(random_state=42)),
        ('svr', SVR(kernel = 'rbf'))
    ])

    # 2. Определяем пространство поиска гиперпараметров
    # Используем распределения для Randomized Search
    param_distributions = {
        'umap__n_components': randint(5, 151), # Пример значений
        'umap__n_neighbors': randint(5, 76),       # Пример значений
        'umap__min_dist': uniform(0.0, 0.5),         # Пример значений
        'umap__metric': ['euclidean', 'cosine', 'correlation'], # Пример метрик

        # Параметры SVR тоже включаем в поиск!
        'svr__C': loguniform(1.0, 1e2),          # от 1 до 100
        'svr__epsilon': loguniform(1e-3, 1.0),    # от 0.001 до 1.0
        'svr__gamma': ['scale', 'auto'] + list(np.logspace(-4, 1, 6)) # Например, 'scale', 'auto' и значения
    }

    # 3. Определяем метрику для оценки
    # Для регрессии часто используют отрицательные метрики ошибок для оптимизации
    # (чем меньше ошибка, тем лучше -> чем больше отрицательная ошибка, тем лучше)
    scoring_metric = make_scorer(mean_squared_error, greater_is_better=False) # Оптимизируем по MSE (минимизируем)

    # 4. Настраиваем кросс-валидацию
    kf = KFold(n_splits=5, shuffle=True, random_state=42) # Например, 5 фолдов

    # 5. Создаем объект RandomizedSearchCV
    # n_iter - количество комбинаций для проверки (увеличьте, если есть ресурсы)
    # cv - объект кросс-валидации или число фолдов
    # scoring - метрика для оценки
    # verbose - для вывода информации о процессе
    # n_jobs - количество ядер для параллельных вычислений (-1 использует все доступные)
    random_search = RandomizedSearchCV(
        pipeline,
        param_distributions=param_distributions,
        n_iter=200, # Проверим 50 случайных комбинаций
        cv=5,      # 5-кратная кросс-валидация
        scoring=scoring_metric,
        verbose=2,
        random_state=42,
        n_jobs=-1
    )

    # 6. Запускаем поиск гиперпараметров на тренировочных данных
    print("Запуск Randomized Search для пайплайна UMAP+SVR...")
    random_search.fit(X_train_scaled, y_train)
    print("Поиск завершен.")

    # 7. Получаем лучшие параметры и лучший результат CV
    print("\nЛучшие параметры найдены:")
    print(random_search.best_params_)
    print(f"\nЛучший результат кросс-валидации (Negative MSE): {random_search.best_score_:.4f}")
    print(f"Соответствующий MSE: {-random_search.best_score_:.4f}")

    # 8. Лучший обученный пайплайн доступен как random_search.best_estimator_
    # Он уже обучен на всем X_train с лучшими параметрами.
    # --- Добавляем оценку на данных ---
    print("\nОценка лучшей модели на тренировочных данных:")
    # Получаем предсказания лучшей модели на тренировочных данных
    y_train_pred = random_search.best_estimator_.predict(X_train_scaled)

    # Рассчитываем MSE на тренировочных данных
    train_mse = mean_squared_error(y_train, y_train_pred)
    # Рассчитываем R2 на тренировочных данных
    train_r2 = r2_score(y_train, y_train_pred)

    # Выводим метрики для тренировочных данных
    print(f"MSE на X_train: {train_mse:.4f}")
    print(f"R2 на X_train: {train_r2:.4f}")

    # Оцениваем его на отложенном тестовом наборе
    print("\nОценка лучшей модели на тестовых данных:")
    y_pred = random_search.best_estimator_.predict(X_test_scaled)
    final_mse = mean_squared_error(y_test, y_pred)
    print(f"MSE на X_test: {final_mse:.4f}")
    final_r2 = r2_score(y_test, y_pred)
    print(f"R2 на X_test: {final_r2:.4f}")


=== RDKit_Descriptors ===

Датафрейм после отсева по дисперсии (метод Scikit-learn):
           0         1          2          3         4         5         6  \
0  3151.8362 -1.492308  11.608322  12.577710  0.483758  0.192308  1.021711   
1  2785.1939 -1.272727  11.839377   5.637233  0.256238  0.136364  1.000826   
2  2078.3658 -1.123529  11.824485   3.657547  0.215150  0.000000  0.983251   
3  3208.5313 -0.066667   4.139095  -2.395473 -0.079849  0.100000  0.993229   
4  2624.9800  0.100000   4.783081  -1.396580 -0.053715  0.000000  0.999597   

          7         8     9  ...  cell_line_MDA-MB-435S cells  \
0  0.269231  1.000000   7.0  ...                          0.0   
1  0.272727  0.727273   7.0  ...                          0.0   
2  0.235294  0.647059   4.0  ...                          0.0   
3  0.366667  1.033333  13.0  ...                          0.0   
4  0.461538  1.192308  12.0  ...                          0.0   

   cell_line_N. tabacum cells  cell_line_NIH-3T3 cells

  sqr = np.multiply(arr, arr, out=arr, where=where)
  sqr = _ensure_numeric((avg - values) ** 2)


                 0           1           2           3           4  \
count   268.000000  268.000000  268.000000  268.000000  268.000000   
mean   1970.765109   -1.549609   10.064478    4.185431    0.294253   
std    1027.891708    1.349577    2.217002    4.505073    0.257694   
min      89.093200   -4.500000    4.050028   -6.131594   -0.392198   
25%    1321.481800   -2.355833    8.956109    0.652760    0.091271   
50%    1814.091950   -1.362121   10.650902    3.576524    0.284260   
75%    2333.846750   -0.684295   11.999968    5.679305    0.457545   
max    8511.835100    2.475000   11.999968   35.776307    0.944656   

                5           6           7           8           9  ...  \
count  268.000000  268.000000  268.000000  268.000000  268.000000  ...   
mean     0.095923    0.996992    0.193904    0.681644    4.343284  ...   
std      0.112545    0.019976    0.182269    0.336047    3.891034  ...   
min      0.000000    0.933810    0.000000    0.000000    0.000000  ...   

  warn(


Поиск завершен.

Лучшие параметры найдены:
{'svr__C': 12.814703696157855, 'svr__epsilon': 0.8089237879387583, 'svr__gamma': 'auto', 'umap__metric': 'correlation', 'umap__min_dist': 0.31469931906763127, 'umap__n_components': 18, 'umap__n_neighbors': 19}

Лучший результат кросс-валидации (Negative MSE): -3.2599
Соответствующий MSE: 3.2599

Оценка лучшей модели на тренировочных данных:
MSE на X_train: 2.7961
R2 на X_train: 0.4282

Оценка лучшей модели на тестовых данных:
MSE на X_test: 4.1142
R2 на X_test: 0.2546

=== BLOMAP_Embeddings ===

Датафрейм после отсева по дисперсии (метод Scikit-learn):
      0     1    2     3     5     6    7     8    10   11  ...  \
0  0.62  0.29  0.0 -0.06 -1.50  0.00  2.9  0.00 -1.50  0.0  ...   
1  0.62  0.29  0.0 -0.06 -2.53  0.00  3.0  1.17 -1.50  0.0  ...   
2  0.62  0.29  0.0 -0.06 -0.74  0.00  1.4  0.00 -2.53  0.0  ...   
3  0.62  0.29  0.0 -0.06 -0.90  0.46  1.6  0.00  1.08 -0.1  ...   
4  0.62  0.29  0.0 -0.06 -0.74  0.00  1.4  0.00 -1.50  0.0  ...



                0           1           2           3           5           6  \
count  268.000000  268.000000  268.000000  268.000000  268.000000  268.000000   
mean    -0.660112   -0.003433    1.272761    0.275821   -0.465746   -0.033284   
std      1.316615    0.166192    1.406847    0.501976    1.349090    0.169254   
min     -2.530000   -0.300000    0.000000   -0.060000   -2.530000   -0.300000   
25%     -1.500000   -0.100000    0.000000    0.000000   -1.500000   -0.100000   
50%     -0.400000    0.000000    0.200000    0.000000   -0.180000    0.000000   
75%      0.290000    0.000000    2.900000    0.000000    0.682500    0.000000   
max      1.380000    0.580000    3.000000    1.170000    1.380000    0.580000   

                7           8          10          11  ...  \
count  268.000000  268.000000  268.000000  268.000000  ...   
mean     1.161940    0.238321   -0.448470   -0.025709  ...   
std      1.361306    0.474427    1.330883    0.225169  ...   
min      0.000000   -0

  warn(


Поиск завершен.

Лучшие параметры найдены:
{'svr__C': 25.00238716980236, 'svr__epsilon': 0.0036954269299859697, 'svr__gamma': 0.01, 'umap__metric': 'correlation', 'umap__min_dist': 0.4806576440924537, 'umap__n_components': 35, 'umap__n_neighbors': 27}

Лучший результат кросс-валидации (Negative MSE): -3.5998
Соответствующий MSE: 3.5998

Оценка лучшей модели на тренировочных данных:
MSE на X_train: 3.4976
R2 на X_train: 0.2847

Оценка лучшей модели на тестовых данных:
MSE на X_test: 5.2941
R2 на X_test: 0.0408

=== Fingerprints_Embeddings ===

Датафрейм после отсева по дисперсии (метод Scikit-learn):
    11   22   23   27   36   41   56   70   72   79  ...  \
0  0.0  0.0  1.0  0.0  0.0  1.0  0.0  0.0  0.0  1.0  ...   
1  1.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  1.0  ...   
2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  ...   
3  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  1.0  1.0  ...   
4  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  ...   

   cell_line_MDA-MB-435S cells  



               11          22          23          27          36          41  \
count  268.000000  268.000000  268.000000  268.000000  268.000000  268.000000   
mean     0.227612    0.003731    0.007463    0.037313    0.044776    0.294776   
std      0.420075    0.061085    0.086225    0.189883    0.207199    0.456795   
min      0.000000    0.000000    0.000000    0.000000    0.000000    0.000000   
25%      0.000000    0.000000    0.000000    0.000000    0.000000    0.000000   
50%      0.000000    0.000000    0.000000    0.000000    0.000000    0.000000   
75%      0.000000    0.000000    0.000000    0.000000    0.000000    1.000000   
max      1.000000    1.000000    1.000000    1.000000    1.000000    1.000000   

               56          70          72          79  ...  \
count  268.000000  268.000000  268.000000  268.000000  ...   
mean     0.003731    0.003731    0.014925    0.727612  ...   
std      0.061085    0.061085    0.121481    0.446021  ...   
min      0.000000    0

  warn(


Поиск завершен.

Лучшие параметры найдены:
{'svr__C': 1.8536616154169725, 'svr__epsilon': 0.0012199668475623266, 'svr__gamma': 1.0, 'umap__metric': 'euclidean', 'umap__min_dist': 0.31015477567673233, 'umap__n_components': 70, 'umap__n_neighbors': 55}

Лучший результат кросс-валидации (Negative MSE): -3.6756
Соответствующий MSE: 3.6756

Оценка лучшей модели на тренировочных данных:
MSE на X_train: 2.0501
R2 на X_train: 0.5807

Оценка лучшей модели на тестовых данных:
MSE на X_test: 4.4579
R2 на X_test: 0.1923

=== ProtBERT_Embeddings ===

Датафрейм после отсева по дисперсии (метод Scikit-learn):
         0         1         2         3         4         5         6  \
0  0.03056  0.024347  0.136336  0.032939 -0.065259 -0.135983 -0.046607   
1  0.03056  0.024347  0.136336  0.032939 -0.065259 -0.135983 -0.046607   
2  0.03056  0.024347  0.136336  0.032939 -0.065259 -0.135983 -0.046607   
3  0.03056  0.024347  0.136336  0.032939 -0.065259 -0.135983 -0.046607   
4  0.03056  0.024347  0.1363



                0           1           2           3           4           5  \
count  268.000000  268.000000  268.000000  268.000000  268.000000  268.000000   
mean     0.030560    0.024347    0.136336    0.032939   -0.065260   -0.135982   
std      0.000003    0.000007    0.000007    0.000004    0.000014    0.000008   
min      0.030527    0.024347    0.136259    0.032896   -0.065424   -0.135983   
25%      0.030560    0.024347    0.136336    0.032939   -0.065259   -0.135983   
50%      0.030560    0.024347    0.136336    0.032939   -0.065259   -0.135983   
75%      0.030560    0.024347    0.136336    0.032939   -0.065259   -0.135983   
max      0.030560    0.024433    0.136337    0.032939   -0.065259   -0.135885   

                6           7           8           9  ...  \
count  268.000000  268.000000  268.000000  268.000000  ...   
mean    -0.046606    0.032417    0.011023   -0.019774  ...   
std      0.000008    0.000007    0.000008    0.000008  ...   
min     -0.046607    0

  warn(


Поиск завершен.

Лучшие параметры найдены:
{'svr__C': 5.203857985647047, 'svr__epsilon': 0.00219115336741892, 'svr__gamma': 'auto', 'umap__metric': 'cosine', 'umap__min_dist': 0.26015385045189665, 'umap__n_components': 122, 'umap__n_neighbors': 7}

Лучший результат кросс-валидации (Negative MSE): -3.4849
Соответствующий MSE: 3.4849

Оценка лучшей модели на тренировочных данных:
MSE на X_train: 3.3635
R2 на X_train: 0.3121

Оценка лучшей модели на тестовых данных:
MSE на X_test: 4.5766
R2 на X_test: 0.1708

=== Combined_All_Features ===

Датафрейм после отсева по дисперсии (метод Scikit-learn):
     0_rdkit   1_rdkit    2_rdkit    3_rdkit   4_rdkit   5_rdkit   6_rdkit  \
0  3151.8362 -1.492308  11.608322  12.577710  0.483758  0.192308  1.021711   
1  2785.1939 -1.272727  11.839377   5.637233  0.256238  0.136364  1.000826   
2  2078.3658 -1.123529  11.824485   3.657547  0.215150  0.000000  0.983251   
3  3208.5313 -0.066667   4.139095  -2.395473 -0.079849  0.100000  0.993229   
4  2624.9

  sqr = np.multiply(arr, arr, out=arr, where=where)
  sqr = _ensure_numeric((avg - values) ** 2)


           0_rdkit     1_rdkit     2_rdkit     3_rdkit     4_rdkit  \
count   268.000000  268.000000  268.000000  268.000000  268.000000   
mean   1970.765109   -1.549609   10.064478    4.185431    0.294253   
std    1027.891708    1.349577    2.217002    4.505073    0.257694   
min      89.093200   -4.500000    4.050028   -6.131594   -0.392198   
25%    1321.481800   -2.355833    8.956109    0.652760    0.091271   
50%    1814.091950   -1.362121   10.650902    3.576524    0.284260   
75%    2333.846750   -0.684295   11.999968    5.679305    0.457545   
max    8511.835100    2.475000   11.999968   35.776307    0.944656   

          5_rdkit     6_rdkit     7_rdkit     8_rdkit     9_rdkit  ...  \
count  268.000000  268.000000  268.000000  268.000000  268.000000  ...   
mean     0.095923    0.996992    0.193904    0.681644    4.343284  ...   
std      0.112545    0.019976    0.182269    0.336047    3.891034  ...   
min      0.000000    0.933810    0.000000    0.000000    0.000000  ...   

  warn(


Поиск завершен.

Лучшие параметры найдены:
{'svr__C': 2.54754843343881, 'svr__epsilon': 0.6738460883888424, 'svr__gamma': 1.0, 'umap__metric': 'correlation', 'umap__min_dist': 0.3473924665198523, 'umap__n_components': 32, 'umap__n_neighbors': 70}

Лучший результат кросс-валидации (Negative MSE): -3.6723
Соответствующий MSE: 3.6723

Оценка лучшей модели на тренировочных данных:
MSE на X_train: 1.6785
R2 на X_train: 0.6567

Оценка лучшей модели на тестовых данных:
MSE на X_test: 4.1997
R2 на X_test: 0.2391


