<a href="https://colab.research.google.com/github/ttk66/Chem_analyzis/blob/main/si_median_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
import warnings
warnings.filterwarnings("ignore")

# Load the dataset and impute missing numeric values with per-column medians.
# NOTE(review): the medians are computed over the whole dataset (train + test);
# kept as-is so that `df` stays fully imputed for any later cells.
df = pd.read_csv("final_filtered_data.csv")
df.fillna(df.median(numeric_only=True), inplace=True)

# Binary target: 1 when log_SI is strictly above the dataset median, else 0.
target = (df["log_SI"] > df["log_SI"].median()).astype(int)

# Drop the columns the target is derived from (or directly tied to) so the
# models cannot read the label straight off the features.
exclude_cols = ["SI", "log_SI", "IC50", "log_IC50", "CC50", "log_CC50"]
feature_cols = df.select_dtypes(include=[np.number]).drop(columns=exclude_cols).columns
X = df[feature_cols]

# Split BEFORE scaling: fitting the scaler on the full matrix would leak the
# test rows' mean/variance into the training features. The same random_state
# and sample count yield the same row assignment as splitting a scaled array.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, target, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply it to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix under the train-fitted scaling, kept for any code that
# still refers to `X_scaled`.
X_scaled = scaler.transform(X)

# Candidate models, each paired with the hyperparameter grid to search.
# An empty grid means the estimator is fitted as-is, without a grid search.
model_configs = {
    # max_iter is raised above the default of 100: warnings are silenced in
    # this script, so a non-converged fit would otherwise go unnoticed.
    "Logistic Regression": (LogisticRegression(max_iter=1000), {}),
    "Random Forest": (RandomForestClassifier(random_state=42), {
        "n_estimators": [100, 200],
        "max_depth": [None, 10]
    }),
    "Gradient Boosting": (GradientBoostingClassifier(random_state=42), {
        "n_estimators": [100],
        "learning_rate": [0.05, 0.1],
        "max_depth": [3, 5]
    }),
    "XGBoost": (XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42), {
        "n_estimators": [100],
        "max_depth": [3, 6],
        "learning_rate": [0.05, 0.1]
    }),
    # probability=True calibrates probabilities with an internal shuffled
    # cross-validation; seed it like the other models so ROC AUC is reproducible.
    "SVM": (SVC(probability=True, random_state=42), {
        "C": [0.1, 1, 10],
        "kernel": ["linear", "rbf"]
    })
}

# Fit every configured model and print its metrics on the held-out test set.
for name, (model, params) in model_configs.items():
    # Only models with a non-empty grid are tuned via 3-fold grid search on F1.
    if params:
        model = GridSearchCV(model, params, cv=3, scoring="f1", n_jobs=-1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # ROC AUC is scored on positive-class probabilities when the model exposes
    # them; otherwise the hard predictions are used instead.
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)[:, 1]
    else:
        y_proba = y_pred

    print(f"\nМодель: {name}")
    if isinstance(model, GridSearchCV):
        print("Лучшие параметры:", model.best_params_)

    # Label-based metrics share the same call signature, so report them in order.
    label_metrics = (
        ("Accuracy:", accuracy_score),
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1 Score:", f1_score),
    )
    for label, metric in label_metrics:
        print(label, metric(y_test, y_pred))
    print("ROC AUC:", roc_auc_score(y_test, y_proba))
    print(classification_report(y_test, y_pred, target_names=["Class 0", "Class 1"]))



Модель: Logistic Regression
Accuracy: 0.6567164179104478
Precision: 0.625
Recall: 0.6451612903225806
F1 Score: 0.6349206349206349
ROC AUC: 0.6702508960573477
              precision    recall  f1-score   support

     Class 0       0.69      0.67      0.68       108
     Class 1       0.62      0.65      0.63        93

    accuracy                           0.66       201
   macro avg       0.66      0.66      0.66       201
weighted avg       0.66      0.66      0.66       201


Модель: Random Forest
Лучшие параметры: {'max_depth': None, 'n_estimators': 100}
Accuracy: 0.6865671641791045
Precision: 0.6630434782608695
Recall: 0.6559139784946236
F1 Score: 0.6594594594594595
ROC AUC: 0.7288928713659896
              precision    recall  f1-score   support

     Class 0       0.71      0.71      0.71       108
     Class 1       0.66      0.66      0.66        93

    accuracy                           0.69       201
   macro avg       0.68      0.68      0.68       201
weighted avg     