<a href="https://colab.research.google.com/github/ttk66/Chem_analyzis/blob/main/IC50_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings("ignore")

# Загрузка данных
df = pd.read_csv("final_filtered_data.csv")
df.fillna(df.median(numeric_only=True), inplace=True)

# Подготовка данных
target = (df["log_IC50"] > df["log_IC50"].median()).astype(int)
feature_cols = df.select_dtypes(include=[np.number]).drop(columns=["IC50", "CC50", "SI", "log_IC50", "log_CC50", "log_SI"]).columns
X = df[feature_cols]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, target, test_size=0.2, random_state=42)

models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else y_pred

    print(f"\nЗадача: log_IC50 > median, Модель: {model_name}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("F1 Score:", f1_score(y_test, y_pred))
    print("ROC AUC:", roc_auc_score(y_test, y_proba))
    print(classification_report(y_test, y_pred, target_names=["Class 0", "Class 1"]))



Задача: log_IC50 > median, Модель: Logistic Regression
Accuracy: 0.7014925373134329
Precision: 0.7156862745098039
Recall: 0.7019230769230769
F1 Score: 0.7087378640776699
ROC AUC: 0.7570876288659794
              precision    recall  f1-score   support

     Class 0       0.69      0.70      0.69        97
     Class 1       0.72      0.70      0.71       104

    accuracy                           0.70       201
   macro avg       0.70      0.70      0.70       201
weighted avg       0.70      0.70      0.70       201


Задача: log_IC50 > median, Модель: Random Forest
Accuracy: 0.7412935323383084
Precision: 0.782608695652174
Recall: 0.6923076923076923
F1 Score: 0.7346938775510204
ROC AUC: 0.7856363996827914
              precision    recall  f1-score   support

     Class 0       0.71      0.79      0.75        97
     Class 1       0.78      0.69      0.73       104

    accuracy                           0.74       201
   macro avg       0.74      0.74      0.74       201
weighted a