<a href="https://colab.research.google.com/github/ttk66/Chem_analyzis/blob/main/SI%3E8_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings("ignore")

df = pd.read_csv("final_filtered_data.csv")
df.fillna(df.median(numeric_only=True), inplace=True)

threshold = np.log(8)  # около 2.08
target = (df["log_SI"] > threshold).astype(int)
feature_cols = df.select_dtypes(include=[np.number]).drop(columns=["IC50", "CC50", "SI", "log_IC50", "log_CC50", "log_SI"]).columns
X = df[feature_cols]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, target, test_size=0.2, random_state=42)

models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else y_pred

    print(f"\nЗадача: log_SI > ln(8), Модель: {model_name}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("F1 Score:", f1_score(y_test, y_pred))
    print("ROC AUC:", roc_auc_score(y_test, y_proba))
    print(classification_report(y_test, y_pred, target_names=["Class 0", "Class 1"]))


Задача: log_SI > ln(8), Модель: Logistic Regression
Accuracy: 0.681592039800995
Precision: 0.578125
Recall: 0.5
F1 Score: 0.5362318840579711
ROC AUC: 0.6870610768248564
              precision    recall  f1-score   support

     Class 0       0.73      0.79      0.76       127
     Class 1       0.58      0.50      0.54        74

    accuracy                           0.68       201
   macro avg       0.65      0.64      0.65       201
weighted avg       0.67      0.68      0.68       201


Задача: log_SI > ln(8), Модель: Random Forest
Accuracy: 0.7213930348258707
Precision: 0.6607142857142857
Recall: 0.5
F1 Score: 0.5692307692307692
ROC AUC: 0.7204724409448819
              precision    recall  f1-score   support

     Class 0       0.74      0.85      0.79       127
     Class 1       0.66      0.50      0.57        74

    accuracy                           0.72       201
   macro avg       0.70      0.68      0.68       201
weighted avg       0.71      0.72      0.71       201


