<a href="https://colab.research.google.com/github/ttk66/Chem_analyzis/blob/main/CC50_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings("ignore")

df = pd.read_csv("final_filtered_data.csv")
df.fillna(df.median(numeric_only=True), inplace=True)

target = (df["log_CC50"] > df["log_CC50"].median()).astype(int)
feature_cols = df.select_dtypes(include=[np.number]).drop(columns=["IC50", "CC50", "SI", "log_IC50", "log_CC50", "log_SI"]).columns
X = df[feature_cols]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, target, test_size=0.2, random_state=42)

models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else y_pred

    print(f"\nЗадача: log_CC50 > median, Модель: {model_name}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("F1 Score:", f1_score(y_test, y_pred))
    print("ROC AUC:", roc_auc_score(y_test, y_proba))
    print(classification_report(y_test, y_pred, target_names=["Class 0", "Class 1"]))



Задача: log_CC50 > median, Модель: Logistic Regression
Accuracy: 0.7562189054726368
Precision: 0.78
Recall: 0.7428571428571429
F1 Score: 0.7609756097560976
ROC AUC: 0.8507440476190476
              precision    recall  f1-score   support

     Class 0       0.73      0.77      0.75        96
     Class 1       0.78      0.74      0.76       105

    accuracy                           0.76       201
   macro avg       0.76      0.76      0.76       201
weighted avg       0.76      0.76      0.76       201


Задача: log_CC50 > median, Модель: Random Forest
Accuracy: 0.8009950248756219
Precision: 0.8651685393258427
Recall: 0.7333333333333333
F1 Score: 0.7938144329896907
ROC AUC: 0.877827380952381
              precision    recall  f1-score   support

     Class 0       0.75      0.88      0.81        96
     Class 1       0.87      0.73      0.79       105

    accuracy                           0.80       201
   macro avg       0.81      0.80      0.80       201
weighted avg       0.81 