# Мини‑таск 4: Генерация и отбор молекул‑кандидатов

### Импорт необходимых библиотек

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import joblib
from rdkit.Chem import QED, Descriptors, Lipinski
from rdkit.Chem.FilterCatalog import FilterCatalog, FilterCatalogParams
from sascorer import calculateScore as sa_score
from rdkit.Chem import Draw

Загружаем модель и токенизатор

In [None]:
# The tokenizer and the causal-LM weights are loaded from the same
# pretrained checkpoint identifier, so it is named once.
_DRUGGPT_CHECKPOINT = "liyuesen/druggpt"
tokenizer = AutoTokenizer.from_pretrained(_DRUGGPT_CHECKPOINT)
model = AutoModelForCausalLM.from_pretrained(_DRUGGPT_CHECKPOINT)

Создаём pipeline для генерации

In [None]:
# Text-generation pipeline wrapping the model/tokenizer loaded above.
gen_pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    device=-1,  # -1 is the transformers pipeline convention for CPU
    return_full_text=False,  # return only the newly generated text
)

In [None]:
# Sampling budget: total number of sequences to request, and how many
# are requested per pipeline call.
n_samples, batch_size = 2000, 16
# Accumulator for the SMILES candidates extracted from generated text.
all_smiles = []

In [None]:
# Sample `batch_size` sequences per call from ONE empty prompt.
# Passing a list of prompts makes the pipeline return a list of lists
# (one inner list per prompt), which breaks `out["generated_text"]` and,
# combined with num_return_sequences=batch_size, would request
# batch_size**2 sequences per iteration.
for _ in range(n_samples // batch_size):
    outputs = gen_pipe(
        "",
        max_new_tokens=128,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
        num_return_sequences=batch_size,
    )
    for out in outputs:
        # Keep only the first whitespace-delimited token of each sample.
        tokens = out["generated_text"].split()
        if tokens:  # skip empty generations instead of raising IndexError
            all_smiles.append(tokens[0])

In [None]:
# Top up to `n_samples` when the collected count falls short.
remainder = n_samples - len(all_smiles)
# `> 0` rather than truthiness: a negative remainder (more samples than
# requested) must not trigger another generation call.
if remainder > 0:
    # A single empty prompt with num_return_sequences=remainder yields a
    # flat list of dicts; a list of prompts would yield a list of lists.
    outputs = gen_pipe(
        "",
        max_new_tokens=128,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
        num_return_sequences=remainder,
    )
    for out in outputs:
        # Keep only the first whitespace-delimited token of each sample.
        tokens = out["generated_text"].split()
        if tokens:  # skip empty generations instead of raising IndexError
            all_smiles.append(tokens[0])


In [None]:
# Persist the raw generated candidates as a single-column CSV.
generated = pd.DataFrame({"smiles": all_smiles})
generated.to_csv("generated_data.csv", index=False)

### Валидация SMILES и расчёт фингерпринтов

In [None]:
# Read the SMILES column as text so numeric-looking tokens are not parsed
# as numbers.
df = pd.read_csv('generated_data.csv', dtype={'smiles': str})
valid = []
# Empty or NA-like cells come back from read_csv as NaN; drop them, because
# MolFromSmiles only accepts strings and raises on a float NaN.
for smi in tqdm(df['smiles'].dropna(), desc='Валидация'):
    # MolFromSmiles returns None when the SMILES cannot be parsed.
    if Chem.MolFromSmiles(smi) is not None:
        valid.append(smi)

In [None]:
# Collect the SMILES that passed validation and save them next to the raw set.
df_valid = pd.DataFrame(dict(smiles=valid))
df_valid.to_csv('gen_valid.csv', index=False)

In [None]:
# Number of generated SMILES that RDKit could parse.
len(valid)

In [None]:
def featurize(smi):
    """Convert a SMILES string into a Morgan fingerprint bit array.

    Args:
        smi: SMILES string to parse.

    Returns:
        NumPy array built from a radius-2, 1024-bit Morgan fingerprint.

    Raises:
        ValueError: If RDKit cannot parse ``smi``.
    """
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # Report the offending SMILES instead of letting the fingerprint
        # call fail with an opaque argument error on None.
        raise ValueError(f"Invalid SMILES: {smi!r}")
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)
    return np.array(fp)

### Предсказание активности (pIC50)

Загружаем модель и scaler

In [None]:
# NOTE: this rebinds `model`; from here on it is the object stored in
# rf_model.pkl, not the language model loaded earlier in the notebook.
# joblib.load unpickles its input, so only load files from a trusted source.
model, scaler = [joblib.load(name) for name in ('rf_model.pkl', 'scaler.pkl')]

Расчёт фингерпринтов Моргана и масштабирование признаков

In [None]:
# One fingerprint row per validated SMILES, stacked into a 2-D matrix,
# then passed through the loaded scaler.
fingerprints = [featurize(smi) for smi in df_valid['smiles']]
X = np.vstack(fingerprints)
X_scaled = scaler.transform(X)

In [None]:
# Store the model's predictions alongside the SMILES they belong to.
predicted = model.predict(X_scaled)
df_valid['pIC50'] = predicted
print(df_valid.head(5))

### Расчёт QED, SA Score, токсофоров и Липински

Настраиваем BRENK каталог

In [None]:
# Build a filter catalog containing only the BRENK structural-alert set.
tox_params = FilterCatalogParams()
brenk_set = FilterCatalogParams.FilterCatalogs.BRENK
tox_params.AddCatalog(brenk_set)
brenk = FilterCatalog(tox_params)

Вспомогательные функции: поиск токсофоров (BRENK) и подсчёт нарушений правила Липински

In [None]:
def has_tox(mol):
    """Return True if ``mol`` matches at least one entry of the BRENK catalog.

    Args:
        mol: RDKit molecule to screen.

    Returns:
        bool: Whether any filter in ``brenk`` matches the molecule.
    """
    # HasMatch answers the yes/no question directly, without building the
    # full list of matching entries just to test whether it is empty.
    return brenk.HasMatch(mol)

def lipinski_violations(mol):
    """Count how many of four Lipinski-style thresholds ``mol`` breaks.

    Args:
        mol: RDKit molecule to evaluate.

    Returns:
        int: Number of failed checks, from 0 to 4.
    """
    # Each entry is True when the corresponding limit is violated.
    # Note the molecular-weight check is inclusive (>= 500), unlike the rest.
    checks = (
        Descriptors.MolWt(mol) >= 500,
        Descriptors.MolLogP(mol) > 5,
        Lipinski.NumHDonors(mol) > 5,
        Lipinski.NumHAcceptors(mol) > 10,
    )
    return sum(checks)

Считаем свойства

In [None]:
def _property_row(smi, pic50):
    """Build one property record (QED, SA, tox alert, Lipinski) for a SMILES."""
    mol = Chem.MolFromSmiles(smi)
    return {
        'smiles': smi,
        'pIC50': pic50,
        'QED': QED.qed(mol),
        'SA': sa_score(mol),
        'ToxAlert': int(has_tox(mol)),  # stored as 0/1 rather than bool
        'LipinskiViol': lipinski_violations(mol),
    }


# One record per validated molecule, in the same order as df_valid.
rows = [
    _property_row(smi, pic50)
    for smi, pic50 in zip(df_valid['smiles'], df_valid['pIC50'])
]

In [None]:
# Tabulate the per-molecule property records and preview the first rows.
df_prop = pd.DataFrame(data=rows)
print(df_prop.head(5))

### Финальный отбор кандидатов

Фильтр по критериям

In [None]:
# Each mask encodes one selection criterion; a hit must satisfy all of them.
is_potent = df_prop['pIC50'] > 6.0
is_druglike = df_prop['QED'] >= 0.7
sa_in_range = (df_prop['SA'] > 2) & (df_prop['SA'] < 6)  # both bounds exclusive
no_tox_alert = df_prop['ToxAlert'] == 0
lipinski_ok = df_prop['LipinskiViol'] <= 1

# .copy() so later column assignments do not touch df_prop.
df_hits = df_prop.loc[
    is_potent & is_druglike & sa_in_range & no_tox_alert & lipinski_ok
].copy()

In [None]:
# Tag every selected molecule with the same comment and save the table.
df_hits = df_hits.assign(Comment='Подходит')
df_hits.to_csv('selected_hits.csv', index=False)

In [None]:
# Number of molecules that passed every selection criterion.
len(df_hits)

### Визуализация результатов

In [None]:
# Re-parse the selected SMILES and label each structure with its predicted
# pIC50 (two decimals).
hit_smiles = df_hits['smiles']
mols = list(map(Chem.MolFromSmiles, hit_smiles))
legends = [f"pIC50={v:.2f}" for v in df_hits['pIC50']]
# Kept as the last expression so the notebook renders the grid image.
Draw.MolsToGridImage(mols, molsPerRow=5, legends=legends)