In [1]:
%run ../code/Limpeza.ipynb

In [2]:
%run ../code/Representacao.ipynb

In [3]:
%run ../code/Clusterizacao.ipynb

In [4]:
#-------BASE-------#
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import tensorflow as tf
#-------CHEM-------#
from rdkit import RDLogger
import cirpy
#-------MACHINE LEARNING-------#
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [5]:
# Silence RDKit log messages (every logger matching 'rdApp.*')
RDLogger.DisableLog('rdApp.*')

***

In [6]:
def ANN(fpSize: int):
    """Build and compile a dense feed-forward regression network.

    Architecture: Input(fpSize) -> Dense(300, relu) -> Dense(100, relu)
    -> Dense(1, linear).

    Args:
        fpSize: Length of each input feature vector.

    Returns:
        A ``tf.keras.Sequential`` model already compiled with the Adam
        optimizer, mean-squared-error loss and the MAE metric, so callers
        do not need to call ``compile`` again.
    """
    model = tf.keras.Sequential([
        # Declare the input with an explicit Input object instead of passing
        # `input_shape=` to the first Dense layer, which recent Keras
        # versions warn against for Sequential models.
        tf.keras.Input(shape=(fpSize,)),
        tf.keras.layers.Dense(300, activation='relu'),
        tf.keras.layers.Dense(100, activation='relu'),
        # Single linear unit: the model predicts one continuous value.
        tf.keras.layers.Dense(1, activation='linear')
    ])

    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

    return model

***

# MOUSE, INTRAVENOSA

In [7]:
# Settings forwarded to Representacao.fingerprint(fingerprint='morgan', ...)
# in the next cell; fpSize is also handed to ANN() as the input width.
radius = 2
fpSize = 8192
use_count = True

In [None]:
# Load the raw data
mouse_vi = pd.read_csv('../dados/mouse_vi.csv', usecols=['mouse_vi', 'smiles'])

# Convert the 'mouse_vi' column to numeric; entries that cannot be parsed become NaN
mouse_vi['mouse_vi'] = pd.to_numeric(mouse_vi['mouse_vi'], errors='coerce')

# Drop rows with a missing value or SMILES and renumber the index.
# dropna + reset_index returns a new frame (no in-place mutation) and does not
# depend on the `ignore_index` keyword of newer pandas releases.
mouse_vi = mouse_vi.dropna(subset=['mouse_vi', 'smiles']).reset_index(drop=True)

# Target transform: negative natural logarithm of the LD50 value
# NOTE(review): this column is created before the cleaning step; confirm that
# Limpeza.dados_limpos keeps it aligned with the rows it returns.
mouse_vi['log_ld50'] = -np.log(mouse_vi['mouse_vi'])

# Clean the data
limpeza = Limpeza(dataframe=mouse_vi)
mouse_vi = limpeza.dados_limpos(col_smiles='smiles', col_valor='mouse_vi', sanitize=True, cutoff=.05, fragmento=False)

# Build the fingerprint representation
representacao = Representacao(dataframe=mouse_vi)
mouse_vi = representacao.fingerprint(col_smiles='smiles', fingerprint='morgan', use_count=use_count, fpSize=fpSize, radius=radius)

# Feature matrix and target vector
X = np.array(mouse_vi['Features'].to_list())
y = mouse_vi['log_ld50'].values

# Fail early with a clear message instead of training on unusable data:
# a zero or negative LD50 turns into inf/NaN after the log transform, and the
# network expects exactly fpSize input columns.
assert X.ndim == 2 and X.shape[1] == fpSize, f'unexpected feature matrix shape: {X.shape}'
assert np.isfinite(y).all(), 'log_ld50 contains non-finite values (LD50 <= 0 in the input?)'

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed Python, NumPy and the Keras backend so weight initialisation and batch
# shuffling are repeatable (the split above is already seeded via random_state).
tf.keras.utils.set_random_seed(42)

# Build the network; ANN() returns it already compiled, so no second compile call is needed
model = ANN(fpSize=fpSize)
model.summary()

# NOTE(review): in the logged run val_loss levels off around 1.5 within the
# first epochs while the training loss keeps falling; consider an
# EarlyStopping callback instead of a fixed 200 epochs.
history = model.fit(X_train, y_train, validation_split=0.1, epochs=200, batch_size=32)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/200
[1m453/453[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 18ms/step - loss: 4.3661 - mae: 1.4896 - val_loss: 2.1115 - val_mae: 0.9453
Epoch 2/200
[1m453/453[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 18ms/step - loss: 1.1610 - mae: 0.7528 - val_loss: 1.7591 - val_mae: 0.8906
Epoch 3/200
[1m453/453[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 18ms/step - loss: 0.8006 - mae: 0.6084 - val_loss: 1.5977 - val_mae: 0.8361
Epoch 4/200
[1m453/453[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 18ms/step - loss: 0.5798 - mae: 0.5128 - val_loss: 1.6801 - val_mae: 0.8511
Epoch 5/200
[1m453/453[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 18ms/step - loss: 0.4731 - mae: 0.4630 - val_loss: 1.5491 - val_mae: 0.8338
Epoch 6/200
[1m453/453[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 18ms/step - loss: 0.4288 - mae: 0.4277 - val_loss: 1.5125 - val_mae: 0.8183
Epoch 7/200
[1m453/453[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0

In [None]:
# Training curves: one line per quantity recorded by model.fit
# (loss, mae and their validation counterparts), indexed by epoch.
ax = pd.DataFrame(history.history).plot()
# Title and axis labels so the figure can be read on its own.
ax.set_title('Histórico de treinamento')
ax.set_xlabel('Época')
ax.set_ylabel('Valor')
ax.grid(True)
plt.show()

In [None]:
# Test-set loss (MSE) and MAE, as configured in model.compile
loss, mae = model.evaluate(X_test, y_test)

# Coefficient of determination on the same test set.
# y_pred keeps the (n, 1) shape returned by predict; later cells index it per row.
predictions = model.predict(X_test)
y_true, y_pred = np.array(y_test), np.array(predictions)
r2 = r2_score(y_true, y_pred)

print(f'loss: {loss}', f'mae: {mae}', f'r2: {r2}', sep='\n')

***

# Validação

In [None]:
def ld50(log_ld50):
    """Undo the negative natural-log transform.

    Args:
        log_ld50: Value on the ``-ln(LD50)`` scale.

    Returns:
        ``exp(-log_ld50)`` as a built-in Python float.
    """
    valor = np.exp(-log_ld50)
    return float(valor)

def ghs_category(ld_50):
    """Map an LD50 value to a category number from 1 to 5.

    Inclusive upper bounds: 5 -> 1, 50 -> 2, 300 -> 3, 2000 -> 4;
    anything above 2000 -> 5.
    NOTE(review): these cut-offs look like the GHS acute-toxicity bands
    expressed in mg/kg -- confirm the unit of the input data.

    Args:
        ld_50: LD50 value to classify.

    Returns:
        The category as an int, or None when no comparison holds (e.g. NaN).
    """
    # Walk the bounds in ascending order; the first one that contains the
    # value decides the category.
    for category, upper_bound in enumerate((5, 50, 300, 2000), start=1):
        if ld_50 <= upper_bound:
            return category

    # Reached only by values above every bound, or by NaN (all comparisons False).
    return 5 if ld_50 > 2000 else None

def classe_eh_a_mesma(ghs_observada, ghs_predita):
    """Tell whether the observed and predicted categories are equal.

    Returns:
        The result of ``ghs_observada == ghs_predita``.
    """
    mesma = ghs_observada == ghs_predita
    return mesma

def reduz_animais(ghs_observada, ghs_predita):
    """Flag whether a predicted category counts as reducing animal use.

    Returns True in exactly three situations:
      * observed category above 3 and predicted category below the observed one;
      * observed category below 3 and predicted category above the observed one;
      * predicted equals observed and the category is not 3.
    Every other combination (including any NaN input) returns False.
    NOTE(review): category 3 is treated as the neutral reference, presumably
    the default starting point of the ATC procedure -- confirm with the
    protocol this validation follows.

    Args:
        ghs_observada: Category derived from the observed LD50.
        ghs_predita: Category derived from the predicted LD50.

    Returns:
        bool
    """
    # Conditions are evaluated left to right with short-circuiting, in the
    # same order as the three True cases listed above.
    return bool(
        (ghs_observada > 3 and ghs_observada > ghs_predita)
        or (ghs_observada < 3 and ghs_observada < ghs_predita)
        or (ghs_observada == ghs_predita and ghs_observada != 3)
    )

In [None]:
# One row per test sample: the observed target next to the model output.
# y_pred has one column, so its first column is taken as the prediction.
valores_preditos = list(y_pred[:, 0])
ghs = pd.DataFrame({'observado': y_true, 'predito': valores_preditos})

***

# ATC method

In [None]:
# Convert both columns from the -ln scale back to LD50 values
for coluna_ld50, coluna_log in (('ld50_observado', 'observado'), ('ld50_predito', 'predito')):
    ghs[coluna_ld50] = ghs[coluna_log].apply(ld50)

In [None]:
# Category for the observed and for the predicted LD50
ghs['ghs_observada'] = ghs['ld50_observado'].map(ghs_category)
ghs['ghs_predita'] = ghs['ld50_predito'].map(ghs_category)

In [None]:
# True where the predicted category matches the observed one
ghs['mesma_classe'] = [
    classe_eh_a_mesma(observada, predita)
    for observada, predita in zip(ghs['ghs_observada'], ghs['ghs_predita'])
]

In [None]:
# True where reduz_animais() accepts the (observed, predicted) category pair
ghs['reduz_animais'] = [
    reduz_animais(observada, predita)
    for observada, predita in zip(ghs['ghs_observada'], ghs['ghs_predita'])
]

In [None]:
# Share of test samples (in percent) flagged True in each boolean column.
# The values are computed before formatting because reusing the enclosing
# quote character inside an f-string replacement field is a SyntaxError
# on Python versions older than 3.12.
total_amostras = ghs.shape[0]
pct_mesma_classe = (ghs['mesma_classe'].sum() * 100) / total_amostras
pct_reduz_animais = (ghs['reduz_animais'].sum() * 100) / total_amostras

print(f'Predição da classe correta: {pct_mesma_classe}')
print(f'Redução de animais: {pct_reduz_animais}')

In [None]:
# Display the validation table as this cell's output
ghs

In [None]:
# Save the validation table to an Excel file in the current working directory, without the index column
ghs.to_excel('validacao_mouse_vi.xlsx', index=False)