# Tox 24 challenge - see how deepFPlearn performs

We perform the following steps:
- load challenge data: training and test datasets
- remove duplicated SMILES with different target values
- scale the target value to the range [0, 1]
- use the whole set of SMILES (test and training substances) to generate 2048-bit binary molecular fingerprints, and train a specific autoencoder that compresses these 2048-bit binary molecular fingerprints into 256-bit vectors with fewer zeros
- use the trained specific autoencoder to encode the 2048 bit fingerprints of the training substances
- train a regression model with this data 
- use the trained autoencoder to encode the test substances, use the regression model to predict the scaled target values
- reverse the scaling of the target values
- submit the predictions

## Setup

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import wandb
from tensorflow import keras
from sklearn.preprocessing import MinMaxScaler
import os
import fnmatch
from IPython.display import Image, display
from keras.models import load_model

from dfpl import options, fingerprint as fp, utils, autoencoder as ac, single_label_model as sl, predictions


In [2]:
# Base directory from which the output, encoder and model paths below are built.
# Keep the trailing slash: the predictions output file path further down is
# formed by appending the file name directly to this string.
base_out_dir = 'data/output/fnn_compressed/'

In [3]:
# Stack the training and the test table row-wise (fresh 0..n-1 index) and
# write the result to a single CSV without the index column.
pd.concat(
    [pd.read_csv(csv_path)
     for csv_path in ('data/tox24_challenge_train.csv',
                      'data/tox24_challenge_test.csv')],
    ignore_index=True,
).to_csv('data/tox24_challenge_smiles_all.csv', index=False)

## Train the autoencoder

For this, first load the training and test datasets to obtain the full set of molecular structures, then store all structures together in a single .csv file (this is what the cell above does).

Adjust all options for training the autoencoder

In [4]:
# Option set for the autoencoder run on the combined SMILES file:
# autoencoder training switched on, regression training switched off.
opts = options.Options(
    # input / output locations
    inputFile='data/tox24_challenge_smiles_all.csv',
    outputDir=base_out_dir,
    ecModelDir=f'{base_out_dir}/AE_encoder/',
    ecWeightsFile='',
    # fingerprints: 2048 bits in, 256 values after compression
    type='smiles',
    fpType='topological',
    fpSize=2048,
    encFPSize=256,
    # which models to train
    trainAC=True,
    trainFNN=False,
    # autoencoder hyper-parameters
    aeActivationFunction='tanh',
    aeEpochs=3000,
    aeBatchSize=52,
    aeLearningRate=0.004123771070856377,
    aeLearningRateDecay=0.05465859583974732,
    # logging / tracking
    verbose=2,
    wabTracking=True,
)


Allow tracking the training in Weights & Biases.

This requires a Weights & Biases account (the free plan is sufficient). Feel free to comment out this code cell.

In [5]:
# Start a Weights & Biases run only when tracking is enabled in the options;
# all option attributes are attached as the run configuration.
if opts.wabTracking:
    wandb.init(
        project="tox_24",
        entity="dfpl_regression",
        config=vars(opts),
    )

Load the training data and generate fingerprints.

In [6]:
# Read the file named in the options (here the combined train + test SMILES
# file) through fp.importCSV, passing the configured fingerprint size.
df = fp.importDataFile(opts.inputFile, import_function=fp.importCSV, fp_size=opts.fpSize)

Train the autoencoder

In [7]:
# Manual switch: with trainAC set to False the `if opts.trainAC:` guard in the
# next cell skips autoencoder training, and the encoder is instead loaded from
# opts.ecModelDir further down. Skip this cell to actually (re)train.
opts.trainAC=False

In [8]:
# Create the output directory via the project helper before anything is
# written below it.
utils.createDirectory(opts.outputDir)

# Training runs only while opts.trainAC is truthy; otherwise this cell does
# not assign `encoder` at all.
if opts.trainAC:
    # train an autoencoder on the full feature matrix
    encoder = ac.train_full_ac(df, opts)

Update the options for training the regression model with compressed features.

In [9]:
# Option set for the regression run on the training file: same fingerprint
# and encoder settings as the autoencoder run, regression training enabled
# and feature compression switched on.
opts = options.Options(
    # input / output locations
    inputFile='data/tox24_challenge_train.csv',
    outputDir=base_out_dir,
    ecModelDir=f'{base_out_dir}/AE_encoder/',
    ecWeightsFile='',
    # fingerprints: 2048 bits in, 256 values after compression
    type='smiles',
    fpType='topological',
    fpSize=2048,
    encFPSize=256,
    compressFeatures=True,
    # model type and target
    trainFNN=True,
    fnnType="REG",
    enableMultiLabel=False,
    wabTarget="activity",
    # data splitting
    kFolds=5,
    testSize=0.2,
    # optimisation hyper-parameters
    optimizer="SGD",
    lossFunction="mse",
    epochs=5000,
    batchSize=56,
    activationFunction="tanh",
    dropout=0.15657883016344468,
    learningRate=0.017935022040821466,
    l2reg=0.009308121424156192,
    # logging
    verbose=2,
)


In [10]:
# Re-import with the new options: `df` now holds the training file only
# (it previously held the combined train + test file).
df = fp.importDataFile(opts.inputFile, import_function=fp.importCSV, fp_size=opts.fpSize)

In [11]:
if opts.compressFeatures:
    # Load the saved autoencoder model from the directory given in the
    # options (opts.ecModelDir).
    encoder = keras.models.load_model(opts.ecModelDir)

    # Pass the data frame through the encoder; the returned frame replaces
    # `df` for all following cells.
    df = ac.compress_fingerprints(df, encoder)

Scale the target values to [0,1]

In [12]:
# Show the column labels of `df` as the cell output (inspection only).
df.columns

In [13]:
# Min-max scale the 'activity' column to [0, 1].
# `scaler` is kept as a global because the prediction section uses it again
# to map predicted values back to the original scale.
unscaled_target = df['activity'].to_numpy().reshape(-1, 1)

scaler = MinMaxScaler()
scaled_target = scaler.fit_transform(unscaled_target)

# Write the scaled values back by position, not by index label. Concatenating
# a freshly built frame along axis=1 aligns on the index, so any gap or
# reordering in df's index would pair targets with the wrong rows and add
# NaN-filled rows. Dropping first and assigning afterwards keeps 'activity'
# as the last column, as before.
df = df.drop(columns='activity').assign(activity=scaled_target.ravel())

Now train the regression model

In [14]:
# Show which input file the current options point to (inspection only).
opts.inputFile

In [15]:
# Train the regression model(s) on the compressed, target-scaled frame, but
# only when regression training is enabled in the options.
if opts.trainFNN:
    # The trailing semicolon keeps the notebook from echoing the return value.
    sl.train_single_label_models(df=df, opts=opts);

Find best fold

In [16]:
def find_files(directory, pattern):
    """Lazily yield the paths of all files below *directory* matching *pattern*.

    The tree is traversed with ``os.walk``; *pattern* is a shell-style
    wildcard that is matched against the bare file name (not the full path).
    Every yielded path is the containing folder joined with the file name.
    """
    for folder, _subdirs, names in os.walk(directory):
        yield from (os.path.join(folder, name)
                    for name in fnmatch.filter(names, pattern))


# Find the weights file(s) tagged as "best", read the fold number from the
# file name and display the training-history image stored next to it.
# best_fold stays 0 when no such file exists; if several match, the last one
# found wins.
best_fold = 0
for filename in find_files(base_out_dir, '*.best.model.weights*'):
    # Cut at the first '.' of the file name only. Splitting the whole path
    # would truncate it at any dot inside a directory name.
    folder, basename = os.path.split(filename)
    prefix = os.path.join(folder, basename.split('.')[0])
    # The fold number is the run of digits ending the prefix; reading all of
    # them (not just the last character) keeps fold numbers >= 10 intact.
    best_fold = int(prefix[len(prefix.rstrip('0123456789')):])
    img_file = prefix + '.history.jpg'
    display(Image(filename=img_file))
best_fold

In [17]:
# Read the evaluation table stored in the output directory and show only the
# row(s) whose 'fold' value equals the best fold found above.
model_evaluation = pd.read_csv(f'{base_out_dir}/single_label_model.evaluation.csv')
model_evaluation.loc[model_evaluation['fold'] == best_fold]

## Predict test dataset

In [18]:
# Option set for predicting the test file. The fingerprint settings are
# spelled out with the same values used by both training configurations, so
# the test fingerprints are built exactly like the ones the encoder and the
# regression model were trained on, instead of depending on library defaults.
opts = options.Options(
    inputFile='data/tox24_challenge_test.csv',
    outputDir=f'{base_out_dir}',
    # base_out_dir ends with '/', so the file name is appended directly
    outputFile=f'{base_out_dir}test.predictions.csv',
    ecModelDir=f'{base_out_dir}/AE_encoder/',
    fnnModelDir=f'{base_out_dir}/activity_saved_model',
    type='smiles',
    fpType='topological',
    fpSize=2048,
    encFPSize=256,
    compressFeatures=True,
)

In [19]:
# Import the test file; `df` is overwritten once more.
# NOTE(review): this cell uses fp.importSmilesCSV while the two training
# imports use fp.importCSV — confirm the different import function is intended.
df = fp.importDataFile(opts.inputFile, import_function=fp.importSmilesCSV, fp_size=opts.fpSize)


In [20]:
if opts.compressFeatures:
    # Load the saved autoencoder model; opts.ecModelDir is the same
    # directory that the training options point to.
    encoder = keras.models.load_model(opts.ecModelDir)

    # Run the test data frame through the encoder and keep the result in `df`.
    df = ac.compress_fingerprints(df, encoder)

In [21]:
# predict
df2 = predictions.predict_values(df=df, opts=opts)

In [23]:
# Show the column labels of the prediction frame (inspection only).
df2.columns

Reverse the scaling of the predicted values

In [37]:
# Predictions are on the scaled target range used for training.
activity_predicted = df2['predicted']

# Map them back to the original activity scale with the `scaler` object that
# was fitted on the training targets earlier in this notebook; the input is
# reshaped to a single column, so the result is an (n, 1) array.
activity_predicted_rescaled = scaler.inverse_transform(activity_predicted.to_numpy().reshape(-1, 1))

In [35]:
# Unscaled 'activity' values read straight from the training CSV, used as the
# reference distribution in the histogram below.
activity_trained = pd.read_csv('data/tox24_challenge_train.csv')['activity']

In [41]:
%matplotlib inline
import matplotlib.pyplot as plt
# Overlay two semi-transparent histograms on the same axes:
# training activities in red (100 bins) and rescaled predictions in blue
# (50 bins). The bin counts differ, so bar heights of the two series are not
# directly comparable.
plt.hist(activity_trained, bins=100, alpha=0.5, color='r', label='train')
plt.hist(activity_predicted_rescaled, bins=50, alpha=0.5, color='b', label='predict')
plt.legend(loc='upper right')
plt.title('Distribution of target values')
plt.xlabel('Mean % activity')
plt.ylabel('Count')
plt.show()

Save for submission

In [40]:
# Write the rescaled predictions as a one-column ("prediction") CSV into the
# output directory.
# NOTE(review): no `index=` argument is passed, so pandas also writes the row
# index as an unnamed first column — confirm the submission format expects it.
pd.DataFrame(activity_predicted_rescaled, columns=["prediction"]).to_csv(f'{base_out_dir}/tox24_challenge_test_predicted.csv')