In [None]:
import json
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from tqdm.notebook import tqdm
# Keep the wildcard import last so name resolution is unchanged.
from fastai.tabular.all import *

In [None]:
def set_seed(seed):
    """Seed Python, NumPy and PyTorch (CPU and CUDA) and make cuDNN deterministic."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Deterministic kernels on, autotuner off.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


set_seed(42)

In [None]:
def run_experiments(X, y, lr, layers_choice, epochs, ps, lin_first, focal_gamma, use_scaler, sampler, use_early_stopping=True):
    """Train a fastai tabular binary classifier and hold out an untouched test set.

    The data is split into train / validation / test (64% / 16% / 20%, stratified
    on ``y``) *before* any scaling or resampling. The scaler is fitted on the
    training rows only and the sampler is applied to the training rows only, so
    validation and test rows never influence training and are never synthetic.

    Args:
        X: DataFrame of continuous feature columns.
        y: Named Series of 0/1 labels with the same row order as ``X``.
        lr: Maximum learning rate passed to ``fit_one_cycle``.
        layers_choice: Key into the hidden-layer size table below (0-11).
        epochs: Number of epochs for ``fit_one_cycle``.
        ps: Dropout setting forwarded to ``tabular_config``.
        lin_first: Forwarded to ``tabular_config``.
        focal_gamma: ``gamma`` of the focal loss.
        use_scaler: If true, standardise the features with ``StandardScaler``.
        sampler: One of 'RandomOverSampler', 'RandomUnderSampler', 'SMOTE',
            'ADASYN', 'TomekLinks'; any other value disables resampling.
        use_early_stopping: If true, stop when validation F1 stops improving.

    Returns:
        ``(learn, dls, test_df, scaler)``: the trained learner, its DataLoaders,
        the held-out test rows (features, scaled if requested, plus the label
        column) and the fitted scaler, or ``None`` when ``use_scaler`` is false.
    """
    layers_map = {
        0: [200, 100],
        1: [300, 100],
        2: [300, 200],
        3: [100, 50],
        4: [50, 50],
        5: [50, 100, 50],
        6: [35, 70, 35],
        7: [200, 50],
        8: [350, 250],
        9: [400, 300],
        10: [100, 200, 100],
        11: [75, 50]
    }
    layers = layers_map[layers_choice]

    # Unknown sampler names (including None) mean "no resampling", as before.
    sampler_factories = {
        'RandomOverSampler': RandomOverSampler,
        'RandomUnderSampler': lambda: RandomUnderSampler(sampling_strategy='majority'),
        'SMOTE': SMOTE,
        'ADASYN': ADASYN,
        'TomekLinks': TomekLinks,
    }
    make_sampler = sampler_factories.get(sampler)
    spl = make_sampler() if make_sampler is not None else None

    cont_names = list(X.columns)
    target = y.name

    # Split first: everything fitted below must only ever see training rows.
    X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, stratify=y_train_val)

    scaler = None
    if use_scaler:
        scaler = StandardScaler()
        # Keep each frame's index so features and labels stay aligned.
        X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=cont_names, index=X_train.index)
        X_val = pd.DataFrame(scaler.transform(X_val), columns=cont_names, index=X_val.index)
        X_test = pd.DataFrame(scaler.transform(X_test), columns=cont_names, index=X_test.index)

    # Rebalance the training rows only; validation and test keep the real class mix.
    if spl is not None:
        X_train, y_train = spl.fit_resample(X_train, y_train)

    def _join(features_part, labels_part):
        # Rows are already paired by position, so drop the indices before joining.
        return pd.concat([features_part.reset_index(drop=True), labels_part.reset_index(drop=True)], axis=1)

    train_df = _join(X_train, y_train)
    val_df = _join(X_val, y_val)
    test_df = _join(X_test, y_test)

    # fastai takes a single frame plus the positions of the validation rows.
    train_val_df = pd.concat([train_df, val_df], ignore_index=True)
    valid_idx = list(range(len(train_df), len(train_val_df)))

    # Create FastAI DataLoaders
    dls = TabularDataLoaders.from_df(train_val_df, cont_names=cont_names, y_names=[target], y_block=CategoryBlock(vocab=[0, 1]), valid_idx=valid_idx, bs=64)

    # Create FastAI Learner
    config = tabular_config(ps=ps, use_bn=True, bn_final=True, bn_cont=False, lin_first=lin_first)
    loss_func = FocalLossFlat(gamma=focal_gamma)
    cbs = None
    if use_early_stopping:
        cbs = [EarlyStoppingCallback(monitor='f1_score', min_delta=0.001, patience=5)]
    learn = tabular_learner(dls, layers=layers, loss_func=loss_func, config=config, metrics=[accuracy, Precision(), Recall(), F1Score()], cbs=cbs)

    # Train the model
    learn.fit_one_cycle(epochs, lr_max=lr)
    return learn, dls, test_df, scaler

In [None]:
def evaluate_metrics(learn, dl, filename):
    """Compute binary-classification metrics for ``dl``, print them and save them as JSON.

    Args:
        learn: Learner whose ``get_preds(dl=...)`` returns ``(preds, targs)`` with
            one column of ``preds`` per class.
        dl: Labelled DataLoader to evaluate.
        filename: Path of the JSON file to write (overwritten if it exists).

    The JSON holds accuracy, precision, recall, F1, ROC AUC, the confusion matrix
    and the ROC curve points; everything except the ROC curve is also printed.
    """
    preds, targs = learn.get_preds(dl=dl)
    # Flatten so column-shaped (n, 1) targets are passed to sklearn as 1-D labels.
    y_true = targs.numpy().ravel()
    y_pred = preds.argmax(dim=1).numpy()
    y_prob = preds[:, 1].numpy()  # score of class 1

    # Calculate ROC curve data
    fpr, tpr, thresholds = roc_curve(y_true, y_prob)

    metrics = {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred),
        "Recall": recall_score(y_true, y_pred),
        "F1 Score": f1_score(y_true, y_pred),
        "ROC AUC": roc_auc_score(y_true, y_prob),
        "Confusion Matrix": confusion_matrix(y_true, y_pred).tolist(),  # Convert numpy array to list for JSON serialization
        "ROC Curve Data": {
            "FPR": fpr.tolist(),
            "TPR": tpr.tolist(),
            "Thresholds": thresholds.tolist()
        }
    }

    # Print metrics (the ROC curve arrays are too long to be useful on screen)
    for metric, value in metrics.items():
        if metric != "ROC Curve Data":
            print(f"{metric}: {value}")

    # Save metrics to JSON file
    with open(filename, 'w') as file:
        json.dump(metrics, file)

In [None]:
def predict_on_test(model, X_test, scaler):
    X_to_use = X_test
    
    if scaler:
        X_to_use = scaler.transform(X_test)
        X_to_use = pd.DataFrame(X_to_use, columns=features)
    
    dl = model.dls.test_dl(X_to_use)
    preds, _ = model.get_preds(dl=dl)
    y_pred = preds.argmax(dim=1).numpy()
    
    return y_pred

In [None]:
# Descriptor table that supplies the features (`X`) and the label columns listed in `targets`.
df = pd.read_csv("pqqmlp/PQQ-basic-descriptors.csv")
df

In [None]:
# Descriptor table for the BBB model; its `label` column is used as the training target.
df_bbb = pd.read_csv("pqqmlp/BBB-basic-descriptors.csv")
df_bbb

In [None]:
# Compounds to score with the trained models; rows are identified by the `cid` column.
df_test = pd.read_csv("pqqmlp/PQQ-basic-descriptors-test.csv")
df_test

In [None]:
# Descriptor columns selected from every table and fed to the models as continuous inputs.
features = ['ExactMolWt', 'MolLogP', 'TPSA', 'NumHDonors', 'NumHAcceptors', 'NumRotatableBonds',
            'FractionCSP3', 'NumAromaticRings', 'MaxPartialCharge', 'MinPartialCharge',
            'NumNitrogen', 'NumOxygen']

In [None]:
# Label columns of `df`; one classifier is trained per entry.
# NOTE(review): the prefixes look like WikiPathways / GO / KEGG identifiers — confirm.
targets = ['WP:3844', 'GO:0000165', 'GO:0004896', 'KEGG:hsa04064', 'KEGG:hsa04210', 'KEGG:hsa04630']

In [None]:
# Feature matrix shared by all six target models.
X = df[features]
X

In [None]:
# Feature matrix for the BBB model (same columns, different compounds).
X_bbb = df_bbb[features]
X_bbb

In [None]:
# Features of the compounds to be scored by every trained model.
X_test = df_test[features]
X_test

In [None]:
# Collects each model's predictions on X_test, keyed by target name.
predicted_features = {}

## Target BBB

In [None]:
# Labels for the BBB model.
y = df_bbb["label"]
y

In [None]:
# Hyperparameters for the BBB model (presumably from a prior search — confirm provenance).
# Rebinds the shared names `dls`, `test_df` and `scaler` that the following cells read.
config = {'lr': 0.001035773886129621, 'layers_choice': 1, 'epochs': 27, 'ps': 0.10008147411616398, 'lin_first': False, 'focal_gamma': 4.031398499719507, 'use_scaler': False, 'sampler': 'SMOTE'}
modelbbb, dls, test_df, scaler = run_experiments(X_bbb, y, **config)

In [None]:
print("Metrics for Training Set:")
# `new` inherits the training loader's settings; disable shuffling and last-batch
# dropping so the metrics cover every training row.
train_dl = dls.train.new(shuffle=False, drop_last=False)
evaluate_metrics(modelbbb, train_dl, "BBB-train.json")

In [None]:
print("Metrics for Validation Set:")
# Unshuffled copy of the validation DataLoader; metrics are also written to the JSON file below.
val_dl = dls.valid.new(shuffle=False)
evaluate_metrics(modelbbb, val_dl, "BBB-val.json")

In [None]:
print("Metrics for Test Set:")
# Build a DataLoader from the held-out rows returned by run_experiments.
test_dl = dls.test_dl(test_df)
evaluate_metrics(modelbbb, test_dl, "BBB-test.json")

In [None]:
# Store the BBB model's predicted classes for the X_test compounds.
# `scaler` must still be the one returned with modelbbb — run before training another model.
feature_name = "BBB"
predicted_features[feature_name] = predict_on_test(modelbbb, X_test, scaler)

## Target 0: WP:3844

In [None]:
# Labels for target 0 ('WP:3844').
y = df[targets[0]]
y

In [None]:
# Hyperparameters for target 0 (presumably from a prior search — confirm provenance).
# Rebinds the shared names `dls`, `test_df` and `scaler` that the following cells read.
config = {'lr': 0.002792902603027406, 'layers_choice': 2, 'epochs': 27, 'ps': 0.10169820739232566, 'lin_first': True, 'focal_gamma': 1.5887326762387637, 'use_scaler': True, 'sampler': 'SMOTE'}
model0, dls, test_df, scaler = run_experiments(X, y, **config)

In [None]:
print("Metrics for Training Set:")
# `new` inherits the training loader's settings; disable shuffling and last-batch
# dropping so the metrics cover every training row.
train_dl = dls.train.new(shuffle=False, drop_last=False)
evaluate_metrics(model0, train_dl, f"{y.name.replace(':', '')}-train.json")

In [None]:
print("Metrics for Validation Set:")
# Unshuffled copy of the validation DataLoader; the output file is named after the target with ':' removed.
val_dl = dls.valid.new(shuffle=False)
evaluate_metrics(model0, val_dl, f"{y.name.replace(':', '')}-val.json")

In [None]:
print("Metrics for Test Set:")
# Build a DataLoader from the held-out rows returned by run_experiments.
test_dl = dls.test_dl(test_df)
evaluate_metrics(model0, test_dl, f"{y.name.replace(':', '')}-test.json")

In [None]:
# Store model0's predicted classes for the X_test compounds under the target's name.
# `scaler` must still be the one returned with model0 — run before training another model.
feature_name = targets[0]
predicted_features[feature_name] = predict_on_test(model0, X_test, scaler)

## Target 1: GO:0000165

In [None]:
# Labels for target 1 ('GO:0000165').
y = df[targets[1]]
y

In [None]:
# Hyperparameters for target 1 (presumably from a prior search — confirm provenance).
# Rebinds the shared names `dls`, `test_df` and `scaler` that the following cells read.
config = {'lr': 0.00041238308020455307, 'layers_choice': 8, 'epochs': 26, 'ps': 0.151492634495349, 'lin_first': True, 'focal_gamma': 1.858491874138969, 'use_scaler': True, 'sampler': 'ADASYN'}
model1, dls, test_df, scaler = run_experiments(X, y, **config)

In [None]:
print("Metrics for Training Set:")
# `new` inherits the training loader's settings; disable shuffling and last-batch
# dropping so the metrics cover every training row.
train_dl = dls.train.new(shuffle=False, drop_last=False)
evaluate_metrics(model1, train_dl, f"{y.name.replace(':', '')}-train.json")

In [None]:
print("Metrics for Validation Set:")
# Unshuffled copy of the validation DataLoader; the output file is named after the target with ':' removed.
val_dl = dls.valid.new(shuffle=False)
evaluate_metrics(model1, val_dl, f"{y.name.replace(':', '')}-val.json")

In [None]:
print("Metrics for Test Set:")
# Build a DataLoader from the held-out rows returned by run_experiments.
test_dl = dls.test_dl(test_df)
evaluate_metrics(model1, test_dl, f"{y.name.replace(':', '')}-test.json")

In [None]:
# Store model1's predicted classes for the X_test compounds under the target's name.
# `scaler` must still be the one returned with model1 — run before training another model.
feature_name = targets[1]
predicted_features[feature_name] = predict_on_test(model1, X_test, scaler)

## Target 2: GO:0004896

In [None]:
# Labels for target 2 ('GO:0004896').
y = df[targets[2]]
y

In [None]:
# Hyperparameters for target 2 (presumably from a prior search — confirm provenance).
# Rebinds the shared names `dls`, `test_df` and `scaler` that the following cells read.
config = {'lr': 0.003991385406159671, 'layers_choice': 9, 'epochs': 19, 'ps': 0.10316706123945521, 'lin_first': True, 'focal_gamma': 4.113254237727293, 'use_scaler': True, 'sampler': 'SMOTE'}
model2, dls, test_df, scaler = run_experiments(X, y, **config)

In [None]:
# Show the learner's summary for this target's network (layers_choice 9).
model2.summary()

In [None]:
print("Metrics for Training Set:")
# `new` inherits the training loader's settings; disable shuffling and last-batch
# dropping so the metrics cover every training row.
train_dl = dls.train.new(shuffle=False, drop_last=False)
evaluate_metrics(model2, train_dl, f"{y.name.replace(':', '')}-train.json")

In [None]:
print("Metrics for Validation Set:")
# Unshuffled copy of the validation DataLoader; the output file is named after the target with ':' removed.
val_dl = dls.valid.new(shuffle=False)
evaluate_metrics(model2, val_dl, f"{y.name.replace(':', '')}-val.json")

In [None]:
print("Metrics for Test Set:")
# Build a DataLoader from the held-out rows returned by run_experiments.
test_dl = dls.test_dl(test_df)
evaluate_metrics(model2, test_dl, f"{y.name.replace(':', '')}-test.json")

In [None]:
# Store model2's predicted classes for the X_test compounds under the target's name.
# `scaler` must still be the one returned with model2 — run before training another model.
feature_name = targets[2]
predicted_features[feature_name] = predict_on_test(model2, X_test, scaler)

## Target 3: KEGG:hsa04064

In [None]:
# Labels for target 3 ('KEGG:hsa04064').
y = df[targets[3]]
y

In [None]:
# Hyperparameters for target 3 (presumably from a prior search — confirm provenance).
# Early stopping is switched off for this target, so all 22 epochs are run.
# Rebinds the shared names `dls`, `test_df` and `scaler` that the following cells read.
config = {'lr': 0.0013650323272663461, 'layers_choice': 9, 'epochs': 22, 'ps': 0.13004603261655698, 'lin_first': False, 'focal_gamma': 4.148699864295439, 'use_scaler': False, 'sampler': 'SMOTE'}
model3, dls, test_df, scaler = run_experiments(X, y, **config, use_early_stopping=False)

In [None]:
print("Metrics for Training Set:")
# `new` inherits the training loader's settings; disable shuffling and last-batch
# dropping so the metrics cover every training row.
train_dl = dls.train.new(shuffle=False, drop_last=False)
evaluate_metrics(model3, train_dl, f"{y.name.replace(':', '')}-train.json")

In [None]:
print("Metrics for Validation Set:")
# Unshuffled copy of the validation DataLoader; the output file is named after the target with ':' removed.
val_dl = dls.valid.new(shuffle=False)
evaluate_metrics(model3, val_dl, f"{y.name.replace(':', '')}-val.json")

In [None]:
print("Metrics for Test Set:")
# Build a DataLoader from the held-out rows returned by run_experiments.
test_dl = dls.test_dl(test_df)
evaluate_metrics(model3, test_dl, f"{y.name.replace(':', '')}-test.json")

In [None]:
# Store model3's predicted classes for the X_test compounds under the target's name.
# `scaler` must still be the one returned with model3 — run before training another model.
feature_name = targets[3]
predicted_features[feature_name] = predict_on_test(model3, X_test, scaler)

## Target 4: KEGG:hsa04210

In [None]:
# Labels for target 4 ('KEGG:hsa04210').
y = df[targets[4]]
y

In [None]:
# Hyperparameters for target 4 (presumably from a prior search — confirm provenance).
# Rebinds the shared names `dls`, `test_df` and `scaler` that the following cells read.
config = {'lr': 0.0016289044146153332, 'layers_choice': 10, 'epochs': 30, 'ps': 0.10142910582152623, 'lin_first': True, 'focal_gamma': 2.3858461523935732, 'use_scaler': True, 'sampler': 'SMOTE'}
model4, dls, test_df, scaler = run_experiments(X, y, **config)

In [None]:
print("Metrics for Training Set:")
# `new` inherits the training loader's settings; disable shuffling and last-batch
# dropping so the metrics cover every training row.
train_dl = dls.train.new(shuffle=False, drop_last=False)
evaluate_metrics(model4, train_dl, f"{y.name.replace(':', '')}-train.json")

In [None]:
print("Metrics for Validation Set:")
# Unshuffled copy of the validation DataLoader; the output file is named after the target with ':' removed.
val_dl = dls.valid.new(shuffle=False)
evaluate_metrics(model4, val_dl, f"{y.name.replace(':', '')}-val.json")

In [None]:
print("Metrics for Test Set:")
# Build a DataLoader from the held-out rows returned by run_experiments.
test_dl = dls.test_dl(test_df)
evaluate_metrics(model4, test_dl, f"{y.name.replace(':', '')}-test.json")

In [None]:
# Store model4's predicted classes for the X_test compounds under the target's name.
# `scaler` must still be the one returned with model4 — run before training another model.
feature_name = targets[4]
predicted_features[feature_name] = predict_on_test(model4, X_test, scaler)

## Target 5: KEGG:hsa04630

In [None]:
# Labels for target 5 ('KEGG:hsa04630').
y = df[targets[5]]
y

In [None]:
# Hyperparameters for target 5 (presumably from a prior search — confirm provenance).
# Rebinds the shared names `dls`, `test_df` and `scaler` that the following cells read.
config = {'lr': 0.0030853737113547453, 'layers_choice': 10, 'epochs': 29, 'ps': 0.10713463021780657, 'lin_first': True, 'focal_gamma': 4.72346604865011, 'use_scaler': True, 'sampler': 'SMOTE'}
model5, dls, test_df, scaler = run_experiments(X, y, **config)

In [None]:
print("Metrics for Training Set:")
# `new` inherits the training loader's settings; disable shuffling and last-batch
# dropping so the metrics cover every training row.
train_dl = dls.train.new(shuffle=False, drop_last=False)
evaluate_metrics(model5, train_dl, f"{y.name.replace(':', '')}-train.json")

In [None]:
print("Metrics for Validation Set:")
# Unshuffled copy of the validation DataLoader; the output file is named after the target with ':' removed.
val_dl = dls.valid.new(shuffle=False)
evaluate_metrics(model5, val_dl, f"{y.name.replace(':', '')}-val.json")

In [None]:
print("Metrics for Test Set:")
# Build a DataLoader from the held-out rows returned by run_experiments.
test_dl = dls.test_dl(test_df)
evaluate_metrics(model5, test_dl, f"{y.name.replace(':', '')}-test.json")

In [None]:
# Store model5's predicted classes for the X_test compounds under the target's name.
# `scaler` must still be the one returned with model5.
feature_name = targets[5]
predicted_features[feature_name] = predict_on_test(model5, X_test, scaler)

## Postprocessing

In [None]:
# One column of predicted classes per model, one row per X_test compound.
y_pred_df = pd.DataFrame(predicted_features)
y_pred_df

In [None]:
# Sum only the per-model prediction columns, so re-running this cell does not
# fold a previously computed `count_1s` back into the total.
y_pred_df['count_1s'] = y_pred_df[list(predicted_features)].sum(axis=1)
# Side-by-side table: compound id, input descriptors, predictions and their count.
result = pd.concat([df_test["cid"], X_test, y_pred_df], axis=1)

## Result

In [None]:
# Full table: cid, descriptors, per-model predictions and count_1s.
result

In [None]:
# Predictions for the compound with cid 1024.
# NOTE(review): presumably the compound of interest for this study — confirm.
result[result["cid"]== 1024]

In [None]:
# The same compound's row in the input table, for comparison with the result above.
df_test[df_test["cid"] == 1024]

In [None]:
# Compounds with the highest number of positive predictions across the models.
result[result['count_1s'] == result['count_1s'].max()]

In [None]:
# Compounds with the lowest number of positive predictions across the models.
result[result['count_1s'] == result['count_1s'].min()]

In [None]:
# Save the result table without the DataFrame index.
result.to_csv("result.csv", index=False)