# Benchmarks

## Initialize

In [None]:
%load_ext autoreload
%autoreload 2

import os
from tqdm.auto import tqdm
import pathlib

import numpy as np
import pandas as pd
import lifelines
import pandas as pd

from joblib import Parallel, delayed
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings("ignore")
import shutil

import plotly.express as px
import plotly.graph_objects as go
from plotly.graph_objects import Box

import matplotlib.pyplot as plt
from lifelines import CRCSplineFitter
import warnings
from lifelines.utils import CensoringType

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import math

from IPython.display import clear_output

from tqdm.notebook import tqdm
from lifelines.utils import concordance_index

from pycox.evaluation import EvalSurv
# Read the Neptune API token from the environment instead of committing it to the notebook.
# NOTE(review): a token was previously hard-coded on this line; treat it as leaked and revoke it.
# The value is None when NEPTUNE_API_TOKEN is not set.
api_token = os.environ.get("NEPTUNE_API_TOKEN")

In [None]:
from dask.distributed import Client, LocalCluster
# Start a local dask cluster (one worker process with ten threads) and connect a client to it.
# The joblib 'dask' backend used further down presumably runs on this client — TODO confirm.
cluster = LocalCluster(n_workers=1, threads_per_worker=10)
client = Client(cluster)

In [None]:
#from utils import get_best_experiment_predictions
import time
from IPython.display import display, HTML
def get_best_experiment(df, module="CoxPH", datamodule="UKBBDataModule", endpoint="M_MACE", feature_set="AgeSex", prediction_available=None, partition="0"):
    """Select the run with the highest checkpoint value among the runs matching the filters.

    Args:
        df: Runs table. The columns read are ``sys/id``, ``parameters/module``,
            ``parameters/datamodule``, ``parameters/train_targets``,
            ``parameters/feature_set``, ``parameters/cv_partition``,
            ``logs/checkpoint_value`` and, when ``prediction_available`` is
            truthy, ``logs/prediction_available``.
        module: Pattern that ``parameters/module`` must contain.
        datamodule: Pattern that ``parameters/datamodule`` must contain.
        endpoint: Pattern that ``parameters/train_targets`` must contain.
        feature_set: Exact value required in ``parameters/feature_set``.
        prediction_available: When truthy, keep only runs whose
            ``logs/prediction_available`` equals ``"TRUE"``.
        partition: Partition id; compared numerically with
            ``parameters/cv_partition``.

    Returns:
        ``(experiment_id, runs)`` where ``runs`` is the filtered table with
        ``logs/checkpoint_value`` converted to float, or ``(None, None)`` when
        no run is left after filtering.
    """
    if df.empty:
        print("No experiments with given tag!")
        return None, None

    score_col = 'logs/checkpoint_value'

    # str.contains is a pattern/substring match, so e.g. "DeepSurvivalMachine"
    # also selects "OLDDeepSurvivalMachine". na=False drops rows with a missing
    # value instead of failing on a mask that contains NaN.
    df = df[df['parameters/module'].str.contains(module, na=False)]
    df = df[df['parameters/datamodule'].str.contains(datamodule, na=False)]
    df = df[df['parameters/train_targets'].str.contains(endpoint, na=False)]
    df = df[df['parameters/feature_set'] == feature_set]
    # Compare as floats: a missing partition becomes NaN and is dropped rather
    # than raising during an integer cast.
    df = df[df['parameters/cv_partition'].astype(float) == float(int(partition))]
    # Only runs that logged a checkpoint value can be ranked.
    df = df[df[score_col].notna()]
    if df.empty:
        print("No relevant experiment completed!")

    if prediction_available:
        # Without the column no run can have a prediction; previously the
        # filter was silently skipped in that case.
        if 'logs/prediction_available' not in df.columns:
            print("No predictions!")
            return None, None
        df = df[df['logs/prediction_available'] == "TRUE"]

    if df.empty:
        print("No predictions!")
        return None, None

    # Convert on a new frame: assigning into the filtered slice can keep the
    # object dtype, which breaks argmax, and triggers chained-assignment warnings.
    scores = df[score_col].astype(float)
    df = df.assign(**{score_col: scores})
    experiment_name = df["sys/id"].iloc[scores.argmax()]
    return experiment_name, df

def get_best_experiment_paths(df, module="CoxPH", datamodule="UKBBDataModule", endpoint="M_MACE", feature_set="AgeSex", partition="0", api_token=None):
    """Return ``(checkpoint_path, prediction_path)`` of the best matching run.

    The run is chosen by ``get_best_experiment`` with the prediction filter
    switched on. ``(None, None)`` is returned when no run qualifies.
    ``api_token`` is accepted but not used by this function.
    """
    best_id, runs = get_best_experiment(df, module, datamodule, endpoint, feature_set, True, partition)
    if best_id is None:
        return None, None

    best_run = runs[runs["sys/id"] == best_id]
    checkpoint_path = best_run['logs/checkpoint_path'].values[0]
    prediction_path = best_run['logs/prediction_path'].values[0]
    # The prediction path is handed back unchanged, including when it is None.
    return checkpoint_path, prediction_path

In [None]:
# Locations of the pre- and post-processed datasets for this data project.
project_name = "210616_centres_dask"
data_path = "/data/analysis/ag-reils/steinfej"
data_pre = f"{data_path}/data/2_datasets_pre/{project_name}"
data_post = f"{data_path}/data/3_datasets_post/{project_name}"

# Result locations; the figure and data directories are created if missing.
project_label = "21_PGS_Revision"
project_path = f"/data/analysis/ag-reils/ag-reils-shared/cardioRS/results/projects/{project_label}"
figures_path = f"{project_path}/figures"
data_results_path = f"{project_path}/data"
for _results_dir in (figures_path, data_results_path):
    pathlib.Path(_results_dir).mkdir(parents=True, exist_ok=True)

In [None]:
# Endpoint names to look up; each is matched as a pattern against `parameters/train_targets`
# when selecting runs (see get_best_experiment).
endpoints = ['MACE']

In [None]:
# Partition ids "0" .. "21" as strings, and the names of the data splits.
partitions = list(map(str, range(22)))
splits = ["train", "valid", "test"]

### Load models

In [None]:
# Neptune project name and the filter values used to select benchmark runs.
project="CardioRS/benchmarks"
#modules = ["CoxPH", "DeepHit", "DeepSurvivalMachine"]
# NOTE(review): get_best_experiment matches modules with str.contains, so
# "DeepSurvivalMachine" also matches "OLDDeepSurvivalMachine" runs — confirm this is intended.
modules = ["OLDDeepSurvivalMachine", "DeepSurvivalMachine"]
datamodules = ["UKBBSurvivalDatamodule"]
feature_sets = ["CVDCoreVariablesREORDERED", "CVDCoreVariablesWithPGS"]
tag="210701_REVISION_NEWDSM_CENTERS_fr_6"
import neptune.new as neptune
# `project` is rebound here from the project name to the object returned by neptune.get_project.
project = neptune.get_project(project, api_token)
# Table of the runs carrying `tag`, converted to a pandas DataFrame.
df_new = project.fetch_runs_table(tag=tag).to_pandas()

In [None]:
# Stack the freshly fetched runs on top of a second runs table.
# NOTE(review): `df_old` is not defined in the visible part of this notebook — confirm where it
# comes from; this cell raises NameError if it is missing.
df = pd.concat([df_new, df_old], axis=0)

In [None]:
def get_model_predictions(df, module, datamodule, endpoint, feature_set, partition):
    """Return the prediction path of the best run for the given combination.

    Thin wrapper around ``get_best_experiment_paths`` that discards the
    checkpoint path; the result is ``None`` when no run qualifies.
    """
    # TODO: consider accepting a single dict of filters instead of separate arguments.
    paths = get_best_experiment_paths(
        df=df,
        module=module,
        datamodule=datamodule,
        endpoint=endpoint,
        feature_set=feature_set,
        partition=partition,
    )
    return paths[1]

In [None]:
# Look up one prediction path per (endpoint, feature_set, partition, module, datamodule)
# combination; an entry is None when get_model_predictions finds no matching run.
# The progress bar wraps only the outermost loop (endpoints).
pred_paths = Parallel(n_jobs=1)(delayed(get_model_predictions)(df, module, datamodule, endpoint, feature_set, partition) 
                                for endpoint in tqdm(endpoints) 
                                for feature_set in feature_sets
                                for partition in partitions
                                for module in modules 
                                for datamodule in datamodules)

In [None]:
import joblib
def get_df(path):
    """Read the prediction table stored as a Feather file at *path*."""
    return pd.read_feather(path)
# Keep only usable paths: pd.isna is True for both None and NaN, so one check covers
# runs without a match and runs whose prediction path is missing.
valid_pred_paths = [path for path in pred_paths if not pd.isna(path)]
# Share of combinations that will actually be loaded; guarded against an empty lookup
# result, which previously raised ZeroDivisionError.
print("Status: ", len(valid_pred_paths) / len(pred_paths) if pred_paths else 0.0)
# Read the prediction tables in parallel through joblib's dask backend.
with joblib.parallel_backend('dask'):
    dfs = Parallel(n_jobs=80)(delayed(get_df)(path) for path in tqdm(valid_pred_paths))

In [None]:
# Stack all loaded prediction tables row-wise and replace the index with a fresh 0..n-1 range.
preds_all = pd.concat(dfs, axis=0).reset_index(drop=True)

In [None]:
def clean_df(df, predictions):
    """Add ``features`` and ``endpoint`` columns to *predictions* and return it.

    *predictions* is modified in place; the returned object is the same frame.

    ``features`` is derived in two steps: ``feature_names`` is translated to a
    feature-set name using the (``parameters/train_features`` ->
    ``parameters/feature_set``) pairs found in the runs table *df*, and that
    name is translated to a short label. Values without a mapping become NaN.

    ``endpoint`` is ``event_names`` with its first 2 and last 8 characters
    removed.
    """
    run_features = df[["parameters/feature_set", "parameters/train_features"]].drop_duplicates().dropna()
    features_to_set = dict(zip(run_features["parameters/train_features"], run_features["parameters/feature_set"]))

    # NOTE(review): feature sets missing from this table end up as NaN in
    # "features" (e.g. "CVDCoreVariablesREORDERED", which this notebook also
    # queries) — confirm that is intended.
    set_to_label = {
        "CVDCoreVariables": "clinical",
        "CVDCoreVariablesWithPGS": "clinical_pgs_all",
    }

    event_to_endpoint = {event: event[2:-8] for event in predictions["event_names"].unique().tolist()}

    predictions["features"] = predictions["feature_names"].map(features_to_set).map(set_to_label)
    predictions["endpoint"] = predictions["event_names"].map(event_to_endpoint)
    return predictions

In [None]:
# Add the `features` and `endpoint` columns; clean_df modifies `preds_all` in place and returns it,
# so both names refer to the same frame afterwards.
preds_all_cleaned = clean_df(df, preds_all)

In [None]:
# Keep the identifying columns plus the 26 per-time-step columns
# 0_1_Ft_native .. 0_26_Ft_native.
id_cols = ["eid", 'endpoint', 'features', 'split', 'partition', 'module', 'datamodule', 'net']
time_cols = [f"0_{t}_Ft_native" for t in range(1, 27)]
preds = preds_all_cleaned[id_cols + time_cols]

In [None]:
# Convert every object-dtype column of `preds` to a categorical column.
for column in tqdm(preds.columns.to_list()):
    if preds[column].dtype != "object":
        continue
    preds[column] = preds[column].astype("category")

In [None]:
def fix_column_names(df):
    """Return *df* with every column whose name contains "Ft" renamed to ``Ft_<i>``.

    ``<i>`` is the position of the column among the "Ft" columns, counted from
    0 in column order. All other columns keep their names.
    """
    # Workaround for a time-index bug: e.g. 0_11_Ft is the value at t=10.
    # The original note asks for this to be fixed earlier in the pipeline.
    ft_columns = [name for name in df.columns if "Ft" in name]
    renames = {old_name: f"Ft_{position}" for position, old_name in enumerate(ft_columns)}
    return df.rename(renames, axis="columns")
# Rename the per-time-step columns to Ft_0, Ft_1, ... in column order.
preds = fix_column_names(preds)

In [None]:
# Write the final prediction table as a Feather file into the project's results data directory.
preds.to_feather(f"{data_results_path}/predictions_model_210703_FINAL.feather")