In [None]:
import sys
sys.path.append('../')  # Adjust the path accordingly
import warnings
warnings.filterwarnings("ignore")

from DataAnalysis.EventAnalyzer import EventAnalyzer
import pandas as pd
from PrepareDataset.DataEncoder.FeatureCollector import FeatureCollector

from PrepareDataset.DataEncoder.PreprocessLogger import PreprocessLogger

# Shared logger handed to FeatureCollector and EventAnalyzer below.
# NOTE(review): file_name is "liver.log" although cohort_path points at the pancreas cohort — confirm intended.
logger = PreprocessLogger(PreprocessLogger.__name__, jupyter=False, file_name="liver.log").logger

In [None]:
## Paths: project root, pancreas 3m_3y cohort folder and shared resource files
# NOTE(review): PROJECT_PATH is an absolute, machine-specific path — adjust before running elsewhere.
PROJECT_PATH = "/home/dmitrii/GitHub/ukbb_risk_assessment/"
cohort_path = PROJECT_PATH + 'PrepareDataset/resources/3m_3y/pancreas_3m_3y/'
data_showcase_path = (PROJECT_PATH + 'PrepareDataset/resources/Data_Dictionary_Showcase.csv')

eids_path = cohort_path +'labels.csv'
# Values of the "eid" column of the cohort's labels.csv, as a plain list
eids_to_read = pd.read_csv(eids_path)['eid'].tolist()
cardiac_radiomics_path = PROJECT_PATH + "PrepareDataset/resources/cardiac_features/table_all.csv"


In [None]:
# Number of eids read from the cohort's labels.csv
len(eids_to_read)

In [None]:
# Build the project's FeatureCollector on labels_with_val.csv and load the preprocessed feature files.
feature_collector = FeatureCollector(label_path=(cohort_path + "labels_with_val.csv"), logger=logger)
# cohort_path already ends with "/", so the resulting path contains a doubled slash ("...pancreas_3m_3y//preprocessed_features/").
file_path_to_features = cohort_path + "/preprocessed_features/"
#file_path_to_features = cohort_path + "/encoded_features/"
feature_collector.load_features(data_showcase_path=data_showcase_path, file_path_to_features=file_path_to_features)

In [None]:
# Feature groups to request from the collector. Every entry is currently commented out,
# so this evaluates to an empty set; uncomment the groups that should be included.
set_of_features= set([
    #"cardiac_radiomics",
    #"bca_norm", 
    #"elixhauser_comorbidities", 
    #"basic_features", 
    #"met_physical_activity", 
    #"smoking", 
    #"alcohol", 
    #"general_health", 
    #"diet", 
    #"clinical_biomarkers"
    #"total_radiomics",
])

In [None]:
# Label table of the cohort; its "eid" and "split" columns are used further down.
df_labels = pd.read_csv(cohort_path + "labels_with_val.csv")

In [None]:
# Rows per split value in labels_with_val.csv
df_labels['split'].value_counts()

In [None]:
# Ask the collector for the selected feature groups (set_of_features may be empty, see its definition).
features = feature_collector.get_features(set_of_features)

In [None]:
# List the collected column names as a one-column table
pd.DataFrame(features.columns)

In [None]:
# Split distribution as delivered by the feature collector
features["split"].value_counts()

In [None]:
# Replace the collector's "split" column with the one from df_labels, joined on "eid".
# The drop is done without inplace=True so the frame object returned by
# feature_collector.get_features is not mutated behind the collector's back.
# how="left" keeps every feature row; eids missing from df_labels get NaN in "split".
features = (
    features
    .drop(columns=["split"])
    .merge(df_labels[["eid", "split"]], on="eid", how="left")
)

In [None]:
# Split distribution after taking "split" from df_labels
features["split"].value_counts()

In [None]:
# Remove the outcome and split columns before the (currently disabled) CSV export.
# NOTE(review): the in-place drop is not re-runnable — executing this cell twice on the same
# frame raises KeyError because the columns are already gone.
features.drop(columns=["event", "time_to_event", "split"], inplace=True)
#features.to_csv("/home/dmitrii/GitHub/ukbb_risk_assessment/PrepareDataset/resources/3m_3y/cvd2_3m_3y/tabular_final_preprocessed/bca+cardiac.csv")

In [None]:
# Overwrite `features` with a previously exported table read from disk and show its column count.
# NOTE(review): the file sits under cvd2_3m_3y while cohort_path points at pancreas_3m_3y — confirm the intended dataset.
features = pd.read_csv("/home/dmitrii/GitHub/ukbb_risk_assessment/data/projects/risk_assessment/labels/3m_3y/cvd2_3m_3y/tabular_final_preprocessed/bca+cardiac+nonimage_tabular.csv")
len(features.columns)

In [None]:
# Column names of the table loaded from CSV
features.columns

In [None]:
# Wrap the feature table in the project's EventAnalyzer; `ca.data` is what the later cells train on.
ca = EventAnalyzer(features, logger=logger)
# Presumably drops columns that hold a single value — TODO confirm against EventAnalyzer.
ca.remove_one_value_columns()

In [None]:
# threshold=0 presumably keeps every feature regardless of correlation strength — TODO confirm in EventAnalyzer.
corr_table = ca.get_correlated_with_target(threshold=0)

In [None]:
# Show the correlation result as a table
pd.DataFrame(corr_table)

In [None]:
# Columns remaining in the analyzer's working frame
ca.data.columns

In [None]:
# Exclude four body-composition / liver-fat columns from the modelling frame.
# NOTE(review): in-place drop — re-running this cell on the same `ca` raises KeyError once the columns are gone.
ca.data.drop(columns=["10P Liver PDFF (proton density fat fraction)", 'FR liver PDFF mean', 'Total lean tissue volume', 'Total thigh fat-free muscle volume'], inplace=True)

In [None]:
# Delegates splitting to EventAnalyzer; later cells read the result via ca.data['split'] — TODO confirm what split_data changes.
ca.split_data()

In [None]:
def _visualize_results(results):
    """Format per-model metric summaries as a table of "mean ± std" strings.

    Parameters
    ----------
    results : dict
        Maps model name -> {metric name -> {"mean": number, "std": number}}.
        Entries whose value is not such a mean/std summary (for example the raw
        ``"feature_importances"`` payload that ``train_and_evaluate`` keeps in its
        result) are skipped instead of raising ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        One row per model, one column per summarised metric; every cell is the
        string ``"<mean> ± <std>"`` with three decimals.
    """
    def _is_summary(values):
        # Only dicts carrying both aggregate keys can be rendered as "mean ± std".
        return isinstance(values, dict) and "mean" in values and "std" in values

    formatted = {
        model: {
            metric: f"{values['mean']:.3f} ± {values['std']:.3f}"
            for metric, values in metrics.items()
            if _is_summary(values)
        }
        for model, metrics in results.items()
    }
    # Outer keys become columns in the constructor; transpose to get models as rows.
    return pd.DataFrame(formatted).T

In [None]:
import autogluon.tabular as ag

from autogluon.common import space
import random
import torch
import numpy as np
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, roc_auc_score
from itertools import product
from autogluon.common import space


def train_and_evaluate(ca):
    """Fit RF, XGB and NN_TORCH AutoGluon models over several seeds and summarise test metrics.

    For every model family and each of the first five seeds in ``ca.RANDOM_SET_SEED``
    a fresh ``TabularPredictor`` is fitted on the ``train`` split with the ``val``
    split as tuning data, scored on the ``test`` split, and its permutation feature
    importances are computed on ``test`` and on ``test`` + ``val`` combined.

    Parameters
    ----------
    ca : object
        Analyzer exposing ``data`` (a DataFrame with ``split``, ``eid``, ``event``
        and ``time_to_event`` columns) and ``RANDOM_SET_SEED`` (a sequence of seeds).

    Returns
    -------
    dict
        ``{model: {metric: {"mean": ..., "std": ...}, ...,
        "feature_importances": {"test": [...], "test_val": [...]}}}``.
        The metrics are accuracy, balanced_accuracy, f1 and roc_auc aggregated
        over seeds; each importance list holds one table per seed, in seed order.
    """
    seeds = ca.RANDOM_SET_SEED[:5]
    non_feature_columns = ['split', 'eid', 'event', 'time_to_event']
    metric_names = ('accuracy', 'balanced_accuracy', 'f1', 'roc_auc')

    def _split_frame(split_name):
        # Rows of one split with `event` exposed as `label` and id/outcome columns removed.
        rows = ca.data[ca.data['split'] == split_name]
        return rows.assign(label=rows['event']).drop(columns=non_feature_columns)

    def _hyperparameters(seed):
        # Seeded AutoGluon hyperparameters, keyed by the model family to train.
        return {
            'RF': {
                'RF': {
                    'random_state': seed,
                }
            },
            'XGB': {
                'XGB': {
                    'random_state': seed,
                    'subsample': 0.98,
                    'colsample_bytree': 0.98,
                }
            },
            'NN_TORCH': {
                'NN_TORCH': {
                    "seed_value": seed,
                }
            }
        }

    train_data = _split_frame('train')
    test_data = _split_frame('test')
    val_data = _split_frame('val')
    # Loop-invariant inputs: built once instead of once per (model, seed) pair.
    test_val_data = pd.concat([test_data, val_data])
    y_true = test_data['label']

    models_to_train = ['RF', 'XGB', 'NN_TORCH'] #['XT', 'RF', 'XGB', 'NN_TORCH', 'CAT', 'GBM']

    results = {}
    for model in models_to_train:
        collected = {name: [] for name in metric_names}
        collected['feature_importances'] = {"test": [], "test_val": []}
        results[model] = collected

        for seed in seeds:
            predictor = ag.TabularPredictor(label='label', eval_metric='balanced_accuracy', path=f'./autogluon/model_{model}_seed_{seed}', verbosity=2, problem_type='binary')
            predictor.fit(
                train_data=train_data,
                tuning_data=val_data,  # validation split used as tuning data
                hyperparameters=_hyperparameters(seed)[model],
                time_limit=600,   # seconds per fit (10 minutes)
                num_bag_folds=0,  # no bagging
                num_stack_levels=0,  # no stacking
                presets='best_quality',
            )

            y_pred = predictor.predict(test_data)
            # Second column of the probability table is used as the positive-class score.
            y_proba = predictor.predict_proba(test_data).to_numpy()[:, 1]

            collected['accuracy'].append(accuracy_score(y_true, y_pred))
            collected['balanced_accuracy'].append(balanced_accuracy_score(y_true, y_pred))
            collected['f1'].append(f1_score(y_true, y_pred))
            collected['roc_auc'].append(roc_auc_score(y_true, y_proba))

            importances = collected['feature_importances']
            importances["test"].append(predictor.feature_importance(data=test_data, num_shuffle_sets=5))
            importances["test_val"].append(predictor.feature_importance(data=test_val_data, num_shuffle_sets=5))

    # Aggregate the per-seed metric lists; importance tables are passed through untouched.
    results_combined = {}
    for model_name, collected in results.items():
        results_combined[model_name] = {
            metric: (
                values if metric == 'feature_importances'
                else {"mean": np.mean(values), "std": np.std(values)}
            )
            for metric, values in collected.items()
        }

    return results_combined#, best_model_predictor


# Run the full model x seed sweep; each fit is capped at 600 s inside train_and_evaluate.
results = train_and_evaluate(ca)

In [None]:
# Keep only the mean/std metric summaries; the per-seed importance tables are left out.
results_visualized = {
    name: {
        key: summary
        for key, summary in per_model.items()
        if key != 'feature_importances'
    }
    for name, per_model in results.items()
}

In [None]:
# Models x metrics table of "mean ± std" strings
_visualize_results(results_visualized)

In [None]:
# save feature importances with folder structure as keys
import os
import json
# Write every importance table to
#   <save_dir>/<dataset_name>/<model>/<split>/feature_importances_<seed>.csv
# The i-th table of a list is labelled with ca.RANDOM_SET_SEED[i], matching the
# seed order used by train_and_evaluate.
dataset_name = "cvd2WOBCA"
save_dir = "/home/dmitrii/GitHub/ukbb_risk_assessment/analysisNumericFeatures/resources/feature_importances/permutation_importances"
for model in results:
    for split in results[model]['feature_importances']:
        out_dir = f"{save_dir}/{dataset_name}/{model}/{split}/"
        # exist_ok=True replaces the check-then-create pattern and runs once per folder
        # rather than once per file.
        os.makedirs(out_dir, exist_ok=True)
        for i, fi in enumerate(results[model]['feature_importances'][split]):
            fi.to_csv(f"{out_dir}feature_importances_{ca.RANDOM_SET_SEED[i]}.csv")

In [None]:
# Per-seed importance tables of the XGB runs ("test" and "test_val" lists)
results['XGB']['feature_importances']

In [None]:
# Rebuild the train/test modelling frames from ca.data: `event` is copied into a
# trailing `label` column, then split, id and outcome columns are removed.
train_data = (
    ca.data[ca.data['split'] == 'train']
    .assign(label=lambda rows: rows['event'])
    .drop(columns=['split', 'eid', 'event', 'time_to_event'])
)
test_data = (
    ca.data[ca.data['split'] == 'test']
    .assign(label=lambda rows: rows['event'])
    .drop(columns=['split', 'eid', 'event', 'time_to_event'])
)

In [None]:
# Load the model named "XGBoost" from the first stored predictor through the private `_trainer` attribute.
# NOTE(review): `best_model_predictor` is not assigned anywhere in the visible notebook (the extra return value
# of train_and_evaluate is commented out) — this cell fails on a fresh kernel; confirm where it should come from.
model = best_model_predictor[0]._trainer.load_model("XGBoost")

In [None]:
# Keep only the columns listed in model.features, plus the label column.
# The `+ ['label']` concatenation assumes model.features is a list — TODO confirm.
train_data = train_data[model.features + ['label']]
test_data = test_data[model.features + ['label']]

In [None]:
# Reload the same model and compare: number of importance values, number of model features, number of training rows.
# NOTE(review): depends on `best_model_predictor`, which is not defined in the visible notebook.
model = best_model_predictor[0]._trainer.load_model("XGBoost")
len(model.model.feature_importances_), len(model.features), len(train_data)

In [None]:
# Collect one feature/importance table per stored predictor, using the fitted estimator's
# built-in feature_importances_ (not the permutation importances computed elsewhere).
# NOTE(review): depends on `best_model_predictor`, which is not defined in the visible notebook.
# The loop variables `model`, `feature_importances` and `feature_names` stay bound to the last
# predictor's values after the loop and are reused by name in other cells.
all_feature_importances = []
for predictor in best_model_predictor:
    model = predictor._trainer.load_model("XGBoost")
    feature_importances = model.model.feature_importances_
    feature_names = model.features
    feature_importances = pd.DataFrame({
        'feature': feature_names,
        'importance': feature_importances
    })
    all_feature_importances.append(feature_importances)

In [None]:
# Refit the underlying XGBoost estimator of every stored predictor with a fixed seed and
# row/column subsampling, then score it on the test frame.
# NOTE(review): depends on `best_model_predictor`, which is not defined in the visible notebook.
all_metrics = []
seeds = [1514, 0, 42, 867228, 29847]
for i, predictor in enumerate(best_model_predictor):
    model = predictor._trainer.load_model("XGBoost")
    # The XGBoost parameter is spelled `colsample_bytree`. The former `colsample_by_tree`
    # is not a recognised parameter, so the 0.9 column subsampling was never applied.
    model.model.set_params(random_state=seeds[i], subsample=0.95, colsample_bytree=0.9)
    model.model.fit(train_data.drop(columns=['label']), train_data['label'])
    y_pred = model.model.predict(test_data.drop(columns=['label']))
    y_proba = model.model.predict_proba(test_data.drop(columns=['label']))
    # second column is used as the positive-class probability
    y_proba = y_proba[:, 1]
    y_labels = test_data['label']
    metrics = {
        "accuracy": accuracy_score(y_labels, y_pred),
        "balanced_accuracy": balanced_accuracy_score(y_labels, y_pred),
        "f1": f1_score(y_labels, y_pred),
        "roc_auc": roc_auc_score(y_labels, y_proba),
    }
    all_metrics.append(metrics)
print("Mean metrics across seeds:")
print("Balanced Accuracy: ", np.mean([x['balanced_accuracy'] for x in all_metrics]), np.std([x['balanced_accuracy'] for x in all_metrics]))

In [None]:
from sklearn.ensemble import RandomForestClassifier
# import xgb
from xgboost import XGBClassifier
# Stand-alone scikit-learn random forest with a fixed seed, fitted in the next cell.
rf = RandomForestClassifier(random_state=1514, max_leaf_nodes=15000, n_estimators=300)

# Split the training frame into target (y) and predictors (X)
y = train_data['label']
X = train_data.drop(columns=['label'])


In [None]:
# Fit the forest on the training frame and score it on the test frame.
rf.fit(X, y)
y_pred = rf.predict(test_data.drop(columns=['label']))
y_proba = rf.predict_proba(test_data.drop(columns=['label']))
# second column is used as the positive-class probability
y_proba = y_proba[:, 1]
metrics = {
    "accuracy": accuracy_score(test_data['label'], y_pred),
    "balanced_accuracy": balanced_accuracy_score(test_data['label'], y_pred),
    "f1": f1_score(test_data['label'], y_pred),
    "roc_auc": roc_auc_score(test_data['label'], y_proba),
}
metrics

In [None]:
# Pair the forest's built-in importances with the column names it was fitted on.
# Note: `feature_importances` is a single DataFrame here, whereas `all_feature_importances`
# in other cells is a list of such frames.
importances = rf.feature_importances_
feature_names = rf.feature_names_in_
feature_importances = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
})

In [None]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, roc_auc_score
import pandas as pd
from xgboost import XGBClassifier

# Split the existing train_data / test_data frames into predictors and target
y_train = train_data['label']
X_train = train_data.drop(columns=['label'])
y_test = test_data['label']
X_test = test_data.drop(columns=['label'])

# Seeds for the repeated fits (same literal list as in the XGBoost refit cell)
seeds = [1514, 0, 42, 867228, 29847]

# Per-seed metric dicts and per-seed feature/importance tables
all_metrics = []
all_feature_importances = []

for seed in seeds:
    # Initialize RandomForest with current seed
    rf = RandomForestClassifier(random_state=seed, max_leaf_nodes=15000, n_estimators=300)
    # XGBoost alternative (note the parameter is spelled colsample_bytree):
    #rf = XGBClassifier(random_state=seed, subsample=0.95, colsample_bytree=0.95, booster='gbtree', objective='binary:logistic', base_score=5E-1)
    
    # Fit the model
    rf.fit(X_train, y_train)
    
    # Predictions; the second probability column is used as the positive-class score
    y_pred = rf.predict(X_test)
    y_proba = rf.predict_proba(X_test)[:, 1]
    
    # Compute metrics
    metrics = {
        "seed": seed,
        "accuracy": accuracy_score(y_test, y_pred),
        "balanced_accuracy": balanced_accuracy_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        "roc_auc": roc_auc_score(y_test, y_proba)
    }
    all_metrics.append(metrics)
    
    # Built-in (impurity-based) importances of the fitted forest, tagged with the seed
    feature_importances = pd.DataFrame({
        'feature': rf.feature_names_in_,
        'importance': rf.feature_importances_,
        'seed': seed
    })
    all_feature_importances.append(feature_importances)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

def plot_average_feature_importance_boxplot(all_feature_importances, category_dict, title=""):
    """
    Plots a boxplot of seed-averaged feature importances per category. Every
    category of ``category_dict`` gets a slot on the x-axis, in dictionary order;
    categories without any matching feature are left empty.

    Parameters:
    all_feature_importances (list): List of DataFrames, each containing feature importances for a seed
                                    with columns 'feature' and 'importance' (a 'seed' column may be present).
    category_dict (dict): Dictionary where keys are category names and values are lists of features.
    title (str): Figure title.

    Returns:
    None: Displays the boxplot.

    Raises:
    ValueError: If ``category_dict`` is empty, since there is nothing to plot.
    """
    if not category_dict:
        raise ValueError("category_dict is empty; there are no categories to plot.")

    # One long table over all seeds, then one mean importance per feature.
    combined_feature_importances_df = pd.concat(all_feature_importances, ignore_index=True)
    avg_feature_importances_df = combined_feature_importances_df.groupby('feature')['importance'].mean().reset_index()

    plot_data = []
    for category, features in category_dict.items():
        category_data = avg_feature_importances_df[
            avg_feature_importances_df['feature'].isin(features)
        ].assign(Category=category)
        if category_data.empty:
            # Exactly one NaN placeholder row per empty category; the empty frame itself
            # is no longer appended alongside it.
            category_data = pd.DataFrame({
                'feature': [np.nan],
                'importance': [np.nan],
                'Category': [category]
            })
        plot_data.append(category_data)

    combined_plot_data = pd.concat(plot_data, ignore_index=True)

    plt.figure(figsize=(12, 6))
    # An explicit `order` guarantees that every category is drawn, empty ones included.
    sns.boxplot(x='Category', y='importance', data=combined_plot_data,
                order=list(category_dict.keys()), palette='Set3', showmeans=True)
    plt.title(title)
    plt.xlabel('Category')
    plt.ylabel('Average Importance')
    plt.xticks(rotation=45)
    plt.show()

# Draw the seed-averaged boxplot.
# NOTE(review): `features_by_category` is only loaded from JSON in a later cell — run that cell first.
plot_average_feature_importance_boxplot(all_feature_importances, features_by_category, title="CKD Feature Importances")


In [None]:
# Mean and standard deviation of balanced accuracy across the per-seed metric dicts
mean, std = np.mean([x['balanced_accuracy'] for x in all_metrics]), np.std([x['balanced_accuracy'] for x in all_metrics])
print(f"Mean Balanced Accuracy: {mean:.3f} ± {std:.3f}")

In [None]:
# save all_feature_importances
import pandas as pd
# Stack the per-seed importance tables into one long frame with a fresh 0..n-1 index
all_feature_importances_df = pd.concat(all_feature_importances, ignore_index=True)

In [None]:
# Export the stacked importances; with to_csv defaults the row index is written as the first column.
# NOTE(review): absolute path and a "ckd" file name — confirm it matches the dataset currently loaded.
all_feature_importances_df.to_csv("/home/dmitrii/GitHub/ukbb_risk_assessment/analysisNumericFeatures/resources/feature_importances/ckd_feature_importances.csv")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

def plot_feature_importance_boxplot(all_feature_importances, category_dict, title=""):
    """
    Plots a boxplot of feature importances by category, using the values of every
    seed (no averaging). Categories without any matching feature are omitted.

    Parameters:
    all_feature_importances (list or DataFrame): List of DataFrames, each containing feature importances
                                    for a seed with columns 'feature' and 'importance' (a 'seed' column
                                    may be present). A single DataFrame is accepted and treated as a
                                    one-element list.
    category_dict (dict): Dictionary where keys are category names and values are lists of features.
    title (str): Figure title.

    Returns:
    None: Displays the boxplot.

    Raises:
    ValueError: If no feature of the input belongs to any category, since there is nothing to plot.
    """
    # pd.concat rejects a bare DataFrame, so wrap it to support single-table callers.
    if isinstance(all_feature_importances, pd.DataFrame):
        all_feature_importances = [all_feature_importances]

    combined_feature_importances_df = pd.concat(all_feature_importances, ignore_index=True)

    plot_data = []
    for category, features in category_dict.items():
        # Rows of every seed whose feature belongs to this category
        category_data = combined_feature_importances_df[combined_feature_importances_df['feature'].isin(features)]
        if not category_data.empty:
            plot_data.append(category_data.assign(Category=category))

    if not plot_data:
        # Same exception type pd.concat would raise on an empty list, with an actionable message.
        raise ValueError("None of the features in all_feature_importances belongs to a category in category_dict; nothing to plot.")

    combined_plot_data = pd.concat(plot_data, ignore_index=True)

    plt.figure(figsize=(12, 6))
    sns.boxplot(x='Category', y='importance', data=combined_plot_data, palette='Set3', showmeans=True)
    plt.title(title)
    plt.xlabel('Category')
    plt.ylabel('Importance')
    plt.xticks(rotation=45)
    plt.show()

# Per-seed (non-averaged) boxplot. NOTE(review): `features_by_category` is only loaded from JSON in a later cell.
plot_feature_importance_boxplot(all_feature_importances, features_by_category, title="Pancreas Feature Importances")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

def plot_average_feature_importance_boxplot(all_feature_importances, category_dict, title=""):
    """
    Plots a boxplot of seed-averaged feature importances by category. Categories
    without any matching feature are omitted from the plot.

    Parameters:
    all_feature_importances (list): List of DataFrames, each containing feature importances for a seed
                                    with columns 'feature' and 'importance' (a 'seed' column may be present).
    category_dict (dict): Dictionary where keys are category names and values are lists of features.
    title (str): Figure title.

    Returns:
    None: Displays the boxplot.

    Raises:
    ValueError: If no feature of the input belongs to any category, since there is nothing to plot.
    """
    # One long table over all seeds, then one mean importance per feature.
    combined_feature_importances_df = pd.concat(all_feature_importances, ignore_index=True)
    avg_feature_importances_df = combined_feature_importances_df.groupby('feature')['importance'].mean().reset_index()

    plot_data = []
    for category, features in category_dict.items():
        category_data = avg_feature_importances_df[avg_feature_importances_df['feature'].isin(features)]
        if not category_data.empty:
            plot_data.append(category_data.assign(Category=category))

    if not plot_data:
        # Same exception type pd.concat would raise on an empty list, with an actionable message.
        raise ValueError("None of the features in all_feature_importances belongs to a category in category_dict; nothing to plot.")

    combined_plot_data = pd.concat(plot_data, ignore_index=True)

    plt.figure(figsize=(12, 6))
    sns.boxplot(x='Category', y='importance', data=combined_plot_data, palette='Set3', showmeans=True)
    plt.title(title)
    plt.xlabel('Category')
    plt.ylabel('Average Importance')
    plt.xticks(rotation=45)
    plt.show()

# NOTE(review): `features_by_category` is only loaded from JSON in a later cell — run that cell first.
plot_average_feature_importance_boxplot(all_feature_importances, features_by_category, title="CVD2 w/o BCA Feature Importances")
# Usage notes:
# 'all_feature_importances' is a list of DataFrames containing feature importances from all seeds
# with columns: 'feature', 'importance', and 'seed'.

# Example category_dict:
# category_dict = {
#     'Category1': ['feature1', 'feature2'],
#     'Category2': ['feature3', 'feature4'],
#     # ... other categories
# }
# plot_average_feature_importance_boxplot(all_feature_importances, category_dict, title="...")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

def plot_average_feature_importance_boxplot(all_feature_importances, category_dict, title=""):
    """
    Plots a boxplot of seed-averaged feature importances per category. Every
    category of ``category_dict`` gets a slot on the x-axis, in dictionary order;
    categories without any matching feature are left empty.

    Parameters:
    all_feature_importances (list): List of DataFrames, each containing feature importances for a seed
                                    with columns 'feature' and 'importance' (a 'seed' column may be present).
    category_dict (dict): Dictionary where keys are category names and values are lists of features.
    title (str): Figure title.

    Returns:
    None: Displays the boxplot.

    Raises:
    ValueError: If ``category_dict`` is empty, since there is nothing to plot.
    """
    if not category_dict:
        raise ValueError("category_dict is empty; there are no categories to plot.")

    # One long table over all seeds, then one mean importance per feature.
    combined_feature_importances_df = pd.concat(all_feature_importances, ignore_index=True)
    avg_feature_importances_df = combined_feature_importances_df.groupby('feature')['importance'].mean().reset_index()

    plot_data = []
    for category, features in category_dict.items():
        category_data = avg_feature_importances_df[
            avg_feature_importances_df['feature'].isin(features)
        ].assign(Category=category)
        if category_data.empty:
            # Exactly one NaN placeholder row per empty category; the empty frame itself
            # is no longer appended alongside it.
            category_data = pd.DataFrame({
                'feature': [np.nan],
                'importance': [np.nan],
                'Category': [category]
            })
        plot_data.append(category_data)

    combined_plot_data = pd.concat(plot_data, ignore_index=True)

    plt.figure(figsize=(12, 6))
    # An explicit `order` guarantees that every category is drawn, empty ones included.
    sns.boxplot(x='Category', y='importance', data=combined_plot_data,
                order=list(category_dict.keys()), palette='Set3', showmeans=True)
    plt.title(title)
    plt.xlabel('Category')
    plt.ylabel('Average Importance')
    plt.xticks(rotation=45)
    plt.show()

# Draw the seed-averaged boxplot.
# NOTE(review): `features_by_category` is only loaded from JSON in the following cell.
plot_average_feature_importance_boxplot(all_feature_importances, features_by_category, title="Pancreas Feature Importances")


In [None]:
# read json
import json
# Category -> list-of-features mapping used by the plotting helpers.
# The path is relative to the notebook's working directory, unlike the absolute paths used elsewhere.
with open('./resources/feature_importances/features_by_category.json') as f:
    features_by_category = json.load(f)

In [None]:
# plot_feature_importance_boxplot hands its first argument to pd.concat, which requires a list of
# DataFrames. `feature_importances` is a single table, so wrap it to avoid a TypeError.
# Assumes `feature_importances` is the DataFrame with 'feature' / 'importance' columns built earlier.
plot_feature_importance_boxplot([feature_importances], features_by_category, title="Pancreas Feature Importances")

In [None]:
def plot_feature_importance_manhattan(df, category_dict):
    """
    Plots a Manhattan-style scatter of feature importances by category.

    Parameters:
    df (pd.DataFrame): DataFrame with columns 'feature' and 'importance'. It is not modified;
                       the helper columns are added to an internal copy.
    category_dict (dict): Dictionary where keys are category names and values are lists of features.

    Returns:
    None: Displays the Manhattan plot.
    """
    # Work on a copy so the caller's frame does not silently gain 'category' / 'category_id' columns.
    df = df.copy()

    # Reverse lookup feature -> category; a feature listed under several categories keeps the last one.
    feature_to_category = {feature: category for category, features in category_dict.items() for feature in features}
    df['category'] = df['feature'].map(feature_to_category)

    # x position of each category, in dictionary order. Features without a category map to NaN.
    category_mapping = {category: idx for idx, category in enumerate(category_dict.keys())}
    df['category_id'] = df['category'].map(category_mapping)

    plt.figure(figsize=(12, 6))
    plt.scatter(df['category_id'], df['importance'], c='blue', alpha=0.6)

    # Add horizontal lines for significance thresholds if necessary
    # plt.axhline(y=0.1, color='r', linestyle='--', label='Threshold')

    # Label each x position with its category name
    plt.xticks(ticks=list(category_mapping.values()), labels=list(category_mapping.keys()), rotation=45)
    plt.xlabel('Category')
    plt.ylabel('Feature Importance')
    plt.title('Feature Importance Manhattan Plot by Category')
    plt.grid(True)
    plt.show()

# Single-table Manhattan plot; expects `feature_importances` to be a DataFrame with 'feature' and 'importance' columns.
plot_feature_importance_manhattan(feature_importances, features_by_category)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

def plot_feature_importance_manhattan(all_feature_importances, category_dict):
    """
    Draws a Manhattan-style scatter with one point per (feature, seed) pair:
    x is the category position, y is the importance.

    Parameters:
    all_feature_importances (list): Per-seed DataFrames with columns 'feature', 'importance', and 'seed'.
    category_dict (dict): Category name -> list of feature names.

    Returns:
    None: Displays the Manhattan plot.
    """
    # Long table over all seeds
    stacked = pd.concat(all_feature_importances, ignore_index=True)

    # feature -> category; a later category overrides an earlier one for shared features
    feature_to_category = {}
    for name, members in category_dict.items():
        for member in members:
            feature_to_category[member] = name
    stacked['category'] = stacked['feature'].map(feature_to_category)

    # category -> x position, following dictionary order
    category_mapping = {name: position for position, name in enumerate(category_dict.keys())}
    stacked['category_id'] = stacked['category'].map(category_mapping)

    tick_positions = list(category_mapping.values())
    tick_labels = list(category_mapping.keys())

    plt.figure(figsize=(12, 6))
    plt.scatter(stacked['category_id'], stacked['importance'], c='blue', alpha=0.6)
    plt.xticks(ticks=tick_positions, labels=tick_labels, rotation=45)
    plt.xlabel('Category')
    plt.ylabel('Feature Importance')
    plt.title('Feature Importance Manhattan Plot by Category Across All Seeds')
    plt.grid(True)
    plt.show()
# Scatter of every per-seed importance value, grouped by category
plot_feature_importance_manhattan(all_feature_importances, features_by_category)
# Usage notes:
# 'all_feature_importances' is a list of DataFrames containing feature importances from all seeds
# with columns: 'feature', 'importance', and 'seed'.

# Example category_dict:
# category_dict = {
#     'Category1': ['feature1', 'feature2'],
#     'Category2': ['feature3', 'feature4'],
#     # ... other categories
# }
# plot_feature_importance_manhattan(all_feature_importances, category_dict)


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

def plot_average_feature_importance_manhattan(all_feature_importances, category_dict):
    """
    Draws a Manhattan-style scatter with one point per feature: x is the category
    position, y is the feature's importance averaged over all seeds.

    Parameters:
    all_feature_importances (list): Per-seed DataFrames with columns 'feature', 'importance', and 'seed'.
    category_dict (dict): Category name -> list of feature names.

    Returns:
    None: Displays the Manhattan plot.
    """
    # Long table over all seeds, collapsed to one mean importance per feature
    stacked = pd.concat(all_feature_importances, ignore_index=True)
    averaged = stacked.groupby('feature')['importance'].mean().reset_index()

    # feature -> category; a later category overrides an earlier one for shared features
    feature_to_category = {}
    for name, members in category_dict.items():
        for member in members:
            feature_to_category[member] = name
    averaged['category'] = averaged['feature'].map(feature_to_category)

    # category -> x position, following dictionary order
    category_mapping = {name: position for position, name in enumerate(category_dict.keys())}
    averaged['category_id'] = averaged['category'].map(category_mapping)

    tick_positions = list(category_mapping.values())
    tick_labels = list(category_mapping.keys())

    plt.figure(figsize=(12, 6))
    plt.scatter(averaged['category_id'], averaged['importance'], c='blue', alpha=0.6)
    plt.xticks(ticks=tick_positions, labels=tick_labels, rotation=45)
    plt.xlabel('Category')
    plt.ylabel('Average Feature Importance')
    plt.title('Average Feature Importance Manhattan Plot by Category Across Seeds')
    plt.grid(True)
    plt.show()
# Scatter of the seed-averaged importance of each feature, grouped by category
plot_average_feature_importance_manhattan(all_feature_importances, features_by_category)
# Usage notes:
# 'all_feature_importances' is a list of DataFrames containing feature importances from all seeds
# with columns: 'feature', 'importance', and 'seed'.

# Example category_dict:
# category_dict = {
#     'Category1': ['feature1', 'feature2'],
#     'Category2': ['feature3', 'feature4'],
#     # ... other categories
# }
# plot_average_feature_importance_manhattan(all_feature_importances, category_dict)


In [None]:
# Built-in importances of the currently loaded model, keyed by feature name.
# NOTE(review): this rebinds `feature_importances` to a plain dict, while other cells bind the same
# name to a DataFrame — cells that read it depend on which of these ran last.
# Feature names come from feature_metadata.get_features() here but from model.features in other
# cells — TODO confirm both give the same order.
importances = model.model.feature_importances_
feature_names = model.feature_metadata.get_features()
feature_importances = dict(zip(feature_names, importances))
print(feature_importances)

In [None]:
# read json
import json
# Category -> list-of-features mapping used by the plotting helpers (same file as loaded in an earlier cell).
with open('./resources/feature_importances/features_by_category.json') as f:
    features_by_category = json.load(f)

In [None]:
# Results table for the feature set: bca +/ cardiac + questionnaire
# (reflects whatever `results` currently holds in the kernel)
_visualize_results(results)

In [None]:
# Results table for the feature set: bca +/ cardiac + questionnaire
# (same call as the previous cell; reflects whatever `results` currently holds)
_visualize_results(results)

In [None]:
# Single AutoGluon run (one model family, one seed) with permutation importances on test+val.
# Build the train / test / val modelling frames: `event` becomes `label`, id/outcome columns are dropped.
train_data = ca.data[ca.data['split'] == 'train'].drop(columns=['split', 'eid', 'event', 'time_to_event']).copy()
train_data['label'] = ca.data[ca.data['split'] == 'train']['event'].copy()
test_data = ca.data[ca.data['split'] == 'test'].drop(columns=['split', 'eid', 'event', 'time_to_event']).copy()
test_data['label'] = ca.data[ca.data['split'] == 'test']['event'].copy()
val_data = ca.data[ca.data['split'] == 'val'].drop(columns=['split', 'eid', 'event', 'time_to_event']).copy()
val_data['label'] = ca.data[ca.data['split'] == 'val']['event'].copy()
test_val_data = pd.concat([test_data, val_data])
# NOTE(review): this rebinds the global `model` to a string; cells that use `model.model` must not run after this one.
model = 'RF'
# NOTE(review): `SEEDS` is not defined anywhere in the visible notebook — confirm its source (other cells use ca.RANDOM_SET_SEED).
seed = SEEDS[0]

predictor = ag.TabularPredictor(label='label', eval_metric='balanced_accuracy', path=f'./autogluon/model_{model}_seed_{seed}', verbosity=2, problem_type='binary')
hyperparameters = {
    model: {
        model: {
            'random_state': seed,  # seed of this run
            #'subsample': 0.98,
            #'colsample_bytree': 0.98,
        }
    },
}
predictor.fit(
    train_data=train_data,
    tuning_data=val_data,  # validation split used as tuning data
    hyperparameters=hyperparameters[model],
    time_limit=3600,   # seconds (1 hour)
    num_bag_folds=0,  # No bagging
    num_stack_levels=0,  # No stacking
    presets='best_quality',  # Best quality preset
)

y_pred = predictor.predict(test_data)
y_proba = predictor.predict_proba(test_data)
# second column is used as the positive-class probability
y_proba = y_proba.to_numpy()[:, 1]
metrics = {
    "accuracy": accuracy_score(test_data['label'], y_pred),
    "balanced_accuracy": balanced_accuracy_score(test_data['label'], y_pred),
    "f1": f1_score(test_data['label'], y_pred),
    "roc_auc": roc_auc_score(test_data['label'], y_proba),
}

print(metrics)

# Permutation importances evaluated on test and val rows together
fi = predictor.feature_importance(data=test_val_data, num_shuffle_sets=5)


In [None]:
# Number of features whose permutation importance is non-negative
len(fi[fi['importance'] >= 0])

In [None]:
# Features whose importance has p_value below 0.05
fi[fi['p_value'] < 0.05]

In [None]:
# Save the feature-importance table. The destination is built from PROJECT_PATH (set in
# the configuration cell) instead of repeating the absolute project directory here.
fi.to_csv(PROJECT_PATH + "analysisNumericFeatures/resources/cvdWOBCA_feature_importances.csv")

In [None]:
# Show only the rows of the importance table with a p-value below 0.05.
fi.loc[fi['p_value'].lt(0.05)]

In [None]:
# Save the feature-importance table. The destination is built from PROJECT_PATH (set in
# the configuration cell) instead of repeating the absolute project directory here.
# NOTE(review): this writes the current `fi`; re-run the training cell with the intended
# feature set first so the file name matches its contents.
fi.to_csv(PROJECT_PATH + "analysisNumericFeatures/resources/cvd2All_feature_importances_+val.csv")

In [None]:
# Plot the metrics for the run labelled "bca / cardiac".
# NOTE(review): `results` is a notebook-level variable, so this shows whatever the most
# recently executed training cell stored in it - confirm it matches the label above.
_visualize_results(results)

In [None]:
# Plot the metrics for the run labelled "questionnaire".
# NOTE(review): `results` is a notebook-level variable, so this shows whatever the most
# recently executed training cell stored in it - confirm it matches the label above.
_visualize_results(results)

In [None]:
import autogluon.tabular as ag
from autogluon.common import space
import numpy as np
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, roc_auc_score

def _split_with_label(data, split_name):
    """Return the rows of ``data`` in ``split_name`` as a model-ready frame.

    The bookkeeping columns ('split', 'eid', 'event', 'time_to_event') are dropped and
    the 'event' column is re-attached as the target column 'label'.
    """
    subset = data[data['split'] == split_name]
    frame = subset.drop(columns=['split', 'eid', 'event', 'time_to_event']).copy()
    frame['label'] = subset['event']
    return frame


def _hyperparameters_for(model, seed):
    """Build the AutoGluon ``hyperparameters`` dict (search spaces) for one model type.

    The seed is always passed as a fixed value so that only the real hyperparameters
    are searched.

    Raises:
        ValueError: if ``model`` has no search space defined here.
    """
    search_spaces = {
        'RF': {
            'n_estimators': space.Int(100, 1000),
            'max_depth': space.Int(5, 50),
            'min_samples_split': space.Int(2, 10),
            'min_samples_leaf': space.Int(1, 4),
            'random_state': seed,
        },
        'XGB': {
            'n_estimators': space.Int(100, 1000),
            'learning_rate': space.Real(0.01, 0.3),
            'max_depth': space.Int(3, 10),
            'subsample': space.Real(0.5, 1.0),
            'colsample_bytree': space.Real(0.5, 1.0),
            'random_state': seed,
        },
        'NN_TORCH': {
            "num_layers": space.Int(1, 3),
            "hidden_size": space.Int(64, 512),
            "dropout_prob": space.Real(0.0, 0.5),
            "learning_rate": space.Real(0.0001, 0.01),
            "seed_value": seed,
        },
    }
    if model not in search_spaces:
        raise ValueError(f"No search space defined for model {model!r}; expected one of {sorted(search_spaces)}")
    return {model: search_spaces[model]}


def train_and_evaluate(ca, models_to_train=None, time_limit=600, hyperparameter_tune_kwargs='auto'):
    """Tune, fit and score several AutoGluon model types over multiple seeds.

    For every model type and each of the first five seeds in ``ca.RANDOM_SET_SEED`` a
    fresh ``TabularPredictor`` is fitted on the 'train' split with the 'val' split as
    tuning data, then scored on the 'test' split.

    Args:
        ca: Object exposing ``data`` (a DataFrame with 'split', 'eid', 'event' and
            'time_to_event' columns) and ``RANDOM_SET_SEED`` (a sequence of seeds).
        models_to_train: AutoGluon model keys to run. Defaults to
            ``['RF', 'XGB', 'NN_TORCH']``.
        time_limit: Seconds allowed for each individual ``fit`` call (one model type,
            one seed).
        hyperparameter_tune_kwargs: Forwarded to ``TabularPredictor.fit``. It must not
            be None while search spaces are supplied, because AutoGluon documents that
            search spaces error without a tuning strategy.

    Returns:
        A tuple ``(results_combined, predictor)``: ``results_combined[model][metric]``
        holds the ``"mean"`` and ``"std"`` of that metric across seeds, and
        ``predictor`` is the last fitted predictor (None if nothing was trained).
    """
    seeds = ca.RANDOM_SET_SEED[:5]
    if models_to_train is None:
        models_to_train = ['RF', 'XGB', 'NN_TORCH']

    # Prepare the training, test and validation datasets once; they do not depend on seed.
    train_data = _split_with_label(ca.data, 'train')
    test_data = _split_with_label(ca.data, 'test')
    val_data = _split_with_label(ca.data, 'val')

    metric_names = ('accuracy', 'balanced_accuracy', 'f1', 'roc_auc')
    results = {}
    predictor = None  # stays None when there are no models or seeds to iterate over

    for model in models_to_train:
        results[model] = {name: [] for name in metric_names}
        for seed in seeds:
            predictor = ag.TabularPredictor(label='label', eval_metric='f1', path=f'./autogluon/model_{model}_seed_{seed}', verbosity=2, problem_type='binary')

            predictor.fit(
                train_data=train_data,
                tuning_data=val_data,  # validation data used while tuning
                hyperparameters=_hyperparameters_for(model, seed),
                hyperparameter_tune_kwargs=hyperparameter_tune_kwargs,  # enables the search
                time_limit=time_limit,  # seconds for this model/seed fit
                num_bag_folds=0,  # no bagging
                num_stack_levels=0,  # no stacking
                presets='best_quality',
            )

            # Hard predictions feed the threshold-based metrics; the second
            # probability column feeds ROC AUC.
            y_pred = predictor.predict(test_data)
            y_proba = predictor.predict_proba(test_data).to_numpy()[:, 1]
            metrics = {
                "accuracy": accuracy_score(test_data['label'], y_pred),
                "balanced_accuracy": balanced_accuracy_score(test_data['label'], y_pred),
                "f1": f1_score(test_data['label'], y_pred),
                "roc_auc": roc_auc_score(test_data['label'], y_proba),
            }

            # Store the metrics for this model and seed
            for metric, value in metrics.items():
                results[model][metric].append(value)

    # Mean and (population) standard deviation across seeds for each model and metric
    results_combined = {
        model_name: {
            metric: {"mean": np.mean(values), "std": np.std(values)}
            for metric, values in per_metric.items()
        }
        for model_name, per_metric in results.items()
    }

    return results_combined, predictor


# Run the full model/seed sweep on `ca`: `results` receives the per-model mean/std of each
# metric, `predictor` the predictor fitted in the final iteration.
results, predictor = train_and_evaluate(ca)
