In [1]:
import warnings
# Ignore every warning category for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so warnings raised by the model fits
# in later cells are hidden as well - confirm that is intended.
warnings.filterwarnings("ignore")

In [2]:
import sys
import os
# Only applies when the interpreter was started without explicit -W options
# (sys.warnoptions is empty in that case).
if not sys.warnoptions:
    # Ignore all warnings in this process.
    warnings.simplefilter("ignore")
    # Also export a filter through the environment; this one only covers UserWarning.
    # NOTE(review): presumably meant for child processes (e.g. joblib workers) that
    # read PYTHONWARNINGS at start-up - confirm.
    os.environ["PYTHONWARNINGS"] = "ignore::UserWarning"

In [3]:
# Load packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
from collections import Counter, defaultdict
from joblib import Parallel, delayed
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.linear_model import LassoCV, LogisticRegression, LogisticRegressionCV, ElasticNetCV, ElasticNet
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import LeaveOneOut, KFold, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import statistics
import xgboost as xgb

In [None]:
# Read the species-level and pathway-level abundance tables
species_data = pd.read_excel('species_abundance_merged_2024-08-13.xlsx')
pathway_data = pd.read_excel('pathway_abundance_merged_2024-08-21.xlsx')

# Sample metadata columns present in both tables; together they form the join key
merge_columns = ['SampleID', 'Subject', 'Subject_number', 'timepoint_numeric', 'Diagnosis', 'CD_onset', 'Relative_timepoint', 'Country']

# Inner join: keep only the rows whose metadata matches in both tables
combined_data = species_data.merge(pathway_data, on=merge_columns, how='inner')
combined_data

In [5]:
# Remove all samples of the subjects with a CD onset of 12 months
excluded_subjects = [23, 31]
combined_data = combined_data.loc[~combined_data['Subject_number'].isin(excluded_subjects)]

In [None]:
# Function to apply abundance and prevalence thresholds
def filter_features(data, abundance_threshold = 0.001, prevalence_threshold = 0.1, metadata_columns = None):
    """Keep only the feature columns that are abundant in enough samples.

    A feature column is kept when its value is ``>= abundance_threshold`` in at
    least ``int(prevalence_threshold * n_samples)`` rows of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the metadata columns plus one column per feature.
    abundance_threshold : float
        Minimum value for a feature to count as present in a sample.
    prevalence_threshold : float
        Fraction of samples in which the feature must be present. The required
        sample count is truncated with ``int()``; when
        ``prevalence_threshold * n_samples`` is below 1 the requirement is 0
        and every feature passes.
    metadata_columns : list of str, optional
        Columns that are not features. Defaults to the eight sample-description
        columns used throughout this notebook.

    Returns
    -------
    pandas.DataFrame
        The metadata columns (in the order given) followed by the retained
        feature columns. The feature columns come out in the sorted order
        produced by ``Index.difference``, not in their original order.

    Raises
    ------
    KeyError
        If one of ``metadata_columns`` is missing from ``data``.
    """
    if metadata_columns is None:
        metadata_columns = ['SampleID', 'Subject', 'Subject_number', 'timepoint_numeric', 'Diagnosis', 'CD_onset', 'Relative_timepoint', 'Country']
    metadata_columns = list(metadata_columns)

    sample_count = data.shape[0]

    # Minimum number of samples in which a feature has to reach the abundance threshold
    min_prevalent_samples = int(prevalence_threshold * sample_count)

    # Everything that is not metadata is treated as a feature
    features_columns = data.columns.difference(metadata_columns)
    features_data = data[features_columns]

    # Per feature: number of samples at or above the abundance threshold
    samples_above_threshold = (features_data >= abundance_threshold).sum(axis=0)
    features_above_threshold = samples_above_threshold >= min_prevalent_samples
    filtered_features = features_columns[features_above_threshold.to_numpy()]

    # Metadata first, then the retained features
    filtered_data = data[metadata_columns + filtered_features.tolist()]

    # Count the feature columns directly instead of assuming a fixed number of metadata columns
    print(f"Initial number of features: {len(features_columns)}")
    print(f"Number of features after filtering: {len(filtered_features)}")

    return filtered_data

# Filter the merged table with the default thresholds of filter_features
# (abundance >= 0.001 in at least int(0.1 * number_of_samples) samples)
combined_data_filtered = filter_features(combined_data)

In [7]:
# Subject numbers per onset group: early onset (≤ 30 months) vs. late onset (> 30 months)
early_onset_subjects = [9, 20, 27, 29, 30, 35, 10, 21, 22, 36, 5, 13, 18, 25, 34]

late_onset_subjects = [11, 24, 3, 12, 15, 17, 28, 32, 16, 1, 4, 6, 8, 14, 19, 2, 7]

# Per group, keep only that group's subjects and the samples taken before the group's cut-off time point
early_onset_data = combined_data_filtered.loc[
    combined_data_filtered['Subject_number'].isin(early_onset_subjects)
    & combined_data_filtered['timepoint_numeric'].lt(18)
]
late_onset_data = combined_data_filtered.loc[
    combined_data_filtered['Subject_number'].isin(late_onset_subjects)
    & combined_data_filtered['timepoint_numeric'].lt(36)
]

In [None]:
from sklearn.metrics import f1_score

# Predefined candidate features per time point. Each list holds species names
# followed by pathway names; evaluate_models uses the entries as column labels,
# so they have to match the columns of the data frame exactly.

# Candidate features for time point 12
features_timepoint_12 = [
    'Phocaeicola_dorei', 'Bifidobacterium_pseudocatenulatum', 'Roseburia_inulinivorans', 'Bifidobacterium_dentium', 
    'Megasphaera_micronuciformis', 'Roseburia_faecis', 'Bacteroides_stercoris', 'Streptococcus_thermophilus', 
    'P164-PWY: purine nucleobases degradation I (anaerobic)', 'COA-PWY-1: superpathway of coenzyme A biosynthesis III (mammals)', 
    'ASPASN-PWY: superpathway of L-aspartate and L-asparagine biosynthesis', 'PWY-5121: superpathway of geranylgeranyl diphosphate biosynthesis II (via MEP)'
]

# Candidate features for time point 15
features_timepoint_15 = [
    'Intestinibacter_bartlettii', 'Bifidobacterium_pseudocatenulatum', 'Anaerostipes_hadrus', 'Ruminococcus_gnavus', 'Escherichia_coli', 
    'Clostridium_SGB6179', 'Mediterraneibacter_faecis', 'GLYOXYLATE-BYPASS: glyoxylate cycle', 'COMPLETE-ARO-PWY: superpathway of aromatic amino acid biosynthesis', 
    'FASYN-ELONG-PWY: fatty acid elongation -- saturated', 'HEMESYN2-PWY: heme b biosynthesis II (oxygen-independent)', 'CENTFERM-PWY: pyruvate fermentation to butanoate'
]

# Define a function to evaluate combinations of feature selectors and models
def evaluate_models(data, max_timepoint, combinations=None):
    """Find the best selector/model/feature-subset/threshold setup per time point.

    For every time point below ``max_timepoint`` that has a predefined feature
    list (12 and 15 here) the function:

    1. runs the feature selector inside a leave-one-out loop and ranks the
       surviving features by a composite of selection frequency and mean
       absolute logistic-regression coefficient,
    2. evaluates the top-1, top-2, ... feature subsets with the prediction
       model, again under leave-one-out, for 19 decision thresholds between
       0.05 and 0.95,
    3. keeps the setup with the highest mean per-fold macro F1.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'timepoint_numeric', 'Diagnosis' and the predefined
        feature columns. Assumes 'Diagnosis' is a numeric 0/1 label, since it
        is fed to regression models and compared to thresholded outputs -
        TODO confirm against the input tables.
    max_timepoint : number
        Exclusive upper bound on the time points that are processed.
    combinations : iterable of (feature_selector_name, ml_model_name), optional
        Selector 'LASSO' uses LassoCV, any other name ElasticNetCV. Model
        'ElasticNet' uses ElasticNetCV, 'RandomForest' a random forest, any
        other name XGBoost. Defaults to the two ElasticNet-model combinations.

    Returns
    -------
    dict
        ``{time_point: best_setup}`` with keys 'feature_selection_method',
        'ml_model', 'best_features', 'features_length', 'best_threshold' and
        'best_performance'. Time points without a usable result are omitted.

    Notes
    -----
    NOTE(review): every leave-one-out fold holds a single sample, so the
    per-fold macro F1 appears to be 1.0 for a correct prediction and 0.0
    otherwise; the reported score then behaves like LOOCV accuracy - confirm
    this is the intended metric.
    """
    if combinations is None:
        combinations = [('LASSO', 'ElasticNet'), ('ElasticNet', 'ElasticNet')]
    combinations = list(combinations)

    inner_cv_folds = 5  # folds used by LassoCV / ElasticNetCV inside each leave-one-out split
    alphas = np.logspace(-6, -2, 30)
    thresholds = np.linspace(0.05, 0.95, 19)
    features_by_time_point = {12: features_timepoint_12, 15: features_timepoint_15}

    def build_model(ml_model_name):
        """Return a fresh, unfitted prediction model for the given name."""
        if ml_model_name == 'ElasticNet':
            return ElasticNetCV(cv=inner_cv_folds, max_iter=10000, alphas=alphas, l1_ratio=0.7)
        if ml_model_name == 'RandomForest':
            return RandomForestClassifier(n_estimators=300, random_state=42)
        return xgb.XGBClassifier(n_estimators=500, learning_rate=0.1, eval_metric='logloss', random_state=42)

    # Define a function to process each time point independently
    def process_time_point(time_point):
        print(f"Processing time point: {time_point}")

        # Select appropriate features based on the time point
        candidate_features = features_by_time_point.get(time_point)
        if candidate_features is None:
            print(f"No predefined features for time point {time_point}.")
            return time_point, None

        # Filter data for the current time point and extract selected features
        current_data = data[data['timepoint_numeric'] == time_point]
        X = current_data[candidate_features]  # Feature matrix
        y = current_data['Diagnosis']  # Labels

        # Each leave-one-out training set has len(y) - 1 rows and is split again into
        # inner_cv_folds folds, which fails when there are fewer rows than folds.
        if len(y) < inner_cv_folds + 1:
            print(f"Skipping time point {time_point} due to insufficient samples.")
            return time_point, None

        # Leave-one-out splits, materialized once and reused by every loop below
        splits = list(LeaveOneOut().split(X))
        n_splits = len(splits)

        # Define a function to process each feature selector and model combination
        def process_combination(feature_selector_name, ml_model_name):
            # feature -> importances from the folds in which it was selected.
            # Insertion order is deterministic, so ties in the ranking below do not
            # depend on string hashing.
            importances_by_feature = defaultdict(list)

            for train_index, _ in splits:
                X_train = X.iloc[train_index]
                y_train = y.iloc[train_index]

                # Perform feature selection based on the specified selector
                if feature_selector_name == 'LASSO':
                    feature_selector = LassoCV(cv=inner_cv_folds, max_iter=20000, tol=1e-4, alphas=alphas)
                else:
                    feature_selector = ElasticNetCV(cv=inner_cv_folds, max_iter=20000, tol=1e-4, alphas=alphas, l1_ratio=0.7)
                feature_selector.fit(X_train, y_train)

                # Features with non-zero coefficients, capped at 80% of the training size
                # (the cap keeps the first columns in column order)
                fold_features = X_train.columns[feature_selector.coef_ != 0]
                fold_features = fold_features[:int(0.8 * len(y_train))]

                # If no features are selected, skip this fold
                if len(fold_features) == 0:
                    continue

                # Logistic regression on the selected features; |coefficient| is the importance
                logistic = LogisticRegression(max_iter=10000, random_state=42, solver='liblinear')
                logistic.fit(X_train[fold_features], y_train)
                for feature, importance in zip(fold_features, np.abs(logistic.coef_[0])):
                    importances_by_feature[feature].append(importance)

            # If no features were selected across all folds, return None
            if not importances_by_feature:
                return None

            # Composite score: half selection frequency, half share of the mean importance
            avg_importance = {feature: np.mean(values) for feature, values in importances_by_feature.items()}
            total_importance = sum(avg_importance.values())
            composite_scores = {}
            for feature, values in importances_by_feature.items():
                # Guard against a zero total (all coefficients exactly zero)
                importance_share = avg_importance[feature] / total_importance if total_importance > 0 else 0.0
                composite_scores[feature] = 0.5 * (len(values) / n_splits) + 0.5 * importance_share
            sorted_features = sorted(composite_scores.items(), key=lambda item: item[1], reverse=True)
            ranked_names = [name for name, _ in sorted_features]

            # Evaluate the top-k feature subsets with varying thresholds
            best_overall_performance = 0
            best_overall_setup = None
            max_subset_size = min(len(ranked_names), int(0.8 * len(y)))

            for subset_size in range(1, max_subset_size + 1):
                subset = ranked_names[:subset_size]

                # The fitted model does not depend on the threshold, so fit once per fold
                # and only re-binarize the stored output for each threshold.
                fold_outputs = []
                for train_index, test_index in splits:
                    X_train, X_test = X.iloc[train_index][subset], X.iloc[test_index][subset]
                    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

                    model = build_model(ml_model_name)
                    model.fit(X_train, y_train)
                    if ml_model_name == 'ElasticNet':
                        raw_output = model.predict(X_test)
                    else:
                        raw_output = model.predict_proba(X_test)[:, 1]
                    fold_outputs.append((y_test, raw_output))

                best_performance_for_features = 0
                best_threshold_for_features = None

                for threshold in thresholds:
                    fold_f1_scores = [
                        f1_score(y_test, (raw_output >= threshold).astype(int), average='macro')
                        for y_test, raw_output in fold_outputs
                    ]

                    # Calculate the average F1 score for the current threshold
                    f1_score_avg = np.mean(fold_f1_scores)

                    # Record the best performance and threshold for the features
                    if f1_score_avg > best_performance_for_features:
                        best_performance_for_features = f1_score_avg
                        best_threshold_for_features = threshold

                # Update the best overall performance if it improves
                if best_performance_for_features > best_overall_performance:
                    best_overall_performance = best_performance_for_features
                    best_overall_setup = {
                        'features': subset,
                        'threshold': best_threshold_for_features,
                        'performance': best_performance_for_features,
                        'feature_selection_method': feature_selector_name,
                        'ml_model': ml_model_name
                    }

            # No subset scored above zero: report "no result" instead of failing on a missing key
            if best_overall_setup is None:
                return None

            return {
                'feature_selection_method': best_overall_setup['feature_selection_method'],
                'ml_model': best_overall_setup['ml_model'],
                'best_features': best_overall_setup['features'],
                'features_length': len(best_overall_setup['features']),
                'best_threshold': best_overall_setup['threshold'],
                'best_performance': best_overall_setup['performance']
            }

        # Process combinations of feature selection and prediction models
        results_per_combination = [process_combination(fs, ml) for fs, ml in combinations]
        valid_results = [result for result in results_per_combination if result is not None]
        if not valid_results:
            return time_point, None
        best_combination = max(valid_results, key=lambda result: result['best_performance'])

        return time_point, best_combination

    # Process time points within the specified max_timepoint
    time_points = np.sort(data[data['timepoint_numeric'] < max_timepoint]['timepoint_numeric'].unique())
    results_parallel = Parallel(n_jobs=-1)(delayed(process_time_point)(tp) for tp in time_points)

    return {tp: result for tp, result in results_parallel if result is not None}

# Exclusive upper bound on the time points analysed for the early onset group
early_onset_max_timepoint = 18

# Run the evaluation on the early onset samples
print("Evaluating Early Onset Group")
results_early_onset = evaluate_models(early_onset_data, early_onset_max_timepoint)

# Report the winning setup of every time point
print("Results for Early Onset Group:")
for time_point, res in results_early_onset.items():
    if res is None:
        continue
    print(
        f"Time Point: {time_point}",
        f"  Feature Selection Method: {res['feature_selection_method']}",
        f"  Machine Learning Model: {res['ml_model']}",
        f"  Best F1 Score: {res['best_performance']}",
        f"  Best Threshold: {res['best_threshold']}",
        f"  Features Used: {res['best_features']}",
        f"  Features Length: {res['features_length']}",
        "-" * 40,
        sep="\n",
    )

In [None]:
from sklearn.metrics import f1_score

# Predefined candidate features per time point (same entries as in the previous
# cell). Each list holds species names followed by pathway names; evaluate_models
# uses the entries as column labels, so they have to match the data columns exactly.

# Candidate features for time point 12
features_timepoint_12 = [
    'Phocaeicola_dorei', 'Bifidobacterium_pseudocatenulatum', 'Roseburia_inulinivorans', 'Bifidobacterium_dentium', 
    'Megasphaera_micronuciformis', 'Roseburia_faecis', 'Bacteroides_stercoris', 'Streptococcus_thermophilus', 
    'P164-PWY: purine nucleobases degradation I (anaerobic)', 'COA-PWY-1: superpathway of coenzyme A biosynthesis III (mammals)', 
    'ASPASN-PWY: superpathway of L-aspartate and L-asparagine biosynthesis', 'PWY-5121: superpathway of geranylgeranyl diphosphate biosynthesis II (via MEP)'
]

# Candidate features for time point 15
features_timepoint_15 = [
    'Intestinibacter_bartlettii', 'Bifidobacterium_pseudocatenulatum', 'Anaerostipes_hadrus', 'Ruminococcus_gnavus', 'Escherichia_coli', 
    'Clostridium_SGB6179', 'Mediterraneibacter_faecis', 'GLYOXYLATE-BYPASS: glyoxylate cycle', 'COMPLETE-ARO-PWY: superpathway of aromatic amino acid biosynthesis', 
    'FASYN-ELONG-PWY: fatty acid elongation -- saturated', 'HEMESYN2-PWY: heme b biosynthesis II (oxygen-independent)', 'CENTFERM-PWY: pyruvate fermentation to butanoate'
]

# Define a function to evaluate combinations of feature selectors and models
def evaluate_models(data, max_timepoint, combinations=None):
    """Find the best selector/model/feature-subset/threshold setup per time point.

    For every time point below ``max_timepoint`` that has a predefined feature
    list (12 and 15 here) the function:

    1. runs the feature selector inside a leave-one-out loop and ranks the
       surviving features by a composite of selection frequency and mean
       absolute logistic-regression coefficient,
    2. evaluates the top-1, top-2, ... feature subsets with the prediction
       model, again under leave-one-out, for 19 decision thresholds between
       0.05 and 0.95,
    3. keeps the setup with the highest mean per-fold macro F1.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'timepoint_numeric', 'Diagnosis' and the predefined
        feature columns. Assumes 'Diagnosis' is a numeric 0/1 label, since it
        is fed to regression models and compared to thresholded outputs -
        TODO confirm against the input tables.
    max_timepoint : number
        Exclusive upper bound on the time points that are processed.
    combinations : iterable of (feature_selector_name, ml_model_name), optional
        Selector 'LASSO' uses LassoCV, any other name ElasticNetCV. Model
        'ElasticNet' uses ElasticNetCV, 'RandomForest' a random forest, any
        other name XGBoost. Defaults to both selectors crossed with the
        ElasticNet, RandomForest and XGBoost models.

    Returns
    -------
    dict
        ``{time_point: best_setup}`` with keys 'feature_selection_method',
        'ml_model', 'best_features', 'features_length', 'best_threshold' and
        'best_performance'. Time points without a usable result are omitted.

    Notes
    -----
    NOTE(review): every leave-one-out fold holds a single sample, so the
    per-fold macro F1 appears to be 1.0 for a correct prediction and 0.0
    otherwise; the reported score then behaves like LOOCV accuracy - confirm
    this is the intended metric.
    """
    if combinations is None:
        combinations = [('LASSO', 'ElasticNet'), ('ElasticNet', 'ElasticNet'),
                        ('LASSO', 'RandomForest'), ('ElasticNet', 'RandomForest'),
                        ('LASSO', 'XGBoost'), ('ElasticNet', 'XGBoost')]
    combinations = list(combinations)

    inner_cv_folds = 5  # folds used by LassoCV / ElasticNetCV inside each leave-one-out split
    alphas = np.logspace(-6, -2, 30)
    thresholds = np.linspace(0.05, 0.95, 19)
    features_by_time_point = {12: features_timepoint_12, 15: features_timepoint_15}

    def build_model(ml_model_name):
        """Return a fresh, unfitted prediction model for the given name."""
        if ml_model_name == 'ElasticNet':
            return ElasticNetCV(cv=inner_cv_folds, max_iter=10000, alphas=alphas, l1_ratio=0.7)
        if ml_model_name == 'RandomForest':
            return RandomForestClassifier(n_estimators=300, random_state=42)
        return xgb.XGBClassifier(n_estimators=500, learning_rate=0.1, eval_metric='logloss', random_state=42)

    # Define a function to process each time point independently
    def process_time_point(time_point):
        print(f"Processing time point: {time_point}")

        # Select appropriate features based on the time point
        candidate_features = features_by_time_point.get(time_point)
        if candidate_features is None:
            print(f"No predefined features for time point {time_point}.")
            return time_point, None

        # Filter data for the current time point and extract selected features
        current_data = data[data['timepoint_numeric'] == time_point]
        X = current_data[candidate_features]  # Feature matrix
        y = current_data['Diagnosis']  # Labels

        # Each leave-one-out training set has len(y) - 1 rows and is split again into
        # inner_cv_folds folds, which fails when there are fewer rows than folds.
        if len(y) < inner_cv_folds + 1:
            print(f"Skipping time point {time_point} due to insufficient samples.")
            return time_point, None

        # Leave-one-out splits, materialized once and reused by every loop below
        splits = list(LeaveOneOut().split(X))
        n_splits = len(splits)

        # Define a function to process each feature selector and model combination
        def process_combination(feature_selector_name, ml_model_name):
            # feature -> importances from the folds in which it was selected.
            # Insertion order is deterministic, so ties in the ranking below do not
            # depend on string hashing.
            importances_by_feature = defaultdict(list)

            for train_index, _ in splits:
                X_train = X.iloc[train_index]
                y_train = y.iloc[train_index]

                # Perform feature selection based on the specified selector
                if feature_selector_name == 'LASSO':
                    feature_selector = LassoCV(cv=inner_cv_folds, max_iter=20000, tol=1e-4, alphas=alphas)
                else:
                    feature_selector = ElasticNetCV(cv=inner_cv_folds, max_iter=20000, tol=1e-4, alphas=alphas, l1_ratio=0.7)
                feature_selector.fit(X_train, y_train)

                # Features with non-zero coefficients, capped at 80% of the training size
                # (the cap keeps the first columns in column order)
                fold_features = X_train.columns[feature_selector.coef_ != 0]
                fold_features = fold_features[:int(0.8 * len(y_train))]

                # If no features are selected, skip this fold
                if len(fold_features) == 0:
                    continue

                # Logistic regression on the selected features; |coefficient| is the importance
                logistic = LogisticRegression(max_iter=10000, random_state=42, solver='liblinear')
                logistic.fit(X_train[fold_features], y_train)
                for feature, importance in zip(fold_features, np.abs(logistic.coef_[0])):
                    importances_by_feature[feature].append(importance)

            # If no features were selected across all folds, return None
            if not importances_by_feature:
                return None

            # Composite score: half selection frequency, half share of the mean importance
            avg_importance = {feature: np.mean(values) for feature, values in importances_by_feature.items()}
            total_importance = sum(avg_importance.values())
            composite_scores = {}
            for feature, values in importances_by_feature.items():
                # Guard against a zero total (all coefficients exactly zero)
                importance_share = avg_importance[feature] / total_importance if total_importance > 0 else 0.0
                composite_scores[feature] = 0.5 * (len(values) / n_splits) + 0.5 * importance_share
            sorted_features = sorted(composite_scores.items(), key=lambda item: item[1], reverse=True)
            ranked_names = [name for name, _ in sorted_features]

            # Evaluate the top-k feature subsets with varying thresholds
            best_overall_performance = 0
            best_overall_setup = None
            max_subset_size = min(len(ranked_names), int(0.8 * len(y)))

            for subset_size in range(1, max_subset_size + 1):
                subset = ranked_names[:subset_size]

                # The fitted model does not depend on the threshold, so fit once per fold
                # and only re-binarize the stored output for each threshold.
                fold_outputs = []
                for train_index, test_index in splits:
                    X_train, X_test = X.iloc[train_index][subset], X.iloc[test_index][subset]
                    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

                    model = build_model(ml_model_name)
                    model.fit(X_train, y_train)
                    if ml_model_name == 'ElasticNet':
                        raw_output = model.predict(X_test)
                    else:
                        raw_output = model.predict_proba(X_test)[:, 1]
                    fold_outputs.append((y_test, raw_output))

                best_performance_for_features = 0
                best_threshold_for_features = None

                for threshold in thresholds:
                    fold_f1_scores = [
                        f1_score(y_test, (raw_output >= threshold).astype(int), average='macro')
                        for y_test, raw_output in fold_outputs
                    ]

                    # Calculate the average F1 score for the current threshold
                    f1_score_avg = np.mean(fold_f1_scores)

                    # Record the best performance and threshold for the features
                    if f1_score_avg > best_performance_for_features:
                        best_performance_for_features = f1_score_avg
                        best_threshold_for_features = threshold

                # Update the best overall performance if it improves
                if best_performance_for_features > best_overall_performance:
                    best_overall_performance = best_performance_for_features
                    best_overall_setup = {
                        'features': subset,
                        'threshold': best_threshold_for_features,
                        'performance': best_performance_for_features,
                        'feature_selection_method': feature_selector_name,
                        'ml_model': ml_model_name
                    }

            # No subset scored above zero: report "no result" instead of failing on a missing key
            if best_overall_setup is None:
                return None

            return {
                'feature_selection_method': best_overall_setup['feature_selection_method'],
                'ml_model': best_overall_setup['ml_model'],
                'best_features': best_overall_setup['features'],
                'features_length': len(best_overall_setup['features']),
                'best_threshold': best_overall_setup['threshold'],
                'best_performance': best_overall_setup['performance']
            }

        # Process combinations of feature selection and prediction models
        results_per_combination = [process_combination(fs, ml) for fs, ml in combinations]
        valid_results = [result for result in results_per_combination if result is not None]
        if not valid_results:
            return time_point, None
        best_combination = max(valid_results, key=lambda result: result['best_performance'])

        return time_point, best_combination

    # Process time points within the specified max_timepoint
    time_points = np.sort(data[data['timepoint_numeric'] < max_timepoint]['timepoint_numeric'].unique())
    results_parallel = Parallel(n_jobs=-1)(delayed(process_time_point)(tp) for tp in time_points)

    return {tp: result for tp, result in results_parallel if result is not None}

# Exclusive upper bound on the time points analysed for the early onset group
early_onset_max_timepoint = 18

# Run the evaluation on the early onset samples
print("Evaluating Early Onset Group")
results_early_onset = evaluate_models(early_onset_data, early_onset_max_timepoint)

# Report the winning setup of every time point
print("Results for Early Onset Group:")
for time_point, res in results_early_onset.items():
    if res is None:
        continue
    print(
        f"Time Point: {time_point}",
        f"  Feature Selection Method: {res['feature_selection_method']}",
        f"  Machine Learning Model: {res['ml_model']}",
        f"  Best F1 Score: {res['best_performance']}",
        f"  Best Threshold: {res['best_threshold']}",
        f"  Features Used: {res['best_features']}",
        f"  Features Length: {res['features_length']}",
        "-" * 40,
        sep="\n",
    )

In [None]:
from sklearn.metrics import f1_score

# Predefined candidate features per time point. Each list holds species names
# followed by pathway names; the entries are used as column labels, so they have
# to match the data columns exactly. These two assignments replace the shorter
# features_timepoint_12 / features_timepoint_15 lists defined in earlier cells.

# Candidate features for time point 12
features_timepoint_12 = [
    'Veillonella_dispar', 'Lachnospira_pectinoschiza', 'Phocaeicola_dorei', 'Gemmiger_formicilis', 'Bifidobacterium_breve', 'Bifidobacterium_dentium', 
    'Enterococcus_faecium', 'Roseburia_inulinivorans', 'Escherichia_coli', 'Parabacteroides_merdae', 'Enterococcus_avium', 'Bifidobacterium_pseudocatenulatum', 
    'Alistipes_putredinis', 'Faecalibacterium_sp_HTFF', 'Lacticaseibacillus_rhamnosus', 'PWY-5005: biotin biosynthesis II', 'PWY-5838: superpathway of menaquinol-8 biosynthesis I'
]

# Candidate features for time point 15
features_timepoint_15 = [
    'Veillonella_ratti', 'Longicatena_caecimuris', 'Erysipelatoclostridium_ramosum', 'Anaerobutyricum_soehngenii', 'Roseburia_intestinalis', 
    'Bifidobacterium_pseudocatenulatum', 'Sutterella_wadsworthensis', 'Phocaeicola_dorei', 'Roseburia_faecis', 'Bacteroides_uniformis', 'Faecalibacterium_sp_HTFF', 
    'HISDEG-PWY: L-histidine degradation I', 'P23-PWY: reductive TCA cycle I', 'ARGININE-SYN4-PWY: L-ornithine biosynthesis II', 
    'CENTFERM-PWY: pyruvate fermentation to butanoate', 'P621-PWY: nylon-6 oligomer degradation', 'P562-PWY: myo-inositol degradation I', 
    'P461-PWY: hexitol fermentation to lactate, formate, ethanol and acetate', 'POLYISOPRENSYN-PWY: polyisoprenoid biosynthesis (E. coli)', 
    'COLANSYN-PWY: colanic acid building blocks biosynthesis', 'NAGLIPASYN-PWY: lipid IVA biosynthesis (E. coli)', 'LIPA-CORESYN-PWY: lipid A-core biosynthesis (E. coli K-12)', 
    'LPSSYN-PWY: superpathway of lipopolysaccharide biosynthesis', 'HEXITOLDEGSUPER-PWY: superpathway of hexitol degradation (bacteria)', 
    'PROPFERM-PWY: superpathway of L-alanine fermentation (Stickland reaction)', 'P161-PWY: acetylene degradation (anaerobic)', 'GALACTITOLCAT-PWY: galactitol degradation', 
    'GLCMANNANAUT-PWY: superpathway of N-acetylglucosamine, N-acetylmannosamine and N-acetylneuraminate degradation', 
    'PWY-1861: formaldehyde assimilation II (assimilatory RuMP Cycle)', 'METHGLYUT-PWY: superpathway of methylglyoxal degradation', 'CITRULBIO-PWY: L-citrulline biosynthesis', 
    'DENITRIFICATION-PWY: nitrate reduction I (denitrification)'
]
# Candidate features for time point 18 (species names first, then pathway names)
features_timepoint_18 = [
    'Streptococcus_thermophilus', 'Lactococcus_lactis', 'Bifidobacterium_adolescentis', 'GGB51647_SGB4348', 'Fusicatenibacter_saccharivorans', 'Phocaeicola_dorei', 
    'Streptococcus_salivarius', 'Faecalibacterium_sp_HTFF', 'Alistipes_putredinis', 'Anaerobutyricum_hallii', 'Gemmiger_formicilis', 'Bifidobacterium_longum', 
    'Blautia_caecimuris', 'GLUDEG-I-PWY: GABA shunt', 'PWY-5861: superpathway of demethylmenaquinol-8 biosynthesis I', 'P42-PWY: incomplete reductive TCA cycle', 
    'ARG+POLYAMINE-SYN: superpathway of arginine and polyamine biosynthesis', 'DENOVOPURINE2-PWY: superpathway of purine nucleotides de novo biosynthesis II', 
    'PWY-5022: 4-aminobutanoate degradation V', 'P621-PWY: nylon-6 oligomer degradation', 'PWY-561: superpathway of glyoxylate cycle and fatty acid degradation', 
    'PWY-5918: superpathway of heme b biosynthesis from glutamate', 'GLYCOLYSIS-E-D: superpathway of glycolysis and the Entner-Doudoroff pathway', 
    'PWY-5005: biotin biosynthesis II', 'PWY-5994: palmitate biosynthesis (type I fatty acid synthase)', 'PWY-241: C4 photosynthetic carbon assimilation cycle, NADP-ME type', 
    'HEME-BIOSYNTHESIS-II: heme b biosynthesis I (aerobic)', 'CITRULBIO-PWY: L-citrulline biosynthesis', 'PWY-5686: UMP biosynthesis I', 
    'BIOTIN-BIOSYNTHESIS-PWY: biotin biosynthesis I'
]

# Candidate features for time point 21 (species names first, then pathway names)
features_timepoint_21 = [
    'Alistipes_finegoldii', 'Roseburia_intestinalis', 'Clostridiaceae_bacterium', 'Phocaeicola_dorei', 'Faecalimonas_umbilicata', 'GGB3005_SGB3996', 'Bacteroides_stercoris', 
    'Mediterraneibacter_faecis', 'Anaerostipes_hadrus', 'Enterococcus_faecalis', 'P42-PWY: incomplete reductive TCA cycle', 'PWY-5913: partial TCA cycle (obligate autotrophs)', 
    'POLYISOPRENSYN-PWY: polyisoprenoid biosynthesis (E. coli)', 'PWY-6143: CMP-pseudaminate biosynthesis', 'FERMENTATION-PWY: mixed acid fermentation', 
    'DENOVOPURINE2-PWY: superpathway of purine nucleotides de novo biosynthesis II', 
    'GLYCOLYSIS-TCA-GLYOX-BYPASS: superpathway of glycolysis, pyruvate dehydrogenase, TCA, and glyoxylate bypass', 'PWY-1269: CMP-3-deoxy-D-manno-octulosonate biosynthesis'
]
# Candidate features for time point 24 (species names first, then pathway names).
# NOTE(review): the last entry contains the HTML entity '&beta;' - presumably it
# mirrors the column header in the input table; verify before "fixing" it.
features_timepoint_24 = [
    'Alistipes_shahii', 'Blautia_faecis', 'Lachnospira_eligens', 'Blautia_wexlerae', 'Blautia_massiliensis', 'Roseburia_inulinivorans', 'Phocaeicola_dorei', 
    'Parabacteroides_distasonis', 'Bacteroides_thetaiotaomicron', 'Faecalibacterium_prausnitzii', 'Romboutsia_timonensis', 'Clostridium_SGB6179', 'Ruminococcus_bicirculans', 
    'Bifidobacterium_pseudocatenulatum', 'Bacteroides_xylanisolvens', 'Longicatena_caecimuris', 'Roseburia_sp_AF02_12', 'Veillonella_ratti', 
    'P621-PWY: nylon-6 oligomer degradation', 'POLYAMINSYN3-PWY: superpathway of polyamine biosynthesis II', 
    'HCAMHPDEG-PWY: 3-phenylpropanoate and 3-(3-hydroxyphenyl)propanoate degradation to 2-hydroxypentadienoate', 'GLUCOSE1PMETAB-PWY: glucose and glucose-1-phosphate degradation', 
    'PWY-5104: L-isoleucine biosynthesis IV', 'PWY-5188: tetrapyrrole biosynthesis I (from glutamate)', 'PWY-5030: L-histidine degradation III', 
    'GLUCUROCAT-PWY: superpathway of &beta;-D-glucuronosides degradation'
]

# Candidate features for time point 27 (species names first, then pathway names)
features_timepoint_27 = [
    'Dorea_longicatena', 'Lachnospira_eligens', 'Bifidobacterium_bifidum', 'Parabacteroides_merdae', 'Bacteroides_uniformis', 'Blautia_wexlerae', 'Bifidobacterium_adolescentis', 
    'Bacteroides_fragilis', 'Blautia_massiliensis', 'Faecalibacterium_prausnitzii', 'Lachnospira_pectinoschiza', 'Blautia_hansenii', 'Gemmiger_formicilis', 'Alistipes_shahii', 
    'Anaerobutyricum_hallii', 'ORNDEG-PWY: superpathway of ornithine degradation', 'KETOGLUCONMET-PWY: ketogluconate metabolism', 
    'P105-PWY: TCA cycle IV (2-oxoglutarate decarboxylase)', 'P125-PWY: superpathway of (R,R)-butanediol biosynthesis', 'P108-PWY: pyruvate fermentation to propanoate I', 
    'FERMENTATION-PWY: mixed acid fermentation', 'P23-PWY: reductive TCA cycle I', 'GLYOXYLATE-BYPASS: glyoxylate cycle', 'P621-PWY: nylon-6 oligomer degradation', 
    'COBALSYN-PWY: superpathway of adenosylcobalamin salvage from cobinamide I', 'NAGLIPASYN-PWY: lipid IVA biosynthesis (E. coli)', 
    'P4-PWY: superpathway of L-lysine, L-threonine and L-methionine biosynthesis I', 'METH-ACETATE-PWY: methanogenesis from acetate', 
    'HEME-BIOSYNTHESIS-II-1: heme b biosynthesis V (aerobic)', 'MET-SAM-PWY: superpathway of S-adenosyl-L-methionine biosynthesis', 'GALACTARDEG-PWY: D-galactarate degradation I', 
    'GLUCARGALACTSUPER-PWY: superpathway of D-glucarate and D-galactarate degradation', 'PHOSLIPSYN-PWY: superpathway of phospholipid biosynthesis I (bacteria)', 
    'METSYN-PWY: superpathway of L-homoserine and L-methionine biosynthesis', 'GLUCARDEG-PWY: D-glucarate degradation I', 
    'GLYCOLYSIS-E-D: superpathway of glycolysis and the Entner-Doudoroff pathway', 'HOMOSER-METSYN-PWY: L-methionine biosynthesis I', 
    'HEXITOLDEGSUPER-PWY: superpathway of hexitol degradation (bacteria)'
]
# Candidate features for time point 30 (species names first, then pathway names)
features_timepoint_30 = [
    'Anaerobutyricum_soehngenii', 'Parabacteroides_distasonis', 'Dorea_longicatena', 'Flavonifractor_plautii', 'Brotolimicola_acetigignens', 'Phocaeicola_dorei', 
    'Lachnospira_eligens', 'GGB9480_SGB14874', 'Bacteroides_thetaiotaomicron', 'Bacteroides_caccae', 'Eubacterium_rectale', 'Blautia_hansenii', 
    'P42-PWY: incomplete reductive TCA cycle', 'PWY-5497: purine nucleobases degradation II (anaerobic)', 'PPGPPMET-PWY: ppGpp metabolism', 'PWY-5030: L-histidine degradation III', 
    'PWY-5505: L-glutamate and L-glutamine biosynthesis', 'ARGININE-SYN4-PWY: L-ornithine biosynthesis II', 'PWY-1269: CMP-3-deoxy-D-manno-octulosonate biosynthesis', 
    'PWY-5981: CDP-diacylglycerol biosynthesis III', 'DAPLYSINESYN-PWY: L-lysine biosynthesis I', 'POLYISOPRENSYN-PWY: polyisoprenoid biosynthesis (E. coli)', 
    'P124-PWY: Bifidobacterium shunt', 'GLYCOLYSIS: glycolysis I (from glucose 6-phosphate)'
]

# Candidate features for time point 33 (species names first, then pathway names).
# NOTE(review): two entries contain HTML entities ('&beta;', '&alpha;') -
# presumably they mirror the column headers in the input table; verify before
# "fixing" them.
features_timepoint_33 = [
    'Alistipes_putredinis', 'Bifidobacterium_adolescentis', 'GGB51647_SGB4348', 'Bacteroides_uniformis', 'Akkermansia_muciniphila', 'Lachnospira_pectinoschiza', 
    'Faecalibacterium_sp_HTFF', 'Phocaeicola_dorei', 'P164-PWY: purine nucleobases degradation I (anaerobic)', 'PWY-5130: 2-oxobutanoate degradation I', 
    'HEXITOLDEGSUPER-PWY: superpathway of hexitol degradation (bacteria)', 'GLYCOCAT-PWY: glycogen degradation I', 'POLYAMINSYN3-PWY: superpathway of polyamine biosynthesis II', 
    'P125-PWY: superpathway of (R,R)-butanediol biosynthesis', 'PWY-6588: pyruvate fermentation to acetone', 'PWY-5971: palmitate biosynthesis (type II fatty acid synthase)', 
    'PWY-6630: superpathway of L-tyrosine biosynthesis', 'PWY-6892: thiazole component of thiamine diphosphate biosynthesis I', 'PWY-5005: biotin biosynthesis II', 
    'PWY-5136: fatty acid &beta;-oxidation II (plant peroxisome)', 'PRPP-PWY: superpathway of histidine, purine, and pyrimidine biosynthesis', 
    'PWY-6478: GDP-D-glycero-&alpha;-D-manno-heptose biosynthesis', 'PWY-5981: CDP-diacylglycerol biosynthesis III'
]

# Define a function to evaluate combinations of feature selectors and models
def evaluate_models(data, max_timepoint, combinations=None):
    """Find the best feature-selector / model setup for every time point.

    For each distinct ``timepoint_numeric`` value below ``max_timepoint`` the
    predefined ``features_timepoint_*`` list is used as the candidate feature
    set. Inside a leave-one-out loop the features are re-selected (LassoCV or
    ElasticNetCV), ranked by how often they were selected and by the magnitude
    of their logistic-regression coefficient, and growing prefixes of that
    ranking are scored with LOOCV over a grid of decision thresholds.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``timepoint_numeric`` and ``Diagnosis`` plus every
        column named in the predefined feature lists.
        NOTE(review): ``Diagnosis`` is assumed to be coded 0/1 (it is fed to
        regressors and compared with thresholded predictions) — confirm.
    max_timepoint : number
        Exclusive upper bound on the time points that are evaluated.
    combinations : list of (str, str), optional
        ``(feature_selector_name, ml_model_name)`` pairs to try. The selector
        ``'LASSO'`` uses LassoCV, any other name uses ElasticNetCV. The model
        ``'ElasticNet'`` uses ElasticNetCV, ``'RandomForest'`` a random
        forest, any other name XGBoost. Defaults to the two ElasticNet-model
        pairs this function has always used.

    Returns
    -------
    dict
        Maps each time point to a dict with the keys
        ``feature_selection_method``, ``ml_model``, ``best_features``,
        ``features_length``, ``best_threshold`` and ``best_performance``.
        Time points without a usable result are left out.
    """
    if combinations is None:
        combinations = [('LASSO', 'ElasticNet'), ('ElasticNet', 'ElasticNet')]

    # Predefined candidate features for every supported time point
    features_by_time_point = {
        12: features_timepoint_12,
        15: features_timepoint_15,
        18: features_timepoint_18,
        21: features_timepoint_21,
        24: features_timepoint_24,
        27: features_timepoint_27,
        30: features_timepoint_30,
        33: features_timepoint_33,
    }

    # Define a function to process each time point independently
    def process_time_point(time_point):
        print(f"Processing time point: {time_point}")

        # Select appropriate features based on the time point
        selected_features = features_by_time_point.get(time_point)
        if selected_features is None:
            print(f"No predefined features for time point {time_point}.")
            return time_point, None

        # Filter data for the current time point and extract selected features
        current_data = data[data['timepoint_numeric'] == time_point]
        X = current_data[selected_features]  # Feature matrix
        y = current_data['Diagnosis']  # Labels

        # Ensure there are enough samples to perform LOOCV
        if len(y) < 2:
            print(f"Skipping time point {time_point} due to insufficient samples.")
            return time_point, None

        # Initialize Leave-One-Out cross-validation
        loo = LeaveOneOut()
        n_splits = loo.get_n_splits(X)

        # Define a function to process each feature selector and model combination
        def process_combination(feature_selector_name, ml_model_name):
            # feature -> importances from every fold that selected it. Insertion order is
            # deterministic, so ties in the ranking below no longer depend on set ordering.
            importances_by_feature = defaultdict(list)

            # Loop through the training splits generated by LOOCV
            for train_index, _ in loo.split(X):
                X_train = X.iloc[train_index]
                y_train = y.iloc[train_index]

                # Perform feature selection based on the specified selector
                if feature_selector_name == 'LASSO':
                    feature_selector = LassoCV(cv=5, max_iter=20000, tol=1e-4, alphas=np.logspace(-6, -2, 30)).fit(X_train, y_train)
                else:
                    feature_selector = ElasticNetCV(cv=5, max_iter=20000, tol=1e-4, alphas=np.logspace(-6, -2, 30), l1_ratio=0.7).fit(X_train, y_train)

                # Keep features with non-zero coefficients, capped at 80% of the training size
                # (the cap keeps the first columns in frame order, not the largest coefficients)
                fold_features = X_train.columns[feature_selector.coef_ != 0]
                fold_features = fold_features[:int(0.8 * len(y_train))]

                # If no features are selected, skip this iteration
                if len(fold_features) == 0:
                    continue

                # Rank the selected features by absolute logistic-regression coefficient
                logistic = LogisticRegression(max_iter=10000, random_state=42, solver='liblinear').fit(X_train[fold_features], y_train)
                for feature, importance in zip(fold_features, np.abs(logistic.coef_[0])):
                    importances_by_feature[feature].append(importance)

            # If no features were selected across all folds, return None
            if not importances_by_feature:
                return None

            # Aggregate selected features across all folds
            avg_importance = {feature: np.mean(values) for feature, values in importances_by_feature.items()}
            total_importance = sum(avg_importance.values())

            # Composite score: half selection frequency, half share of the total importance
            composite_scores = {}
            for feature, values in importances_by_feature.items():
                frequency_share = len(values) / n_splits
                # Guard against a zero total (all coefficients zero) instead of dividing by it
                importance_share = avg_importance[feature] / total_importance if total_importance > 0 else 0.0
                composite_scores[feature] = 0.5 * frequency_share + 0.5 * importance_share
            sorted_features = sorted(composite_scores.items(), key=lambda item: item[1], reverse=True)

            # Evaluate different feature subsets with varying thresholds
            best_overall_performance = 0
            best_overall_setup = None
            thresholds = np.linspace(0.05, 0.95, 19)

            max_subset_size = min(len(sorted_features), int(0.8 * len(y)))
            for i in range(1, max_subset_size + 1):
                subset = [feature for feature, _ in sorted_features[:i]]

                # Fit one model per fold and keep its raw score. The fit does not depend on the
                # decision threshold, so it is done once here instead of once per threshold.
                fold_truths = []
                fold_scores = []
                for train_index, test_index in loo.split(X):
                    X_train = X.iloc[train_index][subset]
                    X_test = X.iloc[test_index][subset]
                    y_train = y.iloc[train_index]

                    # Use the appropriate model for prediction
                    if ml_model_name == 'ElasticNet':
                        model = ElasticNetCV(cv=5, max_iter=10000, alphas=np.logspace(-6, -2, 30), l1_ratio=0.7)
                    elif ml_model_name == 'RandomForest':
                        model = RandomForestClassifier(n_estimators=300, random_state=42)
                    else:
                        model = xgb.XGBClassifier(n_estimators=500, learning_rate=0.1, eval_metric='logloss', random_state=42)

                    model.fit(X_train, y_train)
                    if ml_model_name == 'ElasticNet':
                        fold_scores.append(model.predict(X_test))
                    else:
                        fold_scores.append(model.predict_proba(X_test)[:, 1])
                    fold_truths.append(y.iloc[test_index])

                best_performance_for_features = 0
                best_threshold_for_features = None

                for threshold in thresholds:
                    # NOTE(review): every LOOCV test fold holds one sample, so this per-fold macro F1
                    # appears to be 1.0 for a correct prediction and 0.0 otherwise, i.e. the average
                    # behaves like LOOCV accuracy — confirm this is the intended metric.
                    f1_score_avg = np.mean([
                        f1_score(y_test, (scores >= threshold).astype(int), average='macro')
                        for y_test, scores in zip(fold_truths, fold_scores)
                    ])

                    # Record the best performance and threshold for the features
                    if f1_score_avg > best_performance_for_features:
                        best_performance_for_features = f1_score_avg
                        best_threshold_for_features = threshold

                # Update the best overall performance if it improves
                if best_performance_for_features > best_overall_performance:
                    best_overall_performance = best_performance_for_features
                    best_overall_setup = {
                        'features': subset,
                        'threshold': best_threshold_for_features,
                        'performance': best_performance_for_features,
                        'feature_selection_method': feature_selector_name,
                        'ml_model': ml_model_name
                    }

            # No subset scored above zero: report "no result" instead of raising KeyError
            if best_overall_setup is None:
                return None

            return {
                'feature_selection_method': best_overall_setup['feature_selection_method'],
                'ml_model': best_overall_setup['ml_model'],
                'best_features': best_overall_setup['features'],
                'features_length': len(best_overall_setup['features']),
                'best_threshold': best_overall_setup['threshold'],
                'best_performance': best_overall_setup['performance']
            }

        # Process combinations of feature selection and prediction models
        results_per_combination = [process_combination(fs, ml) for fs, ml in combinations]
        valid_results = [result for result in results_per_combination if result is not None]
        if not valid_results:
            return time_point, None

        # max() keeps the first of equally good combinations, matching the listing order
        best_combination = max(valid_results, key=lambda result: result['best_performance'])

        return time_point, best_combination

    # Process time points within the specified max_timepoint
    time_points = np.sort(data[data['timepoint_numeric'] < max_timepoint]['timepoint_numeric'].unique())
    results_parallel = Parallel(n_jobs=-1)(delayed(process_time_point)(tp) for tp in time_points)

    return {tp: result for tp, result in results_parallel if result is not None}

# Upper bound (exclusive) on the time points evaluated for the late onset group
late_onset_max_timepoint = 36

# Run the evaluation for the late onset group
print("Evaluating Late Onset Group")
results_late_onset = evaluate_models(late_onset_data, late_onset_max_timepoint)

# Report the winning setup of every time point, one block per time point
print("Results for Late Onset Group:")
for time_point, res in results_late_onset.items():
    if res is None:
        continue
    print(
        f"Time Point: {time_point}",
        f"  Feature Selection Method: {res['feature_selection_method']}",
        f"  Machine Learning Model: {res['ml_model']}",
        f"  Best F1 Score: {res['best_performance']}",
        f"  Best Threshold: {res['best_threshold']}",
        f"  Features Used: {res['best_features']}",
        f"  Features Length: {res['features_length']}",
        "-" * 40,
        sep="\n",
    )

In [None]:
from sklearn.metrics import f1_score

# Define selected features for each time point (12, 15, 18, 21, 24, 27, 30, 33).
# evaluate_models picks the list matching the time point being processed and uses the
# names as column labels on its input frame, so each string must match a column name exactly.
# NOTE(review): a few pathway names contain HTML entities ('&beta;', '&alpha;') — presumably
# copied verbatim from the column headers; confirm they match the data.
features_timepoint_12 = [
    'Veillonella_dispar', 'Lachnospira_pectinoschiza', 'Phocaeicola_dorei', 'Gemmiger_formicilis', 'Bifidobacterium_breve', 'Bifidobacterium_dentium', 
    'Enterococcus_faecium', 'Roseburia_inulinivorans', 'Escherichia_coli', 'Parabacteroides_merdae', 'Enterococcus_avium', 'Bifidobacterium_pseudocatenulatum', 
    'Alistipes_putredinis', 'Faecalibacterium_sp_HTFF', 'Lacticaseibacillus_rhamnosus', 'PWY-5005: biotin biosynthesis II', 'PWY-5838: superpathway of menaquinol-8 biosynthesis I'
]

features_timepoint_15 = [
    'Veillonella_ratti', 'Longicatena_caecimuris', 'Erysipelatoclostridium_ramosum', 'Anaerobutyricum_soehngenii', 'Roseburia_intestinalis', 
    'Bifidobacterium_pseudocatenulatum', 'Sutterella_wadsworthensis', 'Phocaeicola_dorei', 'Roseburia_faecis', 'Bacteroides_uniformis', 'Faecalibacterium_sp_HTFF', 
    'HISDEG-PWY: L-histidine degradation I', 'P23-PWY: reductive TCA cycle I', 'ARGININE-SYN4-PWY: L-ornithine biosynthesis II', 
    'CENTFERM-PWY: pyruvate fermentation to butanoate', 'P621-PWY: nylon-6 oligomer degradation', 'P562-PWY: myo-inositol degradation I', 
    'P461-PWY: hexitol fermentation to lactate, formate, ethanol and acetate', 'POLYISOPRENSYN-PWY: polyisoprenoid biosynthesis (E. coli)', 
    'COLANSYN-PWY: colanic acid building blocks biosynthesis', 'NAGLIPASYN-PWY: lipid IVA biosynthesis (E. coli)', 'LIPA-CORESYN-PWY: lipid A-core biosynthesis (E. coli K-12)', 
    'LPSSYN-PWY: superpathway of lipopolysaccharide biosynthesis', 'HEXITOLDEGSUPER-PWY: superpathway of hexitol degradation (bacteria)', 
    'PROPFERM-PWY: superpathway of L-alanine fermentation (Stickland reaction)', 'P161-PWY: acetylene degradation (anaerobic)', 'GALACTITOLCAT-PWY: galactitol degradation', 
    'GLCMANNANAUT-PWY: superpathway of N-acetylglucosamine, N-acetylmannosamine and N-acetylneuraminate degradation', 
    'PWY-1861: formaldehyde assimilation II (assimilatory RuMP Cycle)', 'METHGLYUT-PWY: superpathway of methylglyoxal degradation', 'CITRULBIO-PWY: L-citrulline biosynthesis', 
    'DENITRIFICATION-PWY: nitrate reduction I (denitrification)'
]
features_timepoint_18 = [
    'Streptococcus_thermophilus', 'Lactococcus_lactis', 'Bifidobacterium_adolescentis', 'GGB51647_SGB4348', 'Fusicatenibacter_saccharivorans', 'Phocaeicola_dorei', 
    'Streptococcus_salivarius', 'Faecalibacterium_sp_HTFF', 'Alistipes_putredinis', 'Anaerobutyricum_hallii', 'Gemmiger_formicilis', 'Bifidobacterium_longum', 
    'Blautia_caecimuris', 'GLUDEG-I-PWY: GABA shunt', 'PWY-5861: superpathway of demethylmenaquinol-8 biosynthesis I', 'P42-PWY: incomplete reductive TCA cycle', 
    'ARG+POLYAMINE-SYN: superpathway of arginine and polyamine biosynthesis', 'DENOVOPURINE2-PWY: superpathway of purine nucleotides de novo biosynthesis II', 
    'PWY-5022: 4-aminobutanoate degradation V', 'P621-PWY: nylon-6 oligomer degradation', 'PWY-561: superpathway of glyoxylate cycle and fatty acid degradation', 
    'PWY-5918: superpathway of heme b biosynthesis from glutamate', 'GLYCOLYSIS-E-D: superpathway of glycolysis and the Entner-Doudoroff pathway', 
    'PWY-5005: biotin biosynthesis II', 'PWY-5994: palmitate biosynthesis (type I fatty acid synthase)', 'PWY-241: C4 photosynthetic carbon assimilation cycle, NADP-ME type', 
    'HEME-BIOSYNTHESIS-II: heme b biosynthesis I (aerobic)', 'CITRULBIO-PWY: L-citrulline biosynthesis', 'PWY-5686: UMP biosynthesis I', 
    'BIOTIN-BIOSYNTHESIS-PWY: biotin biosynthesis I'
]

features_timepoint_21 = [
    'Alistipes_finegoldii', 'Roseburia_intestinalis', 'Clostridiaceae_bacterium', 'Phocaeicola_dorei', 'Faecalimonas_umbilicata', 'GGB3005_SGB3996', 'Bacteroides_stercoris', 
    'Mediterraneibacter_faecis', 'Anaerostipes_hadrus', 'Enterococcus_faecalis', 'P42-PWY: incomplete reductive TCA cycle', 'PWY-5913: partial TCA cycle (obligate autotrophs)', 
    'POLYISOPRENSYN-PWY: polyisoprenoid biosynthesis (E. coli)', 'PWY-6143: CMP-pseudaminate biosynthesis', 'FERMENTATION-PWY: mixed acid fermentation', 
    'DENOVOPURINE2-PWY: superpathway of purine nucleotides de novo biosynthesis II', 
    'GLYCOLYSIS-TCA-GLYOX-BYPASS: superpathway of glycolysis, pyruvate dehydrogenase, TCA, and glyoxylate bypass', 'PWY-1269: CMP-3-deoxy-D-manno-octulosonate biosynthesis'
]
features_timepoint_24 = [
    'Alistipes_shahii', 'Blautia_faecis', 'Lachnospira_eligens', 'Blautia_wexlerae', 'Blautia_massiliensis', 'Roseburia_inulinivorans', 'Phocaeicola_dorei', 
    'Parabacteroides_distasonis', 'Bacteroides_thetaiotaomicron', 'Faecalibacterium_prausnitzii', 'Romboutsia_timonensis', 'Clostridium_SGB6179', 'Ruminococcus_bicirculans', 
    'Bifidobacterium_pseudocatenulatum', 'Bacteroides_xylanisolvens', 'Longicatena_caecimuris', 'Roseburia_sp_AF02_12', 'Veillonella_ratti', 
    'P621-PWY: nylon-6 oligomer degradation', 'POLYAMINSYN3-PWY: superpathway of polyamine biosynthesis II', 
    'HCAMHPDEG-PWY: 3-phenylpropanoate and 3-(3-hydroxyphenyl)propanoate degradation to 2-hydroxypentadienoate', 'GLUCOSE1PMETAB-PWY: glucose and glucose-1-phosphate degradation', 
    'PWY-5104: L-isoleucine biosynthesis IV', 'PWY-5188: tetrapyrrole biosynthesis I (from glutamate)', 'PWY-5030: L-histidine degradation III', 
    'GLUCUROCAT-PWY: superpathway of &beta;-D-glucuronosides degradation'
]

features_timepoint_27 = [
    'Dorea_longicatena', 'Lachnospira_eligens', 'Bifidobacterium_bifidum', 'Parabacteroides_merdae', 'Bacteroides_uniformis', 'Blautia_wexlerae', 'Bifidobacterium_adolescentis', 
    'Bacteroides_fragilis', 'Blautia_massiliensis', 'Faecalibacterium_prausnitzii', 'Lachnospira_pectinoschiza', 'Blautia_hansenii', 'Gemmiger_formicilis', 'Alistipes_shahii', 
    'Anaerobutyricum_hallii', 'ORNDEG-PWY: superpathway of ornithine degradation', 'KETOGLUCONMET-PWY: ketogluconate metabolism', 
    'P105-PWY: TCA cycle IV (2-oxoglutarate decarboxylase)', 'P125-PWY: superpathway of (R,R)-butanediol biosynthesis', 'P108-PWY: pyruvate fermentation to propanoate I', 
    'FERMENTATION-PWY: mixed acid fermentation', 'P23-PWY: reductive TCA cycle I', 'GLYOXYLATE-BYPASS: glyoxylate cycle', 'P621-PWY: nylon-6 oligomer degradation', 
    'COBALSYN-PWY: superpathway of adenosylcobalamin salvage from cobinamide I', 'NAGLIPASYN-PWY: lipid IVA biosynthesis (E. coli)', 
    'P4-PWY: superpathway of L-lysine, L-threonine and L-methionine biosynthesis I', 'METH-ACETATE-PWY: methanogenesis from acetate', 
    'HEME-BIOSYNTHESIS-II-1: heme b biosynthesis V (aerobic)', 'MET-SAM-PWY: superpathway of S-adenosyl-L-methionine biosynthesis', 'GALACTARDEG-PWY: D-galactarate degradation I', 
    'GLUCARGALACTSUPER-PWY: superpathway of D-glucarate and D-galactarate degradation', 'PHOSLIPSYN-PWY: superpathway of phospholipid biosynthesis I (bacteria)', 
    'METSYN-PWY: superpathway of L-homoserine and L-methionine biosynthesis', 'GLUCARDEG-PWY: D-glucarate degradation I', 
    'GLYCOLYSIS-E-D: superpathway of glycolysis and the Entner-Doudoroff pathway', 'HOMOSER-METSYN-PWY: L-methionine biosynthesis I', 
    'HEXITOLDEGSUPER-PWY: superpathway of hexitol degradation (bacteria)'
]
features_timepoint_30 = [
    'Anaerobutyricum_soehngenii', 'Parabacteroides_distasonis', 'Dorea_longicatena', 'Flavonifractor_plautii', 'Brotolimicola_acetigignens', 'Phocaeicola_dorei', 
    'Lachnospira_eligens', 'GGB9480_SGB14874', 'Bacteroides_thetaiotaomicron', 'Bacteroides_caccae', 'Eubacterium_rectale', 'Blautia_hansenii', 
    'P42-PWY: incomplete reductive TCA cycle', 'PWY-5497: purine nucleobases degradation II (anaerobic)', 'PPGPPMET-PWY: ppGpp metabolism', 'PWY-5030: L-histidine degradation III', 
    'PWY-5505: L-glutamate and L-glutamine biosynthesis', 'ARGININE-SYN4-PWY: L-ornithine biosynthesis II', 'PWY-1269: CMP-3-deoxy-D-manno-octulosonate biosynthesis', 
    'PWY-5981: CDP-diacylglycerol biosynthesis III', 'DAPLYSINESYN-PWY: L-lysine biosynthesis I', 'POLYISOPRENSYN-PWY: polyisoprenoid biosynthesis (E. coli)', 
    'P124-PWY: Bifidobacterium shunt', 'GLYCOLYSIS: glycolysis I (from glucose 6-phosphate)'
]

features_timepoint_33 = [
    'Alistipes_putredinis', 'Bifidobacterium_adolescentis', 'GGB51647_SGB4348', 'Bacteroides_uniformis', 'Akkermansia_muciniphila', 'Lachnospira_pectinoschiza', 
    'Faecalibacterium_sp_HTFF', 'Phocaeicola_dorei', 'P164-PWY: purine nucleobases degradation I (anaerobic)', 'PWY-5130: 2-oxobutanoate degradation I', 
    'HEXITOLDEGSUPER-PWY: superpathway of hexitol degradation (bacteria)', 'GLYCOCAT-PWY: glycogen degradation I', 'POLYAMINSYN3-PWY: superpathway of polyamine biosynthesis II', 
    'P125-PWY: superpathway of (R,R)-butanediol biosynthesis', 'PWY-6588: pyruvate fermentation to acetone', 'PWY-5971: palmitate biosynthesis (type II fatty acid synthase)', 
    'PWY-6630: superpathway of L-tyrosine biosynthesis', 'PWY-6892: thiazole component of thiamine diphosphate biosynthesis I', 'PWY-5005: biotin biosynthesis II', 
    'PWY-5136: fatty acid &beta;-oxidation II (plant peroxisome)', 'PRPP-PWY: superpathway of histidine, purine, and pyrimidine biosynthesis', 
    'PWY-6478: GDP-D-glycero-&alpha;-D-manno-heptose biosynthesis', 'PWY-5981: CDP-diacylglycerol biosynthesis III'
]

# Define a function to evaluate combinations of feature selectors and models
def evaluate_models(data, max_timepoint, combinations=None):
    """Find the best feature-selector / model setup for every time point.

    For each distinct ``timepoint_numeric`` value below ``max_timepoint`` the
    predefined ``features_timepoint_*`` list is used as the candidate feature
    set. Inside a leave-one-out loop the features are re-selected (LassoCV or
    ElasticNetCV), ranked by how often they were selected and by the magnitude
    of their logistic-regression coefficient, and growing prefixes of that
    ranking are scored with LOOCV over a grid of decision thresholds.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``timepoint_numeric`` and ``Diagnosis`` plus every
        column named in the predefined feature lists.
        NOTE(review): ``Diagnosis`` is assumed to be coded 0/1 (it is fed to
        regressors and compared with thresholded predictions) — confirm.
    max_timepoint : number
        Exclusive upper bound on the time points that are evaluated.
    combinations : list of (str, str), optional
        ``(feature_selector_name, ml_model_name)`` pairs to try. The selector
        ``'LASSO'`` uses LassoCV, any other name uses ElasticNetCV. The model
        ``'ElasticNet'`` uses ElasticNetCV, ``'RandomForest'`` a random
        forest, any other name XGBoost. Defaults to all six selector/model
        pairs this function has always used.

    Returns
    -------
    dict
        Maps each time point to a dict with the keys
        ``feature_selection_method``, ``ml_model``, ``best_features``,
        ``features_length``, ``best_threshold`` and ``best_performance``.
        Time points without a usable result are left out.
    """
    if combinations is None:
        combinations = [('LASSO', 'ElasticNet'), ('ElasticNet', 'ElasticNet'),
                        ('LASSO', 'RandomForest'), ('ElasticNet', 'RandomForest'),
                        ('LASSO', 'XGBoost'), ('ElasticNet', 'XGBoost')]

    # Predefined candidate features for every supported time point
    features_by_time_point = {
        12: features_timepoint_12,
        15: features_timepoint_15,
        18: features_timepoint_18,
        21: features_timepoint_21,
        24: features_timepoint_24,
        27: features_timepoint_27,
        30: features_timepoint_30,
        33: features_timepoint_33,
    }

    # Define a function to process each time point independently
    def process_time_point(time_point):
        print(f"Processing time point: {time_point}")

        # Select appropriate features based on the time point
        selected_features = features_by_time_point.get(time_point)
        if selected_features is None:
            print(f"No predefined features for time point {time_point}.")
            return time_point, None

        # Filter data for the current time point and extract selected features
        current_data = data[data['timepoint_numeric'] == time_point]
        X = current_data[selected_features]  # Feature matrix
        y = current_data['Diagnosis']  # Labels

        # Ensure there are enough samples to perform LOOCV
        if len(y) < 2:
            print(f"Skipping time point {time_point} due to insufficient samples.")
            return time_point, None

        # Initialize Leave-One-Out cross-validation
        loo = LeaveOneOut()
        n_splits = loo.get_n_splits(X)

        # Define a function to process each feature selector and model combination
        def process_combination(feature_selector_name, ml_model_name):
            # feature -> importances from every fold that selected it. Insertion order is
            # deterministic, so ties in the ranking below no longer depend on set ordering.
            importances_by_feature = defaultdict(list)

            # Loop through the training splits generated by LOOCV
            for train_index, _ in loo.split(X):
                X_train = X.iloc[train_index]
                y_train = y.iloc[train_index]

                # Perform feature selection based on the specified selector
                if feature_selector_name == 'LASSO':
                    feature_selector = LassoCV(cv=5, max_iter=20000, tol=1e-4, alphas=np.logspace(-6, -2, 30)).fit(X_train, y_train)
                else:
                    feature_selector = ElasticNetCV(cv=5, max_iter=20000, tol=1e-4, alphas=np.logspace(-6, -2, 30), l1_ratio=0.7).fit(X_train, y_train)

                # Keep features with non-zero coefficients, capped at 80% of the training size
                # (the cap keeps the first columns in frame order, not the largest coefficients)
                fold_features = X_train.columns[feature_selector.coef_ != 0]
                fold_features = fold_features[:int(0.8 * len(y_train))]

                # If no features are selected, skip this iteration
                if len(fold_features) == 0:
                    continue

                # Rank the selected features by absolute logistic-regression coefficient
                logistic = LogisticRegression(max_iter=10000, random_state=42, solver='liblinear').fit(X_train[fold_features], y_train)
                for feature, importance in zip(fold_features, np.abs(logistic.coef_[0])):
                    importances_by_feature[feature].append(importance)

            # If no features were selected across all folds, return None
            if not importances_by_feature:
                return None

            # Aggregate selected features across all folds
            avg_importance = {feature: np.mean(values) for feature, values in importances_by_feature.items()}
            total_importance = sum(avg_importance.values())

            # Composite score: half selection frequency, half share of the total importance
            composite_scores = {}
            for feature, values in importances_by_feature.items():
                frequency_share = len(values) / n_splits
                # Guard against a zero total (all coefficients zero) instead of dividing by it
                importance_share = avg_importance[feature] / total_importance if total_importance > 0 else 0.0
                composite_scores[feature] = 0.5 * frequency_share + 0.5 * importance_share
            sorted_features = sorted(composite_scores.items(), key=lambda item: item[1], reverse=True)

            # Evaluate different feature subsets with varying thresholds
            best_overall_performance = 0
            best_overall_setup = None
            thresholds = np.linspace(0.05, 0.95, 19)

            max_subset_size = min(len(sorted_features), int(0.8 * len(y)))
            for i in range(1, max_subset_size + 1):
                subset = [feature for feature, _ in sorted_features[:i]]

                # Fit one model per fold and keep its raw score. The fit does not depend on the
                # decision threshold, so it is done once here instead of once per threshold.
                fold_truths = []
                fold_scores = []
                for train_index, test_index in loo.split(X):
                    X_train = X.iloc[train_index][subset]
                    X_test = X.iloc[test_index][subset]
                    y_train = y.iloc[train_index]

                    # Use the appropriate model for prediction
                    if ml_model_name == 'ElasticNet':
                        model = ElasticNetCV(cv=5, max_iter=10000, alphas=np.logspace(-6, -2, 30), l1_ratio=0.7)
                    elif ml_model_name == 'RandomForest':
                        model = RandomForestClassifier(n_estimators=300, random_state=42)
                    else:
                        model = xgb.XGBClassifier(n_estimators=500, learning_rate=0.1, eval_metric='logloss', random_state=42)

                    model.fit(X_train, y_train)
                    if ml_model_name == 'ElasticNet':
                        fold_scores.append(model.predict(X_test))
                    else:
                        fold_scores.append(model.predict_proba(X_test)[:, 1])
                    fold_truths.append(y.iloc[test_index])

                best_performance_for_features = 0
                best_threshold_for_features = None

                for threshold in thresholds:
                    # NOTE(review): every LOOCV test fold holds one sample, so this per-fold macro F1
                    # appears to be 1.0 for a correct prediction and 0.0 otherwise, i.e. the average
                    # behaves like LOOCV accuracy — confirm this is the intended metric.
                    f1_score_avg = np.mean([
                        f1_score(y_test, (scores >= threshold).astype(int), average='macro')
                        for y_test, scores in zip(fold_truths, fold_scores)
                    ])

                    # Record the best performance and threshold for the features
                    if f1_score_avg > best_performance_for_features:
                        best_performance_for_features = f1_score_avg
                        best_threshold_for_features = threshold

                # Update the best overall performance if it improves
                if best_performance_for_features > best_overall_performance:
                    best_overall_performance = best_performance_for_features
                    best_overall_setup = {
                        'features': subset,
                        'threshold': best_threshold_for_features,
                        'performance': best_performance_for_features,
                        'feature_selection_method': feature_selector_name,
                        'ml_model': ml_model_name
                    }

            # No subset scored above zero: report "no result" instead of raising KeyError
            if best_overall_setup is None:
                return None

            return {
                'feature_selection_method': best_overall_setup['feature_selection_method'],
                'ml_model': best_overall_setup['ml_model'],
                'best_features': best_overall_setup['features'],
                'features_length': len(best_overall_setup['features']),
                'best_threshold': best_overall_setup['threshold'],
                'best_performance': best_overall_setup['performance']
            }

        # Process combinations of feature selection and prediction models
        results_per_combination = [process_combination(fs, ml) for fs, ml in combinations]
        valid_results = [result for result in results_per_combination if result is not None]
        if not valid_results:
            return time_point, None

        # max() keeps the first of equally good combinations, matching the listing order
        best_combination = max(valid_results, key=lambda result: result['best_performance'])

        return time_point, best_combination

    # Process time points within the specified max_timepoint
    time_points = np.sort(data[data['timepoint_numeric'] < max_timepoint]['timepoint_numeric'].unique())
    results_parallel = Parallel(n_jobs=-1)(delayed(process_time_point)(tp) for tp in time_points)

    return {tp: result for tp, result in results_parallel if result is not None}

# Upper bound (exclusive) on the time points evaluated for the late onset group
late_onset_max_timepoint = 36

# Run the evaluation for the late onset group
print("Evaluating Late Onset Group")
results_late_onset = evaluate_models(late_onset_data, late_onset_max_timepoint)

# Report the winning setup of every time point, one block per time point
print("Results for Late Onset Group:")
for time_point, res in results_late_onset.items():
    if res is None:
        continue
    print(
        f"Time Point: {time_point}",
        f"  Feature Selection Method: {res['feature_selection_method']}",
        f"  Machine Learning Model: {res['ml_model']}",
        f"  Best F1 Score: {res['best_performance']}",
        f"  Best Threshold: {res['best_threshold']}",
        f"  Features Used: {res['best_features']}",
        f"  Features Length: {res['features_length']}",
        "-" * 40,
        sep="\n",
    )

In [None]:
# Line chart of the F1 score at each time point (early onset group)
# NOTE(review): the scores are typed in by hand rather than read from the evaluation
# results — presumably percentages transcribed from an earlier run; confirm they are current.
sorted_features_time_points = [12, 15]
sorted_features_f1_scores = [90, 87]

# Blue line with large round markers on an 8x4 inch canvas
plt.figure(figsize=(8, 4))
plt.plot(
    sorted_features_time_points,
    sorted_features_f1_scores,
    color='blue',
    linestyle='-',
    marker='o',
    markersize=10,
)

# Write each score one unit above its marker
for age, score in zip(sorted_features_time_points, sorted_features_f1_scores):
    plt.text(age, score + 1, f"{score}", fontsize=14, ha='center', va='bottom')

# Axis labels and title
plt.xlabel('Age (Months)', fontsize=14)
plt.ylabel('Average F1 Score', fontsize=14)
plt.title('F1 Score by Age for Early Onset Combined Species and Pathways Abundance Data', fontsize=14, color='blue', pad=20)

# One x tick per time point; y axis fixed to 0-100
plt.xticks(sorted_features_time_points, fontsize=12)
plt.ylim(0, 100)
plt.yticks(fontsize=12)

# Render with a tight layout
plt.tight_layout()
plt.show()

In [None]:
# Line chart of the F1 score at each time point (late onset group)
# NOTE(review): the scores are typed in by hand rather than read from the evaluation
# results — presumably percentages transcribed from an earlier run; confirm they are current.
sorted_features_time_points = [12, 15, 18, 21, 24, 27, 30, 33]
sorted_features_f1_scores = [97, 91, 94, 90, 94, 85, 85, 93]

# Blue line with large round markers on an 8x4 inch canvas
plt.figure(figsize=(8, 4))
plt.plot(
    sorted_features_time_points,
    sorted_features_f1_scores,
    color='blue',
    linestyle='-',
    marker='o',
    markersize=10,
)

# Write each score one unit above its marker
for age, score in zip(sorted_features_time_points, sorted_features_f1_scores):
    plt.text(age, score + 1, f"{score}", fontsize=14, ha='center', va='bottom')

# Axis labels and title
plt.xlabel('Age (Months)', fontsize=14)
plt.ylabel('Average F1 Score', fontsize=14)
plt.title('F1 Score by Age for Late Onset Combined Species and Pathways Abundance Data', fontsize=14, color='blue', pad=20)

# One x tick per time point; y axis fixed to 0-100
plt.xticks(sorted_features_time_points, fontsize=12)
plt.ylim(0, 100)
plt.yticks(fontsize=12)

# Render with a tight layout
plt.tight_layout()
plt.show()