In [1]:
import warnings
# Install a blanket "ignore" filter: from here on no warning of any category
# is shown in this kernel.
# NOTE(review): this also hides scikit-learn convergence / undefined-metric
# warnings raised by the modelling cells below - confirm that is intended.
warnings.filterwarnings("ignore")

In [2]:
import sys
import os
# Only touch the warning configuration when the interpreter was started
# without explicit -W options (sys.warnoptions is empty in that case).
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    # PYTHONWARNINGS is read by a Python interpreter at start-up, so this only
    # affects processes launched later that inherit this environment.
    # NOTE(review): presumably aimed at the joblib worker processes used
    # further down - confirm.
    os.environ["PYTHONWARNINGS"] = "ignore::UserWarning"

In [3]:
# Load packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
from collections import Counter, defaultdict
from joblib import Parallel, delayed
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.linear_model import LassoCV, LogisticRegression, LogisticRegressionCV, ElasticNetCV, ElasticNet
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import LeaveOneOut, KFold, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import statistics
import xgboost as xgb

In [None]:
# Load dataset
# The workbook path is relative to the notebook's working directory.
file_path = 'pathway_abundance_merged_2024-08-21.xlsx'
new_data = pd.read_excel(file_path)
# Bare expression so the loaded frame is rendered as the cell output.
new_data

In [5]:
# Drop the subjects whose CD onset is at 12 months
excluded_subjects = [23, 31]
new_data = new_data.loc[
    ~new_data['Subject_number'].isin(excluded_subjects)
]

In [None]:
# Function to apply abundance and prevalence thresholds
def filter_pathways(data, abundance_threshold = 0.001, prevalence_threshold = 0.1):
    """Keep only the pathway columns that are abundant in enough samples.

    Every column that is not one of the eight metadata columns listed in
    ``metadata_columns`` is treated as a pathway. A pathway is retained when
    the fraction of rows whose value is ``>= abundance_threshold`` is at
    least ``prevalence_threshold``.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per sample; must contain all eight metadata columns.
    abundance_threshold : float, default 0.001
        Minimum value for a pathway to count as present in a sample.
    prevalence_threshold : float, default 0.1
        Minimum fraction of samples in which the pathway must be present.

    Returns
    -------
    pandas.DataFrame
        The metadata columns followed by the retained pathway columns
        (pathways come out in sorted order because ``Index.difference``
        sorts its result).

    Raises
    ------
    KeyError
        If any of the metadata columns is missing from ``data``.
    """
    metadata_columns = ['SampleID', 'Subject', 'Subject_number', 'timepoint_numeric',
                        'Diagnosis', 'CD_onset', 'Relative_timepoint', 'Country']

    pathways_columns = data.columns.difference(metadata_columns)
    pathways_data = data[pathways_columns]
    sample_count = data.shape[0]

    # Number of samples in which each pathway reaches the abundance threshold
    samples_above_abundance = (pathways_data >= abundance_threshold).sum(axis=0)

    # Compare the observed fraction with the requested fraction. Truncating
    # ``prevalence_threshold * sample_count`` to an int would let pathways
    # below the requested prevalence through (and, with fewer than
    # 1 / prevalence_threshold samples, even pathways present nowhere).
    # Dividing the integer count keeps exact cases such as 3 / 30 == 0.1
    # on the passing side.
    prevalence = samples_above_abundance / sample_count
    keep_mask = (prevalence >= prevalence_threshold).to_numpy(dtype=bool)
    filtered_pathways = pathways_columns[keep_mask]

    # Filter the data to keep only the selected pathways
    filtered_data = data[metadata_columns + filtered_pathways.tolist()]

    # Count the pathway columns actually found instead of assuming that
    # exactly eight columns are metadata.
    print(f"Initial number of pathways: {len(pathways_columns)}")
    print(f"Number of pathways after filtering: {len(filtered_pathways)}")

    return filtered_data

# Filter the data with the function's default thresholds
# (abundance_threshold = 0.001, prevalence_threshold = 0.1)
new_data_filtered = filter_pathways(new_data)

In [7]:
# Split subjects by onset: early (≤ 30 months) versus late (> 30 months)
early_onset_subjects = [9, 20, 27, 29, 30, 35, 10, 21, 22, 36, 5, 13, 18, 25, 34]

late_onset_subjects = [11, 24, 3, 12, 15, 17, 28, 32, 16, 1, 4, 6, 8, 14, 19, 2, 7]

# Rows of each group, restricted to time points below the group's cut-off
# (18 for early onset, 36 for late onset)
early_onset_data = new_data_filtered.loc[
    new_data_filtered['Subject_number'].isin(early_onset_subjects)
    & new_data_filtered['timepoint_numeric'].lt(18)
]
late_onset_data = new_data_filtered.loc[
    new_data_filtered['Subject_number'].isin(late_onset_subjects)
    & new_data_filtered['timepoint_numeric'].lt(36)
]

In [None]:
from sklearn.metrics import f1_score

# Define a function to evaluate combinations of feature selectors and models
def evaluate_models(data, max_timepoint):
    """Find the best feature-selector / classifier setup for each time point.

    Every distinct ``timepoint_numeric`` value below ``max_timepoint`` is
    processed independently (in parallel through joblib). For each time
    point the features are first reduced with ``SelectKBest``; then, for each
    (feature selector, model) combination, features are selected inside a
    leave-one-out loop, ranked by a composite of selection frequency and
    logistic-regression coefficient size, and growing prefixes of that
    ranking are scored for every decision threshold.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the eight columns named in ``metadata_columns``; every
        other column is used as a feature. ``Diagnosis`` is the label and is
        assumed to be coded 0/1, because the regressors are fitted on it
        directly and thresholded predictions are cast to int - TODO confirm.
    max_timepoint : number
        Exclusive upper bound on ``timepoint_numeric``.

    Returns
    -------
    dict
        ``{time_point: setup}`` for each time point where some setup scored
        above zero. ``setup`` has the keys ``feature_selection_method``,
        ``ml_model``, ``best_features``, ``features_length``,
        ``best_threshold`` and ``best_performance``.
    """
    metadata_columns = ['SampleID', 'Subject', 'Subject_number', 'timepoint_numeric',
                        'Diagnosis', 'CD_onset', 'Relative_timepoint', 'Country']
    inner_cv_folds = 5                        # folds used by LassoCV / ElasticNetCV
    alphas = np.logspace(-6, -2, 30)
    thresholds = np.linspace(0.05, 0.95, 19)

    # Define a function to process each time point independently
    def process_time_point(time_point):
        print(f"Processing time point: {time_point}")

        # Filter data for the current time point and split features / labels
        current_data = data[data['timepoint_numeric'] == time_point]
        X = current_data.drop(metadata_columns, axis=1)  # Feature matrix
        y = current_data['Diagnosis']  # Labels
        n_samples = len(y)

        # Each LOOCV training fold holds n_samples - 1 rows and is itself split
        # into `inner_cv_folds` parts by LassoCV / ElasticNetCV, so fewer than
        # inner_cv_folds + 1 samples would raise inside scikit-learn and abort
        # the whole parallel run. A single-class time point cannot be scored.
        if n_samples < inner_cv_folds + 1 or y.nunique() < 2:
            print(f"Skipping time point {time_point} due to insufficient samples.")
            return time_point, None

        # Initial feature reduction to (at most) 100 features; k is capped so a
        # narrow feature matrix does not make SelectKBest raise.
        # NOTE(review): this selector is fitted on all samples, including the
        # ones later held out by LOOCV, so the scores below are optimistic.
        selector = SelectKBest(score_func=f_classif, k=min(100, X.shape[1]))
        X_reduced = pd.DataFrame(
            selector.fit_transform(X, y),
            columns=X.columns[selector.get_support(indices=True)],
        )

        # Initialize Leave-One-Out cross-validation
        loo = LeaveOneOut()

        def held_out_scores(feature_subset, ml_model_name):
            """Return one out-of-fold score per sample, fitting once per fold."""
            scores = np.empty(n_samples)
            for train_index, test_index in loo.split(X_reduced):
                X_train = X_reduced.iloc[train_index][feature_subset]
                X_test = X_reduced.iloc[test_index][feature_subset]
                y_train = y.iloc[train_index]

                if ml_model_name == 'RandomForest':
                    model = RandomForestClassifier(n_estimators=300, random_state=42)
                    model.fit(X_train, y_train)
                    probabilities = model.predict_proba(X_test)
                    # Look the positive class up instead of assuming column 1:
                    # a training fold containing one class has a single column.
                    positive_column = np.flatnonzero(model.classes_ == 1)
                    if positive_column.size:
                        scores[test_index] = probabilities[:, positive_column[0]]
                    else:
                        scores[test_index] = 0.0
                else:
                    model = ElasticNetCV(cv=inner_cv_folds, max_iter=10000, alphas=alphas, l1_ratio=0.7)
                    model.fit(X_train, y_train)
                    scores[test_index] = model.predict(X_test)
            return scores

        # Define a function to process each feature selector and model combination
        def process_combination(feature_selector_name, ml_model_name):
            # feature -> importances from the folds in which it was selected.
            # Insertion order is deterministic, so ties in the composite score
            # are no longer broken by hash-dependent set ordering.
            importances_by_feature = defaultdict(list)

            # Loop through the training splits generated by LOOCV
            for train_index, _ in loo.split(X_reduced):
                X_train = X_reduced.iloc[train_index]
                y_train = y.iloc[train_index]

                # Logistic regression below needs both classes in the fold
                if y_train.nunique() < 2:
                    continue

                # Perform feature selection based on the specified selector
                if feature_selector_name == 'LASSO':
                    feature_selector = LassoCV(cv=inner_cv_folds, max_iter=20000, tol=1e-4, alphas=alphas)
                else:
                    feature_selector = ElasticNetCV(cv=inner_cv_folds, max_iter=20000, tol=1e-4, alphas=alphas, l1_ratio=0.7)
                feature_selector.fit(X_train, y_train)

                # Select features with non-zero coefficients, capped at 80% of
                # the training-fold size.
                # NOTE(review): the cap keeps the first columns in column
                # order, not the largest coefficients - confirm intent.
                selected_features = X_train.columns[feature_selector.coef_ != 0]
                selected_features = selected_features[:int(0.8 * len(y_train))]

                # If no features are selected, skip this iteration
                if len(selected_features) == 0:
                    continue

                # Logistic regression ranks the selected features by |coefficient|
                logistic = LogisticRegression(max_iter=10000, random_state=42, solver='liblinear')
                logistic.fit(X_train[selected_features], y_train)
                for feature, importance in zip(selected_features, np.abs(logistic.coef_[0])):
                    importances_by_feature[feature].append(importance)

            # If no features were selected across all folds, return None
            if not importances_by_feature:
                return None

            # Composite score: half selection frequency, half share of the
            # summed average importance
            n_folds = loo.get_n_splits(X_reduced)
            avg_importance = {feature: np.mean(values) for feature, values in importances_by_feature.items()}
            total_importance = sum(avg_importance.values())
            composite_scores = {}
            for feature, values in importances_by_feature.items():
                importance_share = avg_importance[feature] / total_importance if total_importance > 0 else 0.0
                composite_scores[feature] = 0.5 * (len(values) / n_folds) + 0.5 * importance_share
            sorted_features = sorted(composite_scores.items(), key=lambda item: item[1], reverse=True)
            ranked_names = [name for name, _ in sorted_features]

            # Evaluate growing prefixes of the ranking with varying thresholds
            best_setup = None
            best_performance = 0
            max_subset_size = min(len(ranked_names), int(0.8 * n_samples))

            for subset_size in range(1, max_subset_size + 1):
                feature_subset = ranked_names[:subset_size]

                # The fitted model does not depend on the threshold, so fit once
                # per fold and only re-threshold the held-out scores.
                scores = held_out_scores(feature_subset, ml_model_name)

                for threshold in thresholds:
                    predictions = (scores >= threshold).astype(int)
                    # Macro F1 over the pooled out-of-fold predictions. On a
                    # single held-out sample macro F1 is either 0 or 1, so
                    # averaging per-fold values would only measure accuracy.
                    performance = f1_score(y, predictions, average='macro')

                    # Strict '>' keeps the first (smallest subset, lowest
                    # threshold) setup among ties.
                    if performance > best_performance:
                        best_performance = performance
                        best_setup = {
                            'feature_selection_method': feature_selector_name,
                            'ml_model': ml_model_name,
                            'best_features': feature_subset,
                            'features_length': len(feature_subset),
                            'best_threshold': threshold,
                            'best_performance': performance
                        }

            # None when nothing scored above zero (previously a KeyError)
            return best_setup

        # Process combinations of feature selection and prediction models
        combinations = [('LASSO', 'ElasticNet'), ('ElasticNet', 'ElasticNet'),
                        ('LASSO', 'RandomForest'), ('ElasticNet', 'RandomForest')]
        results_per_combination = [process_combination(fs, ml) for fs, ml in combinations]
        usable_results = [result for result in results_per_combination if result is not None]
        if not usable_results:
            return time_point, None
        best_combination = max(usable_results, key=lambda result: result['best_performance'])

        return time_point, best_combination

    # Process time points within the specified max_timepoint
    time_points = np.sort(data.loc[data['timepoint_numeric'] < max_timepoint, 'timepoint_numeric'].unique())
    results_parallel = Parallel(n_jobs=-1)(delayed(process_time_point)(tp) for tp in time_points)

    # Skipped time points and time points without a usable setup are left out
    return {tp: result for tp, result in results_parallel if result is not None}

# Upper bound (exclusive) on the time points used for the early onset group
early_onset_max_timepoint = 18

# Run the model search on the early onset group
print("Evaluating Early Onset Group")
results_early_onset = evaluate_models(early_onset_data, early_onset_max_timepoint)

# Report the best setup found for every time point
print("Results for Early Onset Group:")
for time_point, res in results_early_onset.items():
    if res is None:
        continue
    print(
        f"Time Point: {time_point}",
        f"  Feature Selection Method: {res['feature_selection_method']}",
        f"  Machine Learning Model: {res['ml_model']}",
        f"  Best F1 Score: {res['best_performance']}",
        f"  Best Threshold: {res['best_threshold']}",
        f"  Features Used: {res['best_features']}",
        f"  Features Length: {res['features_length']}",
        "-" * 40,
        sep="\n",
    )

In [None]:
from sklearn.metrics import f1_score

# Define a function to evaluate models
def evaluate_models(data, max_timepoint):
    """Find the best feature subset and threshold for each time point.

    Every distinct ``timepoint_numeric`` value below ``max_timepoint`` is
    processed independently (in parallel through joblib). In this variant
    the feature selector and the logistic-regression ranking are fitted once
    on all samples of the time point; only the final model is evaluated with
    leave-one-out cross-validation, using the same features in every fold.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the eight columns named in ``metadata_columns``; every
        other column is used as a feature. ``Diagnosis`` is the label and is
        assumed to be coded 0/1, because the regressors are fitted on it
        directly and thresholded predictions are cast to int - TODO confirm.
    max_timepoint : number
        Exclusive upper bound on ``timepoint_numeric``.

    Returns
    -------
    dict
        ``{time_point: setup}`` for each time point where some setup scored
        above zero. ``setup`` has the keys ``feature_selection_method``,
        ``ml_model``, ``best_features``, ``features_length``,
        ``best_threshold``, ``best_performance`` and ``best_percentage``
        (share of the ranked features that the best subset uses).
    """
    # 'Relative_timepoint' is metadata as well: leaving it in X would make it a
    # candidate feature.
    metadata_columns = ['SampleID', 'Subject', 'Subject_number', 'timepoint_numeric',
                        'Diagnosis', 'CD_onset', 'Relative_timepoint', 'Country']
    inner_cv_folds = 5                        # folds used by LassoCV / ElasticNetCV
    alphas = np.logspace(-6, -2, 30)
    thresholds = np.linspace(0.05, 0.95, 19)

    def process_time_point(time_point):
        print(f"Processing time point: {time_point}")

        # Filter data for the current time point
        current_data = data[data['timepoint_numeric'] == time_point]
        X = current_data.drop(metadata_columns, axis=1)
        y = current_data['Diagnosis']
        n_samples = len(y)

        # Each LOOCV training fold holds n_samples - 1 rows and is itself split
        # into `inner_cv_folds` parts by ElasticNetCV, so fewer than
        # inner_cv_folds + 1 samples would raise inside scikit-learn and abort
        # the whole parallel run. A single-class time point cannot be scored.
        if n_samples < inner_cv_folds + 1 or y.nunique() < 2:
            print(f"Skipping time point {time_point} due to insufficient samples.")
            return time_point, None

        # Initial feature reduction to (at most) 100 features, outside the CV
        # loop; k is capped so a narrow feature matrix does not raise.
        # NOTE(review): fitting on all samples leaks the held-out labels into
        # the selection, so the scores below are optimistic.
        selector = SelectKBest(score_func=f_classif, k=min(100, X.shape[1]))
        X_reduced = pd.DataFrame(
            selector.fit_transform(X, y),
            columns=X.columns[selector.get_support(indices=True)],
        )

        # Initialize leave-one-out cross-validation
        loo = LeaveOneOut()

        def held_out_scores(feature_subset, ml_model_name):
            """Return one out-of-fold score per sample, fitting once per fold."""
            scores = np.empty(n_samples)
            for train_index, test_index in loo.split(X_reduced):
                X_train = X_reduced.iloc[train_index][feature_subset]
                X_test = X_reduced.iloc[test_index][feature_subset]
                y_train = y.iloc[train_index]

                if ml_model_name == 'RandomForest':
                    model = RandomForestClassifier(n_estimators=300, random_state=42)
                    model.fit(X_train, y_train)
                    probabilities = model.predict_proba(X_test)
                    # Look the positive class up instead of assuming column 1:
                    # a training fold containing one class has a single column.
                    positive_column = np.flatnonzero(model.classes_ == 1)
                    if positive_column.size:
                        scores[test_index] = probabilities[:, positive_column[0]]
                    else:
                        scores[test_index] = 0.0
                else:
                    model = ElasticNetCV(cv=inner_cv_folds, max_iter=10000, alphas=alphas, l1_ratio=0.7)
                    model.fit(X_train, y_train)
                    scores[test_index] = model.predict(X_test)
            return scores

        # Define a function to process each feature selector and model combination
        def process_combination(feature_selector_name, ml_model_name):
            # Perform feature selection on the entire dataset (outside the CV loop)
            if feature_selector_name == 'LASSO':
                feature_selector = LassoCV(cv=inner_cv_folds, max_iter=20000, tol=1e-4, alphas=alphas)
            else:
                feature_selector = ElasticNetCV(cv=inner_cv_folds, max_iter=20000, tol=1e-4, alphas=alphas, l1_ratio=0.7)
            feature_selector.fit(X_reduced, y)

            # Non-zero coefficients, capped at 80% of the sample count.
            # NOTE(review): the cap keeps the first columns in column order,
            # not the largest coefficients - confirm intent.
            selected_features = X_reduced.columns[feature_selector.coef_ != 0]
            selected_features = selected_features[:int(0.8 * n_samples)]

            if len(selected_features) == 0:
                return None

            # Perform logistic regression for ranking features (outside the CV loop)
            logistic = LogisticRegression(max_iter=10000, random_state=42, solver='liblinear')
            logistic.fit(X_reduced[selected_features], y)
            importances = np.abs(logistic.coef_[0])
            ranked_features = sorted(zip(selected_features, importances), key=lambda pair: pair[1], reverse=True)
            ranked_names = [name for name, _ in ranked_features]

            # Now proceed to test different numbers of features and thresholds
            best_setup = None
            best_performance = 0
            max_subset_size = min(len(ranked_names), int(0.8 * n_samples))

            for subset_size in range(1, max_subset_size + 1):
                feature_subset = ranked_names[:subset_size]

                # The fitted model does not depend on the threshold, so fit once
                # per fold and only re-threshold the held-out scores.
                scores = held_out_scores(feature_subset, ml_model_name)

                for threshold in thresholds:
                    predictions = (scores >= threshold).astype(int)
                    # Macro F1 over the pooled out-of-fold predictions. On a
                    # single held-out sample macro F1 is either 0 or 1, so
                    # averaging per-fold values would only measure accuracy.
                    performance = f1_score(y, predictions, average='macro')

                    # Strict '>' keeps the first (smallest subset, lowest
                    # threshold) setup among ties.
                    if performance > best_performance:
                        best_performance = performance
                        best_setup = {
                            'feature_selection_method': feature_selector_name,
                            'ml_model': ml_model_name,
                            'best_features': feature_subset,
                            'features_length': len(feature_subset),
                            'best_threshold': threshold,
                            'best_performance': performance,
                            'best_percentage': subset_size / len(ranked_names)
                        }

            # None when nothing scored above zero (previously a KeyError)
            return best_setup

        # Only the LASSO selector with the ElasticNet model is evaluated here
        combinations = [('LASSO', 'ElasticNet')]
        results_per_combination = [process_combination(fs, ml) for fs, ml in combinations]
        usable_results = [result for result in results_per_combination if result is not None]
        if not usable_results:
            return time_point, None
        best_combination = max(usable_results, key=lambda result: result['best_performance'])

        return time_point, best_combination

    # Only process time points within the specified max_timepoint
    time_points = np.sort(data.loc[data['timepoint_numeric'] < max_timepoint, 'timepoint_numeric'].unique())
    results_parallel = Parallel(n_jobs=-1)(delayed(process_time_point)(tp) for tp in time_points)

    # Skipped time points and time points without a usable setup are left out
    return {tp: result for tp, result in results_parallel if result is not None}

# Upper bound (exclusive) on the time points used for the late onset group
late_onset_max_timepoint = 36  # Only consider timepoints before 36 months for late onset

# Run the model search on the late onset group
print("Evaluating Late Onset Group")
results_late_onset = evaluate_models(late_onset_data, late_onset_max_timepoint)

# Report the best setup found for every time point
print("Results for Late Onset Group:")
for time_point, res in results_late_onset.items():
    if res is None:
        continue
    print(
        f"Time Point: {time_point}",
        f"  Feature Selection Method: {res['feature_selection_method']}",
        f"  Machine Learning Model: {res['ml_model']}",
        f"  Best F1 Score: {res['best_performance']}",
        f"  Best Threshold: {res['best_threshold']}",
        f"  Features Used: {res['best_features']}",
        f"  Features Length: {res['features_length']}",
        f"  Best Percentage: {res['best_percentage']:.2%}",
        "-" * 40,
        sep="\n",
    )

In [None]:
from sklearn.metrics import f1_score

# Define a function to evaluate combinations of feature selectors and models
def evaluate_models(data, max_timepoint):
    """Find the best ElasticNet-selected / ElasticNet-scored setup per time point.

    Every distinct ``timepoint_numeric`` value below ``max_timepoint`` is
    processed independently (in parallel through joblib). For each time
    point the features are first reduced with ``SelectKBest``; features are
    then selected inside a leave-one-out loop, ranked by a composite of
    selection frequency and logistic-regression coefficient size, and
    growing prefixes of that ranking are scored for every decision threshold.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the eight columns named in ``metadata_columns``; every
        other column is used as a feature. ``Diagnosis`` is the label and is
        assumed to be coded 0/1, because the regressors are fitted on it
        directly and thresholded predictions are cast to int - TODO confirm.
    max_timepoint : number
        Exclusive upper bound on ``timepoint_numeric``.

    Returns
    -------
    dict
        ``{time_point: setup}`` for each time point where some setup scored
        above zero. ``setup`` has the keys ``feature_selection_method``,
        ``ml_model``, ``best_features``, ``features_length``,
        ``best_threshold`` and ``best_performance``.
    """
    metadata_columns = ['SampleID', 'Subject', 'Subject_number', 'timepoint_numeric',
                        'Diagnosis', 'CD_onset', 'Relative_timepoint', 'Country']
    inner_cv_folds = 5                        # folds used by LassoCV / ElasticNetCV
    alphas = np.logspace(-6, -2, 30)
    thresholds = np.linspace(0.05, 0.95, 19)

    # Define a function to process each time point independently
    def process_time_point(time_point):
        print(f"Processing time point: {time_point}")

        # Filter data for the current time point and split features / labels
        current_data = data[data['timepoint_numeric'] == time_point]
        X = current_data.drop(metadata_columns, axis=1)  # Feature matrix
        y = current_data['Diagnosis']  # Labels
        n_samples = len(y)

        # Each LOOCV training fold holds n_samples - 1 rows and is itself split
        # into `inner_cv_folds` parts by LassoCV / ElasticNetCV, so fewer than
        # inner_cv_folds + 1 samples would raise inside scikit-learn and abort
        # the whole parallel run. A single-class time point cannot be scored.
        if n_samples < inner_cv_folds + 1 or y.nunique() < 2:
            print(f"Skipping time point {time_point} due to insufficient samples.")
            return time_point, None

        # Initial feature reduction to (at most) 100 features; k is capped so a
        # narrow feature matrix does not make SelectKBest raise.
        # NOTE(review): this selector is fitted on all samples, including the
        # ones later held out by LOOCV, so the scores below are optimistic.
        selector = SelectKBest(score_func=f_classif, k=min(100, X.shape[1]))
        X_reduced = pd.DataFrame(
            selector.fit_transform(X, y),
            columns=X.columns[selector.get_support(indices=True)],
        )

        # Initialize Leave-One-Out cross-validation
        loo = LeaveOneOut()

        def held_out_scores(feature_subset, ml_model_name):
            """Return one out-of-fold score per sample, fitting once per fold."""
            scores = np.empty(n_samples)
            for train_index, test_index in loo.split(X_reduced):
                X_train = X_reduced.iloc[train_index][feature_subset]
                X_test = X_reduced.iloc[test_index][feature_subset]
                y_train = y.iloc[train_index]

                if ml_model_name == 'RandomForest':
                    model = RandomForestClassifier(n_estimators=300, random_state=42)
                    model.fit(X_train, y_train)
                    probabilities = model.predict_proba(X_test)
                    # Look the positive class up instead of assuming column 1:
                    # a training fold containing one class has a single column.
                    positive_column = np.flatnonzero(model.classes_ == 1)
                    if positive_column.size:
                        scores[test_index] = probabilities[:, positive_column[0]]
                    else:
                        scores[test_index] = 0.0
                else:
                    model = ElasticNetCV(cv=inner_cv_folds, max_iter=10000, alphas=alphas, l1_ratio=0.7)
                    model.fit(X_train, y_train)
                    scores[test_index] = model.predict(X_test)
            return scores

        # Define a function to process each feature selector and model combination
        def process_combination(feature_selector_name, ml_model_name):
            # feature -> importances from the folds in which it was selected.
            # Insertion order is deterministic, so ties in the composite score
            # are no longer broken by hash-dependent set ordering.
            importances_by_feature = defaultdict(list)

            # Loop through the training splits generated by LOOCV
            for train_index, _ in loo.split(X_reduced):
                X_train = X_reduced.iloc[train_index]
                y_train = y.iloc[train_index]

                # Logistic regression below needs both classes in the fold
                if y_train.nunique() < 2:
                    continue

                # Perform feature selection based on the specified selector
                if feature_selector_name == 'LASSO':
                    feature_selector = LassoCV(cv=inner_cv_folds, max_iter=20000, tol=1e-4, alphas=alphas)
                else:
                    feature_selector = ElasticNetCV(cv=inner_cv_folds, max_iter=20000, tol=1e-4, alphas=alphas, l1_ratio=0.7)
                feature_selector.fit(X_train, y_train)

                # Select features with non-zero coefficients, capped at 80% of
                # the training-fold size.
                # NOTE(review): the cap keeps the first columns in column
                # order, not the largest coefficients - confirm intent.
                selected_features = X_train.columns[feature_selector.coef_ != 0]
                selected_features = selected_features[:int(0.8 * len(y_train))]

                # If no features are selected, skip this iteration
                if len(selected_features) == 0:
                    continue

                # Logistic regression ranks the selected features by |coefficient|
                logistic = LogisticRegression(max_iter=10000, random_state=42, solver='liblinear')
                logistic.fit(X_train[selected_features], y_train)
                for feature, importance in zip(selected_features, np.abs(logistic.coef_[0])):
                    importances_by_feature[feature].append(importance)

            # If no features were selected across all folds, return None
            if not importances_by_feature:
                return None

            # Composite score: half selection frequency, half share of the
            # summed average importance
            n_folds = loo.get_n_splits(X_reduced)
            avg_importance = {feature: np.mean(values) for feature, values in importances_by_feature.items()}
            total_importance = sum(avg_importance.values())
            composite_scores = {}
            for feature, values in importances_by_feature.items():
                importance_share = avg_importance[feature] / total_importance if total_importance > 0 else 0.0
                composite_scores[feature] = 0.5 * (len(values) / n_folds) + 0.5 * importance_share
            sorted_features = sorted(composite_scores.items(), key=lambda item: item[1], reverse=True)
            ranked_names = [name for name, _ in sorted_features]

            # Evaluate growing prefixes of the ranking with varying thresholds
            best_setup = None
            best_performance = 0
            max_subset_size = min(len(ranked_names), int(0.8 * n_samples))

            for subset_size in range(1, max_subset_size + 1):
                feature_subset = ranked_names[:subset_size]

                # The fitted model does not depend on the threshold, so fit once
                # per fold and only re-threshold the held-out scores.
                scores = held_out_scores(feature_subset, ml_model_name)

                for threshold in thresholds:
                    predictions = (scores >= threshold).astype(int)
                    # Macro F1 over the pooled out-of-fold predictions. On a
                    # single held-out sample macro F1 is either 0 or 1, so
                    # averaging per-fold values would only measure accuracy.
                    performance = f1_score(y, predictions, average='macro')

                    # Strict '>' keeps the first (smallest subset, lowest
                    # threshold) setup among ties.
                    if performance > best_performance:
                        best_performance = performance
                        best_setup = {
                            'feature_selection_method': feature_selector_name,
                            'ml_model': ml_model_name,
                            'best_features': feature_subset,
                            'features_length': len(feature_subset),
                            'best_threshold': threshold,
                            'best_performance': performance
                        }

            # None when nothing scored above zero (previously a KeyError)
            return best_setup

        # Process combinations of feature selection and prediction models
        combinations = [('ElasticNet', 'ElasticNet')]
        results_per_combination = [process_combination(fs, ml) for fs, ml in combinations]
        usable_results = [result for result in results_per_combination if result is not None]
        if not usable_results:
            return time_point, None
        best_combination = max(usable_results, key=lambda result: result['best_performance'])

        return time_point, best_combination

    # Process time points within the specified max_timepoint
    time_points = np.sort(data.loc[data['timepoint_numeric'] < max_timepoint, 'timepoint_numeric'].unique())
    results_parallel = Parallel(n_jobs=-1)(delayed(process_time_point)(tp) for tp in time_points)

    # Skipped time points and time points without a usable setup are left out
    return {tp: result for tp, result in results_parallel if result is not None}

# Upper bound (exclusive) on the time points used for the late onset group
late_onset_max_timepoint = 36

# Run the model search on the late onset group
print("Evaluating Late Onset Group")
results_late_onset = evaluate_models(late_onset_data, late_onset_max_timepoint)

# Report the best setup found for every time point
print("Results for Late Onset Group:")
for time_point, res in results_late_onset.items():
    if res is None:
        continue
    print(
        f"Time Point: {time_point}",
        f"  Feature Selection Method: {res['feature_selection_method']}",
        f"  Machine Learning Model: {res['ml_model']}",
        f"  Best F1 Score: {res['best_performance']}",
        f"  Best Threshold: {res['best_threshold']}",
        f"  Features Used: {res['best_features']}",
        f"  Features Length: {res['features_length']}",
        "-" * 40,
        sep="\n",
    )

In [None]:
# Line chart of the F1 score at each time point (early onset group)
# Time points (already in ascending order) and their scores
sorted_pathways_time_points = [12, 15]
sorted_pathways_f1_scores = [90, 80]

# Open the figure
plt.figure(figsize=(8, 4))

# Blue solid line with large round markers
plt.plot(
    sorted_pathways_time_points,
    sorted_pathways_f1_scores,
    color='blue',
    linestyle='-',
    marker='o',
    markersize=10,
)

# Write each score just above its marker
for x_position, score in zip(sorted_pathways_time_points, sorted_pathways_f1_scores):
    plt.text(x_position, score + 1, f"{score}", ha='center', va='bottom', fontsize=14)

# Title and axis labels
plt.title('F1 Score by Age for Early Onset Pathways Abundance Data', fontsize=14, color='blue', pad=20)
plt.xlabel('Age (Months)', fontsize=14)
plt.ylabel('Average F1 Score', fontsize=14)

# Ticks on the plotted time points; y axis fixed to 0-100
plt.xticks(sorted_pathways_time_points, fontsize=12)
plt.ylim(0, 100)
plt.yticks(fontsize=12)

# Render with a tight layout
plt.tight_layout()
plt.show()

In [None]:
# Create a line chart to show the F1 score for each time point (late onset group)
# NOTE(review): the scores below are hard-coded literals rather than values
# read from `results_late_onset`; they must be updated by hand after a re-run.
sorted_pathways_time_points = [12, 15, 18, 21, 24, 27, 30, 33]
sorted_pathways_f1_scores = [82, 82, 79, 74, 88, 100, 82, 80]

# Create the line chart
plt.figure(figsize=(8, 4))

# Plot the line with points and set line color, marker size, and style
plt.plot(sorted_pathways_time_points, sorted_pathways_f1_scores, marker='o', linestyle='-', color='blue', markersize=10)

# Add labels to each point (one unit above the marker)
for time_point_value, score in zip(sorted_pathways_time_points, sorted_pathways_f1_scores):
    plt.text(time_point_value, score + 1, f"{score}", ha='center', va='bottom', fontsize=14)

# Add titles and labels
plt.title('F1 Score by Age for Late Onset Pathways Abundance Data', fontsize=14, color='blue', pad=20)
plt.xlabel('Age (Months)', fontsize=14)
plt.ylabel('Average F1 Score', fontsize=14)

# Adjust the axis limits
plt.xticks(sorted_pathways_time_points, fontsize=12)
# Leave headroom above 100: the label of the point scoring 100 is placed at
# y = 101 and would otherwise be drawn outside the axes.
plt.ylim(0, 110)
plt.yticks(fontsize=12)

# Show the plot with tight layout
plt.tight_layout()
plt.show()