In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score
from itertools import combinations
from tqdm.notebook import tqdm
from pathlib import Path

In [2]:
def preprocess_data(df: pd.DataFrame, target_col: str='FL_UDSD', diagnosis_order: list=None) -> pd.DataFrame:
    """
    Clean the raw data and encode the diagnosis target as ordered integer codes.

    No train/test split is performed here; the function only filters rows and
    adds/converts columns.

    Args:
        df (pd.DataFrame): The input dataframe. Must contain `target_col`, 'MMSE',
            'APOE' and 'AMYLPET'. It is not modified.
        target_col (str): Name of the diagnosis column to filter on and encode.
        diagnosis_order (list): Category labels of `target_col` in ascending order.
            Labels not listed here become missing values (code -1). If None, pandas
            infers the categories from the data.

    Returns:
        pd.DataFrame: A filtered copy of `df` in which `target_col` is an ordered
        categorical, 'APOE'/'AMYLPET' are categorical, and a new column
        f'{target_col}_cat' holds the integer category codes.
    """

    # Drop rows whose target is 'Unknown' or whose MMSE carries the -1 sentinel.
    keep_rows = (df[target_col] != 'Unknown') & (df["MMSE"] != -1)

    # Work on an explicit copy so the assignments below never write into a view of
    # the caller's dataframe (avoids SettingWithCopyWarning).
    filter_df = df[keep_rows].copy()

    # Convert columns to categorical if needed
    filter_df['APOE'] = filter_df['APOE'].astype('category')
    filter_df['AMYLPET'] = filter_df['AMYLPET'].astype('category')

    # Encode the target variable as an ordered categorical variable. `target_col`
    # is used throughout so a non-default target column works as well.
    filter_df[target_col] = pd.Categorical(filter_df[target_col], categories=diagnosis_order, ordered=True)
    filter_df[f'{target_col}_cat'] = filter_df[target_col].cat.codes

    return filter_df

In [3]:
def combine_categories(df: pd.DataFrame, combination_map: dict, target_col: str = 'FL_UDSD') -> pd.DataFrame:
    """
    Merge groups of labels in the target column into single labels.

    The groups are applied one after another in the dictionary's iteration order,
    so a label that appears in several groups is claimed by the first one.

    Args:
        df: Input dataframe (left unmodified; a copy is returned)
        combination_map: Dictionary mapping each new category name to the list of
                        existing categories that should be replaced by it.
                        Example: {'SCD/Impaired': ['Subjective Cognitive Decline', 'Impaired Not SCD/MCI']}
        target_col: Column containing the categories to combine

    Returns:
        pd.DataFrame: Copy of `df` with the combined categories in `target_col`
    """
    combined = df.copy()
    labels = combined[target_col]

    # Fold every merge group into the label series, then write it back once.
    for merged_label, source_labels in combination_map.items():
        labels = labels.replace(to_replace=source_labels, value=merged_label)

    combined[target_col] = labels
    return combined

In [4]:
# Load the synthetic dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/synthetic_data.csv')
# Summarize dtypes and non-null counts to see how much data is missing per column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   MMSE                10000 non-null  int64  
 1   CDRSUM              10000 non-null  float64
 2   CDRGLOB             10000 non-null  float64
 3   HVLT_DR             8696 non-null   float64
 4   LASSI_A_CR2         8668 non-null   float64
 5   LASSI_B_CR1         8607 non-null   float64
 6   LASSI_B_CR2         8499 non-null   float64
 7   APOE                8674 non-null   float64
 8   AMYLPET             6791 non-null   float64
 9   PTAU_217_CONCNTRTN  3017 non-null   float64
 10  FL_UDSD             10000 non-null  object 
dtypes: float64(9), int64(1), object(1)
memory usage: 859.5+ KB


In [5]:
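# Original six-class ordering, kept for reference only. It is superseded by the
# five-class `diagnosis_order` defined below, which uses the merged 'SCD/Impaired' label.
# diagnosis_order = ['Normal cognition', 'Subjective Cognitive Decline', 'Impaired Not SCD/MCI',
#                        'Early MCI', 'Late MCI', 'Dementia']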

In [6]:
# Merge groups passed to combine_categories: each key is the new label and each
# value lists the existing FL_UDSD labels that are replaced by it.
combination_map = {
    'SCD/Impaired': ['Subjective Cognitive Decline', 'Impaired Not SCD/MCI'],
}

In [7]:
# Category order used by preprocess_data for the ordinal target encoding
# (position in this list becomes the integer code). The labels must match the
# post-merge names, i.e. 'SCD/Impaired' rather than its two source labels.
# Presumably ordered from least to most impaired — TODO confirm.
diagnosis_order = [
    'Normal cognition', 
    'SCD/Impaired',
    'Early MCI', 
    'Late MCI', 
    'Dementia'
]

In [8]:
# Class counts of the diagnosis labels as loaded (before any categories are merged).
df.value_counts('FL_UDSD')

FL_UDSD
Early MCI                       2852
Subjective Cognitive Decline    1821
Normal cognition                1693
Dementia                        1668
Late MCI                        1224
Impaired Not SCD/MCI             742
Name: count, dtype: int64

In [9]:
# Apply the merge defined in combination_map.
# NOTE: this rebinds `df`, so the unmerged labels are no longer reachable from
# here on without re-running the load cell.
df = combine_categories(df, combination_map, target_col='FL_UDSD')

In [10]:
# Class counts after the merge; the two source labels should now appear as 'SCD/Impaired'.
df.value_counts('FL_UDSD')

FL_UDSD
Early MCI           2852
SCD/Impaired        2563
Normal cognition    1693
Dementia            1668
Late MCI            1224
Name: count, dtype: int64

In [11]:
# Clean and encode the merged-label frame, then keep complete cases only.
# dropna() discards every row with a missing value in ANY column, which removes
# a large share of the data (several feature columns are sparsely populated
# according to df.info()).
filter_df = preprocess_data(df=df, diagnosis_order=diagnosis_order).dropna()

In [12]:
# Class balance of the complete-case subset that is used for modelling.
filter_df['FL_UDSD'].value_counts()

FL_UDSD
Early MCI           345
Dementia            202
Late MCI            140
SCD/Impaired        136
Normal cognition     72
Name: count, dtype: int64

In [13]:
# Remove the categorical label so only the integer-encoded target (FL_UDSD_cat)
# remains next to the features. Rebinding with errors='ignore' (instead of an
# in-place drop) keeps this cell idempotent: re-running it after the column is
# already gone no longer raises a KeyError.
filter_df = filter_df.drop(columns=['FL_UDSD'], errors='ignore')

In [14]:
# Hold out 20% of the rows for testing. The split is stratified on the encoded
# target and uses a fixed random_state so it is reproducible.
train_df, test_df = train_test_split(filter_df, test_size=0.2, random_state=42, stratify=filter_df['FL_UDSD_cat'])

In [15]:
def calculate_metrics(y_true, y_pred) -> dict:
    """
    Score a set of predictions against the true labels.

    Args:
        y_true: True labels
        y_pred: Predicted labels

    Returns:
        dict: Keys 'accuracy', 'balanced_accuracy' and 'f1_score_macro', each
        mapped to a float rounded to five decimal places.
    """
    raw_scores = [
        ('accuracy', accuracy_score(y_true, y_pred)),
        ('balanced_accuracy', balanced_accuracy_score(y_true, y_pred)),
        ('f1_score_macro', f1_score(y_true, y_pred, average='macro')),
    ]

    # Round via fixed-point formatting so the stored value has exactly 5 decimals.
    rounded_scores = {}
    for score_name, score_value in raw_scores:
        rounded_scores[score_name] = float(format(score_value, '.5f'))

    return rounded_scores

In [19]:
def train_and_evaluate_model(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    model,
    target_col: str = 'FL_UDSD_cat'
) -> dict:
    """
    Fit one model on the training data and score it on both splits.

    Every column other than `target_col` is used as a feature. The model
    instance passed in is fitted in place.

    Args:
        train_df: Training dataframe
        test_df: Test dataframe
        model: An sklearn-style estimator exposing fit() and predict()
        target_col: Name of the target column (default: 'FL_UDSD_cat')

    Returns:
        dict: Accuracy, balanced accuracy and macro F1 for the train and test
        sets, keyed 'train_*' / 'test_*'.
    """
    def _features_and_labels(frame: pd.DataFrame):
        # Separate the predictor columns from the target column.
        return frame.drop(columns=[target_col]), frame[target_col]

    features_train, labels_train = _features_and_labels(train_df)
    features_test, labels_test = _features_and_labels(test_df)

    model.fit(features_train, labels_train)

    predicted_train = model.predict(features_train)
    predicted_test = model.predict(features_test)

    on_train = calculate_metrics(labels_train, predicted_train)
    on_test = calculate_metrics(labels_test, predicted_test)

    return {
        'train_accuracy': on_train['accuracy'],
        'test_accuracy': on_test['accuracy'],
        'train_balanced_accuracy': on_train['balanced_accuracy'],
        'test_balanced_accuracy': on_test['balanced_accuracy'],
        'train_f1_macro_score': on_train['f1_score_macro'],
        'test_f1_macro_score': on_test['f1_score_macro']
    }



In [20]:
# Baseline 1: random forest with default hyperparameters on all feature columns.
rf_results = train_and_evaluate_model(
    train_df=train_df,
    test_df=test_df,
    model=RandomForestClassifier(random_state=42)
)

In [21]:
# Compare train vs. test scores: in the recorded run every train metric is 1.0
# while the test metrics are far lower, i.e. the forest overfits the training set.
rf_results

{'train_accuracy': 1.0,
 'test_accuracy': 0.6257,
 'train_balanced_accuracy': 1.0,
 'test_balanced_accuracy': 0.54141,
 'train_f1_macro_score': 1.0,
 'test_f1_macro_score': 0.5449}

In [22]:
# Baseline 2: logistic regression on all feature columns. max_iter is raised far
# above sklearn's default of 100 — presumably so the solver converges on these
# unscaled features (TODO confirm).
lr_results = train_and_evaluate_model(
    train_df=train_df,
    test_df=test_df,
    model= LogisticRegression(max_iter=10000, random_state=42)
)

In [23]:
# In the recorded run the test scores are slightly higher than the train scores;
# NOTE(review): possibly sampling noise from the small hold-out set — confirm.
lr_results

{'train_accuracy': 0.5824,
 'test_accuracy': 0.62011,
 'train_balanced_accuracy': 0.46621,
 'test_balanced_accuracy': 0.52617,
 'train_f1_macro_score': 0.47563,
 'test_f1_macro_score': 0.53151}

In [24]:
def comprehensive_feature_search(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    target_col: str = 'FL_UDSD_cat',
    min_features: int = 2,
    max_features: int = None,
    models: dict = None
):
    """
    Test all possible feature subsets with multiple models.

    Every combination of between `min_features` and `max_features` columns is used
    to fit each model on `train_df`, which is then scored on both dataframes. The
    number of combinations grows exponentially with the number of columns, so limit
    the range for wide dataframes.

    NOTE: the returned ranking is based on test-set metrics. Picking a feature
    subset from it makes the reported test score optimistic.

    Args:
        train_df: Training dataframe
        test_df: Test dataframe
        target_col: Name of target column to exclude from features
        min_features: Minimum number of features in a subset (default: 2).
            Values below 1 are treated as 1.
        max_features: Maximum number of features in a subset (default: all features).
            Values above the number of available features are clamped to it.
        models: Dictionary of model names and their corresponding sklearn model instances
            ie. {'RandomForest': RandomForestClassifier(random_state=42), 'LogisticRegression': LogisticRegression(max_iter=10000, random_state=42)}
            The instances are re-fitted in place for every feature subset.

    Returns:
        pd.DataFrame: One row per (model, feature subset), sorted by test balanced
        accuracy in descending order. If nothing could be evaluated, an empty
        dataframe with the same columns is returned.
    """
    # Fixed schema so that an empty result set still yields a sortable dataframe.
    result_columns = [
        'model', 'n_features', 'features',
        'train_accuracy', 'test_accuracy',
        'train_balanced_acc', 'test_balanced_acc',
        'train_f1_macro_score', 'test_f1_macro_score',
    ]

    # Get all available features (excluding target column)
    all_features = [col for col in train_df.columns if col != target_col]

    # Clamp the requested subset sizes to the range that can actually be built.
    if max_features is None:
        max_features = len(all_features)
    max_features = min(max_features, len(all_features))
    min_features = max(min_features, 1)

    print(f"Total features available: {len(all_features)}")
    print(f"Features: {all_features}\n")

    # Define models to test
    if models is None:
        models = {
            'RandomForest': RandomForestClassifier(random_state=42, n_estimators=100),
            'LogisticRegression': LogisticRegression(max_iter=10000, random_state=42)
        }

    # The targets do not depend on the selected features, so slice them once.
    y_train = train_df[target_col]
    y_test = test_df[target_col]

    results = []

    # Try all subset sizes from min_features to max_features
    for n_features in range(min_features, max_features + 1):
        # Materialized as a list so the count can be printed and tqdm knows the total.
        feature_combinations = list(combinations(all_features, n_features))

        print(f"Testing {len(feature_combinations)} combinations with {n_features} features...")

        for features in tqdm(feature_combinations, desc=f"{n_features} features"):
            features = list(features)

            # Prepare data with selected features
            X_train = train_df[features]
            X_test = test_df[features]

            for model_name, model in models.items():
                try:
                    model.fit(X_train, y_train)
                    train_metrics = calculate_metrics(y_train, model.predict(X_train))
                    test_metrics = calculate_metrics(y_test, model.predict(X_test))
                except Exception as e:
                    # Best effort: report the failing combination and keep searching.
                    print(f"Error with {model_name} and features {features}: {e}")
                    continue

                results.append({
                    'model': model_name,
                    'n_features': n_features,
                    'features': ', '.join(features),
                    'train_accuracy': train_metrics['accuracy'],
                    'test_accuracy': test_metrics['accuracy'],
                    'train_balanced_acc': train_metrics['balanced_accuracy'],
                    'test_balanced_acc': test_metrics['balanced_accuracy'],
                    'train_f1_macro_score': train_metrics['f1_score_macro'],
                    'test_f1_macro_score': test_metrics['f1_score_macro']
                })

    # Passing the columns explicitly keeps sort_values from raising a KeyError
    # when `results` is empty.
    results_df = pd.DataFrame(results, columns=result_columns)
    results_df = results_df.sort_values('test_balanced_acc', ascending=False)

    return results_df

In [25]:
# Run the exhaustive subset search with the function's default models.
# Only subsets of size 9 and 10 are evaluated here; lower min_features to widen
# the search (the number of combinations grows quickly).
print("Starting comprehensive feature search...\n")
results_df = comprehensive_feature_search(
    train_df=train_df,
    test_df=test_df,
    target_col='FL_UDSD_cat',
    min_features=9,
    max_features=10  # upper bound on subset size; 10 equals the number of feature columns in the recorded run
)

Starting comprehensive feature search...

Total features available: 10
Features: ['MMSE', 'CDRSUM', 'CDRGLOB', 'HVLT_DR', 'LASSI_A_CR2', 'LASSI_B_CR1', 'LASSI_B_CR2', 'APOE', 'AMYLPET', 'PTAU_217_CONCNTRTN']

Testing 10 combinations with 9 features...


9 features:   0%|          | 0/10 [00:00<?, ?it/s]

Testing 1 combinations with 10 features...


10 features:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# group = results_df.groupby(by=['model', 'n_features']).get_group(('RandomForest', 2))
# results_df

In [None]:
# For every (model, n_features) group keep the row with the highest test macro-F1.
# idxmax returns index labels, so .loc works even though results_df is sorted.
# NOTE(review): selecting on test metrics makes the selected score optimistic.
best_results = results_df.loc[results_df.groupby(['model', 'n_features'])['test_f1_macro_score'].idxmax()]

In [None]:
# Display the best feature subset per model and subset size.
best_results

In [None]:
def save_results(
    results_df: pd.DataFrame,
    output_dir: str,
    sort_by: str = 'test_f1_macro_score',
    ascending: bool = False
):
    """
    Save feature search results to separate Excel files for each model.

    One workbook named '<model>.xlsx' is written per distinct value of the 'model'
    column. Each workbook has one sheet per feature count ('<n>_features') plus an
    'All_Results' sheet, all sorted by `sort_by`.

    Args:
        results_df: DataFrame containing feature search results from comprehensive_feature_search.
            Must contain the columns 'model', 'n_features' and `sort_by`.
        output_dir: Directory where the Excel files will be saved (created if missing)
        sort_by: Column name to sort results by (default: 'test_f1_macro_score')
        ascending: Sort order (default: False for descending)

    Returns:
        None. Saves Excel files to disk.

    Raises:
        KeyError: If a required column is missing from `results_df`. This is checked
            before anything is written, so no partial output is left behind.
    """
    if results_df.empty:
        print("No results to save.")
        return

    # Validate up front so a bad column name does not leave a half-written workbook.
    missing = [col for col in ('model', 'n_features', sort_by) if col not in results_df.columns]
    if missing:
        raise KeyError(
            f"results_df is missing required column(s) {missing}; "
            f"available columns: {list(results_df.columns)}"
        )

    # Create output directory if it doesn't exist
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    models = results_df['model'].unique()

    # Create a separate Excel file for each model
    for model in models:
        model_df = results_df[results_df['model'] == model]
        file_path = out_dir / f"{model}.xlsx"
        feature_counts = sorted(model_df['n_features'].unique())

        with pd.ExcelWriter(file_path, engine='openpyxl') as writer:
            # One sheet per feature count, each sorted independently.
            for n_features in feature_counts:
                per_count_df = model_df[model_df['n_features'] == n_features]
                per_count_df = per_count_df.sort_values(sort_by, ascending=ascending)
                per_count_df.to_excel(writer, sheet_name=f"{n_features}_features", index=False)

            # Also create a summary sheet with all results for this model
            model_df.sort_values(sort_by, ascending=ascending).to_excel(
                writer, sheet_name='All_Results', index=False
            )

        print(f"Created {file_path} with {len(feature_counts)} feature count sheets")

    print(f"\nSaved {len(models)} Excel files to {output_dir}")
    # str() guards against non-string model identifiers in the 'model' column.
    print(f"Models: {', '.join(str(m) for m in models)}")

In [None]:
# Write one workbook per model into results/ (relative to the working directory),
# with rows sorted by test macro-F1 in descending order.
save_results(results_df, 'results/',sort_by='test_f1_macro_score', ascending=False)