In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score
from itertools import combinations
from tqdm.notebook import tqdm

In [2]:
# Load the synthetic dataset; the relative path is resolved against the kernel's working directory.
df = pd.read_csv('../data/synthetic_data.csv')

In [3]:
# Inspect dtypes and non-null counts to see which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   MMSE                10000 non-null  int64  
 1   CDRSUM              10000 non-null  float64
 2   CDRGLOB             10000 non-null  float64
 3   HVLT_DR             8696 non-null   float64
 4   LASSI_A_CR2         8668 non-null   float64
 5   LASSI_B_CR1         8607 non-null   float64
 6   LASSI_B_CR2         8499 non-null   float64
 7   APOE                8674 non-null   float64
 8   AMYLPET             6791 non-null   float64
 9   PTAU_217_CONCNTRTN  3017 non-null   float64
 10  FL_UDSD             10000 non-null  object 
dtypes: float64(9), int64(1), object(1)
memory usage: 859.5+ KB


In [4]:
def preprocess_data(df: pd.DataFrame, target_col: str='FL_UDSD', diagnosis_order: list=None) -> pd.DataFrame:
    """
    Clean the raw dataframe and encode the diagnosis target as ordered integer codes.

    Rows whose target equals 'Unknown' or whose MMSE equals -1 are removed,
    'APOE' and 'AMYLPET' are cast to categorical, and the target column is
    converted to an ordered categorical. Its integer codes are stored in a new
    column named f"{target_col}_cat" ('FL_UDSD_cat' for the default target).
    The input dataframe is left unmodified; no train/test split happens here.

    Args:
        df (pd.DataFrame): The input dataframe. Must contain 'MMSE', 'APOE',
            'AMYLPET' and ``target_col``.
        target_col (str): Name of the diagnosis column to filter and encode.
        diagnosis_order (list): Target labels in category order; the label at
            position i receives code i. Labels missing from the list become
            NaN in the categorical column and -1 in the code column. If None,
            pandas infers the categories from the data.

    Returns:
        pd.DataFrame: A filtered copy of ``df`` with the categorical columns
        and the extra f"{target_col}_cat" code column. Missing values in the
        feature columns are NOT dropped here.
    """

    # Clean data: drop rows with an unknown target or an invalid MMSE value.
    keep_mask = (df[target_col] != 'Unknown') & (df["MMSE"] != -1)
    # Explicit copy so the assignments below neither warn (SettingWithCopyWarning)
    # nor risk writing through to the caller's dataframe.
    filter_df = df.loc[keep_mask].copy()

    # Convert columns to categorical
    filter_df['APOE'] = filter_df['APOE'].astype('category')
    filter_df['AMYLPET'] = filter_df['AMYLPET'].astype('category')

    # Encode the target variable as an ordered categorical variable, honouring
    # target_col instead of a hard-coded column name.
    filter_df[target_col] = pd.Categorical(filter_df[target_col], categories=diagnosis_order, ordered=True)
    filter_df[f'{target_col}_cat'] = filter_df[target_col].cat.codes

    return filter_df

In [5]:
# Diagnosis labels in category order; a label's position in this list becomes its integer code.
diagnosis_order = [
    'Normal cognition',
    'Subjective Cognitive Decline',
    'Impaired Not SCD/MCI',
    'Early MCI',
    'Late MCI',
    'Dementia',
]

In [6]:
# Clean and encode the raw data, then keep complete cases only (any row with a
# missing value is dropped; 895 rows remain in the recorded run).
filter_df = preprocess_data(df=df, diagnosis_order=diagnosis_order).dropna()

In [7]:
# Class counts of the diagnosis labels after cleaning and dropping incomplete rows (the classes are imbalanced).
filter_df['FL_UDSD'].value_counts()

FL_UDSD
Early MCI                       345
Dementia                        202
Late MCI                        140
Subjective Cognitive Decline    110
Normal cognition                 72
Impaired Not SCD/MCI             26
Name: count, dtype: int64

In [8]:
# Same counts by integer code; code i corresponds to the i-th entry of diagnosis_order.
filter_df['FL_UDSD_cat'].value_counts()

FL_UDSD_cat
3    345
5    202
4    140
1    110
0     72
2     26
Name: count, dtype: int64

In [9]:
# The integer-coded FL_UDSD_cat column is the modeling target, so the original
# FL_UDSD label column is dropped to keep it out of the feature matrix.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
# safe to re-run: a second in-place drop would raise KeyError.
filter_df = filter_df.drop(columns=['FL_UDSD'], errors='ignore')

In [10]:
# Stratified 80/20 split so both sets keep the class proportions of the target.
train_df, test_df = train_test_split(
    filter_df,
    test_size=0.2,
    random_state=42,
    stratify=filter_df['FL_UDSD_cat'],
)

In [11]:
def train_and_evaluate_model(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    model,
    target_col: str = 'FL_UDSD_cat'
) -> dict:
    """Fit ``model`` on the training split and score it on both splits.

    Every column except ``target_col`` is used as a feature.

    Args:
        train_df: Training dataframe containing features and ``target_col``.
        test_df: Test dataframe with the same columns as ``train_df``.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            fitted in place.
        target_col: Name of the label column.

    Returns:
        dict: Accuracy, balanced accuracy and macro-averaged F1 for the train
        and test splits, keyed as ``'<split>_<metric>'``.
    """
    features_train, labels_train = train_df.drop(columns=[target_col]), train_df[target_col]
    features_test, labels_test = test_df.drop(columns=[target_col]), test_df[target_col]

    model.fit(features_train, labels_train)

    # (true labels, predicted labels) per split; train is predicted first.
    predictions = {
        'train': (labels_train, model.predict(features_train)),
        'test': (labels_test, model.predict(features_test)),
    }

    # Metric suffix -> scoring callable taking (y_true, y_pred).
    scorers = {
        'accuracy': accuracy_score,
        'balanced_accuracy': balanced_accuracy_score,
        'f1_score': lambda y_true, y_pred: f1_score(y_true, y_pred, average='macro'),
    }

    # Metric-major ordering keeps each train/test pair adjacent in the result.
    return {
        f'{split}_{metric}': scorer(*predictions[split])
        for metric, scorer in scorers.items()
        for split in ('train', 'test')
    }



In [12]:
# Baseline 1: random forest with default hyperparameters, using every non-target column as a feature.
rf_results = train_and_evaluate_model(
    train_df=train_df,
    test_df=test_df,
    model=RandomForestClassifier(random_state=42)
)

In [13]:
# Train metrics of 1.0 against ~0.60 test accuracy in the recorded output indicate the forest overfits.
rf_results

{'train_accuracy': 1.0,
 'test_accuracy': 0.5977653631284916,
 'train_balanced_accuracy': 1.0,
 'test_balanced_accuracy': 0.43589316892604274,
 'train_f1_score': 1.0,
 'test_f1_score': 0.44160638858411927}

In [14]:
# Baseline 2: logistic regression on every non-target column.
# max_iter is raised to 10000 — presumably so the solver converges on the unscaled features; confirm.
lr_results = train_and_evaluate_model(
    train_df=train_df,
    test_df=test_df,
    model= LogisticRegression(max_iter=10000, random_state=42)
)

In [15]:
# Train and test scores are nearly equal in the recorded output (~0.59 accuracy),
# so there is no overfitting gap, but balanced accuracy stays near 0.40.
lr_results

{'train_accuracy': 0.5935754189944135,
 'test_accuracy': 0.5921787709497207,
 'train_balanced_accuracy': 0.4026159944919315,
 'test_balanced_accuracy': 0.3937924924139134,
 'train_f1_score': 0.4100989541385928,
 'test_f1_score': 0.3874134361390733}

In [16]:
def comprehensive_feature_search(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    target_col: str = 'FL_UDSD_cat',
    min_features: int = 2,
    max_features: int = None,
    models: dict = None
):
    """
    Test all possible feature subsets with multiple models.

    Every combination of ``min_features``..``max_features`` feature columns is
    fitted with every model; a model/subset pair that raises is reported via
    ``print`` and skipped. The same model instances are re-fitted for each
    subset.

    NOTE(review): results are ranked on test-set metrics, so choosing a subset
    from the top rows is optimistically biased; consider ranking on a separate
    validation split or cross-validation scores instead.

    Args:
        train_df: Training dataframe
        test_df: Test dataframe
        target_col: Name of target column to exclude from features
        min_features: Minimum number of features in a subset (default: 2)
        max_features: Maximum number of features in a subset (default: all
            features); values above the number of available features are
            clamped to it.
        models: Dictionary of model names and their corresponding sklearn model instances
            e.g. {'RandomForest': RandomForestClassifier(random_state=42), 'LogisticRegression': LogisticRegression(max_iter=10000, random_state=42)}

    Returns:
        pd.DataFrame: One row per (model, feature subset), sorted by test
        balanced accuracy in descending order. The original row index is kept
        (not reset). If nothing could be evaluated, an empty dataframe with the
        same columns is returned.
    """

    # Fixed schema so that an empty result set still has the sort column.
    result_columns = [
        'model', 'n_features', 'features',
        'train_accuracy', 'test_accuracy',
        'train_balanced_acc', 'test_balanced_acc',
        'train_f1', 'test_f1',
    ]

    # Get all available features (excluding target column)
    all_features = [col for col in train_df.columns if col != target_col]

    if max_features is None:
        max_features = len(all_features)
    # Subsets larger than the number of features do not exist; clamp so the
    # loop does not iterate over empty sizes.
    max_features = min(max_features, len(all_features))

    print(f"Total features available: {len(all_features)}")
    print(f"Features: {all_features}\n")

    # Define models to test
    if models is None:
        models = {
            'RandomForest': RandomForestClassifier(random_state=42, n_estimators=100),
            'LogisticRegression': LogisticRegression(max_iter=10000, random_state=42)
        }

    # The targets do not depend on the selected subset, so extract them once.
    y_train = train_df[target_col]
    y_test = test_df[target_col]

    results = []

    # Try all subset sizes from min_features to max_features
    for n_features in range(min_features, max_features + 1):
        # Materialized as a list so the count can be printed and tqdm knows the total.
        feature_combinations = list(combinations(all_features, n_features))

        print(f"Testing {len(feature_combinations)} combinations with {n_features} features...")

        # Test each combination with each model
        for features in tqdm(feature_combinations, desc=f"{n_features} features"):
            features = list(features)

            # Prepare data with selected features
            X_train = train_df[features]
            X_test = test_df[features]

            # Test each model
            for model_name, model in models.items():
                try:
                    # Train model
                    model.fit(X_train, y_train)

                    # Make predictions
                    train_preds = model.predict(X_train)
                    test_preds = model.predict(X_test)

                    # Calculate metrics
                    results.append({
                        'model': model_name,
                        'n_features': n_features,
                        'features': ', '.join(features),
                        'train_accuracy': accuracy_score(y_train, train_preds),
                        'test_accuracy': accuracy_score(y_test, test_preds),
                        'train_balanced_acc': balanced_accuracy_score(y_train, train_preds),
                        'test_balanced_acc': balanced_accuracy_score(y_test, test_preds),
                        'train_f1': f1_score(y_train, train_preds, average='macro'),
                        'test_f1': f1_score(y_test, test_preds, average='macro')
                    })

                except Exception as e:
                    # Deliberate best-effort: report the failing combination and move on.
                    print(f"Error with {model_name} and features {features}: {e}")
                    continue

    # Convert to DataFrame and sort by test balanced accuracy. Passing the
    # column list keeps sort_values from raising KeyError when results is empty.
    results_df = pd.DataFrame(results, columns=result_columns)
    results_df = results_df.sort_values('test_balanced_acc', ascending=False)

    return results_df



In [45]:
# Run the comprehensive search.
# In the recorded run this evaluated 1,013 feature subsets (sizes 2-10) with each of
# the two default models, so the cell is expensive to re-execute.
print("Starting comprehensive feature search...\n")
results_df = comprehensive_feature_search(
    train_df=train_df,
    test_df=test_df,
    target_col='FL_UDSD_cat',
    min_features=2,
    max_features=10  # Use all features or set a limit like 5
)

Starting comprehensive feature search...

Total features available: 10
Features: ['MMSE', 'CDRSUM', 'CDRGLOB', 'HVLT_DR', 'LASSI_A_CR2', 'LASSI_B_CR1', 'LASSI_B_CR2', 'APOE', 'AMYLPET', 'PTAU_217_CONCNTRTN']

Testing 45 combinations with 2 features...


2 features:   0%|          | 0/45 [00:00<?, ?it/s]

Testing 120 combinations with 3 features...


3 features:   0%|          | 0/120 [00:00<?, ?it/s]

Testing 210 combinations with 4 features...


4 features:   0%|          | 0/210 [00:00<?, ?it/s]

Testing 252 combinations with 5 features...


5 features:   0%|          | 0/252 [00:00<?, ?it/s]

Testing 210 combinations with 6 features...


6 features:   0%|          | 0/210 [00:00<?, ?it/s]

Testing 120 combinations with 7 features...


7 features:   0%|          | 0/120 [00:00<?, ?it/s]

Testing 45 combinations with 8 features...


8 features:   0%|          | 0/45 [00:00<?, ?it/s]

Testing 10 combinations with 9 features...


9 features:   0%|          | 0/10 [00:00<?, ?it/s]

Testing 1 combinations with 10 features...


10 features:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# group = results_df.groupby(by=['model', 'n_features']).get_group(('RandomForest', 2))

In [None]:
# results_df

Unnamed: 0,model,n_features,features,train_accuracy,test_accuracy,train_balanced_acc,test_balanced_acc,train_f1,test_f1
364,RandomForest,4,"MMSE, CDRSUM, LASSI_A_CR2, PTAU_217_CONCNTRTN",1.000000,0.620112,1.000000,0.484845,1.000000,0.493423
340,RandomForest,4,"MMSE, CDRSUM, CDRGLOB, AMYLPET",0.860335,0.620112,0.774320,0.483486,0.803889,0.483051
412,RandomForest,4,"MMSE, CDRGLOB, LASSI_B_CR1, AMYLPET",0.758380,0.603352,0.656367,0.470235,0.687492,0.474643
518,RandomForest,4,"CDRSUM, CDRGLOB, LASSI_A_CR2, PTAU_217_CONCNTRTN",1.000000,0.569832,1.000000,0.469613,1.000000,0.452519
90,RandomForest,3,"MMSE, CDRSUM, CDRGLOB",0.835196,0.597765,0.758400,0.468400,0.781502,0.468024
...,...,...,...,...,...,...,...,...,...
307,LogisticRegression,3,"LASSI_A_CR2, APOE, PTAU_217_CONCNTRTN",0.504190,0.430168,0.279284,0.226294,0.253354,0.193781
79,LogisticRegression,2,"LASSI_B_CR2, APOE",0.452514,0.424581,0.242034,0.221965,0.213241,0.190400
324,RandomForest,3,"LASSI_B_CR2, APOE, PTAU_217_CONCNTRTN",1.000000,0.324022,1.000000,0.211613,1.000000,0.216707
87,LogisticRegression,2,"APOE, PTAU_217_CONCNTRTN",0.446927,0.396648,0.234041,0.200729,0.201398,0.169549


In [None]:
# results_df['n_features'].max()

np.int64(4)

In [46]:
# For each (model, n_features) pair keep the row with the highest test macro-F1.
# idxmax returns index labels, which .loc can resolve because results_df keeps its
# original row index after sorting.
# NOTE(review): selecting on a test-set metric makes these scores optimistic.
best_results = results_df.loc[results_df.groupby(['model', 'n_features'])['test_f1'].idxmax()]

In [47]:
# One row per (model, n_features): the feature subset with the highest test macro-F1.
best_results

Unnamed: 0,model,n_features,features,train_accuracy,test_accuracy,train_balanced_acc,test_balanced_acc,train_f1,test_f1
57,LogisticRegression,2,"HVLT_DR, AMYLPET",0.501397,0.541899,0.294198,0.323525,0.259403,0.297877
275,LogisticRegression,3,"HVLT_DR, LASSI_B_CR1, AMYLPET",0.5,0.564246,0.295239,0.381689,0.266896,0.373958
459,LogisticRegression,4,"MMSE, LASSI_A_CR2, LASSI_B_CR1, LASSI_B_CR2",0.532123,0.581006,0.314331,0.383749,0.292688,0.378946
937,LogisticRegression,5,"MMSE, HVLT_DR, LASSI_A_CR2, LASSI_B_CR1, AMYLPET",0.53352,0.592179,0.332167,0.405473,0.320288,0.399241
1461,LogisticRegression,6,"MMSE, CDRGLOB, LASSI_B_CR1, APOE, AMYLPET, PTA...",0.567039,0.603352,0.372699,0.405433,0.374382,0.406012
1721,LogisticRegression,7,"MMSE, CDRSUM, CDRGLOB, LASSI_A_CR2, LASSI_B_CR...",0.583799,0.603352,0.382528,0.405407,0.387824,0.406926
1935,LogisticRegression,8,"MMSE, CDRSUM, CDRGLOB, HVLT_DR, LASSI_B_CR1, L...",0.579609,0.608939,0.387479,0.416546,0.390118,0.420624
2017,LogisticRegression,9,"MMSE, CDRSUM, CDRGLOB, LASSI_A_CR2, LASSI_B_CR...",0.593575,0.597765,0.395091,0.410065,0.403672,0.41518
2025,LogisticRegression,10,"MMSE, CDRSUM, CDRGLOB, HVLT_DR, LASSI_A_CR2, L...",0.593575,0.592179,0.402616,0.393792,0.410099,0.387413
2,RandomForest,2,"MMSE, CDRGLOB",0.615922,0.586592,0.450055,0.41974,0.485618,0.423327
