In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score
from itertools import combinations
from tqdm.notebook import tqdm

In [None]:
# Load the raw dataset; the path is relative, so it resolves against the
# kernel's current working directory.
df = pd.read_csv('../data/synthetic_data.csv')

In [None]:
# Quick schema check: column names, dtypes and non-null counts of the raw frame.
df.info()

In [None]:
def preprocess_data(dataframe: pd.DataFrame, target_col: str='FL_UDSD', diagnosis_order: list=None) -> pd.DataFrame:
    """
    Filter invalid rows and encode the diagnosis target as an ordered categorical.

    The input frame is never modified; all work happens on a copy, so the
    function can be re-run on the same raw frame and give the same result.

    Args:
        dataframe (pd.DataFrame): Raw input data. Must contain ``target_col``,
            ``MMSE``, ``APOE`` and ``AMYLPET`` columns.
        target_col (str): Column holding the diagnosis label. Rows whose value
            is the string ``'Unknown'`` are removed.
        diagnosis_order (list): Category labels from lowest to highest. When
            ``None``, pandas infers the categories from the values present.
            Labels that are not in this list become missing values and get
            the code ``-1``.

    Returns:
        pd.DataFrame: A filtered copy of ``dataframe`` in which ``APOE`` and
        ``AMYLPET`` are categorical, ``target_col`` is an ordered categorical,
        and ``f"{target_col}_cat"`` holds its integer codes.
    """

    # Build both row filters from the argument (not a notebook global):
    # drop unknown targets and the -1 sentinel used for invalid MMSE values.
    valid_rows = (dataframe[target_col] != 'Unknown') & (dataframe["MMSE"] != -1)

    # Explicit copy so the column assignments below neither touch the caller's
    # frame nor trigger SettingWithCopyWarning.
    filter_df = dataframe.loc[valid_rows].copy()

    # Convert columns to categorical if needed
    filter_df['APOE'] = filter_df['APOE'].astype('category')
    filter_df['AMYLPET'] = filter_df['AMYLPET'].astype('category')

    # Encode the target on the frame that is actually returned, so callers can
    # rely on the "<target_col>_cat" column being present.
    filter_df[target_col] = pd.Categorical(
        filter_df[target_col], categories=diagnosis_order, ordered=True
    )
    filter_df[f'{target_col}_cat'] = filter_df[target_col].cat.codes

    return filter_df

In [None]:
# Diagnosis labels in the order used for the ordered categorical target.
# preprocess_data passes this list as the category order, so a label's
# position here is the integer code produced by `.cat.codes`.
diagnosis_order = [
    'Normal cognition',
    'Subjective Cognitive Decline',
    'Impaired Not SCD/MCI',
    'Early MCI',
    'Late MCI',
    'Dementia',
]

In [None]:
# Preprocess, then drop every row that still has a missing value in any column.
# dropna() is chained rather than applied in place, so the cell builds
# filter_df in one step and re-running it always starts again from `df`.
filter_df = preprocess_data(dataframe=df, diagnosis_order=diagnosis_order).dropna()

In [None]:
# Class balance check: number of remaining rows per diagnosis label.
filter_df['FL_UDSD'].value_counts()

In [None]:
# Same class balance, keyed by the integer-encoded target column.
# NOTE(review): this lookup needs 'FL_UDSD_cat' to exist on filter_df — confirm
# that preprocess_data attaches the column to the frame it returns.
filter_df['FL_UDSD_cat'].value_counts()

In [None]:
# Hold out 20% of the rows as a test set. Stratifying on the encoded target
# keeps the class proportions comparable across both splits, and the fixed
# seed makes the split repeatable.
train_df, test_df = train_test_split(
    filter_df,
    test_size=0.2,
    random_state=42,
    stratify=filter_df['FL_UDSD_cat'],
)

In [None]:
def train_and_evaluate_model(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    model,
    target_col: str = 'FL_UDSD_cat'
) -> dict:
    """
    Fit ``model`` on the training frame and score it on both splits.

    Every column except ``target_col`` is used as a feature, so the frames
    passed in must contain only model-ready predictors plus the target.

    Args:
        train_df: Training rows (features and target).
        test_df: Held-out rows with the same columns as ``train_df``.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            fitted in place.
        target_col: Name of the label column.

    Returns:
        dict: Accuracy, balanced accuracy and macro-averaged F1, each reported
        for the train and the test split.
    """
    def split_features_and_labels(frame: pd.DataFrame):
        # Features are whatever remains once the label column is removed.
        return frame.drop(columns=[target_col]), frame[target_col]

    features_fit, labels_fit = split_features_and_labels(train_df)
    features_eval, labels_eval = split_features_and_labels(test_df)

    model.fit(features_fit, labels_fit)

    # Predict on the training rows as well, so over-fitting shows up as a gap
    # between the train_* and test_* scores.
    fitted_preds = model.predict(features_fit)
    heldout_preds = model.predict(features_eval)

    return dict(
        train_accuracy=accuracy_score(labels_fit, fitted_preds),
        test_accuracy=accuracy_score(labels_eval, heldout_preds),
        train_balanced_accuracy=balanced_accuracy_score(labels_fit, fitted_preds),
        test_balanced_accuracy=balanced_accuracy_score(labels_eval, heldout_preds),
        train_f1_score=f1_score(labels_fit, fitted_preds, average='macro'),
        test_f1_score=f1_score(labels_eval, heldout_preds, average='macro'),
    )



In [None]:
# Usage: call it multiple times
# train_and_evaluate_model uses every non-target column as a feature, so the
# raw 'FL_UDSD' label (the column 'FL_UDSD_cat' is derived from) has to be
# removed first; otherwise the model is handed the answer as an input.
model_train_df = train_df.drop(columns=['FL_UDSD'], errors='ignore')
model_test_df = test_df.drop(columns=['FL_UDSD'], errors='ignore')

rf_results = train_and_evaluate_model(model_train_df, model_test_df, RandomForestClassifier(random_state=42))
lr_results = train_and_evaluate_model(model_train_df, model_test_df, LogisticRegression(max_iter=1000, random_state=42))

In [None]:
from itertools import combinations
from tqdm.notebook import tqdm

def comprehensive_feature_search(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    target_col: str = 'FL_UDSD_cat',
    min_features: int = 2,
    max_features: int = None
) -> pd.DataFrame:
    """
    Exhaustively evaluate feature subsets with a random forest and a logistic regression.

    Every column of ``train_df`` other than ``target_col`` is treated as a
    candidate feature. For each subset size from ``min_features`` to
    ``max_features`` every combination is fitted with both models and scored
    on the train and test frames. The number of fits grows combinatorially
    with the number of candidate columns, so keep ``max_features`` small for
    wide frames.

    Note: rows are ranked by a metric computed on ``test_df``. Picking the top
    row therefore uses the test set for model selection, and its score is an
    optimistic estimate of performance on unseen data.

    Args:
        train_df: Training dataframe
        test_df: Test dataframe (must contain the same feature columns)
        target_col: Name of target column to exclude from features
        min_features: Minimum number of features in a subset (default: 2);
            values below 1 are treated as 1
        max_features: Maximum number of features in a subset (default: all
            features); values above the number of candidates are capped

    Returns:
        pd.DataFrame: One row per (model, feature subset) that fitted
        successfully, sorted by test balanced accuracy (best first). If
        nothing could be evaluated, an empty frame with the same columns.
    """
    result_columns = [
        'model', 'n_features', 'features',
        'train_accuracy', 'test_accuracy',
        'train_balanced_acc', 'test_balanced_acc',
        'train_f1', 'test_f1',
    ]

    # Get all available features (excluding target column)
    all_features = [col for col in train_df.columns if col != target_col]

    # Subsets larger than the candidate pool do not exist; capping avoids
    # empty iterations. Sizes below 1 are meaningless (and negative sizes make
    # itertools.combinations raise), so the lower bound is clamped as well.
    if max_features is None or max_features > len(all_features):
        max_features = len(all_features)
    smallest_subset = max(min_features, 1)

    print(f"Total features available: {len(all_features)}")
    print(f"Features: {all_features}\n")

    # Define models to test; the same estimator objects are refit per subset.
    models = {
        'RandomForest': RandomForestClassifier(random_state=42, n_estimators=100),
        'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42)
    }

    # The labels do not depend on the feature subset, so extract them once.
    y_train = train_df[target_col]
    y_test = test_df[target_col]

    results = []

    # Try all subset sizes from the lower bound to max_features
    for n_features in range(smallest_subset, max_features + 1):
        # Generate all combinations of n_features
        feature_combinations = list(combinations(all_features, n_features))

        print(f"Testing {len(feature_combinations)} combinations with {n_features} features...")

        # Test each combination with each model
        for features in tqdm(feature_combinations, desc=f"{n_features} features"):
            features = list(features)

            # Prepare data with selected features
            X_train = train_df[features]
            X_test = test_df[features]

            # Test each model
            for model_name, model in models.items():
                try:
                    # Train model
                    model.fit(X_train, y_train)

                    # Make predictions
                    train_preds = model.predict(X_train)
                    test_preds = model.predict(X_test)

                    # Calculate metrics
                    results.append({
                        'model': model_name,
                        'n_features': n_features,
                        'features': ', '.join(features),
                        'train_accuracy': accuracy_score(y_train, train_preds),
                        'test_accuracy': accuracy_score(y_test, test_preds),
                        'train_balanced_acc': balanced_accuracy_score(y_train, train_preds),
                        'test_balanced_acc': balanced_accuracy_score(y_test, test_preds),
                        'train_f1': f1_score(y_train, train_preds, average='macro'),
                        'test_f1': f1_score(y_test, test_preds, average='macro')
                    })

                except Exception as e:
                    # Best-effort search: report the failing combination and
                    # carry on with the remaining ones.
                    print(f"Error with {model_name} and features {features}: {e}")
                    continue

    # With no successful fit there is no 'test_balanced_acc' column to sort
    # by; return an empty frame with the expected schema instead of raising.
    if not results:
        return pd.DataFrame(columns=result_columns)

    # Convert to DataFrame and sort by test balanced accuracy
    results_df = pd.DataFrame(results, columns=result_columns)
    results_df = results_df.sort_values('test_balanced_acc', ascending=False)

    return results_df



In [None]:
# Run the comprehensive search
# The raw 'FL_UDSD' label is dropped before searching: the search treats every
# non-target column as a candidate feature, and 'FL_UDSD_cat' is derived
# directly from 'FL_UDSD', so leaving it in would offer the label as a feature.
print("Starting comprehensive feature search...\n")
results_df = comprehensive_feature_search(
    train_df=train_df.drop(columns=['FL_UDSD'], errors='ignore'),
    test_df=test_df.drop(columns=['FL_UDSD'], errors='ignore'),
    target_col='FL_UDSD_cat',
    min_features=2,
    max_features=10  # Largest subset size to try; pass None to search every size
)