# Intro
The code below uses the 33-60 age group in the colorectal
cancer dataset as a worked example. Age was encoded with three indicator columns as follows:

*   33-60: 'g_age_3grp1 = 1'
*   61-72: 'g_age_3grp2 = 1'
*   73+: 'g_age_3grp3 = 1'

# Training on All Groups and Testing on an Individual Group

In [None]:
import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_validate, train_test_split, KFold
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC

# Initialize lists to store results
# NOTE: run_model_evaluation creates its own local lists with these names, so
# these module-level lists are never filled by it; they are kept only so that
# any other cell referring to them keeps working.
Accur_mean = []
prec_mean = []
rec_mean = []
f1_mean = []

# Load the dataset
# `loadpath` is a placeholder and must be defined before this cell is run.
df = pd.read_csv(loadpath) #Currently placeholder
df = df.set_index('Sample')
print(df.head())

# Prepare the features and labels
X = df.drop('outcome', axis=1)  # Features
# Forward fill to handle NaN values. `DataFrame.ffill()` replaces the
# deprecated `fillna(method='ffill')` spelling and behaves the same.
# NOTE(review): forward filling copies values from the previous row (another
# sample) and leaves NaNs in leading rows untouched - confirm this imputation
# is intended.
X = X.ffill()
y = df.outcome  # Labels

# Split the dataset into train and test sets (70/30, unseeded, so the split
# changes from run to run)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=None)
# Shuffled 10-fold splitter made available to later cells
cv = KFold(n_splits=10, random_state=None, shuffle=True)

# Function to run the model evaluation and return metrics
def run_model_evaluation(model, X_train, y_train, X_test, y_test, group_column, group_value,
                         n_repeats=20):
    """Fit ``model`` on the training split and score it on one subgroup of the test split.

    The model is re-fitted on ``(X_train, y_train)`` ``n_repeats`` times. After
    each fit it predicts the test rows whose ``group_column`` equals
    ``group_value``, and accuracy plus macro-averaged precision, recall and F1
    are recorded. Repeating the fit captures the run-to-run variation of
    estimators that are not seeded; a deterministic estimator yields identical
    rows and a standard deviation of zero.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            re-fitted in place on every repeat.
        X_train: Training features (all groups).
        y_train: Training labels.
        X_test: Held-out features as a DataFrame containing ``group_column``.
        y_test: Held-out labels as a Series, row-aligned with ``X_test``.
        group_column: Name of the column in ``X_test`` used to select the group.
        group_value: Value of ``group_column`` identifying the group.
        n_repeats: Number of fit/score repetitions (default 20).

    Returns:
        DataFrame with columns ``Accuracy``, ``Precision``, ``Recall`` and
        ``F1``: one row per repeat (index ``0 .. n_repeats - 1``) followed by a
        row labelled ``'mean'`` and a row labelled ``'std'``.

    Raises:
        ValueError: If no test row belongs to the requested group.
    """
    # Select the held-out rows of the requested group. A positional mask is
    # used so X_test and y_test are filtered identically row by row.
    in_group = (X_test[group_column] == group_value).to_numpy()
    X_group = X_test[in_group].ffill()  # Handle missing data
    y_group = y_test[in_group]

    if X_group.empty:
        raise ValueError(
            f"No test samples with {group_column} == {group_value!r}; cannot evaluate."
        )

    rows = []
    for _ in range(n_repeats):
        # Train on the full training data (all groups)...
        model.fit(X_train, y_train)

        # ...and score that fitted model on the group. Passing the group to
        # cross_validate instead would clone the estimator and re-fit it on
        # folds of the group alone, ignoring the model trained above.
        y_pred = model.predict(X_group)
        rows.append({
            'Accuracy': accuracy_score(y_group, y_pred),
            'Precision': precision_score(y_group, y_pred, average='macro'),
            'Recall': recall_score(y_group, y_pred, average='macro'),
            'F1': f1_score(y_group, y_pred, average='macro'),
        })

    results = pd.DataFrame(rows, columns=['Accuracy', 'Precision', 'Recall', 'F1'])

    # Summary rows are given explicit labels so they cannot be confused with
    # repeat number 0.
    avg = results.mean().to_frame('mean').T
    std = results.std().to_frame('std').T

    return pd.concat([results, avg, std])

## Random Forest

### Tuning the Model

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier

# Load and prepare data
df = pd.read_csv('your_data.csv')  # Placeholder path
df_i = df.set_index('Sample')

# This cell rebinds X, y and the train/test split that the model cells below
# use, so missing values are forward filled here exactly as in the first
# data-loading cell; otherwise estimators that reject NaN would fail on fit.
X = df_i.drop(['outcome'], axis=1).ffill()
y = df_i['outcome']

# Train/test split (70/30, unseeded)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Storing results
# NOTE: run_tuning keeps its own local lists; these module-level lists are not
# filled by it and are kept only for backward compatibility.
Accur = []
Estimator = []
Sample_split = []

# Grid search over hyperparameters
def run_tuning():
    """Grid-search Random Forest hyperparameters with 10-fold cross-validation.

    Every combination of ``min_samples_split`` in ``range(2, 14, 3)`` and
    ``n_estimators`` in ``range(5, 150, 5)`` is scored with
    ``cross_val_score(..., cv=10)`` on the module-level training split
    (``X_train``, ``y_train``). Only the training split is used so that the
    held-out test rows do not influence hyperparameter selection.

    Returns:
        DataFrame with one row per combination and columns ``Estimator``
        (n_estimators), ``Sample Split`` (min_samples_split) and ``Accuracy``
        (mean cross-validated score), ordered by ``Sample Split`` then
        ``Estimator`` as iterated.
    """
    records = []

    for min_split in range(2, 14, 3):
        for n_est in range(5, 150, 5):
            model = RandomForestClassifier(bootstrap=True, class_weight=None,
                criterion='gini', max_depth=None, max_features='sqrt',
                max_leaf_nodes=None, min_impurity_decrease=0.0, min_samples_leaf=1,
                min_samples_split=min_split, min_weight_fraction_leaf=0.0,
                n_estimators=n_est, n_jobs=-1, oob_score=False, random_state=None,
                verbose=0, warm_start=False)

            # Tune on the training split only: cross-validating on the full
            # X, y would let the test rows leak into model selection.
            scores = cross_val_score(model, X_train, y_train, cv=10)
            records.append({
                'Estimator': n_est,
                'Sample Split': min_split,
                'Accuracy': scores.mean(),
            })

    return pd.DataFrame(records, columns=['Estimator', 'Sample Split', 'Accuracy'])

# Repeat the whole grid search three times; the forests are unseeded, so each
# run gives slightly different accuracies for the same parameter pair.
results_run0 = run_tuning()
results_run1 = run_tuning()
results_run2 = run_tuning()

# Line the three runs up side by side on the hyperparameter pair, ending with
# one accuracy column per run: Accuracy_0, Accuracy_1, Accuracy_2.
key_cols = ['Estimator', 'Sample Split']
run2_renamed = results_run2.rename(columns={'Accuracy': 'Accuracy_2'})
merged = results_run0.merge(results_run1, on=key_cols, suffixes=('_0', '_1'))
merged = merged.merge(run2_renamed, on=key_cols)

# Mean accuracy of each parameter pair over the three runs
accuracy_cols = ['Accuracy_0', 'Accuracy_1', 'Accuracy_2']
merged['Avg Accuracy'] = merged[accuracy_cols].mean(axis=1)

# The top row after sorting by average accuracy is the chosen combination
ranked = merged.sort_values('Avg Accuracy', ascending=False)
best = ranked.head(1)
final_estimator = best['Estimator'].values[0]
final_split = best['Sample Split'].values[0]

print("\nBest hyperparameters found:")
print(f"n_estimators = {final_estimator}")
print(f"min_samples_split = {final_split}")
print(best)

### Training/Testing the Model

In [None]:
# Build the Random Forest using the hyperparameters selected by the tuning
# step (final_estimator -> n_estimators, final_split -> min_samples_split);
# every other argument is spelled out explicitly.
rfc = RandomForestClassifier(
    n_estimators=final_estimator,
    min_samples_split=final_split,
    criterion='gini',
    max_depth=None,
    max_features='sqrt',
    max_leaf_nodes=None,
    min_samples_leaf=1,
    min_impurity_decrease=0.0,
    min_weight_fraction_leaf=0.0,
    bootstrap=True,
    oob_score=False,
    class_weight=None,
    warm_start=False,
    random_state=None,
    n_jobs=-1,
    verbose=0,
)

In [None]:
# Evaluate the Random Forest on test rows with g_age_3grp1 == 1
# (the 33-60 age group according to the encoding listed in the intro).
group_column_rf, group_value_rf = 'g_age_3grp1', 1
rfc_accuracy = run_model_evaluation(
    rfc,
    X_train,
    y_train,
    X_test,
    y_test,
    group_column_rf,
    group_value_rf,
)
print("Random Forest Performance:", rfc_accuracy, sep="\n")

## Multinomial Logistic Regression

In [None]:
#Initializing MLogit
mlog = LogisticRegression(random_state=None, multi_class='multinomial',
                        penalty='none', solver='newton-cg').fit(X_train, y_train)

In [None]:
# Evaluate the logistic regression on test rows with g_age_3grp1 == 1
# (the 33-60 age group according to the encoding listed in the intro).
group_column_mlog, group_value_mlog = 'g_age_3grp1', 1
mlog_accuracy = run_model_evaluation(
    mlog,
    X_train,
    y_train,
    X_test,
    y_test,
    group_column_mlog,
    group_value_mlog,
)
print("Logistic Regression Performance:", mlog_accuracy, sep="\n")

## Linear Support Vector Classifier

In [None]:
#Initializing Linear SVC
svc = svm.LinearSVC(penalty='l2', loss='squared_hinge', dual=True,
                    tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True,
                    intercept_scaling=1, class_weight=None, verbose=0,
                    random_state=None, max_iter=1000).fit(X_train, y_train)

In [None]:
# Evaluate the linear SVC on test rows with g_age_3grp1 == 1
# (the 33-60 age group according to the encoding listed in the intro).
group_column_svc, group_value_svc = 'g_age_3grp1', 1
svc_accuracy = run_model_evaluation(
    svc,
    X_train,
    y_train,
    X_test,
    y_test,
    group_column_svc,
    group_value_svc,
)
print("Linear SVC Performance:", svc_accuracy, sep="\n")

## Linear Discriminant Analysis

In [None]:
#Initializing LDA
lda = LinearDiscriminantAnalysis(solver='svd', shrinkage=None, priors=None,
                                 n_components=None, store_covariance=False,
                                 tol=0.0001, covariance_estimator=None).fit(X_train, y_train)

In [None]:
# Evaluate LDA on test rows with g_age_3grp1 == 1
# (the 33-60 age group according to the encoding listed in the intro).
group_column_lda, group_value_lda = 'g_age_3grp1', 1
lda_accuracy = run_model_evaluation(
    lda,
    X_train,
    y_train,
    X_test,
    y_test,
    group_column_lda,
    group_value_lda,
)
print("LDA Performance:", lda_accuracy, sep="\n")

## Multilayer Perceptron

In [None]:
#Initializing MLP
mlp = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', alpha=0.0001,
                    learning_rate='constant', learning_rate_init=0.001,max_iter=200,
                    random_state=None, tol=0.0001, verbose=False, warm_start=False,
                    validation_fraction=0.1, max_fun=15000).fit(X_train, y_train)

In [None]:
# Evaluate the MLP on test rows with g_age_3grp1 == 1
# (the 33-60 age group according to the encoding listed in the intro).
group_column_mlp, group_value_mlp = 'g_age_3grp1', 1
mlp_accuracy = run_model_evaluation(
    mlp,
    X_train,
    y_train,
    X_test,
    y_test,
    group_column_mlp,
    group_value_mlp,
)
print("MLP Performance:", mlp_accuracy, sep="\n")

# Training on All Groups and Testing on a Merged Group
Here `g_age_3grp1 = 0` indicates that a patient belongs to either group 2 (61-72) or group 3 (73+). Filtering on this value therefore selects the merged Group 2+3.

## Sample in Random Forest

In [None]:
# Evaluate the Random Forest on test rows with g_age_3grp1 == 0, i.e. every
# row that is not flagged as belonging to the 33-60 group.
group_column_rf_merge, group_value_rf_merge = 'g_age_3grp1', 0
rfc_accuracy_merge = run_model_evaluation(
    rfc,
    X_train,
    y_train,
    X_test,
    y_test,
    group_column_rf_merge,
    group_value_rf_merge,
)
print("Random Forest Performance:", rfc_accuracy_merge, sep="\n")