##### import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load the dataset
file_path = "RealisticSyntheticCognitiveData.csv"
df = pd.read_csv(file_path)

# Display basic info
print(df.info())
print(df.head())

# Preprocess the data:
# 1. Drop identifier columns (e.g., participant_id) that are not useful for prediction.
# 2. Separate features and target.
X = df.drop(columns=['participant_id', 'diagnosis'])
y = df['diagnosis']

# Encode the target variable (assuming values like 'no_alzheimers' and 'alzheimers' or similar)
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Standardize the feature variables.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into training and testing sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Train a Random Forest Classifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))


In [244]:
# Select one test sample (for example, the first one in the test set)
sample = X_test[0].reshape(1, -1)

# Make a prediction on the sample
predicted_class = clf.predict(sample)

# Convert the numeric prediction back to the original label
predicted_label = le.inverse_transform(predicted_class)

print("Predicted label for the test sample:", predicted_label[0])


Predicted label for the test sample: no_alzheimers


In [238]:
joblib.dump(clf, "alzheimers_model.pkl")
print("Saved RandomForestClassifier to alzheimers_model.pkl")


Saved RandomForestClassifier to alzheimers_model.pkl


In [242]:
import coremltools as ct
from coremltools.models.utils import rename_feature  # <-- import here
from coremltools.proto import Model_pb2
# -- Load back the same model (just to demonstrate the workflow)
clf_loaded = joblib.load("alzheimers_model.pkl")

# -- Convert using the scikit-learn–specific converter
#    Provide 'input_features' as a list of names matching your training columns.
coreml_model = ct.converters.sklearn.convert(
    clf_loaded,
    input_features=X.columns.tolist()  # e.g. ["age", "round1Correct", "round2Correct", ...]
)

# The above step generates a Core ML spec with default or numeric labels. 
# Next, we'll edit the spec to add custom string class labels and rename outputs.

spec = coreml_model.get_spec()

# 4) Insert your string class labels into the TreeEnsembleClassifier spec
#
#   - If the converter recognized a pipeline, you'll find the classifier inside
#     spec.pipeline.models[...] 
#   - Otherwise, it might be directly in spec.treeEnsembleClassifier
#

if spec.WhichOneof("Type") == "pipeline":
    # If there's a pipeline with multiple models, we find the classifier:
    for model_spec in spec.pipeline.models:
        if model_spec.HasField("treeEnsembleClassifier"):
            model_spec.treeEnsembleClassifier.classLabels.stringClassLabels[:] = [
                "no_alzheimers",
                "early_alzheimers",
                "advanced_alzheimers"
            ]
elif spec.WhichOneof("Type") == "treeEnsembleClassifier":
    # If it's directly a treeEnsembleClassifier
    spec.treeEnsembleClassifier.classLabels.stringClassLabels[:] = [
        "no_alzheimers",
        "early_alzheimers",
        "advanced_alzheimers"
    ]

# 5) Rename the predicted class output, if desired
#    Typically output[0] is the predicted label, output[1] is the probability dictionary.
rename_feature(spec, spec.description.output[0].name, "diagnosis_prediction")
rename_feature(spec, spec.description.output[1].name, "diagnosis_probability")

# 6) Create an MLModel from the updated spec and save
final_model = ct.models.MLModel(spec)
final_model.save("AlzheimersPredictor.mlmodel")
print("Core ML model saved: AlzheimersPredictor.mlmodel")

Core ML model saved: AlzheimersPredictor.mlmodel


In [246]:
import numpy as np

# Suppose 'final_model' is the MLModel you obtained after editing spec
# (the same one you saved to "AlzheimersPredictor.mlmodel").

# Let's grab one row from X_test:
one_test_sample = X_test[0]  # shape (13,) if you have 13 features
feature_names = X.columns.tolist()  # or the same list you used in the converter

# Create a dictionary: { feature_name: feature_value }
input_dict = {}
for i, name in enumerate(feature_names):
    # Cast to float to be safe, if it's a numpy float64
    input_dict[name] = float(one_test_sample[i])

# Use the Core ML model to predict
coreml_output = final_model.predict(input_dict)

# The output dictionary typically has:
#   • "diagnosis_prediction" for the label (since we renamed it)
#   • "diagnosis_probability" for the probabilities dictionary
print("Core ML output:", coreml_output)
print("Predicted label (Core ML):", coreml_output["diagnosis_prediction"])
print("Probabilities:", coreml_output["diagnosis_probability"])


Core ML output: {'diagnosis_probability': {0: 0.0, 1: 2.0, 2: 98.0}, 'diagnosis_prediction': 2}
Predicted label (Core ML): 2
Probabilities: {0: 0.0, 1: 2.0, 2: 98.0}


In [248]:
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Print the class order
print("LabelEncoder class order:", le.classes_)


LabelEncoder class order: ['alzheimers' 'maybe_alzheimers' 'no_alzheimers']


In [250]:
single_pred_sklearn = clf.predict(sample)  # might return [2]
print(le.inverse_transform(single_pred_sklearn))
# -> ['no_alzheimers']
coreml_output = final_model.predict(input_dict)
print(coreml_output["diagnosis_prediction"])


['no_alzheimers']
2
