# AutoPrognosis classification

Welcome! This tutorial will walk you through the steps of selecting a model for a classification task using AutoPrognosis.

### Setup

In [6]:
# stdlib
import json
import warnings

# third party
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")

# ===== Global imports =====
from pathlib import Path
from datetime import datetime
import pandas as pd
import numpy as np


### Import ClassifierStudy

ClassifierStudy is the engine that learns an ensemble of pipelines and their hyperparameters automatically.

In [None]:
# autoprognosis absolute
from autoprognosis.studies.classifiers import ClassifierStudy

### Load the target dataset

AutoPrognosis expects pandas.DataFrames as input.

For this example, we will use the [Breast Cancer Wisconsin Dataset](https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)).

In [None]:
# stdlib
from pathlib import Path

X, Y = load_breast_cancer(return_X_y=True, as_frame=True)

df = X.copy()
df["target"] = Y

### Create the classifier

While AutoPrognosis provides default plugins, it allows the user to customize the plugins for the pipelines.

You can see the supported plugins below:

In [6]:
# List the available plugins

# autoprognosis absolute
import json

from autoprognosis.plugins import Plugins
from pathlib import Path
from autoprognosis.studies.classifiers import ClassifierStudy


print(json.dumps(Plugins().list_available(), indent=2))

{
  "imputer": {
    "default": [
      "EM",
      "gain",
      "hyperimpute",
      "ice",
      "mean",
      "median",
      "mice",
      "missforest",
      "most_frequent",
      "nop",
      "sinkhorn",
      "softimpute"
    ]
  },
  "prediction": {
    "classifier": [
      "adaboost",
      "bagging",
      "bernoulli_naive_bayes",
      "catboost",
      "decision_trees",
      "extra_tree_classifier",
      "gaussian_naive_bayes",
      "gaussian_process",
      "gradient_boosting",
      "hist_gradient_boosting",
      "knn",
      "lda",
      "lgbm",
      "linear_svm",
      "logistic_regression",
      "multinomial_naive_bayes",
      "neural_nets",
      "perceptron",
      "qda",
      "random_forest",
      "ridge_classifier",
      "tabnet",
      "xgboost"
    ],
    "regression": [
      "bayesian_ridge",
      "catboost_regressor",
      "kneighbors_regressor",
      "linear_regression",
      "mlp_regressor",
      "neural_nets_regression",
      "random_fore

We will set a few custom plugins for the pipelines and create the classifier study.

In [10]:
import pandas as pd
from sklearn.datasets import load_breast_cancer

data = load_breast_cancer(as_frame=True)
df = data.frame

# 確認 target 欄位存在
df["target"] = data.target


workspace = Path("workspace")
workspace.mkdir(parents=True, exist_ok=True)

study_name = "classification_example"

study = ClassifierStudy(
    study_name=study_name,
    dataset=df,  # pandas DataFrame
    target="target",  # the label column in the dataset
    num_iter=2,  # DELETE THIS LINE FOR BETTER RESULTS. how many trials to do for each candidate. Default: 50
    num_study_iter=1,  # DELETE THIS LINE FOR BETTER RESULTS. how many outer iterations to do. Default: 5
    classifiers=[
        
        "lda",
        "qda",
    ],  # DELETE THIS LINE FOR BETTER RESULTS.
    workspace=workspace,
)

### Search for the optimal ensemble


In [16]:
study.run()

0,1,2
,models,"[<autoprognosi...0016111119470>, <autoprognosi...00161111191D0>]"
,weights,"[np.float64(0.5384615380473372), np.float64(0....3846118343195)]"
,explainer_plugins,[]
,explainers,
,explanations_nepoch,10000


In [15]:
from pathlib import Path
print(Path("workspace/_backend_trace.txt").read_text(encoding="utf-8"))


USING RANDOM SEARCH



In [None]:
# stdlib
import pprint

# autoprognosis absolute
from autoprognosis.utils.serialization import load_model_from_file
from autoprognosis.utils.tester import evaluate_estimator

output = workspace / study_name / "model.p"

model = load_model_from_file(output)

metrics = evaluate_estimator(model, X, Y)

print(f"Model {model.name()} ")
print("Score: ")

pprint.pprint(metrics)

## Serialization

In [None]:
# autoprognosis absolute
from autoprognosis.utils.serialization import load_from_file, save_to_file

out = workspace / "tmp.bkp"
# Fit the model
model.fit(X, Y)

# Save
save_to_file(out, model)

# Reload
loaded_model = load_from_file(out)

print(loaded_model.name())

assert loaded_model.name() == model.name()

out.unlink()

## Congratulations!

Congratulations on completing this notebook tutorial! If you enjoyed this and would like to join the movement towards Machine learning and AI for medicine, you can do so in the following ways!

### Star AutoPrognosis on GitHub

The easiest way to help our community is just by starring the Repos! This helps raise awareness of the tools we're building.

- [Star AutoPrognosis](https://github.com/vanderschaarlab/autoprognosis)
- [Star HyperImpute](https://github.com/vanderschaarlab/hyperimpute)


In [17]:
import sys
print(sys.executable)


c:\Users\user\Desktop\11401\Python資料科學\Final project\原碼\autoprognosis-main\.venv\Scripts\python.exe


In [21]:
from pathlib import Path
print("CWD =", Path.cwd())

CWD = c:\Users\user\Desktop\11401\Python資料科學\Final project\原碼\autoprognosis-main\tutorials\automl


In [2]:
from pathlib import Path
import pandas as pd

# 從 tutorials/automl 往上兩層 = autoprognosis-main
root = Path.cwd().parents[2]
print("Search root =", root)

candidates = list(root.rglob("*.csv"))

keys = ["pima", "diabetes"]
pima_files = [p for p in candidates if any(k in p.name.lower() for k in keys)]

print("Found candidates:")
for p in pima_files[:20]:
    print(" -", p)

assert len(pima_files) > 0, "找不到 PIMA csv：請確認檔名是否包含 pima/diabetes，或直接貼檔名我幫你改搜尋規則"

pima_path = pima_files[0]
print("Using:", pima_path)

df = pd.read_csv(pima_path)
print(df.shape)
df.head()


Search root = c:\Users\user\Desktop\11401\Python資料科學\Final project\原碼
Found candidates:
 - c:\Users\user\Desktop\11401\Python資料科學\Final project\原碼\autoprognosis-main\PIMA.csv
 - c:\Users\user\Desktop\11401\Python資料科學\Final project\原碼\autoprognosis-main\.venv\Lib\site-packages\lifelines\datasets\interval_diabetes.csv
Using: c:\Users\user\Desktop\11401\Python資料科學\Final project\原碼\autoprognosis-main\PIMA.csv
(768, 9)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
target_col = "Outcome"
df[target_col] = df[target_col].astype(int)
df[target_col].value_counts()


Outcome
0    500
1    268
Name: count, dtype: int64

In [4]:
from autoprognosis.studies.classifiers import ClassifierStudy
from pathlib import Path

workspace = Path("workspace_pima_random")
workspace.mkdir(parents=True, exist_ok=True)

study = ClassifierStudy(
    study_name="pima_random_search",
    dataset=df,
    target=target_col,
    num_iter=20,     #可調
    num_study_iter=1,
    classifiers=["lda", "qda"],
    workspace=workspace,
)

import time

t0 = time.time()
result = study.run()
t1 = time.time()

runtime_sec = t1 - t0
print("Runtime (sec):", runtime_sec)

result


Runtime (sec): 42.230963945388794


0,1,2
,models,"[<autoprognosi...002AD7F5EC2F0>, <autoprognosi...002AD7F5ED550>]"
,weights,"[np.float64(0.9090909082644627), np.float64(0....0909082644627)]"
,explainer_plugins,[]
,explainers,
,explanations_nepoch,10000


In [7]:
summary = {
    "time": datetime.now().isoformat(timespec="seconds"),
    "dataset": "PIMA",
    "backend": "random_search",
    "classifiers": ["lda", "qda"],
    "num_iter": 20,
    "num_study_iter": 1,
    "runtime_sec": runtime_sec
}

Path("workspace_pima_random/summary.json").write_text(json.dumps(summary, indent=2), encoding="utf-8")
print("Saved:", Path("workspace_pima_random/summary.json").resolve())


Saved: C:\Users\user\Desktop\11401\Python資料科學\Final project\原碼\autoprognosis-main\tutorials\automl\workspace_pima_random\summary.json
