Student 1: name: , i.d.: , github:
Student 2: name: , i.d.: , github:
Student 3: name: , i.d.: , github:

1. Load breast cancer dataset (**structured data**)

For more details about the data: https://scikit-learn.org/1.5/modules/generated/sklearn.datasets.load_breast_cancer.html

In [85]:
from sklearn.datasets import load_breast_cancer

# Load scikit-learn's built-in breast cancer dataset and keep the returned
# object around under the name `my_data`.
my_data = load_breast_cancer()


2. Split **my_data** into train and test sets:

- Define X_train, X_test, Y_train, Y_test
- Choose **test_size** for splitting **my_data**
- Use **train_test_split** (for details: https://scikit-learn.org/dev/modules/generated/sklearn.model_selection.train_test_split.html)

In [86]:
from sklearn.model_selection import train_test_split

# Pull the feature matrix and the class labels out of the loaded dataset;
# the split below needs them as separate objects.
X = my_data.data
y = my_data.target

# Hold out 20% of the samples for testing. The fixed random_state keeps the
# split identical between notebook runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


3. Libraries

In [87]:
!pip install mlflow
!pip install mlflow scikit-learn

import mlflow
import mlflow.sklearn
from mlflow import log_param, log_metric

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

import itertools
import pandas as pd




4. Define MLFlow experiment

In [88]:
# Name of the MLflow experiment used by this notebook.
EXPERIMENT_NAME = "trees_hyperparam"
# Select that experiment in MLflow; the call returns the Experiment object,
# which the notebook echoes as this cell's output.
mlflow.set_experiment(EXPERIMENT_NAME)


<Experiment: artifact_location='/content/mlruns/1', creation_time=1765719431580, experiment_id='1', last_update_time=1765719431580, lifecycle_stage='active', name='trees_hyperparam', tags={}>

5. Train **model_decision_tree**

- Library: sklearn.tree.DecisionTreeClassifier
- Data: X_train, Y_train
- **Essential**: explore and optimize DecisionTreeClassifier options   

In [89]:
from sklearn.tree import DecisionTreeClassifier

# Candidate values for each DecisionTreeClassifier hyperparameter.
param_1_list = [None, 5, 10]  # max_depth
param_2_list = [2, 5, 10]     # min_samples_split
param_3_list = [1, 2, 4]      # min_samples_leaf

# Every combination of the three lists above (full grid).
param_grid = list(itertools.product(param_1_list, param_2_list, param_3_list))

# One row per trained model; the random-forest and AdaBoost cells append here too.
results = []

for param_1, param_2, param_3 in param_grid:
    hyperparams = {
        "max_depth": param_1,
        "min_samples_split": param_2,
        "min_samples_leaf": param_3,
    }

    # Each combination is recorded as its own MLflow run.
    with mlflow.start_run():
        mlflow.log_param("model_type", "DecisionTree")
        for param_name, param_value in hyperparams.items():
            mlflow.log_param(param_name, param_value)

        model = DecisionTreeClassifier(random_state=42, **hyperparams)
        model.fit(X_train, Y_train)

        # Score the fitted tree on the held-out test split.
        y_pred = model.predict(X_test)
        scores = {
            "accuracy": accuracy_score(Y_test, y_pred),
            "precision_score": precision_score(Y_test, y_pred),
            "recall_score": recall_score(Y_test, y_pred),
            "f1_score": f1_score(Y_test, y_pred),
        }
        for metric_name, metric_value in scores.items():
            mlflow.log_metric(metric_name, metric_value)

        results.append(
            ["DecisionTree", param_1, param_2, param_3, *scores.values()]
        )


6. Train **model_random_forest**

- Library: sklearn.ensemble.RandomForestClassifier
- Data: X_train, Y_train
- **Essential**: explore and optimize RandomForestClassifier options

In [90]:
from sklearn.ensemble import RandomForestClassifier

# Candidate values for each RandomForestClassifier hyperparameter.
param_1_list = [50, 100]   # n_estimators
param_2_list = [None, 10]  # max_depth
param_3_list = [2, 5]      # min_samples_split

# Every combination of the three lists above (full grid).
param_grid = list(itertools.product(param_1_list, param_2_list, param_3_list))

for param_1, param_2, param_3 in param_grid:
    hyperparams = {
        "n_estimators": param_1,
        "max_depth": param_2,
        "min_samples_split": param_3,
    }

    # Each combination is recorded as its own MLflow run.
    with mlflow.start_run():
        mlflow.log_param("model_type", "RandomForest")
        for param_name, param_value in hyperparams.items():
            mlflow.log_param(param_name, param_value)

        model = RandomForestClassifier(random_state=42, **hyperparams)
        model.fit(X_train, Y_train)

        # Score the fitted forest on the held-out test split.
        y_pred = model.predict(X_test)
        scores = {
            "accuracy": accuracy_score(Y_test, y_pred),
            "precision_score": precision_score(Y_test, y_pred),
            "recall_score": recall_score(Y_test, y_pred),
            "f1_score": f1_score(Y_test, y_pred),
        }
        for metric_name, metric_value in scores.items():
            mlflow.log_metric(metric_name, metric_value)

        # `results` is the shared list created in the decision-tree cell.
        results.append(
            ["RandomForest", param_1, param_2, param_3, *scores.values()]
        )


7. Train model_adaboost

- Library: sklearn.ensemble.AdaBoostClassifier
- Data: X_train, Y_train
- **Essential**: explore and optimize AdaBoostClassifier options

In [91]:
from sklearn.ensemble import AdaBoostClassifier

# Candidate values for each AdaBoostClassifier hyperparameter.
param_1_list = [50, 100]   # n_estimators
param_2_list = [0.5, 1.0]  # learning_rate
param_3_list = ["SAMME"]   # algorithm (must be set exactly like this!)
# NOTE(review): newer scikit-learn releases deprecate the `algorithm`
# argument - confirm it is still accepted by the installed version.

# Every combination of the three lists above (full grid).
param_grid = list(itertools.product(param_1_list, param_2_list, param_3_list))

for param_1, param_2, param_3 in param_grid:
    hyperparams = {
        "n_estimators": param_1,
        "learning_rate": param_2,
        "algorithm": param_3,
    }

    # Each combination is recorded as its own MLflow run.
    with mlflow.start_run():
        mlflow.log_param("model_type", "AdaBoost")
        for param_name, param_value in hyperparams.items():
            mlflow.log_param(param_name, param_value)

        model = AdaBoostClassifier(random_state=42, **hyperparams)
        model.fit(X_train, Y_train)

        # Score the fitted ensemble on the held-out test split.
        y_pred = model.predict(X_test)
        scores = {
            "accuracy": accuracy_score(Y_test, y_pred),
            "precision_score": precision_score(Y_test, y_pred),
            "recall_score": recall_score(Y_test, y_pred),
            "f1_score": f1_score(Y_test, y_pred),
        }
        for metric_name, metric_value in scores.items():
            mlflow.log_metric(metric_name, metric_value)

        # `results` is the shared list created in the decision-tree cell.
        results.append(
            ["AdaBoost", param_1, param_2, param_3, *scores.values()]
        )




8. Store the results

In [92]:
from google.colab import files

# Collect every run logged under this notebook's experiment into a DataFrame
# (one row per run, with its logged parameters and metrics). Without this
# step `df` is undefined and the export below fails with a NameError.
df = mlflow.search_runs(experiment_names=[EXPERIMENT_NAME])

# Drop the run timestamp columns (any column whose name contains "time")
# before exporting; errors="ignore" keeps this a no-op if none are present.
# NOTE(review): presumably needed because pandas refuses to write
# timezone-aware datetimes to Excel - confirm against the pandas docs.
df = df.drop(columns=[col for col in df.columns if "time" in col.lower()], errors="ignore")
df.to_excel("student_name_results.xlsx", index=False)

# Trigger a browser download of the exported workbook from Colab.
files.download("student_name_results.xlsx")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>