In [76]:
from typing import Dict

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import warnings



Defining some constants

In [50]:
RANDOM_STATE = 42

ignore warnings about convergence of models, etc.

In [51]:
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")

# Import dataset



In [52]:
df = pd.read_csv('./data/breast-cancer-diagnostic.shuf.lrn.csv')
df.head()

Unnamed: 0,ID,class,radiusMean,textureMean,perimeterMean,areaMean,smoothnessMean,compactnessMean,concavityMean,concavePointsMean,...,radiusWorst,textureWorst,perimeterWorst,areaWorst,smoothnessWorst,compactnessWorst,concavityWorst,concavePointsWorst,symmetryWorst,fractalDimensionWorst
0,886452,True,13.96,17.05,91.43,602.4,0.1096,0.1279,0.09789,0.05246,...,16.39,22.07,108.1,826.0,0.1512,0.3262,0.3209,0.1374,0.3068,0.07957
1,84348301,True,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
2,9012795,True,21.37,15.1,141.3,1386.0,0.1001,0.1515,0.1932,0.1255,...,22.69,21.84,152.1,1535.0,0.1192,0.284,0.4024,0.1966,0.273,0.08666
3,894326,True,18.22,18.87,118.7,1027.0,0.09746,0.1117,0.113,0.0795,...,21.84,25.0,140.9,1485.0,0.1434,0.2763,0.3853,0.1776,0.2812,0.08198
4,867387,False,15.71,13.93,102.0,761.7,0.09462,0.09462,0.07135,0.05933,...,17.5,19.25,114.3,922.8,0.1223,0.1949,0.1709,0.1374,0.2723,0.07071


In [53]:
df.describe()

Unnamed: 0,ID,radiusMean,textureMean,perimeterMean,areaMean,smoothnessMean,compactnessMean,concavityMean,concavePointsMean,symmetryMean,...,radiusWorst,textureWorst,perimeterWorst,areaWorst,smoothnessWorst,compactnessWorst,concavityWorst,concavePointsWorst,symmetryWorst,fractalDimensionWorst
count,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,...,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0
mean,25755170.0,13.946439,19.376246,90.756842,637.428772,0.096595,0.104231,0.085204,0.047139,0.179774,...,16.038446,25.909614,105.767088,854.987719,0.132928,0.253865,0.266263,0.112879,0.287262,0.08377
std,107390000.0,3.488308,4.278841,24.062045,340.172969,0.014748,0.05523,0.077423,0.038661,0.029706,...,4.785408,6.101124,33.468918,550.723964,0.025036,0.165161,0.210121,0.067894,0.062336,0.019355
min,8913.0,7.691,9.71,47.98,170.4,0.06251,0.01938,0.0,0.0,0.106,...,8.678,12.02,54.49,223.6,0.08125,0.03432,0.0,0.0,0.1566,0.05521
25%,868871.0,11.51,16.39,73.99,406.3,0.08588,0.06545,0.02987,0.01899,0.1601,...,12.84,21.59,82.98,506.2,0.1148,0.1432,0.1117,0.06296,0.2482,0.07055
50%,905189.0,13.14,18.9,85.24,530.6,0.09597,0.08751,0.05485,0.0311,0.1776,...,14.73,25.34,96.09,656.7,0.1312,0.2053,0.1932,0.09265,0.279,0.07944
75%,8812816.0,15.5,21.84,102.8,747.2,0.1059,0.1284,0.1155,0.06772,0.1943,...,18.13,29.94,123.5,1030.0,0.1483,0.3253,0.3853,0.1663,0.3157,0.0918
max,911296200.0,25.73,39.28,174.2,2010.0,0.1634,0.3454,0.4264,0.1913,0.304,...,33.13,44.87,229.3,3234.0,0.2226,1.058,1.17,0.291,0.6638,0.2075


In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285 entries, 0 to 284
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       285 non-null    int64  
 1   class                    285 non-null    bool   
 2   radiusMean               285 non-null    float64
 3    textureMean             285 non-null    float64
 4    perimeterMean           285 non-null    float64
 5    areaMean                285 non-null    float64
 6    smoothnessMean          285 non-null    float64
 7    compactnessMean         285 non-null    float64
 8    concavityMean           285 non-null    float64
 9    concavePointsMean       285 non-null    float64
 10   symmetryMean            285 non-null    float64
 11   fractalDimensionMean    285 non-null    float64
 12   radiusStdErr            285 non-null    float64
 13   textureStdErr           285 non-null    float64
 14   perimeterStdErr         2

## Data preparation

1. Check for missing values


In [55]:
print(f'Missing values: {df.isnull().sum().any()}')

Missing values: False


1Encode Boolean target attribute 'class' as Integer
2Drop the 'ID' attribute
3Separate the 'class' attribute into its own variable

In [56]:
df['class'] = df['class'].astype(int)
X = df.drop(columns=['ID', 'class'])
Y = df['class']


Training-test data split for holdout method

In [57]:
holdout_X_train, holdout_X_test, holdout_Y_train, holdout_Y_test = train_test_split(X, Y, test_size=0.2,
                                                                                    random_state=42)


Data split for cross-validation method
Scaling set up in pipelines for individual algorithms

In [58]:
cross_validation_split = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

## Prepare data structures and useful functions

In [59]:
def get_metrics_dict(
        accuracy: float,
        f1: float,
        precision: float,
        recall: float,
) -> Dict[str, float]:
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }


def find_best_estimator(
        classifier,
        param_grid: dict,
        cv: int = 5
) -> GridSearchCV:
    grid_search = GridSearchCV(
        classifier,
        param_grid=param_grid,
        cv=cv,
        scoring="accuracy",
        n_jobs=-1
    )
    grid_search.fit(holdout_X_train, holdout_Y_train)
    return grid_search.best_estimator_


## Random Forest

In [60]:
def run_random_forest(classifier: RandomForestClassifier | None = None) -> list[dict[str, any]]:
    if classifier is None:
        classifier = RandomForestClassifier()

    classifier.set_params(random_state=RANDOM_STATE)

    # Holdout method
    classifier.fit(holdout_X_train, holdout_Y_train)
    holdout_y_pred = classifier.predict(holdout_X_test)
    holdout_results = get_metrics_dict(
        accuracy=accuracy_score(holdout_Y_test, holdout_y_pred),
        f1=f1_score(holdout_Y_test, holdout_y_pred),
        precision=precision_score(holdout_Y_test, holdout_y_pred),
        recall=recall_score(holdout_Y_test, holdout_y_pred),
    )

    # Cross-validation
    cv_scores = cross_validate(classifier, X, Y, cv=cross_validation_split,
                               scoring=['accuracy', 'f1', 'precision', 'recall'])
    cv_results = get_metrics_dict(
        accuracy=cv_scores['test_accuracy'].mean(),
        f1=cv_scores['test_f1'].mean(),
        precision=cv_scores['test_precision'].mean(),
        recall=cv_scores['test_recall'].mean(),
    )

    common_results = {
        "classifier": "Random Forest",
        "n_estimators": classifier.n_estimators,
        "max_depth": classifier.max_depth,
        "min_samples_split": classifier.min_samples_split,
        "min_samples_leaf": classifier.min_samples_leaf,
    }

    return [
        {
            **common_results,
            "Data Split": "Holdout",
            **holdout_results
        },
        {
            **common_results,
            "Data Split": "Cross Validation",
            **cv_results
        }
    ]


Test random forest in various configurations

In [61]:
rf_classifiers = [
    RandomForestClassifier(n_estimators=100, min_samples_split=2, min_samples_leaf=1),
    RandomForestClassifier(n_estimators=200, min_samples_split=4, min_samples_leaf=1),
    RandomForestClassifier(n_estimators=100, min_samples_split=2, min_samples_leaf=4, max_depth=15),
    RandomForestClassifier(n_estimators=150, min_samples_split=5, min_samples_leaf=2, max_depth=20),
    RandomForestClassifier(n_estimators=250, min_samples_split=3, min_samples_leaf=3, max_depth=10)
]

rf_results = []
for classifier in rf_classifiers:
    rf_results.extend(run_random_forest(classifier))  # Assumes run_random_forest is defined elsewhere

rf_results_df = pd.DataFrame(rf_results)
rf_results_df.sort_values(by='accuracy', ascending=False).round(3)

Unnamed: 0,classifier,n_estimators,max_depth,min_samples_split,min_samples_leaf,Data Split,accuracy,f1,precision,recall
3,Random Forest,200,,4,1,Cross Validation,0.961,0.943,0.959,0.928
1,Random Forest,100,,2,1,Cross Validation,0.958,0.937,0.948,0.928
9,Random Forest,250,10.0,3,3,Cross Validation,0.954,0.933,0.938,0.928
8,Random Forest,250,10.0,3,3,Holdout,0.953,0.939,0.939,0.939
6,Random Forest,150,20.0,5,2,Holdout,0.953,0.939,0.939,0.939
2,Random Forest,200,,4,1,Holdout,0.953,0.939,0.939,0.939
7,Random Forest,150,20.0,5,2,Cross Validation,0.951,0.927,0.938,0.918
5,Random Forest,100,15.0,2,4,Cross Validation,0.951,0.926,0.948,0.908
0,Random Forest,100,,2,1,Holdout,0.942,0.923,0.938,0.909
4,Random Forest,100,15.0,2,4,Holdout,0.942,0.923,0.938,0.909


Attempt to find best configuration using GridSearchCV

In [62]:
rf_param_grid = {
    'n_estimators': [100, 150, 200, 250, 300],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 3],
}

best_rf = find_best_estimator(
    classifier=RandomForestClassifier(),
    param_grid=rf_param_grid,
    cv=5
)

best_rf_results = pd.DataFrame(run_random_forest(best_rf))
best_rf_results

Unnamed: 0,classifier,n_estimators,max_depth,min_samples_split,min_samples_leaf,Data Split,accuracy,f1,precision,recall
0,Random Forest,200,,2,1,Holdout,0.953488,0.939394,0.939394,0.939394
1,Random Forest,200,,2,1,Cross Validation,0.961404,0.942708,0.959474,0.928421



## MLP


In [63]:
def run_mlp(classifier: MLPClassifier | None = None) -> list[dict[str, any]]:
    if classifier is None:
        classifier = MLPClassifier()

    # create a pipeline which both scales data using standard scaler and then estimates using MLP
    classifier.set_params(random_state=RANDOM_STATE)
    pipeline = Pipeline([
        ('scale', StandardScaler()),
        ('mlp', classifier),
    ])
    # holdout method
    pipeline.fit(holdout_X_train, holdout_Y_train)
    holdout_y_pred = pipeline.predict(holdout_X_test)

    holdout_results = get_metrics_dict(
        accuracy=accuracy_score(holdout_Y_test, holdout_y_pred),
        f1=f1_score(holdout_Y_test, holdout_y_pred),
        precision=precision_score(holdout_Y_test, holdout_y_pred),
        recall=recall_score(holdout_Y_test, holdout_y_pred),
    )

    # cross validation
    cv_scores = cross_validate(pipeline, X, Y, cv=cross_validation_split,
                               scoring=['accuracy', 'f1', 'precision', 'recall'])
    cv_results = get_metrics_dict(
        accuracy=cv_scores['test_accuracy'].mean(),
        f1=cv_scores['test_f1'].mean(),
        precision=cv_scores['test_precision'].mean(),
        recall=cv_scores['test_recall'].mean(),
    )

    common_results = {
        "classifier": "MLP",
        "hidden_layer_sizes": classifier.hidden_layer_sizes,
        "max_iter": classifier.max_iter,
        "activation": classifier.activation,
        "solver": classifier.solver,
    }

    return [
        {
            **common_results,
            "Data Split": "Holdout",
            **holdout_results
        },
        {
            **common_results,
            "Data Split": "Cross Validation",
            **cv_results
        }
    ]

Test MLP in various configurations.
There are quite a lot of parameters to vary here, we can test only a limited amount.

In [64]:

mlp_classifiers = [
    MLPClassifier(hidden_layer_sizes=(100,), max_iter=200),
    MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=200),
    MLPClassifier(hidden_layer_sizes=(200,), max_iter=300, activation="logistic"),
    MLPClassifier(hidden_layer_sizes=(100, 50, 25), max_iter=300, solver="lbfgs"),
    MLPClassifier(hidden_layer_sizes=(300,), max_iter=500, activation="identity")
]

mlp_results = []
for classifier in mlp_classifiers:
    mlp_results.extend(run_mlp(classifier))

mlp_results_df = pd.DataFrame(mlp_results)
mlp_results_df.sort_values(by='accuracy', ascending=False).round(3)

Unnamed: 0,classifier,hidden_layer_sizes,max_iter,activation,solver,Data Split,accuracy,f1,precision,recall
4,MLP,"(200,)",300,logistic,adam,Holdout,0.988,0.985,1.0,0.97
5,MLP,"(200,)",300,logistic,adam,Cross Validation,0.982,0.973,1.0,0.948
3,MLP,"(100, 50)",200,relu,adam,Cross Validation,0.979,0.969,0.979,0.959
0,MLP,"(100,)",200,relu,adam,Holdout,0.977,0.97,0.97,0.97
8,MLP,"(300,)",500,identity,adam,Holdout,0.977,0.97,0.97,0.97
1,MLP,"(100,)",200,relu,adam,Cross Validation,0.975,0.964,0.97,0.959
9,MLP,"(300,)",500,identity,adam,Cross Validation,0.968,0.954,0.95,0.959
2,MLP,"(100, 50)",200,relu,adam,Holdout,0.965,0.955,0.941,0.97
7,MLP,"(100, 50, 25)",300,relu,lbfgs,Cross Validation,0.965,0.948,0.95,0.948
6,MLP,"(100, 50, 25)",300,relu,lbfgs,Holdout,0.942,0.928,0.889,0.97


Attempt to find best configuration using GridSearchCV

In [65]:
mlp_param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (200,), (100, 50), (100, 50, 25)],
    'max_iter': [200, 300, 500],
    'activation': ['relu', 'tanh', 'logistic'],  # Optional for activation exploration
    'solver': ['adam', 'sgd'],  # Optional for solver exploration
}

best_mlp = find_best_estimator(
    classifier=MLPClassifier(),
    param_grid=mlp_param_grid,
    cv=5
)

best_mlp_results = pd.DataFrame(run_mlp(best_mlp))
best_mlp_results

Unnamed: 0,classifier,hidden_layer_sizes,max_iter,activation,solver,Data Split,accuracy,f1,precision,recall
0,MLP,"(100, 50)",300,tanh,adam,Holdout,0.965116,0.955224,0.941176,0.969697
1,MLP,"(100, 50)",300,tanh,adam,Cross Validation,0.964912,0.948421,0.948421,0.948421


## SVC

In [66]:
def run_svc(classifier: SVC | None = None) -> list[dict[str, any]]:
    if classifier is None:
        classifier = SVC()

    pipeline = Pipeline([
        ('scale', StandardScaler()),
        ('svc', classifier),
    ])
    # Holdout method
    pipeline.fit(holdout_X_train, holdout_Y_train)
    holdout_y_pred = pipeline.predict(holdout_X_test)
    holdout_results = get_metrics_dict(
        accuracy=accuracy_score(holdout_Y_test, holdout_y_pred),
        f1=f1_score(holdout_Y_test, holdout_y_pred),
        precision=precision_score(holdout_Y_test, holdout_y_pred),
        recall=recall_score(holdout_Y_test, holdout_y_pred),
    )

    # Cross-validation
    cv_scores = cross_validate(pipeline, X, Y, cv=cross_validation_split,
                               scoring=['accuracy', 'f1', 'precision', 'recall'])
    cv_results = get_metrics_dict(
        accuracy=cv_scores['test_accuracy'].mean(),
        f1=cv_scores['test_f1'].mean(),
        precision=cv_scores['test_precision'].mean(),
        recall=cv_scores['test_recall'].mean(),
    )

    common_results = {
        "classifier": "SVC",
        "kernel": classifier.kernel,
        "C": classifier.C,
        "gamma": classifier.gamma,
        "degree": classifier.degree,
        "coef0": classifier.coef0
    }

    return [
        {
            **common_results,
            "Data Split": "Holdout",
            **holdout_results
        },
        {
            **common_results,
            "Data Split": "Cross Validation",
            **cv_results
        }
    ]


Test SVC in various configurations

In [67]:
svc_classifiers = [
    SVC(kernel='linear', C=0.1, gamma='scale'),
    SVC(kernel='rbf', C=1.0, gamma=0.1),
    SVC(kernel='poly', degree=2, C=1.0, gamma='auto', coef0=0.0),
    SVC(kernel='poly', degree=3, C=10.0, gamma='scale', coef0=1.0),
    SVC(kernel='sigmoid', C=0.5, gamma=0.01, coef0=0.5)
]

svc_results = []
for classifier in svc_classifiers:
    svc_results.extend(run_svc(classifier))

svc_results_df = pd.DataFrame(svc_results)
svc_results_df.sort_values(by='accuracy', ascending=False).round(3)

Unnamed: 0,classifier,kernel,C,gamma,degree,coef0,Data Split,accuracy,f1,precision,recall
0,SVC,linear,0.1,scale,3,0.0,Holdout,0.988,0.985,1.0,0.97
1,SVC,linear,0.1,scale,3,0.0,Cross Validation,0.982,0.974,1.0,0.949
9,SVC,sigmoid,0.5,0.01,3,0.5,Cross Validation,0.968,0.951,1.0,0.907
2,SVC,rbf,1.0,0.1,3,0.0,Holdout,0.965,0.954,0.969,0.939
8,SVC,sigmoid,0.5,0.01,3,0.5,Holdout,0.965,0.952,1.0,0.909
7,SVC,poly,10.0,scale,3,1.0,Cross Validation,0.958,0.94,0.924,0.959
3,SVC,rbf,1.0,0.1,3,0.0,Cross Validation,0.947,0.925,0.915,0.939
6,SVC,poly,10.0,scale,3,1.0,Holdout,0.942,0.928,0.889,0.97
4,SVC,poly,1.0,auto,2,0.0,Holdout,0.826,0.706,1.0,0.545
5,SVC,poly,1.0,auto,2,0.0,Cross Validation,0.8,0.605,0.936,0.454


Attempt to find best configuration using GridSearchCV


In [68]:
svc_param_grid = {
    'kernel': ['linear', 'rbf', 'poly'],
    'C': [0.1, 1, 10],
    'gamma': ['scale', 0.1],
    'degree': [2, 3],
    'coef0': [0.0, 0.5]
}

best_svc = find_best_estimator(
    classifier=SVC(),
    param_grid=svc_param_grid,
    cv=5
)

best_svc_results = pd.DataFrame(run_svc(best_svc))
best_svc_results


Unnamed: 0,classifier,kernel,C,gamma,degree,coef0,Data Split,accuracy,f1,precision,recall
0,SVC,linear,1,scale,2,0.0,Holdout,0.988372,0.984615,1.0,0.969697
1,SVC,linear,1,scale,2,0.0,Cross Validation,0.978947,0.968355,0.99,0.948421


## Combining results

In [87]:
results = pd.concat(
    [rf_results_df, mlp_results_df, svc_results_df, best_rf_results, best_mlp_results, best_svc_results], join='inner')
results.sort_values(by='accuracy', ascending=False).round(3)

Unnamed: 0,classifier,Data Split,accuracy,f1,precision,recall
0,SVC,Holdout,0.988,0.985,1.0,0.97
0,SVC,Holdout,0.988,0.985,1.0,0.97
4,MLP,Holdout,0.988,0.985,1.0,0.97
5,MLP,Cross Validation,0.982,0.973,1.0,0.948
1,SVC,Cross Validation,0.982,0.974,1.0,0.949
1,SVC,Cross Validation,0.979,0.968,0.99,0.948
3,MLP,Cross Validation,0.979,0.969,0.979,0.959
0,MLP,Holdout,0.977,0.97,0.97,0.97
8,MLP,Holdout,0.977,0.97,0.97,0.97
1,MLP,Cross Validation,0.975,0.964,0.97,0.959
