# Setup

## Import modules

In [5]:
import numpy as np
import pandas as pd

# Data handling
import sklearn.datasets as datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Models
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Cross validation
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Visualization
import matplotlib.pyplot as plt

# Single random seed reused wherever this notebook passes `random_state`,
# so the shuffled splits come out the same on every run.
SEED = 43

## Load data

In [6]:
# Load scikit-learn's bundled breast-cancer dataset as pandas objects
# (as_frame=True): `.data` holds the feature table, `.target` the labels.
dataS = datasets.load_breast_cancer(as_frame=True)
dataDF, targetDF = dataS.data, dataS.target

# Show the (n_samples, n_features) shape, then preview the first five rows.
print(dataDF.shape)
dataDF.head(n=5)

(569, 30)


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [7]:
# Convert the pandas objects to plain NumPy arrays so the fold index arrays
# produced later can be used for positional row selection.
X = dataDF.to_numpy()
y = targetDF.to_numpy()

# Hold out 20% of the samples as a test set; the seed fixes the shuffle.
# NOTE(review): the split is not stratified (no `stratify=y`), so class
# proportions in train/test are left to chance -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
)

## Model definition

In [12]:
# Standardize the features, then fit a logistic-regression classifier.
# Keeping the scaler inside the pipeline means it is re-fit on whatever data
# the pipeline is fit on (e.g. each CV training fold), so held-out samples
# never influence the scaling statistics.
# `multi_class` is intentionally not passed: 'auto' was already the default,
# and the parameter is deprecated since scikit-learn 1.5 (FutureWarning) and
# slated for removal, so spelling it out only hurts forward compatibility
# without changing the fitted model.
pipeline_lr = Pipeline([
    ('scaling', StandardScaler()),
    ('logreg', LogisticRegression(solver='liblinear')),
])

# K-fold cross-validation

## `StratifiedKFold`

In [13]:
# Manual stratified 10-fold cross-validation on the training split.
# `.split(...)` returns a one-shot generator of (train, validation) index
# arrays, so `skfold` is exhausted once the loop below has run.
skfold = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=SEED,
).split(X_train, y_train)

scores1 = []
for k, (train_idxs, validation_idxs) in enumerate(skfold):
    X_fold, y_fold = X_train[train_idxs], y_train[train_idxs]
    X_val, y_val = X_train[validation_idxs], y_train[validation_idxs]

    # `fit` refits `pipeline_lr` in place; after the loop it holds the
    # model trained on the last fold.
    pipeline_lr.fit(X_fold, y_fold)
    score = pipeline_lr.score(X_val, y_val)
    scores1.append(score)

    # Report the per-class sample counts of the fold's training part next to
    # the validation score.
    print(f'Fold: {k+1:2d}, Class dist.: {np.bincount(y_train[train_idxs])}, Acc: {score:.3f}')

Fold:  1, Class dist.: [156 253], Acc: 0.978
Fold:  2, Class dist.: [156 253], Acc: 0.978
Fold:  3, Class dist.: [155 254], Acc: 0.978
Fold:  4, Class dist.: [155 254], Acc: 0.978
Fold:  5, Class dist.: [155 254], Acc: 0.978
Fold:  6, Class dist.: [156 254], Acc: 0.978
Fold:  7, Class dist.: [156 254], Acc: 0.978
Fold:  8, Class dist.: [156 254], Acc: 0.978
Fold:  9, Class dist.: [156 254], Acc: 1.000
Fold: 10, Class dist.: [156 254], Acc: 1.000


In [14]:
# Summarize the manual stratified CV loop: mean of the per-fold scores
# +/- their standard deviation (np.std default, i.e. ddof=0).
print(f'CV accuracy: {np.mean(scores1):.3f} +/- {np.std(scores1):.3f}')

CV accuracy: 0.982 +/- 0.009


## `cross_val_score`

In [18]:
# Same 10-fold evaluation via the `cross_val_score` helper.
# Per the scikit-learn docs, an integer `cv` with a classifier uses
# StratifiedKFold without shuffling, so the folds (and therefore the per-fold
# scores) differ from the shuffled manual loop. `n_jobs=-1` runs the folds in
# parallel on all available cores.
scores2 = cross_val_score(pipeline_lr, X_train, y_train, cv=10, n_jobs=-1)

In [19]:
# List the score of every fold returned by `cross_val_score`,
# numbering the folds from 1 for display.
for idx, score in enumerate(scores2):
    print(f'Fold: {idx+1:2d}, Accuracy: {score:.3f}')

Fold:  1, Accuracy: 0.957
Fold:  2, Accuracy: 1.000
Fold:  3, Accuracy: 1.000
Fold:  4, Accuracy: 1.000
Fold:  5, Accuracy: 1.000
Fold:  6, Accuracy: 1.000
Fold:  7, Accuracy: 1.000
Fold:  8, Accuracy: 1.000
Fold:  9, Accuracy: 0.956
Fold: 10, Accuracy: 0.933


In [20]:
# Summarize the `cross_val_score` run: mean of the per-fold scores
# +/- their standard deviation (np.std default, i.e. ddof=0).
print(f'CV accuracy: {np.mean(scores2):.3f} +/- {np.std(scores2):.3f}')

CV accuracy: 0.985 +/- 0.024
