In [None]:
from ucimlrepo import fetch_ucirepo

# Download the dataset with UCI repository id 17
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17)

# Feature matrix and target column(s), exposed by ucimlrepo as pandas dataframes
X, y = (
    breast_cancer_wisconsin_diagnostic.data.features,
    breast_cancer_wisconsin_diagnostic.data.targets,
)

# Show the dataset-level metadata first, then the per-variable information
for dataset_section in (
    breast_cancer_wisconsin_diagnostic.metadata,
    breast_cancer_wisconsin_diagnostic.variables,
):
    print(dataset_section)

{'uci_id': 17, 'name': 'Breast Cancer Wisconsin (Diagnostic)', 'repository_url': 'https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic', 'data_url': 'https://archive.ics.uci.edu/static/public/17/data.csv', 'abstract': 'Diagnostic Wisconsin Breast Cancer Database.', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 569, 'num_features': 30, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['Diagnosis'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1993, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C5DW2B', 'creators': ['William Wolberg', 'Olvi Mangasarian', 'Nick Street', 'W. Street'], 'intro_paper': {'title': 'Nuclear feature extraction for breast tumor diagnosis', 'authors': 'W. Street, W. Wolberg, O. Mangasarian', 'published_in': 'Electronic imaging', 'year': 1993, 'url': 'https://www.semanticscholar.org/paper/53

In [None]:
# import the required libraries
import pandas as pd
import numpy as np

In [None]:
# helper function to split the data
def _take_rows(data, row_idx):
    """Select rows by *position* from a pandas object or any array-like."""
    if isinstance(data, (pd.DataFrame, pd.Series)):
        # .iloc guarantees positional selection regardless of the index labels
        return data.iloc[row_idx]
    return np.asarray(data)[row_idx]


def train_test_split(X, y, test_size=0.2, random_state=None):
    """Randomly split features and targets into train and test subsets.

    Parameters
    ----------
    X : pandas DataFrame/Series or array-like
        Samples, one per row.
    y : pandas DataFrame/Series or array-like
        Targets; must have the same number of rows as ``X``.
    test_size : float, default 0.2
        Fraction of the samples assigned to the test subset, in [0, 1].
    random_state : int or None, default None
        Seed for the shuffle. When given, a private ``RandomState`` is used so
        the global NumPy random state is left untouched (the permutation is the
        same one ``np.random.seed(random_state)`` would have produced). When
        ``None``, the global NumPy generator is used.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Row subsets of ``X`` and ``y``. Pandas inputs stay pandas objects;
        other array-likes are returned as NumPy arrays.

    Raises
    ------
    ValueError
        If ``test_size`` is outside [0, 1] or ``X`` and ``y`` differ in length.
    """
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size!r}")

    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError(
            f"X and y have inconsistent lengths: {n_samples} != {len(y)}"
        )

    # shuffle the indices with a local generator when a seed is supplied, so
    # calling this helper does not silently reseed the rest of the notebook
    if random_state is None:
        shuffle = np.random.shuffle
    else:
        shuffle = np.random.RandomState(random_state).shuffle
    indices = np.arange(n_samples)
    shuffle(indices)

    # split indices: the first (1 - test_size) share goes to training
    split_idx = int(len(indices) * (1 - test_size))
    train_idx, test_idx = indices[:split_idx], indices[split_idx:]

    # X and y are dispatched independently, so a DataFrame X may be paired
    # with an ndarray y (or the reverse) without breaking
    X_train, X_test = _take_rows(X, train_idx), _take_rows(X, test_idx)
    y_train, y_test = _take_rows(y, train_idx), _take_rows(y, test_idx)

    return X_train, X_test, y_train, y_test


# 80/20 split with a fixed seed, then report the shape of every resulting piece
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)
for split_name, split_part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{split_name} shape:", split_part.shape)

# Show the full target frame that was split
print(y)




X_train shape: (455, 30)
X_test shape: (114, 30)
y_train shape: (455, 1)
y_test shape: (114, 1)
     Diagnosis
0            1
1            1
2            1
3            1
4            1
..         ...
564          1
565          1
566          1
567          1
568          0

[569 rows x 1 columns]


In [None]:
# helper function to standardize the data
def standard_scaler(X_train, X_test):
    """Standardize both splits using statistics of the training split only.

    Each column is shifted by the training mean and divided by the training
    standard deviation, so the test split is transformed with parameters it
    did not contribute to.

    Returns
    -------
    (X_train_scaled, X_test_scaled)
    """
    center = np.mean(X_train, axis=0)
    spread = np.std(X_train, axis=0)

    # Constant columns have zero spread; dividing by 1 instead leaves them
    # at (value - mean) rather than producing a division by zero.
    safe_spread = np.where(spread == 0, 1, spread)

    def _rescale(block):
        return (block - center) / safe_spread

    return _rescale(X_train), _rescale(X_test)



# Scale both splits, then compare per-feature means before and after scaling
X_train_scaled, X_test_scaled = standard_scaler(X_train, X_test)
for stage, stage_data in (("Original", X_train), ("Scaled", X_train_scaled)):
    print(f"{stage} X_train mean:", np.mean(stage_data, axis=0))



Original X_train mean: radius1                14.053226
texture1               19.311341
perimeter1             91.496571
area1                 647.105055
smoothness1             0.096316
compactness1            0.104691
concavity1              0.088904
concave_points1         0.048606
symmetry1               0.181284
fractal_dimension1      0.062977
radius2                 0.398979
texture2                1.232834
perimeter2              2.828935
area2                  38.947615
smoothness2             0.007021
compactness2            0.025552
concavity2              0.032247
concave_points2         0.011661
symmetry2               0.020617
fractal_dimension2      0.003799
radius3                16.203837
texture3               25.737363
perimeter3            106.930593
area3                 872.951429
smoothness3             0.132254
compactness3            0.254927
concavity3              0.272903
concave_points3         0.113689
symmetry3               0.291066
fractal_dimension3  

Use the scikit-learn package and evaluate its performance

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
# Split the dataset into training and testing sets (80/20, fixed seed).
# NOTE: `train_test_split` here is scikit-learn's function imported in this
# cell; it shadows the hand-written helper of the same name defined earlier.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The targets come back as a single-column DataFrame. scikit-learn estimators
# expect a 1-D target and emit a DataConversionWarning for a column vector, so
# collapse them to Series before fitting/scoring.
y_train = y_train.squeeze()
y_test = y_test.squeeze()

# It's good practice to scale features before feeding them to SVM.
# The scaler is fitted on the training split only and reused on the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train the classifier with default parameters
svm_model = SVC(random_state=42)
svm_model.fit(X_train_scaled, y_train)
# Make predictions
y_pred = svm_model.predict(X_test_scaled)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.9824561403508771
              precision    recall  f1-score   support

           B       0.97      1.00      0.99        71
           M       1.00      0.95      0.98        43

    accuracy                           0.98       114
   macro avg       0.99      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114



  y = column_or_1d(y, warn=True)


In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, f1_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# The targets are a single-column DataFrame; scikit-learn expects a 1-D target
# and warns on every fit otherwise, so work with a Series from here on.
y_series = y.squeeze()

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_series, test_size=0.2, random_state=42)

# Create a pipeline that includes scaling and the classifier.
# probability=True is needed for predict_proba (used for the ROC AUC below).
svm_pipeline = make_pipeline(StandardScaler(), SVC(random_state=42, probability=True))

# Initialize StratifiedKFold for cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation on the full dataset. All three metrics are computed
# in a single pass, so each fold is fitted once instead of once per metric;
# the folds are identical because `skf` has a fixed random_state.
cv_scores = cross_validate(
    svm_pipeline,
    X,
    y_series,
    cv=skf,
    scoring=['accuracy', 'f1_macro', 'recall_macro'],
)
cv_accuracy = cv_scores['test_accuracy']
cv_f1 = cv_scores['test_f1_macro']
cv_recall = cv_scores['test_recall_macro']

# Train the pipeline on the training split (scaling happens inside the pipeline)
svm_pipeline.fit(X_train, y_train)

# Make predictions
y_pred = svm_pipeline.predict(X_test)

# Probability of the second class in `classes_` for each test sample
y_scores = svm_pipeline.predict_proba(X_test)[:, 1]

# Calculate ROC AUC Score
roc_auc = roc_auc_score(y_test, y_scores)

# Evaluate the model
print("Cross-Validated Accuracy:", cv_accuracy.mean())
print("Cross-Validated F1 Score:", cv_f1.mean())
print("Cross-Validated Recall:", cv_recall.mean())
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Test F1 Score:", f1_score(y_test, y_pred, average='macro'))
print("Test Recall:", recall_score(y_test, y_pred, average='macro'))
print("ROC AUC Score:", roc_auc)
print(classification_report(y_test, y_pred))


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Cross-Validated Accuracy: 0.9771619313771154
Cross-Validated F1 Score: 0.9753836264480608
Cross-Validated Recall: 0.9731786715122779
Test Accuracy: 0.9824561403508771
Test F1 Score: 0.9811507936507937
Test Recall: 0.9767441860465116
ROC AUC Score: 0.99737962659679
              precision    recall  f1-score   support

           B       0.97      1.00      0.99        71
           M       1.00      0.95      0.98        43

    accuracy                           0.98       114
   macro avg       0.99      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114



runtime measurement

In [None]:
import time
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, f1_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Start the total runtime timer.
# time.perf_counter() is used for every interval in this cell: it is a
# monotonic, high-resolution clock, whereas time.time() follows the wall clock
# and can jump if the system time is adjusted.
total_start_time = time.perf_counter()

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

y_train = y_train.squeeze()  # Using .squeeze() to convert DataFrame to Series
y_test = y_test.squeeze()

# Create a pipeline that includes scaling and the classifier
svm_pipeline = make_pipeline(StandardScaler(), SVC(random_state=42, probability=True))

# Initialize KFold for cross-validation
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Perform K-Fold cross-validation (on the training split only)
start_time = time.perf_counter()  # Timer for K-Fold cross-validation
cv_results = cross_val_score(svm_pipeline, X_train, y_train, cv=kf, scoring='accuracy')
print("K-Fold Cross-validation completed in {:.2f} seconds".format(time.perf_counter() - start_time))
print("Average accuracy: {:.2f}%".format(np.mean(cv_results) * 100))

# Train the pipeline on the training split (scaling happens inside the pipeline)
start_time = time.perf_counter()
svm_pipeline.fit(X_train, y_train)
print("Training completed in {:.2f} seconds".format(time.perf_counter() - start_time))

# Make predictions
start_time = time.perf_counter()
y_pred = svm_pipeline.predict(X_test)
print("Prediction completed in {:.2f} seconds".format(time.perf_counter() - start_time))

# Calculate the ROC AUC Score (not included in the prediction timing above)
y_scores = svm_pipeline.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_scores)

# Evaluation
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Test F1 Score:", f1_score(y_test, y_pred, average='macro'))
print("Test Recall:", recall_score(y_test, y_pred, average='macro'))
print("ROC AUC Score:", roc_auc)
print(classification_report(y_test, y_pred))

# End the total runtime timer and print the total elapsed time
total_elapsed_time = time.perf_counter() - total_start_time
print("Total execution time: {:.2f} seconds".format(total_elapsed_time))


K-Fold Cross-validation completed in 0.61 seconds
Average accuracy: 97.58%
Training completed in 0.12 seconds
Prediction completed in 0.02 seconds
Test Accuracy: 0.9824561403508771
Test F1 Score: 0.9811507936507937
Test Recall: 0.9767441860465116
ROC AUC Score: 0.99737962659679
              precision    recall  f1-score   support

           B       0.97      1.00      0.99        71
           M       1.00      0.95      0.98        43

    accuracy                           0.98       114
   macro avg       0.99      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114

Total execution time: 0.91 seconds


5-fold validation score

In [None]:
# initialize KFold for cross-validation (shuffled, fixed seed for reproducibility)
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Perform K-Fold cross-validation on the training split.
# The interval is measured with time.perf_counter(), a monotonic clock meant
# for timing; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()  # Timer for K-Fold cross-validation
cv_results = cross_val_score(svm_pipeline, X_train, y_train, cv=kf, scoring='accuracy')
print("K-Fold Cross-validation completed in {:.2f} seconds".format(time.perf_counter() - start_time))
print("Average accuracy: {:.2f}%".format(np.mean(cv_results) * 100))


K-Fold Cross-validation completed in 0.62 seconds
Average accuracy: 97.58%


passive learning (random selection)

In [None]:
# Seeds for the repeated simulations; the passive and active runs share this
# list so that both start from the same initial model for each seed.
random_seeds = list(range(219, 219 + 5))

In [None]:
# Passive learning: grow the training set one randomly chosen sample per round
# and record, for each round, the 5-fold CV accuracy on the training set and
# the accuracy on the remaining (not yet selected) samples.

# per-seed learning curves
CV_train_all_passive = list()
test_all_passive = list()

# scikit-learn expects a 1-D target; passing the single-column target frame
# triggers a DataConversionWarning on every fit, so flatten the labels once.
labels_1d = np.ravel(y)

for random_seed in random_seeds:
    # same seed => same shuffled order => same initial training set per seed
    np.random.seed(random_seed)
    idices = list(range(y.shape[0]))
    np.random.shuffle(idices)

    # initialize: the first 20% of the shuffled order is the starting training set
    n_train = int(len(idices) * 0.2)
    train_idices = idices[:n_train]
    test_idices = idices[n_train:]

    CV_train_per_seed = list()
    test_per_seed = list()

    # keep adding samples until the training set is half the size of the rest
    while len(train_idices) / len(test_idices) < 0.5:
        # cross-validate in the training set
        clf = SVC(kernel='linear')
        CV_train_per_seed.append(
            np.mean(cross_val_score(clf, X.iloc[train_idices], labels_1d[train_idices], cv=5))
        )
        # fresh estimator so the evaluation model shares nothing with the CV run
        clf = SVC(kernel='linear')
        clf.fit(X.iloc[train_idices], labels_1d[train_idices])
        y_pred = clf.predict(X.iloc[test_idices])
        test_per_seed.append(accuracy_score(labels_1d[test_idices], y_pred))

        # select the next point to move from test to train
        # for passive learning, just take the next index in the shuffled order
        n_train = len(train_idices) + 1
        train_idices = idices[:n_train]
        test_idices = idices[n_train:]

    CV_train_all_passive.append(CV_train_per_seed)
    test_all_passive.append(test_per_seed)

# aggregate across seeds: rows are seeds, columns are rounds
CV_train_all_passive = np.array(CV_train_all_passive)
test_all_passive = np.array(test_all_passive)
CV_train_mean_passive = np.mean(CV_train_all_passive, axis=0)
CV_train_std_passive = np.std(CV_train_all_passive, axis=0)
test_mean_passive = np.mean(test_all_passive, axis=0)
test_std_passive = np.std(test_all_passive, axis=0)

active learning (uncertainty sampling)

In [None]:
# Active learning (uncertainty sampling): grow the training set one sample per
# round, always picking the remaining sample closest to the SVM decision
# boundary, and record per-round CV accuracy on the training set and accuracy
# on the remaining samples.
# NOTE(review): the "test" pool shrinks as the most uncertain samples are moved
# out of it, so its accuracy is measured on progressively easier points —
# confirm this is the intended evaluation.

# per-seed learning curves
CV_train_all_active = list()
test_all_active = list()

# scikit-learn expects a 1-D target; passing the single-column target frame
# triggers a DataConversionWarning on every fit, so flatten the labels once.
labels_1d = np.ravel(y)

for random_seed in random_seeds:
    # same seed => same shuffled order => same initial training set per seed
    np.random.seed(random_seed)
    idices = list(range(y.shape[0]))
    np.random.shuffle(idices)

    # initialize: the first 20% of the shuffled order is the starting training
    # set (both slices are new lists, so they can be mutated independently)
    n_initial = int(len(idices) * 0.2)
    train_idices = idices[:n_initial]
    test_idices = idices[n_initial:]

    CV_train_per_seed = list()
    test_per_seed = list()

    # keep adding samples until the training set is half the size of the rest
    while len(train_idices) / len(test_idices) < 0.5:
        # cross-validate in the training set
        clf = SVC(kernel='linear')
        CV_train_per_seed.append(
            np.mean(cross_val_score(clf, X.iloc[train_idices], labels_1d[train_idices], cv=5))
        )
        # fresh estimator so the evaluation model shares nothing with the CV run
        clf = SVC(kernel='linear')
        clf.fit(X.iloc[train_idices], labels_1d[train_idices])
        y_pred = clf.predict(X.iloc[test_idices])
        test_per_seed.append(accuracy_score(labels_1d[test_idices], y_pred))

        # select the next point to move from test to train
        # for active learning, take the one closest to the decision boundary:
        # distance = |w . x + b| / ||w||  (coef_ is available for the linear kernel)
        weights = clf.coef_[0]
        distances = np.abs(np.dot(X.iloc[test_idices], weights) + clf.intercept_) / \
            np.linalg.norm(weights, ord=2, axis=0)
        closest_index = np.argmin(distances)

        # move the index from test to train
        train_idices.append(test_idices.pop(closest_index))

    CV_train_all_active.append(CV_train_per_seed)
    test_all_active.append(test_per_seed)

# aggregate across seeds: rows are seeds, columns are rounds
CV_train_all_active = np.array(CV_train_all_active)
test_all_active = np.array(test_all_active)
CV_train_mean_active = np.mean(CV_train_all_active, axis=0)
CV_train_std_active = np.std(CV_train_all_active, axis=0)
test_mean_active = np.mean(test_all_active, axis=0)
test_std_active = np.std(test_all_active, axis=0)

visualize the result for active learning against passive learning

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Training-set view: mean cross-validated accuracy per round (error bars show
# the standard deviation across seeds), passive vs. active learning.
train_curves = (
    # (per-round mean, per-round std, colour, legend label)
    (CV_train_mean_passive, CV_train_std_passive, 'steelblue', 'CV_train_passive'),
    (CV_train_mean_active, CV_train_std_active, 'gold', 'CV_train_active'),
)

for curve_mean, curve_std, curve_color, curve_label in train_curves:
    plt.errorbar(
        np.arange(curve_mean.shape[0]),
        curve_mean,
        yerr=curve_std,
        fmt='s',
        color=curve_color,
        ecolor=curve_color,
        linestyle='--',
        capsize=5,
        label=curve_label,
    )

# axis labels, title and grid so the figure is readable on its own
plt.xlabel('Round Number')
plt.ylabel('Accuracy')
plt.title('Performance for Passive and Active Learning over training set in each round')
plt.grid(True, linestyle='--')

# legend entries come from the `label` of each errorbar call
plt.legend()

plt.show()

In [None]:
# Testing-set view: mean accuracy on the remaining samples per round (error
# bars show the standard deviation across seeds), passive vs. active learning.
test_curves = (
    # (per-round mean, per-round std, colour, legend label)
    (test_mean_passive, test_std_passive, 'orange', 'test_passive'),
    (test_mean_active, test_std_active, 'green', 'test_active'),
)

for curve_mean, curve_std, curve_color, curve_label in test_curves:
    plt.errorbar(
        np.arange(curve_mean.shape[0]),
        curve_mean,
        yerr=curve_std,
        fmt='s',
        color=curve_color,
        ecolor=curve_color,
        linestyle='--',
        capsize=5,
        label=curve_label,
    )

# axis labels, title and grid so the figure is readable on its own
plt.xlabel('Round Number')
plt.ylabel('Accuracy')
plt.title('Performance for Passive and Active Learning over testing set in each round')
plt.grid(True, linestyle='--')

# legend entries come from the `label` of each errorbar call
plt.legend()

plt.show()