In [50]:
from ucimlrepo import fetch_ucirepo 
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import numpy as np

# fetch dataset 
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17) 
  
# data (as pandas dataframes) 
X = breast_cancer_wisconsin_diagnostic.data.features 
y = breast_cancer_wisconsin_diagnostic.data.targets.values 

# Reshape the target variable y
y = np.ravel(y)

# Change class values into 1s and 0s for binary classification
# Convert M = 1, B = 0
y_encoded = np.where(y == 'M', 1, 0)
y_encoded

# metadata 
print(breast_cancer_wisconsin_diagnostic.metadata) 
  
# variable information 
print(breast_cancer_wisconsin_diagnostic.variables) 

# Split the dataset into training and testing sets
# 80% of data for training
# 20% of data for testing
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Initialize Support Vector Classifier
svm = SVC(probability=True)

# Fit the model on the training data
svm.fit(X_train, y_train)

# Predict probabilities for the test set
y_pred_proba = svm.predict_proba(X_test)

# Predict class labels for the test set
y_pred = svm.predict(X_test)

# Compare predicted labels with ground truth
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")


{'uci_id': 17, 'name': 'Breast Cancer Wisconsin (Diagnostic)', 'repository_url': 'https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic', 'data_url': 'https://archive.ics.uci.edu/static/public/17/data.csv', 'abstract': 'Diagnostic Wisconsin Breast Cancer Database.', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 569, 'num_features': 30, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['Diagnosis'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1993, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C5DW2B', 'creators': ['William Wolberg', 'Olvi Mangasarian', 'Nick Street', 'W. Street'], 'intro_paper': {'title': 'Nuclear feature extraction for breast tumor diagnosis', 'authors': 'W. Street, W. Wolberg, O. Mangasarian', 'published_in': 'Electronic imaging', 'year': 1993, 'url': 'https://www.semanticscholar.org/paper/53

## K-Nearest Neighbors
Author: Tommy James

In [51]:
# Import needed packages for classification
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Import packages for visualization of results
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from matplotlib.colors import ListedColormap

# Iport packages for evaluation
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [52]:
# Train models with n_neighbors 1 to 15
# Find the maximum accuracy 
# Print accuracy, n_neighbors, and the confusion matrix

max_acc = 0
n = 0
conf_matrix = []

for i in range(1, 16):
    knnClassifier = KNeighborsClassifier(n_neighbors=i)

    # Scale data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)

    # Train model and make predictions for the test set
    knnClassifier.fit(X_train_scaled, np.ravel(y_train))
    y_pred = knnClassifier.predict(scaler.transform(X_test))

    acc = metrics.accuracy_score(y_pred, y_test)

    if acc > max_acc:
        max_acc = acc
        n = i
        tn, fp, fn, tp = metrics.confusion_matrix(y_pred, y_test).ravel()

# Compute metrics
print(f'max accuracy: {max_acc}')
print(f'n_neighbors = {i}')
print(f'true positives: {tp}')
print(f'false positives: {fp}')
print(f'true negatives: {tn}')
print(f'false negatives: {fn}')
print()

max accuracy: 0.9649122807017544
n_neighbors = 15
true positives: 41
false positives: 2
true negatives: 69
false negatives: 2

