In [102]:
import pandas as pd
import numpy as np


In [103]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
)


In [104]:
# Location of the raw WDBC data file on the UCI archive (read later with pd.read_csv).
data_source = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/"
    "breast-cancer-wisconsin/wdbc.data"
)

In [105]:
# Column names for the header-less data file: two leading columns followed by
# the same ten measurements repeated for each of the 'mean', 'se' and 'max'
# prefixes, in that order.
feature_columns = ['case_id', 'diagnosis'] + [
    f'{stat}_{measure}'
    for stat in ('mean', 'se', 'max')
    for measure in (
        'radius', 'texture', 'perimeter', 'area', 'smoothness',
        'compactness', 'concavity', 'concave_points', 'symmetry',
        'fractal',
    )
]

In [106]:
# Read the raw table. header=None tells pandas the file has no header row, so
# the column names come from feature_columns instead.
cancer_data = pd.read_csv(data_source, header=None, names=feature_columns)

# Encode the diagnosis column as a binary label: 'M' -> 1, 'B' -> 0.
cancer_data['target'] = cancer_data['diagnosis'].map({'M': 1, 'B': 0})

# Series.map leaves NaN for every value that is not a key of the dict. Stop
# here with an explicit message rather than handing NaN labels to the model.
unexpected_labels = cancer_data.loc[cancer_data['target'].isna(), 'diagnosis'].unique()
if len(unexpected_labels) > 0:
    raise ValueError(
        f"Unexpected values in 'diagnosis' column: {unexpected_labels.tolist()}"
    )

In [107]:
# Model inputs: every column except the identifier, the raw diagnosis string
# and the encoded label derived from it.
target = cancer_data['target']
features = cancer_data.drop(columns=['case_id', 'diagnosis', 'target'])

In [108]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
(
    train_features,
    test_features,
    train_target,
    test_target,
) = train_test_split(
    features,
    target,
    random_state=42,
    test_size=0.3,
)

In [109]:
# Learn the scaling statistics from the training split only, then apply that
# same fitted scaler to both splits so no test information leaks into it.
normalizer = StandardScaler().fit(train_features)
normalized_train = normalizer.transform(train_features)
normalized_test = normalizer.transform(test_features)

In [110]:
# Baseline: a decision tree trained on the full standardized feature set.
# fit() returns the estimator itself, so construction and training are chained.
base_model = DecisionTreeClassifier(random_state=42).fit(normalized_train, train_target)
base_pred = base_model.predict(normalized_test)

In [111]:
# Report the baseline scores on the held-out split. The trailing "\n" on the
# last entry keeps the blank line after the report.
print("\n".join([
    "Original features model metrics:",
    f"F1: {f1_score(test_target, base_pred):.2f}",
    f"Precision: {precision_score(test_target, base_pred):.2f}",
    f"Sensitivity: {recall_score(test_target, base_pred):.2f}\n",
]))

Original features model metrics:
F1: 0.92
Precision: 0.90
Sensitivity: 0.95



In [115]:
# Retrain the same decision tree on a PCA projection of the scaled features,
# first with one component and then with two.
for dim in (1, 2):
    # The projection is fitted on the training split only; the test split is
    # mapped through the components learned there.
    reducer = PCA(n_components=dim)
    reduced_train = reducer.fit_transform(normalized_train)
    reduced_test = reducer.transform(normalized_test)

    pca_model = DecisionTreeClassifier(random_state=42).fit(reduced_train, train_target)
    pca_pred = pca_model.predict(reduced_test)

    if dim == 1:
        # One component: report the same three scores as the baseline model.
        print("\n".join([
            "Single component model metrics:",
            f"F1: {f1_score(test_target, pca_pred):.2f}",
            f"Precision: {precision_score(test_target, pca_pred):.2f}",
            f"Recall: {recall_score(test_target, pca_pred):.2f}\n",
        ]))
    else:
        # Two components: derive rates from the confusion matrix. Its rows are
        # the true classes 0 and 1, so the cells unpack as (tn, fp), (fn, tp).
        (tn, fp), (fn, tp) = confusion_matrix(test_target, pca_pred)
        false_positive_rate = fp / (tn + fp)
        true_positive_rate = tp / (fn + tp)

Single component model metrics:
F1: 0.78
Precision: 0.76
Recall: 0.79



In [116]:
        # Emit the two-component confusion summary as one write; the rates
        # are shown as percentages with two decimals.
        print("\n".join([
            "Dual component         confusion analysis:",
            f"True Positives = {tp}, False Alarms = {fp}",
            f"True Negatives = {tn}, Missed Cases = {fn}",
            f"False Alarm Rate = {false_positive_rate*100:.2f}%",
            f"Detection Rate = {true_positive_rate*100:.2f}%",
        ]))

Dual component         confusion analysis:
True Positives = 57, False Alarms = 10
True Negatives = 98, Missed Cases = 6
False Alarm Rate = 9.26%
Detection Rate = 90.48%


In [117]:
# Closing summary. These sentences are fixed text, not computed from the
# metrics above, so re-check them whenever the results change.
print("\n".join([
    "\nKey findings:",
    "Original feature metrics surpass PCA-reduced models across all evaluation parameters",
    "Complete feature set preserves discriminative patterns, enabling superior model performance",
]))


Key findings:
Original feature metrics surpass PCA-reduced models across all evaluation parameters
Complete feature set preserves discriminative patterns, enabling superior model performance
