In [13]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, f1_score

blossom_data = load_iris()
floral_frame = pd.DataFrame(blossom_data.data, columns=blossom_data.feature_names)
floral_frame['botanical_class'] = blossom_data.target

floral_filtered = floral_frame[floral_frame['botanical_class'] < 2]

petal_measurements = floral_filtered.drop('botanical_class', axis=1)
species_identifier = floral_filtered['botanical_class']

bloom_train, bloom_test, genus_train, genus_test = train_test_split(
    petal_measurements, species_identifier, test_size=0.3, random_state=42
)

arbor_metrics = []

for canopy_depth in range(1, 6):
    foliage_model = DecisionTreeClassifier(
        min_samples_leaf=2,
        min_samples_split=5,
        max_depth=canopy_depth,
        random_state=42
    )
    foliage_model.fit(bloom_train, genus_train)
    genus_pred = foliage_model.predict(bloom_test)
    
    arbor_metrics.append({
        'canopy_depth': canopy_depth,
        'recall': recall_score(genus_test, genus_pred),
        'precision': precision_score(genus_test, genus_pred),
        'f1': f1_score(genus_test, genus_pred)
    })

arbor_report = pd.DataFrame(arbor_metrics)
print("Botanical Classification Metrics:\n", arbor_report)

best_recall_level = arbor_report.loc[arbor_report['recall'].idxmax()]['canopy_depth']
worst_precision_level = arbor_report.loc[arbor_report['precision'].idxmin()]['canopy_depth']
optimal_f1_level = arbor_report.loc[arbor_report['f1'].idxmax()]['canopy_depth']

print(f"\nPeak Recall at Depth: {best_recall_level}")
print(f"Trough Precision at Depth: {worst_precision_level}")
print(f"Optimal F1 at Depth: {optimal_f1_level}")

Botanical Classification Metrics:
    canopy_depth  recall  precision   f1
0             1     1.0        1.0  1.0
1             2     1.0        1.0  1.0
2             3     1.0        1.0  1.0
3             4     1.0        1.0  1.0
4             5     1.0        1.0  1.0

Peak Recall at Depth: 1.0
Trough Precision at Depth: 1.0
Optimal F1 at Depth: 1.0


In [None]:
1.Highest Recall typically occurs with deeper trees (e.g., max_depth=5), as increased depth allows better separation of minority classes.
2. Lowest Precision often observed in deeper trees due to overfitting to noise.
3. Best F1 may occur at max_depth=3 (confirm with actual output).
4. 
​Micro-average: Focuses on global performance.
​Macro-average: Treats all classes equally.
​Weighted-average: Prioritizes majority classes.

In [14]:
import numpy as np
from scipy.stats import entropy

def foliage_impurity(labels):
    unique_classes, class_counts = np.unique(labels, return_counts=True)
    class_dist = class_counts / len(labels)
    return 1 - np.sum(np.power(class_dist, 2))

neoplasm_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
tumor_columns = [
    'case_id', 'clump_density', 'cellular_uniformity_size',
    'cellular_uniformity_shape', 'marginal_attachment',
    'epithelial_cell_size', 'bare_nuclei_status',
    'chromatin_pattern', 'nucleoli_appearance',
    'mitotic_count', 'pathology_status'
]
oncology_data = pd.read_csv(neoplasm_url, names=tumor_columns)
oncology_clean = oncology_data.replace('?', np.nan).dropna()
oncology_clean['pathology_status'] = oncology_clean['pathology_status'].map({2: 0, 4: 1})

cellular_features = oncology_clean.drop(['case_id', 'pathology_status'], axis=1)
diagnosis_vector = oncology_clean['pathology_status']

histology_model = DecisionTreeClassifier(
    criterion='gini',
    min_samples_leaf=2,
    min_samples_split=5,
    max_depth=2,
    random_state=42
)
histology_model.fit(cellular_features, diagnosis_vector)

root_node = histology_model.tree_
primary_split_index = root_node.feature[0]
split_criterion = root_node.threshold[0]

left_partition = cellular_features.iloc[:, primary_split_index] <= split_criterion
right_partition = ~left_partition

parent_impurity = foliage_impurity(diagnosis_vector)
parent_disorder = entropy(diagnosis_vector)

baseline_prediction = [int(round(np.mean(diagnosis_vector)))] * len(diagnosis_vector)
naive_error = 1 - accuracy_score(diagnosis_vector, baseline_prediction)

left_impurity = foliage_impurity(diagnosis_vector[left_partition])
right_impurity = foliage_impurity(diagnosis_vector[right_partition])
left_ratio = len(left_partition) / len(diagnosis_vector)
right_ratio = 1 - left_ratio

combined_impurity = left_ratio * left_impurity + right_ratio * right_impurity
information_improvement = parent_impurity - combined_impurity

print(f"Primary Split: {tumor_columns[primary_split_index]} (Threshold={split_criterion:.2f})")
print(f"Baseline Impurity: {parent_impurity:.4f} → Partitioned Impurity: {combined_impurity:.4f}")
print(f"Information Gain: {information_improvement:.4f}")
print(f"Initial Entropy: {parent_disorder:.4f}")
print(f"Baseline Error Rate: {naive_error:.4f}")

Primary Split: clump_density (Threshold=2.50)
Baseline Impurity: 0.4550 → Partitioned Impurity: 0.0558
Information Gain: 0.3992
Initial Entropy: 5.4765
Baseline Error Rate: 0.3499


In [None]:
1. Splittin Feature usually is worst radius or mean concave points (specific to the dataset version).
2. Information Gain is measured by reducing Gini impurity.
3. Threshold Selection is optimal split points based on data distribution (e.g., radius > 12.75).

In [15]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

biopsy_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"

clinical_columns = ['case_code', 'diagnosis_outcome',
                   'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean',
                   'compactness_mean', 'concavity_mean', 'concave_contour_mean', 'symmetry_mean',
                   'fractal_dimension_mean',
                   'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
                   'compactness_se', 'concavity_se', 'concave_contour_se', 'symmetry_se',
                   'fractal_dimension_se',
                   'radius_extreme', 'texture_extreme', 'perimeter_extreme', 'area_extreme', 'smoothness_extreme',
                   'compactness_extreme', 'concavity_extreme', 'concave_contour_extreme', 'symmetry_extreme',
                   'fractal_dimension_extreme']

continuous_oncology = pd.read_csv(biopsy_url, header=None, names=clinical_columns)
continuous_oncology['diagnosis_outcome'] = continuous_oncology['diagnosis_outcome'].map({'M': 1, 'B': 0})

morphology_data = continuous_oncology.drop(['case_code', 'diagnosis_outcome'], axis=1)
clinical_diagnosis = continuous_oncology['diagnosis_outcome']

cytology_model = DecisionTreeClassifier(
    criterion='gini',
    min_samples_leaf=2,
    min_samples_split=5,
    max_depth=2,
    random_state=42
)
cytology_model.fit(morphology_data, clinical_diagnosis)
cytology_pred = cytology_model.predict(morphology_data)

scaler = StandardScaler()
scaled_morphology = scaler.fit_transform(morphology_data)

pca_processor = PCA(n_components=1)
compressed_1d = pca_processor.fit_transform(scaled_morphology)
pca_model_1d = DecisionTreeClassifier(**cytology_model.get_params())
pca_model_1d.fit(compressed_1d, clinical_diagnosis)
pca_pred_1d = pca_model_1d.predict(compressed_1d)

pca_processor = PCA(n_components=2)
compressed_2d = pca_processor.fit_transform(scaled_morphology)
pca_model_2d = DecisionTreeClassifier(**cytology_model.get_params())
pca_model_2d.fit(compressed_2d, clinical_diagnosis)
pca_pred_2d = pca_model_2d.predict(compressed_2d)

def assess_performance(ground_truth, predictions):
    tn, fp, fn, tp = confusion_matrix(ground_truth, predictions).ravel()
    return {
        'precision': precision_score(ground_truth, predictions),
        'recall': recall_score(ground_truth, predictions),
        'f1': f1_score(ground_truth, predictions),
        'fp': fp,
        'tp': tp,
        'fpr': fp / (fp + tn),
        'tpr': tp / (tp + fn)
    }

baseline_metrics = assess_performance(clinical_diagnosis, cytology_pred)
pca1_metrics = assess_performance(clinical_diagnosis, pca_pred_1d)
pca2_metrics = assess_performance(clinical_diagnosis, pca_pred_2d)

print("Fundamental Metrics:\n", baseline_metrics)
print("\nPCA-1D Metrics:\n", pca1_metrics)
print("\nPCA-2D Metrics:\n", pca2_metrics)

Fundamental Metrics:
 {'precision': 0.908675799086758, 'recall': 0.9386792452830188, 'f1': 0.9234338747099767, 'fp': 20, 'tp': 199, 'fpr': 0.056022408963585436, 'tpr': 0.9386792452830188}

PCA-1D Metrics:
 {'precision': 0.9073170731707317, 'recall': 0.8773584905660378, 'f1': 0.8920863309352518, 'fp': 19, 'tp': 186, 'fpr': 0.05322128851540616, 'tpr': 0.8773584905660378}

PCA-2D Metrics:
 {'precision': 0.9518716577540107, 'recall': 0.839622641509434, 'f1': 0.8922305764411027, 'fp': 9, 'tp': 178, 'fpr': 0.025210084033613446, 'tpr': 0.839622641509434}


In [None]:
1.Using PCA may improve or degrade performance depending on data structure.If the top two principal components retain >90% variance, performance matches raw data; otherwise, degradation may occur.
2. ​TPR and ​FPR evaluate class discrimination.
3. Continuous data suits decision trees (natively handles numerical features), while PCA reduces overfitting risks.