In [5]:
#  Import necessary libraries

from sklearn.datasets import load_breast_cancer  # Load the dataset
from sklearn.model_selection import train_test_split  # For splitting the data
import pandas as pd  # For creating and manipulating dataframes
import numpy as np  # For numerical operations

# Set a seed for reproducibility

SEED = 42  # Ensures that the splits are the same every time you run the code

# Load the breast cancer dataset

cancer_data = load_breast_cancer()

# Create a DataFrame with feature names

df = pd.DataFrame(cancer_data.data, columns=cancer_data.feature_names)

# Add the target variable to the DataFrame

df['target'] = cancer_data.target

# Split the data into training (60%), validation (20%), and test (20%) sets

train_df, temp_df = train_test_split(df, test_size=20, random_state=SEED)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=SEED)

# Display the sizes of the resulting dataframes

print("Training set size:", train_df.shape)
print("Validation set size:", val_df.shape)
print("Test set size:", test_df.shape)


# Round 1: Silly Rule
def silly_rule(row):
    return 1 if row.name % 2 == 1 else 0  # If index is odd, classify as malignant

val_df['pred_silly'] = val_df.apply(silly_rule, axis=1)
silly_accuracy = (val_df['pred_silly'] == val_df['target']).mean()
print("Silly Rule Accuracy on Validation Set:", silly_accuracy)

# Round 2: Sensible Rule
def sensible_rule(row):
    return 1 if row['mean radius'] > 15 else 0  # If mean radius > 15, classify as malignant

val_df['pred_sensible'] = val_df.apply(sensible_rule, axis=1)
sensible_accuracy = (val_df['pred_sensible'] == val_df['target']).mean()
print("Sensible Rule Accuracy on Validation Set:", sensible_accuracy)

# Round 3: More Sophisticated Rule
def sophisticated_rule(row):
    return 1 if row['mean radius'] > 14 and row['mean texture'] < 20 else 0  # Using multiple features

val_df['pred_sophisticated'] = val_df.apply(sophisticated_rule, axis=1)
sophisticated_accuracy = (val_df['pred_sophisticated'] == val_df['target']).mean()
print("Sophisticated Rule Accuracy on Validation Set:", sophisticated_accuracy)

# Final Rule Selection (Best performing rule)
final_rule = sophisticated_rule

test_df['pred_final'] = test_df.apply(final_rule, axis=1)
test_accuracy = (test_df['pred_final'] == test_df['target']).mean()
print("Final Rule Accuracy on Test Set:", test_accuracy)


Training set size: (549, 31)
Validation set size: (10, 31)
Test set size: (10, 31)
Silly Rule Accuracy on Validation Set: 0.4
Sensible Rule Accuracy on Validation Set: 0.0
Sophisticated Rule Accuracy on Validation Set: 0.4
Final Rule Accuracy on Test Set: 0.4
