In [1]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from scipy.stats import pointbiserialr, ttest_ind

# --- Data -------------------------------------------------------------------
# Build one frame holding every feature column plus the binary diagnosis
# label (0 = malignant, 1 = benign) as a trailing 'target' column.
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)

# --- Group split ------------------------------------------------------------
# Pull the 'mean radius' values for each diagnosis class; .loc with a boolean
# mask selects rows and column in a single step.
malignant = df.loc[df['target'] == 0, 'mean radius']
benign = df.loc[df['target'] == 1, 'mean radius']

# --- Statistics -------------------------------------------------------------
# Point-biserial correlation: the dichotomous variable goes first, the
# continuous one second.
r_pb, p_corr = pointbiserialr(df['target'], df['mean radius'])

# equal_var=False requests Welch's variant, which does not assume the two
# groups share a variance. Malignant is passed first, so a positive t means
# the malignant mean is the larger one.
t_stat, p_t = ttest_ind(malignant, benign, equal_var=False)

# --- Report -----------------------------------------------------------------
# One line per result: correlation, group means (for interpretability),
# then the t-test.
print(
    f"Point-biserial correlation: r = {r_pb:.3f}, p = {p_corr:.3e}",
    f"Mean radius — malignant: {malignant.mean():.2f}, benign: {benign.mean():.2f}",
    f"Welch’s t-test: t = {t_stat:.3f}, p = {p_t:.3e}",
    sep="\n",
)

Point-biserial correlation: r = -0.730, p = 8.466e-96
Mean radius — malignant: 17.46, benign: 12.15
Welch’s t-test: t = 22.209, p = 1.684e-64
