In [118]:
from wisconsin import data
from sklearn.preprocessing import MinMaxScaler
import pandas as pd


## Download data

In [63]:
# Fetch the dataset through the project's `wisconsin.data` helper. `unzip=True`
# is passed; the recorded output reports the contents being extracted into the
# project's data directory.
data.download(unzip=True)

Extracted contents to /home/tom/projects/wisconsin/data


## Extract target and features

In [108]:
# Load the table and discard the 'ID' column in one step, then split what is
# left into the label column ('Diagnosis') and the remaining feature columns.
df = data.load().drop(columns=['ID'])
target = df['Diagnosis']
features = df.drop(columns=['Diagnosis'])


In [148]:
# Print a summary of the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Diagnosis           569 non-null    object 
 1   radius1             569 non-null    float64
 2   texture1            569 non-null    float64
 3   perimeter1          569 non-null    float64
 4   area1               569 non-null    float64
 5   smoothness1         569 non-null    float64
 6   compactness1        569 non-null    float64
 7   concavity1          569 non-null    float64
 8   concave_points1     569 non-null    float64
 9   symmetry1           569 non-null    float64
 10  fractal_dimension1  569 non-null    float64
 11  radius2             569 non-null    float64
 12  texture2            569 non-null    float64
 13  perimeter2          569 non-null    float64
 14  area2               569 non-null    float64
 15  smoothness2         569 non-null    float64
 16  compactn

## Class distribution

In [111]:
import plotly.express as px

# Count the samples per class and lay the result out as a two-column table:
# the class label (named after the target column) and its 'Count'.
class_counts = (
    target.value_counts()
    .rename_axis(target.name)
    .reset_index(name='Count')
)

# Pie chart of the class balance.
pie_kwargs = dict(names=target.name, values='Count', title='Class Distribution')
fig = px.pie(class_counts, **pie_kwargs)
fig.show()

## Feature Separation

### Group by feature type

The dataset contains 10 distinct feature types, each with 3 measurements, for a total of 30 features.

Given the remarkable performance of baseline models on this dataset, it is worth visualising how cleanly each feature separates the two labelled classes.

* First group the features by their suffix (`1`, `2` or `3`).

In [112]:
import re

# Column names are matched as <letters/underscores> followed by a final 1, 2 or 3.
pattern = re.compile(r'([a-zA-Z_]+)([123])$')

# Bucket the feature columns by their numeric suffix; columns that do not
# match the pattern are left out.
feature_map = {}
for column in features:
    parsed = pattern.match(column)
    if parsed is None:
        continue
    suffix = parsed.group(2)
    feature_map.setdefault(suffix, []).append(column)

# Alphabetical order inside each bucket so the three groups line up.
feature_map = {suffix: sorted(columns) for suffix, columns in feature_map.items()}

print(feature_map)

{'1': ['area1', 'compactness1', 'concave_points1', 'concavity1', 'fractal_dimension1', 'perimeter1', 'radius1', 'smoothness1', 'symmetry1', 'texture1'], '2': ['area2', 'compactness2', 'concave_points2', 'concavity2', 'fractal_dimension2', 'perimeter2', 'radius2', 'smoothness2', 'symmetry2', 'texture2'], '3': ['area3', 'compactness3', 'concave_points3', 'concavity3', 'fractal_dimension3', 'perimeter3', 'radius3', 'smoothness3', 'symmetry3', 'texture3']}


### Feature Comparison per group

- scale the feature values to be between 0 and 1
- reshape data
- violin plots for each group

In [115]:
# Min-max scale every feature column on a copy of the frame (`df` itself is
# left untouched) so that all features can share one y-axis.
scaler = MinMaxScaler()
df_scaled = df.copy()
df_scaled[features.columns] = scaler.fit_transform(df_scaled[features.columns])

# One violin figure per suffix group, coloured by diagnosis.
for variant, cols in feature_map.items():
    df_group = df_scaled[[*cols, 'Diagnosis']].copy()

    # Long format: one row per (sample, feature) pair, keeping the label.
    melt_kwargs = dict(
        id_vars='Diagnosis',
        value_vars=cols,
        var_name='Feature',
        value_name='Value',
    )
    df_melted = df_group.melt(**melt_kwargs)

    fig = px.violin(
        df_melted,
        x='Feature', y='Value', color='Diagnosis',
        box=True, points=False,
        title=f'Feature Group {variant} Distributions',
    )

    # Slant the feature names so they do not overlap.
    fig.update_layout(xaxis_tickangle=-45)
    fig.show()


- Clear separation between classes for the majority of features.
- Group 2 has noticeably smaller averages than groups 1 and 3, although the relative pattern looks similar.

### Feature Group Comparisons

- From above plot, Group 2 appears to have noticeably smaller averages than Group 1 and Group 3.
- To visualise this more clearly, group by feature type, scale each group individually and plot the feature distributions.

In [116]:
# Column names are matched as <base name> followed by a final 1, 2 or 3.
pattern = re.compile(r'([a-zA-Z_]+)([123])$')

# Bucket columns by base name (e.g. 'radius' -> radius1/2/3), skipping the
# non-feature columns and anything that does not match the pattern.
non_feature_cols = ('ID', 'Diagnosis')
feature_groups = {}
for column in df.columns:
    if column in non_feature_cols:
        continue
    parsed = pattern.match(column)
    if parsed is not None:
        feature_groups.setdefault(parsed.group(1), []).append(column)

print(feature_groups)

{'radius': ['radius1', 'radius2', 'radius3'], 'texture': ['texture1', 'texture2', 'texture3'], 'perimeter': ['perimeter1', 'perimeter2', 'perimeter3'], 'area': ['area1', 'area2', 'area3'], 'smoothness': ['smoothness1', 'smoothness2', 'smoothness3'], 'compactness': ['compactness1', 'compactness2', 'compactness3'], 'concavity': ['concavity1', 'concavity2', 'concavity3'], 'concave_points': ['concave_points1', 'concave_points2', 'concave_points3'], 'symmetry': ['symmetry1', 'symmetry2', 'symmetry3'], 'fractal_dimension': ['fractal_dimension1', 'fractal_dimension2', 'fractal_dimension3']}


In [159]:
# Min-max scale each feature type with ONE minimum and maximum shared by all
# of its subtype columns. MinMaxScaler rescales every column independently,
# which would stretch each subtype to [0, 1] on its own and hide the
# differences in magnitude between subtypes that this plot is meant to show.
df_group_scaled = df.copy()
for base, cols in feature_groups.items():
    group_min = df[cols].min().min()
    group_range = df[cols].max().max() - group_min
    if group_range == 0:
        # Constant group: avoid dividing by zero and map every value to 0.
        df_group_scaled[cols] = 0.0
    else:
        df_group_scaled[cols] = (df[cols] - group_min) / group_range

# Long format: one row per (feature type, subtype, sample value). Built one
# column at a time and concatenated once instead of appending row by row.
long_frames = [
    pd.DataFrame({
        'Feature': base,
        'Subtype': col[len(base):],  # trailing '1' / '2' / '3'
        'Value': df_group_scaled[col].to_numpy(),
    })
    for base, cols in feature_groups.items()
    for col in sorted(cols)
]
df_long_scaled = pd.concat(long_frames, ignore_index=True)

# One facet per feature type, one box per subtype.
fig = px.box(
    df_long_scaled,
    x='Subtype',
    y='Value',
    facet_col='Feature',
    facet_col_wrap=2,
    points='outliers',
    title='Boxplots of Each Feature Subtype (Scaled Per Feature Group)'
)

fig.update_layout(width=600, height=1000)
# Facet titles default to "Feature=<name>"; keep only the name.
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))

fig.show()

- For most feature types, the interquartile range of Group 2 does not overlap those of Group 1 or Group 3.

### Dimension reduction

Is the data nicely separated when reducing to fewer dimensions?

#### UMAP

In [119]:
import pandas as pd
import umap.umap_ as umap
import plotly.express as px
from sklearn.preprocessing import StandardScaler

# Standardise the features before computing the embedding.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(features)

# 2-D UMAP embedding with a fixed random_state.
umap_params = dict(n_neighbors=15, min_dist=0.1, random_state=42)
reducer = umap.UMAP(**umap_params)
X_umap = reducer.fit_transform(X_scaled)

# Attach the labels positionally (raw values, no index alignment).
umap_df = pd.DataFrame(X_umap, columns=['UMAP1', 'UMAP2']).assign(
    **{target.name: target.values}
)

# Scatter of the embedding, coloured by class.
fig = px.scatter(
    umap_df,
    x='UMAP1', y='UMAP2',
    color=target.name,
    title='UMAP Projection of Feature Space',
    opacity=0.7,
)
fig.update_layout(dict(width=700, height=700))
fig.show()


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



#### TSNE

In [121]:
from sklearn.manifold import TSNE
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import StandardScaler

# Standardise the features before computing the embedding.
X_scaled = StandardScaler().fit_transform(features)

# 2-D t-SNE embedding with a fixed random_state.
tsne_params = dict(n_components=2, perplexity=30, learning_rate=200, random_state=42)
tsne = TSNE(**tsne_params)
X_tsne = tsne.fit_transform(X_scaled)

# Embedding coordinates plus the class label for colouring.
tsne_df = pd.DataFrame(X_tsne, columns=['TSNE1', 'TSNE2']).assign(
    **{target.name: target}
)

scatter_kwargs = dict(
    x='TSNE1',
    y='TSNE2',
    color=target.name,
    title='t-SNE Projection of Feature Space',
    width=700,
    height=700,
)
fig = px.scatter(tsne_df, **scatter_kwargs)
fig.show()


#### PCA

In [122]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd
import plotly.express as px

# Standardise the features before projecting.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(features)

# Project onto the first two principal components and keep the share of
# variance each one explains for the plot title.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
explained_var = pca.explained_variance_ratio_

# Component scores plus the class label for colouring.
pca_df = pd.DataFrame(X_pca, columns=['PC1', 'PC2']).assign(
    **{target.name: target}
)

scatter_kwargs = dict(
    x='PC1',
    y='PC2',
    color=target.name,
    title=f'PCA: PC1 ({explained_var[0]:.2%}) vs PC2 ({explained_var[1]:.2%})',
    width=700,
    height=700,
)
fig = px.scatter(pca_df, **scatter_kwargs)

# Anchor the y-axis scale to the x-axis so both use the same unit length.
fig.update_layout(yaxis=dict(scaleanchor='x'))
fig.show()


In [142]:
import matplotlib.pyplot as plt  # not used in this cell; kept in case later cells rely on it
import numpy as np
import plotly.graph_objects as go  # was missing: `go.Figure()` raised NameError on a fresh kernel

# Fit PCA with all components on the standardised features (`X_scaled` and
# `PCA` come from the PCA cell above) and accumulate the explained-variance
# ratios.
pca_full = PCA().fit(X_scaled)
explained_var_ratio = pca_full.explained_variance_ratio_
cumulative = np.cumsum(explained_var_ratio)
# 1-based component numbers for the x-axis.
components = list(range(1, len(explained_var_ratio) + 1))

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=components,
    y=cumulative,
    mode='lines+markers',
    marker=dict(size=8),
    name='Cumulative Explained Variance'
))

fig.update_layout(
    title='Scree Plot - Cumulative Explained Variance',
    xaxis_title='Number of Principal Components',
    yaxis_title='Cumulative Explained Variance',
    xaxis=dict(dtick=1),  # one tick per component
    width=800,
    height=500
)

fig.show()


### Feature Correlation Heatmap

In [133]:
import plotly.express as px

# Pairwise correlations between features, with the columns put in
# alphabetical order first so that related names sit next to each other.
ordered_columns = sorted(features.columns)
corr_matrix = features[ordered_columns].corr()

# Annotated heatmap on a fixed [-1, 1] colour range.
heatmap_kwargs = dict(
    text_auto='.2f',
    color_continuous_scale='Plasma_r',
    zmin=-1,
    zmax=1,
    title='Feature Correlation Heatmap',
)
fig = px.imshow(corr_matrix, **heatmap_kwargs)

layout_kwargs = dict(
    xaxis_title="Features",
    yaxis_title="Features",
    width=800,
    height=800,
)
fig.update_layout(**layout_kwargs)

fig.show()
