<a href="https://colab.research.google.com/github/tousifo/ml_notebooks/blob/main/qml_analysis_breast_cancer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
uciml_breast_cancer_wisconsin_data_path = kagglehub.dataset_download('uciml/breast-cancer-wisconsin-data')

print('Data source import complete.')


Data source import complete.


In [2]:
# Missing-Value Imputation, Scaling & Feature Selection

import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif

# 1. Load the data
df = pd.read_csv('/kaggle/input/breast-cancer-wisconsin-data/data.csv')

# 2. Drop unneeded columns
df = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')

# 3. Encode target
df['diagnosis'] = df['diagnosis'].map({'B': 0, 'M': 1})

# 4. Split into features & label
X_raw = df.drop(columns='diagnosis')
y = df['diagnosis']

# 5. Impute missing values with the median
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X_raw)

# 6. Standard-scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# 7. Select top 15 features via ANOVA-F
selector = SelectKBest(score_func=f_classif, k=15)
X_selected = selector.fit_transform(X_scaled, y)

# 8. (Optional) Get the names of the selected features
selected_features = X_raw.columns[selector.get_support()]
print("Selected top 15 features:")
for feat in selected_features:
    print("-", feat)

# 9. (Optional) Create a DataFrame of the trimmed feature set
X_selected_df = pd.DataFrame(X_selected, columns=selected_features)


Selected top 15 features:
- radius_mean
- perimeter_mean
- area_mean
- compactness_mean
- concavity_mean
- concave points_mean
- radius_se
- perimeter_se
- area_se
- radius_worst
- perimeter_worst
- area_worst
- compactness_worst
- concavity_worst
- concave points_worst


In [3]:
# Step 3: Enforcing the “Small-N” Regime

from sklearn.model_selection import StratifiedShuffleSplit

# 3.1 Parameters
N_max = 500
random_state = 42

# 3.2 Stratified subsampling (on the SELECTED feature set)
#    X_selected_df: DataFrame of shape (569, 15) from Step 2
#    y          : Series of labels (0/1)

if X_selected_df.shape[0] > N_max:
    sss = StratifiedShuffleSplit(
        n_splits=1,
        train_size=N_max,
        random_state=random_state
    )
    for train_idx, _ in sss.split(X_selected_df, y):
        X_sub = X_selected_df.iloc[train_idx].reset_index(drop=True)
        y_sub = y.iloc[train_idx].reset_index(drop=True)
else:
    # Dataset already ≤ N_max
    X_sub = X_selected_df.copy().reset_index(drop=True)
    y_sub = y.copy().reset_index(drop=True)

# 3.3 Quick sanity check
print(f"Subsampled dataset size: {X_sub.shape[0]} samples (max allowed = {N_max})")
print("Class proportions (benign=0 / malignant=1):")
print(y_sub.value_counts(normalize=True))


Subsampled dataset size: 500 samples (max allowed = 500)
Class proportions (benign=0 / malignant=1):
diagnosis
0    0.628
1    0.372
Name: proportion, dtype: float64


In [4]:
# Step 4: Class Balancing & Stratified Splitting

import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Assume X_sub (DataFrame) and y_sub (Series) are from Step 3

# 4.3.1 First split off the test set (20% of N_max)
X_temp, X_test, y_temp, y_test = train_test_split(
    X_sub, y_sub,
    test_size=0.20,
    stratify=y_sub,
    random_state=42
)

# 4.3.2 Split the remaining into train (60% total) and val (20% total):
# Remaining is 80% of original, so val = 0.25 * 0.80 = 0.20 of original
X_train_temp, X_val, y_train_temp, y_val = train_test_split(
    X_temp, y_temp,
    test_size=0.25,
    stratify=y_temp,
    random_state=42
)

print("Before SMOTE:")
print("  Train size:", X_train_temp.shape[0])
print("  Class distribution:\n", y_train_temp.value_counts())

# 4.1 Apply SMOTE to the training split only
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train_temp, y_train_temp)

print("\nAfter SMOTE:")
print("  Train size:", X_train.shape[0])
print("  Class distribution:\n", y_train.value_counts())

# 4.3.3 Check validation and test sizes & distributions
print("\nValidation size:", X_val.shape[0], "Class distribution:\n", y_val.value_counts())
print("Test size:",       X_test.shape[0], "Class distribution:\n", y_test.value_counts())


Before SMOTE:
  Train size: 300
  Class distribution:
 diagnosis
0    188
1    112
Name: count, dtype: int64

After SMOTE:
  Train size: 376
  Class distribution:
 diagnosis
1    188
0    188
Name: count, dtype: int64

Validation size: 100 Class distribution:
 diagnosis
0    63
1    37
Name: count, dtype: int64
Test size: 100 Class distribution:
 diagnosis
0    63
1    37
Name: count, dtype: int64


In [5]:
# ── MONKEY-PATCH ALL ESTIMATORS TO HAVE VALID __sklearn_tags__ ──
from sklearn.base import BaseEstimator
from types import SimpleNamespace

def _sklearn_tags(self):
    t = SimpleNamespace()
    t.estimator_type  = "classifier"
    t.classifier_tags = SimpleNamespace()
    t.input_tags      = SimpleNamespace(pairwise=False, sparse=False)
    t.target_tags     = SimpleNamespace(required=True)
    t.requires_fit    = True
    return t

BaseEstimator.__sklearn_tags__ = _sklearn_tags

In [6]:
# Remove any existing Qiskit components
!pip uninstall -y qiskit qiskit-aer qiskit-terra qiskit-ibmq-provider qiskit-machine-learning

[0m

In [1]:
# Step 1: Completely clean up existing installations
!pip uninstall -y -q qiskit-terra qiskit-aer qiskit-ibmq-provider qiskit qiskit-machine-learning

[0m

In [6]:
# Step 2: Install specific compatible versions
!pip install -q "qiskit-terra==0.23.3" "qiskit-aer==0.11.2" "qiskit-machine-learning==0.6.0"

In [8]:
# After restart, run this cell:
import numpy as np
import torch
from sklearn.svm import SVC as SVC_SK
from sklearn.metrics import roc_auc_score
from torch.utils.data import DataLoader, TensorDataset

# Reproducibility
np.random.seed(42)
torch.manual_seed(42)

# Qiskit imports
from qiskit import Aer
# Removed import of QuantumInstance from qiskit.utils
from qiskit.primitives import Sampler, Estimator # Added imports for Sampler and Estimator
from qiskit.circuit.library import ZZFeatureMap, RealAmplitudes
from qiskit_machine_learning.kernels import QuantumKernel
from qiskit_machine_learning.algorithms import VQC
from qiskit_machine_learning.neural_networks import EstimatorQNN
from qiskit_machine_learning.connectors import TorchConnector
from qiskit.algorithms.optimizers import SPSA

# Initialize backend - USING OLD-STYLE AER IMPORT
# Removed initialization of QuantumInstance
# Initialized Sampler and Estimator without backend argument based on error
sampler = Sampler()
estimator = Estimator()


print("All imports successful! Sampler and Estimator initialized.")

All imports successful! Sampler and Estimator initialized.


In [10]:
# Re-run data loading and preprocessing steps
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# 1. Load the data
df = pd.read_csv('/kaggle/input/breast-cancer-wisconsin-data/data.csv')

# 2. Drop unneeded columns
df = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')

# 3. Encode target
df['diagnosis'] = df['diagnosis'].map({'B': 0, 'M': 1})

# 4. Split into features & label
X_raw = df.drop(columns='diagnosis')
y = df['diagnosis']

# 5. Impute missing values with the median
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X_raw)

# 6. Standard-scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# 7. Select top 15 features via ANOVA-F
selector = SelectKBest(score_func=f_classif, k=15)
X_selected = selector.fit_transform(X_scaled, y)

# 8. Create a DataFrame of the trimmed feature set
selected_features = X_raw.columns[selector.get_support()]
X_selected_df = pd.DataFrame(X_selected, columns=selected_features)

# 9. Enforcing the “Small-N” Regime
N_max = 500
random_state = 42

if X_selected_df.shape[0] > N_max:
    sss = StratifiedShuffleSplit(
        n_splits=1,
        train_size=N_max,
        random_state=random_state
    )
    for train_idx, _ in sss.split(X_selected_df, y):
        X_sub = X_selected_df.iloc[train_idx].reset_index(drop=True)
        y_sub = y.iloc[train_idx].reset_index(drop=True)
else:
    X_sub = X_selected_df.copy().reset_index(drop=True)
    y_sub = y.copy().reset_index(drop=True)

# 10. Class Balancing & Stratified Splitting
# 10.1 First split off the test set (20% of N_max)
X_temp, X_test, y_temp, y_test = train_test_split(
    X_sub, y_sub,
    test_size=0.20,
    stratify=y_sub,
    random_state=42
)

# 10.2 Split the remaining into train (60% total) and val (20% total):
X_train_temp, X_val, y_train_temp, y_val = train_test_split(
    X_temp, y_temp,
    test_size=0.25,
    stratify=y_temp,
    random_state=42
)

# 10.3 Apply SMOTE to the training split only
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train_temp, y_train_temp)

print("Data loading and preprocessing complete. X_train, y_train, X_test, y_test, X_val, y_val are defined.")


Data loading and preprocessing complete. X_train, y_train, X_test, y_test, X_val, y_val are defined.


In [13]:
# After restart, run this cell:
import numpy as np
import torch
from sklearn.svm import SVC as SVC_SK
from sklearn.metrics import roc_auc_score
from torch.utils.data import DataLoader, TensorDataset

# Reproducibility
np.random.seed(42)
torch.manual_seed(42)

# Qiskit imports
from qiskit import Aer
from qiskit.utils import QuantumInstance # Reverted to the import that defines QuantumInstance
from qiskit.circuit.library import ZZFeatureMap, RealAmplitudes
from qiskit_machine_learning.kernels import QuantumKernel
from qiskit_machine_learning.algorithms import VQC
from qiskit_machine_learning.neural_networks import EstimatorQNN
from qiskit_machine_learning.connectors import TorchConnector
from qiskit.algorithms.optimizers import SPSA

# Initialize backend - USING OLD-STYLE AER IMPORT
backend = Aer.get_backend('aer_simulator_density_matrix')
qi = QuantumInstance(
    backend=backend,
    shots=1000,
    seed_simulator=42,
    seed_transpiler=42
)

print("All imports successful! Backend and QuantumInstance initialized.")

All imports successful! Backend and QuantumInstance initialized.


In [26]:
# --- VQC ---
vqc_fm  = ZZFeatureMap(feature_dimension=X_train.shape[1], reps=2, entanglement='circular')
vqc_ans = RealAmplitudes(num_qubits=X_train.shape[1], reps=2, entanglement='circular')

# Initialize VQC with QuantumInstance
vqc     = VQC(
    feature_map = vqc_fm,
    ansatz      = vqc_ans,
    optimizer   = SPSA(maxiter=100),
    quantum_instance   = qi # Using the initialized QuantumInstance
)

# Train VQC
print("Training VQC...")
vqc.fit(X_train, y_train)
print("VQC training complete.")

# Evaluate VQC
y_pred_proba_vqc = vqc.predict_proba(X_test)[:, 1]
y_pred_vqc = (y_pred_proba_vqc > 0.5).astype(int) # Convert probabilities to class labels

print("\nVQC Evaluation on Test Set:")
print("ROC AUC Score:", roc_auc_score(y_test, y_pred_proba_vqc))
print("F1 Score:", f1_score(y_test, y_pred_vqc))
print("Accuracy Score:", accuracy_score(y_test, y_pred_vqc))
print("Precision Score:", precision_score(y_test, y_pred_vqc))

TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'

In [3]:
# After restart, run this cell:
import numpy as np
import torch
from sklearn.svm import SVC as SVC_SK
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_score

# Reproducibility
np.random.seed(42)
torch.manual_seed(42)

# Qiskit imports
# Removed import of Aer and QuantumInstance
from qiskit.primitives import Sampler, Estimator # Added imports for Sampler and Estimator
from qiskit.circuit.library import ZZFeatureMap, RealAmplitudes
from qiskit_machine_learning.kernels import QuantumKernel
from qiskit_machine_learning.algorithms import VQC
from qiskit_machine_learning.neural_networks import EstimatorQNN
from qiskit_machine_learning.connectors import TorchConnector
from qiskit.algorithms.optimizers import SPSA

# Initialize primitives
sampler = Sampler()
estimator = Estimator()


print("All imports successful! Sampler and Estimator initialized.")

ImportError: cannot import name 'QuantumKernel' from 'qiskit_machine_learning.kernels' (/usr/local/lib/python3.11/dist-packages/qiskit_machine_learning/kernels/__init__.py)

In [1]:
# Step 1: Uninstall current qiskit-machine-learning
!pip uninstall -y -q qiskit-machine-learning

In [2]:
# Step 2: Install a recent compatible version of qiskit-machine-learning
!pip install -q qiskit-machine-learning

**IMPORTANT:** After the installations in the previous cell are complete, **please restart the runtime manually** (Runtime -> Restart runtime) before proceeding.