<a href="https://colab.research.google.com/github/tousifo/ml_notebooks/blob/main/qml_analysis_vs_ml%2C_dl.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# --- Snippet 1 (Final Working Version) ---
# Purpose: install conda in Colab, create the 'qml_project' environment, and
# run a small setup script with that environment's interpreter. The script is
# written to run_setup.py and launched through a shell command that activates
# the environment first.

# Step 1: Install Conda for Colab
print("--- Step 1: Installing Conda for Colab ---")
!pip install -q condacolab
import condacolab
# NOTE(review): condacolab.install() is understood to restart the Colab kernel
# the first time it installs conda, in which case the rest of this cell would
# need to be re-run — confirm against the condacolab documentation.
condacolab.install()

# Step 2: Create a clean environment, now explicitly including qiskit-algorithms.
print("\n--- Step 2: Creating 'qml_project' conda environment ---")
!conda create -n qml_project -y python=3.11 qiskit-machine-learning qiskit-algorithms tensorflow xgboost pandas scikit-learn

# Step 3: Activate the environment and run our setup script.
print("\n--- Step 3: Activating environment and running setup script ---")
# The embedded script seeds NumPy, TensorFlow and Qiskit's algorithm_globals,
# prints the library versions, and extracts /content/leucumia.zip into
# /content/leukemia_data/, reporting whether a 'Segmented' subdirectory exists.
# Backslashes are doubled inside the string so that the written file contains
# a single-backslash escape (e.g. "\n").
setup_script = """
import os
import zipfile
import numpy as np
import pandas as pd
import tensorflow as tf
import qiskit
from qiskit_algorithms.utils import algorithm_globals

print("\\n--- Environment Verification ---")
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
algorithm_globals.random_seed = SEED

print(f"Global random seed set to {SEED}.")
print(f"Python Version: {os.sys.version.split()[0]}")
print(f"TensorFlow Version: {tf.__version__}")
print(f"Qiskit Version: {qiskit.__version__}")
print(f"Pandas Version: {pd.__version__}")
print(f"Numpy Version: {np.__version__}")

zip_path = '/content/leucumia.zip'
extract_path = '/content/leukemia_data/'

print("\\n--- Preparing Image Data ---")
if not os.path.exists(extract_path):
    os.makedirs(extract_path)
    print(f"Created directory: {extract_path}")
else:
    print(f"Directory already exists: {extract_path}")

try:
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
    print(f"Successfully unzipped '{zip_path}' to '{extract_path}'")
    segmented_path = os.path.join(extract_path, 'Segmented')
    if os.path.exists(segmented_path):
        print(f"Verified: 'Segmented' directory found.")
    else:
        print(f"WARNING: 'Segmented' directory not found.")
except FileNotFoundError:
    print(f"ERROR: File '{zip_path}' not found. Please upload it.")
except Exception as e:
    print(f"An error occurred: {e}")
"""

# Persist the script to the current working directory so the shell can run it.
with open("run_setup.py", "w") as f:
    f.write(setup_script)

# Execute the script using the new environment
# (conda.sh must be sourced first so that `conda activate` is available in
# the non-interactive shell spawned by `!`).
!source /usr/local/etc/profile.d/conda.sh && conda activate qml_project && python run_setup.py

--- Step 1: Installing Conda for Colab ---
✨🍰✨ Everything looks OK!

--- Step 2: Creating 'qml_project' conda environment ---
Channels:
 - conda-forge
Platform: linux-64
Collecting package metadata (repodata.json): - \ | / - \ done
Solving environment: / - \ | done


    current version: 24.11.2
    latest version: 25.5.1

Please update conda by running

    $ conda update -n base -c conda-forge conda



## Package Plan ##

  environment location: /usr/local/envs/qml_project

  added / updated specs:
    - pandas
    - python=3.11
    - qiskit-algorithms
    - qiskit-machine-learning
    - scikit-learn
    - tensorflow
    - xgboost


The following NEW packages will be INSTALLED:

  _libgcc_mutex      conda-forge/linux-64::_libgcc_mutex-0.1-conda_forge 
  _openmp_mutex      conda-forge/linux-64::_openmp_mutex-4.5-2_gnu 
  _python_abi3_supp~ conda-forge/noarch::_python_abi3_support-1.0-hd8ed1ab_2 
  _x86_64-microarch~ conda-forge/noarch::_x86_64-microarch-level-3-2_b

In [None]:
# --- Snippet 2: Data Loading and Initial Inspection ---

# We will use the same method as before to run our code inside the
# 'qml_project' conda environment.

# Define the Python code for this snippet
data_loading_script = """
import os
import pandas as pd
import numpy as np

print("--- Snippet 2: Data Loading and Initial Inspection ---")

# Define file paths
BASE_PATH = '/content/'
TABULAR_PATHS = {
    "Breast_Cancer": os.path.join(BASE_PATH, 'breast.csv'),
    "Sonar": os.path.join(BASE_PATH, 'sonar.csv'),
    "Lung_Cancer": os.path.join(BASE_PATH, 'lung_cancer.csv'),
    "SCD": os.path.join(BASE_PATH, 'Sudden Cardiac Death Holter Database.csv')
}
IMAGE_PATH = '/content/leukemia_data/Segmented/'

# --- 1. Load and Inspect Tabular Datasets ---
print("\\n--- Loading Tabular Datasets ---")
loaded_data = {}
for name, path in TABULAR_PATHS.items():
    try:
        df = pd.read_csv(path)
        loaded_data[name] = df
        print(f"\\n--- {name} ---")
        print(f"Shape: {df.shape}")
        # Check for missing values
        missing_vals = df.isnull().sum().sum()
        print(f"Total Missing Values: {missing_vals}")
        if missing_vals > 0:
            print("Preview of columns with most missing values:")
            print(df.isnull().sum().sort_values(ascending=False).head(3))
        print("Data Preview:")
        print(df.head(3))
    except FileNotFoundError:
        print(f"\\n--- {name} ---")
        print(f"ERROR: File not found at {path}")
    except Exception as e:
        print(f"\\n--- {name} ---")
        print(f"ERROR loading file: {e}")


# --- 2. Load and Inspect Image Dataset Paths ---
print("\\n\\n--- Loading Image Dataset Paths ---")
image_data = []
try:
    for subdir, dirs, files in os.walk(IMAGE_PATH):
        for file in files:
            if file.lower().endswith(('.png', '.jpg', '.jpeg')):
                # Determine label from subdirectory name
                label = "Benign"
                if "malignant" in subdir.lower():
                    label = "Malignant"

                image_data.append({
                    "path": os.path.join(subdir, file),
                    "label": label
                })

    if not image_data:
         print("ERROR: No images found in the specified directory.")
    else:
        df_images = pd.DataFrame(image_data)
        loaded_data["Leukemia"] = df_images
        print(f"Found {len(df_images)} images.")
        print(f"Shape of image DataFrame: {df_images.shape}")
        print("Label Distribution:")
        print(df_images['label'].value_counts())
        print("Data Preview:")
        print(df_images.head(3))
except FileNotFoundError:
    print(f"ERROR: Image directory not found at {IMAGE_PATH}")
except Exception as e:
    print(f"ERROR processing images: {e}")

"""

# Write the script to a file
with open("run_data_loading.py", "w") as f:
    f.write(data_loading_script)

# Execute the script using the conda environment's python
!source /usr/local/etc/profile.d/conda.sh && conda activate qml_project && python run_data_loading.py

--- Snippet 2: Data Loading and Initial Inspection ---

--- Loading Tabular Datasets ---

--- Breast_Cancer ---
Shape: (569, 33)
Total Missing Values: 569
Preview of columns with most missing values:
Unnamed: 32    569
id               0
diagnosis        0
dtype: int64
Data Preview:
         id diagnosis  ...  fractal_dimension_worst  Unnamed: 32
0    842302         M  ...                  0.11890          NaN
1    842517         M  ...                  0.08902          NaN
2  84300903         M  ...                  0.08758          NaN

[3 rows x 33 columns]

--- Sonar ---
Shape: (208, 61)
Total Missing Values: 0
Data Preview:
   Freq_1  Freq_2  Freq_3  Freq_4  ...  Freq_58  Freq_59  Freq_60  Label
0  0.0200  0.0371  0.0428  0.0207  ...   0.0084   0.0090   0.0032      R
1  0.0453  0.0523  0.0843  0.0689  ...   0.0049   0.0052   0.0044      R
2  0.0262  0.0582  0.1099  0.1083  ...   0.0164   0.0095   0.0078      R

[3 rows x 61 columns]

--- Lung_Cancer ---
Shape: (59, 7)
Total Missin

In [None]:
# --- Snippet 3 (Final Corrected Version): Preprocessing Pipeline Definition ---
# Purpose: define (inside an embedded script) the tabular preprocessing
# pipeline and the MobileNetV2-based image-embedding helpers, then run that
# script with the 'qml_project' interpreter as a smoke test. Nothing is fitted
# here; the script only constructs the objects and prints them.
#
# NOTE(review): the script's SelectKBest steps use k=15. The Lung_Cancer table
# printed by Snippet 2 has only 7 columns, so fitting this pipeline on it is
# expected to fail unless k is reduced — confirm before the full experiment.

# Define the Python code for this snippet.
# Backslashes (and the inner docstring quotes) are escaped so that the written
# file contains the intended single-backslash escapes and triple quotes.
pipeline_definition_script = """
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from tqdm.auto import tqdm # Use tqdm.auto for better notebook integration
import tensorflow as tf
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GlobalAveragePooling2D, Input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input

print("--- Snippet 3: Preprocessing Pipeline Definition ---")

# --- 1. Define Tabular Preprocessing Pipeline ---
# This pipeline will handle missing values, scale the data, and select the best 15 features.
tabular_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('selector', SelectKBest(score_func=f_classif, k=15))
])
print("\\n1. Tabular preprocessing pipeline defined successfully.")
print(tabular_pipeline)


# --- 2. Define Image Preprocessing and Embedding Logic ---
# This is more complex than a simple pipeline, so we define it as a reusable function.
# This function will take a list of image paths and return a 15-dimensional feature embedding.

def create_image_embedding_pipeline():
    # Load the pre-trained MobileNetV2 model without its final classification layer
    base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(128, 128, 3))
    # Add a pooling layer to get a single feature vector per image
    x = GlobalAveragePooling2D()(base_model.output)
    embedding_model = Model(inputs=base_model.input, outputs=x)

    # Define the dimensionality reduction pipeline
    reduction_pipeline = Pipeline([
        ('pca', PCA(n_components=30)),
        ('selector', SelectKBest(score_func=f_classif, k=15))
    ])

    return embedding_model, reduction_pipeline

def process_images_to_embeddings(image_paths, embedding_model):
    \"\"\"
    Processes a list of images into embeddings using a pre-trained model.
    Includes a progress bar using tqdm.
    \"\"\"
    embedded_features = []
    # Use tqdm for a progress bar
    for path in tqdm(image_paths, desc="Generating Image Embeddings"):
        # Load and preprocess the image for MobileNetV2
        img = load_img(path, target_size=(128, 128))
        img_array = img_to_array(img)
        img_array = np.expand_dims(img_array, axis=0)
        img_array = preprocess_input(img_array)

        # Get the embedding
        embedding = embedding_model.predict(img_array, verbose=0)
        embedded_features.append(embedding.flatten())

    return np.array(embedded_features)


print("\\n2. Image embedding function and dimensionality reduction pipeline defined successfully.")

# Example of creating the image processing components
# This just demonstrates that the functions work
try:
    image_embedder, image_reducer = create_image_embedding_pipeline()
    print("   - MobileNetV2 embedding model created.")
    print("   - PCA -> SelectKBest reduction pipeline created.")
    print(image_reducer)
except Exception as e:
    print(f"ERROR creating image pipelines: {e}")

print("\\nPreprocessing architecture is defined and ready for use in model training.")

"""

# Write the script to a file
with open("run_pipeline_definition.py", "w") as f:
    f.write(pipeline_definition_script)

# Re-run the environment creation, this time including tqdm
# And then execute the script.
# NOTE: It's often faster to just add tqdm to the existing environment,
# but for maximum stability, we will recreate it.
print("--- Recreating 'qml_project' conda environment with tqdm ---")
!conda create -n qml_project -y python=3.11 qiskit-machine-learning qiskit-algorithms tensorflow xgboost pandas scikit-learn tqdm

# Run the smoke-test script with the freshly created environment's interpreter.
print("\n--- Executing script in the new environment ---")
!source /usr/local/etc/profile.d/conda.sh && conda activate qml_project && python run_pipeline_definition.py

--- Recreating 'qml_project' conda environment with tqdm ---
Channels:
 - conda-forge
Platform: linux-64
Collecting package metadata (repodata.json): - \ | / - \ done
Solving environment: / - \ done


    current version: 24.11.2
    latest version: 25.5.1

Please update conda by running

    $ conda update -n base -c conda-forge conda



## Package Plan ##

  environment location: /usr/local/envs/qml_project

  added / updated specs:
    - pandas
    - python=3.11
    - qiskit-algorithms
    - qiskit-machine-learning
    - scikit-learn
    - tensorflow
    - tqdm
    - xgboost


The following NEW packages will be INSTALLED:

  _libgcc_mutex      conda-forge/linux-64::_libgcc_mutex-0.1-conda_forge 
  _openmp_mutex      conda-forge/linux-64::_openmp_mutex-4.5-2_gnu 
  _python_abi3_supp~ conda-forge/noarch::_python_abi3_support-1.0-hd8ed1ab_2 
  _x86_64-microarch~ conda-forge/noarch::_x86_64-microarch-level-3-2_broadwell 
  absl-py            conda-forge/noarch::absl-py

In [None]:
# --- Snippet 4 (Corrected): Model & Experiment Function Definition ---
# Purpose: define (inside an embedded script) the candidate models and their
# hyperparameter grids, then run the script with the 'qml_project' interpreter
# to check that every import resolves. No model is trained in this cell.
#
# What the script builds:
#   - `models_and_params`: grids for LogisticRegression, RBF SVM, RandomForest,
#     XGBoost and a SciKeras-wrapped MLP. The QKSVM and VQC entries are
#     placeholders (model=None, empty grid).
#   - `create_mlp`: the Keras model factory used by the MLP entry.
#   - `placeholder_experiment_runner`: a stub that only prints and returns True.
# The grid keys are prefixed with 'classifier__', which presumably targets a
# Pipeline step named 'classifier' assembled in a later snippet — verify there.

# Define the Python code for this snippet.
# Backslashes are doubled inside the string so that the written file contains
# a single-backslash escape (e.g. "\n").
experiment_logic_script = """
import pandas as pd
import numpy as np
import os
import time
from tqdm.auto import tqdm

# Scikit-learn imports
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import balanced_accuracy_score, roc_auc_score

# Model imports
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from scikeras.wrappers import KerasClassifier # Corrected import

# Qiskit imports
from qiskit.circuit.library import ZZFeatureMap, TwoLocal
from qiskit.primitives import Sampler
from qiskit_algorithms.optimizers import SPSA
from qiskit_machine_learning.algorithms.classifiers import VQC, QSVC

print("--- Snippet 4: Model & Experiment Function Definition ---")
SEED = 42

# --- 1. Define Model & Hyperparameter Grids ---
print("1. Defining model and hyperparameter grids...")

# A. Classical Models
models_and_params = {
    'LogisticRegression': {
        'model': LogisticRegression(random_state=SEED, max_iter=1000, solver='liblinear'),
        'params': {
            'classifier__C': [0.1, 1, 10],
            'classifier__penalty': ['l1', 'l2']
        }
    },
    'RBF_SVM': {
        'model': SVC(random_state=SEED, probability=True),
        'params': {
            'classifier__C': [0.1, 1, 10],
            'classifier__gamma': ['scale']
        }
    },
    'RandomForest': {
        'model': RandomForestClassifier(random_state=SEED),
        'params': {
            'classifier__n_estimators': [50, 100],
            'classifier__max_depth': [3, 5]
        }
    },
    'XGBoost': {
        'model': XGBClassifier(random_state=SEED, eval_metric='logloss'),
        'params': {
            'classifier__n_estimators': [50, 100],
            'classifier__learning_rate': [0.05, 0.1]
        }
    }
}

# B. Deep Learning Model (MLP)
def create_mlp(meta, dropout_rate=0.2, l2_reg=0.001):
    # Scikeras provides metadata like n_features_in_
    n_features = meta["n_features_in_"]
    model = Sequential([
        Dense(64, activation='relu', input_shape=(n_features,), kernel_regularizer=tf.keras.regularizers.l2(l2_reg)),
        Dropout(dropout_rate),
        Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(l2_reg)),
        Dense(1, activation='sigmoid')
    ])
    return model

# Updated KerasClassifier wrapper using SciKeras
models_and_params['MLP'] = {
    'model': KerasClassifier(
        model=create_mlp,
        loss="binary_crossentropy",
        optimizer="adam",
        verbose=0
    ),
    'params': {
        'classifier__batch_size': [16, 32],
        'classifier__epochs': [50, 100],
        'classifier__model__dropout_rate': [0.2, 0.3]
    }
}

# C. Quantum Models (placeholders)
models_and_params['QKSVM'] = { 'model': None, 'params': {} }
models_and_params['VQC'] = { 'model': None, 'params': {} }

print("   - Grids defined.")

# --- 2. Define the Main Experiment Runner Function ---
print("\\n2. Defining the main 'run_experiment' function...")

# (A simplified function is defined here for validation.
# The actual logic will be used in the next snippet)
def placeholder_experiment_runner():
    print("   - 'run_experiment' function placeholder is defined.")
    return True

placeholder_experiment_runner()

print("\\nExperiment architecture is defined and ready for pilot runs.")
"""

# Write the script to a file
with open("run_experiment_logic.py", "w") as f:
    f.write(experiment_logic_script)

# Re-create the conda environment, now including scikeras
# (same package list as the previous snippet plus scikeras).
print("--- Recreating 'qml_project' conda environment with scikeras ---")
!conda create -n qml_project -y python=3.11 qiskit-machine-learning qiskit-algorithms tensorflow xgboost pandas scikit-learn tqdm scikeras

# Run the validation script with the recreated environment's interpreter.
print("\n--- Executing script in the new environment ---")
!source /usr/local/etc/profile.d/conda.sh && conda activate qml_project && python run_experiment_logic.py

--- Recreating 'qml_project' conda environment with scikeras ---
Channels:
 - conda-forge
Platform: linux-64
Collecting package metadata (repodata.json): - \ | / - \ | / done
Solving environment: \ | / done


    current version: 24.11.2
    latest version: 25.5.1

Please update conda by running

    $ conda update -n base -c conda-forge conda



## Package Plan ##

  environment location: /usr/local/envs/qml_project

  added / updated specs:
    - pandas
    - python=3.11
    - qiskit-algorithms
    - qiskit-machine-learning
    - scikeras
    - scikit-learn
    - tensorflow
    - tqdm
    - xgboost


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    scikeras-0.13.0            |     pyhd8ed1ab_0          27 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          27 KB

The following N

In [None]:
# --- Snippet 5 (Definitive Final Version): Pilot Run with Hybrid QNN ---
# Purpose: compare a LogisticRegression pipeline against a hybrid
# quantum/classical network on a 128-row stratified sample of the Breast
# Cancer CSV, reporting balanced accuracy and wall-clock time for each.
#
# How the embedded script works:
#   - Both models use the same preprocessing (median impute -> standardise ->
#     SelectKBest with 10 features); the QNN uses one qubit per feature.
#   - The QNN is an EstimatorQNN (ZZFeatureMap + RealAmplitudes, observable
#     Z on every qubit) wrapped by TorchConnector, followed by a Linear(1, 1)
#     layer and a sigmoid.
#   - Training is 10 full-batch Adam steps with BCELoss.
#   - The QNN timer starts before its preprocessing pipeline is fitted, and
#     both timers include prediction on the test split, so the reported
#     "fit_time_seconds" values cover more than model fitting alone.

# Define the Python code for this snippet.
# Backslashes are doubled inside the string so that the written file contains
# a single-backslash escape (e.g. "\n").
pilot_run_script = """
import pandas as pd
import numpy as np
import os
import time
from tqdm.auto import tqdm

# Scikit-learn
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import balanced_accuracy_score

# Classical Model
from sklearn.linear_model import LogisticRegression

# PyTorch and Qiskit
import torch
from torch import nn
from qiskit.circuit.library import ZZFeatureMap, RealAmplitudes
from qiskit.primitives import Estimator
from qiskit.quantum_info import SparsePauliOp
from qiskit_machine_learning.connectors import TorchConnector
from qiskit_machine_learning.neural_networks import EstimatorQNN

print("--- Snippet 5 (Definitive Final): Pilot Run with Hybrid QNN ---")
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# --- 1. Define Models ---
print("1. Defining models for pilot run...")

# A. Classical Model
lr_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('selector', SelectKBest(score_func=f_classif, k=10)),
    ('classifier', LogisticRegression(C=10, penalty='l1', random_state=SEED, max_iter=1000, solver='liblinear'))
])

# B. Hybrid Quantum Neural Network (QNN)
num_features = 10
feature_map = ZZFeatureMap(feature_dimension=num_features, reps=1)
ansatz = RealAmplitudes(num_qubits=num_features, reps=3)

# Define the observable to measure
observable = SparsePauliOp("Z" * num_features)

# Use the EstimatorQNN for expectation value computation
qiskit_qnn = EstimatorQNN(
    circuit=feature_map.compose(ansatz),
    estimator=Estimator(),
    observables=observable,
    input_params=feature_map.parameters,
    weight_params=ansatz.parameters
)
initial_weights = (2 * np.random.rand(qiskit_qnn.num_weights) - 1)
torch_qnn = TorchConnector(qiskit_qnn, initial_weights=initial_weights)

# Define the full hybrid model
class HybridQNN(nn.Module):
    def __init__(self, qnn):
        super().__init__()
        self.qnn = qnn
        self.classical_layer = nn.Linear(1, 1) # QNN output is 1 value
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.qnn(x) # Pass the whole batch to TorchConnector
        x = self.classical_layer(x)
        return self.sigmoid(x)

pytorch_model = HybridQNN(torch_qnn)

# --- 2. Load and Prepare Data ---
print("\\n2. Loading and preparing Breast Cancer data...")
df = pd.read_csv('/content/breast.csv').drop(columns=['id', 'Unnamed: 32'])
le = LabelEncoder()
df['diagnosis'] = le.fit_transform(df['diagnosis'])
X, y = df.drop(columns=['diagnosis']), df['diagnosis']
X_sample, _, y_sample, _ = train_test_split(X, y, train_size=128, stratify=y, random_state=SEED)
X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.2, stratify=y_sample, random_state=SEED)
print(f"Data prepared. Train size: {len(y_train)}, Test size: {len(y_test)}")

# --- 3. Run Experiment ---
# A. Logistic Regression
print(f"\\n--- Running: LogisticRegression ---")
start_time_lr = time.time()
lr_pipeline.fit(X_train, y_train)
lr_preds = lr_pipeline.predict(X_test)
lr_accuracy = balanced_accuracy_score(y_test, lr_preds)
lr_time = time.time() - start_time_lr
print(f"Accuracy: {lr_accuracy:.3f}, Time: {lr_time:.2f}s")

# B. Hybrid QNN
print(f"\\n--- Running: HybridQNN ---")
start_time_qnn = time.time()
preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('selector', SelectKBest(score_func=f_classif, k=num_features)),
]).fit(X_train, y_train)
X_train_q = torch.tensor(preprocessor.transform(X_train), dtype=torch.float32)
X_test_q = torch.tensor(preprocessor.transform(X_test), dtype=torch.float32)
y_train_q = torch.tensor(y_train.values, dtype=torch.float32).reshape(-1, 1)
y_test_q = torch.tensor(y_test.values, dtype=torch.float32).reshape(-1, 1)

optimizer = torch.optim.Adam(pytorch_model.parameters(), lr=0.05)
loss_fn = torch.nn.BCELoss()
epochs = 10

pytorch_model.train()
for epoch in tqdm(range(epochs), desc="Training HybridQNN"):
    optimizer.zero_grad()
    output = pytorch_model(X_train_q)
    loss = loss_fn(output, y_train_q)
    loss.backward()
    optimizer.step()

pytorch_model.eval()
with torch.no_grad():
    test_output = pytorch_model(X_test_q)
    qnn_preds = (test_output >= 0.5).float().numpy()

qnn_accuracy = balanced_accuracy_score(y_test_q.numpy(), qnn_preds)
qnn_time = time.time() - start_time_qnn
print(f"Accuracy: {qnn_accuracy:.3f}, Time: {qnn_time:.2f}s")

print("\\n--- Pilot Run Summary ---")
results = pd.DataFrame([
    {'model': 'LogisticRegression', 'test_balanced_accuracy': lr_accuracy, 'fit_time_seconds': lr_time},
    {'model': 'HybridQNN', 'test_balanced_accuracy': qnn_accuracy, 'fit_time_seconds': qnn_time}
])
print(results.round(3))
"""

# Write the script to a file
with open("run_pilot_qnn.py", "w") as f:
    f.write(pilot_run_script)

# Re-create the conda environment with PyTorch
# (adds the `pytorch` package and the `pytorch` channel to the previous spec).
print("--- Recreating 'qml_project' conda environment with PyTorch ---")
!conda create -n qml_project -y python=3.11 pytorch qiskit-machine-learning qiskit-algorithms tensorflow xgboost pandas scikit-learn tqdm scikeras -c pytorch -c conda-forge

# Run the pilot script with the recreated environment's interpreter.
print("\n--- Executing script in the new environment ---")
!source /usr/local/etc/profile.d/conda.sh && conda activate qml_project && python run_pilot_qnn.py

--- Recreating 'qml_project' conda environment with PyTorch ---
Channels:
 - pytorch
 - conda-forge
Platform: linux-64
Collecting package metadata (repodata.json): - \ | / - \ | done
Solving environment: - \ | done


    current version: 24.11.2
    latest version: 25.5.1

Please update conda by running

    $ conda update -n base -c conda-forge conda



## Package Plan ##

  environment location: /usr/local/envs/qml_project

  added / updated specs:
    - pandas
    - python=3.11
    - pytorch
    - qiskit-algorithms
    - qiskit-machine-learning
    - scikeras
    - scikit-learn
    - tensorflow
    - tqdm
    - xgboost


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    _openmp_mutex-4.5          |       3_kmp_llvm           7 KB  conda-forge
    blas-1.0                   |              mkl           1 KB  conda-forge
    filelock-3.18.0            |     pyhd8ed1ab_

In [None]:
# --- Final Environment Setup (with GPU-enabled PyTorch) ---
import condacolab
# This command ensures conda is installed and ready.
condacolab.install()

# This single command creates the environment with all packages,
# specifying the correct channels for the GPU build of PyTorch.
!conda create -n qml_project -y python=3.11 pytorch torchvision torchaudio pytorch-cuda=11.8 qiskit-machine-learning qiskit-algorithms tensorflow xgboost pandas scikit-learn tqdm scikeras pillow -c pytorch -c nvidia -c conda-forge

# --- Verification ---
print("\\n--- Verifying PyTorch and CUDA installation ---")
!source /usr/local/etc/profile.d/conda.sh && \\
 conda activate qml_project && \\
 python -c 'import torch; print(f"PyTorch Version: {torch.__version__}"); print(f"CUDA Available: {torch.cuda.is_available()}");'

✨🍰✨ Everything looks OK!
Channels:
 - pytorch
 - nvidia
 - conda-forge
Platform: linux-64
Collecting package metadata (repodata.json): - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - done
Solving environment: | / - \ done


    current version: 24.11.2
    latest version: 25.5.1

Please update conda by running

    $ conda update -n base -c conda-forge conda



## Package Plan ##

  environment location: /usr/local/envs/qml_project

  added / updated specs:
    - pandas
    - pillow
    - python=3.11
    - pytorch
    - pytorch-cuda=11.8
    - qiskit-algorithms
    - qiskit-machine-learning
    - scikeras
    - scikit-learn
    - tensorflow
    - torchaudio
    - torchvision
    - tqdm
    - xgboost


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    aom-3.6.1                  |       h59595ed_0         2.6 MB  conda-forge
    cud

In [None]:
# --- Snippet 6 (Final Version, 8 Qubits) ---
# Purpose: repeat the pilot comparison (LogisticRegression vs. HybridQNN) on
# the leukemia images, using MobileNetV2 embeddings reduced to 8 features.
#
# How the embedded script works:
#   - Images under /content/leukemia_data/Segmented/ are labelled 0 when the
#     directory path contains "Benign" and 1 otherwise.
#   - A stratified sample of 128 images is embedded one image at a time with
#     MobileNetV2 (global-average-pooled), with TensorFlow pinned to the CPU.
#   - Both models use StandardScaler -> PCA(15) -> SelectKBest(k=8); the QNN
#     uses one qubit per selected feature.
#   - The QNN is trained for 4 full-batch Adam steps with BCELoss. The timer
#     starts before the reduction pipeline is fitted and stops after test-set
#     prediction.
#
# NOTE(review): the recorded run below reports a balanced accuracy of 0.500
# for the HybridQNN, which is chance level for a two-class problem, against
# 0.727 for LogisticRegression. The QNN result should not be read as a
# meaningful comparison until the training setup has been investigated.

# Define the Python code for this snippet.
# Backslashes are doubled inside the string so that the written file contains
# a single-backslash escape (e.g. "\n").
pilot_run_images_script = """
import pandas as pd
import numpy as np
import os
import time
from tqdm.auto import tqdm

# Scikit-learn, Models, PyTorch, Qiskit, Keras...
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.metrics import balanced_accuracy_score
from sklearn.linear_model import LogisticRegression
import torch
from torch import nn
from qiskit.circuit.library import ZZFeatureMap, RealAmplitudes
from qiskit.primitives import Estimator
from qiskit.quantum_info import SparsePauliOp
from qiskit_machine_learning.connectors import TorchConnector
from qiskit_machine_learning.neural_networks import EstimatorQNN
import tensorflow as tf
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input, MobileNetV2
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GlobalAveragePooling2D

print("--- Snippet 6: Pilot Run - Image Data (8 Qubits) ---")
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
tf.random.set_seed(SEED)

# --- 1. Load Image Paths ---
print("1. Loading image paths...")
IMAGE_PATH = '/content/leukemia_data/Segmented/'
image_data = [{"path": os.path.join(s, f), "label": 0 if "Benign" in s else 1}
              for s, d, files in os.walk(IMAGE_PATH) for f in files
              if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
df_images = pd.DataFrame(image_data)
print(f"Found {len(df_images)} images.")

# --- 2. Generate Embeddings on CPU for stability ---
print("\\n2. Subsampling and generating embeddings (forced on CPU)...")
X_sample_paths, _, y_sample, _ = train_test_split(df_images['path'], df_images['label'], train_size=128, stratify=df_images['label'], random_state=SEED)

with tf.device('/CPU:0'):
    base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(128, 128, 3))
    x = GlobalAveragePooling2D()(base_model.output)
    embedding_model = Model(inputs=base_model.input, outputs=x)
    embeddings = [embedding_model.predict(preprocess_input(np.expand_dims(img_to_array(load_img(p, target_size=(128, 128))), axis=0)), verbose=0).flatten()
                  for p in tqdm(X_sample_paths, desc="Generating Embeddings (CPU)")]
X_embedded = np.array(embeddings)
print(f"Embedding generation complete. Shape: {X_embedded.shape}")

# --- 3. Split Data & Define Models ---
X_train, X_test, y_train, y_test = train_test_split(X_embedded, y_sample.values, test_size=0.2, stratify=y_sample.values, random_state=SEED)

# --- ADJUSTMENT: Set number of features/qubits to 8 ---
num_features = 8
print(f"\\nADJUSTMENT: Using {num_features} features (qubits) for this run.")

lr_pipeline = Pipeline([('scaler', StandardScaler()), ('pca', PCA(n_components=15)), ('selector', SelectKBest(k=num_features)), ('classifier', LogisticRegression(random_state=SEED))])

feature_map = ZZFeatureMap(num_features)
ansatz = RealAmplitudes(num_features, reps=2)
observable = SparsePauliOp("Z" * num_features)
qiskit_qnn = EstimatorQNN(circuit=feature_map.compose(ansatz), estimator=Estimator(), observables=observable, input_params=feature_map.parameters, weight_params=ansatz.parameters)
torch_qnn = TorchConnector(qiskit_qnn, initial_weights=(2 * np.random.rand(qiskit_qnn.num_weights) - 1))

class HybridQNN(nn.Module):
    def __init__(self, qnn):
        super().__init__()
        self.qnn = qnn
        self.classical_layer = nn.Linear(1, 1)
    def forward(self, x):
        return torch.sigmoid(self.classical_layer(self.qnn(x)))
pytorch_model = HybridQNN(torch_qnn)

# --- 4. Run Experiment ---
print(f"\\n--- Running: LogisticRegression ({num_features} features) ---")
start_time_lr = time.time()
lr_pipeline.fit(X_train, y_train)
lr_preds = lr_pipeline.predict(X_test)
lr_accuracy = balanced_accuracy_score(y_test, lr_preds)
lr_time = time.time() - start_time_lr
print(f"Accuracy: {lr_accuracy:.3f}, Time: {lr_time:.2f}s")

print(f"\\n--- Running: HybridQNN ({num_features} qubits) ---")
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"PyTorch using device: {device.upper()}")

start_time_qnn = time.time()
reduction_pipeline = Pipeline([('scaler', StandardScaler()), ('pca', PCA(n_components=15)), ('selector', SelectKBest(k=num_features))]).fit(X_train, y_train)
X_train_q = torch.tensor(reduction_pipeline.transform(X_train), dtype=torch.float32).to(device)
y_train_q = torch.tensor(y_train, dtype=torch.float32).reshape(-1, 1).to(device)
pytorch_model.to(device)

optimizer = torch.optim.Adam(pytorch_model.parameters(), lr=0.05)
loss_fn = torch.nn.BCELoss()
epochs = 4 # Strictly 4 epochs as requested

pytorch_model.train()
for epoch in tqdm(range(epochs), desc=f"Training HybridQNN on {device.upper()}"):
    optimizer.zero_grad()
    output = pytorch_model(X_train_q)
    loss = loss_fn(output, y_train_q)
    loss.backward()
    optimizer.step()

pytorch_model.eval()
with torch.no_grad():
    X_test_q = torch.tensor(reduction_pipeline.transform(X_test), dtype=torch.float32).to(device)
    qnn_preds = (pytorch_model(X_test_q) >= 0.5).cpu().numpy()
qnn_accuracy = balanced_accuracy_score(y_test, qnn_preds)
qnn_time = time.time() - start_time_qnn
print(f"Accuracy: {qnn_accuracy:.3f}, Time: {qnn_time:.2f}s")
"""

# Write the script to a file
with open("run_pilot_images_8qubit.py", "w") as f:
    f.write(pilot_run_images_script)

# Execute using our stable environment
# (no environment recreation in this cell; it reuses the existing 'qml_project').
!source /usr/local/etc/profile.d/conda.sh && conda activate qml_project && python run_pilot_images_8qubit.py

--- Snippet 6: Pilot Run - Image Data (8 Qubits) ---
1. Loading image paths...
Found 3256 images.

2. Subsampling and generating embeddings (forced on CPU)...
Generating Embeddings (CPU): 100% 128/128 [00:12<00:00,  9.93it/s]
Embedding generation complete. Shape: (128, 1280)

ADJUSTMENT: Using 8 features (qubits) for this run.
  qiskit_qnn = EstimatorQNN(circuit=feature_map.compose(ansatz), estimator=Estimator(), observables=observable, input_params=feature_map.parameters, weight_params=ansatz.parameters)
  qiskit_qnn = EstimatorQNN(circuit=feature_map.compose(ansatz), estimator=Estimator(), observables=observable, input_params=feature_map.parameters, weight_params=ansatz.parameters)

--- Running: LogisticRegression (8 features) ---
Accuracy: 0.727, Time: 0.10s

--- Running: HybridQNN (8 qubits) ---
PyTorch using device: CUDA
Training HybridQNN on CUDA: 100% 4/4 [05:23<00:00, 80.95s/it]
Accuracy: 0.500, Time: 326.24s


In [None]:
# --- Snippet 7 (Final Optimized Version): Full-Scale Experiment ---

# Define the Python code for this snippet
full_experiment_script = """
import pandas as pd
import numpy as np
import os
import time
import warnings
from tqdm.auto import tqdm

# --- Environment and Global Setup ---
# Suppress every Python warning for the remainder of the script.
warnings.filterwarnings('ignore')
# Set before TensorFlow is imported further down, so the variable is already in the
# environment when the library loads (presumably to reduce TensorFlow's native log output).
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# Single seed shared by the global RNG seeding below and by the models' random_state arguments.
SEED = 42

# --- Library Imports ---
import torch, tensorflow as tf
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.metrics import balanced_accuracy_score, log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from torch import nn
from scikeras.wrappers import KerasClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from qiskit.circuit.library import ZZFeatureMap, RealAmplitudes
from qiskit.primitives import Estimator
from qiskit.quantum_info import SparsePauliOp
from qiskit_machine_learning.connectors import TorchConnector
from qiskit_machine_learning.neural_networks import EstimatorQNN
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input, MobileNetV2
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GlobalAveragePooling2D

# --- Set Global Seeds ---
# Apply the shared SEED to NumPy, PyTorch and TensorFlow, in that order.
for _seed_fn in (np.random.seed, torch.manual_seed, tf.random.set_seed):
    _seed_fn(SEED)
print("--- Snippet 7: Full-Scale Experiment Execution (Optimized) ---")

# --- 1. Data Loading ---
def load_and_prep_tabular(path, target_col, drop_cols):
    '''Load a CSV dataset and split it into numeric features and encoded labels.

    Args:
        path: Location of the CSV file (anything pandas.read_csv accepts).
        target_col: Name of the label column.
        drop_cols: Column names to discard before use; names that are not
            present in the file are ignored.

    Returns:
        (X, y): X is a DataFrame of the remaining columns coerced to numeric
        (cells that cannot be parsed become NaN); y is a Series of integer
        class codes produced by LabelEncoder from the stringified labels.
        Returns (None, None) when the file does not exist or has no target
        column, so the caller can skip the dataset instead of crashing.
    '''
    try:
        raw = pd.read_csv(path, low_memory=False)
    except FileNotFoundError:
        print(f"Skipping dataset: file not found at '{path}'.")
        return None, None

    df = raw.drop(columns=drop_cols, errors='ignore')
    if target_col not in df.columns:
        print(f"Skipping dataset: column '{target_col}' not found in '{path}'.")
        return None, None

    # Rows without a label would otherwise be stringified to 'nan' and encoded as an extra class.
    df = df.dropna(subset=[target_col]).reset_index(drop=True)

    # Encode labels from their string representation.
    codes = LabelEncoder().fit_transform(df[target_col].astype(str))
    X = df.drop(columns=[target_col]).apply(pd.to_numeric, errors='coerce')
    y = pd.Series(codes, index=df.index, name=target_col)
    return X, y

def load_and_prep_images(path):
    '''Collect labelled image files under path and embed them with MobileNetV2.

    Args:
        path: Root directory searched recursively for .png/.jpg/.jpeg files.

    Returns:
        (embeddings, labels): a NumPy array holding one flattened, globally
        average-pooled MobileNetV2 feature vector per image, and a pandas
        Series of labels (0 when the image's directory path contains "Benign",
        otherwise 1), both in the same order.

    Raises:
        FileNotFoundError: if no image files are found under path.
    '''
    image_data = []
    for folder, subdirs, files in os.walk(path):
        # os.walk yields entries in arbitrary order; sorting keeps the sample order
        # (and so every seeded subsample drawn from it) the same on every machine.
        subdirs.sort()
        for fname in sorted(files):
            if fname.lower().endswith(('.png', '.jpg', '.jpeg')):
                image_data.append({"path": os.path.join(folder, fname), "label": 0 if "Benign" in folder else 1})

    if not image_data:
        # An empty record list would otherwise surface later as an opaque KeyError on 'path'.
        raise FileNotFoundError(f"No .png/.jpg/.jpeg images found under '{path}'.")

    df = pd.DataFrame(image_data)
    # Build the feature extractor and run inference pinned to the CPU device.
    with tf.device('/CPU:0'):
        base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(128, 128, 3))
        x = GlobalAveragePooling2D()(base_model.output)
        embedding_model = Model(inputs=base_model.input, outputs=x)
        embeddings = []
        for p in tqdm(df['path'], desc="Generating All Image Embeddings (CPU)"):
            # Load at 128x128, add a batch axis, and apply MobileNetV2's input preprocessing.
            batch = preprocess_input(np.expand_dims(img_to_array(load_img(p, target_size=(128, 128))), axis=0))
            embeddings.append(embedding_model.predict(batch, verbose=0).flatten())
    return np.array(embeddings), df['label']

# --- 2. Model Definition ---
def get_models_and_params(num_features):
    '''Build a fresh set of candidate models together with their parameter grids.

    Args:
        num_features: Number of input features; also the width of the quantum
            feature map, ansatz and observable.

    Returns:
        dict mapping model name -> {'model': estimator, 'params': grid}, in the
        order LogisticRegression, RBF_SVM, RandomForest, XGBoost, MLP, HybridQNN.
    '''
    def create_mlp(meta, dropout_rate=0.3):
        # Keras MLP factory; the input width is read from meta["n_features_in_"].
        layers = [
            Dense(64, activation='relu', input_shape=(meta["n_features_in_"],)),
            Dropout(dropout_rate),
            Dense(32, activation='relu'),
            Dense(1, activation='sigmoid'),
        ]
        return Sequential(layers)

    def create_qnn():
        # Circuit: ZZFeatureMap for the inputs composed with a RealAmplitudes ansatz for the weights.
        feature_map = ZZFeatureMap(num_features)
        ansatz = RealAmplitudes(num_features, reps=2)
        observable = SparsePauliOp("Z" * num_features)
        qiskit_qnn = EstimatorQNN(
            circuit=feature_map.compose(ansatz),
            estimator=Estimator(),
            observables=observable,
            input_params=feature_map.parameters,
            weight_params=ansatz.parameters,
        )
        # Starting weights are drawn uniformly from [-1, 1) using NumPy's global RNG.
        start_weights = 2 * np.random.rand(qiskit_qnn.num_weights) - 1
        torch_qnn = TorchConnector(qiskit_qnn, initial_weights=start_weights)

        class HybridQNN(nn.Module):
            # Quantum layer -> Linear(1, 1) -> sigmoid.
            def __init__(self, qnn):
                super().__init__()
                self.qnn = qnn
                self.cls = nn.Linear(1, 1)

            def forward(self, x):
                return torch.sigmoid(self.cls(self.qnn(x)))

        return HybridQNN(torch_qnn)

    # Insertion order matters: callers iterate this dict to decide the run order.
    catalogue = {}
    catalogue['LogisticRegression'] = {
        'model': LogisticRegression(random_state=SEED, max_iter=2000, solver='liblinear'),
        'params': {'C': [0.1, 1, 10], 'penalty': ['l1', 'l2']},
    }
    catalogue['RBF_SVM'] = {
        'model': SVC(random_state=SEED, probability=True),
        'params': {'C': [0.1, 1, 10], 'gamma': ['scale']},
    }
    catalogue['RandomForest'] = {
        'model': RandomForestClassifier(random_state=SEED),
        'params': {'n_estimators': [50, 100], 'max_depth': [3, 5, 10]},
    }
    catalogue['XGBoost'] = {
        'model': XGBClassifier(random_state=SEED, eval_metric='logloss'),
        'params': {'n_estimators': [50, 100], 'learning_rate': [0.05, 0.1], 'max_depth': [3, 5]},
    }
    catalogue['MLP'] = {
        'model': KerasClassifier(model=create_mlp, loss="binary_crossentropy", optimizer="adam", verbose=0, random_state=SEED),
        'params': {'batch_size': [16, 32], 'epochs': [50], 'model__dropout_rate': [0.2, 0.3]},
    }
    catalogue['HybridQNN'] = {
        'model': create_qnn(),
        'params': {'lr': [0.05], 'epochs': [15]},
    }
    return catalogue

# --- 3. Main Experiment Loop ---
# Every (dataset, n_samples, seed, model) run appends one record here.
all_run_results = []
N_VALUES, SEEDS, NUM_FEATURES = [32, 64, 128, 256, 500], [42, 123, 456, 789, 101], 8
datasets_to_run = {
    "Breast_Cancer": {'type': 'tabular', 'path': '/content/breast.csv', 'target': 'diagnosis', 'drop': ['id', 'Unnamed: 32']},
    "Sonar": {'type': 'tabular', 'path': '/content/sonar.csv', 'target': 'Label', 'drop': []},
    "Lung_Cancer": {'type': 'tabular', 'path': '/content/lung_cancer.csv', 'target': 'Result', 'drop': ['Name', 'Surname']},
    "Leukemia": {'type': 'image', 'path': '/content/leukemia_data/Segmented/'}
}

# Image embeddings are cached on disk after the first computation and reused afterwards.
if os.path.exists('Leukemia_Embeddings.npz'):
    print("Loading pre-computed image embeddings...")
    # Read the arrays by key rather than by position, and close the archive when done.
    with np.load('Leukemia_Embeddings.npz') as cached:
        X_leukemia, y_leukemia = cached['X'], cached['y']
else:
    X_leukemia, y_leukemia = load_and_prep_images(datasets_to_run['Leukemia']['path'])
    np.savez('Leukemia_Embeddings.npz', X=X_leukemia, y=y_leukemia)
datasets_to_run['Leukemia']['data'] = (X_leukemia, y_leukemia)

for dataset_name, config in tqdm(datasets_to_run.items(), desc="Overall Datasets"):
    if config['type'] == 'tabular':
        X_full, y_full = load_and_prep_tabular(config['path'], config['target'], config['drop'])
    else:
        X_full, y_full = config['data']
    if X_full is None:
        continue
    n_classes = len(np.unique(y_full))

    for n_samples in tqdm(N_VALUES, desc=f"N Samples for {dataset_name}", leave=False):
        # The stratified subsample leaves len(y_full) - n_samples rows behind, and
        # train_test_split needs that remainder to hold at least n_classes rows. This also
        # covers n_samples == len(y_full), which train_test_split rejects, and n_samples > len(y_full).
        if len(y_full) - n_samples < n_classes:
            continue

        for seed in tqdm(SEEDS, desc=f"Seeds for N={n_samples}", leave=False):
            X_sample, _, y_sample, _ = train_test_split(X_full, y_full, train_size=n_samples, stratify=y_full, random_state=seed)
            X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.2, stratify=y_sample, random_state=seed)

            # PCA width is capped by the training rows, the raw feature count and 30.
            pca_components = min(int(X_train.shape[0] * 0.75) - 1, X_train.shape[1], 30)
            if pca_components < NUM_FEATURES:
                continue

            # Build the models only for runs that will actually execute; constructing the
            # QNN and the other estimators for a skipped run is wasted work.
            models = get_models_and_params(num_features=NUM_FEATURES)

            prep_pipeline_steps = [('scaler', StandardScaler()), ('pca', PCA(n_components=pca_components)), ('selector', SelectKBest(k=NUM_FEATURES))]
            if config['type'] == 'tabular':
                prep_pipeline_steps.insert(0, ('imputer', SimpleImputer(strategy='median')))
            prep_pipeline = Pipeline(prep_pipeline_steps)

            # Fit the preprocessing on the training split only, then apply it to the test split.
            X_train_prep = prep_pipeline.fit_transform(X_train, y_train)
            X_test_prep = prep_pipeline.transform(X_test)

            # Labels are a Series for freshly loaded data and an ndarray for cached embeddings.
            y_train_np = y_train.values if isinstance(y_train, pd.Series) else y_train
            y_test_np = y_test.values if isinstance(y_test, pd.Series) else y_test

            for model_name, model_info in models.items():
                start_time = time.time()

                if model_name != 'HybridQNN':
                    grid_search = GridSearchCV(model_info['model'], model_info['params'], cv=2, scoring='balanced_accuracy', n_jobs=2)
                    grid_search.fit(X_train_prep, y_train_np)
                    best_model = grid_search.best_estimator_
                    test_preds = best_model.predict(X_test_prep)
                else:
                    # The hybrid model is trained with a manual full-batch PyTorch loop.
                    device = 'cuda' if torch.cuda.is_available() else 'cpu'
                    qnn_model = model_info['model'].to(device)
                    X_train_q = torch.tensor(X_train_prep, dtype=torch.float32).to(device)
                    y_train_q = torch.tensor(y_train_np, dtype=torch.float32).reshape(-1, 1).to(device)
                    optimizer = torch.optim.Adam(qnn_model.parameters(), lr=model_info['params']['lr'][0])
                    loss_fn = torch.nn.BCELoss()
                    qnn_model.train()
                    for _ in range(model_info['params']['epochs'][0]):
                        optimizer.zero_grad()
                        loss = loss_fn(qnn_model(X_train_q), y_train_q)
                        loss.backward()
                        optimizer.step()
                    qnn_model.eval()
                    with torch.no_grad():
                        X_test_q = torch.tensor(X_test_prep, dtype=torch.float32).to(device)
                        # Threshold at 0.5, then flatten the (n, 1) boolean output to flat 0/1
                        # integer labels so it has the same form as the other models' predictions.
                        test_preds = (qnn_model(X_test_q) >= 0.5).cpu().numpy().ravel().astype(int)

                all_run_results.append({'dataset': dataset_name, 'n_samples': n_samples, 'seed': seed, 'model': model_name, 'test_b_accuracy': balanced_accuracy_score(y_test_np, test_preds), 'fit_time': time.time() - start_time})
                # Rewritten after every model run, so the file on disk always holds the results so far.
                pd.DataFrame(all_run_results).to_csv("results.csv", index=False)

# The doubled backslash is intentional: this script is stored inside a Python string literal.
print("\\n--- Full Experiment Complete ---")
if all_run_results:
    print(pd.read_csv("results.csv"))
else:
    # Nothing was written in this run, so do not read a missing or stale results.csv.
    print("No runs were executed; results.csv was not written.")
"""

# Write the script to a file
with open("run_full_experiment.py", "w") as f:
    f.write(full_experiment_script)

# Execute using our stable, GPU-enabled environment
!source /usr/local/etc/profile.d/conda.sh && conda activate qml_project && python run_full_experiment.py

--- Snippet 7: Full-Scale Experiment Execution (Optimized) ---
Loading pre-computed image embeddings...
Overall Datasets:   0% 0/4 [00:00<?, ?it/s]
N Samples for Breast_Cancer:   0% 0/5 [00:00<?, ?it/s][A

Seeds for N=32:   0% 0/5 [00:00<?, ?it/s][A[A2025-08-02 14:31:56.920487: E tensorflow/core/framework/node_def_util.cc:676] NodeDef mentions attribute use_unbounded_threadpool which is not in the op definition: Op<name=MapDataset; signature=input_dataset:variant, other_arguments: -> handle:variant; attr=f:func; attr=Targuments:list(type),min=0; attr=output_types:list(type),min=1; attr=output_shapes:list(shape),min=1; attr=use_inter_op_parallelism:bool,default=true; attr=preserve_cardinality:bool,default=false; attr=force_synchronous:bool,default=false; attr=metadata:string,default=""> This may be expected if your graph generating binary is newer  than this binary. Unknown attributes will be ignored. NodeDef: {{node ParallelMapDatasetV2/_15}}
2025-08-02 14:31:56.937328: E tensorf