In [24]:
# VARIATIONAL QUANTUM CIRCUIT WITH PYTORCH
# source: pennylane demos, kernel based training

In [25]:
%matplotlib inline
import numpy as np
import torch
from torch.nn.functional import relu

from sklearn.svm import SVC
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import pennylane as qml
from pennylane.templates import AngleEmbedding, StronglyEntanglingLayers

import matplotlib.pyplot as plt

np.random.seed(42)

In [26]:
# READ IN DATA
X_train = np.loadtxt("trainX.txt") # size 1600
y_train = np.loadtxt("trainY.txt")
X_test = np.loadtxt("testX.txt") # size 256
y_test = np.loadtxt("testY.txt")

# scaling the inputs is important since the embedding we use is periodic
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
scaler = StandardScaler().fit(X_test)
X_test = scaler.transform(X_test)

# swaps 0 with -1 so we classify by -1 and 1 instead of 0 and 1
y_train = np.where(y_train == 0, -1.0, 1.0) 
y_test = np.where(y_test == 0, -1.0, 1.0)

Using the variational principle of training, we can propose an *ansatz*
for the variational circuit and train it directly. By increasing the
number of layers of the ansatz, its expressivity increases. Depending on
the ansatz, we may only search through a subspace of all measurements
for the best candidate.

Remember from above, the variational training does not optimize
*exactly* the same cost as the SVM, but we try to match them as closely
as possible. For this we use a bias term in the quantum model, and train
on the hinge loss.

We also explicitly use the
[parameter-shift](https://pennylane.ai/qml/glossary/parameter_shift.html)
differentiation method in the quantum node, since this is a method which
works on hardware as well. While `diff_method='backprop'` or
`diff_method='adjoint'` would reduce the number of circuit evaluations
significantly, they are based on tricks that are only suitable for
simulators, and can therefore not scale to more than a few dozen qubits.


In [27]:
n_qubits = 2
dev_var = qml.device("lightning.qubit", wires=n_qubits)

@qml.qnode(dev_var, diff_method="parameter-shift")
def quantum_model(x, params):
    """A variational quantum model."""

    # embedding
    AngleEmbedding(x, wires=range(n_qubits))

    # trainable measurement
    StronglyEntanglingLayers(params, wires=range(n_qubits))
    return qml.expval(qml.PauliZ(0))

def quantum_model_plus_bias(x, params, bias):
    """Adding a bias."""
    return quantum_model(x, params) + bias

def hinge_loss(predictions, targets):
    """Implements the hinge loss."""
    all_ones = torch.ones_like(targets)
    hinge_loss = all_ones - predictions * targets
    # trick: since the max(0,x) function is not differentiable,
    # use the mathematically equivalent relu instead
    hinge_loss = relu(hinge_loss)
    return hinge_loss

We now summarize the usual training and prediction steps into two
functions similar to scikit-learn\'s `fit()` and `predict()`. While it
feels cumbersome compared to the one-liner used to train the kernel
method, PennyLane---like other differentiable programming
libraries---provides a lot more control over the particulars of
training.

In our case, most of the work is to convert between numpy and torch,
which we need for the differentiable `relu` function used in the hinge
loss.


In [28]:
def quantum_model_train(n_layers, steps, batch_size):
    """Train the quantum model defined above."""

    params = np.random.random((n_layers, n_qubits, 3))
    params_torch = torch.tensor(params, requires_grad=True)
    bias_torch = torch.tensor(0.0)

    opt = torch.optim.Adam([params_torch, bias_torch], lr=0.1)

    loss_history = []
    for i in range(steps):

        batch_ids = np.random.choice(len(X_train), batch_size)

        X_batch = X_train[batch_ids]
        y_batch = y_train[batch_ids]

        X_batch_torch = torch.tensor(X_batch, requires_grad=False)
        y_batch_torch = torch.tensor(y_batch, requires_grad=False)

        def closure():
            opt.zero_grad()
            preds = torch.stack(
                [quantum_model_plus_bias(x, params_torch, bias_torch) for x in X_batch_torch]
            )
            loss = torch.mean(hinge_loss(preds, y_batch_torch))

            # bookkeeping
            current_loss = loss.detach().numpy().item()
            loss_history.append(current_loss)
            if i % 100 == 0:
                print("step", i, ", loss", current_loss)

            loss.backward()
            return loss

        opt.step(closure)

    return params_torch, bias_torch, loss_history


def quantum_model_predict(X_pred, trained_params, trained_bias):
    """Predict using the quantum model defined above."""

    p = []
    for x in X_pred:

        x_torch = torch.tensor(x)
        pred_torch = quantum_model_plus_bias(x_torch, trained_params, trained_bias)
        pred = pred_torch.detach().numpy().item()
        if pred > 0:
            pred = 1
        else:
            pred = -1

        p.append(pred)
    return p

Let's train the variational model and see how well we are doing on the
test set.


In [30]:
start = time.time()

n_layers = 6
batch_size = 128
steps = 10000

with dev_var.tracker:
    trained_params, trained_bias, loss_history = quantum_model_train(n_layers, steps, batch_size)
    pred_train = quantum_model_predict(X_train, trained_params, trained_bias)
    pred_test = quantum_model_predict(X_test, trained_params, trained_bias)

print("accuracy on train set:", accuracy_score(pred_train, y_train))
print("accuracy on test set:", accuracy_score(pred_test, y_test))

plt.plot(loss_history)
plt.ylim((0, 1))
plt.xlabel("steps")
plt.ylabel("cost")
plt.show()

end = time.time()
print("runtime in minutes: ", (end-start)/60)

# 100 steps: 
# accuracy on train set: 0.68625
# accuracy on test set: 0.6796875

# 10,000 steps: 
# accuracy on train set: 0.68625
# accuracy on test set: 0.6796875

# 10,000 steps, full data:
# accuracy on train set: 0.71
# accuracy on test set: 0.68

KeyboardInterrupt: 

The variational circuit has a slightly lower accuracy than the SVM---but
this depends very much on the training settings we used. Different
random parameter initializations, more layers, or more steps may indeed
get perfect test accuracy.

How often was the device executed?


In [31]:
dev_var.tracker.totals['executions']

KeyError: 'executions'

That is a lot more than the kernel method took!

Let's try to understand this value. In each optimization step, the
variational circuit needs to compute the partial derivative of all
trainable parameters for each sample in a batch. Using parameter-shift
rules, we require roughly two circuit evaluations per partial
derivative. Prediction uses only one circuit evaluation per sample.

We can formulate this as another function that will be used in the
scaling plot below.


In [32]:
def circuit_evals_variational(n_data, n_params, n_steps, shift_terms, split, batch_size):
    """Compute how many circuit evaluations are needed for
       variational training and prediction."""

    M = int(np.ceil(split * n_data))
    Mpred = n_data - M

    n_training = n_params * n_steps * batch_size * shift_terms
    n_prediction = Mpred

    return n_training + n_prediction

This estimates the circuit evaluations in variational training as:


In [33]:
circuit_evals_variational(
    n_data=len(X),
    n_params=len(trained_params.flatten()),
    n_steps=steps,
    shift_terms=2,
    split=len(X_train) / (len(X_train) + len(X_test)),
    batch_size=batch_size,
)

NameError: name 'X' is not defined

The estimate is a bit higher because it does not account for some
optimizations that PennyLane performs under the hood.

It is important to note that while they are trained in a similar manner,
the number of variational circuit evaluations differs from the number of
neural network model evaluations in classical machine learning, which
would be given by:


In [None]:
def model_evals_nn(n_data, n_params, n_steps, split, batch_size):
    """Compute how many model evaluations are needed for neural
       network training and prediction."""

    M = int(np.ceil(split * n_data))
    Mpred = n_data - M

    n_training = n_steps * batch_size
    n_prediction = Mpred

    return n_training + n_prediction

In each step of neural network training, and due to the clever
implementations of automatic differentiation, the backpropagation
algorithm can compute a gradient for all parameters in (more-or-less) a
single run. For all we know at this stage, the no-cloning principle
prevents variational circuits from using these tricks, which leads to
`n_training` in `circuit_evals_variational` depending on the number of
parameters, but not in `model_evals_nn`.

For the same example as used here, a neural network would therefore have
far fewer model evaluations than both variational and kernel-based
training:


In [None]:
model_evals_nn(
    n_data=len(X),
    n_params=len(trained_params.flatten()),
    n_steps=steps,
    split=len(X_train) / (len(X_train) + len(X_test)),
    batch_size=batch_size,
)

Which method scales best?
=========================


The answer to this question depends on how the variational model is set
up, and we need to make a few assumptions:

1.  Even if we use single-batch stochastic gradient descent, in which
    every training step uses exactly one training sample, we would want
    to see every training sample at least once on average. Therefore,
    the number of steps should scale at least linearly with the number
    of training data samples.

2.  Modern neural networks often have many more parameters than training
    samples. But we do not know yet whether variational circuits really
    need that many parameters as well. We will therefore use two cases
    for comparison:

    2a) the number of parameters grows linearly with the training data,
    or `n_params = M`,

    2b) the number of parameters saturates at some point, which we model
    by setting `n_params = sqrt(M)`.

Note that compared to the example above with 75 training samples and 24
parameters, a) overestimates the number of evaluations, while b)
underestimates it.


This is how the three methods compare:


In [None]:
variational_training1 = []
variational_training2 = []
kernelbased_training = []
nn_training = []
x_axis = range(0, 2000, 100)

for M in x_axis:
    var1 = circuit_evals_variational(
        n_data=M, n_params=M, n_steps=M, shift_terms=2, split=0.75, batch_size=1
    )
    variational_training1.append(var1)

    var2 = circuit_evals_variational(
        n_data=M, n_params=round(np.sqrt(M)), n_steps=M,
        shift_terms=2, split=0.75, batch_size=1
    )
    variational_training2.append(var2)

    kernel = circuit_evals_kernel(n_data=M, split=0.75)
    kernelbased_training.append(kernel)

    nn = model_evals_nn(
        n_data=M, n_params=M, n_steps=M, split=0.75, batch_size=1
    )
    nn_training.append(nn)


plt.plot(x_axis, nn_training, linestyle='--', label="neural net")
plt.plot(x_axis, variational_training1, label="var. circuit (linear param scaling)")
plt.plot(x_axis, variational_training2, label="var. circuit (srqt param scaling)")
plt.plot(x_axis, kernelbased_training, label="(quantum) kernel")
plt.xlabel("size of data set")
plt.ylabel("number of evaluations")
plt.legend()
plt.tight_layout()
plt.show()

This is the plot we saw at the beginning. With current
hardware-compatible training methods, whether kernel-based training
requires more or fewer quantum circuit evaluations than variational
training depends on how many parameters the latter needs. If variational
circuits turn out to be as parameter-hungry as neural networks,
kernel-based training will outperform them for common machine learning
tasks. However, if variational learning only turns out to require few
parameters (or if more efficient training methods are found),
variational circuits could in principle match the linear scaling of
neural networks trained with backpropagation.

The practical take-away from this demo is that unless your variational
circuit has significantly fewer parameters than training data, kernel
methods could be a much faster alternative!

Finally, it is important to note that fault-tolerant quantum computers
may change the picture for both quantum and classical machine learning.
As mentioned in [Schuld (2021)](https://arxiv.org/abs/2101.11020), early
results from the quantum machine learning literature show that larger
quantum computers will most likely enable us to reduce the quadratic
scaling of kernel methods to linear scaling, which may make classical as
well as quantum kernel methods a strong alternative to neural networks
for big data processing one day.


About the author
================
