1. Train your PCA and the subsequent model using scikit-learn.
2. Convert the entire pipeline to ONNX format using convert_sklearn from the skl2onnx package.

In this example, we first create a pipeline that includes PCA and logistic regression. We then train this pipeline on the Iris dataset. After training, we define the initial types for the ONNX model and convert the entire pipeline to ONNX format. Finally, we save the ONNX model to a file.

In [8]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

# Load the Iris dataset
iris = load_iris()
X, y = iris.data, iris.target

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Create a pipeline with PCA and logistic regression
pipeline = Pipeline([
    ('pca', PCA(n_components=2)),
    ('logistic_regression', LogisticRegression())
])
pipeline

Pipeline(steps=[('pca', PCA(n_components=2)),
                ('logistic_regression', LogisticRegression())])

In [9]:

# Train the pipeline
pipeline.fit(X_train, y_train)

# Define the initial types for the ONNX model
initial_type = [('float_input', FloatTensorType([None, X_train.shape[1]]))]

# Convert the pipeline to ONNX
onnx_model = convert_sklearn(pipeline, initial_types=initial_type)
onnx_model

ir_version: 9
producer_name: "skl2onnx"
producer_version: "1.17.0"
domain: "ai.onnx"
model_version: 0
doc_string: ""
graph {
  node {
    input: "float_input"
    input: "mean"
    output: "sub_result"
    name: "Sub"
    op_type: "Sub"
    domain: ""
  }
  node {
    input: "sub_result"
    input: "transform_matrix"
    output: "variable"
    name: "MatMul"
    op_type: "MatMul"
    domain: ""
  }
  node {
    input: "variable"
    output: "label"
    output: "probability_tensor"
    name: "LinearClassifier"
    op_type: "LinearClassifier"
    attribute {
      name: "classlabels_ints"
      ints: 0
      ints: 1
      ints: 2
      type: INTS
    }
    attribute {
      name: "coefficients"
      floats: -2.6541717052459717
      floats: 0.924582839012146
      floats: -0.2773626446723938
      floats: 0.2747637629508972
      floats: 2.9315342903137207
      floats: -1.199346661567688
      type: FLOATS
    }
    attribute {
      name: "intercepts"
      floats: -0.4931544065475464

In [10]:

# Save the model to a file
with open("pipeline.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())

Please note that once the model is in ONNX format, you cannot apply PCA or any other scikit-learn operations to it. All transformations and training should be done prior to conversion.

If you need to apply PCA to new data using the ONNX model, you would need to apply PCA to your data separately and then pass the transformed data to the ONNX model for inference.

In [7]:

# Load the ONNX model and run inference
import onnxruntime as rt

sess = rt.InferenceSession("pipeline.onnx")
sess

<onnxruntime.capi.onnxruntime_inference_collection.InferenceSession at 0x7fe4890825b0>

In [11]:
input_name = sess.get_inputs()[0].name
input_name

'float_input'

In [12]:
label_name = sess.get_outputs()[0].name
label_name

'output_label'

In [16]:
X_test

array([[5.1, 3.5, 1.4, 0.2],
       [4.6, 3.2, 1.4, 0.2],
       [5.6, 3. , 4.5, 1.5],
       [6.6, 3. , 4.4, 1.4],
       [6.3, 2.7, 4.9, 1.8],
       [5.8, 2.7, 4.1, 1. ],
       [5.2, 4.1, 1.5, 0.1],
       [6.9, 3.2, 5.7, 2.3],
       [5.5, 2.6, 4.4, 1.2],
       [6.4, 2.7, 5.3, 1.9],
       [4.4, 2.9, 1.4, 0.2],
       [6.4, 3.2, 4.5, 1.5],
       [5.7, 3. , 4.2, 1.2],
       [5.9, 3. , 4.2, 1.5],
       [4.9, 3.6, 1.4, 0.1],
       [6.4, 3.1, 5.5, 1.8],
       [6. , 2.2, 5. , 1.5],
       [6.4, 2.8, 5.6, 2.1],
       [5.1, 2.5, 3. , 1.1],
       [6.1, 2.9, 4.7, 1.4],
       [5.8, 2.7, 3.9, 1.2],
       [5.1, 3.5, 1.4, 0.3],
       [5.4, 3.9, 1.7, 0.4],
       [7.7, 2.8, 6.7, 2. ],
       [5.4, 3.7, 1.5, 0.2],
       [6.7, 3. , 5.2, 2.3],
       [4.3, 3. , 1.1, 0.1],
       [6.7, 3. , 5. , 1.7],
       [5.4, 3. , 4.5, 1.5],
       [5.2, 3.5, 1.5, 0.2]])

In [17]:
y_test

array([0, 0, 1, 1, 2, 1, 0, 2, 1, 2, 0, 1, 1, 1, 0, 2, 2, 2, 1, 1, 1, 0,
       0, 2, 0, 2, 0, 1, 1, 0])

In [15]:
 # test data
test = X_test.astype(np.float32)
test

array([[5.1, 3.5, 1.4, 0.2],
       [4.6, 3.2, 1.4, 0.2],
       [5.6, 3. , 4.5, 1.5],
       [6.6, 3. , 4.4, 1.4],
       [6.3, 2.7, 4.9, 1.8],
       [5.8, 2.7, 4.1, 1. ],
       [5.2, 4.1, 1.5, 0.1],
       [6.9, 3.2, 5.7, 2.3],
       [5.5, 2.6, 4.4, 1.2],
       [6.4, 2.7, 5.3, 1.9],
       [4.4, 2.9, 1.4, 0.2],
       [6.4, 3.2, 4.5, 1.5],
       [5.7, 3. , 4.2, 1.2],
       [5.9, 3. , 4.2, 1.5],
       [4.9, 3.6, 1.4, 0.1],
       [6.4, 3.1, 5.5, 1.8],
       [6. , 2.2, 5. , 1.5],
       [6.4, 2.8, 5.6, 2.1],
       [5.1, 2.5, 3. , 1.1],
       [6.1, 2.9, 4.7, 1.4],
       [5.8, 2.7, 3.9, 1.2],
       [5.1, 3.5, 1.4, 0.3],
       [5.4, 3.9, 1.7, 0.4],
       [7.7, 2.8, 6.7, 2. ],
       [5.4, 3.7, 1.5, 0.2],
       [6.7, 3. , 5.2, 2.3],
       [4.3, 3. , 1.1, 0.1],
       [6.7, 3. , 5. , 1.7],
       [5.4, 3. , 4.5, 1.5],
       [5.2, 3.5, 1.5, 0.2]], dtype=float32)

In [5]:

pred_onx = sess.run([label_name], {input_name: test})[0]

# Output the predictions from ONNX model
print(pred_onx)


[2 1 0 2 2 2 1 1 1 0 0 1 1 2 2 1 2 1 1 2 1 1 0 1 2 2 0 1 0 1]


In [18]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,pred_onx)) 
# result is indeed not great

0.4
