In [1]:
import logging
import os
import pickle

import numpy as np
import pandas as pd
import onnx
from onnxruntime import backend
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier

from skl2onnx import convert_sklearn, get_latest_tested_opset_version
from skl2onnx.common.data_types import (
    FloatTensorType,
)

# Opset version returned by skl2onnx's get_latest_tested_opset_version();
# meant to be supplied as `target_opset` when calling convert_sklearn.
TARGET_OPSET = get_latest_tested_opset_version()

In [236]:
# Where the generated artifacts are written. The directory can be overridden
# through the ONNXTESTDUMP environment variable; every file created later in
# the notebook is named "<basename>.<suffix>" inside that directory.
basename = "SklearnKNeighborsClassifierMulti"
folder = os.getenv("ONNXTESTDUMP", "tests_dump")
os.makedirs(folder, exist_ok=True)

In [240]:
# Load the iris dataset and build a frame holding the four measurements plus a
# `plant_type` column with the species name instead of the integer class code.
iris = datasets.load_iris()
target_names_map = {code: name for code, name in enumerate(iris.target_names)}
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    plant_type=pd.Series(iris.target).map(target_names_map)
)

In [241]:
# Element-wise lookup from integer class code to species name, usable directly
# on numpy arrays of codes.
target_names_mapper = np.vectorize(
    {code: name for code, name in enumerate(iris.target_names)}.get
)

In [242]:
# Species labels from the frame as a fixed-width unicode numpy array.
df["plant_type"].to_numpy().astype("<U10")

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolo

In [243]:
# Same labels obtained by mapping the raw integer targets through the
# vectorized lookup (for comparison with the frame column shown above).
target_names_mapper(iris["target"])

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolo

In [192]:
# Column names available in the frame.
df.columns.tolist()

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)',
 'target']

In [244]:
# Feature columns fed to the classifier (petal width is deliberately left out
# of this list).
x_columns = [
    "sepal length (cm)",
    "sepal width (cm)",
    "petal length (cm)",
]

In [211]:
# Feature matrix as float32, matching the FloatTensorType input declared for
# the ONNX conversion.
X = df[x_columns].to_numpy(dtype=np.float32)
# The label column created when the frame is built is `plant_type`; the frame
# has no `target` column, so indexing by that name raises KeyError on a fresh
# kernel. Labels are kept as fixed-width unicode strings.
y = df["plant_type"].to_numpy().astype("<U10")

In [212]:
# Fit a default k-nearest-neighbours classifier on the selected features, then
# record the reference outputs: [predicted labels, per-class probabilities].
model = KNeighborsClassifier().fit(X, y)
prediction = [model.predict(X), model.predict_proba(X)]

In [213]:
# One-column frame holding the label predicted for every training row.
prediction_df = pd.DataFrame({"prediction_class": model.predict(X)})

In [214]:
# Sanity check: (rows, columns) of the predicted-label frame.
prediction_df.shape

(150, 1)

In [215]:
# Per-class probabilities for every training row.
# Column headers are derived from `model.classes_`, which is the order used by
# the columns of `predict_proba`. The frame has no `target` column to take the
# names from, and the order of first appearance in the data is not guaranteed
# to match the classifier's class order anyway.
prediction_df = pd.DataFrame(
    model.predict_proba(X),
    columns=[f"class_is_{label}" for label in model.classes_],
)

In [216]:
# Display the per-class probability frame (one `class_is_<label>` column per class).
prediction_df

Unnamed: 0,class_is_setosa,class_is_versicolor,class_is_virginica
0,1.0,0.0,0.0
1,1.0,0.0,0.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,1.0,0.0,0.0
...,...,...,...
145,0.0,0.2,0.8
146,0.0,0.2,0.8
147,0.0,0.2,0.8
148,0.0,0.0,1.0


In [227]:
# Convert the fitted classifier to an ONNX model. The graph has one input named
# "input": a float tensor with a dynamic batch dimension and X.shape[1] features.
model_onnx = convert_sklearn(
    model,
    name="KNN classifier multi-class",
    initial_types=[("input", FloatTensorType([None, X.shape[1]]))],
    # Pass the opset explicitly so the exported model's opset is visible in the
    # notebook rather than left to the converter's implicit default.
    target_opset=TARGET_OPSET,
)

In [228]:
# Persist the full input frame (features and labels) next to the model files.
dest_data = os.path.join(folder, f"{basename}.data.pkl")
with open(dest_data, mode="wb") as data_file:
    pickle.dump(df, data_file)

In [229]:
# Persist the reference outputs of the scikit-learn model: the
# [labels, probabilities] list computed right after fitting.
dest_expected = os.path.join(folder, f"{basename}.expected.pkl")
with open(dest_expected, mode="wb") as expected_file:
    pickle.dump(prediction, expected_file)

In [230]:
# Persist the fitted scikit-learn estimator itself.
dest_pkl = os.path.join(folder, f"{basename}.model.pkl")
with open(dest_pkl, mode="wb") as pickle_file:
    pickle.dump(model, pickle_file)

In [231]:
# Serialize the converted ONNX model to disk.
dest_onnx = os.path.join(folder, basename + ".model.onnx")
with open(dest_onnx, "wb") as onnx_file:
    onnx_file.write(model_onnx.SerializeToString())
# Report the destination path (not the file object's repr), and only after the
# write has completed and the file has been closed. Lazy %-style arguments let
# logging skip the formatting when INFO is disabled.
logging.info("created %s", dest_onnx)

In [232]:
# Round trip: read the ONNX file back from disk, prepare it with the
# onnxruntime backend on CPU and score the same feature matrix.
onnx_graph = onnx.load(dest_onnx)
rep = backend.prepare(onnx_graph, device="CPU")
prediction_from_saved = rep.run(X)

In [233]:
# Print the model-level metadata fields of the loaded ONNX model, one
# "name=value" line per field.
for field_name in (
    "doc_string",
    "domain",
    "ir_version",
    "metadata_props",
    "model_version",
    "producer_name",
    "producer_version",
):
    print(f"{field_name}={getattr(onnx_graph, field_name)}")

doc_string=
domain=ai.onnx
ir_version=7
metadata_props=[]
model_version=0
producer_name=skl2onnx
producer_version=1.7.0


In [250]:
# Combine both ONNX outputs in one frame: the probability columns are renamed
# to "plant_type_is_<label>" and the predicted label is appended as
# `plant_type_pred`.
prediction_from_saved_df = (
    pd.DataFrame(prediction_from_saved[1])
    .rename(columns="plant_type_is_{}".format)
    .assign(plant_type_pred=pd.Series(prediction_from_saved[0]))
)
prediction_from_saved_df

In [234]:
# First ONNX output (predicted labels) as a single-column frame.
pd.DataFrame({"predicted": prediction_from_saved[0]})

Unnamed: 0,predicted
0,setosa
1,setosa
2,setosa
3,setosa
4,setosa
...,...
145,virginica
146,virginica
147,virginica
148,virginica


In [235]:
# Second ONNX output (class probabilities) as a frame.
# NOTE(review): the displayed columns are the class labels, so this output is
# presumably a sequence of per-row {label: probability} mappings; confirm.
pd.DataFrame(prediction_from_saved[1])

Unnamed: 0,setosa,versicolor,virginica
0,1.0,0.0,0.0
1,1.0,0.0,0.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,1.0,0.0,0.0
...,...,...,...
145,0.0,0.2,0.8
146,0.0,0.2,0.8
147,0.0,0.2,0.8
148,0.0,0.0,1.0
