In [1]:
import os 
import dotenv
from pathlib import Path

In [2]:
# Project root: two directory levels above the current working directory.
project_dir = Path.cwd().resolve().parents[1]

In [3]:
# Load environment variables from the .env file located at the project root.
dotenv_path = str(project_dir / ".env")
dotenv.load_dotenv(dotenv_path)

True

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mlflow

In [5]:
# Name of the MLflow experiment used for the runs logged in this notebook.
# NOTE(review): the original note here read "set to bottom stage"; its intent is unclear — confirm.
experiment_name = "sentiment_clf_onnx"

In [6]:
# Make this experiment the active one; MLflow creates it when it does not exist yet.
mlflow.set_experiment(experiment_name)

2023/07/26 22:14:19 INFO mlflow.tracking.fluent: Experiment with name 'sentiment_clf_onnx' does not exist. Creating a new experiment.


<Experiment: artifact_location='s3://mlflow/2', creation_time=1690424059862, experiment_id='2', last_update_time=1690424059862, lifecycle_stage='active', name='sentiment_clf_onnx', tags={}>

In [7]:
experiment = mlflow.get_experiment_by_name(experiment_name)

In [8]:
print(f"experiment id: {experiment.experiment_id}")

experiment id: 2


In [9]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.stem.porter import PorterStemmer

In [10]:
# English stop-word list from NLTK's stopwords corpus.
# NOTE(review): no later cell shown here references it (the vectorizer uses stop_words=None).
english_stopwords = stopwords.words("english")

In [11]:
porter = PorterStemmer()

In [12]:
def tokenize(text):
    """Split ``text`` on runs of whitespace and return the resulting tokens.

    Leading/trailing whitespace is ignored, so an empty or all-whitespace
    string yields an empty list.
    """
    tokens = text.split()
    return tokens

def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of every token.

    Relies on the module-level ``porter`` stemmer; tokens keep their
    original order.
    """
    stems = []
    for word in text.split():
        stems.append(porter.stem(word))
    return stems

In [13]:
# load dataset

In [14]:
# Read the pre-split train and test sets from <project_dir>/data/processed.
df_train, df_test = (
    pd.read_parquet(project_dir.joinpath("data", "processed", filename))
    for filename in ("dataset.train.parquet", "dataset.test.parquet")
)

In [15]:
df_train.head()

Unnamed: 0,text,airline,label,label_text
2920,question flying 1st time 16mont old son need b...,delta,1,NEUTRAL
507,wont find luggage without blaming everyone else,american,0,NEGATIVE
3988,lucky people like Annamarie Norris BWI hope ge...,southwest,2,POSITIVE
3383,AA Gate Supervisor emplid 600117 allowed fligh...,us airways,1,NEUTRAL
1196,lost luggage birthday wish find luggage,southwest,0,NEGATIVE


In [16]:
df_test.head()

Unnamed: 0,text,airline,label,label_text
1574,scm1133 hate delays tried http co 7STktJXAN1 a...,us airways,0,NEGATIVE
2049,Cancelled Flightled flight UA922 aircrft maint...,united,0,NEGATIVE
3710,Thanks guys got heading Milan Wednesday big we...,american,2,POSITIVE
1210,Thank AH Still wish someone picked phone 3hrs ...,southwest,0,NEGATIVE
3058,trying go far away King isCollegeLondon possib...,delta,1,NEUTRAL


In [17]:
df_test.shape

(876, 4)

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score

In [19]:
# set up the TF-IDF vectorizer as a standalone step (not wrapped in a Pipeline)

In [20]:
# Term-frequency vectorizer: with use_idf=False and norm=None the features are plain
# (unweighted, unnormalized) counts of lower-cased, Porter-stemmed unigrams.
tfidf_vect = TfidfVectorizer(
    norm=None,
    lowercase=True,
    tokenizer=tokenizer_porter,
    # A custom tokenizer replaces the default regex tokenization. Setting
    # token_pattern=None states that explicitly and avoids scikit-learn's
    # "token_pattern will not be used since tokenizer is not None" warning at fit time.
    token_pattern=None,
    stop_words=None,
    use_idf=False,
    ngram_range=(1, 1),
)

In [21]:
# Learn the vocabulary from the training texts only. fit() returns the vectorizer itself,
# so text_vectorizer and tfidf_vect refer to the same fitted object.
text_vectorizer = tfidf_vect.fit(df_train["text"].values)



In [22]:
X_train = text_vectorizer.transform(df_train["text"].values)

In [23]:
X_test = text_vectorizer.transform(df_test["text"].values)

In [24]:
print(X_train.shape)
print(X_test.shape)

(3504, 5339)
(876, 5339)


In [25]:
# Integer class labels as NumPy arrays (per the previews above: 0=NEGATIVE, 1=NEUTRAL, 2=POSITIVE).
y_train = df_train["label"].values
y_test = df_test["label"].values

In [26]:
# create the SGD model 

In [27]:
# Hyperparameters are reused from a previous run.
# Hinge loss with an L2 penalty makes this a linear SVM optimized with SGD.
sgd_clf = SGDClassifier(
    # objective
    loss="hinge",
    penalty="l2",
    alpha=0.01,
    # optimization schedule
    max_iter=16,
    shuffle=True,
    random_state=13,  # fixed seed so the shuffling is repeatable between runs
)

In [28]:
# train
# Fit on the sparse feature matrix produced by the vectorizer.
sgd_clf.fit(X_train, y_train)

In [29]:
sgd_clf.score(X_test, y_test)

0.7009132420091324

In [30]:
# check the accuracy + f1 score

In [31]:
y_preds = sgd_clf.predict(X_test)

In [32]:
accuracy_score(y_test, y_preds)

0.7009132420091324

In [33]:
f1_score(y_test, y_preds, average=None)

array([0.81635802, 0.1843318 , 0.54393305])

In [34]:
f1_score(y_test, y_preds, average="weighted")

0.6387219540729272

In [35]:
## ONNX

In [36]:
import onnx
from skl2onnx import to_onnx
from onnxruntime import InferenceSession
from skl2onnx.common.data_types import FloatTensorType

  tys = obj.typeStr or ''
  if getattr(obj, 'isHomogeneous', False):
  return getattr(obj, attribute)


In [37]:
# Set the input signature

In [38]:
# One float32 input tensor named "input": variable batch size (None) by one column per vocabulary term.
initial_type = [("input", FloatTensorType([None, X_train.shape[1]]))]

In [39]:
# convert sklearn model to onnx

In [40]:
# Only the fitted classifier is converted (the text vectorizer is not part of the ONNX graph);
# the conversion is pinned to opset 12.
sgd_onx_fmt = to_onnx(sgd_clf, initial_types=initial_type, target_opset=12)

In [41]:
# run predictions on the ONNX model with ONNX Runtime

In [42]:
# Build the ONNX Runtime session straight from the serialized model bytes (no file on disk involved).
sess = InferenceSession(sgd_onx_fmt.SerializeToString())

In [43]:
# benchmark

In [49]:
%%timeit

# NOTE(review): the timed body includes the sparse -> dense conversion and the float32 cast,
# so this measures conversion + inference rather than ONNX Runtime inference alone.
pred_ort = sess.run(None, {'input': X_test.toarray().astype(np.float32)})[0]
# print(pred_ort)

14.7 ms ± 464 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [50]:
%%timeit

# NOTE(review): the dense conversion is timed here as well; the classifier was given the
# sparse matrix directly in the earlier score/predict cells, so this is not its usual input path.
pred_skl = sgd_clf.predict(X_test.toarray())
# print(pred_skl)

13.4 ms ± 721 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [53]:
%%time

pred_ort = sess.run(None, {'input': X_test.toarray().astype(np.float32)})[0]
print(pred_ort)

[0 0 2 0 0 0 0 0 2 0 2 0 0 0 1 2 2 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 2 0 0
 0 2 0 0 1 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 2 0 0 0 0 0 0 0
 0 0 0 2 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 0 0 0 0 2 0 0 0 0
 0 1 0 0 0 2 0 0 0 1 1 0 0 0 2 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 2 0 0 0 0 0 0
 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 2 0 0 0 2 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 2 2 0 0
 2 0 0 0 0 0 0 2 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 2 0 0 0 0 0 0 0 0
 0 0 0 0 0 2 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
 0 0 2 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 2 2 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 2 0
 2 2 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 2 0 0 1 2 0 0 0 0 0 0 0 0 2
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 2 0 0 1 0 0 0 0 0 0 2 2 0 0 2 0
 0 0 2 0 2 2 0 0 0 0 0 0 

In [54]:
%%time

pred_skl = sgd_clf.predict(X_test.toarray())
print(pred_skl)

[0 0 2 0 0 0 0 0 2 0 2 0 0 0 1 2 2 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 2 0 0
 0 2 0 0 1 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 2 0 0 0 0 0 0 0
 0 0 0 2 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 0 0 0 0 2 0 0 0 0
 0 1 0 0 0 2 0 0 0 1 1 0 0 0 2 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 2 0 0 0 0 0 0
 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 2 0 0 0 2 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 2 2 0 0
 2 0 0 0 0 0 0 2 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 2 0 0 0 0 0 0 0 0
 0 0 0 0 0 2 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
 0 0 2 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 2 2 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 2 0
 2 2 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 2 0 0 1 2 0 0 0 0 0 0 0 0 2
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 2 0 0 1 0 0 0 0 0 0 2 2 0 0 2 0
 0 0 2 0 2 2 0 0 0 0 0 0 

In [55]:
# save onnx and tfidf for future use

In [56]:
# Write the converted model to a file in the current working directory.
onnx.save(sgd_onx_fmt, "sentiment-sgd-onnx.onnx")

In [57]:
import joblib

In [58]:
# Persist the fitted vectorizer as well: the ONNX model takes vectorized input, not raw text.
# NOTE(review): the vectorizer references the notebook-defined tokenizer_porter, and pickling
# stores functions by reference — loading this file elsewhere presumably requires that function
# to be importable under the same name. Confirm before deploying.
joblib.dump(text_vectorizer, "sentiment-sgd-tf-vectorizer.joblib")

['sentiment-sgd-tf-vectorizer.joblib']

In [59]:
# to load saved onnx model

In [60]:
# Read the saved model back from disk.
# NOTE(review): loaded_onnx is not used by any later cell shown here.
loaded_onnx = onnx.load("sentiment-sgd-onnx.onnx")

In [61]:
from mlflow.models import infer_signature

In [62]:
with mlflow.start_run(experiment_id=experiment.experiment_id, run_name="initial_run"):
    # Everything logged inside this block is attached to the "initial_run" run.
    
    # signature: inferred from an example input and an example of the model output
    # NOTE(review): the example input is the vectorizer's sparse output for the training set,
    # while the predictions come from the dense float32 test matrix — confirm the inferred
    # input dtype/shape is what serving clients are expected to send.
    train = X_train
    predictions = sess.run(None, {'input': X_test.toarray().astype(np.float32)})[0]  # compute model predictions
    signature = infer_signature(train, predictions)
    
    # log the ONNX model as a run artifact under "sentiment-sgd-onnx"
    mlflow.onnx.log_model(
        sgd_onx_fmt,
        "sentiment-sgd-onnx",
        signature=signature
        
    )

In [63]:
# test logged model
# Resolve the run ID from the run that just logged the model instead of hard-coding it,
# so the notebook still works after Restart & Run All (every execution creates a new run ID).
last_run = mlflow.last_active_run()
if last_run is None:
    raise RuntimeError(
        "No MLflow run found in this session; execute the model-logging cell first."
    )
logged_model = f"runs:/{last_run.info.run_id}/sentiment-sgd-onnx"

In [64]:
loaded_model = mlflow.pyfunc.load_model(logged_model)

In [65]:
# Predict through the generic pyfunc wrapper on the dense test matrix; the result is
# indexed by output name ("output_label") in the next cell.
preds = loaded_model.predict(X_test.toarray())

In [66]:
preds["output_label"]

array([0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 2, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 2, 2, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2,
       0, 0, 0, 1, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 2, 0, 0, 2, 0, 0, 0, 0,
       0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,
       0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [67]:
# register model

In [68]:
# Register the logged model in the MLflow Model Registry; the returned object describes
# the newly created model version (its name and version are printed below).
model_registry_version  = mlflow.register_model(logged_model, "sentiment-model-onnx")

Successfully registered model 'sentiment-model-onnx'.
2023/07/26 22:20:22 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: sentiment-model-onnx, version 1
Created version '1' of model 'sentiment-model-onnx'.


In [69]:
# check registered model 
print(f"model name: {model_registry_version.name}")
print(f"model version: {model_registry_version.version}")

model name: sentiment-model-onnx
model version: 1


In [70]:
# transition the model version's stage from None to Staging

In [71]:
from mlflow import MlflowClient

In [72]:
client = MlflowClient()
# Move the freshly registered version into the "Staging" stage.
# NOTE(review): newer MLflow releases deprecate model stages in favor of aliases — check
# the installed version before relying on this call.
client.transition_model_version_stage(
    name=model_registry_version.name, version=model_registry_version.version, stage="Staging"
)

<ModelVersion: aliases=[], creation_timestamp=1690424422542, current_stage='Staging', description='', last_updated_timestamp=1690424426080, name='sentiment-model-onnx', run_id='2c9c870f68fc41eda7559d192b517e80', run_link='', source='s3://mlflow/2/2c9c870f68fc41eda7559d192b517e80/artifacts/sentiment-sgd-onnx', status='READY', status_message='', tags={}, user_id='', version='1'>

In [73]:
# try to load model based on stage
model_name = "sentiment-model-onnx"
stage = "Staging"

In [74]:
# A "models:/<name>/<stage>" URI loads the registered model version currently in that stage.
selected_model = mlflow.pyfunc.load_model(model_uri=f"models:/{model_name}/{stage}")

In [75]:
preds = selected_model.predict(X_test.toarray())

In [76]:
preds["output_label"]

array([0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 2, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 2, 2, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2,
       0, 0, 0, 1, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 2, 0, 0, 2, 0, 0, 0, 0,
       0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,
       0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [77]:
import json

# Dump the second test row (index 1) as a dense nested list, e.g. for use as a sample payload.
with open("test.json", "w") as f:
    f.write(json.dumps(X_test[1].toarray().tolist()))

