<a href="https://colab.research.google.com/github/thisisSHAX/ML_Ops_Project-/blob/main/Autism_Screening_ONNX_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Installing required libraries

In [None]:
!pip install wandb
!pip install onnx
!pip install skl2onnx
!pip install onnxruntime
!pip install onnxmltools

Collecting wandb
  Downloading wandb-0.12.7-py2.py3-none-any.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 5.1 MB/s 
[?25hCollecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.5.0-py2.py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 56.3 MB/s 
[?25hCollecting subprocess32>=3.5.3
  Downloading subprocess32-3.5.4.tar.gz (97 kB)
[K     |████████████████████████████████| 97 kB 6.9 MB/s 
Collecting GitPython>=1.0.0
  Downloading GitPython-3.1.24-py3-none-any.whl (180 kB)
[K     |████████████████████████████████| 180 kB 53.6 MB/s 
[?25hCollecting shortuuid>=0.5.0
  Downloading shortuuid-1.0.8-py3-none-any.whl (9.5 kB)
Collecting yaspin>=1.0.0
  Downloading yaspin-2.1.0-py3-none-any.whl (18 kB)
Collecting pathtools
  Downloading pathtools-0.1.2.tar.gz (11 kB)
Collecting docker-pycreds>=0.4.0
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting configparser>=3.8.1
  Downloading configparser-5.1.0-py3-none-any.whl (19 kB)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBRegressor
from onnx.tools.net_drawer import GetPydotGraph, GetOpNodeProducer
import onnxruntime as rt
from skl2onnx import convert_sklearn, __version__, update_registered_converter
from skl2onnx.common.data_types import FloatTensorType, StringTensorType
from skl2onnx.common.data_types import Int64TensorType
from skl2onnx.common.shape_calculator import calculate_linear_regressor_output_shapes
from onnxmltools.convert.xgboost.operator_converters.XGBoost import convert_xgboost  # noqa
import pprint
import os
import wandb

### Loading the dataset

In [None]:
# Download the CSV dataset (hosted on Google Drive) straight into a DataFrame.
DATASET_URL = 'https://drive.google.com/uc?export=download&id=16Pd-BdhzNMzPqPWD6XEu_XFZaGrSqFUz'
df = pd.read_csv(DATASET_URL)

In [None]:
# Preview the first five rows to check that the file parsed as expected.
df.head(n=5)

Unnamed: 0,Case_No,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,Age_Mons,Qchat-10-Score,Sex,Ethnicity,Jaundice,Family_mem_with_ASD,Who completed the test,Class/ASD Traits
0,1,0,0,0,0,0,0,1,1,0,1,28,3,f,middle eastern,yes,no,family member,No
1,2,1,1,0,0,0,1,1,0,0,0,36,4,m,White European,yes,no,family member,Yes
2,3,1,0,0,0,0,0,1,1,0,1,36,4,m,middle eastern,yes,no,family member,Yes
3,4,1,1,1,1,1,1,1,1,1,1,24,10,m,Hispanic,no,no,family member,Yes
4,5,1,1,0,1,1,1,1,1,1,1,20,9,f,White European,no,yes,family member,Yes


### Identifying categorical and numerical variables

In [None]:
# Columns fed to the model: A1..A10 followed by age and the remaining descriptive fields.
x_columns = [f'A{i}' for i in range(1, 11)] + [
    'Age_Mons',
    'Sex',
    'Ethnicity',
    'Jaundice',
    'Family_mem_with_ASD',
]

In [None]:
# Columns treated as categorical: the A1..A10 indicators plus the string-valued fields.
cat_features = [f'A{i}' for i in range(1, 11)] + [
    'Sex',
    'Ethnicity',
    'Jaundice',
    'Family_mem_with_ASD',
]

In [None]:
# Numeric features are whatever remains of x_columns once the categorical ones are removed.
# A list comprehension (instead of a set difference) preserves the order of x_columns,
# so the result stays deterministic across runs if more numeric columns are ever added.
num_features = [col for col in x_columns if col not in cat_features]

In [None]:
# One-hot encode the categorical columns of the selected features.
# drop_first=True drops the first level of every encoded column.
encoded_df = pd.get_dummies(
    df.loc[:, x_columns],
    columns=cat_features,
    drop_first=True,
)

In [None]:
# Show five randomly chosen encoded rows (unseeded, so the selection changes per run).
encoded_df.sample(n=5)

Unnamed: 0,Age_Mons,A1_1,A2_1,A3_1,A4_1,A5_1,A6_1,A7_1,A8_1,A9_1,A10_1,Sex_m,Ethnicity_Latino,Ethnicity_Native Indian,Ethnicity_Others,Ethnicity_Pacifica,Ethnicity_White European,Ethnicity_asian,Ethnicity_black,Ethnicity_middle eastern,Ethnicity_mixed,Ethnicity_south asian,Jaundice_yes,Family_mem_with_ASD_yes
616,28,1,1,0,1,1,1,1,1,1,1,1,0,0,0,0,0,1,0,0,0,0,0,0
682,29,0,1,0,0,1,1,0,1,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0
988,36,0,0,0,0,1,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0
555,33,1,0,1,1,1,1,0,1,1,1,1,0,0,0,0,1,0,0,0,0,0,1,0
55,36,0,0,1,1,0,1,1,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0


In [None]:
# Names of the columns produced by the one-hot encoding, as a plain list.
x_features = encoded_df.columns.tolist()

In [None]:
# Display (rows, columns) of the encoded feature table as a sanity check.
encoded_df.shape

(1054, 24)

### Data Splitting

In [None]:
# Feature matrix for the split below: the one-hot encoded frame (same object, not a copy).
X = encoded_df

In [None]:
# Target: dummy-encode the class label, dropping the first level.
# NOTE: the trailing space in 'Class/ASD Traits ' is part of the column key.
# Presumably this yields a single indicator column for a two-level label — verify against the data.
Y = pd.get_dummies(
    data=df['Class/ASD Traits '],
    drop_first=True,
)

In [None]:
# Three-way split of the one-hot encoded features.
# Step 1: hold out 20% of all rows as the test set.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
)

# Step 2: carve a "prod" set out of the remaining 80%
# (0.25 x 0.8 = 0.2 of the full data).
X_train, X_prod, Y_train, Y_prod = train_test_split(
    X_train,
    Y_train,
    test_size=0.25,
    random_state=2,
)

In [None]:
# Three-way split of the raw (un-encoded) feature columns.
# Step 1: hold out 20% of all rows as the test set.
train_X, test_X, train_Y, test_Y = train_test_split(
    df[x_columns],
    Y,
    test_size=0.2,
    random_state=1,
)

# Step 2: carve a "prod" set out of the remaining 80%
# (0.25 x 0.8 = 0.2 of the full data).
train_X, prod_X, train_Y, prod_Y = train_test_split(
    train_X,
    train_Y,
    test_size=0.25,
    random_state=2,
)

### Creating the pipeline for deployment

In [None]:
# Preprocessing: standardize the numeric columns and one-hot encode the categorical
# ones. handle_unknown='ignore' makes the encoder tolerate categories it did not
# see during fit instead of raising.
numeric_transformer = Pipeline([('scaler', StandardScaler())])
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, num_features),
    ('cat', categorical_transformer, cat_features),
])

# Hyper-parameters passed to the XGBoost regressor.
params = dict(
    n_estimators=400,
    max_depth=4,
    objective='reg:squarederror',
)
xgb_regressor = XGBRegressor(**params)

# Single end-to-end estimator: preprocessing followed by the regressor.
reg = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', xgb_regressor),
])
reg.fit(train_X, train_Y)

# Root-mean-squared error on the held-out test split.
rmse = np.sqrt(mean_squared_error(test_Y, reg.predict(test_X)))

In [None]:
# Report the test-set RMSE computed in the previous cell.
print(rmse)

0.22110099


### Creating the ONNX Graph

In [None]:
def convert_dataframe_schema(df, drop=None):
    """Describe each DataFrame column as a named ONNX input.

    Args:
        df: DataFrame whose columns and dtypes define the inputs.
        drop: Optional collection of column names to leave out.

    Returns:
        A list of ``(column_name, tensor_type)`` tuples in column order.
        ``int64`` and ``float64`` columns map to ``FloatTensorType([None, 1])``;
        every other dtype maps to ``StringTensorType([None, 1])``.
    """
    skipped = () if drop is None else drop
    schema = []
    for name, dtype in zip(df.columns, df.dtypes):
        if name in skipped:
            continue
        # Integer and float columns are both declared as float tensors.
        is_numeric = dtype == 'int64' or dtype == 'float64'
        tensor_cls = FloatTensorType if is_numeric else StringTensorType
        schema.append((name, tensor_cls([None, 1])))
    return schema


# Derive the ONNX input declarations from the raw training features
# (the same columns the fitted pipeline was trained on).
inputs = convert_dataframe_schema(train_X)

# Pretty-print the (name, tensor type) pairs for inspection.
pprint.pprint(inputs)

[('A1', FloatTensorType(shape=[None, 1])),
 ('A2', FloatTensorType(shape=[None, 1])),
 ('A3', FloatTensorType(shape=[None, 1])),
 ('A4', FloatTensorType(shape=[None, 1])),
 ('A5', FloatTensorType(shape=[None, 1])),
 ('A6', FloatTensorType(shape=[None, 1])),
 ('A7', FloatTensorType(shape=[None, 1])),
 ('A8', FloatTensorType(shape=[None, 1])),
 ('A9', FloatTensorType(shape=[None, 1])),
 ('A10', FloatTensorType(shape=[None, 1])),
 ('Age_Mons', FloatTensorType(shape=[None, 1])),
 ('Sex', StringTensorType(shape=[None, 1])),
 ('Ethnicity', StringTensorType(shape=[None, 1])),
 ('Jaundice', StringTensorType(shape=[None, 1])),
 ('Family_mem_with_ASD', StringTensorType(shape=[None, 1]))]


In [None]:
# Register XGBRegressor with skl2onnx so that convert_sklearn can handle it
# inside a scikit-learn pipeline: the shape calculator comes from skl2onnx,
# the operator converter from onnxmltools.
update_registered_converter(
    XGBRegressor,
    'XGBoostXGBRegressor',
    calculate_linear_regressor_output_shapes,
    convert_xgboost,
)

### Converting the pipeline to ONNX format

In [None]:
# Convert the fitted pipeline (preprocessing + regressor) into one ONNX model,
# using the per-column input declarations built above.
model_onnx = convert_sklearn(
    reg,
    name='pipeline_xgboost',
    initial_types=inputs,
)

### Storing the pipeline in a file

In [None]:
# Local directory and file name under which the exported ONNX pipeline is stored.
MODEL_DIR = './autismscreeningprod'
MODEL_FILE_NAME = 'autismscreening_xgboost.onnx'

In [None]:
# Create the output directory. exist_ok=True lets this cell be re-run without
# raising FileExistsError once the directory is already there.
os.makedirs(MODEL_DIR, exist_ok=True)

# Serialize the ONNX model and write the bytes to disk.
with open(os.path.join(MODEL_DIR, MODEL_FILE_NAME), "wb") as f:
    f.write(model_onnx.SerializeToString())

In [None]:
# List the working directory to confirm the model directory was created.
!ls -al

total 20
drwxr-xr-x 1 root root 4096 Nov 22 12:29 .
drwxr-xr-x 1 root root 4096 Nov 22 10:46 ..
drwxr-xr-x 2 root root 4096 Nov 22 12:29 autismscreeningprod
drwxr-xr-x 4 root root 4096 Nov 18 14:35 .config
drwxr-xr-x 1 root root 4096 Nov 18 14:36 sample_data


### Uploading the pipeline to Weights & Biases for tracking

In [None]:
# Authenticate with Weights & Biases without embedding a secret in the notebook.
# wandb.login() uses the WANDB_API_KEY environment variable when it is set and
# otherwise prompts for the key interactively.
# SECURITY: the API key that used to be hard-coded in this cell has been exposed
# in plain text and should be revoked/rotated in the W&B account settings.
wandb.login()

In [None]:
# Start a W&B run for this model, recording the XGBoost hyper-parameters as its config.
# NOTE(review): the 'TargetEncoding' tag does not obviously match the one-hot
# encoding used in the pipeline — confirm the tag is intended.
wandb.init(
    project='autism_screening',
    config=params,
    tags=['XGB', 'TargetEncoding'],
)

# Give the run a readable display name.
wandb.run.name = "XGBProdModel"

[34m[1mwandb[0m: Currently logged in as: [33mmaytune[0m (use `wandb login --relogin` to force relogin)


In [None]:
# Record the test-set RMSE as a metric on the active run.
wandb.log(dict(rmse=rmse))

In [None]:
# Declare a versioned W&B artifact of type 'model' that will hold the exported files.
model_artifact = wandb.Artifact(
    name="XGBoost_AutismScreening",
    type='model',
    description='XGBoost Model for autism tracking prediction',
)

In [None]:
# Attach everything inside the local model directory to the artifact.
model_artifact.add_dir(local_path=MODEL_DIR)

[34m[1mwandb[0m: Adding directory to artifact (./autismscreeningprod)... Done. 0.1s


In [None]:
# Log the artifact against the active run.
wandb.run.log_artifact(model_artifact)

<wandb.sdk.wandb_artifacts.Artifact at 0x7f4e1b8445d0>

In [None]:
# Close the W&B run.
# The argument-less wandb.save() call that used to precede this was removed:
# without a glob it syncs no files (W&B treats it as a deprecated no-op).
wandb.finish()



VBox(children=(Label(value=' 0.35MB of 0.35MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
rmse,▁

0,1
rmse,0.2211
