# Setup Environment

In [1]:
import truera

In [3]:
#################
## Truera Code ##
#################

import json

from truera.client.truera_workspace import TrueraWorkspace
from truera.client.truera_authentication import TokenAuthentication
from truera.client.truera_authentication import ServiceAccountAuthentication

from truera.client.ingestion import ColumnSpec, ModelOutputContext
from truera.client.public.communicator.http_communicator import AlreadyExistsError

In [None]:
#natwest -- use client ID and secret
#se demo -- use basic auth

In [15]:
with open("truera_credentials.json", "r") as f:
    _ = json.load(f)
    TRUERA_CLIENT_ID = _["client ID"]
    TRUERA_CLIENT_SECRET = _["client secret"]
    #TRUERA_TOKEN = _["token"]

In [16]:
from random import randrange
from datetime import date, datetime
version=randrange(10) 

PROJECT_NAME = "SM TruEra Pipeline LOCAL {} {}".format(date.today(), version)

In [16]:
DATA_COLLECTION_NAME = "Abalone"

In [24]:
# Initial setup of project & data collection

In [18]:
CONNECTION_STRING = <INSERT DEPLOYMENT URL>
auth = ServiceAccountAuthentication(
    client_id = TRUERA_CLIENT_ID,
    client_secret = TRUERA_CLIENT_SECRET
)

In [19]:
tru = TrueraWorkspace(CONNECTION_STRING, auth)

INFO:truera.client.remote_truera_workspace:Connecting to 'https://nwg-test.sandbox.truera.com'
, client side: 12.2.0
.


In [20]:
try:
    tru.add_project(PROJECT_NAME, score_type="regression")
except AlreadyExistsError as e:
    tru.delete_project(PROJECT_NAME)
    tru.add_project(PROJECT_NAME, score_type="regression")
tru.add_data_collection(DATA_COLLECTION_NAME)

# Step 3: Define a Processing Step for Feature Engineering

In [21]:
import argparse
import os
import requests
import tempfile
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [22]:
# Because this is a headerless CSV file, specify the column names here.
feature_columns_names = [
    "sex",
    "length",
    "diameter",
    "height",
    "whole_weight",
    "shucked_weight",
    "viscera_weight",
    "shell_weight",
]
label_column = "rings"

feature_columns_dtype = {
    "sex": str,
    "length": np.float64,
    "diameter": np.float64,
    "height": np.float64,
    "whole_weight": np.float64,
    "shucked_weight": np.float64,
    "viscera_weight": np.float64,
    "shell_weight": np.float64
}
label_column_dtype = {"rings": np.float64}

def merge_two_dicts(x, y):
    z = x.copy()
    z.update(y)
    return z

base_dir = os.getcwd()

df = pd.read_csv(
    f"{base_dir}/data/input/abalone-dataset.csv",
    header=None, 
    names=feature_columns_names + [label_column],
    dtype=merge_two_dicts(feature_columns_dtype, label_column_dtype)
)
numeric_features = list(feature_columns_names)
numeric_features.remove("sex")
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ]
)

categorical_features = ["sex"]
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ]
)

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

y = df.pop("rings")
X_pre = preprocess.fit_transform(df)

#get post transform column names, as list
ohe_feat = preprocess.transformers_[1][1]\
    ['onehot'].get_feature_names_out()
cat_post_list = ohe_feat.tolist()
input_columns = ['rings']+numeric_features+cat_post_list

y_pre = y.to_numpy().reshape(len(y), 1)
X = np.concatenate((y_pre, X_pre), axis=1)

np.random.shuffle(X)
train, validation, test = np.split(X, [int(.7*len(X)), int(.85*len(X))])

train_df = pd.DataFrame(train)
train_df.columns = [str(c) for c in input_columns]
train_df.to_csv(f"{base_dir}/data/train/train.csv", header=False, index=False)
validation_df = pd.DataFrame(validation)
validation_df.columns = [str(c) for c in input_columns]
validation_df.to_csv(f"{base_dir}/data/validation/validation.csv", header=False, index=False)
test_df = pd.DataFrame(test)
test_df.columns = [str(c) for c in input_columns]
test_df.to_csv(f"{base_dir}/data/test/test.csv", header=False, index=False)

##########################
## Truera code ##
##########################
print(train_df.columns)
print(train_df.reset_index().shape)
print(input_columns[1:])
print(input_columns[0])

Index(['rings', 'length', 'diameter', 'height', 'whole_weight',
       'shucked_weight', 'viscera_weight', 'shell_weight', 'x0_F', 'x0_I',
       'x0_M'],
      dtype='object')
(2923, 12)
['length', 'diameter', 'height', 'whole_weight', 'shucked_weight', 'viscera_weight', 'shell_weight', 'x0_F', 'x0_I', 'x0_M']
rings


In [23]:
tru.add_data(data=train_df.reset_index(), 
             data_split_name='train',
             column_spec=ColumnSpec(id_col_name='index',
                                    pre_data_col_names=input_columns[1:],
                                    label_col_names=input_columns[0]))

tru.add_data(data=validation_df.reset_index(), 
             data_split_name='validation',
             column_spec=ColumnSpec(id_col_name='index',
                                    pre_data_col_names=input_columns[1:],
                                    label_col_names=input_columns[0]))

tru.add_data(data=test_df.reset_index(), 
             data_split_name='test',
             column_spec=ColumnSpec(id_col_name='index',
                                    pre_data_col_names=input_columns[1:],
                                    label_col_names=input_columns[0]))

Uploading tmpol7g8lme.parquet (91.1KiB) -- ### -- file upload complete.
Put resource done.


INFO:truera.client.remote_truera_workspace:Waiting for data split to materialize...
INFO:truera.client.remote_truera_workspace:Materialize operation id: a23920a2-f172-4e6a-97ec-7c6a2727095a finished with status: SUCCEEDED.


Uploading tmpuxt3acc2.parquet (32.9KiB) -- ### -- file upload complete.
Put resource done.


INFO:truera.client.remote_truera_workspace:Waiting for data split to materialize...
INFO:truera.client.remote_truera_workspace:Materialize operation id: 18f1f8dd-5771-43a4-9b65-871653cf0331 finished with status: SUCCEEDED.


Uploading tmpl45fa65l.parquet (33.1KiB) -- ### -- file upload complete.
Put resource done.


INFO:truera.client.remote_truera_workspace:Waiting for data split to materialize...
INFO:truera.client.remote_truera_workspace:Materialize operation id: 34d73db0-0fc9-46f2-b9e2-686cf844257c finished with status: SUCCEEDED.


# Step 5: Define a Processing Step for Model Evaluation

In [19]:
#!pip install xgboost

Collecting xgboost
  Obtaining dependency information for xgboost from https://files.pythonhosted.org/packages/01/11/4840ff80f77a393ac929197b27335873c056e24e20231df97523786bcbf8/xgboost-2.0.0-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl.metadata
  Downloading xgboost-2.0.0-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl.metadata (2.0 kB)
Using cached xgboost-2.0.0-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl (2.2 MB)
Installing collected packages: xgboost
Successfully installed xgboost-2.0.0


In [40]:
#%%writefile abalone_truera/evaluation.py
import json
import pathlib
import pickle
import tarfile
import joblib
import numpy as np
import pandas as pd
import xgboost

from sklearn.metrics import mean_squared_error

In [41]:
import xgboost as xgb

# Create regression matrices
dtrain_reg = xgb.DMatrix(train_df[input_columns[1:]], train_df[input_columns[0]])

In [42]:
params = {"objective": "reg:squarederror"}

n = 50
model = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=n,
)

In [43]:
model

<xgboost.core.Booster at 0x7feea0342190>

In [44]:
X_test = xgb.DMatrix(test_df[input_columns[1:]], test_df[input_columns[0]])

In [45]:
predictions = model.predict(X_test)

In [46]:
##########################
## Truera-specific code ##
##########################
tru.set_model_execution("local")

INFO:truera.client.truera_workspace:Model execution environment set to 'local'


In [47]:
tru.add_python_model("xgb_abalone_model", model)

INFO:truera.client.remote_truera_workspace:Uploading xgboost model: Booster
INFO:truera.client.remote_truera_workspace:Verifying model...
INFO:truera.client.remote_truera_workspace:✔️ Verified packaged model format.
INFO:truera.client.remote_truera_workspace:✔️ Loaded model in current environment.
INFO:truera.client.remote_truera_workspace:✔️ Called predict on model.
INFO:truera.client.remote_truera_workspace:✔️ Verified model output.
INFO:truera.client.remote_truera_workspace:Verification succeeded!


Uploading MLmodel (226.0B) -- ### -- file upload complete.
Uploading tmpc3_r13zk.json (236.4KiB) -- ### -- file upload complete.
Uploading conda.yaml (208.0B) -- ### -- file upload complete.
Uploading xgboost_booster_regression_predict_wrapper.py (578.0B) -- ### -- file upload complete.
Uploading xgboost_booster_regression_predict_wrapper.cpython-38.pyc (1.2KiB) -- ### -- file upload complete.
Put resource done.


INFO:truera.client.remote_truera_workspace:Model "xgb_abalone_model" is added and associated with data collection "Abalone". "xgb_abalone_model" is set as the model for the workspace context.


Model uploaded to: http://se-demo-server.eastus.cloudapp.azure.com/home/p/Abalone%20Test/m/xgb_abalone_model/


INFO:truera.client.remote_truera_workspace:Triggering computations for model predictions on split train.
INFO:truera.client.remote_truera_workspace:Data collection in remote environment is now set to "Abalone". 
INFO:truera.client.remote_truera_workspace:Setting model context to "xgb_abalone_model".


In [None]:
tru

{
    "project": "Abalone Test",
    "data-collection": "Abalone",
    "data-split": "test",
    "model": "xgb_abalone_model",
    "connection-string": "https://app.truera.net",
    "model_execution": "local"
}

In [48]:
tru.get_data_splits()

['train', 'validation', 'test']

In [49]:
dftest = tru.get_xs()

In [50]:
dftest

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,x0_F,x0_I,x0_M
0,0.924461,1.129910,0.609334,1.043700,0.662949,1.052967,1.244156,1.0,0.0,0.0
1,0.674624,0.626020,0.609334,0.821401,1.095505,0.742751,0.385593,0.0,0.0,1.0
2,-2.031938,-1.893430,-1.423087,-1.421985,-1.321854,-1.428759,-1.500373,0.0,1.0,0.0
3,1.882167,1.633801,1.685322,2.870020,3.377691,3.092180,2.199712,1.0,0.0,0.0
4,0.924461,0.827576,0.848442,1.114061,1.187874,0.820305,1.122018,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
622,1.091018,1.230689,1.207105,1.222151,0.602121,1.253695,2.343405,1.0,0.0,0.0
623,0.174951,0.222908,0.250672,-0.176906,-0.030944,-0.206144,-0.221508,0.0,1.0,0.0
624,0.674624,0.676409,0.489780,1.605566,0.856698,1.381431,0.726863,0.0,0.0,1.0
625,0.924461,0.928355,0.250672,1.116100,1.307278,1.646027,0.511324,1.0,0.0,0.0


In [51]:
DMtest = xgb.DMatrix(dftest)

In [52]:
preds = model.predict(DMtest)

In [53]:
preds_df = pd.DataFrame(preds, index=dftest.index).reset_index()

In [54]:
preds_df.head()

Unnamed: 0,index,0
0,0,12.442741
1,1,9.519217
2,2,5.068023
3,3,10.652112
4,4,10.812168


In [55]:
preds_df.columns

Index(['index', 0], dtype='object')

In [56]:
tru.get_models()

['xgb_abalone_model']

In [57]:
tru.get_models()[0]

'xgb_abalone_model'

In [58]:
for split in tru.get_data_splits():
    temp = tru.get_xs()
    DMtemp = xgb.DMatrix(temp)
    preds = model.predict(DMtemp)
    preds_df = pd.DataFrame(preds, index=temp.index, columns=['rings']).reset_index()
    print('predictions generated for split {}'.format(split))
    
    print('adding predictions to data split {}'.format(split))
    tru.add_data(
    data=preds_df,
    data_split_name=split,
    column_spec=ColumnSpec(
        id_col_name="index",
        prediction_col_names='rings'),
        
    model_output_context=ModelOutputContext(
        model_name=tru.get_models()[0],
        score_type='regression')
    )

    print('generating and uploading feature influences for split {}'.format(split))
    tru.compute_feature_influences()
    print('generating and uploading error influences for split {}'.format(split))
    tru.compute_error_influences()

predictions generated for split train
adding predictions to data split train
Uploading tmpmwttao18.parquet (8.1KiB) -- ### -- file upload complete.
Put resource done.


INFO:truera.client.remote_truera_workspace:Waiting for data split to materialize...
INFO:truera.client.remote_truera_workspace:Materialize operation id: e7fef118-e0be-4d2a-ac2e-36a971aa7890 finished with status: SUCCEEDED.


generating and uploading feature influences for split train


INFO:truera.client.truera_workspace:Download temp_dir: /var/folders/xy/j480xtkx56dd7r7r1h8q8tl40000gn/T/tmp9qy7g0ab
INFO:truera.client.truera_workspace:Syncing data collection "Abalone" to local.
INFO:truera.client.local.local_truera_workspace:Data collection in local environment is now set to "Abalone". 
INFO:truera.client.truera_workspace:Syncing data split "train" to local.
INFO:truera.client.local.local_truera_workspace:Data split "train" is added to local data collection "Abalone", and set as the data split for the workspace context.
INFO:truera.client.truera_workspace:Downloading model xgb_abalone_model...
INFO:truera.client.truera_workspace:Syncing segments groups from remote to local.
INFO:truera.client.local.local_truera_workspace:The previous data collection ("Abalone") and its associated data splits and/or models have been cleared from the local environment workspace context.
INFO:truera.client.local.local_truera_workspace:Data collection in local environment is now set to "

|          | 0.000% [00:00<?]

Uploading tmppd_627cv.parquet (103.5KiB) -- ### -- file upload complete.
Put resource done.


INFO:truera.client.remote_truera_workspace:Waiting for data split to materialize...
INFO:truera.client.remote_truera_workspace:Materialize operation id: 3293490b-1c51-433e-b7d5-7be8d2cab6f1 finished with status: SUCCEEDED.
INFO:truera.client.truera_workspace:Inferred error `score_type` to be "mean_absolute_error_for_regression"


generating and uploading error influences for split train


INFO:truera.client.truera_workspace:Download temp_dir: /var/folders/xy/j480xtkx56dd7r7r1h8q8tl40000gn/T/tmp9qy7g0ab
INFO:truera.client.truera_workspace:Syncing data collection "Abalone" to local.
INFO:truera.client.truera_workspace:Syncing segments groups from remote to local.


|          | 0.000% [00:00<?]

Uploading tmp4aie_0y8.parquet (103.5KiB) -- ### -- file upload complete.
Put resource done.


INFO:truera.client.remote_truera_workspace:Waiting for data split to materialize...
INFO:truera.client.remote_truera_workspace:Materialize operation id: d754989a-1ae8-4e0d-a39d-763d1da3112f finished with status: SUCCEEDED.


predictions generated for split validation
adding predictions to data split validation
Uploading tmpfrfn02da.parquet (33.4KiB) -- ### -- file upload complete.
Put resource done.


INFO:truera.client.remote_truera_workspace:Waiting for data split to materialize...
INFO:truera.client.remote_truera_workspace:Materialize operation id: 980781ee-aa58-4da2-9fa2-02763dc6ec02 finished with status: SUCCEEDED.


generating and uploading feature influences for split validation


INFO:truera.client.truera_workspace:Download temp_dir: /var/folders/xy/j480xtkx56dd7r7r1h8q8tl40000gn/T/tmp9qy7g0ab
INFO:truera.client.truera_workspace:Syncing data collection "Abalone" to local.
INFO:truera.client.truera_workspace:Syncing data split "validation" to local.
INFO:truera.client.local.local_truera_workspace:Data split "validation" is added to local data collection "Abalone", and set as the data split for the workspace context.
INFO:truera.client.truera_workspace:Syncing segments groups from remote to local.


|          | 0.000% [00:00<?]

Uploading tmprr33dtrd.parquet (67.9KiB) -- ### -- file upload complete.
Put resource done.


INFO:truera.client.remote_truera_workspace:Waiting for data split to materialize...
INFO:truera.client.remote_truera_workspace:Materialize operation id: 5f85459f-b3b8-4526-8494-bb6a69c64f03 finished with status: SUCCEEDED.
INFO:truera.client.truera_workspace:Inferred error `score_type` to be "mean_absolute_error_for_regression"


generating and uploading error influences for split validation


INFO:truera.client.truera_workspace:Download temp_dir: /var/folders/xy/j480xtkx56dd7r7r1h8q8tl40000gn/T/tmp9qy7g0ab
INFO:truera.client.truera_workspace:Syncing data collection "Abalone" to local.
INFO:truera.client.truera_workspace:Syncing segments groups from remote to local.


|          | 0.000% [00:00<?]

Uploading tmp8unanarb.parquet (67.9KiB) -- ### -- file upload complete.
Put resource done.


INFO:truera.client.remote_truera_workspace:Waiting for data split to materialize...
INFO:truera.client.remote_truera_workspace:Materialize operation id: 81fb3e4c-8173-4be3-9fb3-b37e0cd163af finished with status: SUCCEEDED.


predictions generated for split test
adding predictions to data split test
Uploading tmprydaxcxw.parquet (8.1KiB) -- ### -- file upload complete.
Put resource done.


INFO:truera.client.remote_truera_workspace:Waiting for data split to materialize...
INFO:truera.client.remote_truera_workspace:Materialize operation id: 82ba68dc-f5e2-4d99-9d2e-a25675bcc19f finished with status: SUCCEEDED.


generating and uploading feature influences for split test


INFO:truera.client.truera_workspace:Download temp_dir: /var/folders/xy/j480xtkx56dd7r7r1h8q8tl40000gn/T/tmp9qy7g0ab
INFO:truera.client.truera_workspace:Syncing data collection "Abalone" to local.
INFO:truera.client.truera_workspace:Syncing data split "test" to local.
INFO:truera.client.local.local_truera_workspace:Data split "test" is added to local data collection "Abalone", and set as the data split for the workspace context.
INFO:truera.client.truera_workspace:Syncing segments groups from remote to local.


|          | 0.000% [00:00<?]

Uploading tmp988b6z_9.parquet (67.9KiB) -- ### -- file upload complete.
Put resource done.


INFO:truera.client.remote_truera_workspace:Waiting for data split to materialize...
INFO:truera.client.remote_truera_workspace:Materialize operation id: b79258de-2d18-4c2e-afac-6aa47a8678cc finished with status: SUCCEEDED.
INFO:truera.client.truera_workspace:Inferred error `score_type` to be "mean_absolute_error_for_regression"


generating and uploading error influences for split test


INFO:truera.client.truera_workspace:Download temp_dir: /var/folders/xy/j480xtkx56dd7r7r1h8q8tl40000gn/T/tmp9qy7g0ab
INFO:truera.client.truera_workspace:Syncing data collection "Abalone" to local.
INFO:truera.client.truera_workspace:Syncing segments groups from remote to local.


|          | 0.000% [00:00<?]

Uploading tmpiicpydcd.parquet (67.9KiB) -- ### -- file upload complete.
Put resource done.


INFO:truera.client.remote_truera_workspace:Waiting for data split to materialize...
INFO:truera.client.remote_truera_workspace:Materialize operation id: 41def48d-97e0-4ace-8240-5fef40d43fda finished with status: SUCCEEDED.


In [59]:
tru.tester.add_performance_test(test_name="MSE test",
                                data_split_names=tru.get_data_splits(), 
                                metric="MSE", 
                                warn_if_greater_than = 3,
                                fail_if_greater_than = 6)
tru.tester.add_performance_test(test_name="MAPE test",
                                data_split_names=tru.get_data_splits(), 
                                metric="MAPE", 
                                warn_if_greater_than = 8,
                                fail_if_greater_than = 16)
tru.tester.add_performance_test(test_name="MAE test",
                                data_split_names=tru.get_data_splits(), 
                                metric="MAE", 
                                warn_if_greater_than = 2,
                                fail_if_greater_than = 4)

metric_dict = tru.tester.get_model_test_results().as_dict()["Performance Tests"]
metric_df = pd.DataFrame(metric_dict["Rows"], columns=metric_dict["Column Names"])

report_dict = {
    "regression_metrics": {}
}

num_tests = 0
num_passed = 0
for metric in metric_df["Metric"].unique():
    num_tests += 1
    row = metric_df[(metric_df["Metric"] == metric) & (metric_df["Split"] == "test")].iloc[0]
    score = row['Score']
    print(f"Metric: {metric} \t Value: {score} \t Outcome: {row['Outcome']}")

    if row["Outcome"] == "PASSED":
        num_passed += 1

    report_dict["regression_metrics"][metric] = {"value": score}

report_dict["test_metrics"] = {
    "total_tests": num_tests,
    "num_passing": num_passed,
    "passing_percentage": num_passed / num_tests,
}
report_dict["test_results"] = metric_dict
print(f"Tests passing: {num_passed}/{num_tests}")

##############################
## END Truera-specific code ##
##############################

|          | 0.000% [00:00<?]

Metric: MSE 	 Value: 17.931528 	 Outcome: FAILED
Metric: MAPE 	 Value: 35.893898 	 Outcome: FAILED
Tests passing: 0/3
