In [50]:
import sys
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import pandas as pd
import glob
import os
import pandas as pd
import time
from truera.client.truera_workspace import TrueraWorkspace
from truera.client.truera_authentication import TokenAuthentication, BasicAuthentication
from truera.client.ingestion import ColumnSpec

### 1. Connect to TruEra

In [51]:
# connection details
TRUERA_URL = "https://app.truera.net"
# Prefer the TRUERA_AUTH_TOKEN environment variable so the real secret never has to be
# saved inside the notebook. The "XXXX" placeholder is only a fallback: replace it with
# the auth token copied from TruEra (or export the variable) before connecting.
AUTH_TOKEN = os.environ.get("TRUERA_AUTH_TOKEN", "XXXX")

### 2. Install Package

#### Create TruEra Client

In [52]:
# Both names are also imported in the notebook's first cell; they are repeated here.
from truera.client.truera_workspace import TrueraWorkspace
from truera.client.truera_authentication import TokenAuthentication

# Wrap the token from the connection-details cell in a token-based auth object.
auth = TokenAuthentication(AUTH_TOKEN)
# Open a workspace against TRUERA_URL.
# NOTE(review): ignore_version_mismatch=True presumably skips the client/server
# version check -- confirm against the TruEra client documentation.
tru = TrueraWorkspace(TRUERA_URL, auth, ignore_version_mismatch=True)
# Switch the workspace to its "remote" environment.
tru.set_environment("remote")

INFO:truera.client.remote_truera_workspace:Connecting to 'https://app.truera.net'


ValueError: Could not decode authentication token. Please check if provided token is malformed.

### Step 3: Download and Process sample data

In [56]:
date_format = '%Y-%m-%d %H:%M:%S'

def process_csv(file_name):
    """Load a sample CSV and reshape it for ingestion as production data.

    Steps applied to the loaded frame:
      * the ``timestamp`` column is overwritten with strings in ``date_format``,
        one minute apart, the first row falling one minute after "now minus
        three calendar months";
      * every ``bool`` column is replaced by strings prefixed with the column
        name (``"<column>_True"`` / ``"<column>_False"``);
      * ``id`` is replaced by strings prefixed with ``"id_"``;
      * the ``model`` column is dropped.

    Args:
        file_name: Anything ``pandas.read_csv`` accepts (path, URL or buffer).

    Returns:
        The processed ``pandas.DataFrame``.

    Raises:
        KeyError: If the CSV has no ``id`` or no ``model`` column.
    """
    df = pd.read_csv(file_name)

    # Making sure that we've data starting from last 3 months.
    # pd.DateOffset(months=3) is calendar-aware, so "three months ago" lands on
    # the same day of the month where that day exists.
    start_time = pd.Timestamp.now() - pd.DateOffset(months=3)

    # Change the datetime in each row: one-minute steps, built in a single
    # vectorised call and assigned to the whole column at once (no chained
    # indexing, which is unreliable under pandas copy-on-write).
    stamps = pd.date_range(
        start=start_time + pd.Timedelta(minutes=1), periods=len(df), freq="min"
    )
    df["timestamp"] = stamps.strftime(date_format).tolist()

    # Cast all bools to str. `.items()` replaces `.iteritems()`, which was
    # removed in pandas 2.0.
    for col_name, col_type in df.dtypes.items():
        if col_type == 'bool' and col_name not in ["timestamp"]:
            # add string prefix to prevent reading as object
            df[col_name] = col_name + "_" + df[col_name].astype("string")

    # add string prefix to prevent reading as object
    df['id'] = "id_" + df["id"].astype("string")

    return df.drop(columns=["model"])


In [57]:
import pandas as pd
import numpy as np
from itertools import combinations

# common parameters & constants
# Column names of the production frame, referenced when building the ColumnSpec.
_ID_COLUMN_NAME = 'id'  # passed as id_col_name
_TIMESTAMP_COLUMN_NAME = 'timestamp'  # passed as timestamp_col_name
_GENDER_COLUMN_NAME = 'gender'  # not referenced by the other visible cells
_LABEL_COLUMN_NAME = 'price'  # passed in label_col_names
_PREDICTION_COLUMN_NAME = 'prediction'  # passed in prediction_col_names
_EXTRA_COLUMN_NAME = 'lotAreaRaw'  # not referenced by the other visible cells

# Production sample: fetch the hosted CSV and run it through process_csv.
# If the frame carries an `__id__` column it is renamed to the id column name;
# when no such column exists the rename leaves the frame unchanged.
prod_data = process_csv(
    "https://quickstart-sample-data.s3.us-west-2.amazonaws.com/housing-price-prediction/gb_prod.csv"
).rename(columns={'__id__': _ID_COLUMN_NAME})

In [58]:
# Show the processed production frame as this cell's output.
prod_data

Unnamed: 0,timestamp,id,hasImage,statusType,statusText,countryCurrency,price,addressCity,addressState,isUndisclosedAddress,...,lotAreaValue,lotAreaUnit,priceReductionStr,lotAreaRaw,priceReductionRatio,ownerGender,ownerRace,latitude,longitude,prediction
0,2023-06-08 22:44:41,id_81641,True,FOR_SALE,Active,$,749000.0,Portland,OR,0,...,4792.0,sqft,0.0,4792.0,0.0,Male,Asian,45.0,-123.0,8.232298e+05
1,2023-06-08 22:45:41,id_81642,True,FOR_SALE,Active,$,445000.0,Portland,OR,0,...,4792.0,sqft,0.0,4792.0,0.0,Female,Other,45.0,-123.0,4.813908e+05
2,2023-06-08 22:46:41,id_81643,True,FOR_SALE,Active,$,295000.0,Portland,OR,0,...,3655.0,UNKNOWN,0.0,0.0,0.0,Male,Caucasian,46.0,-123.0,4.666216e+05
3,2023-06-08 22:47:41,id_81644,True,FOR_SALE,Active,$,899000.0,Portland,OR,0,...,3655.0,UNKNOWN,0.0,0.0,0.0,Female,African_American,46.0,-123.0,9.442756e+05
4,2023-06-08 22:48:41,id_81645,True,FOR_SALE,Active,$,599900.0,Portland,OR,0,...,7405.0,sqft,0.0,7405.0,0.0,Female,Caucasian,46.0,-123.0,6.655686e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,2023-06-19 08:39:41,id_96636,True,FOR_SALE,Multi-family home for sale,$,399900.0,Saint Paul,MN,0,...,3485.0,sqft,0.0,3485.0,0.0,Female,Hispanic,45.0,-93.0,3.699642e+05
14996,2023-06-19 08:40:41,id_96637,True,FOR_SALE,New construction,$,3250000.0,Saint Paul,MN,0,...,0.0,acres,0.0,17860.0,0.0,Male,Caucasian,45.0,-93.0,1.753757e+06
14997,2023-06-19 08:41:41,id_96638,True,FOR_SALE,Condo for sale,$,160000.0,Saint Paul,MN,0,...,871.0,sqft,0.0,871.0,0.0,Other,Caucasian,45.0,-93.0,1.208995e+05
14998,2023-06-19 08:42:41,id_96639,True,FOR_SALE,Condo for sale,$,395800.0,Saint Paul,MN,0,...,1742.0,sqft,0.0,1742.0,0.0,Female,Hispanic,45.0,-93.0,3.750846e+05


### 4. Create Project

In [47]:
# Project setup: select the project when it already exists, otherwise create it
# as a regression project.
project_name = "PROJECT_NAME_REGRESSION"  # Replace this with a project name of choice.
if project_name in tru.get_projects():
    tru.set_project(project_name)
else:
    tru.add_project(project=project_name, score_type="regression")

### 5. Add data collection and model

In [48]:
MODEL_NAME = "lr"

# The data collection is named after the model it belongs to.
data_collection_name = f"{MODEL_NAME}_dataset"

# Data collection: select it when present, otherwise create it.
if data_collection_name in tru.get_data_collections():
    tru.set_data_collection(data_collection_name)
else:
    tru.add_data_collection(data_collection_name=data_collection_name)

# Model: select it when present, otherwise register it.
if MODEL_NAME in tru.get_models():
    tru.set_model(MODEL_NAME)
else:
    tru.add_model(MODEL_NAME)

INFO:truera.client.remote_truera_workspace:Data collection in remote environment is now set to "lr_dataset". 
INFO:truera.client.remote_truera_workspace:Setting remote model context to "lr".


### Step 5: Add production data

In [49]:
from typing import Sequence
from truera.client.ingestion.util import ColumnSpec, ModelOutputContext

def columns_excluding(df: pd.DataFrame, exclude_columns: Sequence[str]):
    """Return the column names of ``df`` with ``exclude_columns`` taken out.

    The original column order is kept, names in ``exclude_columns`` that are
    not columns of ``df`` are ignored, and ``df`` itself is not modified.
    """
    remaining = df.columns.tolist()
    for unwanted in exclude_columns:
        try:
            # list.remove drops the first matching entry only.
            remaining.remove(unwanted)
        except ValueError:
            # Name is not (or no longer) among the remaining columns.
            continue
    return remaining

# Every column that is not the id, timestamp, prediction or label is passed
# as pre-transform data.
pre_column_names = columns_excluding(
    prod_data,
    [
        _ID_COLUMN_NAME,
        _TIMESTAMP_COLUMN_NAME,
        _PREDICTION_COLUMN_NAME,
        _LABEL_COLUMN_NAME,
    ],
)

# Upload the production frame together with the role of each column.
tru.add_production_data(
    prod_data,
    column_spec=ColumnSpec(
        id_col_name=_ID_COLUMN_NAME,
        timestamp_col_name=_TIMESTAMP_COLUMN_NAME,
        pre_data_col_names=pre_column_names,
        label_col_names=[_LABEL_COLUMN_NAME],
        prediction_col_names=[_PREDICTION_COLUMN_NAME],
    ),
)

INFO:truera.client.remote_truera_workspace:`model_output_context` will be inferred as it was not provided.
INFO:truera.client.remote_truera_workspace:Inferred ModelOutputContext: ModelOutputContext(model_name='lr', score_type='regression', background_split_name='', influence_type='')


Uploading tmph2cf0b8a.parquet (671.8KiB) -- ### -- file upload complete.
Put resource done.


INFO:truera.client.remote_truera_workspace:Waiting for data split to materialize...
INFO:truera.client.remote_truera_workspace:Materialize operation id: 630a4349-54a9-4a42-b075-207fdc3851ff finished with status: SUCCEEDED.


### Step 6: Create dashboard

Head to https://app.truera.net/home/monitoring to create the dashboard.