# TruEra Python SDK Ingestion Demo: OJ Forecasting

## Pre-requisites: Install Truera Python Client from pypi
Install the wheel in your Python environment using `pip install truera`

Note: to use QII to accelerate Feature influence calculation (Shapley value estimation), ensure you have access to, and have installed, the corresponding truera-qii package from the Resources page of your TruEra workspace. 

## Pre-requisites: Quickstart Data 
If you are not using these scripts as a reference to ingest your own model & data, download the OJ sales data (see the README).

In [None]:
#!pip install truera

In [None]:
!pip list | grep truera

In [None]:
import pandas as pd
import numpy as np

import pickle
import random

import sklearn
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [None]:
from truera.client.truera_workspace import TrueraWorkspace

from truera.client.truera_authentication import TokenAuthentication
from truera.client.truera_authentication import BasicAuthentication

from truera.client.ingestion import ColumnSpec, ModelOutputContext

# Demo: model development
As an illustration, we train two scikit-learn regressors on pre-processed data here: a `RidgeCV` model and a `RandomForestRegressor`.
To explore on your own, you can use these models or load your own models & data.

## Data Prep

In [None]:
data = pd.read_csv('oj.csv')

In [None]:
data.shape

In [None]:
data.index.min(), data.index.max()

In [None]:
data.describe()

In [None]:
data.head()

In [None]:
timestamps = pd.to_datetime(data.week, unit='D',
               origin=pd.Timestamp('2023-06-01'))

In [None]:
#data= data.drop(columns='week')
data['datetime'] = timestamps

In [None]:
df = data.set_index('datetime').sort_index().reset_index()

In [None]:
df.index.min(), df.index.max()

In [None]:
df.head()

In [None]:
t1 = int(len(df)/3)
t1

In [None]:
df_train = df.iloc[:t1,:]
df_holdout = df.iloc[t1:,:]
df_train.shape, df_holdout.shape

In [None]:
#sanity check
len(df_train) + len(df_holdout) == len(df)

In [None]:
df_holdout.datetime.min() #simulated production start period

In [None]:
df_holdout.datetime.max()

## Prepare data for modeling
The following utility function is used in two places in this notebook:
1. Standalone, to generate training splits. Could be modified/improved to be more generalized on any set of training data, labels, and/or extra data of interest
2. In "split_data_export" function, for preparing production data simulations in correct format

In [None]:
def data_prep(input, extra_feat, target):
    """Split a raw frame into pre/post-transform features, labels and extra data.

    Args:
        input: DataFrame holding features, the target column and (optionally)
            extra columns. The parameter name shadows the builtin but is kept
            so existing keyword callers continue to work.
        extra_feat: Column name or list of column names to set aside (e.g. for
            segmentation) and exclude from the features, or ``None``.
        target: Name of the label column.

    Returns:
        Tuple ``(X_pre, X_post, y, extra_data)`` where ``X_pre`` is the input
        without the target, ``X_post`` additionally has every object-dtype
        column replaced by its one-hot encoding, ``y`` is the target column and
        ``extra_data`` is a DataFrame of the set-aside columns (``None`` when
        ``extra_feat`` is ``None``).
    """
    # Extra data - kept for segmentation, never trained upon.
    if extra_feat is not None:
        extra_cols = [extra_feat] if isinstance(extra_feat, str) else list(extra_feat)
        extra_data = input[extra_cols].copy()
        input = input.drop(columns=extra_cols)
    else:
        extra_data = None

    # One-hot encode features of type object -- be careful to understand the
    # dtypes of the "pre" data before relying on this selection.
    cats = input.select_dtypes(include=['object'])
    if cats.shape[1] > 0:
        print('The following variables will be one-hot encoded: '
              + ', '.join(str(col) for col in cats.columns))
        enc = OneHotEncoder(drop=None).fit(cats)
        encoded = enc.transform(cats)
        # The encoder returns a sparse matrix by default; densify explicitly
        # instead of passing the `sparse=` keyword, which newer scikit-learn
        # releases no longer accept.
        if hasattr(encoded, 'toarray'):
            encoded = encoded.toarray()

        # DataFrame of the encoded columns, aligned on the input's index.
        ohe_df = pd.DataFrame(encoded, columns=enc.get_feature_names_out(), index=input.index)
        # Concatenate with the original data, then drop the raw categoricals.
        input_post = pd.concat([input, ohe_df], axis=1).drop(columns=cats.columns)
    else:
        # Nothing to encode: the post-transform frame equals the input.
        input_post = input.copy()
    print(input.shape, input_post.shape)

    # Separate labels from the pre- and post-transform feature frames.
    y = input[target]
    X_pre = input.drop(columns=target)
    X_post = input_post.drop(columns=target)

    return X_pre, X_post, y, extra_data

## Generate data artifacts for training & TruEra ingestion

In [None]:
X_train_pre, X_train_post, y, extra_data = data_prep(df_train, None, 'logmove')


In [None]:
X_train_pre.to_csv('pre_train.csv',index=True)
X_train_post.to_csv('post_train.csv',index=True)
y.to_csv('labels_train.csv',index=True)

In [None]:
y

## Model Selection / Training

### V1: Ridge Regression
- Linear regression model as baseline
- Ridge Regression chosen for regularization of feature weights

In [None]:
# Time-ordered cross-validation folds; the rows were sorted by datetime when
# `df` was built, so each fold validates on later rows than it trains on.
tscv = TimeSeriesSplit(n_splits=5)

lin_reg = RidgeCV(cv=tscv)

# 'datetime' is carried along for ingestion only and is not a model feature.
lin_reg.fit(X_train_post.drop(columns=['datetime']), y)
print(lin_reg.best_score_)

# Persist the fitted model; the context manager closes the file handle.
with open('linreg.pkl', "wb") as model_file:
    pickle.dump(lin_reg, model_file)

## Model v2: random forest
- unfortunately, scikit-learn has no random forest estimator with built-in cross-validation (there is no counterpart to `RidgeCV`)
- so we simply train on the full training dataset and study the model's behavior in TruEra

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# No cross-validation here: the forest is fit once on the full training set.
# random_state is fixed so repeated runs of this cell build the same forest.
random_forest = RandomForestRegressor(verbose=1, n_jobs=-1, random_state=42)

# 'datetime' is carried along for ingestion only and is not a model feature.
random_forest.fit(X_train_post.drop(columns=['datetime']), y)

# Persist the fitted model; the context manager closes the file handle.
with open('rf.pkl', "wb") as model_file:
    pickle.dump(random_forest, model_file)

## Simulating Production Splits

In [None]:
def _split_rows(frame, n):
    """Cut a DataFrame/Series into n consecutive row chunks (np.array_split sizing)."""
    # Splitting row positions rather than the pandas object itself keeps every
    # chunk a pandas object and avoids numpy calling the deprecated `swapaxes`.
    return [frame.iloc[rows] for rows in np.array_split(np.arange(len(frame)), n)]


def split_data_export(n, prod_data, extra_feat, target):
    """Partition production data into ``n`` consecutive splits and save them as CSVs.

    The data is first run through ``data_prep`` and then cut row-wise into
    ``n`` chunks. Every chunk is written under ``./split_sim`` as
    ``pre_split_<date>.csv``, ``post_split_<date>.csv``, ``label_<date>.csv``,
    ``timestamp_pre_split_<date>.csv`` and, when extra features are requested,
    ``extra_<date>.csv``, where ``<date>`` is the first 10 characters of the
    chunk's first ``datetime`` value.

    Args:
        n: Number of splits to create (between 1 and the number of rows).
        prod_data: DataFrame with a ``datetime`` column, features and target.
        extra_feat: Column name(s) forwarded to ``data_prep``, or ``None``.
        target: Name of the label column.

    Returns:
        Tuple ``(pre_split_file_names, post_split_file_names, label_file_names,
        extra_file_names, monitoring_splits)``; the first four are lists of the
        CSV paths written and ``monitoring_splits`` holds one
        ``[min_timestamp, max_timestamp]`` pair per pre-split file.

    Raises:
        ValueError: If ``n`` is smaller than 1 or larger than the row count.
    """
    # Local import: this function is called before the notebook imports `os`.
    import os

    os.makedirs('./split_sim', exist_ok=True)

    # Prep data -- note the use of the data_prep utility function here.
    X_pre, X_post, y, extra_data = data_prep(prod_data, extra_feat, target)
    print(X_pre.shape, X_post.shape, y.shape)
    # Identity check: comparing a DataFrame to None with `!=` is element-wise
    # and cannot be used as an `if` condition.
    has_extra = extra_data is not None
    if has_extra:
        print(extra_data.shape)

    if n < 1 or n > len(X_pre):
        raise ValueError(
            'n must be between 1 and the number of rows ({}), got {}'.format(len(X_pre), n))

    X_splits = _split_rows(X_pre, n)
    X_post_splits = _split_rows(X_post, n)
    y_splits = _split_rows(y, n)
    extra_splits = _split_rows(extra_data, n) if has_extra else None

    # Name every chunk after the date of its first row.
    # NOTE: chunks that start on the same date share a key, so a later chunk
    # replaces an earlier one and fewer than n files are written.
    pre_split_dict = {}
    post_split_dict = {}
    labels_dict = {}
    extra_dict = {}
    for i in range(n):
        date = str(X_splits[i].datetime.iloc[0])[:10]
        pre_split_dict["pre_split_{0}".format(date)] = X_splits[i]
        post_split_dict["post_split_{0}".format(date)] = X_post_splits[i]
        labels_dict["label_{0}".format(date)] = y_splits[i]
        if has_extra:
            extra_dict["extra_{0}".format(date)] = extra_splits[i]

    # Pre-transform data, plus one timestamp file and one [start, end] window
    # per split to help drive the monitoring simulation.
    pre_split_file_names = []
    monitoring_splits = []
    for key, value in pre_split_dict.items():
        split_name = './split_sim/{}.csv'.format(key)
        pre_split_file_names.append(split_name)
        value.to_csv(split_name, index=True)

        # Renamed copy keeps the 'timestamp' CSV header without adding a
        # column to the split itself.
        timestamps = value['datetime'].rename('timestamp')
        timestamps.to_csv('./split_sim/timestamp_' + str(key) + '.csv', index=True)
        monitoring_splits.append([timestamps.min(), timestamps.max()])

    # Post-transform data.
    post_split_file_names = []
    for key, value in post_split_dict.items():
        post_split_name = './split_sim/{}.csv'.format(key)
        post_split_file_names.append(post_split_name)
        value.to_csv(post_split_name, index=True)

    # Labels.
    label_file_names = []
    for key, value in labels_dict.items():
        label_name = './split_sim/{}.csv'.format(key)
        label_file_names.append(label_name)
        value.to_csv(label_name, index=True)

    # Extra data (empty list when no extra features were requested).
    extra_file_names = []
    for key, value in extra_dict.items():
        extra_name = './split_sim/{}.csv'.format(key)
        extra_file_names.append(extra_name)
        value.to_csv(extra_name, index=True)

    return pre_split_file_names, post_split_file_names, label_file_names, extra_file_names, monitoring_splits

In [None]:
df_holdout.shape

In [None]:
start = min(df_holdout.datetime)
start

In [None]:
end = max(df_holdout.datetime)
end

In [None]:
(end-start).days

### Persist simulated production splits, for future use
- use split_data_export function to simulate n splits from holdout dataframe

In [None]:
pre_split_file_names, \
post_split_file_names, \
label_file_names, \
extra_file_names, \
monitoring_splits  = split_data_export(80, df_holdout, None, 'logmove')

In [None]:
monitoring_splits[:10]

# TruEra SDK
## Create Project
A project is a collection of models and datasets solving a single problem statement.
Users can be provided access to collaborate on a project.

In [None]:
# connection details
TRUERA_URL = "https://app.truera.net"
AUTH_TOKEN = "<INSERT AUTH TOKEN>"

In [None]:
auth = TokenAuthentication(AUTH_TOKEN)
tru = TrueraWorkspace(TRUERA_URL, auth, ignore_version_mismatch=True)

In [None]:
tru.get_projects()

In [None]:
project_name = "Sales Forecasting v1-2"

In [None]:
tru.add_project(project_name, score_type="regression")

In [None]:
tru.get_projects()

In [None]:
tru.set_model_execution("local")

# Adding a Data Collection
A data collection organizes data by schema within a TruEra project

* Data splits: A set of in-sample data (train, test, validate) or out-of-sample (OOS) / out-of-time (OOT) data to test model quality, stability and generalizability.
* Feature Metadata: An (optional) set of metadata defining the set of features for a set of splits and the various models trained and evaluated on them. This allows you to group features and provide feature descriptions for use throughout the tool.

Note that all splits associated with a data collection are assumed to follow the same set of features. As a general rule of thumb, if a model can read one split in a data collection it should be able to read all other splits in the data collection.

Reference: https://docs.microsoft.com/en-us/azure/open-datasets/dataset-oj-sales-simulated?tabs=azureml-opendatasets

In [None]:
# Map each pre-transform feature to the post-transform columns derived from it.
FEATURE_MAP = {}
pre_columns = list(X_train_pre.columns)
for post in X_train_post.drop(columns='datetime').columns:
    # A post column belongs to the longest pre column name that prefixes it
    # (the first such name on ties); None when no pre column matches.
    candidates = [pre for pre in pre_columns if post.startswith(pre)]
    mapped = max(candidates, key=len, default=None)
    FEATURE_MAP.setdefault(mapped, []).append(post)

In [None]:
FEATURE_MAP

In [None]:
tru.set_project(project_name)

In [None]:
data_collection_name='OJ Sales Data'
tru.add_data_collection(data_collection_name, pre_to_post_feature_map=FEATURE_MAP, provide_transform_with_model=False)

# Add data and models to project
A data collection is a container for two related things:

* Data splits: A set of in-sample data (train, test, validate) or out-of-sample (OOS) / out-of-time (OOT) data to test model quality, stability and generalizability.
* Feature Metadata: An (optional) set of metadata defining the set of features for a set of splits and the various models trained and evaluated on them. This allows you to group features and provide feature descriptions for use throughout the tool.

Note that all splits associated with a data collection are assumed to follow the same set of features. As a general rule of thumb, if a model can read one split in a data collection it should be able to execute against all other splits in the data collection.

In [None]:
tru.activate_client_setting('create_model_tests_on_split_ingestion')

In [None]:
X_train_pre = X_train_pre.reset_index()
X_train_post = X_train_post.reset_index()

In [None]:
y_df = y.to_frame().reset_index()

In [None]:
tru

## Uploading one or more data splits
Now we can upload some data to our data collection to prepare for analyzing the model.
Here we upload the entire data as an "all" split type. We could choose to upload just the train or test datasets as "train" or "test" split types. 
At least one "train" or "all" split is required for generating analysis. You can have 0 or more splits of other kinds. 
You upload a split by providing:
 * A friendly name to identify the split (required).
 * Input data in the shape the model expects (required). This can be a pandas DataFrame.
 * Labels/target ground-truth values (optional). It is strongly recommended to provide labels when available.

In [None]:
from truera.client.ingestion.util import merge_dataframes_and_create_column_spec

In [None]:
data_df, column_spec = merge_dataframes_and_create_column_spec(
                        id_col_name='index',
                        timestamp_col_name='datetime',
                        pre_data=X_train_pre,
                        post_data=X_train_post,
                        labels=y_df)
print(data_df.columns)

In [None]:
column_spec

In [None]:
data_df.shape

In [None]:
tru.add_data(
        data_split_name='training data',
        data=data_df,
        column_spec=column_spec)

In [None]:
X_val_pre = pd.read_csv('./split_sim/pre_split_2023-08-20.csv',index_col=0).reset_index()
X_val_post = pd.read_csv('./split_sim/post_split_2023-08-20.csv',index_col=0).reset_index()
y_val = pd.read_csv('./split_sim/label_2023-08-20.csv',index_col=0).reset_index()
#extra_val = pd.read_csv('./split_sim/extra_1.csv', index_col='datetime')

In [None]:
X_val_pre.head()

In [None]:
val_data_df, column_spec = merge_dataframes_and_create_column_spec(
                        id_col_name='index',
                        timestamp_col_name='datetime',
                        pre_data=X_val_pre,
                        post_data=X_val_post,
                        labels=y_val)

In [None]:
tru.add_data(
        data_split_name='validation data',
        data=val_data_df,
        column_spec=column_spec)

In [None]:
X_test_pre = pd.read_csv('./split_sim/pre_split_2023-08-21.csv',index_col=0).reset_index()
X_test_post = pd.read_csv('./split_sim/post_split_2023-08-21.csv',index_col=0).reset_index()
y_test = pd.read_csv('./split_sim/label_2023-08-21.csv',index_col=0).reset_index()
#extra_test = pd.read_csv('./split_sim/extra_2023-08-21.csv', index_col='datetime')

In [None]:
test_data_df, column_spec = merge_dataframes_and_create_column_spec(
                        id_col_name='index',
                        timestamp_col_name='datetime',
                        pre_data=X_test_pre,
                        post_data=X_test_post,
                        labels=y_test)

In [None]:
tru.add_data(
        data_split_name='test data',
        data=test_data_df,
        column_spec=column_spec)

## Uploading a model
This is the last step before we can start analyzing the model in TruEra dashboards.
Model type and dependency versions are automatically inferred from the environment and the model object. A friendly name is provided to be able to find the model in the Truera dashboard and be able to work with it in the future.
The model is automatically attached to the current data collection, set by invoking `set_data_collection`.

In [None]:
model_name = 'Ridge Regression'
tru.add_python_model(model_name, lin_reg)

In [None]:
tru.get_data_collections()

## **Monitoring requirement: add new data collection for second model**
- 1:1 dc:model req for monitoring
- duplicated development data into 2nd data collection
  - this is required to have access to FIs for all 6 model-dev_split combinations (see next section)

## New Data Collection for RF prod data
Notes:
1. adding prod FIs: Adding feature infs for prod data happens under "virtual" model. 
    a. this requires a background split in the data collection
    b. re-used code from first add_data call, earlier in script, to create train dataframe and associated column spec for background (training) split
    c. this is more duplicate data -- training split already exists in the other dc.
    
    
2. add model: in testing -- did not need to, and could not, add the RF model _with the same name_ to this data collection  
    a. Models don't appear constrained to data collections anymore -- tru.get_models() listed both RF and Ridge Regression model, even though I was in a different (new) data collection.  
    b. on further inspection, the existing 'Random Forest Regressor' model was able to be referenced by ModelOutputContext***, even though it wasn't in the same data collection. BUT, the new data collection I've created below, shows no associated models.
    
    ***perhaps "referenced by" is incorrect -- it may be that this is a silent failure / nothing happens


In [None]:
tru.add_data_collection("OJ Sales Data RF", pre_to_post_feature_map=FEATURE_MAP, provide_transform_with_model=False)

In [None]:
model_name = 'Random Forest Regressor'
tru.add_python_model(model_name, random_forest)

In [None]:
data_df, column_spec = merge_dataframes_and_create_column_spec(
                        id_col_name='index',
                        timestamp_col_name='datetime',
                        pre_data=X_train_pre,
                        post_data=X_train_post,
                        labels=y_df)
print(data_df.columns)

In [None]:
tru.add_data(
        data_split_name='training data',
        data=data_df,
        column_spec=column_spec)

In [None]:
tru.add_data(
        data_split_name='validation data',
        data=val_data_df,
        column_spec=column_spec)

In [None]:
tru.add_data(
        data_split_name='test data',
        data=test_data_df,
        column_spec=column_spec)

----

# Compute & Upload dev split Feature Influences using TruEra QII

In [None]:
tru.set_model("Ridge Regression")
tru.set_data_split("training data")

In [None]:
lr_train_feat_infs = tru.compute_feature_influences()
## Note: we need predictions, to generate feature influences. 
## In other words, (some) predictions are being generated as part of this call

In [None]:
tru.set_model("Ridge Regression")
tru.set_data_split("validation data")

In [None]:
lr_val_feat_infs = tru.compute_feature_influences()

In [None]:
tru.set_model("Ridge Regression")
tru.set_data_split("test data")

In [None]:
lr_test_feat_infs = tru.compute_feature_influences()

In [None]:
tru.set_model("Random Forest Regressor")
tru.set_data_split("training data")

In [None]:
rf_train_feat_infs = tru.compute_feature_influences()

In [None]:
tru.set_model("Random Forest Regressor")
tru.set_data_split("validation data")

In [None]:
rf_val_feat_infs = tru.compute_feature_influences()

In [None]:
tru.set_model("Random Forest Regressor")
tru.set_data_split("test data")

In [None]:
rf_test_feat_infs = tru.compute_feature_influences()

# Adding Predictions

### Note: make sure to push predictions to the correct data collection, aligning to its model
- RISK: add predictions to incorrect data collections
- If so, they will end up as orphan predictions -- not associated with any data
- currently, the SDK does not prevent this 

### "Dev" DC: Linear Regression Model

In [None]:
tru.get_data_collections()

In [None]:
#this dc contains lin_reg / ridge regression model
tru.set_data_collection("OJ Sales Data")

#### Training Data

In [None]:
preds = lin_reg.predict(X_train_post.drop(columns=['datetime','index']))

In [None]:
#note - use column 'index', not actual index of dataframe
preds_df = pd.DataFrame(preds, columns = ['logmove'], index=[X_train_post['index'], X_train_post.datetime])

In [None]:
lr_train_preds= preds_df.reset_index()

In [None]:
tru.add_data(
    data=lr_train_preds,
    data_split_name="training data",
    column_spec=ColumnSpec(
        id_col_name="index",
        timestamp_col_name='datetime',
        prediction_col_names='logmove'),
        
    model_output_context=ModelOutputContext(
        model_name="Ridge Regression",
        score_type='regression')
    )

In [None]:
tru

In [None]:
#need this for drift calculations in druid
tru.set_model("Ridge Regression")
tru.add_model_metadata(train_split_name='training data')

#### Validation Data

In [None]:
preds = lin_reg.predict(X_val_post.drop(columns=['datetime','index']))

In [None]:
preds_df = pd.DataFrame(preds, columns = ['logmove'], index=[X_val_post['index'], X_val_post.datetime])

In [None]:
lr_val_preds = preds_df.reset_index() #index as column

In [None]:
tru.add_data(
    data=lr_val_preds,
    data_split_name="validation data",
    column_spec=ColumnSpec(
        id_col_name="index",
        timestamp_col_name='datetime',
        prediction_col_names='logmove'
    ),
    model_output_context=ModelOutputContext(
        model_name="Ridge Regression",
        score_type='regression')
)

#### Test Data

In [None]:
preds = lin_reg.predict(X_test_post.drop(columns=['datetime','index']))

In [None]:
preds_df = pd.DataFrame(preds, columns = ['logmove'], index=[X_test_post['index'], X_test_post.datetime])

In [None]:
lr_test_preds = preds_df.reset_index()

In [None]:
tru.add_data(
    data=lr_test_preds,
    data_split_name="test data",
    column_spec=ColumnSpec(
        id_col_name="index",
        timestamp_col_name='datetime',
        prediction_col_names='logmove'
    ),
    model_output_context=ModelOutputContext(
        model_name="Ridge Regression",
        score_type='regression')
)

### "Prod" DC: 2nd DC that contains Random Forest Regressor
- satisfy 1:1 DC:Model req from monitoring

In [None]:
#this DC contains Random Forest Regressor / RF model
tru.set_data_collection("OJ Sales Data RF")

#### Training Data

In [None]:
preds = random_forest.predict(X_train_post.drop(columns=['index','datetime']))

In [None]:
preds_df = pd.DataFrame(preds, columns = ['logmove'], index=[X_train_post['index'],X_train_post.datetime])

In [None]:
rf_train_preds = preds_df.reset_index()

In [None]:
tru.add_data(
    data=rf_train_preds,
    data_split_name="training data",
    column_spec=ColumnSpec(
        id_col_name="index",
        timestamp_col_name='datetime',
        prediction_col_names='logmove'
    ),
    model_output_context=ModelOutputContext(
        model_name="Random Forest Regressor",
        score_type='regression')
)

In [None]:
#need this for drift calculations in druid
tru.set_model("Random Forest Regressor")
tru.add_model_metadata(train_split_name='training data')

#### Validation Data

In [None]:
preds = random_forest.predict(X_val_post.drop(columns=['index','datetime']))

In [None]:
preds_df = pd.DataFrame(preds, columns = ['logmove'], index=[X_val_post['index'], X_val_post.datetime])

In [None]:
rf_val_preds = preds_df.reset_index()

In [None]:
tru.add_data(
    data=rf_val_preds,
    data_split_name="validation data",
    column_spec=ColumnSpec(
        id_col_name="index",
        timestamp_col_name='datetime',
        prediction_col_names='logmove'
    ),
    model_output_context=ModelOutputContext(
        model_name="Random Forest Regressor",
        score_type='regression')
)

#### Test Data

In [None]:
preds = random_forest.predict(X_test_post.drop(columns=['index','datetime']))

In [None]:
preds_df = pd.DataFrame(preds, columns = ['logmove'], index=[X_test_post['index'], X_test_post.datetime])

In [None]:
rf_test_preds = preds_df.reset_index()

In [None]:
tru.add_data(
    data=rf_test_preds,
    data_split_name="test data",
    column_spec=ColumnSpec(
        id_col_name="index",
        timestamp_col_name='datetime',
        prediction_col_names='logmove'
    ),
    model_output_context=ModelOutputContext(
        model_name="Random Forest Regressor",
        score_type='regression')
)

# Monitoring: Production Data

In [None]:
df_mon_splits = pd.DataFrame.from_records(monitoring_splits, columns =['start','end'])

In [None]:
df_mon_splits[:5]

In [None]:
import glob
import os

In [None]:
from datetime import datetime

In [None]:
def load_prod_data(start, end):
    """Load the simulated production splits and keep rows inside a date window.

    Every ``pre_split_*``, ``post_split_*`` and ``label_*`` CSV found in
    ``./split_sim`` is read (in sorted file-name order), stacked, and merged
    via ``merge_dataframes_and_create_column_spec``.

    Args:
        start: First day to keep, as a ``'YYYY-MM-DD'`` string (inclusive).
        end: Last day to keep, as a ``'YYYY-MM-DD'`` string (inclusive).

    Returns:
        Tuple ``(prod_data_df, column_spec)``: the merged rows whose
        ``datetime`` (converted to a date) lies in the window, and the column
        spec returned by the merge helper.
    """
    window_start = datetime.strptime(start, '%Y-%m-%d').date()
    window_end = datetime.strptime(end, '%Y-%m-%d').date()
    print(type(window_start))

    def _read_all(pattern):
        # Read every matching CSV in sorted file-name order and stack them.
        paths = sorted(glob.glob(os.path.join('./split_sim', pattern)))
        frames = [pd.read_csv(path, index_col=0).reset_index() for path in paths]
        return pd.concat(frames, ignore_index=True)

    X_prod = _read_all('pre_split_*.csv')
    X_prod_post = _read_all("post_split_*.csv")
    y_prod = _read_all("label_*.csv")

    prod_data_df, column_spec = merge_dataframes_and_create_column_spec(
        id_col_name='index',
        timestamp_col_name='datetime',
        pre_data=X_prod,
        post_data=X_prod_post,
        labels=y_prod)

    # Compare on calendar dates so the window bounds are inclusive whole days.
    prod_data_df['datetime'] = pd.to_datetime(prod_data_df['datetime']).dt.date
    on_or_after_start = prod_data_df['datetime'] >= window_start
    on_or_before_end = prod_data_df['datetime'] <= window_end
    prod_data_df = prod_data_df[on_or_after_start & on_or_before_end]

    print(prod_data_df.datetime.min())
    print(prod_data_df.datetime.max())
    print(prod_data_df.shape)
    print(column_spec)
    return prod_data_df, column_spec

In [None]:
prod_data_df, column_spec = load_prod_data('2023-08-24', '2023-09-20')

## Prod Predictions

In [None]:
prod_data_df.columns

In [None]:
column_spec.post_data_col_names

In [None]:
def generate_prod_preds(model, data, post_data_col_names=None):
    """Score production rows with ``model`` and key the predictions by id and timestamp.

    Args:
        model: Fitted estimator exposing ``predict``.
        data: DataFrame with ``index`` and ``datetime`` columns plus the
            post-transform feature columns the model consumes.
        post_data_col_names: Names of the feature columns to pass to the
            model. When omitted, falls back to the notebook-global
            ``column_spec.post_data_col_names``; pass it explicitly to avoid
            depending on whichever cell last reassigned ``column_spec``.

    Returns:
        DataFrame with columns ``index``, ``datetime`` and ``preds``, one row
        per row of ``data``.
    """
    if post_data_col_names is None:
        post_data_col_names = column_spec.post_data_col_names

    # Keep only the model's feature columns, in the order they appear in `data`.
    features = data.loc[:, data.columns.isin(list(post_data_col_names))]
    preds = model.predict(features)

    row_keys = pd.MultiIndex.from_arrays(
        [data['index'], data['datetime']], names=['index', 'datetime'])
    preds_df = pd.DataFrame(preds, columns=['preds'], index=row_keys).reset_index()
    print(preds_df.shape)

    return preds_df

In [None]:
lr_prod_preds = generate_prod_preds(lin_reg, prod_data_df)

In [None]:
rf_prod_preds = generate_prod_preds(random_forest, prod_data_df)

## Prod Feature Infs

In [None]:
tru.get_data_collections()

In [None]:
tru.set_data_collection('OJ Sales Data')
tru.set_model('Ridge Regression')

In [None]:
tru

In [None]:
LR_explainer = tru.get_explainer()

In [None]:
prod_data_df.columns

In [None]:
column_spec.pre_data_col_names

In [None]:
column_spec.post_data_col_names

In [None]:
column_spec.label_col_names

In [None]:
prod_data_df[column_spec.pre_data_col_names].shape

In [None]:
?explainer.compute_feature_influences_for_data

In [None]:
LR_prod_FIs = LR_explainer.compute_feature_influences_for_data(pre_data = prod_data_df[column_spec.pre_data_col_names], 
                                                         post_data = prod_data_df[column_spec.post_data_col_names], 
                                                         ys = prod_data_df[column_spec.label_col_names])

In [None]:
# Attach the row ids to the feature influences.
# NOTE(review): assigning a Series aligns on the row index, and prod_data_df was
# date-filtered without resetting its index -- presumably LR_prod_FIs carries the
# same index; TODO confirm, otherwise ids end up misaligned or NaN.
LR_prod_FIs['index'] = prod_data_df[column_spec.id_col_name]

In [None]:
LR_prod_FIs

In [None]:
tru

### Merge data, predictions, and feature influence

In [None]:
lr_prod_data_df, lr_column_spec = merge_dataframes_and_create_column_spec(
                        id_col_name='index',
                        timestamp_col_name='datetime',
                        pre_data = prod_data_df[column_spec.pre_data_col_names+[column_spec.id_col_name]], 
                        post_data = prod_data_df[column_spec.post_data_col_names+[column_spec.id_col_name]], 
                        labels = prod_data_df[column_spec.label_col_names+[column_spec.id_col_name]],
                        predictions = lr_prod_preds,
                        feature_influences = LR_prod_FIs)

In [None]:
model_name = 'Ridge Regression'
tru.add_production_data(
        data=lr_prod_data_df,
        column_spec=lr_column_spec,
        model_output_context=ModelOutputContext(
            model_name=model_name,
            background_split_name='training data',
            influence_type='truera-qii',
            score_type='regression'))

In [None]:
tru.set_data_collection("OJ Sales Data RF")
tru.set_model("Random Forest Regressor")

In [None]:
RF_explainer = tru.get_explainer()

#####  for FI sampling
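Use the following code snippet to compute feature influences for the production data. For a large dataset, select a random sample of rows first to reduce the compute cost & time needed to generate sufficient feature influences for analysis; note that the cell below does not sample and runs on the full `prod_data_df`.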

In [None]:
RF_prod_FIs = RF_explainer.compute_feature_influences_for_data(pre_data = prod_data_df[column_spec.pre_data_col_names],
                                                                post_data = prod_data_df[column_spec.post_data_col_names],
                                                                ys = prod_data_df[column_spec.label_col_names])

In [None]:
# Attach the row ids to the feature influences.
# NOTE(review): assigning a Series aligns on the row index, and prod_data_df was
# date-filtered without resetting its index -- presumably RF_prod_FIs carries the
# same index; TODO confirm, otherwise ids end up misaligned or NaN.
RF_prod_FIs['index'] = prod_data_df[column_spec.id_col_name]

In [None]:
rf_prod_data_df, rf_column_spec = merge_dataframes_and_create_column_spec(
                        id_col_name='index',
                        timestamp_col_name='datetime',
                        pre_data = prod_data_df[column_spec.pre_data_col_names+[column_spec.id_col_name]], 
                        post_data = prod_data_df[column_spec.post_data_col_names+[column_spec.id_col_name]], 
                        labels = prod_data_df[column_spec.label_col_names+[column_spec.id_col_name]],
                        predictions = rf_prod_preds,
                        feature_influences = RF_prod_FIs)

In [None]:
model_name = 'Random Forest Regressor'
tru.add_production_data(
        data=rf_prod_data_df,
        column_spec=rf_column_spec,
        model_output_context=ModelOutputContext(
            model_name=model_name,
            background_split_name='training data',
            influence_type='truera-qii',
            score_type='regression'))

In [None]:
LR_prod_FIs.to_csv('lr_prod_FIs.csv',index=True)

In [None]:
RF_prod_FIs.to_csv('rf_prod_FIs.csv',index=True)