# 4.1. load train/test data

In [1]:
import os
import pandas as pd

import wandb
from ex_wandb.utils.wandb_utils import wandb_log_artifact, get_wandb_artifact

from ex_wandb.ml_src.data import preprocess
from ex_wandb.ml_src.model import train_model, inference, compute_model_metrics

In [2]:
# Open the W&B run that the rest of the notebook fetches artifacts through
# and logs its config, summary metrics and model artifact to.
run = wandb.init(
    project='ex_census_wandb',
    job_type='model training',
)

[34m[1mwandb[0m: Currently logged in as: [33mwg_lucas[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
# Both splits live in the same "census_split" data artifact; only the file
# name differs between the two fetches.
split_request = dict(
    artifact_name="census_split",
    tag="latest",
    artifact_type='data',
)
_, train_file_path = get_wandb_artifact(run, file_name="census_train.csv", **split_request)
_, test_file_path = get_wandb_artifact(run, file_name="census_test.csv", **split_request)

# Show where the files were downloaded to (one path per line).
print(train_file_path, test_file_path, sep="\n")

# The files carry a .csv extension but are read as tab-separated UTF-8 text.
tsv_options = dict(sep='\t', encoding='utf-8')
train_df = pd.read_csv(train_file_path, **tsv_options)
test_df = pd.read_csv(test_file_path, **tsv_options)

./artifacts/census_split:v0/census_train.csv
./artifacts/census_split:v0/census_test.csv


# 4.2. load feature engineering artifacts (encoder, label binarizer)

In [4]:
from joblib import load

In [5]:
# Fetch `encoder.joblib` from the "feature_engineering" artifact. The
# artifact handle is kept as well because its metadata is read further down.
encoder_request = dict(
    artifact_name="feature_engineering",
    file_name="encoder.joblib",
    tag="latest",
    artifact_type='feature_engineering_artifact',
)
encoder_artifact, encoder_path = get_wandb_artifact(run, **encoder_request)

In [6]:
# Fetch `lb.joblib` (the label binarizer file) from the same
# "feature_engineering" artifact.
lb_request = dict(
    artifact_name="feature_engineering",
    file_name="lb.joblib",
    tag="latest",
    artifact_type='feature_engineering_artifact',
)
lb_artifact, lb_path = get_wandb_artifact(run, **lb_request)

In [7]:
# The feature-engineering artifact's metadata carries the column
# configuration used downstream: the categorical feature list and the label.
fe_metadata = encoder_artifact.metadata
cat_features = fe_metadata['categorical_feature']
label = fe_metadata['label']

# 4.3. Train model

In [8]:
# Hyper-parameters for the baseline model; passed to `train_model` through
# its `params` argument and recorded in the run config.
parameters = dict(
    n_estimators=300,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features="sqrt",
    max_depth=100,
    criterion="gini",
    bootstrap=False,
)

In [9]:
# Use the encoder and label binarizer downloaded from the
# "feature_engineering" artifact instead of fitting new ones on the fly.
# The run config records `encoder_path` / `lb_path` as the transforms behind
# this model, so the objects applied here must be the ones stored there.
# NOTE: joblib.load unpickles the file; only load artifacts that come from a
# trusted W&B project.
encoder = load(encoder_path)
lb = load(lb_path)

# Both splits go through the already-fitted transforms (training=False).
X_train, y_train, _, _ = preprocess(train_df, categorical_features=cat_features, label=label, training=False, encoder=encoder, lb=lb)
X_test, y_test, _, _ = preprocess(test_df, categorical_features=cat_features, label=label, training=False, encoder=encoder, lb=lb)

In [10]:
# Fit the baseline model on the preprocessed training split with the
# hyper-parameters from `parameters`.
model = train_model(
    X_train,
    y_train,
    params=parameters,
)

# 4.4. log model configs + parameters + performance

In [11]:
# Score the trained model on the test split and show the three metrics.
preds = inference(model, X_test)
test_metrics = compute_model_metrics(y_test, preds)
precision, recall, fbeta = test_metrics
print(precision, recall, fbeta)

0.7052117263843648 0.5660130718954248 0.6279912980420596


In [12]:
# Provenance of this training run: column configuration, hyper-parameters,
# and the local paths of the data and feature-engineering files it used.
metadata = {
    "categorical_feature": cat_features,
    "label": label,
    "param": parameters,
    "train_data_path": train_file_path,
    "test_data_path": test_file_path,
    "encoder": encoder_path,
    # Key was previously misspelled as "lable_binarizer".
    "label_binarizer": lb_path,
}

In [13]:
# Record the provenance dictionary in the W&B run's config.
run_config = run.config
run_config.update(metadata)

In [14]:
# Store the test-split metrics as run-level summary values.
for metric_name, metric_value in (
    ('precision', precision),
    ('recall', recall),
    ('fbeta', fbeta),
):
    run.summary[metric_name] = metric_value

# 4.5. save model artifact

In [15]:
from joblib import dump

In [16]:
model_file_path = "./../model/model.joblib"

# `dump` does not create missing parent directories, so make sure the target
# folder exists; on a fresh checkout it may not.
os.makedirs(os.path.dirname(model_file_path), exist_ok=True)

# Serialize the trained model; as the cell's last expression, the list of
# written file names returned by `dump` is displayed.
dump(model, model_file_path)

['./../model/model.joblib']

In [17]:
# Upload the serialized model as a "model" artifact of this run.
# NOTE(review): `remove_logged_file=True` presumably deletes the local file
# once it has been logged - confirm in ex_wandb.utils.wandb_utils.
model_files = [model_file_path]
wandb_log_artifact(
    run,
    "model",
    description="baseline RandomForest model",
    file_path=model_files,
    artifact_type="model_artifact",
    remove_logged_file=True,
)

In [18]:
# Close the W&B run opened at the top of the notebook; the recorded output
# below shows the upload progress and the run summary it prints.
run.finish()

VBox(children=(Label(value='57.520 MB of 57.520 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, m…

0,1
fbeta,0.62799
precision,0.70521
recall,0.56601
