# 3.1. load train/test data

In [None]:
import os
import pandas as pd

import wandb
from utils.wandb_utils import wandb_log_artifact, get_wandb_artifact

from ml_src.data import preprocess

In [None]:
# Open the W&B run object that the later cells pass to the artifact helpers.
run = wandb.init(
    project="ex_census_wandb",
    job_type="feature engineering",
)

In [None]:
# Ask the project helper for "census_train.csv" from the "latest" version of
# the "census_split" data artifact; only the returned file path is used here.
_, train_file_path = get_wandb_artifact(
    run,
    artifact_name="census_split",
    file_name="census_train.csv",
    tag="latest",
    artifact_type="data",
)
print(train_file_path)

# The file is parsed as tab-separated UTF-8 text despite its .csv extension.
train_df = pd.read_csv(
    train_file_path,
    sep="\t",
    encoding="utf-8",
)

# 3.2. feature engineering + log feature engineering artifacts

Artifacts to log:

- one-hot encoder
- label binarizer

In [None]:
from joblib import dump

In [None]:
# Name of the target column handed to preprocess().
label = "salary"

# Columns handed to preprocess() as categorical features.
cat_features = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]

# training=True: preprocess() also returns the encoder and label binarizer,
# which are saved and logged as artifacts in the following cells.
X_train, y_train, encoder, lb = preprocess(
    train_df,
    categorical_features=cat_features,
    label=label,
    training=True,
)

In [None]:
# Local files the encoder and label binarizer are serialized to before being
# logged as a W&B artifact.
encoder_file_path = "./model/encoder.joblib"
label_binarizer_file_path = "./model/lb.joblib"

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists (a fresh checkout may not contain ./model yet).
os.makedirs(os.path.dirname(encoder_file_path), exist_ok=True)
os.makedirs(os.path.dirname(label_binarizer_file_path), exist_ok=True)

dump(encoder, encoder_file_path)
dump(lb, label_binarizer_file_path)

In [None]:
# Derive the provenance of the training data from where the file was
# downloaded. The path is expected to end in
# <artifact_name>:<version>/<file_name>; reading the last two path components
# (rather than fixed indices of a "/"-split) keeps this correct when the
# download root is absolute or nested at a different depth.
train_file_name = os.path.basename(train_file_path)
train_artifact_dir = os.path.basename(os.path.dirname(train_file_path))
train_artifact_name, colon, train_artifact_ver = train_artifact_dir.partition(":")
if not colon:
    # Fail with a clear message instead of logging misleading metadata.
    raise ValueError(
        "cannot parse '<name>:<version>' from artifact directory "
        f"{train_artifact_dir!r} (train file path: {train_file_path!r})"
    )

# Metadata stored alongside the logged artifact: the feature/label
# configuration plus the source of the training data.
metadata = {
    "categorical_feature": cat_features,
    "label": label,
    "train_artifact_name": train_artifact_name,
    "train_artifact_ver": train_artifact_ver,
    "train_file_name": train_file_name,
}

# Log both serialized objects as a single "feature_engineering" artifact.
wandb_log_artifact(
    run,
    "feature_engineering",
    description="OneHotEncoder, LabelBinarizer",
    file_path=[encoder_file_path, label_binarizer_file_path],
    artifact_type="feature_engineering_artifact",
    metadata=metadata,
    remove_logged_file=True,
)

In [None]:
# Close the W&B run opened at the top of the notebook.
run.finish()