# A Simple Model

## Data Transformation Pipeline 1: Create encoders

In [2]:
#papermill_description=Starting_Up
from pathlib import Path

import pandas as pd
from hamilton import driver, base
from joblib import dump, load

In [3]:
import features_pipeline

In [4]:
# parameters
# Input CSV locations; both are read with pd.read_csv further down.
train_data: str = "data/train.csv"
test_data: str = "data/test.csv"

In [5]:
# Read the training and test CSVs (paths come from the parameters cell)
# into two separate DataFrames, in that order.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_data, test_data))

In [6]:
# Column roles used by the pipeline: row identifier, prediction target,
# and the categorical columns.
index_col = 'passengerid'
target_col = "survived"
cat_cols = ["sex", "cabin", "embarked"]

# Bundle the column roles into a single mapping under fixed keys.
config = dict(
    index_column=index_col,
    target_column=target_col,
    categorical_columns=cat_cols,
)

In [7]:
#papermill_description=Visualizing
# Build the driver over the features_pipeline module using the config above.
# The adapter is constructed with DictResult (the executed cell below shows
# the result coming back as a plain dict).
adapter = base.SimplePythonGraphAdapter(base.DictResult())
encode_dr = driver.Driver(config, features_pipeline, adapter=adapter)

# Only the 'encoders' node is requested.
output_nodes = ['encoders']

# Write the execution graph for the requested node to ./artifacts/encoder.dot.
# The third positional argument is an empty dict (presumably extra render
# options for the graph -- confirm against the Hamilton docs).
encode_dr.visualize_execution(
    output_nodes,
    './artifacts/encoder.dot',
    {},
    inputs={'df_train': df_train, 'df_test': df_test},
)

![Execution graph of the encoder pipeline](./encoder.dot.pdf)

In [8]:
#papermill_description=Running_Pipeline
# Run the driver for the 'encoders' node, feeding it both DataFrames,
# then display the result as the cell output.
out = encode_dr.execute(
    ['encoders'],
    inputs={'df_train': df_train, 'df_test': df_test},
)
out

Index(['sex', 'cabin', 'embarked'], dtype='object')


{'encoders': {'cabin_encoder': LabelEncoder(),
  'sex_encoder': LabelEncoder(),
  'embarked_encoder': LabelEncoder()}}

In [9]:
# Sanity check: show the classes_ attribute of the 'sex' encoder.
sex_encoder = out['encoders']['sex_encoder']
sex_encoder.classes_

array(['female', 'male'], dtype=object)

In [11]:
# Persist the pipeline result to disk.
# NOTE: the whole `out` dict is saved (not just out['encoders']), so whoever
# loads this file has to index ['encoders'] to reach the individual encoders.
encoder_path = "intermediate_data/encoder.joblib"

# dump() writes straight to this path, so make sure the target directory
# exists first; otherwise a Restart & Run All on a fresh checkout fails here.
Path(encoder_path).parent.mkdir(parents=True, exist_ok=True)

# Pass the original string path so the displayed return value is unchanged.
dump(out, encoder_path)

['intermediate_data/encoder.joblib']