# A Simple Model

In [1]:
import pandas as pd
from joblib import dump, load
from hamilton import driver, base

In [2]:
import features_pipeline


In [3]:
# Column roles for the dataset; `config` bundles them for the pipeline driver.
index_col = "passengerid"
target_col = "survived"
cat_cols = ["sex", "cabin", "embarked"]

config = dict(
    index_column=index_col,
    target_column=target_col,
    categorical_columns=cat_cols,
)

## Data Transformation Pipeline 2: Use encoders to transform data.

In [4]:
# parameters
# Defaults for a run over the training data; each value is a plain string.
datatype: str = "train"  # label interpolated into the output file names
data: str = "data/train.csv"  # CSV file read into the input DataFrame
encoder_file: str = "intermediate_data/encoder.joblib"  # joblib dump holding the encoders

In [5]:
# Read back the joblib dump; the next cell expects an 'encoders' entry in it.
# NOTE(review): joblib.load unpickles its input, so only point this at files you trust.
out = load(filename=encoder_file)

In [6]:
# Re-key the loaded encoders with every underscore removed from the name
# (presumably so the keys match the pipeline's input names — confirm against
# features_pipeline).
inputencoders = {
    name.replace("_", ""): encoder
    for name, encoder in out["encoders"].items()
}

In [7]:
# Display the re-keyed encoder mapping to check the resulting key names.
inputencoders

{'cabinencoder': LabelEncoder(),
 'sexencoder': LabelEncoder(),
 'embarkedencoder': LabelEncoder()}

In [8]:
# Load the raw CSV named by the `data` parameter into a DataFrame.
df = pd.read_csv(filepath_or_buffer=data)

### Transform the training set

In [9]:
# Build a Hamilton driver over the feature pipeline module with the shared config.
transform_dr = driver.Driver(config, features_pipeline)

# Runtime inputs: the raw frame plus one keyword entry per loaded encoder.
ddf = dict(df=df, **inputencoders)

# Render the execution graph for the requested output; the empty dict is the
# render-kwargs argument.
dag_path = f"./artifacts/trans{datatype}.dot"
transform_dr.visualize_execution(["final_imputed_features"], dag_path, {}, inputs=ddf)

In [10]:
from IPython.display import Image, Markdown

In [11]:
# Embed the rendered graph as a markdown image link.
# NOTE(review): this relative link has no "artifacts/" prefix, unlike the path
# given to visualize_execution — confirm it resolves from wherever the executed
# notebook is stored.
diagram_link = f"![](./trans{datatype}.dot.pdf)"
Markdown(diagram_link)

![](./transtrain.dot.pdf)

In [12]:
# Execute the pipeline and keep the requested output for inspection and saving.
outputdf = transform_dr.execute(
    final_vars=["final_imputed_features"],
    inputs=ddf,
)

<class 'pandas.core.frame.DataFrame'>


In [13]:
# Display the transformed features; the rendered table below elides the middle rows.
outputdf

Unnamed: 0_level_0,pclass,age,fare,cabin_category,sex_category,embarked_category,family
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1,0.00,27.14,2,1,2,2
1,3,0.00,13.35,8,1,2,0
2,3,0.33,71.29,8,1,2,3
3,3,19.00,13.04,8,1,2,0
4,3,25.00,7.76,8,1,2,0
...,...,...,...,...,...,...,...
99995,2,62.00,14.86,3,0,0,0
99996,2,66.00,11.15,8,1,2,0
99997,3,37.00,9.95,8,1,2,0
99998,3,51.00,30.92,8,1,2,1


In [14]:
# Persist the transformed features as a pickle whose name carries the dataset label.
feature_store_path = f"intermediate_data/featurestore_{datatype}.pkl"
outputdf.to_pickle(feature_store_path)