# A Simple Model

In [1]:
import pandas as pd
from joblib import dump, load
from hamilton import driver, base

In [2]:
import features_pipeline


In [3]:
# Column roles handed to the Hamilton driver as its configuration.
index_col = "passengerid"
target_col = "survived"
cat_cols = ["sex", "cabin", "embarked"]

config = dict(
    index_column=index_col,
    target_column=target_col,
    categorical_columns=cat_cols,
)

## Data Transformation Pipeline 2: Use encoders to transform data.

In [4]:
# parameters
# Default inputs for this notebook; a later cell reassigns `data` and `datatype`.
# NOTE(review): this looks like a papermill "parameters" cell — confirm before
# renaming or retyping anything here.
encoder_file: str = "intermediate_data/encoder.joblib"  # joblib file read by load() below
data: str = "data/train.csv"  # CSV path passed to pd.read_csv
datatype: str = "train"  # tag interpolated into the output file names

In [5]:
# Parameters
# Overrides the defaults assigned in the previous cell: this run reads the test CSV
# and tags its outputs with "test".
# NOTE(review): presumably injected by papermill — confirm before editing by hand.
data = "data/test.csv"
datatype = "test"


In [6]:
# Load the persisted artifact; its 'encoders' entry is read in the next cell.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only point `encoder_file` at files from a trusted source.
out = load(encoder_file)

In [7]:
# Re-key the loaded encoders with every underscore removed from the name;
# the resulting keys are later passed as named inputs to the driver.
inputencoders = {
    name.replace("_", ""): out["encoders"][name]
    for name in out["encoders"]
}

In [8]:
# Display the re-keyed encoder mapping as a sanity check on the new names.
inputencoders

{'cabinencoder': LabelEncoder(),
 'sexencoder': LabelEncoder(),
 'embarkedencoder': LabelEncoder()}

In [9]:
# Read the raw CSV selected by the `data` parameter.
df = pd.read_csv(data)

### Transform the data set selected by `datatype`

In [10]:
# Build a Hamilton driver over the feature pipeline module.
transform_dr = driver.Driver(config, features_pipeline)

# Pipeline inputs: the raw frame plus every loaded encoder, keyed by name.
ddf = dict(df=df, **inputencoders)

# Render the execution graph for the requested output; the file name is
# tagged with `datatype`.
transform_dr.visualize_execution(
    ['final_imputed_features'],
    f"./artifacts/trans{datatype}.dot",
    {},
    inputs=ddf,
)

In [11]:
from IPython.display import Image, Markdown

In [12]:
# Embed the rendered execution graph via a markdown image link.
# NOTE(review): the graph is written under ./artifacts/ but this link is relative
# to the notebook's own directory — presumably the executed notebook is saved
# inside artifacts/; confirm, otherwise the image will not resolve.
Markdown(f"![](./trans{datatype}.dot.pdf)")

![](./transtest.dot.pdf)

In [13]:
# Execute the pipeline, requesting only the 'final_imputed_features' output.
outputdf = transform_dr.execute(['final_imputed_features'], inputs = ddf)

<class 'pandas.core.frame.DataFrame'>


In [14]:
# Display the transformed feature frame for inspection.
outputdf

Unnamed: 0_level_0,pclass,age,fare,cabin_category,sex_category,embarked_category,family
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
100000,3,19.0,63.01,8,1,2,0
100001,3,53.0,5.81,8,0,2,0
100002,1,19.0,38.91,1,0,0,0
100003,2,25.0,12.93,8,1,2,0
100004,1,17.0,26.89,1,0,0,2
...,...,...,...,...,...,...,...
199995,3,27.0,10.12,8,0,1,0
199996,1,59.0,68.31,8,1,2,1
199997,3,47.0,10.87,8,1,2,0
199998,1,49.0,29.68,1,0,0,3


In [15]:
# Persist the transformed features; the file name is keyed by `datatype`.
outputdf.to_pickle(f"intermediate_data/featurestore_{datatype}.pkl")