# A Simple Model

In [1]:
from pathlib import Path

import pandas as pd
from hamilton import driver, base
from joblib import dump, load

In [2]:
# Column roles used by the configuration dictionaries in this notebook.
index_col = "passengerid"
target_col = "survived"
cat_cols = ["sex", "cabin", "embarked"]

# Configuration keyed by the role each column (or group of columns) plays.
config = dict(
    index_column=index_col,
    target_column=target_col,
    categorical_columns=cat_cols,
)

In [9]:
# parameters
# Input file locations. Both paths are relative, so they resolve against the
# kernel's working directory.
# NOTE(review): this looks like a tool-injected parameters cell (e.g. papermill);
# keep it to plain assignments — confirm with the pipeline that runs this notebook.
features_file: str = "intermediate_data/featurestore_train.pkl"  # pickle loaded into traindf via pd.read_pickle
training_data: str = "data/train.csv"  # CSV loaded into df_train via pd.read_csv

In [5]:
# Load the pickled feature matrix stored at features_file.
# NOTE(review): unpickling can execute arbitrary code, so features_file must
# only ever point at a file produced by a trusted source.
traindf = pd.read_pickle(features_file)

In [10]:
# Read the training CSV at training_data into a DataFrame.
df_train = pd.read_csv(training_data)

## Model Training Pipeline

In [11]:
import model_pipeline, features_pipeline

In [12]:
# Settings passed to the Hamilton driver as its configuration.
# NOTE(review): 'validation_size_fraction' presumably sets the hold-out share
# and 't' presumably is the probability cut-off for the class labels —
# confirm against model_pipeline.
config_model = dict(
    index_column=index_col,
    target_column=target_col,
    random_state=42,
    max_depth=None,
    validation_size_fraction=0.33,
    t=0.5,
)

In [13]:
# Graph adapter built around base.DictResult.
training_adapter = base.SimplePythonGraphAdapter(base.DictResult())

# Driver spanning both pipeline modules, configured by config_model.
training_dr = driver.Driver(
    config_model,
    features_pipeline,
    model_pipeline,
    adapter=training_adapter,
)

# Runtime inputs: the training frame read from CSV and the feature matrix
# loaded from the pickle.
dtraining = {"df_train": df_train, "final_feature_matrix": traindf}

# Write the execution graph for the three requested outputs to
# ./artifacts/training.dot.
training_dr.visualize_execution(
    ["fit_clf", "train_predictions", "valid_predictions"],
    "./artifacts/training.dot",
    {},  # empty options dict
    inputs=dtraining,
)

![Training pipeline execution graph](./training.dot.pdf)

In [14]:
# Execute the training graph and keep the requested outputs in rfdict.
rfdict = training_dr.execute(
    ["fit_clf", "train_predictions", "valid_predictions"],
    inputs=dtraining,
)

In [15]:
# Show the executed outputs: the fitted classifier plus one tuple of arrays
# each for the training and validation predictions.
# NOTE(review): each tuple looks like (predicted probabilities, 0/1 labels) —
# confirm against model_pipeline.
rfdict

{'fit_clf': RandomForestClassifier(random_state=42),
 'train_predictions': (array([0.795, 0.1  , 0.95 , ..., 0.83 , 0.   , 0.04 ]),
  array([1, 0, 1, ..., 1, 0, 0])),
 'valid_predictions': (array([0.93, 0.84, 0.16, ..., 0.99, 0.4 , 0.16]),
  array([1, 1, 0, ..., 1, 0, 0]))}

In [17]:
# Persist the fitted classifier to disk.
# joblib.dump opens the target path directly and does not create missing
# parent directories, so make sure models/ exists first; otherwise this cell
# fails on a fresh checkout that has no models/ folder yet.
Path("models").mkdir(parents=True, exist_ok=True)
dump(rfdict['fit_clf'], "models/rf.joblib")

['models/rf.joblib']