# Basic Workflow

In [1]:
# Always have your imports at the top
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import TransformerMixin

from hashlib import sha1 # just for grading purposes
import json # just for grading purposes

def _hash(obj, salt='none'):
    if type(obj) is not str:
        obj = json.dumps(obj)
    to_encode = obj + salt
    return sha1(to_encode.encode()).hexdigest()

# Workflow steps

What are the basic workflow steps?

It's incredibly obvious what the steps are, since you can see them graded in plain text. However, we think it is worth having you type out each step, taking a moment to think about it and internalize it.

Please do actually type them rather than just copy-pasting as fast as you can. Type them out character by character and internalize them.

In [7]:
# Answers: the basic workflow steps, one string per step.
# The grading cell compares these with `==`, so wording, capitalization and
# spacing must match exactly.
step_1 = 'Get the data'
step_2 = 'Data analysis and preparation'
# Step 2 is broken down into sub-steps a-d.
step_2_a = 'Data analysis'
step_2_b = 'Dealing with data problems'
step_2_c = 'Feature engineering'
step_2_d = 'Feature selection'
step_3 = 'Train model'
step_4 = 'Evaluate results'
step_5 = 'Iterate'

In [8]:
### BEGIN TESTS
# Each answer must equal the expected text exactly (string `==` is sensitive
# to case and whitespace). The first mismatch raises AssertionError and stops
# the cell.
assert step_1 == 'Get the data'
assert step_2 == 'Data analysis and preparation'
assert step_2_a == 'Data analysis'
assert step_2_b == 'Dealing with data problems'
assert step_2_c == 'Feature engineering'
assert step_2_d == 'Feature selection'
assert step_3 == 'Train model'
assert step_4 == 'Evaluate results'
assert step_5 == 'Iterate'
### END TESTS

# Specific workflow questions

Here are some more specific questions about individual workflow steps.

In [11]:
# True or False, it's super easy to gather your dataset in a production environment
real_world_dataset_gathering_easy = False

# True or False, it's super easy to gather your dataset in the context of the academy
academy_dataset_gathering_easy = True

# True or False, you should try as hard as you can to get the best possible score
# on your test set by iterating until you can't get your test set score any higher
# by any means possible
test_set_optimization_is_good = False
# Why: iterating against the test-set score incentivizes leaking test-set
# information into the model.

# True or False, you should choose one metric by which to evaluate your model and
# never consider using another one
one_metric_should_rule_them_all = False

In [12]:
### BEGIN TESTS
# Each answer is checked via _hash with a per-question salt against a stored
# SHA-1 digest (presumably so the expected booleans are not readable here).
assert _hash(real_world_dataset_gathering_easy, 'salt1') == '63b5b9a8f2d359e1fc175c3b01b907ef87590484'
assert _hash(academy_dataset_gathering_easy, 'salt2') == 'dd7dee495a153c95d28c7aa95289c0415242f5d8'
assert _hash(test_set_optimization_is_good, 'salt3') == 'f24a294afb4a09f7f9df9ee13eb18e7d341c439d'
assert _hash(one_metric_should_rule_them_all, 'salt4') == '2360691a582e4f0fbefa238ab6ced1cbfbfe8a50'
### END TESTS

# scikit pipelines

Make a simple pipeline that
1. Drops all columns that start with the string `evil`
1. Fills all nulls with the mean
1. Ends with a random forest classifier

In [38]:
# Create a pipeline step called RemoveEvilColumns that removes any
# column whose name starts with the string 'evil'

from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier


class RemoveEvilColumns(TransformerMixin, BaseEstimator):
    """Stateless transformer that drops columns whose name starts with a prefix.

    Inherits from ``BaseEstimator`` (listed after the mixin, as scikit-learn
    recommends) so the step supports ``get_params``/``set_params`` and
    ``clone`` like any other pipeline step.

    Parameters
    ----------
    prefix : str, default='evil'
        Columns whose name, converted to ``str``, starts with this prefix
        are removed by ``transform``.
    """

    def __init__(self, prefix='evil'):
        self.prefix = prefix

    def fit(self, X, y=None, *_):
        """Do nothing and return ``self``; this transformer learns no state.

        ``y`` and any extra positional arguments are accepted and ignored so
        the step can be called the way a pipeline calls its transformers.
        """
        return self

    def transform(self, X, *_):
        """Return a copy of DataFrame ``X`` without the prefixed columns.

        ``X`` itself is not modified. Extra positional arguments are ignored.
        """
        # str() mirrors the original regex filter, which matched against the
        # string form of each column label.
        evil_columns = [col for col in X.columns if str(col).startswith(self.prefix)]
        return X.drop(columns=evil_columns).copy()
    
# Create a pipeline using make_pipeline that:
# 1. removes evil columns
# 2. imputes with the mean
# 3. has a random forest classifier as the last step
pipeline = make_pipeline(
    RemoveEvilColumns(),
    SimpleImputer(strategy='mean'),
    # Fixed seed so that re-running the notebook trains the same forest.
    RandomForestClassifier(random_state=42),
)

In [40]:
# Toy data: two string columns whose names start with 'evil', plus one
# numeric feature; the target alternates 0, 1, 0, 1, ...
X = pd.DataFrame(
    {
        'evil_1': ['a'] * 100,
        'evil_2': ['b'] * 100,
        'not_so_evil': list(range(100)),
    }
)
y = pd.Series([value % 2 for value in range(100)])

# Fit the whole pipeline once so the checks below run against a trained object.
pipeline.fit(X, y)

### BEGIN TESTS
# The expected step names are the lowercased class names, checked in order:
# column remover, imputer, classifier. On failure the actual step name is
# shown as the assertion message.
assert pipeline.steps[0][0] == 'removeevilcolumns', pipeline.steps[0][0]
assert pipeline.steps[1][0] == 'simpleimputer', pipeline.steps[1][0]
assert pipeline.steps[2][0] == 'randomforestclassifier', pipeline.steps[2][0]
### END TESTS

