In [1]:
import flor

# Register this notebook's file name with flor before any @flor.func is declared.
flor.setNotebookName('FlorParameterTuning.ipynb')

# Experiment handle; every action, artifact and literal below is declared through it.
ex = flor.Experiment('FlorParameterTuning')

# NOTE(review): presumably selects the Ground metadata service as the backend — confirm against flor docs.
ex.groundClient('ground')

In [2]:
import sklearn.linear_model as linear_model
import sklearn
import seaborn as sns
import pandas as pd
import numpy as np

# Data Loading

Here I am using a built-in dataset to make a quick example.  In practice I would probably want to download the data from some external source.

In [3]:
@flor.func
def crawl():
    """Load the 'titanic' dataset through seaborn's `load_dataset` and return it."""
    titanic_raw = sns.load_dataset('titanic')
    return titanic_raw

# Declare the loader as an action with no inputs, and the file it produces.
doCrawl = ex.action(crawl)
titanic_data = ex.artifact('titanic.pkl', doCrawl)

In [4]:
# Preview the crawled artifact.
# NOTE(review): presumably this runs the crawl action if the artifact is not yet materialized — confirm.
titanic_data.peek()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
7,0,3,male,2.0,3,1,21.0750,S,Third,child,False,,Southampton,no,False
8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False


# Data Processing

I need to extract some binary features.

In [5]:
@flor.func
def featurize(df):
    """Return `df` run through `pd.get_dummies` (indicator columns for its categorical values)."""
    encoded = pd.get_dummies(df)
    return encoded

# Featurization consumes the crawled table and produces the encoded table.
doFeaturize = ex.action(featurize, [titanic_data])
ft_titanic_data = ex.artifact('ft_titanic.pkl', doFeaturize)

In [6]:
# Show only the first rows of the featurized table.
ft_titanic_data.peek(func=lambda frame: frame.head())

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,adult_male,alone,sex_female,sex_male,...,deck_C,deck_D,deck_E,deck_F,deck_G,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,alive_no,alive_yes
0,0,3,22.0,1,0,7.25,True,False,0,1,...,0,0,0,0,0,0,0,1,1,0
1,1,1,38.0,1,0,71.2833,False,False,1,0,...,1,0,0,0,0,1,0,0,0,1
2,1,3,26.0,0,0,7.925,False,True,1,0,...,0,0,0,0,0,0,0,1,0,1
3,1,1,35.0,1,0,53.1,False,False,1,0,...,1,0,0,0,0,0,0,1,0,1
4,0,3,35.0,0,0,8.05,True,True,0,1,...,0,0,0,0,0,0,0,1,1,0


# Make the training matrices

In [7]:
@flor.func
def separateLabels(df):
    """Split the featurized frame into a feature matrix and a label vector.

    Rows containing any missing value are discarded first. Only the
    'survived' column is removed from the features; every other column is
    kept and cast to float.

    Returns:
        (X, Y): the feature values as a float array and the 'survived' values.
    """
    complete_rows = df.dropna()
    labels = complete_rows['survived'].values
    feature_frame = complete_rows.drop(columns=['survived'])
    features = feature_frame.values.astype('float')
    return features, labels

# One action, two output artifacts: the feature matrix and the labels.
doSepLabels = ex.action(separateLabels, [ft_titanic_data])
X_ft_titanic_data = ex.artifact('x_ft_titanic.pkl', doSepLabels)
Y_ft_titanic_data = ex.artifact('y_ft_titanic.pkl', doSepLabels)

# Train Test Split

In [8]:
@flor.func
def trainTestSplit(X, Y, test_size, random_state):
    """Partition features and labels into train and test subsets.

    Args:
        X: feature matrix.
        Y: label vector aligned with X.
        test_size: forwarded to sklearn's train_test_split.
        random_state: seed forwarded to sklearn's train_test_split.

    Returns:
        Tuple (X_train, X_test, Y_train, Y_test).
    """
    from sklearn.model_selection import train_test_split
    split_parts = train_test_split(X, Y, test_size=test_size, random_state=random_state)
    X_train, X_test, Y_train, Y_test = split_parts
    return (X_train, X_test, Y_train, Y_test)

# Hold out 10% of the rows for testing, with a fixed seed of 42.
doTrTeSplit = ex.action(
    trainTestSplit,
    [X_ft_titanic_data, Y_ft_titanic_data, ex.literal(0.1), ex.literal(42)],
)
X_tr = ex.artifact('tr_x_ft_titanic.pkl', doTrTeSplit)
X_te = ex.artifact('te_x_ft_titanic.pkl', doTrTeSplit)
Y_tr = ex.artifact('tr_y_ft_titanic.pkl', doTrTeSplit)
Y_te = ex.artifact('te_y_ft_titanic.pkl', doTrTeSplit)

# Model Development

First cut at model development

In [9]:
@flor.func
def trainModel(X_tr, Y_tr, n_estimators, min_samples_split):
    """Fit a random-forest classifier on the training split.

    Args:
        X_tr: training feature matrix.
        Y_tr: training labels.
        n_estimators: number of trees in the forest.
        min_samples_split: minimum number of samples required to split a node.

    Returns:
        The fitted RandomForestClassifier.
    """
    from sklearn.ensemble import RandomForestClassifier
    # Pin the forest's RNG (same seed value as the train/test split) so that
    # re-running the pipeline reproduces the same model and accuracy numbers;
    # without it every run trains a different forest.
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        min_samples_split=min_samples_split,
        random_state=42,
    )
    model.fit(X_tr, Y_tr)
    return model

# Train with 10 trees and min_samples_split=2 on the training split.
doTrainModel = ex.action(trainModel, [X_tr, Y_tr, ex.literal(10), ex.literal(2)])
model = ex.artifact('model.pkl', doTrainModel)

In [10]:
@flor.func
def scoreModel(model, X_tr, X_te, Y_tr, Y_te):
    """Build a two-line text report of the model's train and test scores.

    Returns:
        A one-element tuple holding the report string, with the train line
        first and the test line second, separated by a newline.
    """
    report_lines = [
        "Train Accuracy: {}".format(model.score(X_tr, Y_tr)),
        "Test Accuracy: {}".format(model.score(X_te, Y_te)),
    ]
    return ('\n'.join(report_lines), )

# Scoring consumes the model plus all four split artifacts and emits a text report.
doScoreModel = ex.action(
    scoreModel,
    [model, X_tr, X_te, Y_tr, Y_te],
)
output = ex.artifact('output.txt', doScoreModel)

In [11]:
# Concatenate the pieces of the report and print them.
output.peek(func=lambda parts: print(''.join(parts)))

Train Accuracy: 1.0
Test Accuracy: 1.0



**Error!!!** 

The accuracy is too high!  We must have a feature that contains the label.

In [12]:
# List the columns that remain once rows with missing values are dropped.
ft_titanic_data.peek(func=lambda frame: frame.dropna().columns)

Index(['survived', 'pclass', 'age', 'sibsp', 'parch', 'fare', 'adult_male',
       'alone', 'sex_female', 'sex_male', 'embarked_C', 'embarked_Q',
       'embarked_S', 'class_First', 'class_Second', 'class_Third', 'who_child',
       'who_man', 'who_woman', 'deck_A', 'deck_B', 'deck_C', 'deck_D',
       'deck_E', 'deck_F', 'deck_G', 'embark_town_Cherbourg',
       'embark_town_Queenstown', 'embark_town_Southampton', 'alive_no',
       'alive_yes'],
      dtype='object')

Notice that the **alive_no** and **alive_yes** columns appear to contain the same data as **survived**.  We need to drop these columns.

# Re-make the training matrices

In [13]:
@flor.func
def separateLabels(df):
    """Split the featurized frame into features and labels, without the leaked columns.

    Rows containing any missing value are discarded first. Besides the
    'survived' label itself, the 'alive_no' and 'alive_yes' columns (which
    appear to duplicate the label) are removed from the features.

    Returns:
        (X, Y): the feature values as a float array and the 'survived' values.
    """
    leaky_columns = ['survived', 'alive_no', 'alive_yes']
    complete_rows = df.dropna()
    labels = complete_rows['survived'].values
    features = complete_rows.drop(columns=leaky_columns).values.astype('float')
    return features, labels

# Re-declare the action and its two artifacts so they use the corrected function.
doSepLabels = ex.action(separateLabels, [ft_titanic_data])
X_ft_titanic_data = ex.artifact('x_ft_titanic.pkl', doSepLabels)
Y_ft_titanic_data = ex.artifact('y_ft_titanic.pkl', doSepLabels)

# Train Test Split (Again)

In [14]:
# Re-declare the split so it consumes the rebuilt feature/label artifacts.
doTrTeSplit = ex.action(
    trainTestSplit,
    [X_ft_titanic_data, Y_ft_titanic_data, ex.literal(0.1), ex.literal(42)],
)
X_tr = ex.artifact('tr_x_ft_titanic.pkl', doTrTeSplit)
X_te = ex.artifact('te_x_ft_titanic.pkl', doTrTeSplit)
Y_tr = ex.artifact('tr_y_ft_titanic.pkl', doTrTeSplit)
Y_te = ex.artifact('te_y_ft_titanic.pkl', doTrTeSplit)

# Model Development (Again)

Second cut at model development, this time without the leaked label columns.

In [15]:
# Re-declare training so it consumes the rebuilt training split.
doTrainModel = ex.action(
    trainModel,
    [X_tr, Y_tr, ex.literal(10), ex.literal(2)],
)
model = ex.artifact('model.pkl', doTrainModel)

In [16]:
# Re-declare scoring against the retrained model and the rebuilt splits.
doScoreModel = ex.action(
    scoreModel,
    [model, X_tr, X_te, Y_tr, Y_te],
)
output = ex.artifact('output.txt', doScoreModel)

In [17]:
# Concatenate the pieces of the new report and print them.
output.peek(func=lambda parts: print(''.join(parts)))

Train Accuracy: 0.9797507788161994
Test Accuracy: 0.6944444444444444



In [None]:
# NOTE(review): presumably executes the whole pipeline and materializes output.txt — confirm against flor's docs.
output.pull()

# Model selection through search

**To be continued after Aggregation is implemented ...**