In [3]:
import jarvis

with jarvis.Experiment('taxi_demo') as ex:
    
    jarvis.setNotebookName('Taxi.ipynb')
    
    ex.groundClient('ground')

    tr_data = ex.artifact('train_df.csv')

    tr_data2 = ex.artifact('train.csv')

    te_data = ex.artifact('test_df.csv')

    @jarvis.func
    def dataframize(csvpath):
        import pandas as pd
        return pd.read_csv(csvpath)


    do_dfize = ex.action(dataframize, [tr_data])

    tr_data_df = ex.artifact('train_df.pkl', do_dfize)

    @jarvis.func
    def calculate_distance(data_df):
        def manhattan_distance(x1, y1, x2, y2):
            return abs(x1 - x2) + abs(y1 - y2)
        data_df['distance'] = [ i for i in map(manhattan_distance, data_df['pickup_longitude'], data_df['pickup_latitude'], 
                                               data_df['dropoff_longitude'], data_df['dropoff_latitude'])]
        return data_df

    # Trimmed some Notebook Cells without losing the log.

    # Other thing with Nb. variables continue to exist even after cell has been clipped.

    do_calc_dist = ex.action(calculate_distance, [tr_data_df])
    tr_data_dist_df = ex.artifact('train_dist_df.pkl', do_calc_dist)

    @jarvis.func
    def preproc(train_data):
        import numpy as np
        # https://www.kaggle.com/stephaniestallworth/nyc-taxi-eda-regression-fivethirtyeight-viz/notebook
        train_data = train_data[train_data['passenger_count']>0]
        train_data = train_data[train_data['passenger_count']<9]

        # Remove coordinate outliers
        train_data = train_data[train_data['pickup_longitude'] <= -73.75]
        train_data = train_data[train_data['pickup_longitude'] >= -74.03]
        train_data = train_data[train_data['pickup_latitude'] <= 40.85]
        train_data = train_data[train_data['pickup_latitude'] >= 40.63]
        train_data = train_data[train_data['dropoff_longitude'] <= -73.75]
        train_data = train_data[train_data['dropoff_longitude'] >= -74.03]
        train_data = train_data[train_data['dropoff_latitude'] <= 40.85]
        train_data = train_data[train_data['dropoff_latitude'] >= 40.63]

        # Remove trip_duration outliers
        trip_duration_mean = np.mean(train_data['trip_duration'])
        trip_duration_std = np.std(train_data['trip_duration'])
        train_data = train_data[train_data['trip_duration']<=trip_duration_mean + 2*trip_duration_std]
        train_data = train_data[train_data['trip_duration']>= trip_duration_mean - 2*trip_duration_std]
        train_data = train_data[train_data['trip_duration'] >= 30]
        train_data = train_data[train_data['trip_duration'] <= 60*240]

        return train_data

    do_preproc = ex.action(preproc, [tr_data_dist_df])
    tr_ready = ex.artifact('train_ready.pkl', do_preproc)

    @jarvis.func
    def split(data_df):
        X = data_df[['vendor_id', 'passenger_count', 'pickup_longitude',
            'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
           'store_and_fwd_flag', 'pickup_datetime', 'distance']]
        y = data_df['trip_duration']

        from sklearn.model_selection import train_test_split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

        return X_train, X_test, y_train, y_test

    do_split = ex.action(split, [tr_ready])
    xTrain = ex.artifact('xTrain.pkl', do_split)
    xTest = ex.artifact('xTest.pkl', do_split)
    yTrain = ex.artifact('yTrain.pkl', do_split)
    yTest = ex.artifact('yTest.pkl', do_split)

    @jarvis.func
    def train(data_df, trainingy):
        from sklearn.linear_model import LinearRegression
        from sklearn.preprocessing import StandardScaler
        from sklearn.ensemble import RandomForestRegressor

        data_df['duration'] = trainingy

        def roundtime(tstring):
            hours, mins, secs = tstring.split(':')
            if int(mins) >= 30:
                if hours == '23':
                    return '00'
                else:
                    return str(int(hours) + 1)
            else:
                return hours

        def weekday(start):
            from datetime import datetime
            fmt = '%Y-%m-%d %H:%M:%S'
            tstamp = datetime.strptime(start, fmt)
            return int(tstamp.weekday())

        data_df['start_hr'] = data_df['pickup_datetime'].apply(lambda x: int(roundtime(x.split(' ')[1])))
        data_df['start_month'] = data_df['pickup_datetime'].apply(lambda x: int(x.split(' ')[0].split('-')[1]))
        data_df['start_weekday']= data_df['pickup_datetime'].apply(lambda x: weekday(x))

        import math

        clf = RandomForestRegressor(n_estimators=20, n_jobs=3)
        #clf = LinearRegression()
        scaler = StandardScaler()
        # Scaler does not help
        #scaler.fit(data_df[['vendor_id', 'start_hr', 'start_month', 'start_weekday', 'distance']].values)
        clf.fit(data_df[['vendor_id', 'start_hr', 'start_month', 'start_weekday', 'distance', 
                         'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
                        'dropoff_latitude']].values, data_df['duration'].values )
        return clf, scaler

    do_train = ex.action(train, [xTrain, yTrain])
    model = ex.artifact('model.pkl', do_train)
    scaler = ex.artifact('scaler.pkl', do_train)

    do_te_dfize = ex.action(dataframize, [te_data])
    te_data_df = ex.artifact('test_df.pkl', do_te_dfize)

    do_te_calcdist = ex.action(calculate_distance, [te_data_df])
    te_data_dist_df = ex.artifact('test_dist_df.pkl', do_te_calcdist)

    @jarvis.func
    def calculate_duration(data_df):
        def tdiff(start, end):
            from datetime import datetime
            fmt = '%Y-%m-%d %H:%M:%S'
            tstamp1 = datetime.strptime(start, fmt)
            tstamp2 = datetime.strptime(end, fmt)

            if tstamp1 > tstamp2:
                td = tstamp1 - tstamp2
            else:
                td = tstamp2 - tstamp1
            return int(td.total_seconds())
        data_df['duration'] = [i for i in map(tdiff, data_df['pickup_datetime'], data_df['dropoff_datetime'])]
        return data_df



    do_te_calcdur = ex.action(calculate_duration, [te_data_dist_df])
    te_data_full_df = ex.artifact('test_dist_dur_df.pkl', do_te_calcdur)

    @jarvis.func
    def test(model, data_df, testingy, scaler):
        import numpy as np
        data_df['duration'] = testingy
        def roundtime(tstring):
            hours, mins, secs = tstring.split(':')
            if int(mins) >= 30:
                if hours == '23':
                    return '00'
                else:
                    return str(int(hours) + 1)
            else:
                return hours
        def weekday(start):
            from datetime import datetime
            fmt = '%Y-%m-%d %H:%M:%S'
            tstamp = datetime.strptime(start, fmt)
            return int(tstamp.weekday())

        data_df['start_hr'] = data_df['pickup_datetime'].apply(lambda x: int(roundtime(x.split(' ')[1])))
        data_df['start_month'] = data_df['pickup_datetime'].apply(lambda x: int(x.split(' ')[0].split('-')[1]))
        data_df['start_weekday']= data_df['pickup_datetime'].apply(lambda x: weekday(x))


        preds = model.predict(data_df[['vendor_id', 'start_hr', 'start_month', 'start_weekday', 'distance', 
                         'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
                        'dropoff_latitude']].values)
        from sklearn import metrics
        import math
        score = metrics.explained_variance_score(data_df['duration'].values, preds)
        rmse = np.sqrt(metrics.mean_squared_error(data_df['duration'].values, preds))
        return str(score) + '\n' + str(rmse)


    do_test = ex.action(test, [model, xTest, yTest, scaler])
    score = ex.artifact('score.txt', do_test)

    @jarvis.func
    def comp_predictions(model, data_df, scaler):
        def roundtime(tstring):
            hours, mins, secs = tstring.split(':')
            if int(mins) >= 30:
                if hours == '23':
                    return '00'
                else:
                    return str(int(hours) + 1)
            else:
                return hours
        def weekday(start):
            from datetime import datetime
            fmt = '%Y-%m-%d %H:%M:%S'
            tstamp = datetime.strptime(start, fmt)
            return int(tstamp.weekday())

        data_df['start_hr'] = data_df['start_timestamp'].apply(lambda x: int(roundtime(x.split(' ')[1])))
        data_df['start_month'] = data_df['start_timestamp'].apply(lambda x: int(x.split(' ')[0].split('-')[1]))
        data_df['start_weekday']= data_df['start_timestamp'].apply(lambda x: weekday(x))

        predictions = (model.predict(data_df[['start_hr', 'start_month', 'start_weekday', 'distance']].values), data_df['duration'])
        return predictions

    do_comp = ex.action(comp_predictions, [model, te_data_full_df, scaler])
    preds = ex.artifact('predictions.pkl', do_comp)


In [6]:
tr_data.peek(head=1)

['id,vendor_id,start_timestamp,end_timestamp,passenger_count,start_lng,start_lat,end_lng,end_lat,store_and_fwd_flag,duration\n']

In [7]:
tr_data2.peek(head=1)

['id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration\n']

In [2]:
score.pull()

In [1]:
import math
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

def manhattan_distance(x1, y1, x2, y2):
    return abs(x1 - x2) + abs(y1 - y2)

def roundtime(tstring):
    hours, mins, secs = tstring.split(':')
    if int(mins) >= 30:
        if hours == '23':
            return '00'
        else:
            return str(int(hours) + 1)
    else:
        return hours

def weekday(start):
    from datetime import datetime
    fmt = '%Y-%m-%d %H:%M:%S'
    tstamp = datetime.strptime(start, fmt)
    return int(tstamp.weekday())

data_df = pd.read_csv('train.csv')

data_df['distance'] = [i for i in map(manhattan_distance,
    data_df['pickup_longitude'], data_df['pickup_latitude'], 
    data_df['dropoff_longitude'], data_df['dropoff_latitude'])]

# Remove outliers in passenger_count
data_df = data_df[data_df['passenger_count']>0]
data_df = data_df[data_df['passenger_count']<9]

# Remove coordinate outliers
data_df = data_df[data_df['pickup_longitude'] <= -73.75]
data_df = data_df[data_df['pickup_longitude'] >= -74.03]
data_df = data_df[data_df['pickup_latitude'] <= 40.85]
data_df = data_df[data_df['pickup_latitude'] >= 40.63]
data_df = data_df[data_df['dropoff_longitude'] <= -73.75]
data_df = data_df[data_df['dropoff_longitude'] >= -74.03]
data_df = data_df[data_df['dropoff_latitude'] <= 40.85]
data_df = data_df[data_df['dropoff_latitude'] >= 40.63]

# Remove trip_duration outliers
trip_duration_mean = np.mean(data_df['trip_duration'])
trip_duration_std = np.std(data_df['trip_duration'])
data_df = data_df[data_df['trip_duration'] <= trip_duration_mean + 2*trip_duration_std]
data_df = data_df[data_df['trip_duration'] >= trip_duration_mean - 2*trip_duration_std]
data_df = data_df[data_df['trip_duration'] >= 30]
data_df = data_df[data_df['trip_duration'] <= 60*240]

data_df['start_hr'] = data_df['pickup_datetime'].apply(lambda x: int(roundtime(x.split(' ')[1])))
data_df['start_month'] = data_df['pickup_datetime'].apply(lambda x: int(x.split(' ')[0].split('-')[1]))
data_df['start_weekday']= data_df['pickup_datetime'].apply(lambda x: weekday(x))

X = data_df[['vendor_id', 'pickup_longitude',
            'pickup_latitude', 'dropoff_longitude', 
            'dropoff_latitude', 'distance',
            'start_hr', 'start_month', 'start_weekday']]
y = data_df['trip_duration']

X_train, X_test, y_train, y_test = train_test_split(X, 
    y, test_size = 0.2, random_state = 0)

clf = RandomForestRegressor(n_estimators=20, n_jobs=3)
clf.fit(X_train, y_train)

preds = clf.predict(X_test)
score = metrics.explained_variance_score(y_test, preds)
rmse = np.sqrt(metrics.mean_squared_error(y_test, preds))

print("R2: {}".format(score))
print("RMSE: {}".format(rmse))

R2: 0.7800057858579262
RMSE: 302.9232106738426


In [3]:
import math
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

import jarvis

with jarvis.Experiment('taxi_coarse') as ex:
    
    jarvis.setNotebookName('Taxi.ipynb')
    ex.groundClient('ground')
    
    @jarvis.func
    def run_existing_pipeline(path_to_data):
    
        def manhattan_distance(x1, y1, x2, y2):
            return abs(x1 - x2) + abs(y1 - y2)

        def roundtime(tstring):
            hours, mins, secs = tstring.split(':')
            if int(mins) >= 30:
                if hours == '23':
                    return '00'
                else:
                    return str(int(hours) + 1)
            else:
                return hours

        def weekday(start):
            from datetime import datetime
            fmt = '%Y-%m-%d %H:%M:%S'
            tstamp = datetime.strptime(start, fmt)
            return int(tstamp.weekday())

        data_df = pd.read_csv(path_to_data)

        data_df['distance'] = [i for i in map(manhattan_distance,
            data_df['pickup_longitude'], data_df['pickup_latitude'], 
            data_df['dropoff_longitude'], data_df['dropoff_latitude'])]

        # Remove outliers in passenger_count
        data_df = data_df[data_df['passenger_count']>0]
        data_df = data_df[data_df['passenger_count']<9]

        # Remove coordinate outliers
        data_df = data_df[data_df['pickup_longitude'] <= -73.75]
        data_df = data_df[data_df['pickup_longitude'] >= -74.03]
        data_df = data_df[data_df['pickup_latitude'] <= 40.85]
        data_df = data_df[data_df['pickup_latitude'] >= 40.63]
        data_df = data_df[data_df['dropoff_longitude'] <= -73.75]
        data_df = data_df[data_df['dropoff_longitude'] >= -74.03]
        data_df = data_df[data_df['dropoff_latitude'] <= 40.85]
        data_df = data_df[data_df['dropoff_latitude'] >= 40.63]

        # Remove trip_duration outliers
        trip_duration_mean = np.mean(data_df['trip_duration'])
        trip_duration_std = np.std(data_df['trip_duration'])
        data_df = data_df[data_df['trip_duration'] <= trip_duration_mean + 2*trip_duration_std]
        data_df = data_df[data_df['trip_duration'] >= trip_duration_mean - 2*trip_duration_std]
        data_df = data_df[data_df['trip_duration'] >= 30]
        data_df = data_df[data_df['trip_duration'] <= 60*240]

        data_df['start_hr'] = data_df['pickup_datetime'].apply(lambda x: int(roundtime(x.split(' ')[1])))
        data_df['start_month'] = data_df['pickup_datetime'].apply(lambda x: int(x.split(' ')[0].split('-')[1]))
        data_df['start_weekday']= data_df['pickup_datetime'].apply(lambda x: weekday(x))

        X = data_df[['vendor_id', 'pickup_longitude',
                    'pickup_latitude', 'dropoff_longitude', 
                    'dropoff_latitude', 'distance',
                    'start_hr', 'start_month', 'start_weekday']]
        y = data_df['trip_duration']

        X_train, X_test, y_train, y_test = train_test_split(X, 
            y, test_size = 0.2, random_state = 0)

        clf = RandomForestRegressor(n_estimators=20, n_jobs=3)
        clf.fit(X_train, y_train)

        preds = clf.predict(X_test)
        score = metrics.explained_variance_score(y_test, preds)
        rmse = np.sqrt(metrics.mean_squared_error(y_test, preds))

        score = "R2: {}".format(score)
        rmse = "RMSE: {}".format(rmse)
        
        print(score, rmse)
    
        return clf, score, rmse
    
    data = ex.artifact('train.csv')
    do_all = ex.action(run_existing_pipeline, [data])
    model = ex.artifact('model.pkl', do_all)
    score = ex.artifact('score.txt', do_all)
    rmse = ex.artifact('rmse.txt', do_all)

In [5]:
score.pull()

R2: 0.779745741917466 RMSE: 303.1106087514103


In [1]:
import math
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

import jarvis

with jarvis.Experiment('taxi_coarse') as ex:
    
    jarvis.setNotebookName('Taxi.ipynb')
    ex.groundClient('ground')
    
    @jarvis.func
    def run_existing_pipeline(path_to_data, n_estimators):
    
        def manhattan_distance(x1, y1, x2, y2):
            return abs(x1 - x2) + abs(y1 - y2)

        def roundtime(tstring):
            hours, mins, secs = tstring.split(':')
            if int(mins) >= 30:
                if hours == '23':
                    return '00'
                else:
                    return str(int(hours) + 1)
            else:
                return hours

        def weekday(start):
            from datetime import datetime
            fmt = '%Y-%m-%d %H:%M:%S'
            tstamp = datetime.strptime(start, fmt)
            return int(tstamp.weekday())

        data_df = pd.read_csv(path_to_data)

        data_df['distance'] = [i for i in map(manhattan_distance,
            data_df['pickup_longitude'], data_df['pickup_latitude'], 
            data_df['dropoff_longitude'], data_df['dropoff_latitude'])]

        # Remove outliers in passenger_count
        data_df = data_df[data_df['passenger_count']>0]
        data_df = data_df[data_df['passenger_count']<9]

        # Remove coordinate outliers
        data_df = data_df[data_df['pickup_longitude'] <= -73.75]
        data_df = data_df[data_df['pickup_longitude'] >= -74.03]
        data_df = data_df[data_df['pickup_latitude'] <= 40.85]
        data_df = data_df[data_df['pickup_latitude'] >= 40.63]
        data_df = data_df[data_df['dropoff_longitude'] <= -73.75]
        data_df = data_df[data_df['dropoff_longitude'] >= -74.03]
        data_df = data_df[data_df['dropoff_latitude'] <= 40.85]
        data_df = data_df[data_df['dropoff_latitude'] >= 40.63]

        # Remove trip_duration outliers
        trip_duration_mean = np.mean(data_df['trip_duration'])
        trip_duration_std = np.std(data_df['trip_duration'])
        data_df = data_df[data_df['trip_duration'] <= trip_duration_mean + 2*trip_duration_std]
        data_df = data_df[data_df['trip_duration'] >= trip_duration_mean - 2*trip_duration_std]
        data_df = data_df[data_df['trip_duration'] >= 30]
        data_df = data_df[data_df['trip_duration'] <= 60*240]

        data_df['start_hr'] = data_df['pickup_datetime'].apply(lambda x: int(roundtime(x.split(' ')[1])))
        data_df['start_month'] = data_df['pickup_datetime'].apply(lambda x: int(x.split(' ')[0].split('-')[1]))
        data_df['start_weekday']= data_df['pickup_datetime'].apply(lambda x: weekday(x))

        X = data_df[['vendor_id', 'pickup_longitude',
                    'pickup_latitude', 'dropoff_longitude', 
                    'dropoff_latitude', 'distance',
                    'start_hr', 'start_month', 'start_weekday']]
        y = data_df['trip_duration']

        X_train, X_test, y_train, y_test = train_test_split(X, 
            y, test_size = 0.2, random_state = 0)

        clf = RandomForestRegressor(n_estimators=n_estimators, n_jobs=3)
        clf.fit(X_train, y_train)

        preds = clf.predict(X_test)
        score = metrics.explained_variance_score(y_test, preds)
        rmse = np.sqrt(metrics.mean_squared_error(y_test, preds))

        score = "R2: {}".format(score)
        rmse = "RMSE: {}".format(rmse)
        
        print(score, rmse)
    
        return clf, score, rmse
    
    data = ex.artifact('train.csv')
    num_est = ex.literal([15, 20, 30], 'num_estimators')
    num_est.forEach()
    do_all = ex.action(run_existing_pipeline, [data, num_est])
    model = ex.artifact('model.pkl', do_all)
    score = ex.artifact('score.txt', do_all)
    rmse = ex.artifact('rmse.txt', do_all)

In [2]:
score.pull()

R2: 0.7748725843105921 RMSE: 306.43767227788436
R2: 0.7791599274717955 RMSE: 303.49629696347023
R2: 0.7834509576422817 RMSE: 300.54869927226036


In [1]:
import math
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

import jarvis

with jarvis.Experiment('taxi_coarse') as ex:
    
    jarvis.setNotebookName('Taxi.ipynb')
    ex.groundClient('ground')
    
    @jarvis.func
    def prepare_data(path_to_data):
    
        def manhattan_distance(x1, y1, x2, y2):
            return abs(x1 - x2) + abs(y1 - y2)

        def roundtime(tstring):
            hours, mins, secs = tstring.split(':')
            if int(mins) >= 30:
                if hours == '23':
                    return '00'
                else:
                    return str(int(hours) + 1)
            else:
                return hours

        def weekday(start):
            from datetime import datetime
            fmt = '%Y-%m-%d %H:%M:%S'
            tstamp = datetime.strptime(start, fmt)
            return int(tstamp.weekday())

        data_df = pd.read_csv(path_to_data)

        data_df['distance'] = [i for i in map(manhattan_distance,
            data_df['pickup_longitude'], data_df['pickup_latitude'], 
            data_df['dropoff_longitude'], data_df['dropoff_latitude'])]

        # Remove outliers in passenger_count
        data_df = data_df[data_df['passenger_count']>0]
        data_df = data_df[data_df['passenger_count']<9]

        # Remove coordinate outliers
        data_df = data_df[data_df['pickup_longitude'] <= -73.75]
        data_df = data_df[data_df['pickup_longitude'] >= -74.03]
        data_df = data_df[data_df['pickup_latitude'] <= 40.85]
        data_df = data_df[data_df['pickup_latitude'] >= 40.63]
        data_df = data_df[data_df['dropoff_longitude'] <= -73.75]
        data_df = data_df[data_df['dropoff_longitude'] >= -74.03]
        data_df = data_df[data_df['dropoff_latitude'] <= 40.85]
        data_df = data_df[data_df['dropoff_latitude'] >= 40.63]

        # Remove trip_duration outliers
        trip_duration_mean = np.mean(data_df['trip_duration'])
        trip_duration_std = np.std(data_df['trip_duration'])
        data_df = data_df[data_df['trip_duration'] <= trip_duration_mean + 2*trip_duration_std]
        data_df = data_df[data_df['trip_duration'] >= trip_duration_mean - 2*trip_duration_std]
        data_df = data_df[data_df['trip_duration'] >= 30]
        data_df = data_df[data_df['trip_duration'] <= 60*240]

        data_df['start_hr'] = data_df['pickup_datetime'].apply(lambda x: int(roundtime(x.split(' ')[1])))
        data_df['start_month'] = data_df['pickup_datetime'].apply(lambda x: int(x.split(' ')[0].split('-')[1]))
        data_df['start_weekday']= data_df['pickup_datetime'].apply(lambda x: weekday(x))

        return data_df
    
    @jarvis.func
    def train_test_model(data_df, n_estimators):
        X = data_df[['vendor_id', 'pickup_longitude',
                    'pickup_latitude', 'dropoff_longitude', 
                    'dropoff_latitude', 'distance',
                    'start_hr', 'start_month', 'start_weekday']]
        y = data_df['trip_duration']

        X_train, X_test, y_train, y_test = train_test_split(X, 
            y, test_size = 0.2, random_state = 0)

        clf = RandomForestRegressor(n_estimators=n_estimators, n_jobs=3)
        clf.fit(X_train, y_train)

        preds = clf.predict(X_test)
        score = metrics.explained_variance_score(y_test, preds)
        rmse = np.sqrt(metrics.mean_squared_error(y_test, preds))

        score = "R2: {}".format(score)
        rmse = "RMSE: {}".format(rmse)
        
        print(score, rmse)
    
        return clf, score, rmse
    
    data = ex.artifact('train.csv')
    num_est = ex.literal([15, 20], 'num_estimators')
    num_est.forEach()
    
    do_prep = ex.action(prepare_data, [data])
    prepd = ex.artifact('prepped_data.pkl', do_prep)
    
    do_tr_te = ex.action(train_test_model, [prepd, num_est])

    
    model = ex.artifact('model.pkl', do_tr_te)
    score = ex.artifact('score.txt', do_tr_te)
    rmse = ex.artifact('rmse.txt', do_tr_te)

In [2]:
score.plot()

In [3]:
score.pull()

R2: 0.7760128249160974 RMSE: 305.64950335540533
R2: 0.7797606529016166 RMSE: 303.08540456623774
