In [1]:
# Adaptation of https://elitedatascience.com/python-machine-learning-tutorial-scikit-learn
import dds
import sklearn
import pandas as pd
import logging
logging.basicConfig(level=logging.DEBUG)

import numpy as np
import pandas as pd
 
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import requests
import io
import json

In [2]:
path_raw = "/wine-quality/raw"
path_model = "/wine-quality/my_model"
path_model_stats = "/wine-quality/my_model_stats.json"

def _load_data():
    url = "https://raw.githubusercontent.com/zygmuntz/wine-quality/master/winequality/winequality-red.csv"
    x = requests.get(url=url, verify=False).content 
    return pd.read_csv(io.StringIO(x.decode('utf8')), sep=";")

def load_data():
    return dds.keep(path_raw, _load_data)

data = dds.eval(load_data)

data.head(3)
    

DEBUG:dds.introspect:Starting _introspect: <function load_data at 0x7fd30ca42d30>
DEBUG:dds.introspect:function: load_data <_ast.arguments object at 0x7fd30ca25df0> [<_ast.Return object at 0x7fd3383573d0>]
DEBUG:dds.introspect:external: [<__main__.path_raw>]
DEBUG:dds.introspect:_retrieve_object: not checking: /wine-quality/raw <class 'str'>
DEBUG:dds.introspect:Cache hash: <__main__.path_raw>: a1a3ce5966bea811120114207d004b9d61fea3f19ca024e5d8d0632fb90c40f4
DEBUG:dds.introspect:_inspect_call: fname: ['dds', 'keep']
DEBUG:dds.introspect:_retrieve_object: ['keep'] -> <function keep at 0x7fd33836e700>: dds
DEBUG:dds.introspect:_retrieve_object: authorized function ['keep'] -> <function keep at 0x7fd33836e700>: dds
DEBUG:dds.introspect:_inspect_call: ln:2 <_ast.Call object at 0x7fd3383577f0> <function keep at 0x7fd33836e700>
DEBUG:dds.introspect:_inspect_call: canon_path: <dds.keep>
DEBUG:dds.introspect:_inspect_call: Keep: store_path_symbol: path_raw <class 'str'>
DEBUG:dds.introspect:_r

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5


In [6]:

def build_model(X_train, y_train):
    pipeline = make_pipeline(preprocessing.StandardScaler(), 
                             RandomForestRegressor(n_estimators=100))
    hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],
                      'randomforestregressor__max_depth': [None, 5, 3, 1]}

    clf = GridSearchCV(pipeline, hyperparameters, cv=10)
    clf.fit(X_train, y_train)
    return clf
 
    
def model_stats(clf, X_test, y_test):
    pred = clf.predict(X_test)
    return json.dumps({
        "r2_score": r2_score(y_test, pred),
        "mse": mean_squared_error(y_test, pred)
    })
    
    
def pipeline():
    wine_data = load_data()
    y = wine_data.quality
    X = wine_data.drop('quality', axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                        test_size=0.15, 
                                                        random_state=123, 
                                                        stratify=y)
    clf = dds.keep(path_model, build_model, X_train, y_train)
    dds.keep(path_model_stats, model_stats, clf, X_test, y_test)


dds.eval(pipeline)


DEBUG:dds.introspect:Starting _introspect: <function pipeline at 0x7fd33b3df0d0>
DEBUG:dds.introspect:function: pipeline <_ast.arguments object at 0x7fd338357460> [<_ast.Assign object at 0x7fd3383574f0>, <_ast.Assign object at 0x7fd30b232220>, <_ast.Assign object at 0x7fd30b37bbe0>, <_ast.Assign object at 0x7fd30b39bc10>, <_ast.Assign object at 0x7fd3087152b0>, <_ast.Expr object at 0x7fd3087157c0>]
DEBUG:dds.introspect:external: [<__main__.path_model>, <__main__.path_model_stats>]
DEBUG:dds.introspect:_retrieve_object: not checking: /wine-quality/my_model <class 'str'>
DEBUG:dds.introspect:Cache hash: <__main__.path_model>: 4697524b4c751aacee319944ec2f0a265336dc96ff692488d7cbc071fc6a3733
DEBUG:dds.introspect:_retrieve_object: not checking: /wine-quality/my_model_stats.json <class 'str'>
DEBUG:dds.introspect:Cache hash: <__main__.path_model_stats>: 110ff33eb1f89810ce6af6621ae232ae55f5408eb739bd82972c361bffaa48da
DEBUG:dds.introspect:_inspect_call: fname: ['load_data']
DEBUG:dds.introspe

DEBUG:dds.introspect:_inspect_call: skipping
DEBUG:dds.introspect:_inspect_call: fname: ['clf', 'fit']
DEBUG:dds.introspect:_inspect_call: skipping
INFO:dds.introspect:outputs: []
DEBUG:dds.introspect:End _introspect: <function build_model at 0x7fd30b227700>: FunctionInteractions(fun_body_sig='f974d4bbbffa5b098e3a51a3db390e25ed0426f44c25f67a41d288ce9d1412df', fun_return_sig='7cdcc39020446e16a540e5d29a5375c055a6c0bfebcddb00a8d06a1a682df9a6', fun_context_input_sig='bc4b228e03b21eb5664ca0c1288c4c8451bc9393fc027b5c741196a22f09a806', outputs=[])
DEBUG:dds.introspect:_inspect_call: keep: /wine-quality/my_model <- <__main__.build_model>: 7cdcc39020446e16a540e5d29a5375c055a6c0bfebcddb00a8d06a1a682df9a6
DEBUG:dds.introspect:_inspect_call: fname: ['dds', 'keep']
DEBUG:dds.introspect:_retrieve_object: ['keep'] -> <function keep at 0x7fd33836e700>: dds
DEBUG:dds.introspect:_retrieve_object: authorized function ['keep'] -> <function keep at 0x7fd33836e700>: dds
DEBUG:dds.introspect:_inspect_call: l

DEBUG:dds.store:Committed new blob in 054a1aa4de4a1944069ebf559bbe8c68d66a2965c66ab37df6394705d20d8ba2
INFO:dds:Evaluating (eval) fun <function pipeline at 0x7fd33b3df0d0> with args () kwargs {}
DEBUG:dds.store:Link /tmp/data/wine-quality/raw up to date
DEBUG:dds.store:Link /tmp/data/wine-quality/my_model -> /tmp/blobs/7cdcc39020446e16a540e5d29a5375c055a6c0bfebcddb00a8d06a1a682df9a6
DEBUG:dds.store:Link /tmp/data/wine-quality/my_model_stats.json -> /tmp/blobs/054a1aa4de4a1944069ebf559bbe8c68d66a2965c66ab37df6394705d20d8ba2


In [4]:
    
    # # 8. Refit on the entire training set
# # No additional code needed if clf.refit == True (default is True)
 
# # 9. Evaluate model pipeline on test data
# pred = clf.predict(X_test)
# print r2_score(y_test, pred)
# print mean_squared_error(y_test, pred)
 
# # 10. Save model for future use
# joblib.dump(clf, 'rf_regressor.pkl')