In [1]:
# Adaptation of https://elitedatascience.com/python-machine-learning-tutorial-scikit-learn
import dds
import sklearn
import pandas as pd
import logging
logging.basicConfig(level=logging.INFO)

import numpy as np
import pandas as pd
 
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import requests
import io
import json


In [2]:
%%sh
rm -rf /tmp/data/*
rm -rf /tmp/blobs/*

In [3]:
path_raw = "/wine-quality/raw"
path_model = "/wine-quality/my_model"
path_model_stats = "/wine-quality/my_model_stats.json"

def _load_data():
    print("*** in _load_data ***")
    url = "https://raw.githubusercontent.com/zygmuntz/wine-quality/master/winequality/winequality-red.csv"
    x = requests.get(url=url, verify=False).content 
    return pd.read_csv(io.StringIO(x.decode('utf8')), sep=";")

def load_data():
    return dds.keep(path_raw, _load_data)


In [4]:

data = dds.eval(load_data)

data.head(3)

INFO:dds:Interaction tree:
INFO:dds:`- Fun <__main__.load_data> None <- 2880f4ba2f0ce029bf961232667d184af1073b4a091ddf67e54334331f0400c7
INFO:dds:   |- dep: path_raw -> <__main__.path_raw>: a1a3ce5966bea811120114207d004b9d61fea3f19ca024e5d8d0632fb90c40f4
INFO:dds:   `- Fun <__main__._load_data> /wine-quality/raw <- ccd1ea31245b11507b79fdf02946f9438d7de3b6933ac87348ed8cb94a7bc0d5
INFO:dds:_eval_new_ctx:Evaluating (eval) fun <function load_data at 0x7f4f69f60280> with args [] kwargs OrderedDict()
INFO:dds:_eval:Evaluating (keep:/wine-quality/raw) fun <function _load_data at 0x7f4f69f60040> with args [] kwargs OrderedDict()


*** in _load_data ***


INFO:dds:_eval:Evaluating (keep:/wine-quality/raw) fun <function _load_data at 0x7f4f69f60040>: completed
INFO:dds:_eval:Storing blob into key ccd1ea31245b11507b79fdf02946f9438d7de3b6933ac87348ed8cb94a7bc0d5
INFO:dds.codec:Loading pandas codecs
INFO:dds:_eval_new_ctx:Evaluating (eval) fun <function load_data at 0x7f4f69f60280>: completed
INFO:dds.store:Link /tmp/data/wine-quality/raw -> /tmp/blobs/ccd1ea31245b11507b79fdf02946f9438d7de3b6933ac87348ed8cb94a7bc0d5


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5


In [5]:
def build_model(X_train, y_train):
    print("*** in build_model ***")
    pipeline = make_pipeline(preprocessing.StandardScaler(), 
                             RandomForestRegressor(n_estimators=100))
    hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],
                      'randomforestregressor__max_depth': [None, 5, 3, 1]}

    clf = GridSearchCV(pipeline, hyperparameters, cv=10)
    
    clf.fit(X_train, y_train)
    return clf
 
    
def model_stats(clf, X_test, y_test) -> str:
    print("*** in model_stats ***")
    pred = clf.predict(X_test)
    return json.dumps({
        "r2_score": r2_score(y_test, pred),
        "mse": mean_squared_error(y_test, pred)
    })
    
    
def pipeline():
    wine_data = load_data()
    y = wine_data.quality
    X = wine_data.drop('quality', axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                        test_size=0.15, 
                                                        random_state=123, 
                                                        stratify=y)
    clf = dds.keep(path_model, build_model, X_train, y_train)
    dds.keep(path_model_stats, model_stats, clf, X_test, y_test)
    print("*** done ***")


dds.eval(pipeline)


INFO:dds:Interaction tree:
INFO:dds:`- Fun <__main__.pipeline> None <- ac54dc585f8ac8156dae937ca7e1c8bfc9b4bc07793d983619a0506f144e1688
INFO:dds:   |- dep: path_model -> <__main__.path_model>: 4697524b4c751aacee319944ec2f0a265336dc96ff692488d7cbc071fc6a3733
INFO:dds:   |- dep: path_model_stats -> <__main__.path_model_stats>: 110ff33eb1f89810ce6af6621ae232ae55f5408eb739bd82972c361bffaa48da
INFO:dds:   |- Fun <__main__.load_data> None <- 2880f4ba2f0ce029bf961232667d184af1073b4a091ddf67e54334331f0400c7
INFO:dds:   |  |- dep: path_raw -> <__main__.path_raw>: a1a3ce5966bea811120114207d004b9d61fea3f19ca024e5d8d0632fb90c40f4
INFO:dds:   |  `- Fun <__main__._load_data> /wine-quality/raw <- ccd1ea31245b11507b79fdf02946f9438d7de3b6933ac87348ed8cb94a7bc0d5
INFO:dds:   |- Fun <__main__.build_model> /wine-quality/my_model <- 9fe95b32120281e19e4eeda64b01d84b374ec94b3d0e1438cd2bae248f1efb45
INFO:dds:   `- Fun <__main__.model_stats> /wine-quality/my_model_stats.json <- e3962df761a297029b30b735b61312fc

*** in build_model ***


INFO:dds:_eval:Evaluating (keep:/wine-quality/my_model) fun <function build_model at 0x7f4fa58db5e0>: completed
INFO:dds:_eval:Storing blob into key 9fe95b32120281e19e4eeda64b01d84b374ec94b3d0e1438cd2bae248f1efb45
INFO:dds:_eval:Evaluating (keep:/wine-quality/my_model_stats.json) fun <function model_stats at 0x7f4fa58db700> with args ["<class 'sklearn.model_selection._search.GridSearchCV'>", "<class 'pandas.core.frame.DataFrame'>", "<class 'pandas.core.series.Series'>"] kwargs OrderedDict()
INFO:dds:_eval:Evaluating (keep:/wine-quality/my_model_stats.json) fun <function model_stats at 0x7f4fa58db700>: completed
INFO:dds:_eval:Storing blob into key e3962df761a297029b30b735b61312fcbeb44a2373fcaffa87015648db49bb29
INFO:dds:_eval_new_ctx:Evaluating (eval) fun <function pipeline at 0x7f4fa58db790>: completed
INFO:dds.store:Link /tmp/data/wine-quality/my_model -> /tmp/blobs/9fe95b32120281e19e4eeda64b01d84b374ec94b3d0e1438cd2bae248f1efb45
INFO:dds.store:Link /tmp/data/wine-quality/my_model_st

*** in model_stats ***
*** done ***


In [6]:
%%sh
cat /tmp/data/wine-quality/my_model_stats.json

{"r2_score": 0.4838757229320779, "mse": 0.33310625000000005}

In [7]:
    
    # # 8. Refit on the entire training set
# # No additional code needed if clf.refit == True (default is True)
 
# # 9. Evaluate model pipeline on test data
# pred = clf.predict(X_test)
# print r2_score(y_test, pred)
# print mean_squared_error(y_test, pred)
 
# # 10. Save model for future use
# joblib.dump(clf, 'rf_regressor.pkl')