In [1]:
%%sh
# Start from a clean slate: drop any previous blob and data directories,
# then recreate the (empty) data directory.
rm -rf /tmp/blobs /tmp/data
mkdir /tmp/data

In [2]:
# Adaptation of https://elitedatascience.com/python-machine-learning-tutorial-scikit-learn
import dds

import sklearn
import pandas as pd
import logging
logging.basicConfig(level=logging.INFO)

import numpy as np
import pandas as pd
 
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import requests
import io
import json

In [7]:
# Paths handed to dds.keep for each artifact produced in this notebook.
path_raw = "/wine-quality/raw"  # table returned by _load_data
path_model = "/wine-quality/my_model"  # fitted search object returned by build_model
path_model_stats = "/wine-quality/my_model_stats.json"  # JSON string returned by model_stats

def _load_data():
    """Download the red-wine quality CSV and parse it into a DataFrame.

    Returns:
        pandas.DataFrame parsed from the semicolon-separated file.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection failure or timeout.
    """
    print("**** load_data ****")
    url = "https://raw.githubusercontent.com/zygmuntz/wine-quality/master/winequality/winequality-red.csv"
    # Certificate verification is left at the requests default (enabled);
    # the timeout keeps a stalled connection from hanging the cell forever.
    response = requests.get(url=url, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page as CSV.
    response.raise_for_status()
    return pd.read_csv(io.StringIO(response.content.decode('utf8')), sep=";")

def load_data():
    """Return the result of `_load_data`, routed through `dds.keep` under `path_raw`."""
    return dds.keep(path_raw, _load_data)

# Evaluate load_data through dds so the table is kept under path_raw.
data = dds.eval(load_data)

# Preview the first three rows of the loaded table.
data.head(3)
    

INFO:dds:Interaction tree:
INFO:dds:`- Fun <__main__.load_data> None <- 258172310df073d19e828575cdc9eb2b44a405a0e6c42c5f3697b01eec119833
INFO:dds:   |- dep: path_raw -> <__main__.path_raw>: a1a3ce5966bea811120114207d004b9d61fea3f19ca024e5d8d0632fb90c40f4
INFO:dds:   `- Fun <__main__._load_data> /wine-quality/raw <- 5679a341642c7130a2963dd7c29d6b4d9c82fc14af03b9fc7c919c74576b9520
INFO:dds:Evaluating (eval) fun <function load_data at 0x7fbbffb025e0> with args [] kwargs OrderedDict()
INFO:dds:Evaluating (eval) fun <function load_data at 0x7fbbffb025e0>: completed
INFO:dds.store:Link /tmp/data/wine-quality/raw -> /tmp/blobs/5679a341642c7130a2963dd7c29d6b4d9c82fc14af03b9fc7c919c74576b9520


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5


In [15]:
def build_model(X_train, y_train):
    """Grid-search a standardised random-forest regressor on the training data.

    Args:
        X_train: training features, passed unchanged to ``GridSearchCV.fit``.
        y_train: training targets, passed unchanged to ``GridSearchCV.fit``.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    print("**** build_model ****")
    # A fixed seed makes repeated runs build the same forest. The local is
    # named model_pipeline so it does not shadow the module-level pipeline().
    model_pipeline = make_pipeline(
        preprocessing.StandardScaler(),
        RandomForestRegressor(n_estimators=100, random_state=123))
    # 1.0 (use all features) replaces 'auto', which meant the same thing for
    # regressors but was deprecated in scikit-learn 1.1 and removed in 1.3.
    hyperparameters = {
        'randomforestregressor__max_features': [1.0, 'sqrt', 'log2'],
        'randomforestregressor__max_depth': [None, 5, 3],
    }

    clf = GridSearchCV(model_pipeline, hyperparameters, cv=10)
    clf.fit(X_train, y_train)
    return clf
 
    
def model_stats(clf, X_test, y_test):
    """Score a fitted model on held-out data and return the metrics as JSON text.

    Args:
        clf: object exposing ``predict``; called once on ``X_test``.
        X_test: features to predict on.
        y_test: true targets compared against the predictions.

    Returns:
        A JSON string with the keys ``"r2_score"`` and ``"mse"``.
    """
    print("**** model_stats ****")
    predictions = clf.predict(X_test)
    stats = {}
    stats["r2_score"] = r2_score(y_test, predictions)
    stats["mse"] = mean_squared_error(y_test, predictions)
    return json.dumps(stats)
    
    
def pipeline():
    """Run the whole workflow: load the data, split it, fit the model, store its test metrics.

    Returns nothing; the fitted model and its stats are handed to `dds.keep`
    under `path_model` and `path_model_stats` respectively.
    """
    wine_data = load_data()
    # The target is the quality column; every other column is a feature.
    y = wine_data.quality
    X = wine_data.drop('quality', axis=1)
    # Hold out 15% of the rows for testing, stratified on the target, with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                        test_size=0.15, 
                                                        random_state=123, 
                                                        stratify=y)
    # Fit on the training split, then score the kept model on the test split.
    clf = dds.keep(path_model, build_model, X_train, y_train)
    dds.keep(path_model_stats, model_stats, clf, X_test, y_test)


# Evaluate the full pipeline through dds.
dds.eval(pipeline)


INFO:dds:Interaction tree:
INFO:dds:`- Fun <__main__.pipeline> None <- 5e8c461517e81e9a7e2e6b1337ee147fe61ccce96d9071cdebb6dba7b16301ba
INFO:dds:   |- dep: path_model -> <__main__.path_model>: 4697524b4c751aacee319944ec2f0a265336dc96ff692488d7cbc071fc6a3733
INFO:dds:   |- dep: path_model_stats -> <__main__.path_model_stats>: 110ff33eb1f89810ce6af6621ae232ae55f5408eb739bd82972c361bffaa48da
INFO:dds:   |- Fun <__main__.load_data> None <- 108f442b6e30c13f22758ae38032b30e4eda9cd46f8e8f56f02a3a01b563ec94
INFO:dds:   |  |- dep: path_raw -> <__main__.path_raw>: a1a3ce5966bea811120114207d004b9d61fea3f19ca024e5d8d0632fb90c40f4
INFO:dds:   |  `- Fun <__main__._load_data> /wine-quality/raw <- ca0b795f3aa53111810358c3519b3cd0249b51df1c9bd92245aaecc8a1ead17a
INFO:dds:   |- Fun <__main__.build_model> /wine-quality/my_model <- 826c081d4f7f518a34557c573dc35549064c78da7aaf9641c563bbf2b5061091
INFO:dds:   `- Fun <__main__.model_stats> /wine-quality/my_model_stats.json <- 89950d80e2b87822d061532669534fa9

In [14]:
%%sh
# Print the model stats file from the /tmp/data directory.
cat /tmp/data/wine-quality/my_model_stats.json

{"mse": 0.33753916666666667}

In [6]:
    
    # # 8. Refit on the entire training set
# # No additional code needed if clf.refit == True (default is True)
 
# # 9. Evaluate model pipeline on test data
# pred = clf.predict(X_test)
# print r2_score(y_test, pred)
# print mean_squared_error(y_test, pred)
 
# # 10. Save model for future use
# joblib.dump(clf, 'rf_regressor.pkl')