This tutorial is an adaptation of the Machine Learning tutorial from Elite Data Science. The original tutorial is here:

https://elitedatascience.com/python-machine-learning-tutorial-scikit-learn

Let us start with a few imports

In [None]:
import sklearn
import pandas as pd
import numpy as np
 
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import requests
import io
import json

We now import the `dds` package. By default, the data will be stored in the temporary directory.

In [None]:
import dds

The two internal directories (`data` and `internal`) have now been created.

Let's start with the familiar problem of accessing data from the internet. This piece of code downloads a dataset, with the additional twist that the dataset is cached on the local machine.

In [None]:
# dds paths passed to dds.keep() in pipeline(): one for the fitted model,
# one for the JSON string of evaluation statistics.
path_model = "/wine-quality/my_model"
path_model_stats = "/wine-quality/my_model_stats.json"

@dds.data_function("/wine-quality/raw")
def data():
    """Download the red-wine quality dataset and return it as a DataFrame.

    The function is registered with ``dds`` under the path
    ``/wine-quality/raw``.

    Returns:
        pandas.DataFrame: the semicolon-separated CSV parsed by pandas.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: on connection failure or timeout.
    """
    print("*** in _load_data ***")
    url = "https://raw.githubusercontent.com/zygmuntz/wine-quality/master/winequality/winequality-red.csv"
    # Keep TLS certificate verification enabled (the requests default):
    # verify=False would let a man-in-the-middle substitute the dataset.
    # The timeout prevents the call from hanging forever on a dead connection.
    response = requests.get(url=url, timeout=30)
    # Fail loudly on an HTTP error instead of parsing (and caching) an error
    # page as if it were CSV data.
    response.raise_for_status()
    return pd.read_csv(io.StringIO(response.content.decode('utf8')), sep=";")


In [None]:
# Run dds.eval on `data`, restricted to the "analysis" stage, with extra debug
# output and a graph export written to /tmp/2.png.
# NOTE(review): presumably the "analysis" stage only inspects dependencies
# without executing `data` -- confirm against the dds documentation.
dds.eval(data, dds_export_graph="/tmp/2.png", dds_extra_debug=True, dds_stages=["analysis"])
from IPython.display import Image
# Last expression of the cell: builds an IPython Image from the exported file
# so the notebook renders it.
Image("/tmp/2.png")

In [None]:
# Preview the first three rows of the DataFrame returned by data().
data().head(3)

In [None]:
def build_model(X_train, y_train):
    """Fit a grid-searched random-forest regression pipeline.

    The pipeline standardizes the features and then fits a
    ``RandomForestRegressor`` with 100 trees. ``GridSearchCV`` with 10-fold
    cross-validation searches over ``max_features`` and ``max_depth``.

    Args:
        X_train: training features, passed unchanged to ``GridSearchCV.fit``.
        y_train: training targets, passed unchanged to ``GridSearchCV.fit``.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    print("*** in build_model ***")
    # Named `estimator` rather than `pipeline` so the local does not shadow
    # the module-level pipeline() function.
    estimator = make_pipeline(preprocessing.StandardScaler(),
                              RandomForestRegressor(n_estimators=100))
    # `None` means "use all features". It replaces the former 'auto' option,
    # which meant the same thing for regressors but was deprecated in
    # scikit-learn 1.1 and removed in 1.3 (where it makes those fits fail).
    hyperparameters = {
        'randomforestregressor__max_features': [None, 'sqrt', 'log2'],
        'randomforestregressor__max_depth': [None, 5, 3, 1],
    }

    clf = GridSearchCV(estimator, hyperparameters, cv=10)
    clf.fit(X_train, y_train)
    return clf
 
    
def model_stats(clf, X_test, y_test) -> str:
    """Evaluate a fitted model on held-out data and return the stats as JSON.

    Args:
        clf: fitted estimator exposing ``predict``.
        X_test: held-out features passed to ``clf.predict``.
        y_test: held-out targets compared against the predictions.

    Returns:
        A JSON object string with a single key, ``"mse"``, holding the mean
        squared error of the predictions.
    """
    print("*** in model_stats ***")
    predictions = clf.predict(X_test)
    stats = {"mse": mean_squared_error(y_test, predictions)}
    return json.dumps(stats)
    
    
def pipeline():
    """Run the end-to-end workflow: load data, split, train, evaluate.

    The dataset comes from ``data()``. The ``quality`` column is the target
    and every other column is a feature. 15% of the rows are held out for
    testing, stratified on the target with a fixed seed. The model and its
    statistics are produced through ``dds.keep`` under ``path_model`` and
    ``path_model_stats``.
    """
    frame = data()
    features = frame.drop('quality', axis=1)
    target = frame.quality

    split = train_test_split(
        features,
        target,
        test_size=0.15,
        random_state=123,
        stratify=target,
    )
    X_train, X_test, y_train, y_test = split

    # The value returned by the first dds.keep call is handed to model_stats
    # as the fitted model.
    fitted = dds.keep(path_model, build_model, X_train, y_train)
    dds.keep(path_model_stats, model_stats, fitted, X_test, y_test)
    print("*** done ***")


# dds.eval(pipeline, dds_export_graph="1.png")


In [None]:
%%sh
# cat /tmp/data/wine-quality/my_model_stats.json

In [None]:

@dds.data_function("/p1")
def f1():
    """Return the constant 2 (dds data function bound to path "/p1")."""
    return 2

@dds.data_function("/p2")
def f2():
    """Return 2 plus the result of f1() (dds data function bound to "/p2")."""
    base = f1()
    return 2 + base

@dds.data_function("/p3")
def f3():
    """Return 1 + f1() + f2() (dds data function bound to "/p3")."""
    # f1 is called before f2, matching left-to-right evaluation of the sum.
    first = f1()
    second = f2()
    return 1 + first + second

@dds.data_function("/p4")
def f4():
    """Return the constant 1 (dds data function bound to path "/p4")."""
    return 1


def f():
    """Call f4() and then f3(), discarding both results; returns None."""
    # f4 has no dependencies on the other data functions.
    f4()
    # f3 in turn calls f1 and f2.
    f3()

# Run dds.eval on `f`, restricted to the "analysis" stage, with extra debug
# output and a graph export written to /tmp/2.png (the same file the earlier
# `data` graph cell writes to).
# NOTE(review): presumably the "analysis" stage only inspects dependencies
# without executing f1..f4 -- confirm against the dds documentation.
dds.eval(f, dds_export_graph="/tmp/2.png", dds_extra_debug=True, dds_stages=["analysis"])
from IPython.display import Image
# Last expression of the cell: builds an IPython Image from the exported file
# so the notebook renders it.
Image("/tmp/2.png")

In [None]:
# Call f directly rather than through dds.eval; this invokes f4() and f3().
f()