In [1]:
cd ..

/Users/yonatanalexander/development/xdss/goldilox


# [HDBSCAN](https://hdbscan.readthedocs.io/en/latest/index.html)

## Sklearn

In [2]:
from goldilox.datasets import make_blobs

df, features, target = make_blobs()
df.head(2)

Unnamed: 0,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,target
0,-9.404628,-6.615111,-4.451656,-10.918865,1.669457,-0.467001,-4.413303,-1.627234,3.257895,5.848858,0
1,-9.621981,-7.015441,-7.47403,-10.667444,2.174747,-1.838034,-4.871131,-1.797716,4.465596,6.266632,0


# Vaex

In [7]:
from goldilox.datasets import make_blobs

df, features, target = make_blobs()
df.head(2)

Unnamed: 0,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,target
0,0.548481,8.833977,-2.956357,7.750154,-6.745776,10.764411,-3.103165,2.602318,-1.278969,-0.083086,1
1,-2.766979,-6.480697,4.508302,7.172464,-7.873933,-1.597331,0.029204,-6.567046,-1.183978,0.237812,2


In [8]:
import numpy as np
import vaex
from hdbscan import HDBSCAN, approximate_predict

df = vaex.from_pandas(df)
for feature in features:
    df[feature] = df[feature].fillna(df[feature].mean())
model = HDBSCAN(prediction_data=True)
model.fit(df[features], df['target'])


@vaex.register_function()
def hdbscan(*columns):
    data = np.array(columns).T
    labels, _ = approximate_predict(model, data)
    return labels


df.add_function('hdbscan', hdbscan)
df['cluster'] = df.func.hdbscan(*features)

Let's build a production pipeline

In [9]:
from goldilox import Pipeline
import json

pipeline = Pipeline.from_vaex(df)

# I/O Example
print(f"predict for {json.dumps(pipeline.raw, indent=4)}")
pipeline.inference(pipeline.raw)

predict for {
    "feature0": 0.5484814115742233,
    "feature1": 8.833977435034102,
    "feature2": -2.9563565986601272,
    "feature3": 7.750153814007809,
    "feature4": -6.745776195525505,
    "feature5": 10.764410603844501,
    "feature6": -3.103164925629103,
    "feature7": 2.602318408173805,
    "feature8": -1.27896912181085,
    "feature9": -0.08308609471404838,
    "target": 1
}


#,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,target,cluster
0,0.548481,8.83398,-2.95636,7.75015,-6.74578,10.7644,-3.10316,2.60232,-1.27897,-0.0830861,1,0


## Sklearn

We need to make hdbscan sklearn transformer - not much code.

In [12]:
from sklearn.base import TransformerMixin, BaseEstimator
from hdbscan import HDBSCAN, approximate_predict


class HDBSCANTransformer(TransformerMixin, BaseEstimator):

    def __init__(self, prediction_column='cluster', **kwargs):
        kwargs['prediction_data'] = True
        self.model = HDBSCAN(**kwargs)
        self.prediction_column = prediction_column

    def fit(self, X, y):
        self.model.fit(X, y)
        return self

    def predict(self, X):
        labels, strength = approximate_predict(self.model, X)
        return labels

    def transform(self, X):
        X[self.prediction_column] = self.predict(X)
        return X
    


In [13]:
from goldilox import Pipeline

model = HDBSCANTransformer().fit(X, y)
pipeline = Pipeline.from_sklearn(model).fit(X, y)
pipeline.inference(pipeline.raw)



Unnamed: 0,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,cluster
0,-9.404628,-6.615111,-4.451656,-10.918865,1.669457,-0.467001,-4.413303,-1.627234,3.257895,5.848858,0


# Vaex

The pipeline is ready to go, but it does not handle missing values which we might get in production.   
Let's fix that!

In [60]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import sklearn.pipeline

imputer = ColumnTransformer([('features_mean', SimpleImputer(strategy='mean'), features)], remainder='passthrough')
sklearn_pipeline = sklearn.pipeline.Pipeline([('imputer', imputer), ('kmean', KMeans())])
pipeline = Pipeline.from_sklearn(sklearn_pipeline, features=features,
                                 output_columns=['cluster']).fit(df)

# I/O Example
print(f"predict for {json.dumps(pipeline.raw, indent=4)}")
pipeline.inference(pipeline.raw)

predict for {
    "sepal_length": 5.1,
    "sepal_width": 3.5,
    "petal_length": 1.4,
    "petal_width": 0.2
}


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,cluster
0,5.1,3.5,1.4,0.2,1


In [7]:
pipeline.save('pipeline.pkl')
print('Go to the fastapi docs here: http://127.0.0.1:5000/docs')
!gl serve 'pipeline.pkl'

Go to the fasgtapi docs here: http://127.0.0.1:5000/docs
[2021-11-26 12:10:04 +0100] [39494] [INFO] Starting gunicorn 20.1.0
[2021-11-26 12:10:04 +0100] [39494] [INFO] Listening at: http://127.0.0.1:5000 (39494)
[2021-11-26 12:10:04 +0100] [39494] [INFO] Using worker: uvicorn.workers.UvicornH11Worker
[2021-11-26 12:10:04 +0100] [39527] [INFO] Booting worker with pid: 39527
[2021-11-26 12:10:04 +0100] [39527] [INFO] Started server process [39527]
[2021-11-26 12:10:04 +0100] [39527] [INFO] Waiting for application startup.
[2021-11-26 12:10:04 +0100] [39527] [INFO] Application startup complete.
^C
[2021-11-26 12:10:21 +0100] [39494] [INFO] Handling signal: int
