# [HDBSCAN](https://hdbscan.readthedocs.io/en/latest/index.html)

## Data

In [8]:
from goldilox.datasets import make_blobs

# make_blobs() hands back the data frame plus the selectors used below for
# the feature columns and the target column.
df, features, target = make_blobs()
X = df[features]
y = df[target]
df.head(2)

Unnamed: 0,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,target
0,5.850949,5.314459,3.507009,5.746078,-7.654765,10.271936,0.520728,-4.270634,8.166574,-3.727932,0
1,7.262032,6.604955,4.342837,5.992455,-7.020435,11.561267,3.453387,-1.891555,8.82497,-3.954333,0


## Sklearn

We need to wrap HDBSCAN in an sklearn transformer - it does not take much code.

In [9]:
import numpy as np
from goldilox import Pipeline
from hdbscan import HDBSCAN, approximate_predict
from sklearn.base import TransformerMixin, BaseEstimator


class HDBSCANTransformer(TransformerMixin, BaseEstimator):
    """Sklearn-style wrapper that appends HDBSCAN cluster labels to the input.

    The wrapped ``HDBSCAN`` model is always created with
    ``prediction_data=True`` so that rows can be labelled afterwards through
    ``approximate_predict``.

    Parameters
    ----------
    prediction_column : str, default='cluster'
        Name of the column added by ``transform`` when the input is a
        DataFrame-like object.
    **kwargs
        Forwarded to ``HDBSCAN``. ``prediction_data`` is overridden to True.
    """

    def __init__(self, prediction_column='cluster', **kwargs):
        # Forced on: predict() relies on approximate_predict(), which is the
        # reason the model is built with prediction data enabled.
        kwargs['prediction_data'] = True
        # NOTE(review): the extra HDBSCAN kwargs are only stored inside
        # self.model, so they are presumably invisible to get_params()/clone()
        # -- confirm before using this class in a grid search.
        self.model = HDBSCAN(**kwargs)
        self.prediction_column = prediction_column

    def fit(self, X, y=None):
        """Fit the wrapped HDBSCAN model.

        ``y`` is optional (clustering is unsupervised); it is passed through
        to ``HDBSCAN.fit`` unchanged so existing ``fit(X, y)`` calls keep
        working.

        Returns
        -------
        self
        """
        self.model.fit(X, y)
        return self

    def predict(self, X):
        """Return the cluster label of every row in ``X``.

        Labels come from ``approximate_predict``; the membership strengths it
        also returns are discarded.
        """
        labels, _strengths = approximate_predict(self.model, X)
        return labels

    def transform(self, X):
        """Return ``X`` with the predicted cluster labels appended.

        The input is never modified. DataFrame-like inputs (anything exposing
        ``columns``) come back as a copy with an extra column named
        ``prediction_column``; any other array-like input comes back as a
        numpy array with the labels stacked on as the last column.
        """
        labels = self.predict(X)
        if hasattr(X, 'columns'):
            # Work on a copy so the caller's frame is left untouched.
            result = X.copy()
            result[self.prediction_column] = labels
            return result
        # Plain arrays cannot be indexed by a column name, so append the
        # labels positionally instead.
        return np.column_stack((X, labels))


# Build the goldilox pipeline around a fresh transformer and fit it.
transformer = HDBSCANTransformer()
pipeline = Pipeline.from_sklearn(transformer).fit(X, y)
# Smoke test: run inference on the pipeline's own `raw` example.
pipeline.inference(pipeline.raw)



Unnamed: 0,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,cluster
0,5.850949,5.314459,3.507009,5.746078,-7.654765,10.271936,0.520728,-4.270634,8.166574,-3.727932,0


The pipeline is ready to go, but it does not handle missing values which we might get in production.   
Let's fix that!

In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import sklearn.pipeline

# Fill missing values in the feature columns with the mean strategy before
# clustering; any other column is passed through untouched.
imputer = ColumnTransformer(
    [('features_mean', SimpleImputer(strategy='mean'), features)],
    remainder='passthrough',
)
# Step names show up in named_steps and parameter prefixes, so the clustering
# step is named after the estimator it actually holds.
sklearn_pipeline = sklearn.pipeline.Pipeline([
    ('imputer', imputer),
    ('hdbscan', HDBSCANTransformer()),
])
pipeline = Pipeline.from_sklearn(
    sklearn_pipeline,
    features=features,
    output_columns=['cluster'],
).fit(df)

pipeline.inference(pipeline.raw)

Unnamed: 0,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,cluster
0,5.850949,5.314459,3.507009,5.746078,-7.654765,10.271936,0.520728,-4.270634,8.166574,-3.727932,0


## Vaex

In [13]:
import numpy as np
import vaex
from goldilox import Pipeline
from hdbscan import HDBSCAN, approximate_predict

# Switch from the pandas frame to a vaex DataFrame.
df = vaex.from_pandas(df)

# Replace missing values in every feature with that feature's mean.
# This will also fill in production.
for feature in features:
    feature_mean = df[feature].mean()
    df[feature] = df[feature].fillna(feature_mean)

# prediction_data=True is needed for approximate_predict() later on.
model = HDBSCAN(prediction_data=True).fit(df[features], df[target])


@vaex.register_function()
def hdbscan(*columns):
    """Label rows with the fitted module-level ``model``.

    Each positional argument is one feature column; the columns are stacked
    and transposed so that every row is one sample, then labelled with
    ``approximate_predict``. Only the labels are returned.
    """
    samples = np.transpose(np.array(columns))
    cluster_labels, _strengths = approximate_predict(model, samples)
    return cluster_labels


# Attach the function to this DataFrame under the name 'hdbscan'
# (presumably so it travels with the DataFrame state - confirm in vaex docs).
df.add_function('hdbscan', hdbscan)
cluster_expression = df.func.hdbscan(*features)
df['cluster'] = cluster_expression

pipeline = Pipeline.from_vaex(df)
pipeline.inference(pipeline.raw)

#,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,target,cluster
0,5.85095,5.31446,3.50701,5.74608,-7.65477,10.2719,0.520728,-4.27063,8.16657,-3.72793,0,0


# [Serve](https://docs.goldilox.io/reference/api-reference/cli/serve)

In [7]:
# Write the pipeline to 'pipeline.pkl' so the CLI command below can load it.
pipeline.save('pipeline.pkl')
print('Go to the fastapi docs here: http://127.0.0.1:5000/docs')
# Serve the saved pipeline with the goldilox CLI; the recorded output shows the
# server running until it is interrupted (^C), so this cell blocks.
!gl serve 'pipeline.pkl'

Go to the fastapi docs here: http://127.0.0.1:5000/docs
[2021-11-26 12:10:04 +0100] [39494] [INFO] Starting gunicorn 20.1.0
[2021-11-26 12:10:04 +0100] [39494] [INFO] Listening at: http://127.0.0.1:5000 (39494)
[2021-11-26 12:10:04 +0100] [39494] [INFO] Using worker: uvicorn.workers.UvicornH11Worker
[2021-11-26 12:10:04 +0100] [39527] [INFO] Booting worker with pid: 39527
[2021-11-26 12:10:04 +0100] [39527] [INFO] Started server process [39527]
[2021-11-26 12:10:04 +0100] [39527] [INFO] Waiting for application startup.
[2021-11-26 12:10:04 +0100] [39527] [INFO] Application startup complete.
^C
[2021-11-26 12:10:21 +0100] [39494] [INFO] Handling signal: int
