# All the ML
Let's try to solve as many machine learning tasks as possible on the same dataset.

In [12]:
import warnings

import numpy as np
import pyarrow as pa
import vaex
from sklearn.cluster import KMeans
from vaex.ml.lightgbm import LightGBMModel
from vaex.ml.sklearn import Predictor
from vaex.ml.xgboost import XGBoostModel

from goldilox import Pipeline
from goldilox.datasets import load_iris

# Silence library warnings so the cell output stays readable.
warnings.filterwarnings('ignore')

# load_iris() hands back the frame plus the feature and target column names;
# the frame is converted from pandas to vaex for the rest of the notebook.
df, features, target = load_iris()
df = vaex.from_pandas(df)

# feature engineering example: replace missing values with the column mean
for feature in features:
    column_mean = df[feature].mean()
    df[feature] = df[feature].fillna(column_mean)

# Derived feature, appended to the (re-ordered) feature list used by every model below.
petal_ratio = df['petal_length'] / df['petal_width']
df['petal_ratio'] = petal_ratio
features = ['petal_length', 'petal_width', 'sepal_length', 'sepal_width', 'petal_ratio']

# classification
# Booster settings: silent training, 3-class multiclass objective.
lgm_params = {
    'verbosity': -1,
    'objective': 'multiclass',
    'num_class': 3,
}
lgm = LightGBMModel(
    features=features,
    target=target,
    prediction_name='classification',
    num_boost_round=500,
    params=lgm_params,
)
lgm.fit(df)
# transform() returns a frame that carries the new 'classification' column.
df = lgm.transform(df)

# regression
# Same features and target as the classifier, fitted with a squared-error objective.
xgb_params = {'objective': 'reg:squarederror'}
xgb = XGBoostModel(features=features,
                   target=target,
                   prediction_name="regression",
                   params=xgb_params,
                   num_boost_round=500)
xgb.fit(df)
# transform() returns a frame that carries the new 'regression' column.
df = xgb.transform(df)
# clustering
# Pin the seed: KMeans chooses its initial centroids at random, so without a fixed
# random_state the cluster ids (consumed by the nearest-neighbour index and the
# recommender further down) change from one run to the next.
kmeans = Predictor(model=KMeans(random_state=42), features=features, prediction_name='cluster')
kmeans.fit(df)
# transform() returns a frame that carries the new 'cluster' column.
df = kmeans.transform(df)


# post model processing example
@vaex.register_function()
def argmax(ar, axis=1):
    """Return the index of the largest value along ``axis``.

    Parameters
    ----------
    ar : array-like
        Values to reduce; forwarded unchanged to ``np.argmax``.
    axis : int, default 1
        Axis along which the maximum is searched.

    Returns
    -------
    Whatever ``np.argmax`` returns for the given input and axis.
    """
    winning_index = np.argmax(ar, axis=axis)
    return winning_index


# Attach the helper to the frame, then reduce the 'classification' output
# to a single class index per row.
df.add_function("argmax", argmax)
class_scores = df["classification"]
df["prediction"] = class_scores.argmax()

# Translate class indices into species names.
species_by_class = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
df["label"] = df["prediction"].map(species_by_class)
df.head(2)

#,sepal_length,sepal_width,petal_length,petal_width,target,petal_ratio,classification,regression,cluster,prediction,label
0,5.1,3.5,1.4,0.2,0,7,"'array([9.99999943e-01, 5.70021072e-08, 4.053823...",2.17295e-05,1,0,setosa
1,4.9,3.0,1.4,0.2,0,7,"'array([9.99999756e-01, 2.15645920e-07, 2.793808...",0.000246913,1,0,setosa


## Nearest neighbours

In [13]:
from hnswlib import Index
import numpy as np

# Build index
# possible options for the space are l2, cosine or ip
index = Index(space="l2", dim=len(features))
index.init_index(max_elements=len(df), ef_construction=200, M=16)

# Feed the frame to the index in pandas chunks; the cluster id is passed as the item label.
# NOTE(review): hnswlib labels act as element ids, so rows sharing a cluster id
# presumably replace one another in the index — confirm this is intended.
index_columns = features + ['cluster']
for _start, _stop, batch in df[index_columns].to_pandas_df(chunk_size=1000):
    index.add_items(batch[features], batch["cluster"])

index.set_ef(50)  # ef should always be > k (Controlling the recall by setting ef)


# Add to Dataframe
@vaex.register_function(on_expression=False)
def topk(*columns, k=3):
    """Query the module-level ``index`` for the ``k`` nearest labels of each row.

    Parameters
    ----------
    *columns
        One array per feature; they are stacked and transposed so that each
        row of the query matrix is one sample.
    k : int, default 3
        Number of neighbours requested from ``index.knn_query``.

    Returns
    -------
    numpy.ndarray
        The labels returned by ``index.knn_query``; the second returned value is discarded.
    """
    queries = np.array(columns).T
    neighbour_labels, _unused = index.knn_query(queries, k=k)
    return np.array(neighbour_labels)


# Attach the lookup to the frame and store its result as the 'knn' column.
df.add_function('topk', topk)
knn_expression = df.func.topk(*features)
df['knn'] = knn_expression
df.head(1)

#,sepal_length,sepal_width,petal_length,petal_width,target,petal_ratio,classification,regression,cluster,prediction,label,knn
0,5.1,3.5,1.4,0.2,0,7,"'array([9.99999943e-01, 5.70021072e-08, 4.053823...",2.17295e-05,1,0,setosa,"array([1, 4, 6], dtype=uint64)"


# Explain

In [15]:
import shap
import pandas as pd

# Number of leading columns that explain() treats as model features.
feature_count = len(features)
# Distinct values of the target column.
targets = df[target].unique()
# SHAP tree explainer built on the fitted LightGBM booster.
explainer = shap.TreeExplainer(lgm.booster)


@vaex.register_function(on_expression=False)
def explain(*columns):
    """Build a per-row ``{feature name: SHAP effect}`` mapping for one class per row.

    Parameters
    ----------
    *columns
        The first ``feature_count`` arrays are the model features (in
        ``features`` order); the last array holds the class index whose
        SHAP values should be reported for that row.

    Returns
    -------
    pyarrow.Array
        One mapping per input row, built with ``pa.array``.
    """
    data = np.array(columns).T
    X = pd.DataFrame(data[:, :feature_count], columns=features)
    y = data[:, -1]
    shap_values = explainer.shap_values(X)
    explanation = []

    for i, c in enumerate(y):
        # The class column went through np.array stacking with the features,
        # so cast it back to int before using it as an index.
        c = int(c)
        e = shap.force_plot(explainer.expected_value[c], shap_values[c][i, :], X.iloc[i]).data
        names = e['featureNames']
        # e['features'] is keyed by feature position. NOTE(review): shap appears to
        # leave out features whose effect is exactly zero, so pairing names with
        # values by order can shift names onto the wrong effects. Resolving the
        # name through the key is correct whether or not entries are omitted.
        explanation.append(
            {names[int(position)]: effects['effect'] for position, effects in e['features'].items()})
    return pa.array(explanation)


df.add_function("explain", explain)
# Feature columns first, predicted class last: explain() reads the class from the final column.
explain_columns = [*features, 'prediction']
df["explanation"] = df.func.explain(*explain_columns)
df.head(1)

#,sepal_length,sepal_width,petal_length,petal_width,target,petal_ratio,classification,regression,cluster,prediction,label,knn,explanation
0,5.1,3.5,1.4,0.2,0,7,"'array([9.99999943e-01, 5.70021072e-08, 4.053823...",2.17295e-05,1,0,setosa,"array([1, 4, 6], dtype=uint64)","""{'petal_length': 7.824491447488607, 'petal_rati..."


# Recommender

In [16]:
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares
import pyarrow as pa

# One unit-weight interaction per row of the frame: matrix row = predicted class,
# matrix column = cluster id.
interaction_weights = np.ones(len(df))
row_ids = df.prediction.values
col_ids = df.cluster.values
ratings = csr_matrix((interaction_weights, (row_ids, col_ids)))

als = AlternatingLeastSquares(factors=32)
als.fit(ratings)
# Transposed copy in CSR form, passed to als.recommend() by recommend_als().
user_items = ratings.T.tocsr()


@vaex.register_function()
def recommend_als(ar, topk=5, filter_already_liked_items=False):
    """Ask the fitted ``als`` model for recommendations for every id in ``ar``.

    Parameters
    ----------
    ar : array
        Ids passed one at a time to ``als.recommend`` (converted with ``tolist()``).
    topk : int, default 5
        Forwarded as ``N`` to ``als.recommend``.
    filter_already_liked_items : bool, default False
        Forwarded unchanged to ``als.recommend``.

    Returns
    -------
    numpy.ndarray
        For each id, the first element of every entry returned by ``als.recommend``.
    """

    def _recommended_ids(user):
        # Keep only the leading element of each returned entry.
        ranked = als.recommend(
            user,
            user_items,
            N=topk,
            filter_already_liked_items=filter_already_liked_items,
        )
        return [entry[0] for entry in ranked]

    return np.array([_recommended_ids(user) for user in ar.tolist()])


# Attach the recommender to the frame and derive recommendations from the predicted class.
df.add_function("recommend_als", recommend_als)
predicted_class = df["prediction"]
df["recommendations"] = predicted_class.recommend_als()
df.head(2)

  0%|          | 0/15 [00:00<?, ?it/s]

#,sepal_length,sepal_width,petal_length,petal_width,target,petal_ratio,classification,regression,cluster,prediction,label,knn,explanation,recommendations
0,5.1,3.5,1.4,0.2,0,7,"'array([9.99999943e-01, 5.70021072e-08, 4.053823...",2.17295e-05,1,0,setosa,"array([1, 4, 6], dtype=uint64)","""{'petal_length': 7.824491447488607, 'petal_rati...","array([1, 2, 0])"
1,4.9,3.0,1.4,0.2,0,7,"'array([9.99999756e-01, 2.15645920e-07, 2.793808...",0.000246913,1,0,setosa,"array([1, 4, 6], dtype=uint64)","""{'petal_length': 8.066329204345955, 'petal_rati...","array([1, 2, 0])"


# Go to production?

In [18]:
# Build a goldilox pipeline from the vaex frame and run it on the pipeline's own raw example.
# NOTE(review): pipeline.raw is presumably a raw input sample stored by from_vaex — confirm.
pipeline = Pipeline.from_vaex(df)
raw_example = pipeline.raw
pipeline.inference(raw_example)

#,sepal_length,sepal_width,petal_length,petal_width,target,petal_ratio,classification,regression,cluster,prediction,label,knn,explanation,recommendations
0,5.1,3.5,1.4,0.2,0,7,"'array([9.99999943e-01, 5.70021072e-08, 4.053823...",2.17295e-05,1,0,setosa,"array([1, 4, 6], dtype=uint64)","""{'petal_length': 7.824491447488607, 'petal_rati...","array([1, 2, 0])"


In [19]:
# Write the pipeline to disk so the glx CLI below can serve it from the file.
pipeline.save('pipeline.pkl')
# Print the docs URL before starting the server: the serve command keeps
# running until it is interrupted, so nothing after it executes in the meantime.
print('Go to http://127.0.0.1:8000/docs')
!glx serve pipeline.pkl

Go to http://127.0.0.1:8000/docs
[2022-02-01 17:21:57 +0100] [87184] [INFO] Starting gunicorn 20.1.0
[2022-02-01 17:21:57 +0100] [87184] [INFO] Listening at: http://127.0.0.1:8000 (87184)
[2022-02-01 17:21:57 +0100] [87184] [INFO] Using worker: uvicorn.workers.UvicornH11Worker
[2022-02-01 17:21:57 +0100] [87197] [INFO] Booting worker with pid: 87197
[2022-02-01 17:21:57 +0100] [87197] [INFO] Started server process [87197]
[2022-02-01 17:21:57 +0100] [87197] [INFO] Waiting for application startup.
[2022-02-01 17:21:57 +0100] [87197] [INFO] Application startup complete.
^C
[2022-02-01 17:22:10 +0100] [87184] [INFO] Handling signal: int
[2022-02-01 17:22:10 +0100] [87184] [INFO] Shutting down: Master
