# All the ML

## Classification/Regression/Clustering

In [40]:
import vaex
import warnings
from vaex.ml.datasets import load_iris
from vaex.ml.lightgbm import LightGBMModel
from vaex.ml.xgboost import XGBoostModel
from goldilox import Pipeline
import numpy as np
import json
from vaex.ml.sklearn import Predictor
from sklearn.cluster import KMeans

warnings.filterwarnings('ignore')


df = load_iris()
target = 'class_'

# feature engineering example
df['petal_ratio'] = df['petal_length'] / df['petal_width'] 
features = ['petal_length', 'petal_width', 'sepal_length', 'sepal_width', 'petal_ratio']

# classification
lgm = LightGBMModel(features=features,
                        target=target,
                        prediction_name='classification',
                        num_boost_round=500, params={'verbosity': -1,
                                                     'objective': 'multiclass',
                                                     'num_class': 3})
lgm.fit(df)
df = lgm.transform(df)

# regression
xgb = XGBoostModel( 
    features=features,
    target=target,
    prediction_name="regression",
    params = {'objective':'reg:squarederror'},
    num_boost_round=500,
)
xgb.fit(df)
df = xgb.transform(df)
# clustering
kmeans = Predictor(model=KMeans(), features=features, prediction_name='cluster')
kmeans.fit(df)
df = kmeans.transform(df)

# post model processing example
@vaex.register_function()
def argmax(ar, axis=1):
    return np.argmax(ar, axis=axis)
df.add_function('argmax', argmax)
df['prediction'] = df['classification'].argmax()

df['label'] = df['prediction'].map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})

# Vaex remember all the transformations, this is a skleran.pipeline alternative
pipeline = Pipeline.from_vaex(df, description='simple lightGBM')
pipeline.raw.pop(target) # (optional) we don't expect to get the class_ in queries
assert pipeline.validate()
print("Pipeline raw data example:")
print(json.dumps(pipeline.raw, indent=4))
print('')
print("Pipeline output example:")
pipeline.inference(pipeline.raw).to_records()
df.head(2)

Pipeline raw data example:
{
    "sepal_length": 5.9,
    "sepal_width": 3.0,
    "petal_length": 4.2,
    "petal_width": 1.5
}

Pipeline output example:


#,sepal_length,sepal_width,petal_length,petal_width,class_,petal_ratio,classification,regression,cluster,prediction,label
0,5.9,3,4.2,1.5,1,2.8,"'array([2.50134389e-08, 9.99999969e-01, 6.259919...",1.0007,3,1,versicolor
1,6.1,3,4.6,1.4,1,3.28571,"'array([8.83971014e-07, 9.99999089e-01, 2.725598...",1.00103,3,1,versicolor


## Nearest nighbours

In [41]:
from hnswlib import Index
import numpy as np


# Build index
index = Index(
    space="l2", dim=len(features)
)  # possible options are l2, cosine or ip
index.init_index(max_elements=len(df), ef_construction=200, M=16)

for i1, i2, chunk in df[features+['cluster']].to_pandas_df(chunk_size=1000):
    X = chunk[features]
    y = chunk["cluster"]
    index.add_items(X, y)

index.set_ef(50)  # ef should always be > k (Controlling the recall by setting ef)

# Add to Dataframe
@vaex.register_function(on_expression=False)
def topk(*columns, k=3):
    labels, _ = index.knn_query(np.array(columns).T, k=k)
    return np.array(labels)

df.add_function("topk", topk)
df["knn"] = df.func.topk(*features)

# build pipeline for production
pipeline = Pipeline.from_vaex(df)
assert pipeline.validate
pipeline.inference(pipeline.raw)

#,sepal_length,sepal_width,petal_length,petal_width,class_,petal_ratio,classification,regression,cluster,prediction,label,knn
0,5.9,3,4.2,1.5,1,2.8,"'array([2.50134389e-08, 9.99999969e-01, 6.259919...",1.0007,3,1,versicolor,"array([3, 6, 7], dtype=uint64)"


# Explain

In [42]:
import shap
import pyarrow as pa
import pandas as pd

explainer = shap.TreeExplainer(lgm.booster)
targets = df['class_'].unique()
feature_count = len(features)

@vaex.register_function(on_expression=False)
def explain(*columns):    
    data = np.array(columns).T
    X = pd.DataFrame(data[:,:feature_count], columns=features)    
    y = data[:,-1]
    shap_values = explainer.shap_values(X)
    explanation = []
    
    for i, c in enumerate(y):
        c = int(c)
        e = shap.force_plot(explainer.expected_value[c], shap_values[c][i,:], X.iloc[i]).data
        explanation.append({feature:effects['effect'] for feature, effects in zip(e['featureNames'], e['features'].values())})
    return pa.array(explanation)
    
df.add_function('explain', explain)
explain_columns = features+['prediction']
df['explanation']  =df.func.explain(*explain_columns)

# get ready for production
pipeline = Pipeline.from_vaex(df)
pipeline.validate()
pipeline.inference(pipeline.raw)



#,sepal_length,sepal_width,petal_length,petal_width,class_,petal_ratio,classification,regression,cluster,prediction,label,knn,explanation
0,5.9,3,4.2,1.5,1,2.8,"'array([2.50134389e-08, 9.99999969e-01, 6.259919...",1.0007,3,1,versicolor,"array([3, 6, 7], dtype=uint64)","""{'petal_length': 3.5222047977554025, 'petal_rat..."


# Recommender

In [43]:
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight
import pyarrow as pa

ratings = csr_matrix((np.ones(len(df)), (df.prediction.values, df.cluster.values)))
als = AlternatingLeastSquares(factors=32)
als.fit(ratings)
user_items = ratings.T.tocsr()

@vaex.register_function()
def recommend_als(ar, topk=5, filter_already_liked_items=False):
    ret = []
    for user in ar.tolist():
        recommendations = als.recommend(user, user_items, N=topk,
                                        filter_already_liked_items=filter_already_liked_items)        
        ret.append([recommendation[0] for recommendation in recommendations ])
    return np.array(ret)
df.add_function('recommend_als', recommend_als)
df['recommendations'] = df.prediction.recommend_als() 

df.head(2)

  0%|          | 0/15 [00:00<?, ?it/s]

#,sepal_length,sepal_width,petal_length,petal_width,class_,petal_ratio,classification,regression,cluster,prediction,label,knn,explanation,recommendations
0,5.9,3,4.2,1.5,1,2.8,"'array([2.50134389e-08, 9.99999969e-01, 6.259919...",1.0007,3,1,versicolor,"array([3, 6, 7], dtype=uint64)","""{'petal_length': 3.5222047977554025, 'petal_rat...","array([0, 2, 1])"
1,6.1,3,4.6,1.4,1,3.28571,"'array([8.83971014e-07, 9.99999089e-01, 2.725598...",1.00103,3,1,versicolor,"array([3, 6, 7], dtype=uint64)","""{'petal_length': 3.5749666244104925, 'petal_rat...","array([0, 2, 1])"


In [None]:
pipeline = Pipeline.from_vaex(df)
pipeline.save('pipeline.pkl')
print('Go to http://127.0.0.1:5000/docs')
!gl serve pipeline.pkl

Go to http://127.0.0.1:5000/docs
[2021-12-09 16:16:20 +0100] [9538] [INFO] Starting gunicorn 20.1.0
[2021-12-09 16:16:20 +0100] [9538] [INFO] Listening at: http://127.0.0.1:5000 (9538)
[2021-12-09 16:16:20 +0100] [9538] [INFO] Using worker: uvicorn.workers.UvicornH11Worker
[2021-12-09 16:16:20 +0100] [9542] [INFO] Booting worker with pid: 9542
[2021-12-09 16:16:20 +0100] [9542] [INFO] Started server process [9542]
[2021-12-09 16:16:20 +0100] [9542] [INFO] Waiting for application startup.
[2021-12-09 16:16:20 +0100] [9542] [INFO] Application startup complete.
