# Every branch a server 

In [1]:
import sys

sys.path.append("..")

from lakeml.vaex import Pipeline
models_path = '../models'
import lakefs_client
from lakefs_client.models import RepositoryCreation
from lakefs_client.client import LakeFSClient
from dotenv import load_dotenv
import os
import vaex
import sys
import json
from collections import defaultdict
from lakeml.vaex.core.variables import *
import warnings
warnings.filterwarnings('ignore')

sys.path.append("..")
from lakeml.vaex import Pipeline
models_path = '../models'

load_dotenv()


configuration = lakefs_client.Configuration()
configuration.username = os.getenv('LAKEFS_USER')
configuration.password = os.getenv('LAKEFS_PASSWORD')
configuration.host = os.getenv('LAKEFS_HOST')
client= LakeFSClient(configuration)

def lakefs_open(path):
    splits = path.split('/')
    repo = splits[2]
    branch = splits[3]
    filename = splits[4]
    path = f"../data/{repo}/{branch}/{filename}"
    ret = vaex.open(path)
    ret.variables['lakefs'] = {'repo':repo,'branch':branch}
    return ret

def lakefs_commit(pipeline, path, hooks=None):
    splits = path.split('/')
    repo = splits[2]
    branch = splits[3]
    filename = splits[4]
    path = f"../data/{repo}/{branch}/{filename}"
    if hooks is not None:
        for hook in hooks:
            if hook(pipeline) is False:
                raise RuntimeError("invalid pipeline")
    pipeline.save(path)

def lakefs_load(path):
    splits = path.split('/')
    repo = splits[2]
    branch = splits[3]
    filename = splits[4]
    path = f"../data/{repo}/{branch}/{filename}"
    return Pipeline.from_file(path)

def validate_evaluation(pipeline):
    print('validateing "accuracy" ')
    safe = True
    for key in ['accuracy']:
        if key not in pipeline.variables.get('evaluation',{}):
            print(f"{key} is missing")
            safe = False
    return safe

def validate_output(pipeline):
    print('validateing "prediction" column exists')
    if not 'prediction' in pipeline.example.column_names:
        print("prediction column is missing")
    return 'prediction' in pipeline.example.column_names        


@vaex.register_dataframe_accessor('lakefs', override=True)
class LakeFSAccessor(object):
    def __init__(self, df):
        self.df = df
        self.client = LakeFSClient(configuration)
        
    def _list_branches(self, repo=None):
        repo = repo or self.df.lakefs.repo
        return self.client.branches.list_branches(repo)

    @property
    def branch(self):
        return self.df.variables.get('lakefs',{}).get('branch')
    
    @property
    def branches(self):
        return self._list_branches()['results']
    
    @property
    def repo(self):
        return self.df.variables.get('lakefs',{}).get('repo')
# client.repositories.create_repository(RepositoryCreation(name='server', storage_namespace='local:///Users/yonatanalexander/Dropbox/Development_box/xdss-projects/lakeml/data/server', default_branch='main'))

### First scientist build model - Classification/Regression

In [2]:
#client.branches.create_branch(repository='server',  \
#  branch_creation=models.BranchCreation(name='lightgbm', source='main'))

In [3]:
import vaex
import numpy as np
from vaex.ml.lightgbm import LightGBMModel
from sklearn.metrics import accuracy_score
import json
train, test = vaex.ml.datasets.load_iris_1e5().ml.train_test_split(test_size=0.2, verbose=False)
features = ['petal_length', 'petal_width', 'sepal_length', 'sepal_width']
target = 'class_'


booster = LightGBMModel(features=features, 
                        target=target,                         
                        prediction_name='lgm_predictions', 
                        num_boost_round=500, params={'verbose': -1,
                                                     'objective':'multiclass',
                                                    'num_class':3})
booster.fit(train)
train = booster.transform(train)

@vaex.register_function()
def argmax(ar, axis=1):
    return np.argmax(ar,axis=axis)

train.add_function('argmax',argmax)
train['prediction'] = train['lgm_predictions'].argmax()

pipeline = Pipeline.from_dataframe(train)
pipeline.set_variable('accuracy', accuracy_score(pipeline.inference(test[features])['prediction'].values, test[target].values))
path = 'lakefs://server/lightgbm/pipeline.pkl'
data = test.gl.to_records(0)
lakefs_commit(pipeline, path) # commit
print(f"Path: {path}\nData: {json.dumps(data)}")
pipeline.inference(test).head(1)

Path: lakefs://server/lightgbm/pipeline.pkl
Data: {"sepal_length": 5.9, "sepal_width": 3.0, "petal_length": 4.2, "petal_width": 1.5, "class_": 1}


#,sepal_length,sepal_width,petal_length,petal_width,class_,lgm_predictions,prediction
0,5.9,3,4.2,1.5,1,"'array([6.04909665e-09, 9.99999993e-01, 1.266636...",1


## Second data scietist solves another problem - Nearest neighbors

In [None]:
#client.branches.create_branch(repository='server',  \
#  branch_creation=models.BranchCreation(name='knn', source='main'))

In [4]:
import numpy as np
import vaex
import hnswlib

df = vaex.example()
features = df.get_column_names(regex='^(?!id|\\.).*')  # not the id

p = hnswlib.Index(space='l2', dim=df.shape[1] - 1)  # possible options are l2, cosine or ip
p.init_index(max_elements=len(df), ef_construction=200, M=16)
features = df.get_column_names(regex='^(?!id|\\.).*')  # not the id
for i1, i2, chunk in df.to_pandas_df(chunk_size=10000):
    X = chunk[features]
    y = chunk['id']
    p.add_items(X, y)

p.set_ef(50)  # ef should always be > k (Controlling the recall by setting ef)

@vaex.register_function(on_expression=False)
def topk(*columns, k=3):
    labels, _ = p.knn_query(np.array(columns).T, k=k)
    return np.array(labels)

df['knn'] = df.func.topk(*tuple([df[col] for col in features]), k=3)
df.add_function('topk',topk)
pipeline = Pipeline.from_dataframe(df)
path = 'lakefs://server/knn/pipeline.pkl'
data = df.gl.to_records(8)
lakefs_commit(pipeline,path)
print(f"Path: {path}\nData: {json.dumps(data)}")
pipeline.inference(data)

Path: lakefs://server/knn/pipeline.pkl
Data: {"id": 25, "x": 3.98480486869812, "y": 5.40690803527832, "z": 2.577237367630005, "vx": -38.74491882324219, "vy": -152.4074249267578, "vz": -92.90726470947266, "E": -113632.3203125, "L": 493.3162536621094, "Lz": -397.8236389160156, "FeH": -1.180760145187378, "knn": [3, 26, 23]}


#,id,x,y,z,vx,vy,vz,E,L,Lz,FeH,knn
0,25,3.9848,5.40691,2.57724,-38.7449,-152.407,-92.9073,-113632,493.316,-397.824,-1.18076,"array([ 3, 26, 23], dtype=uint64)"


## Third data scietist build a recommender

In [None]:
#client.branches.create_branch(repository='server',  \
#  branch_creation=models.BranchCreation(name='recommender', source='main'))

In [23]:
import numpy as np
import os
import pickle
from random import choice

import numpy as np
import pyarrow as pa
import pytest
import vaex
from scipy.sparse import csr_matrix
from implicit.nearest_neighbours import (
    BM25Recommender,
    CosineRecommender,
    TFIDFRecommender,
    bm25_weight,
    )
from implicit.als import AlternatingLeastSquares

df = vaex.open('../datasets/imdb.hdf5')
counts = df['movieId'].value_counts()
counts = counts[counts > 100]
df = df[df['movieId'].isin(counts.index)]  # popular movies
unique_movies = df.groupby(['movieId', 'genres']).agg({'title': 'count'})
genres = {movie: genres for movie, genres in
          zip(unique_movies['movieId'].tolist(), unique_movies['genres'].tolist())}
unique_movies = df.groupby(['movieId', 'title']).agg({'count': 'count'})
titles = {movie: name for movie, name in
          zip(unique_movies['movieId'].tolist(), unique_movies['title'].tolist())}

min_rating = 4.0
df = df[min_rating < df['rating']]  # liked movies
ratings = csr_matrix((np.ones(len(df)), (df['movieId'].values, df['userId'].values)))
os.environ['OPENBLAS_NUM_THREADS'] = '1'

mean_rating = df['rating'].mean()
weighted = (bm25_weight(ratings, B=0.9) * 5).tocsr()

als = AlternatingLeastSquares(factors=32)
# als.fit(weighted)
with open('../tests/models/als.pkl', 'rb') as handle:
    als = pickle.load(handle)

tfidf = TFIDFRecommender()
# tfidf.fit(ratings)
with open('../tests/models/tfidf.pkl', 'rb') as handle:
    tfidf = pickle.load(handle)

users = df['userId'].unique()
user_items = ratings.T.tocsr()

userid = choice(users)
user_history = user_items.getrow(userid).indices
recommendations = als.recommend(userid, user_items, N=10, filter_already_liked_items=True)


@vaex.register_function()
def recommend_als(ar, topk=5, filter_already_liked_items=True):
    ret = []
    for user in ar:
        recommendations = als.recommend(user, user_items, N=topk,
                                        filter_already_liked_items=filter_already_liked_items)
        recommendation = [titles.get(recommendation[0]) for recommendation in recommendations]
        ret.append(recommendation)
    return pa.array(ret)

@vaex.register_function()
def recommend_tfidf(ar, topk=5, filter_already_liked_items=True):
    ret = []
    for user in ar:
        recommendations = tfidf.recommend(user, user_items, N=topk,
                                       filter_already_liked_items=filter_already_liked_items)
        recommendation = [titles.get(recommendation[0]) for recommendation in recommendations]
        ret.append(recommendation)
    return pa.array(ret)

@vaex.register_function(on_expression=False)
def explain(users, items):
    ret = []
    for user,item in zip(users,items):
        score_explained, contributions, W = als.explain(user, user_items, itemid=item)
        items = [i for i, _ in contributions]
        ret.append([titles.get(i) for i in items])
    return pa.array(ret)

df.add_function('recommend_als', recommend_als)
df.add_function('recommend_tfidf', recommend_tfidf)
df.add_function('explain', explain)

df['als'] = df['userId'].recommend_als()
df['tfidf'] = df['userId'].recommend_tfidf()
df['explanation'] = df.func.explain(df['userId'],df['movieId'])


pipeline = Pipeline.from_dataframe(df)
path = 'lakefs://server/recommender/pipeline.pkl'
lakefs_commit(pipeline,path)
data = {'userId': [1, 2, 3]}
columns = ['userId','als','tfidf','explanation']
print(f"Path: {path}\nData: {json.dumps(data)}\nColumns: {','.join(columns)}")
pipeline.inference(data,columns=columns)

Path: lakefs://server/recommender/pipeline.pkl
Data: {"userId": [1, 2, 3]}
Columns: userId,als,tfidf,explanation


#,userId,als,tfidf,explanation
0,1,"""['Videodrome', 'Deep Red (Profondo rosso)', 'Be...","""['Matrix, The', 'Star Wars: Episode IV - A New ...","""['Spider-Man 2', 'Freaks', 'Lord of the Rings: ..."
1,2,"""['Mummy, The', 'Tingler, The', 'Children of the...","""['Raiders of the Lost Ark (Indiana Jones and th...","""['Time Machine, The', 'Creature from the Black ..."
2,3,"""['Wing Commander', 'Red Sonja', 'No Escape', 'A...","""['Aliens', 'Monty Python and the Holy Grail', '...","""['M*A*S*H (a.k.a. MASH)', 'Trading Places', 'Sp..."


#### NLP

In [8]:
# Spacy 
import numpy as np
import pyarrow as pa
import spacy
import vaex
from spacy.cli import download
from spacy.language import Language
from lakeml.vaex import Pipeline


def download_nlp(lang='en_core_web_sm'):
    try:
        nlp = spacy.load(lang)
        return True
    except:
        download(lang)
    return False

download_nlp()

# Build a spacy entities pipeline
@Language.component("ents")
def ents(doc):
    return doc.ents

nlp_entitie = spacy.load('en_core_web_sm')
nlp_entitie.add_pipe('ents', name='ents', last=True)

texts = [
    "Net income was $9.4 million compared to the prior year of $2.7 million. Apple is doing very well",
    "IBM had their revenue exceeded twelve billion dollars, with a loss of $1b.",
]

df = vaex.from_arrays(text=texts)

@vaex.register_function()
def entities(ar, label='ORG'):
    if not isinstance(ar, list):
        ar = ar.tolist()
    docs = [nlp_entitie(doc) for doc in ar]
    entities = [[str(ent.text) for ent in doc if ent.label_ == label] for doc in docs]
    return pa.array(entities)

df['organisations'] = df.text.entities(label='ORG')
df['money'] = df.text.entities(label='MONEY')

@Language.component("vectorize")
def vectorize(doc):
    return doc.vector

nlp_vectorize = spacy.load('en_core_web_sm', disable=["ner", 'parser'])
nlp_vectorize.add_pipe('vectorize', name='vectorize', last=True)

@vaex.register_function()
def to_vector(ar):
    if not isinstance(ar, list):
        ar = ar.tolist()
    ret = np.array([nlp_vectorize(doc) for doc in ar])
    return ret

df.add_function('entities', entities)
df.add_function('vectorize', vectorize)
df.add_function('to_vector', to_vector)

df['vector'] = df.text.to_vector()

# pipeline = Pipeline.from_dataframe(df)
# path = 'lakefs://server/spacy/pipeline.pkl'
# lakefs_commit(pipeline,path)
# data = {'text': ['Apple and Microsoft are trying a Billion dollar project together']}
# columns = ['userId','als','tfidf','explanation']
# print(f"Path: {path}\nData: {json.dumps(data)}")
# pipeline.inference(data)
df

#,text,organisations,money,vector
0,'Net income was $9.4 million compared to the pri...,['Apple'],"['$9.4 million', '$2.7 million']","'array([ 0.46828035, 0.3082057 , 0.40490758, -..."
1,'IBM had their revenue exceeded twelve billion d...,['IBM'],"['twelve billion dollars', '1b']","'array([ 5.34848511e-01, 3.95896435e-01, -3.327..."


In [2]:
# Huggingface 

import pyarrow as pa
import vaex
from transformers import pipeline as hf_pipeline

classifier = hf_pipeline('sentiment-analysis')

@vaex.register_function()
def sentiment(ar):
    if isinstance(ar, str):
        ar = [ar]
    return pa.array(classifier(ar.tolist()))

df = vaex.from_arrays(text=['We are very happy to include pipeline into the transformers repository.',
                            'ths is real'])
df.add_function('sentiment', sentiment)
df['results'] = df.text.sentiment()
df['label'] = df['results'].apply(lambda x: x.get('label') if isinstance(x, dict) else None,
                                  vectorize=False)
df['score'] = df['results'].apply(lambda x: x.get('score') if isinstance(x, dict) else None, vectorize=False)

# pipeline = Pipeline.from_dataframe(df)
# data ={'text': 'this is my life, and I love it'}
# path = 'lakefs://server/hf/pipeline.pkl'
# lakefs_commit(pipeline, path)
# print(f"Path: {path}\nData: {json.dumps(data)}")
# pipeline.inference(data)
df

#,text,results,label,score
0,'We are very happy to include pipeline into the ...,"{'label': 'POSITIVE', 'score': 0.9978193640708923}",POSITIVE,0.997819
1,ths is real,"{'label': 'POSITIVE', 'score': 0.9997795820236206}",POSITIVE,0.99978


In [5]:
# Auto ML
import vaex
import numpy as np
from vaex.ml.lightgbm import LightGBMModel
from vaex.ml.sklearn import Predictor
from sklearn.metrics import accuracy_score

import json
train, test = vaex.ml.datasets.load_iris_1e5().ml.train_test_split(test_size=0.2, verbose=False)
features = ['petal_length', 'petal_width', 'sepal_length', 'sepal_width']
target = 'class_'

from flaml import AutoML
automl = AutoML()
automl_settings = {
    "time_budget": 10,  # in seconds
    "metric": 'accuracy',
    "task": 'classification'
}


automl.fit(train[features].values, y_train=train[target].values,
           **automl_settings)
print(automl.predict_proba(X_train).shape)
# Export the best model
print(automl.model)

# model = Predictor(model=automl, features=features, target=target)
model.fit(train, )
# print(automl.predict_proba(X_train).shape)
# Export the best model
# print(automl.model)

# booster = LightGBMModel(features=features, 
#                         target=target,                         
#                         prediction_name='lgm_predictions', 
#                         num_boost_round=500, params={'verbose': -1,
#                                                      'objective':'multiclass',
#                                                     'num_class':3})
# booster.fit(train)
# train = booster.transform(train)

# @vaex.register_function()
# def argmax(ar, axis=1):
#     return np.argmax(ar,axis=axis)

# train.add_function('argmax',argmax)
# train['prediction'] = train['lgm_predictions'].argmax()

# pipeline = Pipeline.from_dataframe(train)
# pipeline.set_variable('accuracy', accuracy_score(pipeline.inference(test[features])['prediction'].values, test[target].values))
# path = 'lakefs://server/lightgbm/pipeline.pkl'
# data = test.gl.to_records(0)
# lakefs_commit(pipeline, path) # commit
# print(f"Path: {path}\nData: {json.dumps(data)}")
# pipeline.inference(test).head(1)

[flaml.automl: 09-22 10:40:21] {1431} INFO - Evaluation method: holdout
[flaml.automl: 09-22 10:40:21] {1477} INFO - Minimizing error metric: 1-accuracy
[flaml.automl: 09-22 10:40:22] {1514} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree', 'lrl1']
[flaml.automl: 09-22 10:40:22] {1746} INFO - iteration 0, current learner lgbm
[flaml.automl: 09-22 10:40:22] {1931} INFO -  at 0.5s,	best lgbm's error=0.0134,	best lgbm's error=0.0134
[flaml.automl: 09-22 10:40:22] {1746} INFO - iteration 1, current learner lgbm
[flaml.automl: 09-22 10:40:22] {1931} INFO -  at 0.6s,	best lgbm's error=0.0134,	best lgbm's error=0.0134
[flaml.automl: 09-22 10:40:22] {1746} INFO - iteration 2, current learner lgbm
[flaml.automl: 09-22 10:40:22] {1931} INFO -  at 0.6s,	best lgbm's error=0.0134,	best lgbm's error=0.0134
[flaml.automl: 09-22 10:40:22] {1746} INFO - iteration 3, current learner lgbm
[flaml.automl: 09-22 10:40:22] {1931} INFO -  at 0.7s,	best lgbm's error=

[flaml.automl: 09-22 10:40:30] {1746} INFO - iteration 42, current learner xgboost
[flaml.automl: 09-22 10:40:30] {1931} INFO -  at 9.0s,	best xgboost's error=0.0004,	best lgbm's error=0.0004
[flaml.automl: 09-22 10:40:30] {1746} INFO - iteration 43, current learner lgbm
[flaml.automl: 09-22 10:40:30] {1931} INFO -  at 9.3s,	best lgbm's error=0.0004,	best lgbm's error=0.0004
[flaml.automl: 09-22 10:40:30] {1746} INFO - iteration 44, current learner xgboost
[flaml.automl: 09-22 10:40:31] {1931} INFO -  at 9.4s,	best xgboost's error=0.0004,	best lgbm's error=0.0004
[flaml.automl: 09-22 10:40:31] {1746} INFO - iteration 45, current learner extra_tree
[flaml.automl: 09-22 10:40:31] {1931} INFO -  at 9.5s,	best extra_tree's error=0.0004,	best lgbm's error=0.0004
[flaml.automl: 09-22 10:40:31] {1746} INFO - iteration 46, current learner xgboost
[flaml.automl: 09-22 10:40:31] {1931} INFO -  at 9.8s,	best xgboost's error=0.0004,	best lgbm's error=0.0004
[flaml.automl: 09-22 10:40:31] {1746} IN

NameError: name 'X_train' is not defined

In [12]:
# Deep leaarning - todo

In [13]:
# Sklearn pipeline - todo

In [None]:
# MLflow pipeline - todo

# Re-Train

In [None]:
def fit(df):
    print("FIT")
    import vaex
    import numpy as np
    from vaex.ml.lightgbm import LightGBMModel
    from sklearn.metrics import accuracy_score

    
    features = ['petal_length', 'petal_width', 'sepal_length', 'sepal_width']
    target = 'class_'


    booster = LightGBMModel(features=features, 
                            target=target,                         
                            prediction_name='lgm_predictions', 
                            num_boost_round=500, params={'verbose': -1,
                                                         'objective':'multiclass',
                                                        'num_class':3})
    booster.fit(df)
    df = booster.transform(df)

    @vaex.register_function()
    def argmax(ar, axis=1):
        return np.argmax(ar,axis=axis)

    df.add_function('argmax',argmax)
    df['prediction'] = df['lgm_predictions'].argmax()
    return df

train, test = vaex.ml.datasets.load_iris_1e5().ml.train_test_split(test_size=0.2, verbose=False)
pipeline = Pipeline.from_dataframe(train, fit=fit)
pipeline.fit(train)

In [None]:
pipeline.inference(test)
