## Setup

In [None]:
# Use the %pip magic (not !pip) so the packages are installed into the
# environment of the kernel that is actually running this notebook.
%pip install numpy pandas pyvespa lightgbm

Installing required packages:

In [1]:
import json
import lightgbm as lgb
import numpy as np
import pandas as pd

## Create data

In [2]:
# Create a random training set.
# A seeded generator keeps the data (and therefore the trained model and every
# output further down) identical across "Restart Kernel -> Run All".
rng = np.random.default_rng(42)
features = pd.DataFrame(
    {
        # Two numeric features drawn uniformly from [0, 1).
        "feature_1": rng.random(100),
        "feature_2": rng.random(100),
        # One categorical feature with three levels.
        "feature_3": pd.Series(rng.choice(["a", "b", "c"], size=100), dtype="category"),
    }
)
features.head()

Unnamed: 0,feature_1,feature_2,feature_3
0,0.919458,0.342478,a
1,0.882831,0.521677,b
2,0.462281,0.200733,a
3,0.149167,0.415715,c
4,0.612369,0.387363,c


Generate target variables:

In [3]:
# One-hot encode the categorical column so each level can enter the score.
numeric_features = pd.get_dummies(features)

# Linear score: both numeric features, minus a penalty for level "a",
# plus a bonus for level "c".
score = (
    numeric_features["feature_1"]
    + numeric_features["feature_2"]
    - 0.5 * numeric_features["feature_3_a"]
    + 0.5 * numeric_features["feature_3_c"]
)

# Binary label as float: 1.0 where the score exceeds 1.0, otherwise 0.0.
targets = (score > 1.0) * 1.0
targets

0     0.0
1     1.0
2     0.0
3     1.0
4     1.0
     ... 
95    0.0
96    0.0
97    0.0
98    1.0
99    0.0
Length: 100, dtype: float64

## Fit lightgbm model

In [4]:
# Training configuration: binary objective evaluated with log-loss,
# using trees limited to 3 leaves.
params = dict(
    objective="binary",
    metric="binary_logloss",
    num_leaves=3,
)

# Wrap the features and labels in a LightGBM dataset and train for 5 rounds.
training_set = lgb.Dataset(features, label=targets)
model = lgb.train(params, training_set, num_boost_round=5)

[LightGBM] [Info] Number of positive: 55, number of negative: 45
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 74
[LightGBM] [Info] Number of data points in the train set: 100, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.550000 -> initscore=0.200671
[LightGBM] [Info] Start training from score 0.200671


## Vespa application package

In [5]:
from vespa.package import ApplicationPackage, Field, RankProfile, Function

# Document fields: one numeric and one categorical value per document.
document_fields = [
    Field(name="numeric", type="double", indexing=["summary", "attribute"]),
    Field(name="categorical", type="string", indexing=["summary", "attribute"]),
]

# Rank-profile functions named after the training features (feature_1..feature_3).
# feature_1 and feature_3 read document attributes; feature_2 comes from the query.
model_inputs = [
    Function(name="feature_1", expression="attribute(numeric)"),
    Function(name="feature_2", expression="query(query_value)"),
    Function(name="feature_3", expression="attribute(categorical)"),
]

# The first-phase expression evaluates the exported LightGBM model file.
classify_profile = RankProfile(
    name="classify",
    functions=model_inputs,
    first_phase="lightgbm('lightgbm_model.json')",
)

app_package = ApplicationPackage(name="lightgbm")
app_package.schema.add_fields(*document_fields)
app_package.schema.add_rank_profile(classify_profile)

We can check what the Vespa search definition file will look like:

In [6]:
# Render the schema (document fields and the "classify" rank profile) as text.
print(app_package.schema.schema_to_text)

schema lightgbm {
    document lightgbm {
        field numeric type double {
            indexing: summary | attribute
        }
        field categorical type string {
            indexing: summary | attribute
        }
    }
    rank-profile classify {
        function feature_1() {
            expression {
                attribute(numeric)
            }
        }
        function feature_2() {
            expression {
                query(query_value)
            }
        }
        function feature_3() {
            expression {
                attribute(categorical)
            }
        }
        first-phase {
            expression: lightgbm('lightgbm_model.json')
        }
    }
}


We can export the application package files to disk:

In [7]:
from pathlib import Path

# Make sure the target directory exists (no error if it already does),
# then write the application package files into it.
export_dir = Path("lightgbm")
export_dir.mkdir(parents=True, exist_ok=True)
app_package.to_files("lightgbm")

Note that we don't have any models under the `models` folder yet. We need to export the LightGBM model that we trained earlier to `models/lightgbm_model.json`, the file name referenced by the rank profile.

In [8]:
# List the exported application directory (requires the `tree` shell command).
!tree lightgbm

[01;34mlightgbm[00m
├── [01;34mfiles[00m
├── [01;34mmodels[00m
├── [01;34mschemas[00m
│   └── lightgbm.sd
├── [01;34msearch[00m
│   └── [01;34mquery-profiles[00m
│       ├── default.xml
│       └── [01;34mtypes[00m
│           └── root.xml
└── services.xml

6 directories, 4 files


## Export the model

In [9]:
# Dump the trained booster as JSON into the application's models folder,
# under the file name that the rank profile's lightgbm(...) expression uses.
model_path = "lightgbm/models/lightgbm_model.json"
with open(model_path, "w") as model_file:
    json.dump(model.dump_model(), model_file, indent=2)

Now we can see that the model is where Vespa expects it to be:

In [10]:
# List the application directory again; the model JSON should now be under models/.
!tree lightgbm

[01;34mlightgbm[00m
├── [01;34mfiles[00m
├── [01;34mmodels[00m
│   └── lightgbm_model.json
├── [01;34mschemas[00m
│   └── lightgbm.sd
├── [01;34msearch[00m
│   └── [01;34mquery-profiles[00m
│       ├── default.xml
│       └── [01;34mtypes[00m
│           └── root.xml
└── services.xml

6 directories, 5 files


## Deploy the application

In [11]:
from vespa.deployment import VespaDocker

# Deploy the application files previously written to the "lightgbm" directory.
vespa_docker = VespaDocker()
app = vespa_docker.deploy_from_disk(
    application_root="lightgbm",
    application_name="lightgbm",
)

Waiting for configuration server, 0/300 seconds...
Waiting for configuration server, 5/300 seconds...
Waiting for configuration server, 10/300 seconds...
Waiting for application status, 0/300 seconds...
Waiting for application status, 5/300 seconds...
Waiting for application status, 10/300 seconds...
Waiting for application status, 15/300 seconds...
Waiting for application status, 20/300 seconds...
Waiting for application status, 25/300 seconds...
Waiting for application status, 30/300 seconds...
Waiting for application status, 35/300 seconds...
Finished deployment.


## Feed the data

In [12]:
def _to_feed_entry(doc_id, row):
    """Build one feed entry from a training row.

    Only feature_1 and feature_3 are stored on the document; feature_2 is
    supplied at query time through query(query_value).
    """
    document_fields = {
        "numeric": row["feature_1"],
        "categorical": row["feature_3"],
    }
    return {"id": doc_id, "fields": document_fields}


# One document per training row, keyed by the DataFrame index.
feed_batch = [_to_feed_entry(idx, row) for idx, row in features.iterrows()]

In [13]:
# Send every prepared document to the deployed application and keep the result.
status = app.feed_batch(feed_batch)

Successful documents fed: 100/100.
Batch progress: 1/1.


## Query

In [14]:
# Match every document, rank with the "classify" profile, and pass 0.1 as the
# query-side input (feature_2). Ask for up to 100 hits, one per fed document.
query_body = {
    "yql": "select * from sources * where true",
    "ranking": "classify",
    "ranking.features.query(query_value)": 0.1,
    "hits": 100,
}
response = app.query(body=query_body)
hits = response.hits

## Check Vespa and model predictions match

In [15]:
# Rebuild the model inputs from the returned hits, next to Vespa's score.
records = []
for hit in hits:
    hit_fields = hit["fields"]
    records.append(
        {
            "vespa_relevance": hit["relevance"],
            "feature_1": hit_fields["numeric"],
            # Same constant that was sent as query(query_value).
            "feature_2": 0.1,
            "feature_3": hit_fields["categorical"],
        }
    )

predictions = pd.DataFrame.from_records(records)
# The training frame used a categorical dtype for feature_3; do the same here.
predictions["feature_3"] = predictions["feature_3"].astype("category")

In [16]:
# Keep only the model inputs, in the same column order as the training frame.
input_columns = ["feature_1", "feature_2", "feature_3"]
X = predictions[input_columns]
X.head()

Unnamed: 0,feature_1,feature_2,feature_3
0,0.67668,0.1,c
1,0.752287,0.1,c
2,0.778901,0.1,c
3,0.637117,0.1,c
4,0.868025,0.1,c


In [17]:
# Compare with a tight relative tolerance instead of exact float equality:
# the two score lists come from different evaluators, so a last-bit rounding
# difference should not fail the check. assert_allclose also verifies that the
# shapes match and reports the mismatching values on failure.
np.testing.assert_allclose(
    predictions["vespa_relevance"].to_numpy(),
    model.predict(X),
    rtol=1e-7,
    atol=0,
    err_msg="Vespa relevance scores differ from LightGBM predictions",
)

In [18]:
# Relevance scores returned by Vespa, in hit order.
predictions["vespa_relevance"].tolist()

[0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5430836691198784,
 0.5430836691198784,
 0.5430836691198784,
 0.5430836691198784,
 0.5430836691198784,
 0.5430836691198784,
 0.5430836691

In [19]:
# LightGBM's own predictions for the same rows, for side-by-side comparison.
model.predict(X).tolist()

[0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5949984715574806,
 0.5430836691198784,
 0.5430836691198784,
 0.5430836691198784,
 0.5430836691198784,
 0.5430836691198784,
 0.5430836691198784,
 0.5430836691

## Clean environment

In [20]:
# Delete the exported application directory from disk.
!rm -fr lightgbm
# Stop the container (passing timeout=600 to stop()), then remove it.
vespa_docker.container.stop(timeout=600)
vespa_docker.container.remove()