In [54]:
import itertools
import json
import math
import pickle

import lightgbm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
import sklearn.datasets
import sklearn.ensemble
from sklearn.tree import DecisionTreeRegressor, plot_tree
from tqdm import tqdm

In [2]:
sklearn.datasets.load_iris(as_frame=True)["frame"].shape

(150, 5)

In [64]:
# Binary subset of iris: keep classes 0 and 1 only, then draw a fixed 50-row
# sample (random_state=123) so the class balance printed below is reproducible.
data = sklearn.datasets.load_iris(as_frame=True)

df = (
    data["frame"]
    .loc[lambda frame: frame.target.isin([0, 1])]
    .sample(50, random_state=123)
    .reset_index(drop=True)
)

X, y = df.drop(columns="target"), df["target"]

model = lightgbm.LGBMClassifier(
    n_estimators=100,
    max_depth=3,
    objective='cross_entropy',
).fit(X, y)

explainer = shap.TreeExplainer(model)

shap_values = explainer.shap_values(X)

# Three numbers to compare: the empirical class frequencies, the explainer's
# expected value, and that expected value pushed through the sigmoid.
print(df.target.value_counts(normalize=True).sort_index().values)
print(explainer.expected_value)
print(1 / (1 + np.exp(-explainer.expected_value)))

going through node_sample_weight branchg
[0.46 0.54]
0.7510986478445273
0.6794180419009056


In [69]:
mean_logit = model.predict(X, raw_score=True).mean()

In [70]:
mean_logit

0.7510986478445275

In [4]:
# Dump the booster once and reuse the dict: calling dump_model() inside the loop
# rebuilds the whole model dump for every key that is printed.
model_dump = model.booster_.dump_model()
for k in sorted(model_dump):
    # 'tree_info' holds the per-tree structures and is inspected separately.
    if k != 'tree_info':
        print(f'{k}: {model_dump[k]}')

average_output: False
feature_importances: {'petal_length_(cm)': 1}
feature_infos: {'sepal_length_(cm)': {'min_value': 4.4, 'max_value': 7, 'values': []}, 'sepal_width_(cm)': {'min_value': 2, 'max_value': 4.2, 'values': []}, 'petal_length_(cm)': {'min_value': 1.2, 'max_value': 5, 'values': []}, 'petal_width_(cm)': {'min_value': 0.1, 'max_value': 1.8, 'values': []}}
feature_names: ['sepal_length_(cm)', 'sepal_width_(cm)', 'petal_length_(cm)', 'petal_width_(cm)']
label_index: 0
max_feature_idx: 3
monotone_constraints: []
name: tree
num_class: 1
num_tree_per_iteration: 1
objective: cross_entropy
pandas_categorical: []
version: v3


In [5]:
print(json.dumps(model.booster_.dump_model()['tree_info'][0], indent=4))

{
    "tree_index": 0,
    "num_leaves": 2,
    "num_cat": 0,
    "shrinkage": 1,
    "tree_structure": {
        "split_index": 0,
        "split_feature": 2,
        "split_gain": 46.11800003051758,
        "threshold": 1.8,
        "decision_type": "<=",
        "default_left": true,
        "missing_type": "None",
        "internal_value": 0.160343,
        "internal_weight": 0,
        "internal_count": 50,
        "left_child": {
            "leaf_index": 0,
            "leaf_value": -0.057048660532323575,
            "leaf_weight": 5.46480005979538,
            "leaf_count": 22
        },
        "right_child": {
            "leaf_index": 1,
            "leaf_value": 0.331150104553238,
            "leaf_weight": 6.9552000761032104,
            "leaf_count": 28
        }
    }
}


In [23]:
5.46480005979538 + 6.9552000761032104

12.42000013589859

### Check calculation of leaf_weight

In [6]:
# Reproduce the root split of tree 0 by hand: feature index 2 at threshold 1.8,
# using the same "<=" goes-left rule as the dumped decision_type.
features = X.columns
threshold = 1.8

X_left = X.loc[X[features[2]] <= threshold]
y_left = y.loc[X_left.index]

X_right = X.loc[X[features[2]] > threshold]
y_right = y.loc[X_right.index]

# Class counts per branch; the right branch is reported first, then the left.
print(y_right.value_counts())
print(y_left.value_counts())

1    27
0     1
Name: target, dtype: int64
0    22
Name: target, dtype: int64


In [7]:
# left child:
t = -0.057048660532323575
y_hat = np.exp(t) / (1 + np.exp(t))
hessian = y_hat * (1 - y_hat)
hessian * 22

5.495527420441854

In [8]:
hessian

0.2497967009291752

In [9]:
5.46480005979538 / 5.495527420441854

0.9944086602985226

In [10]:
# right child:
t = 0.331150104553238
y_hat = 1 / (1 + np.exp(-t))
hessian = y_hat * (1 - y_hat)
hessian * 28

6.811548002699206

In [11]:
hessian

0.24326957152497167

In [12]:
6.811548002699206 / 6.9552000761032104

0.9793460904313067

In [67]:
5.495527420441854 + 6.811548002699206

12.307075423141061

In [58]:
y_hat

0.5820391886541324

In [59]:
model.predict_proba(X[:5])

array([[0.5142583 , 0.4857417 ],
       [0.41796081, 0.58203919],
       [0.41796081, 0.58203919],
       [0.5142583 , 0.4857417 ],
       [0.41796081, 0.58203919]])

### Confirmed: in binary classification, `internal_value` and `leaf_value` are logits of $p(y=1|\mathbf{x})$.

In [17]:
score = internal_value = 0.160343

This initial logit is calculated at [xentropy_objective.hpp#L134](https://github.com/microsoft/LightGBM/blob/4971a06668df7eabeb7d4bb1987abb442f2970c9/src/objective/xentropy_objective.hpp#L134).

In [18]:
z = np.exp(internal_value) / (1 + np.exp(internal_value))

In [19]:
z * (1 - z)

0.24839999304628652

In [21]:
z * (1 - z) * 22

5.464799847018304

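`z * (1 - z) * 22` reproduces the left leaf's `leaf_weight` (5.4648), so the leaf weights are hessians evaluated at the initial logit; `z` itself (0.54) matches the fraction of examples with y=1 in `df`.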

In [24]:
model.predict_proba(X[:5])

array([[0.5142583 , 0.4857417 ],
       [0.41796081, 0.58203919],
       [0.41796081, 0.58203919],
       [0.5142583 , 0.4857417 ],
       [0.41796081, 0.58203919]])

In [21]:
model.predict(X[:2], raw_score=True)

array([-0.05704866,  0.3311501 ])

In [29]:
t = model.predict(X[:2], raw_score=True)

In [30]:
t

array([-0.05704866,  0.3311501 ])

In [23]:
np.exp(t) / (1 + np.exp(t))

array([0.4857417 , 0.58203919])

In [24]:
# Sum of the two leaf_weight values (~12.42) divided by the per-example hessian
# at the initial logit, z * (1 - z) (~0.2484), recovers the root's internal_count.
12.42 / 0.2484

50.0

In [33]:
# Persist the first tree of the saved model as JSON so it can be reloaded below.
with open("/tmp/tree_dump.json", "wt") as opened:
    booster = lightgbm.Booster(
        model_file="/Users/zhuyi/Projects/monorepo/monorepo/ds/olm/train/doap/results/model_lgb.txt"
    )
    first_tree = booster.dump_model()["tree_info"][0]
    json.dump(first_tree, opened)

In [34]:
# Read the dumped tree back into a plain dict.
with open("/tmp/tree_dump.json", "rt") as opened:
    tree = json.loads(opened.read())

In [47]:
def confirm_hessian_split(tree):
    """Check that every internal node's weight is the sum of its children's weights.

    Walks ``tree["tree_structure"]`` depth-first. For each non-root internal node,
    ``internal_weight`` must equal the left child's weight plus the right child's
    weight to within a relative tolerance of 0.001. A child's weight is its
    ``internal_weight`` when it is itself a split and its ``leaf_weight`` when it
    is a leaf. The root is only checked for ``internal_weight == 0``, which is
    what the dumps inspected in this notebook report.

    Args:
        tree: One entry of ``dump_model()["tree_info"]``, i.e. a dict with a
            ``"tree_structure"`` key.

    Raises:
        AssertionError: If the root weight is non-zero, or if any other internal
            node's weight disagrees with the sum of its children's weights.
    """

    def node_weight(node):
        # Test for key presence, not truthiness: a weight of 0 is a valid value
        # and must not fall through to a (missing) leaf_weight.
        if "internal_weight" in node:
            return node["internal_weight"]
        return node["leaf_weight"]

    stack = [(tree["tree_structure"], 0)]
    while stack:
        node, depth = stack.pop()

        if "left_child" not in node:
            # Leaves have nothing to verify.
            continue

        lc = node["left_child"]
        rc = node["right_child"]
        stack.extend([(lc, depth + 1), (rc, depth + 1)])

        if depth == 0:
            # At the root the internal weight appears to always be 0 (possibly a bug upstream).
            assert node["internal_weight"] == 0, (
                f"root internal_weight is {node['internal_weight']}, expected 0"
            )
            continue

        # Weights are sums of hessians, so a parent must equal left + right.
        children_weight = node_weight(lc) + node_weight(rc)
        if children_weight == 0:
            # Avoid dividing by zero: an all-zero subtree is consistent only if
            # the parent weight is zero as well.
            assert node["internal_weight"] == 0, (
                f"internal_weight {node['internal_weight']} at depth {depth} "
                "but children weights sum to 0"
            )
        else:
            assert abs(1 - node["internal_weight"] / children_weight) < 0.001, (
                f"internal_weight {node['internal_weight']} at depth {depth} "
                f"differs from children sum {children_weight}"
            )

In [55]:
# Run the parent-equals-children weight check over every tree of the saved model.
tree_infos = lightgbm.Booster(
    model_file="/Users/zhuyi/Projects/monorepo/monorepo/ds/olm/train/doap/results/model_lgb.txt"
).dump_model()["tree_info"]
for _tree in tqdm(tree_infos):
    confirm_hessian_split(_tree)

100%|██████████| 50/50 [00:00<00:00, 13535.25it/s]
