In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from faster_lgbm_predictor import FasterLgbmPredictor

### 1.加载数据

In [2]:
# Read the training CSV from the local data directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer="./data/train.csv")

In [3]:
# Remove the identifier and text columns in one step; only the remaining
# columns are carried forward into the cells below.
df = df.drop(
    columns=[
        "PassengerId",
        "Name",
        "Sex",
        "Ticket",
        "Cabin",
        "Embarked",
    ]
)

In [4]:
# Replace every missing value with 0, then preview the first five rows.
df = df.fillna(value=0)
df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
0,0,3,22.0,1,0,7.25
1,1,1,38.0,1,0,71.2833
2,1,3,26.0,0,0,7.925
3,1,1,35.0,1,0,53.1
4,0,3,35.0,0,0,8.05


In [5]:
# Split the label off: keep "Survived" as a NumPy array and leave only the
# feature columns in df.
target = df["Survived"].values
df = df.drop(columns="Survived")

In [6]:
# Columns handed to LightGBM as categorical (via categorical_feature=...).
categorical_features = [
    "Pclass",
    "SibSp",
    "Parch",
]

### 2.1 二分类测试

In [7]:
# Binary-classification booster: max_depth 2, 16 boosting rounds.
params = {
    "objective": "binary",
    "max_depth": 2,
}
lgb_model = lgb.train(
    params=params,
    train_set=lgb.Dataset(
        data=df,
        label=target,
        categorical_feature=categorical_features,
    ),
    num_boost_round=16,
)

[LightGBM] [Info] Number of positive: 342, number of negative: 549
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 216
[LightGBM] [Info] Number of data points in the train set: 891, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383838 -> initscore=-0.473288
[LightGBM] [Info] Start training from score -0.473288




In [8]:
# Build the predictor under test from the booster's dump_model() output.
# NOTE(review): cache_num presumably sizes an internal cache - confirm
# against the faster_lgbm_predictor package.
faster_lgbm_predictor = FasterLgbmPredictor(
    model=lgb_model.dump_model(),
    cache_num=10,
)

In [9]:
# Reference predictions from the native LightGBM booster on the full frame.
ori_pred = lgb_model.predict(data=df)

In [10]:
# Score each row (as a plain dict) with the fast predictor and collect the
# "score" field of every result into one array.
fast_pred = np.asarray(
    [
        faster_lgbm_predictor.predict(record).get("score")
        for record in df.to_dict(orient="records")
    ]
)

In [11]:
# Total absolute difference between the two prediction vectors;
# 0.0 means they match exactly.
np.abs(fast_pred - ori_pred).sum()

0.0

### 2.2 回归测试

In [12]:
# Regression booster on the same features/label: max_depth 2, 16 rounds.
params = {
    "objective": "regression",
    "max_depth": 2,
}
lgb_model = lgb.train(
    params=params,
    train_set=lgb.Dataset(
        data=df,
        label=target,
        categorical_feature=categorical_features,
    ),
    num_boost_round=16,
)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 216
[LightGBM] [Info] Number of data points in the train set: 891, number of used features: 5
[LightGBM] [Info] Start training from score 0.383838


In [13]:
# Rebuild the predictor under test from the regression booster's dump.
# NOTE(review): cache_num presumably sizes an internal cache - confirm
# against the faster_lgbm_predictor package.
faster_lgbm_predictor = FasterLgbmPredictor(
    model=lgb_model.dump_model(),
    cache_num=10,
)

In [14]:
# Reference predictions from the native regression booster.
ori_pred = lgb_model.predict(data=df)

In [15]:
# Score each row (as a plain dict) with the fast predictor and collect the
# "score" field of every result into one array.
fast_pred = np.asarray(
    [
        faster_lgbm_predictor.predict(record).get("score")
        for record in df.to_dict(orient="records")
    ]
)

In [16]:
# Total absolute difference between fast and native regression predictions;
# 0.0 means they match exactly.
np.abs(fast_pred - ori_pred).sum()

0.0

### 2.3 指数分布回归测试

In [17]:
# Tweedie-objective booster on the same features/label: max_depth 2,
# 16 boosting rounds.
params = {
    "objective": "tweedie",
    "max_depth": 2,
}
lgb_model = lgb.train(
    params=params,
    # Pass data/label by keyword, matching the binary and regression cells,
    # so the argument roles are explicit rather than positional.
    train_set=lgb.Dataset(
        data=df,
        label=target,
        categorical_feature=categorical_features,
    ),
    num_boost_round=16,
)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 216
[LightGBM] [Info] Number of data points in the train set: 891, number of used features: 5
[LightGBM] [Info] Start training from score -0.957534


In [18]:
# Rebuild the predictor under test from the tweedie booster's dump.
# NOTE(review): cache_num presumably sizes an internal cache - confirm
# against the faster_lgbm_predictor package.
faster_lgbm_predictor = FasterLgbmPredictor(
    model=lgb_model.dump_model(),
    cache_num=10,
)

In [19]:
# Reference predictions from the native tweedie booster.
ori_pred = lgb_model.predict(data=df)

In [20]:
# Score each row (as a plain dict) with the fast predictor and collect the
# "score" field of every result into one array.
fast_pred = np.asarray(
    [
        faster_lgbm_predictor.predict(record).get("score")
        for record in df.to_dict(orient="records")
    ]
)

In [21]:
# Total absolute difference between fast and native tweedie predictions;
# 0.0 means they match exactly.
np.abs(fast_pred - ori_pred).sum()

0.0

### 性能对比

In [22]:
from tqdm import tqdm

In [23]:
# Throughput of the fast predictor: one predict() call per row dict, with
# tqdm reporting iterations per second.
new_data = df.to_dict(orient="records")
for record in tqdm(new_data):
    faster_lgbm_predictor.predict(record)

100%|██████████████████████████████████████████████████████████████████████████████| 891/891 [00:00<00:00, 5431.20it/s]


In [24]:
# Throughput of the native booster on the same workload: one predict() call
# per single-row DataFrame. The one-row frames are built up front so the
# timed loop measures prediction only.
# .iloc makes the positional row slice explicit, and range() is iterated
# directly instead of being copied into a list first.
new_data = [df.iloc[i:i + 1] for i in range(len(df))]
for data in tqdm(new_data):
    lgb_model.predict(data)

100%|██████████████████████████████████████████████████████████████████████████████| 891/891 [00:00<00:00, 1240.67it/s]
