In [1]:
import lightgbm
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split


def log_loss(y_true, y_pred, pos_weight=None):
    is_positive = y_true == 1#is_positive的mask
    loss = np.zeros_like(y_pred)# loss初始化为0,维度与y_pred相同
    loss[is_positive] = -np.log(y_pred[is_positive])
    loss[~is_positive] = -np.log(1.0 - y_pred[~is_positive])
    if pos_weight:
        weights = np.ones_like(y_pred)
        weights[is_positive] = pos_weight
        return np.average(loss, weights=weights)
    else:
        return np.average(loss)

In [18]:
y_pred=np.array([0, 1, 0, 1, 1, 0, 0, 0, 1, 0])
y_true=np.array([0, 1, 1, 0, 1, 0, 0, 0, 1, 0])
is_positive = y_true == 1
loss = np.zeros_like(y_pred)# loss初始化为0
loss[is_positive] = -np.log(y_pred[is_positive])
loss[~is_positive] = -np.log(1.0 - y_pred[~is_positive])
loss

  """
  


array([          0,           0, -2147483648, -2147483648,           0,
                 0,           0,           0,           0,           0])

In [None]:
loss[is_positive] = -np.log(y_pred[is_positive])
loss[~is_positive] = -np.log(1.0 - y_pred[~is_positive])

In [1]:
def train_lightgbm(train_X, test_X, train_y, test_y, pos_weight, set_unbalance=True):
    params = {
        "num_iterations": 10,
        "objective": "binary",
        "metrics": ["binary_logloss"],
        "seed": 0
    }
    if set_unbalance:
        params["is_unbalance"] = True
        train_dataset = lightgbm.Dataset(train_X, train_y)
        test_dataset = lightgbm.Dataset(test_X, test_y, reference=train_dataset)
    else:
        train_weights = np.ones_like(train_y)
        train_weights[train_y == 1] = pos_weight
        test_weights = np.ones_like(test_y)
        test_weights[test_y == 1] = pos_weight
        train_dataset = lightgbm.Dataset(train_X, train_y, weight=train_weights)
        test_dataset= lightgbm.Dataset(test_X, test_y, weight=test_weights, reference=train_dataset)

    model = lightgbm.train(params,
                           train_dataset,
                           valid_sets=[train_dataset, test_dataset],
                           valid_names=["train", "test"])
    train_preds = model.predict(train_X)
    test_preds = model.predict(test_X)
    print("[Without weight] train binary logloss: {}, test binary logloss: {}".format(log_loss(train_y, train_preds),
                                                                     log_loss(test_y, test_preds)))
    print("[With weight] train binary logloss: {}, test binary logloss: {}".format(log_loss(train_y, train_preds, pos_weight=pos_weight),
                                                                     log_loss(test_y, test_preds, pos_weight=pos_weight)))


def main():
    X, y = make_classification(5000, weights=[0.9, 0.1], flip_y=0.0, random_state=0)
    train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=0)

    n_train_pos = train_y.sum()
    n_test_pos = test_y.sum()
    print("Train data, positive: {}, negative: {}, positive ratio: {:.2f}".format(n_train_pos,
                                                                                  len(train_y) - n_train_pos,
                                                                                  n_train_pos / len(train_y)))
    print("Test data, positive: {}, negative: {}, positive ratio: {:.2f}".format(n_test_pos,
                                                                                  len(test_y) - n_test_pos,
                                                                                  n_test_pos / len(test_y)))
    train_lightgbm(train_X, test_X, train_y, test_y, pos_weight=9, set_unbalance=True)
    train_lightgbm(train_X, test_X, train_y, test_y, pos_weight=9, set_unbalance=False)


if __name__ == "__main__":
    main()

Train data, positive: 349, negative: 3151, positive ratio: 0.10
Test data, positive: 151, negative: 1349, positive ratio: 0.10
[LightGBM] [Info] Number of positive: 349, number of negative: 3151
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5100
[LightGBM] [Info] Number of data points in the train set: 3500, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.099714 -> initscore=-2.200403
[LightGBM] [Info] Start training from score -2.200403
[1]	train's binary_logloss: 0.232839	test's binary_logloss: 0.236435
[2]	train's binary_logloss: 0.19447	test's binary_logloss: 0.199018
[3]	train's binary_logloss: 0.167684	test's binary_logloss: 0.172805
[4]	train's binary_logloss: 0.147018	test's binary_logloss: 0.152877
[5]	train's binary_logloss: 0.129909	test's binary_logloss: 0.13641
[6]	train's binary_logloss: 0.116018	test's binary_logloss: 0.122999
[7]	train's binary_logloss: 0.104074	test's binary_logloss: 0.111615
[8]	tr

