In [1]:
import sys, os, io, json, numpy as np, random
from sklearn.metrics import roc_auc_score
import xgboost as xgb

In [3]:
# Configuration of the synthetic pool consumed by generate_dataset().
N_EXTRA_FEATURES = 13  # how many additional copies of the second feature template to add
params = dict(
    alpha = 0.3,        # threshold on a uniform draw that decides the binary label
    s_max = 0.98+1,     # threshold on a uniform draw for the per-row `s` mask; being > 1, every draw passes
    height = 300000,    # number of generated rows
    features = [
        dict(
            lift = 3,   # only read by a commented-out line in generate_dataset
            linlift = dict(k= 1, b = 0.5),  # per-row lift = k * f_time + b
            beta = 0.2, # P(feature = 1) on rows whose label is 1 (when s == 1)
        ),
        dict(
            lift = 2,
            linlift = dict(k=-1, b = 1.5),
            beta = 0.1,
        ),
    ]
)
# Append *independent* copies of the second template. Appending the same dict
# object repeatedly would make all of those entries (and their nested `linlift`)
# aliases of one another, so tweaking one feature would silently tweak them all.
_template = params['features'][1]
params['features'].extend(
    dict(_template, linlift=dict(_template['linlift']))
    for _ in range(N_EXTRA_FEATURES)
)
print(len(params['features']))

15


In [4]:
def generate_dataset(params):
    """Generate a synthetic binary pool whose per-feature lift varies linearly with time.

    Parameters
    ----------
    params : dict
        ``alpha``    - threshold on a uniform draw that decides the label.
        ``s_max``    - threshold on a uniform draw for the per-row mask ``s``;
                       rows with ``s == 0`` get feature probability 0.5.
        ``height``   - number of rows to generate.
        ``features`` - list of dicts, each with ``beta`` and ``linlift = {k, b}``.
        ``seed``     - optional RNG seed (defaults to 42).

    Returns
    -------
    (features, f_time, label)
        ``features`` is a boolean array of shape (height, n_features);
        ``f_time`` is a float array of shape (height, 1) with values in [0, 1);
        ``label`` is a 0/1 integer array of shape (height, 1).

    Notes
    -----
    For rows with ``s == 1`` a feature fires with probability ``beta`` when the
    label is 1 and ``gamma`` when it is 0, where ``gamma`` is chosen so that
    ``P(label = 1 | feature = 1) = alpha * lift`` with ``lift = k * f_time + b``.
    The probabilities are not validated: a lift outside ``(0, 1/alpha]`` yields
    a ``gamma`` that is negative or not finite.
    """
    alpha = params["alpha"]
    h, w = params['height'], len(params['features'])

    # A private legacy generator yields the same stream as np.random.seed(seed)
    # followed by np.random.rand(...), but leaves the global numpy / random
    # state untouched, so calling this function has no hidden side effects.
    rng = np.random.RandomState(params.get("seed", 42))

    # Draw order (label, f_time, s, features) determines the generated values.
    label = (rng.rand(h, 1) < alpha) * 1
    f_time = rng.rand(h, 1)

    betas = np.array([f["beta"] for f in params['features']])
    liftk = np.array([f["linlift"]['k'] for f in params['features']])
    liftb = np.array([f["linlift"]['b'] for f in params['features']])
    lifts = liftk * f_time + liftb  # broadcasts to (h, w): one lift per row and feature
    gammas = (1/(alpha * lifts) - 1) / (1/alpha - 1) * betas

    s = (rng.rand(h, 1) < params['s_max']) * 1
    probs = s * (label * betas + (1 - label) * gammas) + (1 - s) * 0.5
    features = rng.rand(h, w) < probs
    return features, f_time, label

In [5]:
# Materialise the pool, then preview it as one matrix laid out column-wise as
# [feature columns | f_time | label].
features, f_time, label = generate_dataset(params)
np.hstack((features, f_time, label))

array([[1.        , 0.        , 0.        , ..., 0.        , 0.15705398,
        0.        ],
       [1.        , 0.        , 0.        , ..., 0.        , 0.09550877,
        0.        ],
       [0.        , 0.        , 0.        , ..., 1.        , 0.1379392 ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.71214818,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.95571873,
        1.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.65507034,
        0.        ]])

In [6]:
# Overall positive rate of the generated labels.
avg_target = label.mean()
avg_target

0.29917333333333335

In [7]:
def get_time_cond(f_time, interval = (0,1)):
    """Return a 1-D boolean mask of rows whose time falls in the half-open ``interval``.

    ``f_time`` is indexed as ``f_time[:, 0]``, i.e. a 2-D array whose first
    column holds the time. A row is selected when
    ``interval[0] <= time < interval[1]``.
    """
    return np.logical_and(interval[0] <= f_time[:,0], f_time[:,0] < interval[1])


def cal_lift(features, f_time, label, feat_index = 0, interval = (0,1)):
    """Empirical lift of one feature inside a time window.

    Computes ``mean(label | feature == 1 and time in interval) / mean(label)``,
    where the denominator is taken over *all* rows, not only those in the window.

    Parameters
    ----------
    features : 2-D array; column ``feat_index`` is compared against 1.
    f_time : 2-D array whose first column holds the time of each row.
    label : array of targets, indexed by the same rows as ``features``.
    feat_index : column of ``features`` to evaluate.
    interval : half-open ``(lower, upper)`` time window.

    Returns
    -------
    The lift as a float; NaN when no row matches both conditions.
    """
    # Reuse the shared helper so the window semantics live in one place.
    full_cond = np.logical_and(features[:, feat_index] == 1,
                               get_time_cond(f_time, interval))
    if not full_cond.any():
        # np.mean of an empty selection would also give NaN, but only after
        # emitting a "Mean of empty slice" RuntimeWarning; make it explicit.
        return np.float64(np.nan)
    cond_avg_target = np.mean(label[full_cond])
    avg_target = np.mean(label)
    return cond_avg_target/avg_target

In [8]:
# Spot check: empirical lift of feature column 3 among rows with f_time in [0.9, 1).
cal_lift(
    features,
    f_time,
    label,
    feat_index=3,
    interval=(0.9, 1),
)

0.5371290835457543

In [9]:
def subpool(features, f_time, label, interval = (0,1)):
    """Restrict the pool to rows whose time lies in the half-open ``interval``.

    The row mask comes from ``get_time_cond`` so that the window semantics
    (``interval[0] <= time < interval[1]``) are defined in a single place.

    Returns the tuple ``(features, f_time, label)``, each filtered with the
    same boolean row mask.
    """
    time_cond = get_time_cond(f_time, interval)
    return features[time_cond], f_time[time_cond], label[time_cond]

In [10]:
# Learning pool: only rows from the earliest fifth of the time axis.
LX, LT, LY = subpool(features, f_time, label, interval=(0, 0.2))
N = len(LX)
# Index where the first 80% of the learning pool ends (integer arithmetic).
NL = (80 * N) // 100
print(NL)

47852


In [12]:
# Persist the full pool as a compressed archive (numpy appends the ".npz" suffix).
np.savez_compressed(
    'pool_00',
    features=features,
    f_time=f_time,
    label=label,
)

In [112]:
# First NL rows of the learning pool are fitted on; the remainder is held out.
LDM = xgb.DMatrix(LX[:NL], label=LY[:NL, 0])
TDM = xgb.DMatrix(LX[NL:], label=LY[NL:, 0])

In [113]:
# Booster settings: logistic objective, reporting AUC and log-loss each round.
xgbpar = {
    'objective': 'binary:logistic',
    'eta': 0.07,
    'eval_metric': ['auc', 'logloss'],
}
# Both sets are monitored; per the training log, early stopping watches
# 'ts-logloss' (the last metric on the last evaluation set).
model = xgb.train(
    xgbpar,
    LDM,
    num_boost_round=1000,
    evals=((LDM, "tr"), (TDM, "ts")),
    early_stopping_rounds=100,
)

[0]	tr-auc:0.644356	tr-logloss:0.678893	ts-auc:0.641636	ts-logloss:0.678858
Multiple eval metrics have been passed: 'ts-logloss' will be used for early stopping.

Will train until ts-logloss hasn't improved in 100 rounds.
[1]	tr-auc:0.666129	tr-logloss:0.666446	ts-auc:0.661682	ts-logloss:0.666423
[2]	tr-auc:0.674787	tr-logloss:0.655512	ts-auc:0.670374	ts-logloss:0.65545
[3]	tr-auc:0.677379	tr-logloss:0.645885	ts-auc:0.673209	ts-logloss:0.645823
[4]	tr-auc:0.679332	tr-logloss:0.637394	ts-auc:0.675453	ts-logloss:0.637327
[5]	tr-auc:0.680411	tr-logloss:0.629861	ts-auc:0.676426	ts-logloss:0.629829
[6]	tr-auc:0.681216	tr-logloss:0.623178	ts-auc:0.677524	ts-logloss:0.623106
[7]	tr-auc:0.681396	tr-logloss:0.617243	ts-auc:0.677656	ts-logloss:0.617181
[8]	tr-auc:0.681614	tr-logloss:0.611955	ts-auc:0.677918	ts-logloss:0.611869
[9]	tr-auc:0.68223	tr-logloss:0.607235	ts-auc:0.678612	ts-logloss:0.607167
[10]	tr-auc:0.682286	tr-logloss:0.603024	ts-auc:0.678522	ts-logloss:0.602986
[11]	tr-auc:0.68266

In [116]:
# Score every row of the full pool, not just the time window used for fitting.
# NOTE(review): whether predict() honours the early-stopping best iteration
# depends on the installed xgboost version - confirm.
predictions = model.predict(
    xgb.DMatrix(data=features)
)

In [118]:
# Show the scores returned by model.predict for the full pool (one value per row).
predictions

array([0.7054956 , 0.10541746, 0.36589977, ..., 0.7276575 , 0.23678562,
       0.48518702], dtype=float32)

In [121]:
# AUC of the model inside each of K equal-width, half-open time buckets.
K = 20
for i in range(K):
    time_interval = (float(i) / K, float(i + 1) / K)
    time_cond = get_time_cond(f_time, time_interval)
    y, preds = label[time_cond], predictions[time_cond]
    auc = roc_auc_score(y, preds)
    print(time_interval, 'score =', auc)

(0.0, 0.05) score = 0.7253250285200782
(0.05, 0.1) score = 0.7031348818208171
(0.1, 0.15) score = 0.6756916465502522
(0.15, 0.2) score = 0.6514390379742208
(0.2, 0.25) score = 0.6292827015539875
(0.25, 0.3) score = 0.609488715431795
(0.3, 0.35) score = 0.5815539851727588
(0.35, 0.4) score = 0.5627084733941038
(0.4, 0.45) score = 0.5357251635804632
(0.45, 0.5) score = 0.5116797018262662
(0.5, 0.55) score = 0.4866488325308987
(0.55, 0.6) score = 0.4583578753330661
(0.6, 0.65) score = 0.43200021155884916
(0.65, 0.7) score = 0.39908694080806023
(0.7, 0.75) score = 0.36863770544227575
(0.75, 0.8) score = 0.33405624750044965
(0.8, 0.85) score = 0.29817314538536926
(0.85, 0.9) score = 0.2611345532059892
(0.9, 0.95) score = 0.2237111235999157
(0.95, 1.0) score = 0.18100900187680075
