In [1]:
import sys, os, io, json, numpy as np, random
from sklearn.metrics import roc_auc_score
import xgboost as xgb

In [3]:
# Generator configuration (earlier variant: the next cell rebinds `params`).
#   alpha    - probability that a row receives a positive label
#   s_max    - threshold for the per-row mask `s` in generate_dataset; 0.98+1
#              exceeds 1, so every uniform [0, 1) draw is below it and the
#              mask is always on  (NOTE(review): confirm this is intentional)
#   height   - number of rows to generate
#   features - one dict per binary feature:
#                beta    : P(feature = 1 | label = 1)
#                linlift : lift as a linear function of time, k * f_time + b
#                lift    : not read by generate_dataset (only referenced in a
#                          commented-out line there)
params = dict(
    alpha=0.3,
    s_max=0.98 + 1,
    height=300000,
    features=[
        dict(lift=3, linlift=dict(k=1, b=0.5), beta=0.2),
        dict(lift=2, linlift=dict(k=-1, b=1.5), beta=0.1),
    ],
)

# Pad with 13 more copies of the second feature (15 features in total).
# Each entry is an independent dict (including its nested `linlift`), so
# editing one spec later cannot silently change the others.
for i in range(13):
    template = params['features'][1]
    params['features'].append(dict(template, linlift=dict(template['linlift'])))
print(len(params['features']))

15


In [2]:
# Generator configuration used by generate_dataset.
#   alpha    - probability that a row receives a positive label
#   s_max    - threshold for the per-row mask `s` in generate_dataset; 0.98+1
#              exceeds 1, so every uniform [0, 1) draw is below it and the
#              mask is always on  (NOTE(review): confirm this is intentional)
#   height   - number of rows to generate
#   features - one dict per binary feature:
#                beta    : P(feature = 1 | label = 1)
#                linlift : lift as a linear function of time, k * f_time + b
#                lift    : not read by generate_dataset (only referenced in a
#                          commented-out line there)
params = dict(
    alpha=0.3,
    s_max=0.98 + 1,
    height=300000,
    features=[
        dict(lift=3, linlift=dict(k=0.75, b=0.5), beta=0.2),
        dict(lift=2, linlift=dict(k=-0.75, b=1.25), beta=0.1),
    ],
)

# Pad to 17 features: the first 7 additions clone the second spec (falling
# lift), the remaining 8 clone the first spec (rising lift). Each entry is an
# independent dict (including its nested `linlift`), so editing one spec later
# cannot silently change the others.
for i in range(15):
    template = params['features'][1 if i < 7 else 0]
    params['features'].append(dict(template, linlift=dict(template['linlift'])))

print(len(params['features']))

17


In [3]:
def generate_dataset(params, seed=42):
    """Generate a synthetic binary-feature pool whose feature lift drifts with time.

    Parameters
    ----------
    params : dict
        ``alpha``    - probability of a positive label,
        ``height``   - number of rows,
        ``s_max``    - threshold for the per-row mask ``s``; rows whose uniform
                       draw is not below it get pure-noise features (p = 0.5),
        ``features`` - list of dicts, each with ``beta`` (P(feature=1 | label=1))
                       and ``linlift`` = ``{'k': ..., 'b': ...}`` describing the
                       lift as ``k * f_time + b``.
    seed : int, optional
        Seed applied to both ``random`` and ``np.random`` before sampling.
        The default (42) reproduces the previously hard-coded behaviour.

    Returns
    -------
    features : bool ndarray, shape (height, n_features)
    f_time   : float ndarray, shape (height, 1), uniform in [0, 1)
    label    : int ndarray, shape (height, 1), values in {0, 1}
    """
    # Seeds the *global* generators, exactly as before, so results are repeatable.
    random.seed(seed)
    np.random.seed(seed)

    alpha = params["alpha"]
    h, w = params['height'], len(params['features'])

    # The order of the np.random.rand calls below fixes the random stream:
    # label, then f_time, then s, then the feature draws.
    label = (np.random.rand(h, 1) < alpha) * 1
    f_time = np.random.rand(h, 1)

    betas = np.array([f["beta"] for f in params['features']])
    liftk = np.array([f["linlift"]['k'] for f in params['features']])
    liftb = np.array([f["linlift"]['b'] for f in params['features']])

    # (h, 1) time column broadcast against (w,) coefficients -> per-row, per-feature lift.
    lifts = liftk * f_time + liftb

    # gamma = P(feature=1 | label=0), chosen so that
    # P(label=1 | feature=1) / alpha equals the requested lift:
    #   lift = beta / (alpha*beta + (1-alpha)*gamma)
    gammas = (1 / (alpha * lifts) - 1) / (1 / alpha - 1) * betas

    # s == 1: feature follows the label-conditional probability; s == 0: coin flip.
    s = (np.random.rand(h, 1) < params['s_max']) * 1
    probs = s * (label * betas + (1 - label) * gammas) + (1 - s) * 0.5
    features = np.random.rand(h, w) < probs
    return features, f_time, label

In [4]:
# Build the full synthetic pool once from `params`; later cells slice it by
# f_time, train on a slice and score the whole pool.
features, f_time, label = generate_dataset(params)
# np.concatenate((features, f_time, label), axis=1)

In [5]:
# Overall positive rate of the generated labels (labels are drawn as
# uniform < alpha, so this should sit close to params['alpha']).
avg_target = label.mean()
avg_target

0.29917333333333335

In [6]:
def get_time_cond(f_time, interval = (0,1)):
    """Return a boolean row mask for timestamps in the half-open window
    ``interval[0] <= f_time[:, 0] < interval[1]``."""
    stamps = f_time[:, 0]
    at_or_after_start = stamps >= interval[0]
    before_end = stamps < interval[1]
    return at_or_after_start & before_end
def cal_lift(features, f_time, label, feat_index = 0, feat_val=1, interval = (0,1)):
    """Empirical lift of ``features[:, feat_index] == feat_val`` inside a time window.

    The lift is the mean label over rows that match the feature value AND fall
    in ``interval[0] <= f_time[:, 0] < interval[1]``, divided by the mean label
    over *all* rows (the base rate is not restricted to the window).

    Returns NaN when no row matches the selection or when the overall mean
    label is zero. These are the same values plain numpy arithmetic would
    yield, but without emitting "mean of empty slice" / "invalid value"
    RuntimeWarnings.
    """
    feat_cond = features[:, feat_index] == feat_val
    time_cond = np.logical_and(interval[0] <= f_time[:, 0], f_time[:, 0] < interval[1])
    full_cond = np.logical_and(feat_cond, time_cond)

    # Nothing selected: the conditional mean is undefined.
    if not full_cond.any():
        return np.nan

    avg_target = np.mean(label)
    # No positives at all: the ratio would be 0/0.
    if avg_target == 0:
        return np.nan

    cond_avg_target = np.mean(label[full_cond])
    return cond_avg_target / avg_target

In [7]:
# Positive rate among rows with f_time < 0.75 (the window probed in the next cell).
ml = np.mean(label[f_time[:, 0] < 0.75])

In [8]:
# Sanity check on feature column 2 for rows with f_time in [0, 0.75).
# cal_lift divides by the mean label over all rows; `ml` is the mean label
# over rows with f_time < 0.75.
(
    # lift of rows where the feature is 1
    cal_lift(features, f_time, label, feat_index=2, feat_val=1, interval=(0.0,0.75)),
    # lift of rows where the feature is 0, multiplied by ml
    cal_lift(features, f_time, label, feat_index=2, feat_val=0, interval=(0.0,0.75))*ml,
    # lift of rows where the feature is 1 (same as the first entry), multiplied by ml
    cal_lift(features, f_time, label, feat_index=2, feat_val=1, interval=(0.0,0.75))*ml
)

(0.9212832612247759, 0.30206879833507755, 0.2756616202804939)

In [9]:
#features[:,0].sum()/features[:,1].shape[0]

In [11]:
def subpool(features, f_time, label, interval = (0,1)):
    """Restrict the pool to rows with ``interval[0] <= f_time[:, 0] < interval[1]``.

    Returns the filtered ``(features, f_time, label)`` triple, in that order.
    """
    stamps = f_time[:, 0]
    keep = (stamps >= interval[0]) & (stamps < interval[1])
    return tuple(arr[keep] for arr in (features, f_time, label))

In [12]:
# Work only with the earliest part of the timeline: f_time in [0, 0.2).
LX, LT, LY = subpool(features, f_time, label, interval=(0, 0.2))
# NL marks the 80% row boundary used to split this slice into train / eval.
N = len(LX)
NL = (80 * N) // 100
print(NL)

47852


In [13]:
# Persist the full generated pool as a compressed NumPy archive, storing the
# three arrays under the keys `features`, `f_time` and `label`.
np.savez_compressed('pool_cross_00', features=features, f_time=f_time, label=label)

In [14]:
# Split the early-time slice at row NL: rows before it are the training
# matrix, rows from NL onward are the held-out evaluation matrix.
LDM = xgb.DMatrix(LX[:NL], label=LY[:NL, 0])
TDM = xgb.DMatrix(LX[NL:], label=LY[NL:, 0])

In [15]:
# Boosted-tree binary classifier; AUC and log-loss are reported each round
# for both the training ("tr") and held-out ("ts") matrices.
xgbpar = dict(
    objective='binary:logistic',
    eta=0.07,
    eval_metric=['auc', 'logloss'],
)
model = xgb.train(
    xgbpar,
    LDM,
    num_boost_round=1000,
    evals=((LDM, "tr"), (TDM, "ts")),
    # With several metrics, the training log shows 'ts-logloss' driving early stopping.
    early_stopping_rounds=100,
)

[0]	tr-auc:0.79323	tr-logloss:0.669076	ts-auc:0.786267	ts-logloss:0.66941
Multiple eval metrics have been passed: 'ts-logloss' will be used for early stopping.

Will train until ts-logloss hasn't improved in 100 rounds.
[1]	tr-auc:0.812839	tr-logloss:0.647806	ts-auc:0.806952	ts-logloss:0.648504
[2]	tr-auc:0.82098	tr-logloss:0.628912	ts-auc:0.816906	ts-logloss:0.629728
[3]	tr-auc:0.827427	tr-logloss:0.612029	ts-auc:0.824005	ts-logloss:0.613045
[4]	tr-auc:0.829352	tr-logloss:0.596933	ts-auc:0.826317	ts-logloss:0.598062
[5]	tr-auc:0.832652	tr-logloss:0.583317	ts-auc:0.829753	ts-logloss:0.584714
[6]	tr-auc:0.834798	tr-logloss:0.571021	ts-auc:0.832739	ts-logloss:0.572401
[7]	tr-auc:0.835541	tr-logloss:0.559958	ts-auc:0.834157	ts-logloss:0.561275
[8]	tr-auc:0.836325	tr-logloss:0.549884	ts-auc:0.835216	ts-logloss:0.551285
[9]	tr-auc:0.836745	tr-logloss:0.540736	ts-auc:0.835506	ts-logloss:0.542252
[10]	tr-auc:0.837193	tr-logloss:0.532395	ts-auc:0.835717	ts-logloss:0.53402
[11]	tr-auc:0.837437	

In [16]:
# Score every row of the full pool (all f_time values), not only the
# f_time < 0.2 slice the model was trained and early-stopped on.
predictions = model.predict(xgb.DMatrix(features))

In [17]:
# Quick look at the predicted scores (the displayed array is abbreviated).
predictions

array([0.0185776 , 0.10039712, 0.03509774, ..., 0.40956512, 0.22983895,
       0.6221115 ], dtype=float32)

In [121]:
# AUC of the model inside each of K equal-width f_time buckets.
# NOTE(review): this cell's execution count is out of sequence and the same
# code appears again in the last cell with different printed scores, so the
# output below presumably comes from an earlier run. Confirm before relying on it.
K = 20
for i in range(K):
    time_interval = (float(i) / K, float(i + 1) / K)
    time_cond = get_time_cond(f_time, time_interval)
    preds, y = predictions[time_cond], label[time_cond]
    auc = roc_auc_score(y, preds)
    print(time_interval, 'score =', auc)

(0.0, 0.05) score = 0.7253250285200782
(0.05, 0.1) score = 0.7031348818208171
(0.1, 0.15) score = 0.6756916465502522
(0.15, 0.2) score = 0.6514390379742208
(0.2, 0.25) score = 0.6292827015539875
(0.25, 0.3) score = 0.609488715431795
(0.3, 0.35) score = 0.5815539851727588
(0.35, 0.4) score = 0.5627084733941038
(0.4, 0.45) score = 0.5357251635804632
(0.45, 0.5) score = 0.5116797018262662
(0.5, 0.55) score = 0.4866488325308987
(0.55, 0.6) score = 0.4583578753330661
(0.6, 0.65) score = 0.43200021155884916
(0.65, 0.7) score = 0.39908694080806023
(0.7, 0.75) score = 0.36863770544227575
(0.75, 0.8) score = 0.33405624750044965
(0.8, 0.85) score = 0.29817314538536926
(0.85, 0.9) score = 0.2611345532059892
(0.9, 0.95) score = 0.2237111235999157
(0.95, 1.0) score = 0.18100900187680075


In [18]:
# AUC of the model inside each of K equal-width f_time buckets; the model was
# fitted on f_time < 0.2 only, so later buckets show how it holds up as the
# feature lifts drift with time.
K = 20
for i in range(K):
    time_interval = (float(i) / K, float(i + 1) / K)
    time_cond = get_time_cond(f_time, time_interval)
    preds, y = predictions[time_cond], label[time_cond]
    auc = roc_auc_score(y, preds)
    print(time_interval, 'score =', auc)

(0.0, 0.05) score = 0.9045238369271478
(0.05, 0.1) score = 0.8627839612556528
(0.1, 0.15) score = 0.8319184450273336
(0.15, 0.2) score = 0.7993923763007279
(0.2, 0.25) score = 0.7474637611125556
(0.25, 0.3) score = 0.7171748386716642
(0.3, 0.35) score = 0.6745491746655123
(0.35, 0.4) score = 0.6429669625182832
(0.4, 0.45) score = 0.6256383813694489
(0.45, 0.5) score = 0.5799928149755085
(0.5, 0.55) score = 0.5444379641493202
(0.55, 0.6) score = 0.5279781039406892
(0.6, 0.65) score = 0.5010279935316119
(0.65, 0.7) score = 0.47514997732024933
(0.7, 0.75) score = 0.4440856140367465
(0.75, 0.8) score = 0.41466018835676477
(0.8, 0.85) score = 0.3973962100280298
(0.85, 0.9) score = 0.37351616370939067
(0.9, 0.95) score = 0.3495481973024354
(0.95, 1.0) score = 0.32557655666430346
