- https://medium.com/optuna/lightgbm-tuner-new-optuna-integration-for-hyperparameter-optimization-8b7095e99258
- https://github.com/optuna/optuna/blob/master/examples/lightgbm_tuner_simple.py

# start

In [1]:
PRFX='0324_2'
trntmstmp=1584412344
valtmstmp=1585069785
import datetime
print([datetime.datetime.fromtimestamp(o).strftime('%Y-%m-%d %H:%M:%S') for o in (trntmstmp, valtmstmp)])

grand_total=1.5e8
MIN_TM_TRN=1580947200
MIN_TM_TST=1581552000
print([datetime.datetime.fromtimestamp(o).strftime('%Y-%m-%d %H:%M:%S') for o in (MIN_TM_TRN, MIN_TM_TST)])


CHNKSZ=1e3
POST_RATE_WANTED=0.1

['2020-03-17 02:32:24', '2020-03-24 17:09:45']
['2020-02-06 00:00:00', '2020-02-13 00:00:00']


# setup

In [2]:
from pathlib import Path
# import dask
# print('dask.__version__', dask.__version__)
import xgboost as xgb
# import lightgbm as lgb

import optuna
import optuna.integration.lightgbm as lgb


# optuna.logging.CRITICAL, optuna.logging.FATAL
# optuna.logging.ERROR
# optuna.logging.WARNING, optuna.logging.WARN
# optuna.logging.INFO
# optuna.logging.DEBUG
optuna.logging.set_verbosity(optuna.logging.ERROR)


# import dask_xgboost
# import dask.dataframe as dd
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, auc, precision_recall_curve
from dask.distributed import Client
import pickle
from tqdm import tqdm
from collections import Counter
pd.set_option('display.max_rows', 500)

from functools import reduce
import datetime
def dtnow(): return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

SEED=101
HOME='/data/git/recsys20'
p_in=f'{HOME}/input'
p_out=f'{HOME}/output/{PRFX}'
Path(p_out).mkdir(exist_ok=True, parents=True)

from sklearn.metrics import precision_recall_curve, auc, log_loss

def compute_prauc(pred, gt):
    prec, recall, thresh = precision_recall_curve(gt, pred)
    prauc = auc(recall, prec)
    return prauc

def calculate_ctr(gt):
    positive = len([x for x in gt if x == 1])
    ctr = positive/float(len(gt))
    return ctr

def compute_rce(pred, gt):
    cross_entropy = log_loss(gt, pred)
    data_ctr = calculate_ctr(gt)
    strawman_cross_entropy = log_loss(gt, [data_ctr for _ in range(len(gt))])
    return (1.0 - cross_entropy/strawman_cross_entropy)*100.0

# https://towardsdatascience.com/how-to-calibrate-undersampled-model-scores-8f3319c1ea5b
# How to use the function?
# Let’s say your goal is to generate a model that shows the credit default probabilities and your original 
# training data has 50,000 rows with only 500 of them labeled as target class. When you sample your non-target 
# instances randomly and reduce the total row count to 10,000, while conserving 500 target rows, our calibration
# function becomes:
# calibration(model_results, 50000, 500, 10000, 500)
# Here model_results is your model probability output array. After you train your model and put the results in it, your function is ready to use. 

def calibration(data, train_pop, target_pop, sampled_train_pop, sampled_target_pop):
    calibrated_data = \
    ((data * (target_pop / train_pop) / (sampled_target_pop / sampled_train_pop)) /
    ((
        (1 - data) * (1 - target_pop / train_pop) / (1 - sampled_target_pop / sampled_train_pop)
     ) +
     (
        data * (target_pop / train_pop) / (sampled_target_pop / sampled_train_pop)
     )))

    return calibrated_data

cols=[
'toks',
'hshtgs',
'twtid',
'media',
'links',
'domns',
'twttyp',
'lang',
'tm',

'u1id',
'u1_fllwer_cnt',
'u1_fllwng_cnt',
'u1_vrfed',
'u1_create_tm',

'u2id',
'u2_fllwer_cnt',
'u2_fllwng_cnt',
'u2_vrfed',
'u2_create_tm',

'u1_fllw_u2',
'reply_tm',
'retwt_tm',
'retwt_cmmnt_tm',
'like_tm',
]
cols_cat = ['twttyp','lang']
cols_val = cols[:-4]
cols_tgt_tmstmp=[
    'retwt_tm',
    'reply_tm',
    'like_tm',
    'retwt_cmmnt_tm',
]
cols_tgt=[o.split('_tm')[0] for o in cols_tgt_tmstmp]
tgts             = ['Retweet','Reply','Like','RTwCmnt',]
assert cols_tgt == ['retwt',  'reply','like','retwt_cmmnt',]
ntgts=len(tgts)



tgt2col=dict(zip(tgts,cols_tgt))
tgt2col

{'Retweet': 'retwt',
 'Reply': 'reply',
 'Like': 'like',
 'RTwCmnt': 'retwt_cmmnt'}

# prepare data

In [3]:
chnks_trn = pd.read_csv(f'{p_in}/trn_{trntmstmp}.tsv',sep='\x01',
                    header=None,names=cols, 
                        chunksize=CHNKSZ)
# first chunk as validate data
for ichnk,df in enumerate(chnks_trn):
    df
    break

In [4]:
istrn=True
tm_min = MIN_TM_TRN if istrn else MIN_TM_TST
df['len_toks'] = df.toks.apply(len)
for media in ['Photo', 'Video', 'GIF']:
    df[f'has_media_{media}'] = df.media.fillna('').apply(lambda x: media in x)
for col in ['hshtgs', 'links', 'domns',]:
    df[f'num_{col}'] = df[col].fillna('').apply(lambda x: len(x.split('\t')) if len(x) else 0)

df['twt_age'] = df.tm - tm_min
df['u1_age']  = df.tm - df.u1_create_tm
df['u2_age']  = df.tm - df.u2_create_tm

tm_dt=pd.to_datetime(df.tm, unit='s')
df['tm_dayofweek']=tm_dt.dt.dayofweek
df['tm_hour']=tm_dt.dt.hour

df['tmdlta_u2u1']  = df.u2_create_tm - df.u1_create_tm

df['u1_fllwer_cnt_by_age'] = df.u1_fllwer_cnt / df.u1_age
df['u1_fllwng_cnt_by_age'] = df.u2_fllwng_cnt / df.u2_age

for col in ['twttyp','lang']:
    df[col]=df[col].astype('category')

if istrn: 
    df[cols_tgt]=df[cols_tgt_tmstmp].notna().astype('int8')
    df.drop(inplace=True, columns=['toks', 'hshtgs', 'media', 'links', 'domns',  
                                   'tm', 'u1_create_tm','u2_create_tm', 'u1id', 'u2id', 'twtid', ]+cols_tgt_tmstmp, )
else:
    df.drop(inplace=True, columns=['toks', 'hshtgs', 'media', 'links', 'domns', 
                                   'tm', 'u1_create_tm','u2_create_tm', 'u1id', ])   

In [5]:
df.dtypes

twttyp                  category
lang                    category
u1_fllwer_cnt              int64
u1_fllwng_cnt              int64
u1_vrfed                    bool
u2_fllwer_cnt              int64
u2_fllwng_cnt              int64
u2_vrfed                    bool
u1_fllw_u2                  bool
len_toks                   int64
has_media_Photo             bool
has_media_Video             bool
has_media_GIF               bool
num_hshtgs                 int64
num_links                  int64
num_domns                  int64
twt_age                    int64
u1_age                     int64
u2_age                     int64
tm_dayofweek               int64
tm_hour                    int64
tmdlta_u2u1                int64
u1_fllwer_cnt_by_age     float64
u1_fllwng_cnt_by_age     float64
retwt                       int8
reply                       int8
like                        int8
retwt_cmmnt                 int8
dtype: object

## prep func

In [6]:
def prp_df(df, istrn=True):
    tm_min = MIN_TM_TRN if istrn else MIN_TM_TST
    df['len_toks'] = df.toks.apply(len)
    for media in ['Photo', 'Video', 'GIF']:
        df[f'has_media_{media}'] = df.media.fillna('').apply(lambda x: media in x)
    for col in ['hshtgs', 'links', 'domns',]:
        df[f'num_{col}'] = df[col].fillna('').apply(lambda x: len(x.split('\t')) if len(x) else 0)

    df['twt_age'] = df.tm - tm_min
    df['u1_age']  = df.tm - df.u1_create_tm
    df['u2_age']  = df.tm - df.u2_create_tm

    tm_dt=pd.to_datetime(df.tm, unit='s')
    df['tm_dayofweek']=tm_dt.dt.dayofweek
    df['tm_hour']=tm_dt.dt.hour

    df['tmdlta_u2u1']  = df.u2_create_tm - df.u1_create_tm

    df['u1_fllwer_cnt_by_age'] = df.u1_fllwer_cnt / df.u1_age
    df['u1_fllwng_cnt_by_age'] = df.u2_fllwng_cnt / df.u2_age

    for col in cols_cat:
        df[col]=df[col].astype('category')

    if istrn: 
        df[cols_tgt]=df[cols_tgt_tmstmp].notna().astype('int8')
        df.drop(inplace=True, columns=['toks', 'hshtgs', 'media', 'links', 'domns',  
                                       'tm', 'u1_create_tm','u2_create_tm', 'u1id', 'u2id', 'twtid', ]+cols_tgt_tmstmp, )
    else:
        df.drop(inplace=True, columns=['toks', 'hshtgs', 'media', 'links', 'domns', 
                                       'tm', 'u1_create_tm','u2_create_tm', 'u1id', ])   
    return df

In [7]:
grand_total, grand_total/CHNKSZ

(150000000.0, 150000.0)

## valid data

In [8]:
chnks_trn = pd.read_csv(f'{p_in}/trn_{trntmstmp}.tsv',sep='\x01',
                    header=None,names=cols, 
                        chunksize=CHNKSZ)
# first chunk as validate data
for ichnk,df in enumerate(chnks_trn):
    print(dtnow(), 'chunk', ichnk)
#     print([datetime.datetime.fromtimestamp(o).strftime('%Y-%m-%d %H:%M:%S') 
#            for o in (df.tm.min(), df.tm.max())])
    dfvalid = prp_df(df)
    break
print('dfvalid.shape:',dfvalid.shape)

cols_feat=[o for o in dfvalid.columns if o not in cols_tgt]

display(dfvalid[cols_feat].dtypes)

2020-03-24 15:37:51 chunk 0
dfvalid.shape: (1000, 28)


twttyp                  category
lang                    category
u1_fllwer_cnt              int64
u1_fllwng_cnt              int64
u1_vrfed                    bool
u2_fllwer_cnt              int64
u2_fllwng_cnt              int64
u2_vrfed                    bool
u1_fllw_u2                  bool
len_toks                   int64
has_media_Photo             bool
has_media_Video             bool
has_media_GIF               bool
num_hshtgs                 int64
num_links                  int64
num_domns                  int64
twt_age                    int64
u1_age                     int64
u2_age                     int64
tm_dayofweek               int64
tm_hour                    int64
tmdlta_u2u1                int64
u1_fllwer_cnt_by_age     float64
u1_fllwng_cnt_by_age     float64
dtype: object

## trnval data func

In [9]:
def getdftrvl(tgt):
    print(tgt)
    tgtcol=tgt2col[tgt]
    chnks_trn = pd.read_csv(f'{p_in}/trn_{trntmstmp}.tsv',sep='\x01',
                        header=None,names=cols, 
                            chunksize=CHNKSZ)
    len_df_wanted = int(CHNKSZ)
    # retwt          0.113031
    # reply          0.027488
    # like           0.439499
    # retwt_cmmnt    0.007742
    pos_rate_wanted = POST_RATE_WANTED
    n_pos_wanted = int(len_df_wanted*pos_rate_wanted)
    print('n_pos_wanted', n_pos_wanted)
    np.random.seed(SEED)
    lst_df = []
    n_pos_ttl = 0
    for ichnk,df in enumerate(chnks_trn):
        #skip first chunk (it was validate data)
        if ichnk==0: continue
        print(dtnow(), 'chunk', ichnk)
        df = prp_df(df)
        n_pos_ttl+= df[tgtcol].sum()
        lst_df.append(df)
        if n_pos_ttl>=n_pos_wanted: break

    df = pd.concat(lst_df)
    df.reset_index(drop=True,inplace=True)


    # https://stackoverflow.com/questions/28556942/pandas-remove-rows-at-random-without-shuffling-dataset
    idx_neg=np.where(df[tgtcol]==0)[0]
    n_neg = len(idx_neg)
    n_pos = len(df)-len(idx_neg)
    n_neg2keep = len_df_wanted-n_pos
    n_neg2rmv = n_neg-n_neg2keep
    idx_neg2rmv = np.random.choice(idx_neg, n_neg2rmv, replace=False)
    dftrvl = df.drop(idx_neg2rmv)
    dftrvl = dftrvl.sample(len(dftrvl))
    for col in cols_cat:
        dftrvl[col]=dftrvl[col].astype('category')
    
#     display(dftrvl.dtypes)
    print('dftrvl.shape:',dftrvl.shape,'dftrvl[tgtcol].mean():',dftrvl[tgtcol].mean())
    
    pops={
        'train_pop':len(dftrvl),
        'target_pop':n_pos,
        'sampled_train_pop':len_df_wanted,
        'sampled_target_pop':n_pos,
    }
    print(pops)
    return dftrvl, pops

# model

In [10]:
def train(params,dtr,dvl):
    print(params)
    best_params, tuning_history = dict(), list()
    evallist = [(dtr, 'train'), (dvl, 'eval')]
    bst = lgb.train(params=params, 
                    train_set=dtr, 
                    valid_sets=[dtr, dvl],
                    best_params=best_params,
                    tuning_history=tuning_history,
                    verbose_eval=100,
                    early_stopping_rounds=100,
                   )
                    
    return bst,best_params,tuning_history

def valid(bst,dftr,dfvl):
    prdtr = bst.predict(dftr[cols_feat],num_iteration=bst.best_iteration)
    prdvl = bst.predict(dfvl[cols_feat],num_iteration=bst.best_iteration)
    return prdtr,prdvl

def do_tgt(tgt):
    params=tgt2params[tgt]
    tgtcol=tgt2col[tgt]
    dftrvl, pops=getdftrvl(tgt)
    split=int(len(dftrvl)*0.85)
    dftr,dfvl=dftrvl[:split],dftrvl[split:]
    dtr = lgb.Dataset(dftr[cols_feat], label=dftr[tgtcol])
    dvl = lgb.Dataset(dfvl[cols_feat], label=dfvl[tgtcol])
    bst,best_params,tuning_history=train(params,dtr,dvl)
    prdtr,prdvl=valid(bst,dftr,dfvl)
    
    tgt2bst[tgt]=bst
    tgt2best_params[tgt]=best_params
    tgt2tuning_history[tgt]=tuning_history
    tgt2ytr[tgt]=dftr[tgtcol]
    tgt2yvl[tgt]=dfvl[tgtcol]
    tgt2pops[tgt]=pops
    tgt2prdtr[tgt]=prdtr
    tgt2prdvl[tgt]=prdvl


In [11]:
params_shared = {
    "objective": "binary",
    "metric": "binary_logloss",
    "verbosity": 0,
    "boosting_type": "gbdt",
}
tgt2params = {k:params_shared for k in tgts}

tgt2bst={}
tgt2best_params={}
tgt2tuning_history={}
tgt2ytr={}
tgt2yvl={}
tgt2prdtr={}
tgt2prdvl={}
tgt2pops={}
for tgt in tgts:
    print(dtnow(), tgt, '*'*80)
    do_tgt(tgt)
    
pickle.dump(tgt2bst, open(f"{p_out}/tgt2bst.p", "wb"))

2020-03-24 15:38:05 Retweet ********************************************************************************
Retweet
n_pos_wanted 100
2020-03-24 15:38:05 chunk 1
2020-03-24 15:38:05 chunk 2
dftrvl.shape: (1000, 28) dftrvl[tgtcol].mean(): 0.212


twttyp                  category
lang                    category
u1_fllwer_cnt              int64
u1_fllwng_cnt              int64
u1_vrfed                    bool
u2_fllwer_cnt              int64
u2_fllwng_cnt              int64
u2_vrfed                    bool
u1_fllw_u2                  bool
len_toks                   int64
has_media_Photo             bool
has_media_Video             bool
has_media_GIF               bool
num_hshtgs                 int64
num_links                  int64
num_domns                  int64
twt_age                    int64
u1_age                     int64
u2_age                     int64
tm_dayofweek               int64
tm_hour                    int64
tmdlta_u2u1                int64
u1_fllwer_cnt_by_age     float64
u1_fllwng_cnt_by_age     float64
retwt                       int8
reply                       int8
like                        int8
retwt_cmmnt                 int8
dtype: object

{'train_pop': 1000, 'target_pop': 212, 'sampled_train_pop': 1000, 'sampled_target_pop': 212}
{'objective': 'binary', 'metric': 'binary_logloss', 'verbosity': 0, 'boosting_type': 'gbdt'}


tune_feature_fraction, val_score: inf:   0%|          | 0/7 [00:00<?, ?it/s]

Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.0372239	valid_1's binary_logloss: 0.676846
Early stopping, best iteration is:
[28]	training's binary_logloss: 0.217313	valid_1's binary_logloss: 0.495529


tune_feature_fraction, val_score: 0.495529:  14%|#4        | 1/7 [00:02<00:15,  2.65s/it]

Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.0327378	valid_1's binary_logloss: 0.687035
Early stopping, best iteration is:
[13]	training's binary_logloss: 0.322383	valid_1's binary_logloss: 0.496011


tune_feature_fraction, val_score: 0.495529:  29%|##8       | 2/7 [00:05<00:12,  2.59s/it]

Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.0294413	valid_1's binary_logloss: 0.702776
Early stopping, best iteration is:
[17]	training's binary_logloss: 0.283621	valid_1's binary_logloss: 0.509902


tune_feature_fraction, val_score: 0.495529:  43%|####2     | 3/7 [00:07<00:10,  2.56s/it]

Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.0267399	valid_1's binary_logloss: 0.688221
Early stopping, best iteration is:
[24]	training's binary_logloss: 0.219965	valid_1's binary_logloss: 0.485437


tune_feature_fraction, val_score: 0.485437:  57%|#####7    | 4/7 [00:11<00:08,  2.75s/it]

Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.0244557	valid_1's binary_logloss: 0.717433
Early stopping, best iteration is:
[9]	training's binary_logloss: 0.355337	valid_1's binary_logloss: 0.506939


tune_feature_fraction, val_score: 0.485437:  71%|#######1  | 5/7 [00:13<00:05,  2.71s/it]

Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.0235907	valid_1's binary_logloss: 0.7071
Early stopping, best iteration is:
[11]	training's binary_logloss: 0.32465	valid_1's binary_logloss: 0.49389


tune_feature_fraction, val_score: 0.485437:  86%|########5 | 6/7 [00:15<00:02,  2.61s/it]

Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.021596	valid_1's binary_logloss: 0.731467
Early stopping, best iteration is:
[12]	training's binary_logloss: 0.31415	valid_1's binary_logloss: 0.496003


tune_feature_fraction, val_score: 0.485437: 100%|##########| 7/7 [00:18<00:00,  2.59s/it]
tune_num_leaves, val_score: 0.485437:   0%|          | 0/20 [00:00<?, ?it/s]

Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.0266154	valid_1's binary_logloss: 0.698874
Early stopping, best iteration is:
[20]	training's binary_logloss: 0.249204	valid_1's binary_logloss: 0.488118


tune_num_leaves, val_score: 0.485437:   5%|5         | 1/20 [00:03<00:57,  3.02s/it]

Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.0266154	valid_1's binary_logloss: 0.698874
Early stopping, best iteration is:
[20]	training's binary_logloss: 0.249204	valid_1's binary_logloss: 0.488118


tune_num_leaves, val_score: 0.485437:  10%|#         | 2/20 [00:06<00:54,  3.00s/it]

Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.0266154	valid_1's binary_logloss: 0.698874
Early stopping, best iteration is:
[20]	training's binary_logloss: 0.249204	valid_1's binary_logloss: 0.488118


tune_num_leaves, val_score: 0.485437:  15%|#5        | 3/20 [00:09<00:51,  3.05s/it]

Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.0266154	valid_1's binary_logloss: 0.698874
Early stopping, best iteration is:
[20]	training's binary_logloss: 0.249204	valid_1's binary_logloss: 0.488118


tune_num_leaves, val_score: 0.485437:  20%|##        | 4/20 [00:12<00:49,  3.12s/it]

Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.0266154	valid_1's binary_logloss: 0.698874
Early stopping, best iteration is:
[20]	training's binary_logloss: 0.249204	valid_1's binary_logloss: 0.488118


tune_num_leaves, val_score: 0.485437:  25%|##5       | 5/20 [00:16<00:47,  3.19s/it]

Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.0266154	valid_1's binary_logloss: 0.698874
Early stopping, best iteration is:
[20]	training's binary_logloss: 0.249204	valid_1's binary_logloss: 0.488118


tune_num_leaves, val_score: 0.485437:  30%|###       | 6/20 [00:19<00:43,  3.14s/it]

Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.0266154	valid_1's binary_logloss: 0.698874
Early stopping, best iteration is:
[20]	training's binary_logloss: 0.249204	valid_1's binary_logloss: 0.488118


tune_num_leaves, val_score: 0.485437:  35%|###5      | 7/20 [00:22<00:40,  3.15s/it]

Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.0266154	valid_1's binary_logloss: 0.698874
Early stopping, best iteration is:
[20]	training's binary_logloss: 0.249204	valid_1's binary_logloss: 0.488118


tune_num_leaves, val_score: 0.485437:  40%|####      | 8/20 [00:23<00:33,  2.77s/it]

Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.0266154	valid_1's binary_logloss: 0.698874
Early stopping, best iteration is:
[20]	training's binary_logloss: 0.249204	valid_1's binary_logloss: 0.488118


tune_num_leaves, val_score: 0.485437:  45%|####5     | 9/20 [00:25<00:27,  2.54s/it]

Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.0266154	valid_1's binary_logloss: 0.698874
Early stopping, best iteration is:
[20]	training's binary_logloss: 0.249204	valid_1's binary_logloss: 0.488118


tune_num_leaves, val_score: 0.485437:  50%|#####     | 10/20 [00:28<00:23,  2.40s/it]

Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.238082	valid_1's binary_logloss: 0.527882
Early stopping, best iteration is:
[45]	training's binary_logloss: 0.325119	valid_1's binary_logloss: 0.490359


tune_num_leaves, val_score: 0.485437:  55%|#####5    | 11/20 [00:28<00:17,  1.93s/it]

Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.0266154	valid_1's binary_logloss: 0.698874
Early stopping, best iteration is:
[20]	training's binary_logloss: 0.249204	valid_1's binary_logloss: 0.488118


tune_num_leaves, val_score: 0.485437:  60%|######    | 12/20 [00:31<00:16,  2.05s/it]

Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.0266154	valid_1's binary_logloss: 0.698874
Early stopping, best iteration is:
[20]	training's binary_logloss: 0.249204	valid_1's binary_logloss: 0.488118


tune_num_leaves, val_score: 0.485437:  65%|######5   | 13/20 [00:33<00:15,  2.18s/it]

Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.0266154	valid_1's binary_logloss: 0.698874
Early stopping, best iteration is:
[20]	training's binary_logloss: 0.249204	valid_1's binary_logloss: 0.488118


tune_num_leaves, val_score: 0.485437:  70%|#######   | 14/20 [00:36<00:13,  2.32s/it]

Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.330625	valid_1's binary_logloss: 0.494024
Early stopping, best iteration is:
[48]	training's binary_logloss: 0.38734	valid_1's binary_logloss: 0.477118


tune_num_leaves, val_score: 0.477118:  75%|#######5  | 15/20 [00:37<00:09,  1.85s/it]

Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.330625	valid_1's binary_logloss: 0.494024
Early stopping, best iteration is:
[48]	training's binary_logloss: 0.38734	valid_1's binary_logloss: 0.477118


tune_num_leaves, val_score: 0.477118:  80%|########  | 16/20 [00:37<00:05,  1.45s/it]

Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.374678	valid_1's binary_logloss: 0.495209


KeyboardInterrupt: 

In [12]:
import lightgbm

In [14]:
lightgbm.__version__

'2.3.1'

# analyze

## tr vl

In [None]:
tgt2auc_tr={}
tgt2rce_tr={}
tgt2auc_vl={}
tgt2rce_vl={}
for tgt in tgt2bst:
    print(tgt)
    prdtr_i, prdvl_i = tgt2prdtr[tgt], tgt2prdvl[tgt]
    ytr_i, yvl_i = tgt2ytr[tgt], tgt2yvl[tgt]
    scr_auc_tr=compute_prauc(prdtr_i, ytr_i)
    scr_rce_tr=compute_rce(prdtr_i, ytr_i)
    scr_auc_vl=compute_prauc(prdvl_i, yvl_i)
    scr_rce_vl=compute_rce(prdvl_i, yvl_i)

    tgt2auc_tr[tgt]=scr_auc_tr
    tgt2rce_tr[tgt]=scr_rce_tr
    tgt2auc_vl[tgt]=scr_auc_vl
    tgt2rce_vl[tgt]=scr_rce_vl
    
    print('tr prauc:', f'{scr_auc_tr:.4f}','tr rce:', f'{scr_rce_tr:.4f}', )
    print('vl prauc:', f'{scr_auc_vl:.4f}','vl rce:', f'{scr_rce_vl:.4f}', )

In [None]:
lsttr=[]
lstvl=[]
for tgt in ['Retweet','Reply','Like','RTwCmnt',]:
    if tgt not in tgt2bst: continue
    lsttr+=[(f'PRAUC {tgt}',tgt2auc_tr[tgt]),
          (f'RCE {tgt}',tgt2rce_tr[tgt])]
    lstvl+=[(f'PRAUC {tgt}',tgt2auc_vl[tgt]),
          (f'RCE {tgt}',tgt2rce_vl[tgt])]

dfscrtr=pd.DataFrame(lsttr)
dfscrtr.columns=['metric','scr']
dfscrvl=pd.DataFrame(lstvl)
dfscrvl.columns=['metric','scr']
dfscr = pd.merge(dfscrtr, dfscrvl, on='metric', suffixes=('tr','vl'))
dfscr.T

## valid

In [None]:
# tgt=tgts[1]
# tgtcol=tgt2col[tgt]
# bst=tgt2bst[tgt]

# dvalid=xgb.DMatrix(dfvalid[cols_feat], label=dfvalid[tgtcol], feature_names=cols_feat)

# prdvalid = bst.predict(dvalid, ntree_limit=bst.best_ntree_limit)

# pops=tgt2pops[tgt]

# prdvalid[:10]
# # array([0.11734424, 0.09971393, 0.05619054, 0.03059793, 0.07979691,
# #        0.01358252, 0.05293725, 0.27954698, 0.05738379, 0.01741553],
# #       dtype=float32)


# pops
# # {'train_pop': 4000000,
# #  'target_pop': 109752,
# #  'sampled_train_pop': 1000000,
# #  'sampled_target_pop': 109752}

# prdvalid_calib = calibration(prdvalid, **pops)

# prdvalid_calib[:10]
# # array([0.02952491, 0.02471944, 0.01344113, 0.00717127, 0.01945818,
# #        0.00314114, 0.0126298 , 0.08155248, 0.01373977, 0.00403964],
# #       dtype=float32)

In [None]:
def do_post_valid(tgt):
    tgtcol=tgt2col[tgt]
    bst=tgt2bst[tgt]
    pops=tgt2pops[tgt]
    prdvalid = bst.predict(dfvalid[cols_feat])
    prdvalid_calib = calibration(prdvalid, **pops)
    return prdvalid,prdvalid_calib

In [None]:
tgt2yvalid={tgt:dfvalid[tgt2col[tgt]] for tgt in tgts}
tgt2prdvalid={}
tgt2prdvalid_calib={}
for tgt in tgts:
    print(dtnow(), tgt)
    tgt2prdvalid[tgt],tgt2prdvalid_calib[tgt]=do_post_valid(tgt)


In [None]:
tgt2auc_valid={}
tgt2rce_valid={}
tgt2auc_valid_calib={}
tgt2rce_valid_calib={}
for tgt in tgts:
    print(dtnow(), tgt)
    prdvalid, prdvalid_calib = tgt2prdvalid[tgt], tgt2prdvalid_calib[tgt]
    yvalid = tgt2yvalid[tgt]
    scr_auc_valid=compute_prauc(prdvalid, yvalid)
    scr_rce_valid=compute_rce(prdvalid, yvalid)
    scr_auc_valid_calib=compute_prauc(prdvalid_calib, yvalid)
    scr_rce_valid_calib=compute_rce(prdvalid_calib, yvalid)

    tgt2auc_valid[tgt]=scr_auc_valid
    tgt2rce_valid[tgt]=scr_rce_valid
    tgt2auc_valid_calib[tgt]=scr_auc_valid_calib
    tgt2rce_valid_calib[tgt]=scr_rce_valid_calib

In [None]:
for tgt in tgts:
    print(tgt)
    print('tr          prauc:', f'{tgt2auc_tr[tgt]:.4f}','tr rce:', f'{tgt2rce_tr[tgt]:.4f}', )
    print('vl          prauc:', f'{tgt2auc_vl[tgt]:.4f}','tr rce:', f'{tgt2rce_vl[tgt]:.4f}', )
    print('valid       prauc:', f'{tgt2auc_valid[tgt]:.4f}','tr rce:', f'{tgt2rce_valid[tgt]:.4f}', )
    print('valid_calib prauc:', f'{tgt2auc_valid_calib[tgt]:.4f}','tr rce:', f'{tgt2rce_valid_calib[tgt]:.4f}', )
    

In [None]:
lsttr=[]
lstvl=[]
lstvalid=[]
lstvalid_calib=[]
for tgt in ['Retweet','Reply','Like','RTwCmnt',]:
    if tgt not in tgt2bst: continue
    lsttr+=[(f'PRAUC {tgt}',tgt2auc_tr[tgt]),
          (f'RCE {tgt}',tgt2rce_tr[tgt])]
    lstvl+=[(f'PRAUC {tgt}',tgt2auc_vl[tgt]),
          (f'RCE {tgt}',tgt2rce_vl[tgt])]
    lstvalid+=[(f'PRAUC {tgt}',tgt2auc_valid[tgt]),
          (f'RCE {tgt}',tgt2rce_valid[tgt])]
    lstvalid_calib+=[(f'PRAUC {tgt}',tgt2auc_valid_calib[tgt]),
          (f'RCE {tgt}',tgt2rce_valid_calib[tgt])]

dfscrtr=pd.DataFrame(lsttr)
dfscrtr.columns=['metric','scr']
dfscrvl=pd.DataFrame(lstvl)
dfscrvl.columns=['metric','scr']
dfscrvalid=pd.DataFrame(lstvalid)
dfscrvalid.columns=['metric','scr']
dfscrvalid_calib=pd.DataFrame(lstvalid_calib)
dfscrvalid_calib.columns=['metric','scr']

dfscr = reduce(lambda df1,df2: pd.merge(df1,df2,on='metric'), 
            [dfscrtr,dfscrvl,dfscrvalid,dfscrvalid_calib])

dfscr.columns=['scr','tr','vl','valid','valid_calib']
dfscr.T

# infer

In [None]:
%%time
dftst=pd.read_csv(
    f'{p_in}/val_{valtmstmp}.tsv',
#     f'{p_in}/val_259A6F6DFD672CB1F883CBEC01B99F2D_1584405047.tsv',
    sep='\x01', header=None, names=cols_val,)


In [None]:
%%time
dftst = prp_df(dftst, istrn=False)

In [None]:
%%time
dtst = xgb.DMatrix(dftst[cols_feat], feature_names=cols_feat)
tgt2prdtst={}
for tgt in tgts:
    print(dtnow(), tgt)
    bst = tgt2bst[tgt]
    pops=tgt2pops[tgt]
    prdtst = bst.predict(dtst, ntree_limit=bst.best_ntree_limit)
    prdtst_calib = calibration(prdtst, **pops)
    tgt2prdtst[tgt] = prdtst_calib

In [None]:
dfsub_ids = dftst[['twtid','u2id',]]

tgt2dfsub = {}
for tgt,prdtst in tgt2prdtst.items():
    dfsub = dfsub_ids.copy()
    dfsub['scr'] = prdtst
    tgt2dfsub[tgt]=dfsub

In [None]:
%%time
for i,tgt in enumerate(['Retweet','Reply','RTwCmnt','Like',]):
    dfsub = tgt2dfsub[tgt]
    print(dtnow(), tgt)
    dfsub.to_csv(f'{p_out}/{i}_{tgt}__{valtmstmp}__{PRFX}.csv',index=False,header=False)