In [146]:
import pandas as pd
import numpy as np
import os, sys, joblib, math, time
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from itertools import combinations
import xgboost as xgb
import lightgbm as lgb
from catboost import Pool, CatBoostClassifier
import operator

In [110]:
# GLOBALS
LOCAL_ROOT = '/Users/varunnathan/Documents/General/ExternalTest/'
PROJ_DIR = os.path.join(LOCAL_ROOT, 'Bukukas')
RAW_DIR = os.path.join(PROJ_DIR, 'raw')
INTER_DIR = os.path.join(PROJ_DIR, 'inter')
MODEL_DIR = os.path.join(PROJ_DIR, 'model')
INP_FN = os.path.join(RAW_DIR, 'data.xlsx')
DV = 'habit4'
USER = 'user_id'

In [149]:
%%time
print('read data')
df = pd.read_excel(INP_FN, sheet_name='data')

print('formatting data')
column = df.columns.tolist()[0]
df[column] = df[column].str.split(',')
new_columns = column.split(',')
df = pd.DataFrame(df[column].to_list(), columns=new_columns)

print(df.shape)
df.head()

read data
formatting data
(12011, 62)
CPU times: user 552 ms, sys: 54.6 ms, total: 607 ms
Wall time: 608 ms


Unnamed: 0,user_id,ts_install,ts_otp,profile_completed_at,ts_first_key_event,ts_second_key_event,ts_third_key_event,first_key_event,second_key_event,third_key_event,...,no_inventory_24h,no_payable_72h,no_receivable_72h,no_payment_in_72h,no_payment_out_72h,no_purchase_72h,no_sale_w_cogs_72h,no_sale_72h,no_inventory_72h,habit4
0,2999895,,,2021-01-28 06:49:47.554186 UTC,,,,,,,...,0,0,0,0,0,0,0,0,0,0
1,3002178,2021-03-05 08:10:39 UTC,,2021-01-28 11:09:21.50913 UTC,,,,,,,...,0,0,0,0,0,0,0,0,0,0
2,2999247,,,2021-01-28 03:54:45.229602 UTC,,,,,,,...,0,0,0,0,0,0,0,0,0,0
3,3157635,,,2021-02-05 22:39:56.262401 UTC,,,,,,,...,0,0,0,0,0,0,0,0,0,0
4,3259592,,,2021-02-12 05:23:34.621182 UTC,,,,,,,...,0,0,0,0,0,0,0,0,0,0


In [150]:
# user_id column uniqueness
assert df['user_id'].nunique() == df.shape[0]

In [151]:
# dtype adjustment
numeric_cols = ['no_bus_created_1h','no_bus_created_12h','no_bus_created_24h',
                'no_bus_created_72h','no_screens_1h','no_screens_12h','no_screens_24h',
                'no_screens_72h','no_sessions_1h','no_sessions_12h','no_sessions_24h',
                'no_sessions_72h','no_payable_1h','no_receivable_1h','no_payment_in_1h',
                'no_payment_out_1h','no_purchase_1h','no_sale_w_cogs_1h','no_sale_1h',
                'no_inventory_1h','no_payable_12h','no_receivable_12h','no_payment_in_12h',
                'no_payment_out_12h','no_purchase_12h','no_sale_w_cogs_12h','no_sale_12h',
                'no_inventory_12h','no_payable_24h','no_receivable_24h','no_payment_in_24h',
                'no_payment_out_24h','no_purchase_24h','no_sale_w_cogs_24h','no_sale_24h',
                'no_inventory_24h','no_payable_72h','no_receivable_72h','no_payment_in_72h',
                'no_payment_out_72h','no_purchase_72h','no_sale_w_cogs_72h','no_sale_72h',
                'no_inventory_72h']
cat_cols = ['first_key_event','second_key_event','third_key_event','city','os','source_l0',
            'source_l1','source_l2','user_segment','bus_cat_most_used']
date_cols = ['ts_install','ts_otp','profile_completed_at','ts_first_key_event',
             'ts_second_key_event','ts_third_key_event']

In [152]:
assert len(numeric_cols) + len(cat_cols) + len(date_cols) + 2 == df.shape[1]

In [153]:
print('numeric')
for col in numeric_cols+[USER, DV]:
    df[col] = pd.to_numeric(df[col])

print('date')
for col in date_cols:
    df[col] = pd.to_datetime(df[col])

print('categorical')
for col in cat_cols:
    print(col, df[col].nunique())

numeric
date
categorical
first_key_event 9
second_key_event 9
third_key_event 9
city 9
os 3
source_l0 3
source_l1 12
source_l2 12
user_segment 3
bus_cat_most_used 12


In [154]:
df.dtypes.tolist()

[dtype('int64'),
 datetime64[ns, UTC],
 datetime64[ns, UTC],
 datetime64[ns, UTC],
 datetime64[ns, UTC],
 datetime64[ns, UTC],
 datetime64[ns, UTC],
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('

In [155]:
print('DV Rate: ', df[DV].mean())

DV Rate:  0.054033802347847805


In [157]:
df[date_cols].isnull().sum()

ts_install              4762
ts_otp                  2095
profile_completed_at       0
ts_first_key_event      4728
ts_second_key_event     6354
ts_third_key_event      7276
dtype: int64

In [158]:
mask = df['ts_install'] > df['profile_completed_at']
print(mask.sum())
print(df.loc[mask, :].head())

215
     user_id                ts_install ts_otp  \
1    3002178 2021-03-05 08:10:39+00:00    NaT   
17   3041315 2021-02-26 10:31:24+00:00    NaT   
138  3090455 2021-02-11 17:50:32+00:00    NaT   
153  3042165 2021-03-03 02:27:15+00:00    NaT   
186  3021632 2021-02-24 11:35:45+00:00    NaT   

                profile_completed_at               ts_first_key_event  \
1   2021-01-28 11:09:21.509130+00:00                              NaT   
17  2021-01-29 18:55:48.540481+00:00        2021-01-29 18:56:28+00:00   
138 2021-02-02 00:00:37.305996+00:00                              NaT   
153 2021-01-29 19:54:31.842994+00:00 2021-01-29 19:55:46.067000+00:00   
186 2021-01-28 22:00:49.790929+00:00 2021-01-28 22:02:00.411000+00:00   

    ts_second_key_event ts_third_key_event first_key_event second_key_event  \
1                   NaT                NaT                                    
17                  NaT                NaT      receivable                    
138                 NaT  

In [251]:
def getCountVar(compute_df, count_df, var_name, count_var):
    """
    compute_df : Data frame for which the count encoding should be done
    count_df : Data frame from which the counts should be taken
    var_name : categorical variable for count encoding
    count_var : some other variable from the dataset (used as dummy variable to get count)
    """
    grouped_df = count_df.groupby(var_name, as_index=False)[count_var].agg('count')
    grouped_df.columns = [var_name, "var_count"]
    merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name)
    merged_df.fillna(-1, inplace=True)
    return list(merged_df["var_count"])


def getDVEncodeVar(compute_df, target_df, var_name, target_var):
    if type(var_name) != type([]):
        var_name = [var_name]
    grouped_df = target_df.groupby(var_name)[target_var].agg(["mean"]).reset_index()
    grouped_df.columns = var_name + ["mean_value"]
    merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name)
    merged_df.fillna(-1, inplace=True)
    return list(merged_df["mean_value"])


def do_target_encode(train_df, test_df, cols_to_encode, target_col, encode_type, n_splits=3):
        
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=2020)
    for col in cols_to_encode:
        train_enc_values = np.zeros(train_df.shape[0])
        test_enc_values = 0
        for dev_index, val_index in kf.split(train_df):
            new_train_df = train_df[[col, target_col]]
            dev_X, val_X = new_train_df.iloc[dev_index], new_train_df.iloc[val_index]
            
            if encode_type == 'dv':
                train_enc_values[val_index] =  np.array( 
                    getDVEncodeVar(val_X[[col]], dev_X, col, target_col))
                test_enc_values += np.array( 
                    getDVEncodeVar(test_df[[col]], dev_X, col, target_col))
            elif encode_type == 'count':
                train_enc_values[val_index] =  np.array( 
                    getCountVar(val_X[[col]], dev_X, col, target_col))
                test_enc_values += np.array( 
                    getCountVar(test_df[[col]], dev_X, col, target_col))
        
        test_enc_values /= n_splits
        train_df[col + "_{}_enc_{}".format(encode_type, target_col)] = train_enc_values
        test_df[col + "_{}_enc_{}".format(encode_type, target_col)] = test_enc_values
        
    return train_df, test_df


def create_feature_map(features):
    out_fn = os.path.join(MODEL_DIR, 'xgb.fmap')
    outfile = open(out_fn, 'w')
    for i, feat in enumerate(features):
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
    outfile.close()


def runXGB(train_X, train_y, test_X, test_y=None, test_X2=None,
           feature_names=None, seed_val=0, rounds=500, dep=8, eta=0.05):
    params = {}
    params["objective"] = "binary:logistic"
    params['eval_metric'] = "auc"
    params["eta"] = eta
    params["subsample"] = 0.7
    params["min_child_weight"] = 1
    params["colsample_bytree"] = 0.7
    params["max_depth"] = dep

    params["silent"] = 1
    params["seed"] = seed_val
    # params["max_delta_step"] = 2
    # params["gamma"] = 0.5
    num_rounds = rounds

    plst = list(params.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [(xgtrain, 'train'), (xgtest, 'test')]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist,
                          early_stopping_rounds=100, verbose_eval=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    if feature_names is not None:
        create_feature_map(feature_names)
        xgb_model_fn = os.path.join(MODEL_DIR, 'xgbmodel.txt')
        xgb_fmap_fn = os.path.join(MODEL_DIR, 'xgb.fmap')
        xgb_imp_fn = os.path.join(MODEL_DIR, 'imp_feat.txt')
        model.dump_model(xgb_model_fn, xgb_fmap_fn, with_stats=True)
        importance = model.get_fscore(fmap=xgb_fmap_fn)
        importance = sorted(importance.items(), key=operator.itemgetter(1),
                            reverse=True)
        imp_df = pd.DataFrame(importance, columns=['feature', 'fscore'])
        imp_df['fscore'] = imp_df['fscore'] / imp_df['fscore'].sum()
        imp_df.to_csv(xgb_imp_fn, index=False)

    pred_test_y = model.predict(xgtest,
                                ntree_limit=model.best_ntree_limit)
    if test_X2 is not None:
        pred_test_y2 = model.predict(xgb.DMatrix(test_X2),
                                     ntree_limit=model.best_ntree_limit)
    else:
        pred_test_y2 = None

    loss = 0
    if test_y is not None:
        loss = metrics.roc_auc_score(test_y, pred_test_y)

    return pred_test_y, loss, pred_test_y2


def runLGB(train_X, train_y, test_X, test_y=None, test_X2=None,
           feature_names=None, seed_val=0, rounds=500, dep=8, eta=0.05):
    params = {}
    params["objective"] = "binary"
    params['metric'] = "auc"
    params['seed'] = seed_val
    params["max_depth"] = dep
    params["num_leaves"] = 70
    params["min_data_in_leaf"] = 20
    params["learning_rate"] = eta
    params["bagging_fraction"] = 0.7
    params["feature_fraction"] = 0.7
    params["bagging_freq"] = 5
    params["bagging_seed"] = seed_val
    params["verbosity"] = 0
    num_rounds = rounds

    lgtrain = lgb.Dataset(train_X, label=train_y)

    if test_y is not None:
        lgtest = lgb.Dataset(test_X, label=test_y)
        model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgtest],
                          early_stopping_rounds=100, verbose_eval=20)
    else:
        lgtest = lgb.DMatrix(test_X)
        model = lgb.train(params, lgtrain, num_rounds)

    pred_test_y = model.predict(test_X,
                                num_iteration=model.best_iteration)
    
    if test_X2 is not None:
        pred_test_y2 = model.predict(test_X2,
                                     num_iteration=model.best_iteration)
    else:
        pred_test_y2 = None
        
    loss = 0
    if test_y is not None:
        loss = metrics.roc_auc_score(test_y, pred_test_y)

    return pred_test_y, loss, pred_test_y2


def trainModel(train_X, train_y, test_X, test_y, n_splits, model_name, feats, **params):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=2020)
    cv_scores = []
    pred_test_full = 0
    pred_val_full = np.zeros(train_X.shape[0])
    for dev_index, val_index in kf.split(train_X):
        dev_X, val_X = train_X.iloc[dev_index, :], train_X.iloc[val_index, :]
        dev_y, val_y = train_y[dev_index], train_y[val_index]

        if model_name == "XGB":
            pred_val, acc, pred_test = runXGB(
             dev_X, dev_y, val_X, val_y, test_X, rounds=params['rounds'],
             dep=params['depth'], eta=params['eta'], feature_names=feats)
        elif model_name == "LGB":
            pred_val, acc, pred_test = runLGB(
             dev_X, dev_y, val_X, val_y, test_X, rounds=params['rounds'],
             dep=params['depth'], eta=params['eta'])
        
        cv_scores.append(acc)
        pred_val_full[val_index] = pred_val
        pred_test_full = pred_test_full + pred_test

    pred_test_full /= n_splits
    auc = metrics.roc_auc_score(train_y, pred_val_full)
    test_auc = metrics.roc_auc_score(test_y, pred_test_full)
    return pred_val_full, pred_test_full, cv_scores, auc, test_auc


def calc_prec_rec_f(pred_class, y_test):
    tn, fp, fn, tp = metrics.confusion_matrix(y_test, pred_class).ravel()
    prec = 1.*tp/(tp+fp+0.000000001)
    rec = 1.*tp/(tp+fn+0.000000001)
    f_score = 2.*prec*rec/(prec + rec + 0.000000001)
    return prec, rec, f_score


def optimal_cutoff(pred_test, y_test, low=0, high=1000):
    out = []
    for i in range(low, high, 1):
        if i % 20 == 0:
            print(i)
        pred_class = list(map(lambda x: 1 if x > (i/1000.) else 0, pred_test))
        prec, rec, f_score = calc_prec_rec_f(pred_class, y_test)
        d = {'precision': prec, 'recall': rec, 'f_score': f_score, 'threshold': i/1000.}
        out.append(d)
    out = pd.DataFrame(out)
    out.sort_values('f_score', ascending=False, inplace=True)
    out.reset_index(drop=True, inplace=True)
    return out

In [160]:
print('mapping for categorical vars\n')
le_pipes = []
for var in cat_cols:
    le = LabelEncoder()
    le.fit(df[var].values)

    df[var] = le.transform(df[var].values)
    le_pipes.append((var, le))

mapping for categorical vars



In [161]:
"""
date features
1. time_to_register_from_installation
2. time_to_register_from_otp
3. time_to_receive_otp_from_installation
4. time_to_first_event_from_registration
5. time_to_first_event_from_installation
6. time_to_second_event_from_registration
7. time_to_second_event_from_installation
8. time_to_third_event_from_registration
9. time_to_third_event_from_installation
"""
def get_time_bet_two_dates(date1, date2):
    return (date2 - date1).days


for col1, col2, name in [('ts_install', 'profile_completed_at',
                          'time_to_register_from_installation'),
                         ('ts_otp', 'profile_completed_at',
                          'time_to_register_from_otp'),
                        ('ts_install', 'ts_otp',
                          'time_to_receive_otp_from_installation'),
                        ('profile_completed_at', 'ts_first_key_event',
                          'time_to_first_event_from_registration'),
                        ('ts_install', 'ts_first_key_event',
                          'time_to_first_event_from_installation'),
                        ('profile_completed_at', 'ts_second_key_event',
                          'time_to_second_event_from_registration'),
                        ('ts_install', 'ts_second_key_event',
                          'time_to_second_event_from_installation'),
                        ('profile_completed_at', 'ts_third_key_event',
                          'time_to_third_event_from_registration'),
                        ('ts_install', 'ts_third_key_event',
                          'time_to_third_event_from_installation')]:
    print(name)
    df[name] = list(map(lambda date1, date2: get_time_bet_two_dates(date1, date2),
                        df[col1], df[col2]))
    numeric_cols.append(name)

time_to_register_from_installation
time_to_register_from_otp
time_to_receive_otp_from_installation
time_to_first_event_from_registration
time_to_first_event_from_installation
time_to_second_event_from_registration
time_to_second_event_from_installation
time_to_third_event_from_registration
time_to_third_event_from_installation


In [162]:
mask1 = df['ts_install'].notnull()
mask2 = df['ts_third_key_event'].notnull()
df.loc[mask1&mask2, ['ts_install', 'ts_third_key_event',
                     'time_to_third_event_from_installation']].isnull().sum()

ts_install                               0
ts_third_key_event                       0
time_to_third_event_from_installation    0
dtype: int64

In [163]:
print('encoding cat_cols by aggregating numeric cols\n')
for cat_col in cat_cols:
    for num_col in numeric_cols:
        for func in ['mean', 'std']:
            feat_name = '_'.join([func, num_col, 'per', cat_col])
            print(feat_name)
            df[feat_name] = df.groupby([cat_col])[num_col].transform(func)
            print('\n')

print('count encodings for cat cols\n')
for cat_col in cat_cols:
    print(cat_col)
    df[cat_col+'_count'] = df.groupby(cat_col)[USER].transform('count')

print('encoding cat_cols interactions by aggregating numeric cols\n')
iter_cat_cols = list(combinations(cat_cols, 2))
for f1, f2 in iter_cat_cols:
    df[f1+'_'+f2+'_count'] = df.groupby([f1, f2])[USER].transform('count')

encoding cat_cols by aggregating numeric cols

mean_no_bus_created_1h_per_first_key_event


std_no_bus_created_1h_per_first_key_event


mean_no_bus_created_12h_per_first_key_event


std_no_bus_created_12h_per_first_key_event


mean_no_bus_created_24h_per_first_key_event


std_no_bus_created_24h_per_first_key_event


mean_no_bus_created_72h_per_first_key_event


std_no_bus_created_72h_per_first_key_event


mean_no_screens_1h_per_first_key_event


std_no_screens_1h_per_first_key_event


mean_no_screens_12h_per_first_key_event


std_no_screens_12h_per_first_key_event


mean_no_screens_24h_per_first_key_event


std_no_screens_24h_per_first_key_event


mean_no_screens_72h_per_first_key_event


std_no_screens_72h_per_first_key_event


mean_no_sessions_1h_per_first_key_event


std_no_sessions_1h_per_first_key_event


mean_no_sessions_12h_per_first_key_event


std_no_sessions_12h_per_first_key_event


mean_no_sessions_24h_per_first_key_event


std_no_sessions_24h_per_first_key_event


mean_no_



std_time_to_receive_otp_from_installation_per_second_key_event


mean_time_to_first_event_from_registration_per_second_key_event


std_time_to_first_event_from_registration_per_second_key_event


mean_time_to_first_event_from_installation_per_second_key_event


std_time_to_first_event_from_installation_per_second_key_event


mean_time_to_second_event_from_registration_per_second_key_event


std_time_to_second_event_from_registration_per_second_key_event


mean_time_to_second_event_from_installation_per_second_key_event


std_time_to_second_event_from_installation_per_second_key_event


mean_time_to_third_event_from_registration_per_second_key_event


std_time_to_third_event_from_registration_per_second_key_event


mean_time_to_third_event_from_installation_per_second_key_event


std_time_to_third_event_from_installation_per_second_key_event


mean_no_bus_created_1h_per_third_key_event


std_no_bus_created_1h_per_third_key_event


mean_no_bus_created_12h_per_third_key_event


std_no_b



std_no_payment_out_12h_per_os


mean_no_purchase_12h_per_os


std_no_purchase_12h_per_os


mean_no_sale_w_cogs_12h_per_os


std_no_sale_w_cogs_12h_per_os


mean_no_sale_12h_per_os


std_no_sale_12h_per_os


mean_no_inventory_12h_per_os


std_no_inventory_12h_per_os


mean_no_payable_24h_per_os


std_no_payable_24h_per_os


mean_no_receivable_24h_per_os


std_no_receivable_24h_per_os


mean_no_payment_in_24h_per_os


std_no_payment_in_24h_per_os


mean_no_payment_out_24h_per_os


std_no_payment_out_24h_per_os


mean_no_purchase_24h_per_os


std_no_purchase_24h_per_os


mean_no_sale_w_cogs_24h_per_os


std_no_sale_w_cogs_24h_per_os


mean_no_sale_24h_per_os


std_no_sale_24h_per_os


mean_no_inventory_24h_per_os


std_no_inventory_24h_per_os


mean_no_payable_72h_per_os


std_no_payable_72h_per_os


mean_no_receivable_72h_per_os


std_no_receivable_72h_per_os


mean_no_payment_in_72h_per_os


std_no_payment_in_72h_per_os


mean_no_payment_out_72h_per_os


std_no_payment_out_72h_per_os




mean_no_bus_created_12h_per_source_l2


std_no_bus_created_12h_per_source_l2


mean_no_bus_created_24h_per_source_l2


std_no_bus_created_24h_per_source_l2


mean_no_bus_created_72h_per_source_l2


std_no_bus_created_72h_per_source_l2


mean_no_screens_1h_per_source_l2


std_no_screens_1h_per_source_l2


mean_no_screens_12h_per_source_l2


std_no_screens_12h_per_source_l2


mean_no_screens_24h_per_source_l2


std_no_screens_24h_per_source_l2


mean_no_screens_72h_per_source_l2


std_no_screens_72h_per_source_l2


mean_no_sessions_1h_per_source_l2


std_no_sessions_1h_per_source_l2


mean_no_sessions_12h_per_source_l2


std_no_sessions_12h_per_source_l2


mean_no_sessions_24h_per_source_l2


std_no_sessions_24h_per_source_l2


mean_no_sessions_72h_per_source_l2


std_no_sessions_72h_per_source_l2


mean_no_payable_1h_per_source_l2


std_no_payable_1h_per_source_l2


mean_no_receivable_1h_per_source_l2


std_no_receivable_1h_per_source_l2


mean_no_payment_in_1h_per_source_l2


std_no_



std_no_sale_24h_per_bus_cat_most_used


mean_no_inventory_24h_per_bus_cat_most_used


std_no_inventory_24h_per_bus_cat_most_used


mean_no_payable_72h_per_bus_cat_most_used


std_no_payable_72h_per_bus_cat_most_used


mean_no_receivable_72h_per_bus_cat_most_used


std_no_receivable_72h_per_bus_cat_most_used


mean_no_payment_in_72h_per_bus_cat_most_used


std_no_payment_in_72h_per_bus_cat_most_used


mean_no_payment_out_72h_per_bus_cat_most_used


std_no_payment_out_72h_per_bus_cat_most_used


mean_no_purchase_72h_per_bus_cat_most_used


std_no_purchase_72h_per_bus_cat_most_used


mean_no_sale_w_cogs_72h_per_bus_cat_most_used


std_no_sale_w_cogs_72h_per_bus_cat_most_used


mean_no_sale_72h_per_bus_cat_most_used


std_no_sale_72h_per_bus_cat_most_used


mean_no_inventory_72h_per_bus_cat_most_used


std_no_inventory_72h_per_bus_cat_most_used


mean_time_to_register_from_installation_per_bus_cat_most_used


std_time_to_register_from_installation_per_bus_cat_most_used


mean_time_to_reg

In [192]:
print('split df into train and test\n')
features = [x for x in list(df.columns) if x not in [USER, DV]+date_cols]
print('# features: ', len(features))
x_train, x_test, y_train, y_test = train_test_split(
    df[features+[USER]], df[DV], test_size=0.2, random_state=123)
# dev and val samples
dev = pd.concat([x_train, y_train], axis=1)
val = pd.concat([x_test, y_test], axis=1)
cols = features + [DV, USER]
dev.reset_index(drop=True, inplace=True)
val.reset_index(drop=True, inplace=True)

print(dev.shape, val.shape)

split df into train and test

# features:  1178
(9608, 1180) (2403, 1180)


In [193]:
print('release memory\n')
del x_train, x_test, y_train, y_test

release memory



In [194]:
print('DV encodings\n')
dev, val = do_target_encode(dev, val, cat_cols, DV, 'dv', 3)

DV encodings



In [195]:
print('drop certain columns\n')
dev.drop(cat_cols, axis=1, inplace=True)
val.drop(cat_cols, axis=1, inplace=True)

print(dev.shape, val.shape)

drop certain columns

(9608, 1180) (2403, 1180)


In [196]:
print('prefix for features\n')
FEAT_PREFIX = 'BUK'
cols = list(dev.columns)
new_cols = [FEAT_PREFIX + '_'+ col if col not in (USER, DV) else col for col in cols]
rename_dct = dict(zip(cols, new_cols))
dev.rename(columns=rename_dct, inplace=True)
val.rename(columns=rename_dct, inplace=True)

prefix for features



In [203]:
print('prepare data for modelling\n')
feat_cols = [x for x in list(dev.columns) if x.startswith(FEAT_PREFIX)]
print('# features: ', len(feat_cols))
x_train = dev[feat_cols]
y_train = dev[DV]
x_test = val[feat_cols]
y_test = val[DV]
print('shape ', x_train.shape, x_test.shape)

print('release memory\n')
del dev, val

prepare data for modelling

# features:  1178
shape  (9608, 1178) (2403, 1178)
release memory



In [247]:
print('modelling begins...\n')
print('XGB\n')
params = {'rounds': 600, 'depth': 3, 'eta': 0.05}
start = time.time()
pred_val_full, pred_test_full, cv_scores, auc, test_auc = trainModel(x_train, y_train,
                                                                     x_test, y_test, 3, "XGB",
                                                                     feat_cols, **params)
print('time taken: %0.2f' % (time.time() - start))

print('cv scores: ', cv_scores)
print('Final CV AUC: ', auc)
print('Test AUC: ', test_auc)

modelling begins...

XGB

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-auc:0.89020	test-auc:0.89504
[20]	train-auc:0.92353	test-auc:0.91323
[40]	train-auc:0.93221	test-auc:0.92010
[60]	train-auc:0.93793	test-auc:0.92185
[80]	train-auc:0.94384	test-auc:0.92429
[100]	train-auc:0.94882	test-auc:0.92605
[120]	train-auc:0.95592	test-auc:0.92755
[140]	train-auc:0.96205	test-auc:0.92905
[160]	train-auc:0.96578	test-auc:0.92920
[180]	train-auc:0.97061	test-auc:0.92953
[200]	train-auc:0.97352	test-auc:0.92841
[220]	train-auc:0.97708	test-auc:0.92849
[240]	train-auc:0.97972	test-auc:0.92814
[260]	train-auc:0.98208	test-auc:0.92823
[280]	train-auc:0.98456	test-auc:0.92863
Parameters: { silent } might not be used.

  This may not be accurate due to some p

In [249]:
perf_df = optimal_cutoff(pred_test_full, y_test)

0
20
40
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440
460
480
500
520
540
560
580
600
620
640
660
680
700
720
740
760
780
800
820
840
860
880
900
920
940
960
980


In [250]:
mask1 = perf_df['precision'].notnull()
mask2 = perf_df['precision'] >= 0.3
perf_df.loc[mask1&mask2, :].head()

Unnamed: 0,precision,recall,f_score,threshold
0,0.321981,0.753623,0.451193,0.13
1,0.320988,0.753623,0.450216,0.129
2,0.320872,0.746377,0.448802,0.131
3,0.318043,0.753623,0.447312,0.128
4,0.315315,0.76087,0.44586,0.127


In [253]:
print('modelling begins...\n')
print('LGB\n')
params = {'rounds': 2000, 'depth': 4, 'eta': 0.01}
start = time.time()
pred_val_full, pred_test_full, cv_scores, auc, test_auc = trainModel(x_train, y_train,
                                                                     x_test, y_test, 3, "LGB",
                                                                     feat_cols, **params)
print('time taken: %0.2f' % (time.time() - start))

print('cv scores: ', cv_scores)
print('Final CV AUC: ', auc)
print('Test AUC: ', test_auc)

modelling begins...

LGB





You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 100 rounds
[20]	valid_0's auc: 0.917881
[40]	valid_0's auc: 0.922206
[60]	valid_0's auc: 0.923867
[80]	valid_0's auc: 0.924273
[100]	valid_0's auc: 0.925927
[120]	valid_0's auc: 0.926084
[140]	valid_0's auc: 0.926269


[160]	valid_0's auc: 0.92654
[180]	valid_0's auc: 0.926815
[200]	valid_0's auc: 0.927009
[220]	valid_0's auc: 0.926849
[240]	valid_0's auc: 0.927384
[260]	valid_0's auc: 0.927578
[280]	valid_0's auc: 0.92727
[300]	valid_0's auc: 0.927643


[320]	valid_0's auc: 0.927742
[340]	valid_0's auc: 0.927528
[360]	valid_0's auc: 0.927692
[380]	valid_0's auc: 0.927706
[400]	valid_0's auc: 0.9276
[420]	valid_0's auc: 0.928086
[440]	valid_0's auc: 0.927843
[460]	valid_0's auc: 0.927816


[480]	valid_0's auc: 0.9278
[500]	valid_0's auc: 0.927641
Early stopping, best iteration is:
[413]	valid_0's auc: 0.928111




You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 100 rounds
[20]	valid_0's auc: 0.905469
[40]	valid_0's auc: 0.906775
[60]	valid_0's auc: 0.908586
[80]	valid_0's auc: 0.90904
[100]	valid_0's auc: 0.910339
[120]	valid_0's auc: 0.910983


[140]	valid_0's auc: 0.911523
[160]	valid_0's auc: 0.912011
[180]	valid_0's auc: 0.912943
[200]	valid_0's auc: 0.913936
[220]	valid_0's auc: 0.914413
[240]	valid_0's auc: 0.915228
[260]	valid_0's auc: 0.915532
[280]	valid_0's auc: 0.915879


[300]	valid_0's auc: 0.915116
[320]	valid_0's auc: 0.91529
[340]	valid_0's auc: 0.915376
[360]	valid_0's auc: 0.916024
[380]	valid_0's auc: 0.916318
[400]	valid_0's auc: 0.916123
[420]	valid_0's auc: 0.915894


[440]	valid_0's auc: 0.915967
[460]	valid_0's auc: 0.916281
[480]	valid_0's auc: 0.916184
Early stopping, best iteration is:
[384]	valid_0's auc: 0.916494




You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 100 rounds
[20]	valid_0's auc: 0.912152
[40]	valid_0's auc: 0.917099
[60]	valid_0's auc: 0.918752
[80]	valid_0's auc: 0.918235
[100]	valid_0's auc: 0.919647
[120]	valid_0's auc: 0.920579
[140]	valid_0's auc: 0.920977
[160]	valid_0's auc: 0.920782


[180]	valid_0's auc: 0.921218
[200]	valid_0's auc: 0.921172
[220]	valid_0's auc: 0.921224
[240]	valid_0's auc: 0.921566
[260]	valid_0's auc: 0.92179
[280]	valid_0's auc: 0.921868
[300]	valid_0's auc: 0.922379
[320]	valid_0's auc: 0.922521


[340]	valid_0's auc: 0.92323
[360]	valid_0's auc: 0.924139
[380]	valid_0's auc: 0.924024
[400]	valid_0's auc: 0.924259
[420]	valid_0's auc: 0.92473
[440]	valid_0's auc: 0.924651
[460]	valid_0's auc: 0.924425


[480]	valid_0's auc: 0.924287
[500]	valid_0's auc: 0.924798
[520]	valid_0's auc: 0.925311
[540]	valid_0's auc: 0.925346
[560]	valid_0's auc: 0.925023
[580]	valid_0's auc: 0.925144
[600]	valid_0's auc: 0.925101
Early stopping, best iteration is:
[518]	valid_0's auc: 0.925352


time taken: 6.96
cv scores:  [0.9281110935092243, 0.9164941566347139, 0.9253518667142262]
Final CV AUC:  0.922439969134574
Test AUC:  0.9279713344210897


In [254]:
%time perf_df = optimal_cutoff(pred_test_full, y_test)

0
20
40
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440
460
480
500
520
540
560
580
600
620
640
660
680
700
720
740
760
780
800
820
840
860
880
900
920
940
960
980
CPU times: user 2.55 s, sys: 23.9 ms, total: 2.57 s
Wall time: 2.57 s


In [255]:
mask1 = perf_df['precision'].notnull()
mask2 = perf_df['precision'] >= 0.3
perf_df.loc[mask1&mask2, :].head()

Unnamed: 0,precision,recall,f_score,threshold
0,0.327526,0.681159,0.442353,0.145
3,0.328571,0.666667,0.440191,0.148
6,0.325175,0.673913,0.438679,0.146
7,0.323024,0.681159,0.438228,0.144
11,0.325088,0.666667,0.437055,0.147
