In [8]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import scipy.stats as ss

from itertools import product
from functools import partial

import pickle

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn import svm

from ray import tune
from ray.tune.search.hyperopt import HyperOptSearch
from ray.tune.schedulers import ASHAScheduler
from ray.air.config import RunConfig

from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error, r2_score, make_scorer

from sklearn.model_selection import StratifiedKFold

from skopt import BayesSearchCV
from skopt.callbacks import DeadlineStopper, DeltaYStopper
from skopt.space import Real, Categorical, Integer

In [3]:
# import sys
# !{sys.executable} -m pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.9.0-py2.py3-none-any.whl (100 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.3/100.3 kB[0m [31m863.9 kB/s[0m eta [36m0:00:00[0m1m828.0 kB/s[0m eta [36m0:00:01[0m
[?25hCollecting pyaml>=16.9
  Downloading pyaml-21.10.1-py2.py3-none-any.whl (24 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-21.10.1 scikit-optimize-0.9.0


In [9]:
# Column name handed to the evaluation helpers as the prediction target.
TARGET = 'target_full_ltv_day30'
# Presumably the per-source LTV components (subscription / IAP / ad) of the
# full target — not referenced by the visible cells; TODO confirm usage.
subtargs = ['target_sub_ltv_day30', 'target_iap_ltv_day30', 'target_ad_ltv_day30']

In [4]:
# Load the pickled modelling table (features plus the TARGET column).
# NOTE: unpickling can execute arbitrary code — only load files you trust.
us = pd.read_pickle('./datasets/android_num_us.pkl')

In [5]:
# Load the pre-computed split definitions. Each 'sep_<i>' entry is indexed
# with 'train' / 'valid' by the helpers below and used positionally (iloc).
# NOTE: unpickling can execute arbitrary code — only load files you trust.
kft_dict = pd.read_pickle('./datasets/android_num_kft.pkl')

In [5]:
# Preview the last rows of the loaded table.
us.tail()

Unnamed: 0,total_sessions_day0,total_sessions_day1,total_sessions_day3,total_sessions_day7,chapters_finished_day0,chapters_finished_day1,chapters_finished_day3,chapters_finished_day7,chapters_opened_day0,chapters_opened_day1,...,app_sub_ltv_day0,app_sub_ltv_day1,app_sub_ltv_day3,app_iap_ltv_day0,app_iap_ltv_day1,app_iap_ltv_day3,ad_ltv_day0,ad_ltv_day1,ad_ltv_day3,target_full_ltv_day30
1467813,13.0,20.0,20.0,20.0,12,18,18,18,12,19,...,0.0,0.0,0.0,4.394029,4.394029,4.394029,0.42646,0.42646,0.42646,4.820489
1467814,13.0,20.0,20.0,20.0,12,18,18,18,12,19,...,0.0,0.0,0.0,3.335499,3.335499,3.335499,0.323725,0.323725,0.323725,3.659224
1467815,9.0,16.0,24.0,25.0,7,13,17,19,8,14,...,0.0,0.0,0.0,4.171699,4.171699,6.713089,0.013045,0.013045,0.013045,6.726134
1467826,9.0,9.0,18.0,42.0,9,9,17,34,11,11,...,0.0,0.0,0.0,3.567268,3.567268,7.898951,0.079232,0.079232,0.179316,78.376447
1467831,6.0,7.0,7.0,7.0,15,15,15,15,15,16,...,0.0,0.0,0.0,3.949385,3.949385,3.949385,0.114909,0.114909,0.114909,4.064295


In [7]:
# List the split names available in the fold dictionary.
kft_dict.keys()

dict_keys(['test', 'sep_0', 'sep_1', 'sep_2', 'sep_3', 'sep_4', 'sep_5', 'sep_6', 'sep_7', 'sep_8', 'sep_9'])

In [6]:
def calc_metrics(y_hat, y_true):
    """Compute regression quality metrics for a set of predictions.

    Parameters
    ----------
    y_hat : array-like
        Predicted values.
    y_true : array-like
        Ground-truth values, aligned with ``y_hat``.

    Returns
    -------
    tuple
        ``(mape, rmse, r2, mae)``, in that order.

    Notes
    -----
    * RMSE is taken as ``sqrt(MSE)`` instead of passing ``squared=False``:
      that keyword is deprecated in newer scikit-learn releases, whereas the
      explicit root works on every version. For 1-D targets (how this
      notebook calls the function) both forms give the same number.
    * scikit-learn's MAPE divides by ``max(|y_true|, eps)``, so targets equal
      to zero make the score explode; treat very large MAPE values with care.
    """
    mape = mean_absolute_percentage_error(y_true, y_hat)
    rmse = np.sqrt(mean_squared_error(y_true, y_hat))
    r2 = r2_score(y_true, y_hat)
    mae = mean_absolute_error(y_true, y_hat)
    return mape, rmse, r2, mae

def get_fold_results(df, target, kft_dict, fold_i, model):
    """Fit ``model`` on one pre-defined fold and score both partitions.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the feature columns and the ``target`` column.
    target : str
        Name of the column to predict; every other column is a feature.
    kft_dict : dict
        Fold definitions; ``kft_dict['sep_<fold_i>']`` must provide
        ``'train'`` and ``'valid'`` positional row indexers.
    fold_i : int
        Index of the fold to evaluate.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.

    Returns
    -------
    tuple
        ``(train_metrics, valid_metrics)``, each being the tuple produced by
        ``calc_metrics``.
    """
    split = kft_dict[f'sep_{fold_i}']

    def _features_and_target(positions):
        # Rows are selected by position, then the target column is separated.
        part = df.iloc[positions]
        return part.drop(target, axis=1), part[target]

    X_fit, y_fit = _features_and_target(split['train'])
    X_val, y_val = _features_and_target(split['valid'])

    model.fit(X_fit, y_fit)

    fit_scores = calc_metrics(model.predict(X_fit), y_fit)
    val_scores = calc_metrics(model.predict(X_val), y_val)
    return fit_scores, val_scores

def get_cv_results(df, target, kft_dict, model, n_folds=None):
    """Evaluate ``model`` on every pre-defined fold and tabulate the metrics.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the feature columns and the ``target`` column.
    target : str
        Name of the column to predict.
    kft_dict : dict
        Fold definitions keyed ``'sep_0'``, ``'sep_1'``, ...; other keys
        (e.g. ``'test'``) are ignored.
    model : estimator
        Object exposing ``fit`` / ``predict``; it is re-fitted on each fold.
    n_folds : int, optional
        Number of folds to run. Defaults to the number of ``'sep_*'`` keys in
        ``kft_dict``, assuming they are numbered contiguously from 0 (a gap
        in the numbering raises ``KeyError``).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_metrics, test_metrics)`` with one row per fold and the
        columns ``mape``, ``rmse``, ``r2``, ``mae``.
    """
    if n_folds is None:
        # Derive the fold count from the data instead of hard-coding it.
        n_folds = sum(1 for key in kft_dict if str(key).startswith('sep_'))

    metric_names = ['mape', 'rmse', 'r2', 'mae']
    train_rows = []
    test_rows = []
    for i in range(n_folds):
        # Single source of truth for the per-fold split/fit/score logic.
        fold_train, fold_test = get_fold_results(df, target, kft_dict, i, model)
        train_rows.append(fold_train)
        test_rows.append(fold_test)

        print(f'Fold finished: {i}')

    train_metrics = pd.DataFrame(train_rows, columns=metric_names)
    test_metrics = pd.DataFrame(test_rows, columns=metric_names)

    return train_metrics, test_metrics

def get_agg_metrics(cv_metrics):
    """Collapse per-fold metrics into a single value per metric.

    Parameters
    ----------
    cv_metrics : pandas.DataFrame
        One row per fold, one column per metric.

    Returns
    -------
    pandas.Series
        Column-wise median across folds, indexed by metric name.
    """
    per_metric_median = cv_metrics.median()
    return per_metric_median

In [18]:
# Smoke test: fit a default XGBRegressor on fold 0 only.
df = us  # plain alias — no copy of the table is made
model = XGBRegressor()

# Each result is the (mape, rmse, r2, mae) tuple returned by calc_metrics.
train_metrics, test_metrics = get_fold_results(df, TARGET, kft_dict, 0, model)
print(train_metrics, test_metrics)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


(3722092229890.318, 0.18212036683290658, 0.9868471410801164, 0.01830149341573325) (3979874911920.808, 0.6448009961458047, 0.8691759954089687, 0.03458070914624031)


In [20]:
# Sanity check: within each fold the train and validation row positions must
# not overlap, so every printed intersection is expected to be empty.
for i in range(10):
    sep_dict = kft_dict[f'sep_{i}']
    intersec = np.intersect1d(sep_dict['train'], sep_dict['valid'])
    print(intersec)

array([], dtype=int64)

In [23]:
# Baseline: default XGBRegressor evaluated on all pre-defined folds.
df = us  # plain alias — no copy of the table is made
model = XGBRegressor()

# Per-fold metric tables (one row per fold) for train and validation.
train_cv_metrics, test_cv_metrics = get_cv_results(df, TARGET, kft_dict, model)
display(train_cv_metrics)
display(test_cv_metrics)
# Median across folds for each metric.
train_agg_metrics = get_agg_metrics(train_cv_metrics)
test_agg_metrics = get_agg_metrics(test_cv_metrics)
display(train_agg_metrics)
display(test_agg_metrics)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Fold finished: 0


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Fold finished: 1


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Fold finished: 2


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Fold finished: 3


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Fold finished: 4


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Fold finished: 5


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Fold finished: 6


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Fold finished: 7


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Fold finished: 8


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Fold finished: 9


Unnamed: 0,mape,rmse,r2,mae
0,3722092000000.0,0.18212,0.986847,0.018301
1,3840284000000.0,0.182369,0.987434,0.018319
2,4053226000000.0,0.193278,0.985299,0.019164
3,3843345000000.0,0.186841,0.986825,0.018803
4,3777879000000.0,0.187102,0.98642,0.018609
5,3512640000000.0,0.182588,0.987235,0.018354
6,3651777000000.0,0.187346,0.986566,0.018369
7,3691065000000.0,0.178813,0.98754,0.017825
8,3726996000000.0,0.191824,0.985609,0.01884
9,3396788000000.0,0.190954,0.985917,0.019033


Unnamed: 0,mape,rmse,r2,mae
0,3979875000000.0,0.644801,0.869176,0.034581
1,3972973000000.0,0.711309,0.753497,0.038584
2,4566545000000.0,0.683963,0.844234,0.037009
3,4314064000000.0,0.496247,0.878517,0.03363
4,4648066000000.0,0.869964,0.716915,0.037036
5,4461650000000.0,0.627683,0.833607,0.035901
6,3356950000000.0,0.625524,0.834214,0.035056
7,6641512000000.0,0.585219,0.876706,0.036155
8,4339393000000.0,0.580003,0.882457,0.036787
9,3465544000000.0,0.877536,0.7005,0.036365


mape    3.724544e+12
rmse    1.869711e-01
r2      9.866952e-01
mae     1.848878e-02
dtype: float64

mape    4.326729e+12
rmse    6.362419e-01
r2      8.392238e-01
mae     3.625961e-02
dtype: float64

In [10]:
# Search configuration.
RANDOMSTATE = 10            # seed for the model and the search algorithm
NUM_SAMPLES = 2             # number of hyperparameter configurations to try
MAX_CONCURRENT_TRIALS = 1   # run trials one at a time
VERBOSE = 1
df = us  # plain alias — no copy of the table is made

# Raw search space. Several entries are sampled on a transformed scale and
# converted to real XGBoost parameters inside `objective`:
#   n_estimators  -> truncated to int
#   max_depth     -> sampled in [0, 5), shifted by +2 (i.e. 2..6)
#   learning_rate -> sampled exponent, used as 10 ** value
search_space = {
    "n_estimators": tune.loguniform(100, 10000),
    "max_depth": tune.randint(0, 5),
    "subsample": tune.quniform(0.25, 0.75, 0.01),
    "colsample_bytree": tune.quniform(0.05, 0.5, 0.01),
    "colsample_bylevel": tune.quniform(0.05, 0.5, 0.01),
    "learning_rate": tune.quniform(-3.0, -1.0, 0.5)  # powers of 10
}


def objective(config):
    """Ray Tune trainable: cross-validate one XGBoost configuration.

    Parameters
    ----------
    config : dict
        One sample drawn from ``search_space`` (raw, untransformed values).

    Returns
    -------
    dict
        ``{"rmse": <median validation RMSE across folds>}``. The returned
        dict is the trial's reported result, so its key must match the
        ``metric`` given to ``TuneConfig``.
    """
    # Work on a copy so the sampled trial config is not mutated in place.
    params = dict(config)
    params['n_estimators'] = int(params['n_estimators'])
    params['max_depth'] = int(params['max_depth']) + 2
    params['learning_rate'] = 10 ** params['learning_rate']

    model = XGBRegressor(
        random_state=RANDOMSTATE,
        booster='gbtree',
        scale_pos_weight=1,
        **params
    )
    # NOTE(review): `df` and `kft_dict` are captured from notebook globals;
    # Ray documents `tune.with_parameters` for handing large objects to a
    # trainable — consider it if the trainable turns out too large to ship.
    _, test_cv_metrics = get_cv_results(df, TARGET, kft_dict, model)
    test_agg_metrics = get_agg_metrics(test_cv_metrics)
    rmse = test_agg_metrics['rmse']

    # Report via the return value only. The previous positional
    # `tune.report(rmse)` logged the value under the anonymous `_metric` key
    # (not "rmse") and duplicated the result returned here.
    return {"rmse": rmse}


algo = HyperOptSearch(random_state_seed=RANDOMSTATE)

# NOTE(review): `objective` reports a single result per trial, so ASHA has no
# intermediate results to stop early on — confirm the scheduler is wanted.
scheduler = ASHAScheduler()

tuner = tune.Tuner(
    objective,
    param_space=search_space,
    tune_config=tune.TuneConfig(
        num_samples=NUM_SAMPLES,
        search_alg=algo,
        scheduler=scheduler,
        metric="rmse",
        mode="min",
        max_concurrent_trials=MAX_CONCURRENT_TRIALS
    ),
    run_config=RunConfig(
        verbose=VERBOSE,
        name="hyperopt_xgb",
        local_dir="~/tune_results"
    )
)

In [11]:
# Launch the hyperparameter search defined by `tuner` and keep its results.
results = tuner.fit()

TuneError: Tune run failed. Please use tuner = Tuner.restore("/home/vivi/tune_results/hyperopt_xgb") to resume.

*** SIGTERM received at time=1662812555 on cpu 3 ***
PC: @     0x7f82fd420fde  (unknown)  epoll_wait
    @     0x7f82fd33d520  (unknown)  (unknown)
[2022-09-10 15:22:35,351 E 16637 16637] logging.cc:361: *** SIGTERM received at time=1662812555 on cpu 3 ***
[2022-09-10 15:22:35,351 E 16637 16637] logging.cc:361: PC: @     0x7f82fd420fde  (unknown)  epoll_wait
[2022-09-10 15:22:35,351 E 16637 16637] logging.cc:361:     @     0x7f82fd33d520  (unknown)  (unknown)


In [2]:
# Load the pickled train/test bundle used by the BayesSearchCV experiment.
# NOTE: unpickling can execute arbitrary code — only load files you trust.
android_dict = pd.read_pickle('./datasets/android_dict.pkl')

In [4]:
# Each entry unpacks into a (features, target) pair.
X_train, y_train = android_dict['train']
X_test, y_test = android_dict['test']

In [9]:
# Hyperparameter ranges explored by the Bayesian search.
search_spaces = {
    'learning_rate': Real(0.01, 1.0, 'uniform'),
    'max_depth': Integer(2, 12),
    'subsample': Real(0.1, 1.0, 'uniform'),
    'colsample_bytree': Real(0.1, 1.0, 'uniform'),
    'reg_lambda': Real(1e-9, 100., 'uniform'),
    'reg_alpha': Real(1e-9, 100., 'uniform'),
    'n_estimators': Integer(100, 2000)
}

skf = StratifiedKFold(n_splits=7,
                      shuffle=True,
                      random_state=42)

# A continuous target cannot be stratified directly: rank it (ties broken by
# order of appearance) and cut the ranks into 10 equal-width bins, then
# stratify the folds on the bin label.
y_stratified = pd.cut(y_train.rank(method='first'), bins=10, labels=False)
# Materialise the splits so every search iteration uses the same folds.
cv = list(skf.split(X_train, y_stratified))

# Mean squared error, negated by make_scorer so that "greater is better"
# holds for the optimiser. `squared=True` was the default, so the explicit
# partial (and its deprecated keyword) is unnecessary.
scoring = make_scorer(mean_squared_error,
                      greater_is_better=False)

xgb = XGBRegressor(random_state=42)

# `iid` is intentionally not passed: scikit-optimize treats it as deprecated
# and ignores it (emitting a warning when a value is supplied).
opt = BayesSearchCV(estimator=xgb,
                    search_spaces=search_spaces,
                    scoring=scoring,
                    cv=cv,
                    n_iter=3,
                    n_points=1,
                    n_jobs=1,
                    return_train_score=False,
                    refit=False,  # no final refit on the full training set
                    optimizer_kwargs={'base_estimator': 'GP'},
                    random_state=0)



In [20]:
# Run the Bayesian hyperparameter search on the training data.
opt.fit(X_train, y_train)