In [1]:
#Basic
import pandas as pd
import numpy as np
from google.cloud import bigquery, bigquery_storage
from datetime import date

import random 
RANDOM_STATE = 4222
random.seed(RANDOM_STATE)

#Sklearn
pd.set_option('display.max_columns', None)
from sklearn import model_selection, preprocessing, pipeline, ensemble, calibration, metrics
from sklearn.preprocessing import *
from sklearn.metrics import *
from sklearn.model_selection import *

#plotting
import matplotlib.pyplot as plt

#Custom loss functions
import keras.backend as K
from loss_function import *

#ALgorithms
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.under_sampling import InstanceHardnessThreshold

from xgboost import XGBClassifier
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)
from BorutaShap import BorutaShap

import warnings
warnings.filterwarnings("ignore") 

In [2]:
#fetching data
bqclient = bigquery.Client()
bqstorageclient = bigquery_storage.BigQueryReadClient()

query = """SELECT * FROM `api-project-901373404215.lookalike_trail_data.train_data_pre_fin_1`"""
df_train = bqclient.query(query).result().to_dataframe(bqstorage_client=bqstorageclient)
df_train = df_train.fillna(0)

print(df_train.shape)
df_train.head()

(128394, 193)


Unnamed: 0,ga_fullvisitorid,session_pvs_mean,session_pvs_median,session_top_mean,session_top_median,sentiment_score_mean,sentiment_score_median,rf_content_aggregators,rf_direct,rf_fbia,rf_newsletter,rf_organic_search,rf_organic_social__dark,rf_organic_social__forbes,rf_paid_search,rf_paid_social__dark,rf_paid_social__forbes,rf_paid_web,rf_push_notification,rf_referral,ct_australia,ct_canada,ct_france,ct_hong_kong,ct_india,ct_ireland,ct_israel,ct_italy,ct_mexico,ct_new_zealand,ct_other,ct_singapore,ct_south_africa,ct_united_arab_emirates,ct_united_kingdom,ct_united_states,dos__not_set,dos_android,dos_blackberry,dos_chrome_os,dos_ios,dos_linux,dos_macintosh,dos_windows,t1_automotive,t1_books_and_literature,t1_business_and_finance,t1_careers,t1_education,t1_fine_art,t1_food_and_drink,t1_healthy_living,t1_medical_health,t1_movies,t1_music_and_audio,t1_news_and_politics,t1_personal_finance,t1_real_estate,t1_science,t1_shopping,t1_sports,t1_style_and_fashion,t1_technology_and_computing,t1_television,t1_travel,t1_video_gaming,t1_other,t1_automotive_perc_pvs,t1_books_and_literature_perc_pvs,t1_business_and_finance_perc_pvs,t1_careers_perc_pvs,t1_education_perc_pvs,t1_fine_art_perc_pvs,t1_food_and_drink_perc_pvs,t1_healthy_living_perc_pvs,t1_medical_health_perc_pvs,t1_movies_perc_pvs,t1_music_and_audio_perc_pvs,t1_news_and_politics_perc_pvs,t1_personal_finance_perc_pvs,t1_real_estate_perc_pvs,t1_science_perc_pvs,t1_shopping_perc_pvs,t1_sports_perc_pvs,t1_style_and_fashion_perc_pvs,t1_technology_and_computing_perc_pvs,t1_television_perc_pvs,t1_travel_perc_pvs,t1_video_gaming_perc_pvs,t1_other_perc_pvs,pc_404,pc_advisor,pc_asia,pc_billionaires,pc_business,pc_forbes_vetted,pc_home,pc_innovation,pc_leadership,pc_lifestyle,pc_money,pc_newsletters,pc_none,pc_other,pc_real_estate,pc_small_business,ps_careers_pvs,ps_cmo_network_pvs,ps_crypto_and_blockchain_pvs,ps_entrepreneurs_pvs,ps_food_and_drink_pvs,ps_forbeswomen_pvs,ps_healthcare_pvs,ps_investing_pvs,ps_leadership_strategy_pvs,ps_markets_pvs,ps_none_pvs,ps_other_pvs,ps_personal_finance_pvs,ps_retail_pvs,ps_sportsmoney_pvs,ps_travel_pvs,t1_automotive_top,t1_books_and_literature_top,t1_business_and_finance_top,t1_careers_top,t1_education_top,t1_fine_art_top,t1_food_and_drink_top,t1_healthy_living_top,t1_medical_health_top,t1_movies_top,t1_music_and_audio_top,t1_news_and_politics_top,t1_personal_finance_top,t1_real_estate_top,t1_science_top,t1_shopping_top,t1_sports_top,t1_style_and_fashion_top,t1_technology_and_computing_top,t1_television_top,t1_travel_top,t1_video_gaming_top,t1_other_top,bounce_rate,content_views_rate,weekend_top,weekday_top,business_hours_top,non_business_hours_top,friday_top,monday_top,saturday_top,sunday_top,thursday_top,tuesday_top,wednesday_top,day_of_mon_1_top,day_of_mon_2_top,day_of_mon_3_top,day_of_mon_4_top,day_of_mon_5_top,day_of_mon_6_top,day_of_mon_7_top,day_of_mon_8_top,day_of_mon_9_top,day_of_mon_10_top,day_of_mon_11_top,day_of_mon_12_top,day_of_mon_13_top,day_of_mon_14_top,day_of_mon_15_top,day_of_mon_16_top,day_of_mon_17_top,day_of_mon_18_top,day_of_mon_19_top,day_of_mon_20_top,day_of_mon_21_top,day_of_mon_22_top,day_of_mon_23_top,day_of_mon_24_top,day_of_mon_25_top,day_of_mon_26_top,day_of_mon_27_top,day_of_mon_28_top,day_of_mon_29_top,day_of_mon_30_top,day_of_mon_31_top,nl_subscription,managementlevel,c_level,subscriber
0,1329972406199074142,1.0,1.0,3.5,3.5,-0.108918,-0.108918,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,3.5,5.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,nl_subscriber,Director,0,non_subscriber
1,5168615631927784737,1.0,1.0,4.0,4.0,0.385637,0.385637,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,4.0,5.0,3.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,nl_subscriber,C-Level,1,non_subscriber
2,3732910860632127814,1.0,1.0,5.0,5.0,0.186914,0.186914,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,5.0,0.0,0.0,5.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,nl_subscriber,Non-Manager,0,non_subscriber
3,9058852749292128633,1.0,1.0,5.0,5.0,0.169675,0.169675,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,nl_subscriber,Non-Manager,0,non_subscriber
4,4800326657361512519,2.0,2.0,6.0,6.0,-0.277186,-0.277186,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,6.0,6.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,nl_subscriber,Non-Manager,0,non_subscriber


In [3]:
#dropping features
df_train=df_train.drop(columns=['ga_fullvisitorid','managementlevel'],axis=1)
df_train.shape

(128394, 191)

In [4]:
#Fetching Categorical Features
cat=[]
for column in df_train.columns:
    if df_train[column].dtypes == 'object':
        cat.append(column)
cat

['nl_subscription', 'subscriber']

In [5]:
df_train.c_level.value_counts()

0    96476
1    31918
Name: c_level, dtype: int64

In [6]:
#Encoding Categorical Features 
label_encoder = preprocessing.LabelEncoder()

for column in cat:
    df_train[column]=label_encoder.fit_transform(df_train[column])
df_train.shape

(128394, 191)

In [7]:
#Independent and dependent features

y_col=['c_level']
x_cols = [i for i in df_train.columns if i not in y_col]

# X and Y 
X=df_train[x_cols]

y=df_train[y_col]

In [8]:
#Train and Hold out data (10%) split (stratify C-level)
X_train, X_val, y_train, y_val = train_test_split(X, y,
                                                    stratify=y, 
                                                    test_size=0.10,random_state=RANDOM_STATE)


In [9]:
#creating DF from nd arrray
X_train=pd.DataFrame(X_train,columns=x_cols)
y_train = y_train['c_level'].astype(int)

In [10]:
# Model tunnig to get parameters for feature selection
N_TRIALS_feature_selection = 10

def objective(trial):


    param = {"verbosity":0,
             "scale_pos_weight": trial.suggest_int("scale_pos_weight",1,5),
            "eval_metric":trial.suggest_categorical("eval_metric", [dice_coef_loss,focal_tversky_loss,'logloss']),   
            "lambda": trial.suggest_loguniform("lambda", 1e-8, 1.0),
            "alpha": trial.suggest_loguniform("alpha", 1e-8, 1.0),
            "subsample":trial.suggest_discrete_uniform('subsample',0.1,1,0.1),
            "learning_rate":trial.suggest_loguniform("learning_rate", 1e-9, 1.0),
            "n_estimators" : trial.suggest_int('n_estimators', 0, 1000),
            'max_depth':trial.suggest_int('max_depth', 2, 13),
            'gamma':trial.suggest_int('gamma', 0, 5),
            'colsample_bytree':trial.suggest_discrete_uniform('colsample_bytree',0.1,1,0.1)
    }

    model=XGBClassifier(**param)
    cv = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    
    cv_results = model_selection.cross_validate(
        model,
        X_train,
        y_train,
        cv=cv,
        scoring='f1_weighted',
        n_jobs=-1,
        error_score='raise'
    )
    
    print(cv_results['test_score'], np.mean(cv_results['test_score']))
    return np.mean(cv_results['test_score'])

In [12]:
#Feature Selection Tuned Parameters 
import time
start = time.time()

sampler_feature_selection = optuna.samplers.TPESampler(multivariate=True, warn_independent_sampling=False, group=True, seed=RANDOM_STATE)
study_feature_selection = optuna.create_study(direction='maximize')
study_feature_selection.optimize(objective, n_trials=N_TRIALS_feature_selection)

elapsed = round((time.time() - start) / 60, 2)
print(f"Took {elapsed} minutes to run")

print(study_feature_selection.best_value)
tuned_params_feature_selection = study_feature_selection.best_params
tuned_params_feature_selection

[0.65811454 0.6578217  0.65977814 0.66081028 0.65882306] 0.6590695447992655
[0.68557871 0.68542079 0.68789549 0.68443397 0.68467049] 0.6855998905886158
[0.64983015 0.65438717 0.65583345 0.64920551 0.64610401] 0.6510720577485503
[0.62630072 0.62638688 0.63836236 0.62558475 0.62763081] 0.6288531046732906
[0.33854476 0.33754501 0.33852292 0.33702087 0.34204604] 0.3387359220470807
[0.64813566 0.64711932 0.64877047 0.64735765 0.64712457] 0.6477015317553996
[0.64961265 0.64943293 0.65031205 0.64971943 0.65016464] 0.6498483428050542
[0.65229298 0.65556338 0.66040801 0.65527151 0.6512763 ] 0.6549624375652847
[0.57187417 0.56826887 0.56838566 0.56712088 0.55542677] 0.566215270975293
[0.48807688 0.47822618 0.49135034 0.48106153 0.47921262] 0.4835855095780112
Took 32.36 minutes to run
0.6855998905886158


{'scale_pos_weight': 2,
 'eval_metric': <function loss_function.dice_coef_loss(y_true, y_pred)>,
 'lambda': 1.3491728122372134e-05,
 'alpha': 0.0008493614612816257,
 'subsample': 0.4,
 'learning_rate': 1.580887432273951e-07,
 'n_estimators': 480,
 'max_depth': 8,
 'gamma': 4,
 'colsample_bytree': 0.9}

In [13]:
# Feature Selection

model = XGBClassifier(**tuned_params_feature_selection)


Feature_Selector = BorutaShap(
    model=model,
    importance_measure='shap',
    classification=True
)


Feature_Selector.fit(
    X=X_train,
    y=y_train,
    n_trials=100,
    train_or_test='test',
    normalize=True,
    verbose=True
)

# Return a list with the selected features 
# We take both important and tentative features

FEATURES=Feature_Selector.accepted + Feature_Selector.tentative
print(FEATURES)
print(len(FEATURES))

  0%|          | 0/100 [00:00<?, ?it/s]

29 attributes confirmed important: ['dos_android', 'pc_billionaires', 'dos_windows', 'weekday_top', 'sentiment_score_median', 'pc_business', 'session_top_median', 'weekend_top', 'rf_fbia', 't1_careers_perc_pvs', 'rf_organic_social__dark', 'tuesday_top', 'dos_ios', 'subscriber', 'friday_top', 'business_hours_top', 'session_top_mean', 'sentiment_score_mean', 't1_other_top', 't1_careers_top', 'rf_organic_search', 't1_business_and_finance_top', 'content_views_rate', 'saturday_top', 'rf_newsletter', 't1_other_perc_pvs', 'ps_other_pvs', 'ps_careers_pvs', 'dos_macintosh']
161 attributes confirmed unimportant: ['t1_technology_and_computing_top', 'ct_mexico', 't1_real_estate', 'day_of_mon_26_top', 'ct_australia', 'day_of_mon_5_top', 't1_medical_health_perc_pvs', 'ps_entrepreneurs_pvs', 'dos_linux', 'ct_hong_kong', 't1_fine_art', 'day_of_mon_27_top', 'rf_organic_social__forbes', 'ct_united_states', 'ps_retail_pvs', 'ps_sportsmoney_pvs', 'ct_italy', 'pc_advisor', 't1_personal_finance_top', 't1_sh

In [14]:
# Trainig data after feature selection
X_train= X_train[FEATURES]
print(X_train.shape)

(115554, 29)


In [15]:
# Model tunnig
N_TRIALS = 60

def objective(trial):


    param = {"verbosity":0,
             "scale_pos_weight": trial.suggest_int("scale_pos_weight",1,5),
            "eval_metric":trial.suggest_categorical("eval_metric", [dice_coef_loss,focal_tversky_loss,'logloss']),   
            "lambda": trial.suggest_loguniform("lambda", 1e-8, 1.0),
            "alpha": trial.suggest_loguniform("alpha", 1e-8, 1.0),
            "subsample":trial.suggest_discrete_uniform('subsample',0.1,1,0.1),
            "learning_rate":trial.suggest_loguniform("learning_rate", 1e-9, 1.0),
            "n_estimators" : trial.suggest_int('n_estimators', 0, 1000),
            'max_depth':trial.suggest_int('max_depth', 2, 13),
            'gamma':trial.suggest_int('gamma', 0, 5),
            'colsample_bytree':trial.suggest_discrete_uniform('colsample_bytree',0.1,1,0.1)
    }

    model=XGBClassifier(**param)
    cv = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    
    cv_results = model_selection.cross_validate(
        model,
        X_train,
        y_train,
        cv=cv,
        scoring='f1_weighted',
        n_jobs=-1,
        error_score='raise'
    )
    
    print(cv_results['test_score'], np.mean(cv_results['test_score']))
    return np.mean(cv_results['test_score'])

In [16]:
start = time.time()

sampler = optuna.samplers.TPESampler(multivariate=True, warn_independent_sampling=False, group=True, seed=RANDOM_STATE)
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=N_TRIALS)

elapsed = round((time.time() - start) / 60, 2)
print(f"Took {elapsed} minutes to run")

print(study.best_value)
tuned_params = study.best_params
tuned_params

[0.6447666  0.6447666  0.6447666  0.64470828 0.6447521 ] 0.6447520385146734
[0.66172704 0.66270286 0.6698522  0.66374224 0.66255554] 0.6641159756361052
[0.64587062 0.64599411 0.64619829 0.64523739 0.64540477] 0.6457410344067108
[0.639492   0.64255875 0.63396933 0.64427031 0.63835096] 0.6397282709486609
[0.68127115 0.68204789 0.68028271 0.68028648 0.68220014] 0.6812176742086253
[0.65076457 0.64809641 0.65127124 0.65023147 0.64957623] 0.6499879838572451
[0.60833565 0.60797135 0.61592612 0.61022348 0.60940887] 0.610373092816953
[0.64495025 0.64484784 0.64505263 0.64509665 0.64511924] 0.6450133223487783
[0.65007695 0.6491462  0.65150148 0.6493001  0.64925556] 0.6498560583883061
[0.55360188 0.55487914 0.55694143 0.55615909 0.54639723] 0.5535957532439457
[0.30270403 0.29684331 0.29745575 0.31459314 0.31639898] 0.305599043716207
[0.52235661 0.51324199 0.52860737 0.51997994 0.51891807] 0.5206207951374691
[0.67915576 0.67979658 0.6808851  0.68221585 0.67865154] 0.680140967885901
[0.16498752 0.1

{'scale_pos_weight': 2,
 'eval_metric': <function loss_function.dice_coef_loss(y_true, y_pred)>,
 'lambda': 0.0006441923338469773,
 'alpha': 3.077124520503162e-05,
 'subsample': 0.4,
 'learning_rate': 0.017404265822335053,
 'n_estimators': 703,
 'max_depth': 7,
 'gamma': 1,
 'colsample_bytree': 0.9}

In [17]:
# Final model after feature selection and hyperparameter tuning
model = XGBClassifier(**tuned_params).fit(X_train,y_train)

# Calibrated classifier
calibrated_model = calibration.CalibratedClassifierCV(
    base_estimator=model,
    method='isotonic',
    cv='prefit',
    n_jobs=-1,
    ensemble=True
).fit(X_train,y_train)

calibrated_model

CalibratedClassifierCV(base_estimator=XGBClassifier(alpha=3.077124520503162e-05,
                                                    base_score=0.5,
                                                    booster='gbtree',
                                                    callbacks=None,
                                                    colsample_bylevel=1,
                                                    colsample_bynode=1,
                                                    colsample_bytree=0.9,
                                                    early_stopping_rounds=None,
                                                    enable_categorical=False,
                                                    eval_metric=<function dice_coef_loss at 0x7f9a0cd0c5f0>,
                                                    gamma=1, gpu_id=-1,
                                                    grow_policy='depthwise',
                                                    importance_type=None,
    

In [18]:
# For demonstration purposes I'm manually setting the training data to the first of the month which is when we will re-train
today = date.today()
training_date = today.strftime('%Y-%m-%d')
cross_val_score = study.best_value
model_ver = 'c-level-v2'

# Check on the val hold-out set
X_hold=pd.DataFrame(X_val,columns=x_cols)

X_hold = X_hold[FEATURES]


y_pred = calibrated_model.predict(X_hold)
y_true = y_val['c_level'].astype(int)
val_score = metrics.f1_score(y_true, y_pred, average='weighted')

# Save the results to a df
df_results = pd.DataFrame([training_date, cross_val_score, val_score, model_ver]).T
df_results.columns=['training_date', 'cross_val_score', 'val_score', 'model_ver']
df_results

Unnamed: 0,training_date,cross_val_score,val_score,model_ver
0,2022-07-15,0.689637,0.681401,c-level-v2


In [19]:
# Save features and model
import pickle
with open('features_new.pkl', 'wb') as f:
    pickle.dump(FEATURES, f)
    
with open('calibrated_model_new.pkl','wb') as f:
    pickle.dump(calibrated_model,f)
    
with open('model_new.pkl','wb') as f:
    pickle.dump(model,f)

In [None]:
# # delete the trian table
# bqclient.delete_dataset(
#     lookalike_trail_data, delete_contents=True, not_found_ok=True
# )  # Make an API request.

# print("Deleted dataset '{}'.".format('lookalike_trail_data'))