## Imports and Connections

In [90]:
from snowflake.snowpark.session import Session
from snowflake.snowpark.window import Window
from snowflake.ml.modeling.preprocessing import *
from snowflake.ml.modeling.impute import *
import snowflake.snowpark.types as T
import snowflake.snowpark.functions as F

import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import gc
import pickle

In [91]:
# Read the Snowflake connection settings from a local JSON file so that no
# credentials are written into the notebook itself.
with open('creds.json') as creds_file:
    connection_parameters = json.load(creds_file)

In [92]:
# Build the Snowpark session from the loaded connection settings, then echo the
# schema and warehouse it is attached to as a quick sanity check.
session_builder = Session.builder.configs(connection_parameters)
session = session_builder.create()

current_schema = session.get_fully_qualified_current_schema()
print(f"Current Database and schema: {current_schema}")
current_warehouse = session.get_current_warehouse()
print(f"Current Warehouse: {current_warehouse}")

Current Database and schema: "ADTRACKING"."ADTRACKING_SCHEMA"
Current Warehouse: "AMAZON_SAGEMAKE_W_SNOWFLAKE_AS_DATASOURCE"


In [93]:
# Snowpark DataFrame handle for the FULL_TRAINING_DATA table, resolved through the session created above.
train_sdf = session.table("FULL_TRAINING_DATA")

In [8]:
# train_sdf.columns

In [5]:
# One row count per click day (6 through 9), issued in day order, followed by
# the row count of the whole table.
count_day6, count_day7, count_day8, count_day9 = (
    train_sdf.filter(F.col("DAY") == day).count() for day in (6, 7, 8, 9)
)
count_total = train_sdf.count()

In [6]:
# Report the total row count and each day's share of it as a percentage
# rounded to two decimals.
rows_per_day = {6: count_day6, 7: count_day7, 8: count_day8, 9: count_day9}

day_summary = f"There are {count_total} rows in our training data. \n"
for day, day_rows in rows_per_day.items():
    day_summary += f"{np.round(day_rows/count_total*100, 2)}% In day {day} \n"

print(day_summary)

There are 913692 rows in our training data. 
4.34% In day 6 
32.93% In day 7 
33.75% In day 8 
28.98% In day 9 



In [7]:
target_col = "IS_ATTRIBUTED"

# Start from every column of the training table and strip the ones that must
# not be used as model inputs:
#   - IS_ATTRIBUTED / ATTRIBUTED_TIME: the label and its timestamp
#   - CLICK_TIME: already split into separate day/hour/min columns
# list.remove raises ValueError if one of these columns is missing.
feature_cols = train_sdf.columns
for excluded_col in ("IS_ATTRIBUTED", "ATTRIBUTED_TIME", "CLICK_TIME"):
    feature_cols.remove(excluded_col)

We want to try a few different things in this notebook.  For example:

- Snowpark vs. Python implementation
- XGBoost vs. LightGBM (LGB)
- Stratified vs. by-day (custom) cross-validation

We'll do our best to get the combinations of these done before our deadline, and leave the rest for future work.

## Python implementation

In [40]:
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from skopt import BayesSearchCV

### Train XGBoost Model in Python with stratified cv

In [20]:
# Pull the Snowpark table down into a local pandas DataFrame for the pure-Python (XGBoost / scikit-learn) workflow.
train_df = train_sdf.to_pandas()

In [21]:
# Preview the first rows to sanity-check the columns that were pulled down.
train_df.head()

Unnamed: 0,IP,APP,DEVICE,OS,CHANNEL,CLICK_TIME,ATTRIBUTED_TIME,IS_ATTRIBUTED,DAY,HOUR,...,IP_APP_DEVICE_OS_NEXTCLICK,IP_APP_DEVICE_CHANNEL_NEXTCLICK,IP_APP_OS_CHANNEL_NEXTCLICK,IP_DEVICE_OS_CHANNEL_NEXTCLICK,IP_APP_DEVICE_OS_CHANNEL_NEXTCLICK,IP_DAY_HOUR_COUNT_CHANNEL,IP_APP_COUNT_CHANNEL,IP_APP_OS_COUNT_CHANNEL,IP_APP_OS_VAR_HOUR,IP_NUNIQUE_APP
0,151462,26,1,19,266,2017-11-07 07:33:18,NaT,0,7,7,...,,,,,,1,1,1,,2
1,249327,14,1,9,349,2017-11-09 09:08:21,NaT,0,9,9,...,,,,,,5,12,1,,9
2,123945,8,1,8,145,2017-11-07 23:39:14,NaT,0,7,23,...,,13612.0,,,,1,3,1,,18
3,88552,12,1,19,265,2017-11-07 07:10:49,NaT,0,7,7,...,,,,,,1,2,1,,9
4,63340,15,1,19,386,2017-11-09 03:37:51,NaT,0,9,3,...,,,,,,1,1,1,,7


In [30]:
# Split the pandas frame for XGBoost: X is every column except the label and
# the two raw timestamp columns, y is the IS_ATTRIBUTED label.
X = train_df.drop(columns=["IS_ATTRIBUTED", "CLICK_TIME", "ATTRIBUTED_TIME"])
y = train_df["IS_ATTRIBUTED"]

NameError: name 'train_df' is not defined

In [26]:
# Drop the notebook's reference to the full pandas frame now that X and y have
# been derived from it, then run a garbage-collection pass.
# The value of gc.collect() is displayed as the cell output.
del train_df
gc.collect()

239

In [45]:
# Search space handed to BayesSearchCV: each entry is (low, high[, prior]).
# 'min_child_weight' used to be listed twice; in a dict literal the later entry
# silently wins, so only the effective range (0, 5) is kept here.
bayes_params = {
    'learning_rate': (0.01, 1.0, 'log-uniform'),
    'min_child_weight': (0, 5),
    'max_depth': (0, 50),
    'max_delta_step': (0, 20),
    'subsample': (0.01, 1.0, 'uniform'),
    'colsample_bytree': (0.01, 1.0, 'uniform'),
    'colsample_bylevel': (0.01, 1.0, 'uniform'),
    'reg_lambda': (1e-9, 1000, 'log-uniform'),
    'reg_alpha': (1e-9, 1.0, 'log-uniform'),
    'gamma': (1e-9, 0.5, 'log-uniform'),
    'n_estimators': (50, 100),
}

In [85]:
# Seed shared by the fold shuffling and the Bayesian optimizer so that a
# Restart & Run All reproduces the same folds and the same sampled candidates.
SEED = 42

# Bayesian hyper-parameter search over `bayes_params` for a binary XGBoost
# classifier, scored by ROC-AUC on 3 shuffled stratified folds.
bayes_cv = BayesSearchCV(
    estimator=xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='auc',
        tree_method='approx',
    ),
    search_spaces=bayes_params,
    n_jobs=-1,
    scoring='roc_auc',
    # shuffle=True without a random_state yields different folds on every run
    cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED),
    verbose=2,
    n_iter=5,  # number of parameter settings sampled by the search
    random_state=SEED,
)

In [86]:
def print_model_status(optim_result):
    """Print the search's current best result and dump all CV results to CSV.

    Everything is read from the module-level ``bayes_cv`` object: its
    ``cv_results_``, ``best_score_`` and ``best_params_`` attributes.

    Args:
        optim_result: Not used by the body.
            NOTE(review): the one-argument signature suggests this was meant
            to be used as a search callback — confirm.

    Side effects:
        Writes ``<EstimatorClassName>_cv_results.csv`` in the current working
        directory, where the class name is taken from ``bayes_cv.estimator``.
    """
    # Snapshot of every candidate evaluated so far, one row per candidate.
    cv_results = pd.DataFrame(bayes_cv.cv_results_)

    status_message = (
        f"Model # {len(cv_results)}\n"
        f"Best ROC-AUC score: {np.round(bayes_cv.best_score_, 4)}\n"
        f"Best params: {bayes_cv.best_params_}"
    )
    print(status_message)

    # Persist the results under a file name derived from the estimator class.
    results_path = bayes_cv.estimator.__class__.__name__ + "_cv_results.csv"
    cv_results.to_csv(results_path)

In [87]:
# Run the Bayesian search on the pandas features/labels. No callback is passed, so print_model_status is not invoked here.
result = bayes_cv.fit(X, y)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits


In [83]:
# Serialize the whole search object to disk so it can be reloaded later
# without re-running the search.
with open(r'data/model_results/xgb_bayes_strat.pkl', 'wb') as model_file:
    pickle.dump(bayes_cv, model_file)

### Train XGBoost Model in Python with static validation set for CV
We want to use day 9 as our validation set here. (Not implemented yet — left for future work.)

## Snowpark implementation

### Train XGBoost Model via Snowpark with stratified cv

In [None]:
from snowflake.ml.modeling.xgboost import XGBClassifier
from snowflake.ml.modeling.model_selection import GridSearchCV
from snowflake.ml.modeling.metrics import *

In [54]:
# Grid search over XGBoost hyper-parameters, run through Snowpark ML on the
# Snowflake table instead of on a local pandas frame.
# NOTE(review): `xgb_params` is not defined in the visible part of this
# notebook; it must be defined before this cell runs — confirm.
grid_search = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid=xgb_params,
    n_jobs=-1,
    input_cols=feature_cols,
    label_cols=target_col,
    output_cols="PREDICTION",
    # Use the named scorer, consistent with the Python BayesSearchCV setup.
    # A bare metric function such as roc_auc_score is not a scorer callable
    # (scorers are called as scorer(estimator, X, y)).
    scoring="roc_auc",
    # TODO: switch to an explicit stratified splitter.
    # NOTE(review): scikit-learn documents that an integer cv is already
    # stratified for classifiers — confirm this holds through the Snowpark wrapper.
    cv=3,
    verbose=2
)

In [56]:
# Fit the grid search directly on the Snowpark DataFrame (no local pandas copy is passed).
grid_search.fit(train_sdf)