In [1]:
# Display the `spark` session object. It is not created in any visible cell,
# so it is presumably injected by the kernel/environment — TODO confirm.
spark

In [2]:
from collections import namedtuple
import datetime
import json
from typing import Dict, Union, Sequence
import textwrap
import itertools


import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

from sklearn.metrics import average_precision_score

from pyspark.sql import Row, Window, DataFrame, Column
from pyspark.sql import types
import pyspark.sql.functions as sf

%load_ext autoreload
%autoreload 2

In [3]:
import project_utils
import bks_utils
import propensity_utils

In [4]:
from kdevries.analysis_utils import show_sorted_frequencies as ssf
from kdevries.analysis_utils import count_condition
import kdevries

In [5]:
import bkng.data

In [6]:
# Set up the H2O context through the project helper. Per the output below this
# waits for pods, connects to the H2O server and reports a Sparkling Water context.
bks_utils.create_h2o_context_on_bks()

Waiting for pods
Waiting for pods
Waiting for pods
Waiting for pods
Connecting to H2O server at http://bigdata-g99lee0jue:54321/gateway/eu-nl-c/g99lee0jue/h2o ... successful.


0,1
H2O_cluster_uptime:,18 secs
H2O_cluster_timezone:,+01:00
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.0.2
H2O_cluster_version_age:,1 month and 28 days
H2O_cluster_name:,user33922
H2O_cluster_total_nodes:,2
H2O_cluster_free_memory:,30 Gb
H2O_cluster_total_cores:,112
H2O_cluster_allowed_cores:,112



Sparkling Water Context:
 * Sparkling Water Version: 3.32.0.2-1-3.0
 * H2O name: spark
 * cluster size: 2
 * list of used nodes:
  (executorId, host, port)
  ------------------------
  (0,10.239.198.15,54321)
  (1,10.239.192.212,54321)
  ------------------------

  Open H2O Flow in browser: http://bigdata-g99lee0jue:54321/gateway/eu-nl-c/g99lee0jue/h2o (CMD + click in Mac OSX)

    


# Get data

In [7]:
from project_utils import DATA_START_DATE, DATA_END_DATE
# Fetch the project data for the configured date window and add the week start
# date column (computed by the helper from the "timestamp" column) in one step.
data_df = project_utils.add_week_start_date_column(
    project_utils.get_data_df(DATA_START_DATE, DATA_END_DATE),
    time_col="timestamp",
)

Info: Forcing materialisation by calculating distinct count: 1696926
Info: Verifying the number of original rows is preserved


In [8]:
# Row counts per calendar month of `yyyy_mm_dd` (absolute and relative
# frequency, per the output below), sorted by month number.
ssf(data_df, sf.month("yyyy_mm_dd"), sort_cols="month(yyyy_mm_dd)")

+-----------------+--------+--------+
|month(yyyy_mm_dd)|abs_freq|rel_freq|
+-----------------+--------+--------+
|1                |143895  |0.0848  |
|2                |137964  |0.0813  |
|3                |98408   |0.058   |
|4                |72704   |0.0428  |
|5                |90586   |0.0534  |
|6                |125962  |0.0742  |
|7                |234660  |0.1383  |
|8                |244362  |0.144   |
|9                |216325  |0.1275  |
|10               |164289  |0.0968  |
|11               |88227   |0.052   |
|12               |79544   |0.0469  |
+-----------------+--------+--------+



# Train and test dates

In [9]:
from project_utils import ANALYSIS_START_DATE
# Week start dates between ANALYSIS_START_DATE and DATA_END_DATE; further down
# they drive the label/selector columns and the choice of test week.
test_week_start_dates = project_utils.get_week_start_date_range(start=ANALYSIS_START_DATE, end=DATA_END_DATE)

In [10]:
# Inspect the generated week start dates.
test_week_start_dates

[datetime.date(2019, 12, 2),
 datetime.date(2019, 12, 9),
 datetime.date(2019, 12, 16),
 datetime.date(2019, 12, 23),
 datetime.date(2019, 12, 30),
 datetime.date(2020, 1, 6),
 datetime.date(2020, 1, 13),
 datetime.date(2020, 1, 20),
 datetime.date(2020, 1, 27),
 datetime.date(2020, 2, 3),
 datetime.date(2020, 2, 10),
 datetime.date(2020, 2, 17),
 datetime.date(2020, 2, 24),
 datetime.date(2020, 3, 2),
 datetime.date(2020, 3, 9),
 datetime.date(2020, 3, 16),
 datetime.date(2020, 3, 23),
 datetime.date(2020, 3, 30),
 datetime.date(2020, 4, 6),
 datetime.date(2020, 4, 13),
 datetime.date(2020, 4, 20),
 datetime.date(2020, 4, 27),
 datetime.date(2020, 5, 4),
 datetime.date(2020, 5, 11),
 datetime.date(2020, 5, 18),
 datetime.date(2020, 5, 25),
 datetime.date(2020, 6, 1),
 datetime.date(2020, 6, 8),
 datetime.date(2020, 6, 15),
 datetime.date(2020, 6, 22),
 datetime.date(2020, 6, 29),
 datetime.date(2020, 7, 6),
 datetime.date(2020, 7, 13),
 datetime.date(2020, 7, 20),
 datetime.date(2020,

# Prepare H2O Frame

In [11]:
# Load the model configuration from JSON. The context manager closes the file
# handle as soon as parsing is done (a bare `open(...)` would leave it open
# until garbage collection).
with open("incremental_learning_boreas_config.json") as config_file:
    metadata = json.load(config_file)

# Post-process the config with the project helpers; what exactly they change is
# defined in project_utils (presumably: fill in default model parameters, then
# treat the categorical features as numerical — confirm there).
metadata = project_utils.set_default_model_params(metadata)
metadata = project_utils.move_categorical_features_to_numerical(metadata)

# Explicitly blank the "dates" entry of the config.
metadata["dates"] = None

In [12]:
import propensity_utils

In [13]:
# Add the label and train/test selector columns in Spark, convert the result to
# an H2O frame, then cast those columns to factors on the H2O side.
# NOTE(review): `data_df` is rebound to its own transformed version, so running
# this cell twice feeds the already-augmented frame back into
# add_label_and_selector_cols — confirm that helper tolerates a re-run.
data_df = propensity_utils.add_label_and_selector_cols(data_df, test_week_start_dates)
data_h2o_df = project_utils.spark_to_h2o_frame(data_df, metadata)
data_h2o_df = propensity_utils.cast_label_and_selector_cols_as_factor(data_h2o_df, test_week_start_dates)

# Train XGBoost model: fraud

In [14]:
# Use the first test week for the experiment below.
test_week_start_date = test_week_start_dates[0]

In [15]:
def get_fraud_train_test_h2o_df(data_h2o_df, test_week_start_date):
    """Split ``data_h2o_df`` into the train and test rows for one test week.

    The selector column for ``test_week_start_date`` is looked up through
    ``propensity_utils.get_week_train_test_col_name``. That column holds "0"
    for train rows and "1" for test rows; rows with any other value end up in
    neither part.

    Args:
        data_h2o_df: Frame containing the selector column for the given week.
        test_week_start_date: Week start date identifying the selector column.

    Returns:
        A ``(train, test)`` tuple of row-filtered frames.
    """
    selector_col = propensity_utils.get_week_train_test_col_name(test_week_start_date)
    train_part = data_h2o_df[data_h2o_df[selector_col] == '0']
    test_part = data_h2o_df[data_h2o_df[selector_col] == '1']
    return train_part, test_part

In [16]:
# Split the full H2O frame into train / test rows for the chosen test week.
fraud_train_h2o_df, fraud_test_h2o_df = get_fraud_train_test_h2o_df(data_h2o_df, test_week_start_date)

In [17]:
# Train the fraud model on the train split, using the config in `metadata`.
# NOTE(review): the test-week split is passed as the validation frame. If
# train_h2o_model uses it for early stopping or tuning, the validation AUCPR
# reported below is optimistic — confirm against project_utils.
fraud_xgb_model = project_utils.train_h2o_model(fraud_train_h2o_df, metadata, val_h2o_df=fraud_test_h2o_df)

Dropping bad and constant columns: [email_address_hash_distinct_count_detected_country_1dly_30d2h_2h]


xgboost Model Build progress: |███████████████████████████████████████████| 100%


In [18]:
# AUCPR with no split flag — presumably the training-data metric (H2O's
# default; confirm). Compare with the valid=True call in the next cell.
fraud_xgb_model.aucpr()

0.7403940950373811

In [19]:
# AUCPR on the validation frame, i.e. the test-week split that was passed as
# val_h2o_df when the model was trained.
fraud_xgb_model.aucpr(valid=True)

0.523070085524198

# Leaf nodes for Train / Test set

In [20]:
# Leaf node assignments from the trained model for every row of data_h2o_df
# (train and test rows alike, since the unsplit frame is passed).
leaves_h2o_df = fraud_xgb_model.predict_leaf_node_assignment(data_h2o_df)

In [21]:
# NOTE(review): plain assignment binds a second name to the SAME frame object;
# it does not copy. Columns assigned onto data_and_leaves_h2o_df in the next
# cell are therefore assigned onto data_h2o_df as well.
data_and_leaves_h2o_df = data_h2o_df

In [22]:
# Attach every leaf-assignment column to the combined frame as a factor, under
# a cleaned name: the ".C1" marker is removed and the name is lower-cased.
leaf_columns = leaves_h2o_df.columns
renamed_leaf_columns = []
for leaf_col in leaf_columns:
    renamed_leaf_column = leaf_col.replace(".C1", "").lower()
    renamed_leaf_columns.append(renamed_leaf_column)
    data_and_leaves_h2o_df[renamed_leaf_column] = leaves_h2o_df[leaf_col].asfactor()

In [23]:
# Convert the combined H2O frame (data + leaf columns) to a Spark DataFrame.
data_and_leaves_df = project_utils.as_spark_frame(data_and_leaves_h2o_df)

In [24]:
# Check column names and types after the conversion back to Spark.
data_and_leaves_df.printSchema()

root
 |-- request_id: string (nullable = false)
 |-- timestamp: string (nullable = false)
 |-- model_name: string (nullable = true)
 |-- product_code: string (nullable = false)
 |-- control_group: string (nullable = false)
 |-- control_group_ratio: double (nullable = false)
 |-- final_decision: string (nullable = false)
 |-- model_score: double (nullable = false)
 |-- model_3ds_threshold: double (nullable = true)
 |-- yyyy_mm_dd: timestamp (nullable = false)
 |-- filter_date: timestamp (nullable = false)
 |-- binary_label: string (nullable = false)
 |-- email_address_hash_distinct_count_digest_2d: short (nullable = false)
 |-- email_address_hash_distinct_count_detected_country_2d: byte (nullable = false)
 |-- email_address_hash_distinct_count_detected_country_1dly_30d2h_2h: byte (nullable = true)
 |-- email_address_hash_distinct_count_ip_address_hash_2d: byte (nullable = false)
 |-- digest_distinct_count_email_address_hash_2d: byte (nullable = true)
 |-- digest_distinct_count_email_add




In [25]:
# Persist the result as a table for later analysis.
# NOTE(review): no write mode is set, so Spark's default applies (error if the
# table already exists) — a second run of this cell will presumably fail unless
# the table is dropped first or .mode("overwrite") is chosen deliberately.
data_and_leaves_df.write.saveAsTable("kdevries.data_and_leaves_df__analysis_2021_01_15")