In [1]:
import gc
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.feature_selection import RFECV
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

In [2]:
def rename_test_identity_columns(df):
    """
    Rename hyphenated identity columns (e.g. ``id-01``) to the underscore
    form (``id_01``).

    Only string column names that start with ``id`` AND contain a hyphen are
    renamed; the new name is the first two hyphen-separated parts joined by an
    underscore. Every other column is left untouched, so a column that is
    already spelled ``id_01`` no longer raises IndexError.

    :param df: dataframe whose columns should be normalised (not modified)
    :return df: a renamed copy of the dataframe
    """
    renames = {}
    for col in df.columns:
        if isinstance(col, str) and col[0:2] == 'id' and '-' in col:
            parts = col.split('-')
            renames[col] = parts[0] + '_' + parts[1]

    # a single rename call instead of one dataframe copy per renamed column
    return df.rename(columns=renames)


def load_data(DATA_DIRECTORY):
    """
    Read the identity / transaction tables for train and test plus the sample
    submission from ``DATA_DIRECTORY``.

    The four identity/transaction tables are indexed by ``TransactionID``; the
    sample submission keeps its default index. The test identity table is
    passed through ``rename_test_identity_columns`` before being returned.

    :param DATA_DIRECTORY: directory containing the CSV files
    :return dataframes: train_identity, train_transaction, test_identity,
        test_transaction, submission
    """
    def _read(file_name, **read_options):
        return pd.read_csv(os.path.join(DATA_DIRECTORY, file_name), **read_options)

    train_identity = _read('train_identity.csv', index_col='TransactionID')
    train_transaction = _read('train_transaction.csv', index_col='TransactionID')

    test_identity = _read('test_identity.csv', index_col='TransactionID')
    test_transaction = _read('test_transaction.csv', index_col='TransactionID')

    submission = _read('sample_submission.csv')

    test_identity = rename_test_identity_columns(test_identity)

    return train_identity, train_transaction, test_identity, test_transaction, submission

In [3]:
def reduce_mem_usage(df, verbose=True):
    """
    Reduce memory usage by casting numeric columns to the narrowest dtype
    whose range contains the column's min and max.

    Integer columns are tried against int8 / int16 / int32 / int64, float
    columns against float16 / float32 with float64 as the fallback. Range
    checks are inclusive, so a column that spans exactly [-128, 127] is stored
    as int8. Non-numeric columns are left untouched.

    The dataframe is modified in place and also returned.

    NOTE(review): the range check says nothing about precision; values cast to
    float16 / float32 may be rounded. Confirm this is acceptable for the
    features involved.

    :param df: dataframe to downcast (mutated)
    :param verbose: print the resulting memory footprint when True
    :return df: the same dataframe with downcast columns
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024 ** 2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # smallest integer type whose (inclusive) range holds the column
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # all-NaN columns and out-of-range values (e.g. inf) end up here
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024 ** 2

    if verbose:
        # guard against a zero-byte starting footprint
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
            end_mem, reduction)
        )
    return df


def label_encoding(df):
    """
    Label encode every object-typed column of a single dataframe in place.

    Values are cast to ``str`` before encoding, so missing values become their
    string representation and receive their own label.

    NOTE: a later cell of this notebook defines ``label_encoding(train, test)``,
    which replaces this definition once that cell has run.

    :param df: dataframe to encode (mutated)
    :return df: the same dataframe, returned for convenience
    """
    for col in tqdm(df.columns, desc="Label Encoding..."):
        if df[col].dtype == 'object':
            values = list(df[col].astype(str).values)
            # fitting once is enough; the original fitted on the same values
            # concatenated with themselves, which yields identical classes
            df[col] = LabelEncoder().fit_transform(values)

    return df


def recursive_feature_elimination(train, from_backup=True):
    """
    Select features by recursive feature elimination with cross-validation.

    With ``from_backup=True`` (the default) ``train`` is not touched and a
    stored list of feature names from an earlier run is returned. Otherwise
    the data is downcast, ordered by ``TransactionDT``, missing values are
    filled with -999 and an RFECV over a LightGBM classifier is fitted; the
    names of the features ranked 1 are returned.

    :param train: training dataset containing ``isFraud`` and ``TransactionDT``
    :param from_backup: return the stored result instead of re-running RFE
    :return top ranked features: list of column names
    """
    if from_backup:
        # stored result of a previous RFE run
        return ['TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2',
                'dist1', 'P_emaildomain', 'R_emaildomain', 'C1', 'C2', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11',
                'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14',
                'D15', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'V3', 'V4', 'V5', 'V7', 'V12', 'V13', 'V19',
                'V20', 'V29', 'V30', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V40', 'V43', 'V44', 'V45', 'V47', 'V48',
                'V49', 'V51', 'V52', 'V53', 'V54', 'V56', 'V57', 'V58', 'V60', 'V61', 'V62', 'V69', 'V70', 'V72', 'V74',
                'V75', 'V76', 'V78', 'V81', 'V82', 'V83', 'V87', 'V90', 'V91', 'V94', 'V95', 'V96', 'V97', 'V99',
                'V100', 'V126', 'V127', 'V128', 'V130', 'V131', 'V139', 'V140', 'V143', 'V145', 'V149', 'V150', 'V152',
                'V156', 'V158', 'V159', 'V160', 'V162', 'V164', 'V165', 'V166', 'V167', 'V169', 'V170', 'V171', 'V187',
                'V188', 'V189', 'V200', 'V201', 'V202', 'V203', 'V204', 'V205', 'V206', 'V207', 'V208', 'V209', 'V210',
                'V212', 'V213', 'V215', 'V216', 'V217', 'V218', 'V219', 'V221', 'V222', 'V223', 'V224', 'V225', 'V226',
                'V228', 'V231', 'V232', 'V233', 'V234', 'V243', 'V244', 'V251', 'V254', 'V256', 'V257', 'V258', 'V261',
                'V262', 'V263', 'V264', 'V265', 'V266', 'V267', 'V268', 'V270', 'V271', 'V272', 'V273', 'V274', 'V275',
                'V276', 'V277', 'V278', 'V279', 'V280', 'V282', 'V283', 'V285', 'V287', 'V288', 'V289', 'V291', 'V292',
                'V294', 'V303', 'V306', 'V307', 'V308', 'V310', 'V312', 'V313', 'V314', 'V315', 'V317', 'V322', 'V323',
                'V324', 'V326', 'V331', 'V332', 'V333', 'V335', 'id_01', 'id_02', 'id_03', 'id_05', 'id_06', 'id_09',
                'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_19', 'id_20', 'id_30', 'id_31', 'id_32', 'id_33',
                'id_38', 'DeviceType', 'DeviceInfo', 'device_name', 'OS_id_30', 'version_id_30', 'browser_id_31',
                'version_id_31', 'screen_width', 'screen_height', 'P_emaildomain_bin', 'P_emaildomain_suffix',
                'R_emaildomain_bin', 'R_emaildomain_suffix', 'TransactionAmt_Log', 'TransactionAmt_decimal']

    # from here on: run the elimination from scratch
    train = reduce_mem_usage(train)
    ordered = train.sort_values('TransactionDT')

    del train
    gc.collect()

    y = ordered['isFraud']
    X = ordered.drop(['isFraud', 'TransactionDT'], axis=1)
    del ordered

    X = X.fillna(-999)

    # parameters chosen by BayesianOptimization
    # Credit to this notebook: https://www.kaggle.com/vincentlugat/ieee-lgb-bayesian-opt/notebook
    params = dict(
        num_leaves=491,
        min_child_weight=0.03454472573214212,
        feature_fraction=0.3797454081646243,
        bagging_fraction=0.4181193142567742,
        min_data_in_leaf=106,
        objective='binary',
        max_depth=-1,
        learning_rate=0.006883242363721497,
        boosting_type="gbdt",
        bagging_seed=11,
        metric='auc',
        verbosity=-1,
        reg_alpha=0.3899927210061127,
        reg_lambda=0.6485237330340494,
        random_state=47,
    )

    # imported lazily so the backup path works without lightgbm installed
    import lightgbm as lgb

    estimator = lgb.LGBMClassifier(**params)
    folds = KFold(n_splits=5, shuffle=False)
    selector = RFECV(estimator=estimator, step=10, cv=folds, scoring='roc_auc', verbose=2)

    selector.fit(X, y)

    return [name for name, rank in zip(X.columns, selector.ranking_) if rank == 1]

In [4]:
# Directory holding the competition CSV files; feature_engineering() passes it to load_data().
DATA_DIRECTORY = '../input/ieee-fraud-detection/'


def _split_first_two(series, sep):
    """Split a string series on ``sep`` and return its first and second part as two series."""
    # reindex guarantees both columns exist even when no value contains ``sep``
    # (plain ``[1]`` indexing would raise KeyError in that case)
    parts = series.str.split(sep, expand=True).reindex(columns=[0, 1])
    return parts[0], parts[1]


def id_split(df, min_device_count=200):
    """
    Split or convert some identifiable features into sub-features.

    Adds ``device_name`` / ``device_version`` (from ``DeviceInfo``),
    ``OS_id_30`` / ``version_id_30``, ``browser_id_31`` / ``version_id_31``,
    ``screen_width`` / ``screen_height`` (from ``id_33``) and ``had_id``, and
    replaces ``id_34`` and ``id_23`` by the part after their first ``:``.
    Device names are then collapsed to a vendor label by substring match, and
    names occurring fewer than ``min_device_count`` times become "Others".

    The dataframe is modified in place and also returned.

    :param df: dataframe with DeviceInfo, id_23, id_30, id_31, id_33 and id_34 columns
    :param min_device_count: device names rarer than this are grouped as "Others"
    :return df: the same dataframe with the additional columns
    """
    df['device_name'], df['device_version'] = _split_first_two(df['DeviceInfo'], '/')
    df['OS_id_30'], df['version_id_30'] = _split_first_two(df['id_30'], ' ')
    df['browser_id_31'], df['version_id_31'] = _split_first_two(df['id_31'], ' ')
    df['screen_width'], df['screen_height'] = _split_first_two(df['id_33'], 'x')

    df['id_34'] = _split_first_two(df['id_34'], ':')[1]
    df['id_23'] = _split_first_two(df['id_23'], ':')[1]

    # (substring, vendor) pairs, applied in this order; each match is evaluated
    # against names already rewritten by earlier rules, so the order matters
    vendor_rules = [
        ('SM', 'Samsung'),
        ('SAMSUNG', 'Samsung'),
        ('GT-', 'Samsung'),
        ('Moto G', 'Motorola'),
        ('Moto', 'Motorola'),
        ('moto', 'Motorola'),
        ('LG-', 'LG'),
        ('rv:', 'RV'),
        ('HUAWEI', 'Huawei'),
        ('ALE-', 'Huawei'),
        ('-L', 'Huawei'),
        ('Blade', 'ZTE'),
        ('BLADE', 'ZTE'),
        ('Linux', 'Linux'),
        ('XT', 'Sony'),
        ('HTC', 'HTC'),
        ('ASUS', 'Asus'),
    ]
    for pattern, vendor in vendor_rules:
        df.loc[df['device_name'].str.contains(pattern, na=False), 'device_name'] = vendor

    # group infrequent device names together
    counts = df['device_name'].value_counts()
    rare_names = counts[counts < min_device_count].index
    df.loc[df['device_name'].isin(rare_names), 'device_name'] = "Others"

    df['had_id'] = 1
    gc.collect()

    return df


def merge_transaction_and_identify(transaction, identity):
    """
    Left-join the identity table onto the transaction table using the index
    of both frames, so every transaction row is kept.

    :param transaction: transaction dataframe (left side)
    :param identity: identity dataframe (right side)
    :return df: merged dataframe
    """
    merged = pd.merge(transaction, identity, how='left', left_index=True, right_index=True)

    # drop the local references before forcing a collection
    del transaction, identity
    gc.collect()

    return merged


def email_mappings(train, test):
    """
    Derive two features from each of ``P_emaildomain`` and ``R_emaildomain``.

    ``<col>_bin`` maps the domain to a provider group through the lookup table
    below (domains missing from the table become NaN), for example
    yahoo / ymail / frontier / rocketmail -> yahoo
    hotmail / outlook / live / msn -> microsoft
    icloud / mac / me -> apple
    prodigy / att / sbcglobal -> att
    centurylink / embarqmail / q -> centurylink
    aim / aol -> aol
    twc / charter -> spectrum

    ``<col>_suffix`` is the text after the last dot of the stringified domain,
    with ``gmail``, ``net`` and ``edu`` replaced by ``us``.

    Both frames are modified in place and also returned.

    :param train:
    :param test:
    :return train, test:
    """
    emails = {'gmail': 'google', 'att.net': 'att', 'twc.com': 'spectrum', 'scranton.edu': 'other',
              'optonline.net': 'other', 'hotmail.co.uk': 'microsoft', 'comcast.net': 'other', 'yahoo.com.mx': 'yahoo',
              'yahoo.fr': 'yahoo', 'yahoo.es': 'yahoo', 'charter.net': 'spectrum', 'live.com': 'microsoft',
              'aim.com': 'aol', 'hotmail.de': 'microsoft', 'centurylink.net': 'centurylink', 'gmail.com': 'google',
              'me.com': 'apple', 'earthlink.net': 'other', 'gmx.de': 'other', 'web.de': 'other', 'cfl.rr.com': 'other',
              'hotmail.com': 'microsoft', 'protonmail.com': 'other', 'hotmail.fr': 'microsoft',
              'windstream.net': 'other', 'outlook.es': 'microsoft', 'yahoo.co.jp': 'yahoo', 'yahoo.de': 'yahoo',
              'servicios-ta.com': 'other', 'netzero.net': 'other', 'suddenlink.net': 'other', 'roadrunner.com': 'other',
              'sc.rr.com': 'other', 'live.fr': 'microsoft', 'verizon.net': 'yahoo', 'msn.com': 'microsoft',
              'q.com': 'centurylink', 'prodigy.net.mx': 'att', 'frontier.com': 'yahoo', 'anonymous.com': 'other',
              'rocketmail.com': 'yahoo', 'sbcglobal.net': 'att', 'frontiernet.net': 'yahoo', 'ymail.com': 'yahoo',
              'outlook.com': 'microsoft', 'mail.com': 'other', 'bellsouth.net': 'other',
              'embarqmail.com': 'centurylink', 'cableone.net': 'other', 'hotmail.es': 'microsoft', 'mac.com': 'apple',
              'yahoo.co.uk': 'yahoo', 'netzero.com': 'other', 'yahoo.com': 'yahoo', 'live.com.mx': 'microsoft',
              'ptd.net': 'other', 'cox.net': 'other', 'aol.com': 'aol', 'juno.com': 'other', 'icloud.com': 'apple'}

    us_emails = ['gmail', 'net', 'edu']

    def _suffix(domain):
        # last dot-separated token of the stringified value, folded to 'us' when listed
        tail = str(domain).rsplit('.', 1)[-1]
        return 'us' if tail in us_emails else tail

    for c in ['P_emaildomain', 'R_emaildomain']:
        bin_col = c + '_bin'
        suffix_col = c + '_suffix'

        for frame in (train, test):
            frame[bin_col] = frame[c].map(emails)

        for frame in (train, test):
            frame[suffix_col] = frame[c].map(_suffix)

    return train, test


def transaction_amt_features(train, test):
    """
    Add two features derived from ``TransactionAmt`` to both frames:
    ``TransactionAmt_Log`` (natural logarithm) and ``TransactionAmt_decimal``
    (fractional part multiplied by 1000 and truncated to an integer).

    Both frames are modified in place and also returned.

    :param train:
    :param test:
    :return train, test:
    """
    for frame in (train, test):
        amount = frame['TransactionAmt']

        # log transformation
        frame['TransactionAmt_Log'] = np.log(amount)

        # fractional part, scaled to three digits
        fraction = amount - amount.astype(int)
        frame['TransactionAmt_decimal'] = (fraction * 1000).astype(int)

    return train, test


def _low_information_columns(df, threshold):
    """Return the columns of ``df`` that are constant, mostly null, or dominated by a single value."""
    flagged = []
    for col in df.columns:
        series = df[col]
        if series.nunique() <= 1:
            # at most one distinct non-null value (also covers an empty frame)
            flagged.append(col)
        elif series.isnull().mean() > threshold:
            flagged.append(col)
        elif series.value_counts(dropna=False, normalize=True).values[0] > threshold:
            flagged.append(col)
    return flagged


def drop_useless_columns(train, test, target='isFraud', threshold=0.9):
    """
    Drop a column from both frames if, in either train or test:
    - it has at most one distinct (non-null) value
    - more than ``threshold`` of its values are NaN
    - more than ``threshold`` of its values are the same (NaN counted as a value)

    The target column is never dropped, whether or not it was flagged, and a
    flagged column that is absent from one of the frames is simply skipped
    there instead of raising.

    :param train:
    :param test:
    :param target: name of the column that must be kept
    :param threshold: fraction above which a column counts as uninformative
    :return train, test: new dataframes without the flagged columns
    """
    flagged = set(_low_information_columns(train, threshold))
    flagged.update(_low_information_columns(test, threshold))

    # exclude target; discard() does not fail when the target was not flagged
    flagged.discard(target)

    train = train.drop(columns=[col for col in train.columns if col in flagged])
    test = test.drop(columns=[col for col in test.columns if col in flagged])

    return train, test


def top_features_aggregation(train, test):
    """
    Add group-relative ratio features to both frames.

    For each value column in (TransactionAmt, id_02, D15) and each grouping
    column in (card1, card4, addr1), two columns are added: the value divided
    by its group mean (``<value>_to_mean_<group>``) and by its group standard
    deviation (``<value>_to_std_<group>``). Statistics are computed per frame.

    Both frames are modified in place and also returned.

    :param train:
    :param test:
    :return train, test:
    """
    value_columns = ['TransactionAmt', 'id_02', 'D15']
    group_columns = ['card1', 'card4', 'addr1']

    for frame in (train, test):
        for value_col in value_columns:
            for group_col in group_columns:
                per_group = frame.groupby([group_col])[value_col]
                group_mean = per_group.transform('mean')
                group_std = per_group.transform('std')

                frame[f'{value_col}_to_mean_{group_col}'] = frame[value_col] / group_mean
                frame[f'{value_col}_to_std_{group_col}'] = frame[value_col] / group_std

    return train, test


def label_encoding(train, test):
    """
    Label encode the object-typed columns of ``train`` and the same-named
    columns of ``test``.

    Each encoder is fitted on the stringified values of both frames together,
    so the two frames share one label space per column. Both frames are
    modified in place and also returned.

    :param train:
    :param test:
    :return train, test:
    """
    object_columns = [name for name in train.columns if train[name].dtype == 'object']

    for name in object_columns:
        train_values = list(train[name].astype(str).values)
        test_values = list(test[name].astype(str).values)

        encoder = LabelEncoder()
        encoder.fit(train_values + test_values)

        train[name] = encoder.transform(train_values)
        test[name] = encoder.transform(test_values)

    return train, test


def clean_inf_nan(df):
    """
    Return a copy of ``df`` in which positive and negative infinity are
    replaced by NaN.
    reference: https://www.kaggle.com/dimartinot
    :param df:
    :return df:
    """
    infinite_values = [np.inf, -np.inf]
    return df.replace(to_replace=infinite_values, value=np.nan)


def feature_engineering():
    """
    Load the raw competition files and build the model inputs.

    Steps: split identity fields, merge identity onto transactions, add email
    and transaction-amount features, drop uninformative columns, keep only the
    features returned by ``recursive_feature_elimination`` (plus ``isFraud``
    and ``TransactionDT``), label encode, order the training rows by
    ``TransactionDT`` and replace infinities by NaN.

    :return X_train, y_train, X_test, submission:
    """
    train_identity, train_transaction, test_identity, test_transaction, submission = load_data(DATA_DIRECTORY)

    train = merge_transaction_and_identify(train_transaction, id_split(train_identity))
    test = merge_transaction_and_identify(test_transaction, id_split(test_identity))

    train, test = email_mappings(train, test)
    train, test = drop_useless_columns(train, test)
    train, test = transaction_amt_features(train, test)

    selected = set(recursive_feature_elimination(train))

    cols_to_drop = [col for col in train.columns if col not in selected]
    # the target and the time column are not selected features but must survive
    for keep in ('isFraud', 'TransactionDT'):
        cols_to_drop.remove(keep)

    train = train.drop(columns=cols_to_drop)
    test = test.drop(columns=cols_to_drop)

    train, test = label_encoding(train, test)

    train_by_time = train.sort_values('TransactionDT')
    X_train = train_by_time.drop(['isFraud', 'TransactionDT'], axis=1)
    y_train = train_by_time['isFraud']

    X_test = test.drop(['TransactionDT'], axis=1)

    del train, test, train_by_time
    gc.collect()

    return clean_inf_nan(X_train), y_train, clean_inf_nan(X_test), submission

In [5]:
# Run the whole feature pipeline; X_train and y_train come back ordered by TransactionDT.
X_train, y_train, X_test, sample_submission = feature_engineering()

In [7]:
# Replace the remaining missing values with the sentinel -999 (modifies X_train in place).
X_train.fillna(-999, inplace=True)

In [9]:
## List of categorical variables according to data description
# NOTE(review): the two names below look swapped relative to their contents --
# `id_cat_var` holds ProductCD / card / addr / email / M columns, while
# `tran_cat_var` holds DeviceType / DeviceInfo / id_12..id_38. Only their
# concatenation `cat_var` is used afterwards, so this is cosmetic; confirm before renaming.
id_cat_var = ['ProductCD', 'addr1', 'addr2', 'P_emaildomain', 'R_emaildomain'] + ['card' + str(i) for i in range(1,7)] + ['M' + str(i) for i in range(1,10)]
tran_cat_var = ['DeviceType', 'DeviceInfo'] + ['id_' + str(i) for i in range(12,39)]

## List of categorical variables derived from the original dataset
der_cat_var = ['device_name','OS_id_30','version_id_30','browser_id_31','version_id_31','P_emaildomain_bin','P_emaildomain_suffix','R_emaildomain_bin','R_emaildomain_suffix']

# Full candidate list; columns missing from the final feature set are filtered out in the next cell.
cat_var = id_cat_var + tran_cat_var + der_cat_var

In [10]:
# Target first, followed by the candidate categorical columns that survived feature selection.
cat_var_new = ['isFraud'] + [name for name in cat_var if name in X_train.columns.tolist()]

In [12]:
# Re-attach the target to the features, matching rows on TransactionID
# (presumably the index name of both objects, since load_data reads it as index_col -- verify).
training = pd.merge(X_train, y_train, on='TransactionID')

In [14]:
# Drop the separate feature / target names; `training` now carries both.
del X_train, y_train

In [27]:
import h2o
from h2o.estimators.random_forest import H2ORandomForestEstimator

In [28]:
# Connect to an H2O instance (a local server is started when none is running), then show the cluster status.
h2o.init()
h2o.cluster().show_status()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.9.1" 2020-11-04; OpenJDK Runtime Environment (build 11.0.9.1+1-Ubuntu-0ubuntu1.18.04); OpenJDK 64-Bit Server VM (build 11.0.9.1+1-Ubuntu-0ubuntu1.18.04, mixed mode, sharing)
  Starting server from /opt/conda/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpltmvqtbz
  JVM stdout: /tmp/tmpltmvqtbz/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmpltmvqtbz/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.1.3
H2O_cluster_version_age:,1 month and 23 days
H2O_cluster_name:,H2O_from_python_unknownUser_r5jy8h
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,4 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.1.3
H2O_cluster_version_age:,1 month and 23 days
H2O_cluster_name:,H2O_from_python_unknownUser_r5jy8h
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,4 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


In [29]:
# Convert the pandas training frame into an H2OFrame.
training_hf = h2o.H2OFrame(training)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [30]:
# Convert the target and the categorical columns to H2O factors (reported as 'enum' by .types).
for var in cat_var_new:
    training_hf[var] = training_hf[var].asfactor()

In [31]:
# Inspect the column types after the factor conversion.
training_hf.types

{'TransactionAmt': 'real',
 'ProductCD': 'enum',
 'card1': 'enum',
 'card2': 'enum',
 'card3': 'enum',
 'card4': 'enum',
 'card5': 'enum',
 'card6': 'enum',
 'addr1': 'enum',
 'addr2': 'enum',
 'dist1': 'int',
 'P_emaildomain': 'enum',
 'R_emaildomain': 'enum',
 'C1': 'int',
 'C2': 'int',
 'C4': 'int',
 'C5': 'int',
 'C6': 'int',
 'C7': 'int',
 'C8': 'int',
 'C9': 'int',
 'C10': 'int',
 'C11': 'int',
 'C12': 'int',
 'C13': 'int',
 'C14': 'int',
 'D1': 'int',
 'D2': 'int',
 'D3': 'int',
 'D4': 'int',
 'D5': 'int',
 'D6': 'int',
 'D8': 'real',
 'D9': 'real',
 'D10': 'int',
 'D11': 'int',
 'D12': 'int',
 'D13': 'int',
 'D14': 'int',
 'D15': 'int',
 'M2': 'enum',
 'M3': 'enum',
 'M4': 'enum',
 'M5': 'enum',
 'M6': 'enum',
 'M7': 'enum',
 'M8': 'enum',
 'M9': 'enum',
 'V3': 'int',
 'V4': 'int',
 'V5': 'int',
 'V7': 'int',
 'V12': 'int',
 'V13': 'int',
 'V19': 'int',
 'V20': 'int',
 'V29': 'int',
 'V30': 'int',
 'V33': 'int',
 'V34': 'int',
 'V35': 'int',
 'V36': 'int',
 'V37': 'int',
 'V38'

In [32]:
# Seeded split of the H2O frame into a training part (ratio 0.8) and a validation part (the remainder).
train, valid = training_hf.split_frame(ratios = [.8], seed = 1234)

In [33]:
# Random forest estimator limited to a single tree.
# NOTE(review): ntrees=1 looks like a quick-run setting; the metrics recorded further down were produced with it -- confirm intent.
rf = H2ORandomForestEstimator(model_id='rf', ntrees=1, seed=1234)

In [34]:
# Response column, and every other column of `training` as a predictor.
response = 'isFraud'
predictors = training.columns.tolist()
predictors.remove(response)

In [35]:
# Fit on the training split; the validation split is passed so validation metrics are recorded with the model.
rf.train(x = predictors, y = response, training_frame = train, validation_frame=valid)

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [42]:
# AUC on the training and validation frames, returned as a dict.
rf.auc(train=True, valid=True)

{'train': 0.7673385668273435, 'valid': 0.7705249385146199}

In [39]:
# Full binomial metrics report (confusion matrix, threshold table, gains/lift) on the validation split.
rf.model_performance(valid)


ModelMetricsBinomial: drf
** Reported on test data. **

MSE: 0.027069427709748087
RMSE: 0.1645278934094401
LogLoss: 0.4123140100185691
Mean Per-Class Error: 0.2587420976915278
AUC: 0.7705249385146199
AUCPR: 0.3615393853451903
Gini: 0.5410498770292398

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.30000001192092896: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,112646.0,1041.0,0.0092,(1041.0/113687.0)
1,1,2510.0,1619.0,0.6079,(2510.0/4129.0)
2,Total,115156.0,2660.0,0.0301,(3551.0/117816.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.3,0.476948,55.0
1,max f2,0.122449,0.443672,91.0
2,max f0point5,0.545455,0.57531,33.0
3,max accuracy,0.733333,0.9716,21.0
4,max precision,0.934911,0.695744,3.0
5,max recall,0.0,1.0,326.0
6,max specificity,1.0,0.995013,0.0
7,max absolute_mcc,0.545455,0.481544,33.0
8,max min_per_class_accuracy,0.02116,0.705498,231.0
9,max mean_per_class_accuracy,0.038394,0.741258,177.0



Gains/Lift Table: Avg response rate:  3.50 %, avg score:  3.45 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.01521,1.0,19.505517,19.505517,0.683594,1.0,0.683594,1.0,0.296682,0.296682,1850.551738,1850.551738,0.291695
1,2,0.02004,0.428571,15.595795,18.563276,0.546573,0.705244,0.650572,0.928964,0.075321,0.372003,1459.579484,1756.32759,0.364746
2,3,0.030225,0.138614,5.302528,14.094616,0.185833,0.236492,0.493962,0.695612,0.054008,0.426011,430.252846,1309.461627,0.410161
3,4,0.041234,0.097107,2.66198,11.042305,0.093292,0.113872,0.386991,0.540298,0.029305,0.455316,166.197998,1004.230477,0.429121
4,5,0.053363,0.077661,2.256346,9.045305,0.079076,0.084613,0.317003,0.436723,0.027367,0.482683,125.634552,804.530529,0.444913
5,6,0.119339,0.041451,1.523417,4.886867,0.05339,0.049186,0.171266,0.222475,0.100509,0.583192,52.341708,388.686737,0.4807
6,7,0.150421,0.027624,0.849313,4.052564,0.029765,0.033822,0.142027,0.183493,0.026399,0.609591,-15.068744,305.256393,0.475846
7,8,0.206763,0.022573,0.786635,3.162612,0.027569,0.022854,0.110837,0.139719,0.044321,0.653911,-21.336506,216.261168,0.463388
8,9,0.309321,0.02116,0.618708,2.319158,0.021683,0.021403,0.081278,0.100491,0.063454,0.717365,-38.129175,131.915793,0.422863
9,10,0.401109,0.010279,0.567298,1.918273,0.019882,0.013982,0.067228,0.080695,0.052071,0.769436,-43.27017,91.827319,0.381704







In [None]:
# Same -999 sentinel as used for X_train, applied to the test features in place.
X_test.fillna(-999, inplace=True)

In [None]:
# Convert the pandas test features into an H2OFrame.
testing_hf = h2o.H2OFrame(X_test)

In [None]:
# The test features have no target column, so drop it from the list of factor columns.
# Guarded so that re-running this cell does not raise ValueError once 'isFraud' is already gone.
if 'isFraud' in cat_var_new:
    cat_var_new.remove('isFraud')

In [None]:
# Convert the categorical columns of the test frame to H2O factors, mirroring the conversion applied to the training frame.
for var in cat_var_new:
    testing_hf[var] = testing_hf[var].asfactor()

In [None]:
# Score the test frame with the fitted random forest.
predict = rf.predict(testing_hf)

In [None]:
# Write the predictions to predict.csv.
h2o.download_csv(predict, "predict.csv")

In [None]:
#Kaggle Score: 0.695691