In [1]:
# imports
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import os
import pickle

import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR, SVR
from sklearn.metrics import mean_absolute_error
pd.options.display.precision = 15

import lightgbm as lgb
import xgboost as xgb
import time
import datetime
from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold, GroupKFold, GridSearchCV, train_test_split, TimeSeriesSplit
from sklearn import metrics
from sklearn import linear_model
import gc
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

import eli5
import shap
from IPython.display import HTML
import json
import altair as alt

import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline
import multiprocessing
import warnings
warnings.simplefilter('ignore')
alt.renderers.enable('notebook')
from functions import save_features2, LabelEncoderPopularity,load_data
from lib.training import *

In [2]:
from catenc.utils import *

## Load base dataset


In [3]:
# Load the base frame from a local pickle.
# NOTE: unpickling executes arbitrary code, so only load pickles produced by this project.
data = pd.read_pickle("./pickles/data_all_features.pkl")


In [4]:
# Replace the 'test' marker in the target column with -1, then store the
# column as int8 so it is a compact, purely numeric label.
fraud = data['isFraud'].replace({'test': -1}).astype(np.int8)
data['isFraud'] = fraud

In [5]:
# Read the initial categorical / numerical feature-name lists from the HDF store.
# The store is opened read-only (mode='r'): this cell writes nothing, and the
# default append mode would open the file for writing and create it if the
# path did not exist.
with pd.HDFStore('./hdf/Store.h5', mode='r', complib='blosc:blosclz') as store:
    categorial_features = list(store['initial/categorial_features'].values.flatten())
    numerical_features = list(store['initial/numerical_features'].values.flatten())

## Apply Additional Features

In [6]:
# Attach every feature stored under the 'encodedFeatures' group to `data`.
# Opened read-only: this cell only reads from the store.
with pd.HDFStore('./hdf/Store.h5', mode='r', complib='blosc:blosclz') as store:
    num_features = store.get_node('encodedFeatures')
    for feature in num_features:
        feature_name = feature._v_name
        # TODO: assess here whether introducing the feature is worthwhile
        q = store.select(feature._v_pathname)
        duplicated_rows = q.index.duplicated(keep='first')
        if duplicated_rows.any():
            print('Problem', feature_name)
            # Keep the first row for every repeated index label. Dropping by
            # label instead would remove *all* rows carrying that label,
            # including the first occurrence.
            q = q[~duplicated_rows]
        # Assignment aligns on the index of `data`.
        data[feature_name] = q

# Register the new columns by type. The membership checks keep the lists free
# of duplicates when this cell is re-run.
for name in ['BrowserAge', 'BrowserVersion', 'screen_height', 'screen_width']:
    if name not in numerical_features:
        numerical_features.append(name)
for name in ['Browser', 'OS', 'OSVersion', 'device_name', 'device_version']:
    if name not in categorial_features:
        categorial_features.append(name)

# Remove id_30 / id_31 from both the feature list and the frame
# (presumably superseded by the Browser/OS columns registered above — confirm).
# Guarded so that a second run of the cell does not raise.
for name in ['id_30', 'id_31']:
    if name in categorial_features:
        categorial_features.remove(name)
data.drop(['id_30', 'id_31'], axis=1, inplace=True, errors='ignore')

## Add Engineered Features

In [7]:
# Attach the features stored under the 'numerical' and 'categorial' groups of
# the engineering store to `data`, and register each feature name in the
# matching feature list.
with pd.HDFStore('./hdf/engineering.h5', 'r') as store:
    for group_name, feature_list in (('numerical', numerical_features),
                                     ('categorial', categorial_features)):
        for feature in store.get_node(group_name):
            feature_name = feature._v_name
            # TODO: assess here whether introducing the feature is worthwhile
            q = store.select(feature._v_pathname)
            duplicated_rows = q.index.duplicated(keep='first')
            if duplicated_rows.any():
                print('Problem', feature_name)
                # Keep the first row for every repeated index label. Dropping
                # by label instead would remove *all* rows carrying that
                # label, including the first occurrence.
                q = q[~duplicated_rows]
            # Assignment aligns on the index of `data`.
            data[feature_name] = q
            # Skip names that are already registered so a re-run of this cell
            # does not add duplicates to the feature lists.
            if feature_name not in feature_list:
                feature_list.append(feature_name)
    
    

In [8]:
# Row count per target value (-1 is the value the 'test' marker was mapped to).
data.isFraud.value_counts()

 0    569877
-1    506691
 1     20663
Name: isFraud, dtype: int64

In [9]:
# Preview the first rows of the frame after the stored features were attached.
data.head()

Unnamed: 0_level_0,isFraud,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,D11__device_name,P_emaildomain_bin__C2,addr1__card1,card1__card5,card2__dist1,card2__id_20,card5__P_emaildomain_bin,device_name__P_emaildomain_bin,id_02__D8,id_02__id_20
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987000,0,68.5,W,13926,,150.0,discover,142.0,credit,315.0,...,13.0_nan,nan_1.0,315.0_13926,13926_142.0,nan_19.0,nan_nan,142.0_nan,nan_nan,nan_nan,nan_nan
2987001,0,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,...,nan_nan,google_1.0,325.0_2755,2755_102.0,404.0_nan,404.0_nan,102.0_google,nan_google,nan_nan,nan_nan
2987002,0,59.0,W,4663,490.0,150.0,visa,166.0,debit,330.0,...,315.0_nan,microsoft_1.0,330.0_4663,4663_166.0,490.0_287.0,490.0_nan,166.0_microsoft,nan_microsoft,nan_nan,nan_nan
2987003,0,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,476.0,...,nan_nan,yahoo_5.0,476.0_18132,18132_117.0,567.0_nan,567.0_nan,117.0_yahoo,nan_yahoo,nan_nan,nan_nan
2987004,0,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,420.0,...,nan_2.0,google_1.0,420.0_4497,4497_102.0,514.0_nan,514.0_144.0,102.0_google,2.0_google,70787.0_nan,70787.0_144.0


In [33]:
# Rows with a non-negative target are the labelled training rows; rows with a
# negative target (the former 'test' marker) form the test set. The target and
# the 'Date' column are excluded from the model inputs.
is_labelled = data['isFraud'] >= 0
is_unlabelled = data['isFraud'] < 0

X = data.loc[is_labelled].drop(columns=['isFraud', 'Date'])
y = data.loc[is_labelled, 'isFraud']
X_test = data.loc[is_unlabelled].drop(columns=['isFraud', 'Date'])

In [11]:
# Full list of column names (the head() preview elides the middle columns).
list(data.columns)

In [17]:
# Only encode the categorical features that are actually present in `data`.
present_columns = data.columns.values
cols = [name for name in categorial_features if name in present_columns]

# Names of the encoders handed to DoubleValidationEncoderNumerical (from catenc.utils).
encoder_names = ("JamesSteinEncoder", "CatBoostEncoder")
Encoder = DoubleValidationEncoderNumerical(cols, encoder_names)

In [31]:
# X_train, X_test, y_train, y_test = train_test_split(data.drop("isFraud", axis=1), data["isFraud"],
#                                                     test_size=0.4, shuffle=False)
# X_train, X_test = X_train.reset_index(drop=False), X_test.reset_index(drop=False)
# y_train, y_test = np.array(y_train), np.array(y_test)

In [32]:
# `y_train` is only assigned in the commented-out train_test_split cell, so
# referencing it raises NameError on a fresh kernel. Inspect the label column
# of the full frame instead (training labels followed by the -1 test marker).
data['isFraud'].values

array([ 0,  0,  0, ..., -1, -1, -1], dtype=int8)

In [36]:
# Target as a NumPy array — the form passed to Encoder.fit_transform.
y.values

array([0, 0, 0, ..., 0, 0, 0], dtype=int8)

In [None]:
# Move the index into an ordinary column (reset_index keeps it by default),
# then fit the encoder on the training rows and keep the encoded frame.
# NOTE(review): this rebinds X, so re-running the cell resets the index again
# and inserts one more index column — run it once per kernel session.
# NOTE(review): X_test is transformed later without an equivalent
# reset_index — confirm the encoder does not depend on the index layout.
X = X.reset_index()
X_emb = Encoder.fit_transform(X, y.values)


In [None]:
# Display the frame returned by Encoder.fit_transform.
X_emb

In [22]:
# Number of labelled training rows.
y.shape

(590540,)

In [None]:
# Apply the encoder fitted on the training rows to the test rows.
# NOTE(review): X had its index reset before fit_transform but X_test does
# not — confirm the encoder does not depend on the index layout.
# NOTE(review): the cell rebinds X_test, so re-running it feeds the already
# transformed frame back into the encoder.
X_test = Encoder.transform(X_test)

In [None]:
# Preview the first rows of the transformed test frame.
X_test.head()

In [None]:
# Inspect the encoder's `storage` attribute (its contents are defined by
# catenc and are not visible from this notebook — TODO confirm what it holds).
Encoder.storage