In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
import os
import pickle

import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR, SVR
from sklearn.metrics import mean_absolute_error
pd.options.display.precision = 15

import lightgbm as lgb
import xgboost as xgb
import time
import datetime
from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold, GroupKFold, GridSearchCV, train_test_split, TimeSeriesSplit
from sklearn import metrics
from sklearn import linear_model
import gc
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

import eli5
import shap
from IPython.display import HTML
import json
import altair as alt

import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline

alt.renderers.enable('notebook')

In [None]:
# Superseded HDF5-based loader, kept for reference only:
# with pd.HDFStore('./hdf/Store.h5',complib='blosc:blosclz') as store:
#     data = store['initial/encoded']
#     categorial_features = list(store['initial/categorial_features'].values.flatten())
#     numerical_features = list(store['initial/numerical_features'].values.flatten())
#     numerical_features.sort()
#     categorial_features.sort()
# Load the working frame from a local pickle.
# NOTE(review): unpickling can execute arbitrary code - only load pickles produced by this project.
data = pd.read_pickle("./pickles/data_all_features.pkl")

In [None]:
# Replace the 'test' marker in the label column with -1 and store the labels as int8.
fraud = data['isFraud'].replace({'test': -1}).astype(np.int8)
data['isFraud'] = fraud

In [None]:
# with pd.HDFStore('./hdf/Store.h5',complib='blosc:blosclz') as store:
#     categorial_features = list(store['initial/categorial_features'].values.flatten())
#     numerical_features = list(store['initial/numerical_features'].values.flatten())
# Read the two feature-name lists from JSON; the context managers close the files
# deterministically instead of leaving the handles to the garbage collector.
with open('pickles/categorial_features.json', 'r') as features_file:
    categorial_features = json.load(features_file)
with open('pickles/numerical_features.json', 'r') as features_file:
    numerical_features = json.load(features_file)

## Apply Additional Features

In [None]:
# Attach every feature stored under the 'encodedFeatures' node to `data`,
# one column per child node, named after that node.
with pd.HDFStore('./hdf/Store.h5',complib='blosc:blosclz') as store:
    num_features = store.get_node('encodedFeatures')
    for feature in num_features:
        feature_name = feature._v_name
        # TODO: assess here whether the feature is worth introducing
        q = store.select(feature._v_pathname)
        duplicated_mask = q.index.duplicated()
        if duplicated_mask.any():
            print('Problem', feature_name)
            # NOTE(review): drop() works by label, so every row sharing a duplicated index
            # value is removed, not only the repeats - confirm that this is intended.
            q = q.drop(q.index[duplicated_mask], axis=0)
        data[feature_name] = q
    numerical_features.extend(['BrowserAge', 'BrowserVersion', 'screen_height', 'screen_width'])
    categorial_features.extend(['Browser',  'OS', 'OSVersion', 'device_name', 'device_version'])

# id_30 and id_31 leave both the categorical list and the frame.
for removed_feature in ('id_30', 'id_31'):
    categorial_features.remove(removed_feature)
data.drop(columns=['id_30', 'id_31'], inplace=True)

In [None]:
# Preview the first rows of the frame at this point of the notebook.
data.head()

In [None]:
## Load CardId
with pd.HDFStore('./hdf/Store.h5',complib='blosc:blosclz') as store:
    data['card_id'] = store['initial/card_id']
# Register the column only once, so re-running this cell does not leave a duplicate entry.
if 'card_id' not in categorial_features:
    categorial_features.append('card_id')

In [None]:
# Preview the first rows again, now including the card_id column.
data.head()

In [None]:
# Remove the two e-mail domain columns from the frame.
data.drop(columns=['R_emaildomain', 'P_emaildomain'], inplace=True)

# Other features

In [None]:
# Fractional part of the amount, scaled by 1000 and stored as an integer.
# NOTE(review): astype(int) truncates, so floating-point error in the subtraction may shift
# a value by one - confirm whether rounding is wanted instead.
amount = data['TransactionAmt']
data['TransactionAmt_decimal'] = ((amount - amount.astype(int)) * 1000).astype(int)
with pd.HDFStore('./hdf/engineering.h5') as store:
    store.put('numerical/TransactionAmt_decimal', data[['TransactionAmt_decimal']])
    

In [None]:
# Frequency encoding of card1: each row gets the number of rows sharing its card1 value (NaN is counted too).
card1_frequency = data['card1'].value_counts(dropna=False)
data['card1_count_full'] = data['card1'].map(card1_frequency)
with pd.HDFStore('./hdf/engineering.h5') as store:
    store.put('numerical/card1_count_full', data[['card1_count_full']])

In [None]:
# Some arbitrary feature interactions: each name is '<left>__<right>' and the new column
# is the two source columns rendered as strings and glued together with '_'.
interaction_names = [
    'id_02__id_20', 'id_02__D8', 'D11__device_name', 'device_name__P_emaildomain_bin', 'P_emaildomain_bin__C2',
    'card2__dist1', 'card1__card5', 'card2__id_20', 'card5__P_emaildomain_bin', 'addr1__card1',
]
for feature in interaction_names:
    print(feature)
    left_name, right_name = feature.split('__')
    left_text = data[left_name].astype(str)
    right_text = data[right_name].astype(str)
    data[feature] = left_text + '_' + right_text

#     le = LabelEncoderPopularity(convert_nan=True)
#     le.fit(data[feature].astype(str))
#     data[feature] = le.transform(data[feature].astype(str))
    # Persist the new column under the 'categorial' group of the engineering store.
    with pd.HDFStore('./hdf/engineering.h5') as store:
        store.put(f'categorial/{feature}', data[[feature]])
        

In [None]:
# Register the interaction columns as categorical. The membership test keeps a re-run
# of this cell from appending the same names a second time.
for interaction in ['id_02__id_20', 'id_02__D8', 'D11__device_name', 'device_name__P_emaildomain_bin', 'P_emaildomain_bin__C2',
                    'card2__dist1', 'card1__card5', 'card2__id_20', 'card5__P_emaildomain_bin', 'addr1__card1']:
    if interaction not in categorial_features:
        categorial_features.append(interaction)

In [None]:
for feature in ('id_01', 'id_33', 'id_35', 'id_34', 'id_36'):
    # Count encoding computed over the whole frame (train and test together)
    count_column = feature + '_count_full'
    value_frequency = data[feature].value_counts(dropna=False)
    data[count_column] = data[feature].map(value_frequency)
    with pd.HDFStore('./hdf/engineering.h5') as store:
        store.put(f'numerical/{feature}_count_full', data[[count_column]])
        
    
# for feature in ['id_34', 'id_36']:
#         # Count encoded for both train and test
#         data[feature + '_count_full'] = data[feature].map(data[feature].value_counts(dropna=False))
        
# for feature in ['id_01', 'id_31', 'id_33', 'id_35', 'id_36']:
#     if feature in useful_features:
#         # Count encoded separately for train and test
#         train[feature + '_count_dist'] = train[feature].map(train[feature].value_counts(dropna=False))
#         test[feature + '_count_dist'] = test[feature].map(test[feature].value_counts(dropna=False))

## Label-encode categorical features

In [None]:
from functions import LabelEncoderNan
# Label-encode every registered categorical column that is present in the frame.
# The encoder is fitted on, and applied to, the lower-cased string form of each value.
for col in categorial_features:
    if col not in data.columns:
        continue
    lowered_values = list(data[col].map(lambda x: str(x).lower()).values)
    le = LabelEncoderNan()
    le.fit(lowered_values)
    data[col] = le.transform(lowered_values)

In [None]:
# Snapshot the whole frame before any aggregate features are added.
with pd.HDFStore('./hdf/pre-aggregate.h5') as store:
    store.put('data', data)


In [18]:
from lib.training import train_model_classification
def quick_score2(data, categorial_features, addition=None, categorial_features_in_addition=None, target='isFraud', drop_columns=['Date'], filter_neg_target=True,
                 params=None, n_estimators = 6):
    """Return the mean cross-validation score of an LGB classifier trained on `data`.

    Parameters
    ----------
    data : DataFrame
        Holds the feature columns and the `target` column.
    categorial_features : iterable of str
        Names of the columns to flag as categorical. The caller's object is not modified.
    addition : DataFrame, optional
        Extra columns joined onto the features on the index.
    categorial_features_in_addition : iterable of str, optional
        Names of the categorical columns inside `addition`.
    target : str
        Label column. It is cast to bool and always removed from the features.
    drop_columns : list of str
        Columns removed from the features in addition to `target`.
    filter_neg_target : bool
        When true, only rows with `target >= 0` are used.
    params : dict, optional
        Model parameters forwarded to `train_model_classification`. A copy is used,
        so the caller's dict is left untouched. Defaults to the built-in set below.
    n_estimators : int
        Currently unused: training always runs with n_estimators=600.
        NOTE(review): wiring it through would change the effective default from 600 to 6,
        so decide on the intended value before connecting it.

    Returns
    -------
    The arithmetic mean of `results['scores']` returned by `train_model_classification`.
    """
    if params is None:
        params = {'num_leaves': 500,
                  'min_child_weight': 0.03454472573214212,
                  'feature_fraction': 0.3797454081646243,
                  'bagging_fraction': 0.4181193142567742,
                  'min_data_in_leaf': 106,
                  'objective': 'binary',
                  'max_depth': -1,
                  'learning_rate': 0.1,
                  "boosting_type": "gbdt",
                  "bagging_seed": 11,
                  "metric": 'auc',
                  "verbosity": -1,
                  'reg_alpha': 0.3899927210061127,
                  'reg_lambda': 0.6485237330340494,
                  'random_state': 47,
                  }
    else:
        # Work on a copy: 'categorical_feature' is injected below and must not leak to the caller.
        params = dict(params)

    if filter_neg_target:
        train_subset_ids = data[data[target] >= 0].index
    else:
        train_subset_ids = data.index

    train_rows = data.loc[train_subset_ids]
    X_new = train_rows.drop(list(drop_columns) + [target], axis=1)

    # Private collection of categorical names; extending the caller's list would make it grow
    # with every call.
    categorical_names = set(categorial_features)
    if addition is not None:
        X_new = X_new.join(addition.loc[train_subset_ids])
        if categorial_features_in_addition is not None:
            categorical_names.update(categorial_features_in_addition)

    # Builtin bool replaces the np.bool alias, which newer NumPy releases no longer provide.
    y = train_rows[target].astype(bool)

    folds = KFold(n_splits=5, shuffle=False)

    # Categorical columns are handed over as positional indices into X_new.
    categorical_columns = [c for c, col in enumerate(X_new.columns) if col in categorical_names]
    params['categorical_feature'] = categorical_columns

    results = train_model_classification(X=X_new, X_test=None, y=y, params=params, folds=folds, splits=1,
                                         model_type='lgb', eval_metric='auc', plot_feature_importance=False,
                                         verbose=None, early_stopping_rounds=40, n_estimators=600, averaging='usual',
                                         n_jobs=-1)

    return sum(results['scores']) / len(results['scores'])


In [19]:
# Baseline cross-validation score without any aggregate feature (Date and card_id are excluded).
# The global `score` is read by add_numerical_aggregate to report score deltas.
score = quick_score2(data,categorial_features,drop_columns=['Date','card_id'])
print ("Main score is ", score)

Fold 5 started at Sat Aug 24 20:42:42 2019
CV mean score: 0.9117, std: 0.0000.
Main score is  0.9116769852711911


# Aggregates

In [30]:
def add_numerical_aggregate(df, target_column, group_column, scoring=True, save=True):
    """Build the `target_column / group mean` and `target_column / group std` ratio features.

    For each of the two statistics the value of `target_column` is divided by that statistic
    computed per `group_column` group of `df`. Groups whose statistic is zero or undefined
    yield inf/NaN ratios.

    Parameters
    ----------
    df : DataFrame
        Source frame; it supplies both the values and the groups.
    target_column : str
        Numeric column to normalise. A missing column raises KeyError.
    group_column : str
        Column defining the groups.
    scoring : bool
        When true, each ratio is scored with `quick_score2` and the difference to the
        module-level baseline `score` is appended to './hdf/num_agg_scores.csv'.
    save : bool
        When true, each ratio is stored in './hdf/engineering.h5' under
        'numerical/<feature name>'.

    Returns
    -------
    DataFrame indexed like `df` with the columns
    '<target>_to_mean_<group>' and '<target>_to_std_<group>'.
    """
    print(target_column, group_column)
    values = df[target_column]
    # Group the frame that was passed in, not the notebook-level `data` global.
    grouped = df.groupby([group_column])[target_column]

    ratio_frames = []
    for statistic in ('mean', 'std'):
        feature_name = f'{target_column}_to_{statistic}_{group_column}'
        ratio = pd.DataFrame()
        ratio[feature_name] = values / grouped.transform(statistic)
        if scoring:
            new_score = quick_score2(df,categorial_features,addition=ratio,categorial_features_in_addition=[])
            delta_score = new_score - score
            with open('./hdf/num_agg_scores.csv','a') as f:
                f.write(f'{feature_name}, {delta_score}\n')
        if save:
            # Each ratio goes under its own key; the mean ratio used to be written under the std key.
            with pd.HDFStore('./hdf/engineering.h5') as store:
                store[f'numerical/{feature_name}'] = ratio[[feature_name]]
        ratio_frames.append(ratio)

    to_mean, to_std = ratio_frames
    return to_mean.join(to_std)
    
# add_numerical_aggregate(data,'TransactionAmt', 'card_id')
# add_numerical_aggregate(data,'dist1', 'card_id')
# add_numerical_aggregate(data,'_Hours', 'card_id')





In [27]:
# Score and store the card_id aggregates for every numerical feature from 'V1' onwards.
started = False
for num_feature in numerical_features:
    # Everything listed before 'V1' is skipped.
    started = started or num_feature == 'V1'
    if not started:
        continue
    for cat_feature in ['card_id']:
        try:
            add_numerical_aggregate(data, num_feature, cat_feature)
        except Exception as ex:
            # Best effort: report the failure and carry on with the next feature.
            print('Exception', ex)
            

V1 card_id
Exception 'V1'
V2 card_id
Exception 'V2'
V3 card_id
Fold 5 started at Sat Aug 24 22:28:14 2019
CV mean score: 0.9134, std: 0.0000.
Fold 5 started at Sat Aug 24 22:30:34 2019
CV mean score: 0.9130, std: 0.0000.
V4 card_id
Fold 5 started at Sat Aug 24 22:32:34 2019
CV mean score: 0.9120, std: 0.0000.
Fold 5 started at Sat Aug 24 22:34:48 2019
CV mean score: 0.9118, std: 0.0000.
V5 card_id
Fold 5 started at Sat Aug 24 22:37:09 2019
CV mean score: 0.9123, std: 0.0000.
Fold 5 started at Sat Aug 24 22:39:44 2019
CV mean score: 0.9140, std: 0.0000.
V6 card_id
Fold 5 started at Sat Aug 24 22:41:35 2019
CV mean score: 0.9122, std: 0.0000.
Fold 5 started at Sat Aug 24 22:43:19 2019
CV mean score: 0.9116, std: 0.0000.
V7 card_id
Fold 5 started at Sat Aug 24 22:45:21 2019
CV mean score: 0.9124, std: 0.0000.
Fold 5 started at Sat Aug 24 22:47:11 2019
CV mean score: 0.9136, std: 0.0000.
V8 card_id
Fold 5 started at Sat Aug 24 22:49:24 2019
CV mean score: 0.9132, std: 0.0000.
Fold 5 starte

Unable to open/create file './hdf/engineering.h5'
V94 card_id
Fold 5 started at Sun Aug 25 02:42:42 2019
CV mean score: 0.9130, std: 0.0000.
Exception HDF5 error back trace

  File "C:\ci\hdf5_1545244154871\work\src\H5F.c", line 509, in H5Fopen
    unable to open file
  File "C:\ci\hdf5_1545244154871\work\src\H5Fint.c", line 1400, in H5F__open
    unable to open file
  File "C:\ci\hdf5_1545244154871\work\src\H5Fint.c", line 1700, in H5F_open
    unable to read superblock
  File "C:\ci\hdf5_1545244154871\work\src\H5Fsuper.c", line 623, in H5F__super_read
    truncated file: eof = 1660641724, sblock->base_addr = 0, stored_eof = 1669419572

End of HDF5 error back trace

Unable to open/create file './hdf/engineering.h5'
V95 card_id
Fold 5 started at Sun Aug 25 02:44:37 2019
CV mean score: 0.9137, std: 0.0000.
Exception HDF5 error back trace

  File "C:\ci\hdf5_1545244154871\work\src\H5F.c", line 509, in H5Fopen
    unable to open file
  File "C:\ci\hdf5_1545244154871\work\src\H5Fint.c", li

Unable to open/create file './hdf/engineering.h5'
V132 card_id
Exception 'V132'
V133 card_id
Exception 'V133'
V134 card_id
Exception 'V134'
V135 card_id
Exception 'V135'
V136 card_id
Exception 'V136'
V137 card_id
Exception 'V137'
V138 card_id
Fold 5 started at Sun Aug 25 03:07:10 2019
CV mean score: 0.9136, std: 0.0000.
Exception HDF5 error back trace

  File "C:\ci\hdf5_1545244154871\work\src\H5F.c", line 509, in H5Fopen
    unable to open file
  File "C:\ci\hdf5_1545244154871\work\src\H5Fint.c", line 1400, in H5F__open
    unable to open file
  File "C:\ci\hdf5_1545244154871\work\src\H5Fint.c", line 1700, in H5F_open
    unable to read superblock
  File "C:\ci\hdf5_1545244154871\work\src\H5Fsuper.c", line 623, in H5F__super_read
    truncated file: eof = 1660641724, sblock->base_addr = 0, stored_eof = 1669419572

End of HDF5 error back trace

Unable to open/create file './hdf/engineering.h5'
V139 card_id
Fold 5 started at Sun Aug 25 03:09:03 2019
CV mean score: 0.9139, std: 0.0000.
E

Unable to open/create file './hdf/engineering.h5'
V155 card_id
Exception 'V155'
V156 card_id
Fold 5 started at Sun Aug 25 03:36:02 2019
CV mean score: 0.9152, std: 0.0000.
Exception HDF5 error back trace

  File "C:\ci\hdf5_1545244154871\work\src\H5F.c", line 509, in H5Fopen
    unable to open file
  File "C:\ci\hdf5_1545244154871\work\src\H5Fint.c", line 1400, in H5F__open
    unable to open file
  File "C:\ci\hdf5_1545244154871\work\src\H5Fint.c", line 1700, in H5F_open
    unable to read superblock
  File "C:\ci\hdf5_1545244154871\work\src\H5Fsuper.c", line 623, in H5F__super_read
    truncated file: eof = 1660641724, sblock->base_addr = 0, stored_eof = 1669419572

End of HDF5 error back trace

Unable to open/create file './hdf/engineering.h5'
V157 card_id
Exception 'V157'
V158 card_id
Fold 5 started at Sun Aug 25 03:37:54 2019
CV mean score: 0.9131, std: 0.0000.
Exception HDF5 error back trace

  File "C:\ci\hdf5_1545244154871\work\src\H5F.c", line 509, in H5Fopen
    unable to ope

Unable to open/create file './hdf/engineering.h5'
V170 card_id
Fold 5 started at Sun Aug 25 04:05:19 2019
CV mean score: 0.9124, std: 0.0000.
Exception HDF5 error back trace

  File "C:\ci\hdf5_1545244154871\work\src\H5F.c", line 509, in H5Fopen
    unable to open file
  File "C:\ci\hdf5_1545244154871\work\src\H5Fint.c", line 1400, in H5F__open
    unable to open file
  File "C:\ci\hdf5_1545244154871\work\src\H5Fint.c", line 1700, in H5F_open
    unable to read superblock
  File "C:\ci\hdf5_1545244154871\work\src\H5Fsuper.c", line 623, in H5F__super_read
    truncated file: eof = 1660641724, sblock->base_addr = 0, stored_eof = 1669419572

End of HDF5 error back trace

Unable to open/create file './hdf/engineering.h5'
V171 card_id
Fold 5 started at Sun Aug 25 04:07:02 2019
CV mean score: 0.9147, std: 0.0000.
Exception HDF5 error back trace

  File "C:\ci\hdf5_1545244154871\work\src\H5F.c", line 509, in H5Fopen
    unable to open file
  File "C:\ci\hdf5_1545244154871\work\src\H5Fint.c", 

Unable to open/create file './hdf/engineering.h5'
V188 card_id
Fold 5 started at Sun Aug 25 04:34:24 2019
CV mean score: 0.9127, std: 0.0000.
Exception HDF5 error back trace

  File "C:\ci\hdf5_1545244154871\work\src\H5F.c", line 509, in H5Fopen
    unable to open file
  File "C:\ci\hdf5_1545244154871\work\src\H5Fint.c", line 1400, in H5F__open
    unable to open file
  File "C:\ci\hdf5_1545244154871\work\src\H5Fint.c", line 1700, in H5F_open
    unable to read superblock
  File "C:\ci\hdf5_1545244154871\work\src\H5Fsuper.c", line 623, in H5F__super_read
    truncated file: eof = 1660641724, sblock->base_addr = 0, stored_eof = 1669419572

End of HDF5 error back trace

Unable to open/create file './hdf/engineering.h5'
V189 card_id
Fold 5 started at Sun Aug 25 04:36:15 2019
CV mean score: 0.9161, std: 0.0000.
Exception HDF5 error back trace

  File "C:\ci\hdf5_1545244154871\work\src\H5F.c", line 509, in H5Fopen
    unable to open file
  File "C:\ci\hdf5_1545244154871\work\src\H5Fint.c", 

Unable to open/create file './hdf/engineering.h5'
V208 card_id
Fold 5 started at Sun Aug 25 05:00:04 2019
CV mean score: 0.9123, std: 0.0000.
Exception HDF5 error back trace

  File "C:\ci\hdf5_1545244154871\work\src\H5F.c", line 509, in H5Fopen
    unable to open file
  File "C:\ci\hdf5_1545244154871\work\src\H5Fint.c", line 1400, in H5F__open
    unable to open file
  File "C:\ci\hdf5_1545244154871\work\src\H5Fint.c", line 1700, in H5F_open
    unable to read superblock
  File "C:\ci\hdf5_1545244154871\work\src\H5Fsuper.c", line 623, in H5F__super_read
    truncated file: eof = 1660641724, sblock->base_addr = 0, stored_eof = 1669419572

End of HDF5 error back trace

Unable to open/create file './hdf/engineering.h5'
V209 card_id
Fold 5 started at Sun Aug 25 05:03:05 2019
CV mean score: 0.9130, std: 0.0000.
Exception HDF5 error back trace

  File "C:\ci\hdf5_1545244154871\work\src\H5F.c", line 509, in H5Fopen
    unable to open file
  File "C:\ci\hdf5_1545244154871\work\src\H5Fint.c", 

Unable to open/create file './hdf/engineering.h5'
V239 card_id
Fold 5 started at Sun Aug 25 05:55:10 2019
CV mean score: 0.9112, std: 0.0000.
Exception HDF5 error back trace

  File "C:\ci\hdf5_1545244154871\work\src\H5F.c", line 509, in H5Fopen
    unable to open file
  File "C:\ci\hdf5_1545244154871\work\src\H5Fint.c", line 1400, in H5F__open
    unable to open file
  File "C:\ci\hdf5_1545244154871\work\src\H5Fint.c", line 1700, in H5F_open
    unable to read superblock
  File "C:\ci\hdf5_1545244154871\work\src\H5Fsuper.c", line 623, in H5F__super_read
    truncated file: eof = 1660641724, sblock->base_addr = 0, stored_eof = 1669419572

End of HDF5 error back trace

Unable to open/create file './hdf/engineering.h5'
V240 card_id
Exception 'V240'
V241 card_id
Exception 'V241'
V242 card_id
Fold 5 started at Sun Aug 25 05:57:19 2019
CV mean score: 0.9138, std: 0.0000.
Exception HDF5 error back trace

  File "C:\ci\hdf5_1545244154871\work\src\H5F.c", line 509, in H5Fopen
    unable to ope

Unable to open/create file './hdf/engineering.h5'
V271 card_id
Fold 5 started at Sun Aug 25 06:49:44 2019
CV mean score: 0.9133, std: 0.0000.
Exception HDF5 error back trace

  File "C:\ci\hdf5_1545244154871\work\src\H5F.c", line 509, in H5Fopen
    unable to open file
  File "C:\ci\hdf5_1545244154871\work\src\H5Fint.c", line 1400, in H5F__open
    unable to open file
  File "C:\ci\hdf5_1545244154871\work\src\H5Fint.c", line 1700, in H5F_open
    unable to read superblock
  File "C:\ci\hdf5_1545244154871\work\src\H5Fsuper.c", line 623, in H5F__super_read
    truncated file: eof = 1660641724, sblock->base_addr = 0, stored_eof = 1669419572

End of HDF5 error back trace

Unable to open/create file './hdf/engineering.h5'
V272 card_id
Fold 5 started at Sun Aug 25 06:52:21 2019
CV mean score: 0.9129, std: 0.0000.
Exception HDF5 error back trace

  File "C:\ci\hdf5_1545244154871\work\src\H5F.c", line 509, in H5Fopen
    unable to open file
  File "C:\ci\hdf5_1545244154871\work\src\H5Fint.c", 

Unable to open/create file './hdf/engineering.h5'
V284 card_id
Exception 'V284'
V285 card_id
Fold 5 started at Sun Aug 25 07:15:52 2019
CV mean score: 0.9122, std: 0.0000.
Exception HDF5 error back trace

  File "C:\ci\hdf5_1545244154871\work\src\H5F.c", line 509, in H5Fopen
    unable to open file
  File "C:\ci\hdf5_1545244154871\work\src\H5Fint.c", line 1400, in H5F__open
    unable to open file
  File "C:\ci\hdf5_1545244154871\work\src\H5Fint.c", line 1700, in H5F_open
    unable to read superblock
  File "C:\ci\hdf5_1545244154871\work\src\H5Fsuper.c", line 623, in H5F__super_read
    truncated file: eof = 1660641724, sblock->base_addr = 0, stored_eof = 1669419572

End of HDF5 error back trace

Unable to open/create file './hdf/engineering.h5'
V286 card_id
Exception 'V286'
V287 card_id
Fold 5 started at Sun Aug 25 07:19:42 2019
CV mean score: 0.9124, std: 0.0000.
Exception HDF5 error back trace

  File "C:\ci\hdf5_1545244154871\work\src\H5F.c", line 509, in H5Fopen
    unable to ope

Unable to open/create file './hdf/engineering.h5'
V309 card_id
Exception 'V309'
V310 card_id
Fold 5 started at Sun Aug 25 07:44:33 2019
CV mean score: 0.9134, std: 0.0000.
Exception HDF5 error back trace

  File "C:\ci\hdf5_1545244154871\work\src\H5F.c", line 509, in H5Fopen
    unable to open file
  File "C:\ci\hdf5_1545244154871\work\src\H5Fint.c", line 1400, in H5F__open
    unable to open file
  File "C:\ci\hdf5_1545244154871\work\src\H5Fint.c", line 1700, in H5F_open
    unable to read superblock
  File "C:\ci\hdf5_1545244154871\work\src\H5Fsuper.c", line 623, in H5F__super_read
    truncated file: eof = 1660641724, sblock->base_addr = 0, stored_eof = 1669419572

End of HDF5 error back trace

Unable to open/create file './hdf/engineering.h5'
V311 card_id
Exception 'V311'
V312 card_id
Fold 5 started at Sun Aug 25 07:47:01 2019
CV mean score: 0.9132, std: 0.0000.
Exception HDF5 error back trace

  File "C:\ci\hdf5_1545244154871\work\src\H5F.c", line 509, in H5Fopen
    unable to ope

Unable to open/create file './hdf/engineering.h5'
V332 card_id
Fold 5 started at Sun Aug 25 08:12:34 2019
CV mean score: 0.9136, std: 0.0000.
Exception HDF5 error back trace

  File "C:\ci\hdf5_1545244154871\work\src\H5F.c", line 509, in H5Fopen
    unable to open file
  File "C:\ci\hdf5_1545244154871\work\src\H5Fint.c", line 1400, in H5F__open
    unable to open file
  File "C:\ci\hdf5_1545244154871\work\src\H5Fint.c", line 1700, in H5F_open
    unable to read superblock
  File "C:\ci\hdf5_1545244154871\work\src\H5Fsuper.c", line 623, in H5F__super_read
    truncated file: eof = 1660641724, sblock->base_addr = 0, stored_eof = 1669419572

End of HDF5 error back trace

Unable to open/create file './hdf/engineering.h5'
V333 card_id
Fold 5 started at Sun Aug 25 08:14:59 2019
CV mean score: 0.9107, std: 0.0000.
Exception HDF5 error back trace

  File "C:\ci\hdf5_1545244154871\work\src\H5F.c", line 509, in H5Fopen
    unable to open file
  File "C:\ci\hdf5_1545244154871\work\src\H5Fint.c", 

Unable to open/create file './hdf/engineering.h5'
D9 card_id
Fold 5 started at Sun Aug 25 08:39:56 2019
CV mean score: 0.9121, std: 0.0000.
Exception HDF5 error back trace

  File "C:\ci\hdf5_1545244154871\work\src\H5F.c", line 509, in H5Fopen
    unable to open file
  File "C:\ci\hdf5_1545244154871\work\src\H5Fint.c", line 1400, in H5F__open
    unable to open file
  File "C:\ci\hdf5_1545244154871\work\src\H5Fint.c", line 1700, in H5F_open
    unable to read superblock
  File "C:\ci\hdf5_1545244154871\work\src\H5Fsuper.c", line 623, in H5F__super_read
    truncated file: eof = 1660641724, sblock->base_addr = 0, stored_eof = 1669419572

End of HDF5 error back trace

Unable to open/create file './hdf/engineering.h5'
D10 card_id
Fold 5 started at Sun Aug 25 08:42:32 2019
CV mean score: 0.9129, std: 0.0000.
Exception HDF5 error back trace

  File "C:\ci\hdf5_1545244154871\work\src\H5F.c", line 509, in H5Fopen
    unable to open file
  File "C:\ci\hdf5_1545244154871\work\src\H5Fint.c", lin

Fold 5 started at Sun Aug 25 09:07:28 2019
CV mean score: 0.9127, std: 0.0000.
Exception HDF5 error back trace

  File "C:\ci\hdf5_1545244154871\work\src\H5F.c", line 509, in H5Fopen
    unable to open file
  File "C:\ci\hdf5_1545244154871\work\src\H5Fint.c", line 1400, in H5F__open
    unable to open file
  File "C:\ci\hdf5_1545244154871\work\src\H5Fint.c", line 1700, in H5F_open
    unable to read superblock
  File "C:\ci\hdf5_1545244154871\work\src\H5Fsuper.c", line 623, in H5F__super_read
    truncated file: eof = 1660641724, sblock->base_addr = 0, stored_eof = 1669419572

End of HDF5 error back trace

Unable to open/create file './hdf/engineering.h5'
C8 card_id
Fold 5 started at Sun Aug 25 09:09:39 2019
CV mean score: 0.9115, std: 0.0000.
Exception HDF5 error back trace

  File "C:\ci\hdf5_1545244154871\work\src\H5F.c", line 509, in H5Fopen
    unable to open file
  File "C:\ci\hdf5_1545244154871\work\src\H5Fint.c", line 1400, in H5F__open
    unable to open file
  File "C:\ci\hdf

Unable to open/create file './hdf/engineering.h5'
_Weekdays card_id
Fold 5 started at Sun Aug 25 09:35:34 2019
CV mean score: 0.9129, std: 0.0000.
Exception HDF5 error back trace

  File "C:\ci\hdf5_1545244154871\work\src\H5F.c", line 509, in H5Fopen
    unable to open file
  File "C:\ci\hdf5_1545244154871\work\src\H5Fint.c", line 1400, in H5F__open
    unable to open file
  File "C:\ci\hdf5_1545244154871\work\src\H5Fint.c", line 1700, in H5F_open
    unable to read superblock
  File "C:\ci\hdf5_1545244154871\work\src\H5Fsuper.c", line 623, in H5F__super_read
    truncated file: eof = 1660641724, sblock->base_addr = 0, stored_eof = 1669419572

End of HDF5 error back trace

Unable to open/create file './hdf/engineering.h5'
_Hours card_id
Fold 5 started at Sun Aug 25 09:37:31 2019
CV mean score: 0.9114, std: 0.0000.
Exception HDF5 error back trace

  File "C:\ci\hdf5_1545244154871\work\src\H5F.c", line 509, in H5Fopen
    unable to open file
  File "C:\ci\hdf5_1545244154871\work\src\H5Fi

In [41]:
# Collect the card_id aggregates in memory (no scoring, no HDF write) from 'V266' onwards.
num_aggs = []
started = False
for i, num_feature in enumerate(numerical_features):
    # Everything listed before 'V266' is skipped.
    started = started or num_feature == 'V266'
    if not started:
        continue
    for cat_feature in ['card_id']:
        try:
            num_aggs.append(
                add_numerical_aggregate(data, num_feature, cat_feature, scoring=False, save=False)
            )
        except Exception as ex:
            # Best effort: report the failure and carry on with the next feature.
            print('Exception', ex)
            

V266 card_id
V267 card_id
V268 card_id
V269 card_id
Exception 'V269'
V270 card_id
V271 card_id
V272 card_id
V273 card_id
V274 card_id
V275 card_id
V276 card_id
V277 card_id
V278 card_id
V279 card_id
V280 card_id
V281 card_id
Exception 'V281'
V282 card_id
V283 card_id
V284 card_id
Exception 'V284'
V285 card_id
V286 card_id
Exception 'V286'
V287 card_id
V288 card_id
V289 card_id
V290 card_id
Exception 'V290'
V291 card_id
V292 card_id
V293 card_id
Exception 'V293'
V294 card_id
V295 card_id
Exception 'V295'
V296 card_id
Exception 'V296'
V297 card_id
Exception 'V297'
V298 card_id
Exception 'V298'
V299 card_id
Exception 'V299'
V300 card_id
Exception 'V300'
V301 card_id
Exception 'V301'
V302 card_id
Exception 'V302'
V303 card_id
V304 card_id
V305 card_id
Exception 'V305'
V306 card_id
V307 card_id
V308 card_id
V309 card_id
Exception 'V309'
V310 card_id
V311 card_id
Exception 'V311'
V312 card_id
V313 card_id
V314 card_id
V315 card_id
V316 card_id
Exception 'V316'
V317 card_id
V318 card_id
Excep

In [42]:
# Put the collected per-feature frames side by side (column-wise concatenation).
num_agg2 = pd.concat(num_aggs,axis=1)

In [44]:
# NOTE(review): `num_agg` is not assigned in any cell shown above this one, so this join relies
# on state left in the kernel - confirm where it is created before a Restart & Run All.
num_agg = num_agg.join(num_agg2)

In [54]:
# Drop the name bound to the list of per-feature frames.
del num_aggs

In [55]:
# Drop the name bound to the intermediate concatenated frame.
del num_agg2

In [57]:
# Force a garbage-collection pass after the deletions; the cell output is the value returned by collect().
import gc
gc.collect()

17687

In [45]:
# Inspect the first rows of the combined aggregate frame.
num_agg.head()

Unnamed: 0_level_0,id_01_to_mean_card_id,id_01_to_std_card_id,id_02_to_mean_card_id,id_02_to_std_card_id,id_03_to_mean_card_id,id_03_to_std_card_id,id_04_to_mean_card_id,id_04_to_std_card_id,id_05_to_mean_card_id,id_05_to_std_card_id,...,_Days_to_mean_card_id,_Days_to_std_card_id,BrowserAge_to_mean_card_id,BrowserAge_to_std_card_id,BrowserVersion_to_mean_card_id,BrowserVersion_to_std_card_id,screen_height_to_mean_card_id,screen_height_to_std_card_id,screen_width_to_mean_card_id,screen_width_to_std_card_id
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987000,,,,,,,,,,,...,0.142857142857143,0.187977895099228,,,,,,,,
2987001,,,,,,,,,,,...,0.129283489096573,0.230604616442685,,,,,,,,
2987002,,,,,,,,,,,...,0.128579651748453,0.21496312607867,,,,,,,,
2987003,,,,,,,,,,,...,0.131019268553307,0.224939409060347,,,,,,,,
2987004,-0.0,0.0,0.73072261919263,0.924500429290362,,,,,,,...,0.126050420168067,0.222674539986069,,,,,1.034923315048218,8.318767547607422,1.225240707397461,7.904130458831787


In [58]:
# Cast every column of the aggregate frame to float32.
num_agg = num_agg.astype(np.float32)

In [None]:
# Persist the aggregate frame with blosc compression.
# NOTE(review): the target is an absolute drive path; it only resolves on a machine that has an e: volume.
with pd.HDFStore('e:/numerical_aggregations.h5',complib='blosc:blosclz') as store:
    store['num_agg'] = num_agg
    

0) Treat categorical features carefully: find the very diverse (high-cardinality) ones and unite them

1) Remove correlated features

2) Of those, remove the one that gives the lower Gini

3) TimeKFold instead of StratifiedKFold