In [1]:
import glob

feats = glob.glob('/tmp/working/IEEE_Fraud_Detection/features/*.feather')
feat_re = []
for f in feats:
    f_re = f.replace('/tmp/working/IEEE_Fraud_Detection/features/', '')
    if '_train.feather' in f_re:
        f_re = f_re.replace('_train.feather', '')
        feat_re.append(f_re)

In [3]:
set(feat_re) - set(['transaction_identity_merged',
                            'transaction_amt_to_mean_card1',
                            'transaction_amt_to_mean_card4',
                            'transaction_amt_to_std_card1',
                            'transaction_amt_to_std_card4',
                            'id02_to_mean_card1',
                            'id02_to_mean_card4',
                            'id02_to_std_card1',
                            'id02_to_std_card4',
                            'd15_to_mean_card1',
                            'd15_to_mean_card4',
                            'd15_to_std_card1',
                            'd15_to_std_card4',
                            'd15_to_mean_addr1',
                            'd15_to_std_addr1'])

{'d15_to_mean_addr2', 'd15_to_std_addr2', 'emaildomain_p', 'emaildomain_r'}

In [5]:
print("'run.py' is running...")

import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR, SVR
from sklearn.metrics import mean_absolute_error

import lightgbm as lgb
import xgboost as xgb
import time
import datetime
from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold, GroupKFold, GridSearchCV, train_test_split, TimeSeriesSplit
from sklearn import metrics
from sklearn import linear_model
import gc
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

import eli5
import shap
from IPython.display import HTML
import json

import networkx as nx
import matplotlib.pyplot as plt

import glob

import sys
sys.path.append('/tmp/working/IEEE_Fraud_Detection/')
from functions.functions import load_datasets

folder_path = '/tmp/working/IEEE_Fraud_Detection/data/input/'
sub = pd.read_csv(f'{folder_path}sample_submission.csv')

# featherからデータの読み込み
feats = glob.glob('/tmp/working/IEEE_Fraud_Detection/features/*.feather')
feat_re = []
for f in feats:
    f_re = f.replace('/tmp/working/IEEE_Fraud_Detection/features/', '')
    if '_train.feather' in f_re:
        f_re = f_re.replace('_train.feather', '')
        feat_re.append(f_re)
train, test = load_datasets(feat_re)


# Noneをnp.nanに戻す
train.replace(to_replace=[None], value=np.nan, inplace=True)
test.replace(to_replace=[None], value=np.nan, inplace=True)

train = train.drop(['D15_to_mean_addr2', 'D15_to_std_addr2',
                   'P_emaildomain_1', 'P_emaildomain_2', 'P_emaildomain_3',
                   'R_emaildomain_1', 'R_emaildomain_2', 'R_emaildomain_3'], axis=1)
test = test.drop(['D15_to_mean_addr2', 'D15_to_std_addr2',
                 'P_emaildomain_1', 'P_emaildomain_2', 'P_emaildomain_3',
                   'R_emaildomain_1', 'R_emaildomain_2', 'R_emaildomain_3'], axis=1)

print('Importing features is done')

# 値が1つしかないカラムのチェック
one_value_cols = [col for col in train.columns if train[col].nunique() <= 1]
one_value_cols_test = [col for col in test.columns if test[col].nunique() <= 1]
print('Check the consistence of train and test 1value columns: ', one_value_cols == one_value_cols_test)

# # Let's create some aggregations. There is no logic in them - simply aggregations on top features.
# train['TransactionAmt_to_mean_card1'] = train['TransactionAmt'] / train.groupby(['card1'])['TransactionAmt'].transform('mean')
# train['TransactionAmt_to_mean_card4'] = train['TransactionAmt'] / train.groupby(['card4'])['TransactionAmt'].transform('mean')
# train['TransactionAmt_to_std_card1'] = train['TransactionAmt'] / train.groupby(['card1'])['TransactionAmt'].transform('std')
# train['TransactionAmt_to_std_card4'] = train['TransactionAmt'] / train.groupby(['card4'])['TransactionAmt'].transform('std')

# test['TransactionAmt_to_mean_card1'] = test['TransactionAmt'] / test.groupby(['card1'])['TransactionAmt'].transform('mean')
# test['TransactionAmt_to_mean_card4'] = test['TransactionAmt'] / test.groupby(['card4'])['TransactionAmt'].transform('mean')
# test['TransactionAmt_to_std_card1'] = test['TransactionAmt'] / test.groupby(['card1'])['TransactionAmt'].transform('std')
# test['TransactionAmt_to_std_card4'] = test['TransactionAmt'] / test.groupby(['card4'])['TransactionAmt'].transform('std')

# train['id_02_to_mean_card1'] = train['id_02'] / train.groupby(['card1'])['id_02'].transform('mean')
# train['id_02_to_mean_card4'] = train['id_02'] / train.groupby(['card4'])['id_02'].transform('mean')
# train['id_02_to_std_card1'] = train['id_02'] / train.groupby(['card1'])['id_02'].transform('std')
# train['id_02_to_std_card4'] = train['id_02'] / train.groupby(['card4'])['id_02'].transform('std')

# test['id_02_to_mean_card1'] = test['id_02'] / test.groupby(['card1'])['id_02'].transform('mean')
# test['id_02_to_mean_card4'] = test['id_02'] / test.groupby(['card4'])['id_02'].transform('mean')
# test['id_02_to_std_card1'] = test['id_02'] / test.groupby(['card1'])['id_02'].transform('std')
# test['id_02_to_std_card4'] = test['id_02'] / test.groupby(['card4'])['id_02'].transform('std')

# train['D15_to_mean_card1'] = train['D15'] / train.groupby(['card1'])['D15'].transform('mean')
# train['D15_to_mean_card4'] = train['D15'] / train.groupby(['card4'])['D15'].transform('mean')
# train['D15_to_std_card1'] = train['D15'] / train.groupby(['card1'])['D15'].transform('std')
# train['D15_to_std_card4'] = train['D15'] / train.groupby(['card4'])['D15'].transform('std')

# test['D15_to_mean_card1'] = test['D15'] / test.groupby(['card1'])['D15'].transform('mean')
# test['D15_to_mean_card4'] = test['D15'] / test.groupby(['card4'])['D15'].transform('mean')
# test['D15_to_std_card1'] = test['D15'] / test.groupby(['card1'])['D15'].transform('std')
# test['D15_to_std_card4'] = test['D15'] / test.groupby(['card4'])['D15'].transform('std')

# train['D15_to_mean_addr1'] = train['D15'] / train.groupby(['addr1'])['D15'].transform('mean')
# #train['D15_to_mean_addr2'] = train['D15'] / train.groupby(['addr2'])['D15'].transform('mean')
# train['D15_to_std_addr1'] = train['D15'] / train.groupby(['addr1'])['D15'].transform('std')
# #train['D15_to_std_addr2'] = train['D15'] / train.groupby(['addr2'])['D15'].transform('std')

# test['D15_to_mean_addr1'] = test['D15'] / test.groupby(['addr1'])['D15'].transform('mean')
# #test['D15_to_mean_addr2'] = test['D15'] / test.groupby(['addr2'])['D15'].transform('mean')
# test['D15_to_std_addr1'] = test['D15'] / test.groupby(['addr1'])['D15'].transform('std')
# #test['D15_to_std_addr2'] = test['D15'] / test.groupby(['addr2'])['D15'].transform('std')

train[['P_emaildomain_1', 'P_emaildomain_2', 'P_emaildomain_3']] = train['P_emaildomain'].str.split('.', expand=True)
train[['R_emaildomain_1', 'R_emaildomain_2', 'R_emaildomain_3']] = train['R_emaildomain'].str.split('.', expand=True)
test[['P_emaildomain_1', 'P_emaildomain_2', 'P_emaildomain_3']] = test['P_emaildomain'].str.split('.', expand=True)
test[['R_emaildomain_1', 'R_emaildomain_2', 'R_emaildomain_3']] = test['R_emaildomain'].str.split('.', expand=True)




many_null_cols = [col for col in train.columns if train[col].isnull().sum() / train.shape[0] > 0.9]
many_null_cols_test = [col for col in test.columns if test[col].isnull().sum() / test.shape[0] > 0.9]
big_top_value_cols = [col for col in train.columns if train[col].value_counts(dropna=False, normalize=True).values[0] > 0.9]
big_top_value_cols_test = [col for col in test.columns if test[col].value_counts(dropna=False, normalize=True).values[0] > 0.9]
cols_to_drop = list(set(many_null_cols + many_null_cols_test + big_top_value_cols + big_top_value_cols_test + one_value_cols+ one_value_cols_test))
len(cols_to_drop)
cols_to_drop.remove('isFraud')
train = train.drop(cols_to_drop, axis=1)
test = test.drop(cols_to_drop, axis=1)

'run.py' is running...
Importing features is done
Check the consistence of train and test 1value columns:  False


In [6]:
import gc
gc.collect()

46

In [7]:
train_run = train
test_run = test

In [4]:
# Data loading and overview
folder_path = '/tmp/working/IEEE_Fraud_Detection/data/input/'
train_identity = pd.read_csv(f'{folder_path}train_identity.csv')
train_transaction = pd.read_csv(f'{folder_path}train_transaction.csv')
test_identity = pd.read_csv(f'{folder_path}test_identity.csv')
test_transaction = pd.read_csv(f'{folder_path}test_transaction.csv')
sub = pd.read_csv(f'{folder_path}sample_submission.csv')
# let's combine the data and work with the whole dataset
train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
test = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')


del train_identity, train_transaction, test_identity, test_transaction

one_value_cols = [col for col in train.columns if train[col].nunique() <= 1]
one_value_cols_test = [col for col in test.columns if test[col].nunique() <= 1]
one_value_cols == one_value_cols_test

# # Let's create some aggregations. There is no logic in them - simply aggregations on top features.
train['TransactionAmt_to_mean_card1'] = train['TransactionAmt'] / train.groupby(['card1'])['TransactionAmt'].transform('mean')
train['TransactionAmt_to_mean_card4'] = train['TransactionAmt'] / train.groupby(['card4'])['TransactionAmt'].transform('mean')
train['TransactionAmt_to_std_card1'] = train['TransactionAmt'] / train.groupby(['card1'])['TransactionAmt'].transform('std')
train['TransactionAmt_to_std_card4'] = train['TransactionAmt'] / train.groupby(['card4'])['TransactionAmt'].transform('std')

test['TransactionAmt_to_mean_card1'] = test['TransactionAmt'] / test.groupby(['card1'])['TransactionAmt'].transform('mean')
test['TransactionAmt_to_mean_card4'] = test['TransactionAmt'] / test.groupby(['card4'])['TransactionAmt'].transform('mean')
test['TransactionAmt_to_std_card1'] = test['TransactionAmt'] / test.groupby(['card1'])['TransactionAmt'].transform('std')
test['TransactionAmt_to_std_card4'] = test['TransactionAmt'] / test.groupby(['card4'])['TransactionAmt'].transform('std')

train['id_02_to_mean_card1'] = train['id_02'] / train.groupby(['card1'])['id_02'].transform('mean')
train['id_02_to_mean_card4'] = train['id_02'] / train.groupby(['card4'])['id_02'].transform('mean')
train['id_02_to_std_card1'] = train['id_02'] / train.groupby(['card1'])['id_02'].transform('std')
train['id_02_to_std_card4'] = train['id_02'] / train.groupby(['card4'])['id_02'].transform('std')

test['id_02_to_mean_card1'] = test['id_02'] / test.groupby(['card1'])['id_02'].transform('mean')
test['id_02_to_mean_card4'] = test['id_02'] / test.groupby(['card4'])['id_02'].transform('mean')
test['id_02_to_std_card1'] = test['id_02'] / test.groupby(['card1'])['id_02'].transform('std')
test['id_02_to_std_card4'] = test['id_02'] / test.groupby(['card4'])['id_02'].transform('std')

train['D15_to_mean_card1'] = train['D15'] / train.groupby(['card1'])['D15'].transform('mean')
train['D15_to_mean_card4'] = train['D15'] / train.groupby(['card4'])['D15'].transform('mean')
train['D15_to_std_card1'] = train['D15'] / train.groupby(['card1'])['D15'].transform('std')
train['D15_to_std_card4'] = train['D15'] / train.groupby(['card4'])['D15'].transform('std')

test['D15_to_mean_card1'] = test['D15'] / test.groupby(['card1'])['D15'].transform('mean')
test['D15_to_mean_card4'] = test['D15'] / test.groupby(['card4'])['D15'].transform('mean')
test['D15_to_std_card1'] = test['D15'] / test.groupby(['card1'])['D15'].transform('std')
test['D15_to_std_card4'] = test['D15'] / test.groupby(['card4'])['D15'].transform('std')

train['D15_to_mean_addr1'] = train['D15'] / train.groupby(['addr1'])['D15'].transform('mean')
#train['D15_to_mean_addr2'] = train['D15'] / train.groupby(['addr2'])['D15'].transform('mean')
train['D15_to_std_addr1'] = train['D15'] / train.groupby(['addr1'])['D15'].transform('std')
#train['D15_to_std_addr2'] = train['D15'] / train.groupby(['addr2'])['D15'].transform('std')

test['D15_to_mean_addr1'] = test['D15'] / test.groupby(['addr1'])['D15'].transform('mean')
#test['D15_to_mean_addr2'] = test['D15'] / test.groupby(['addr2'])['D15'].transform('mean')
test['D15_to_std_addr1'] = test['D15'] / test.groupby(['addr1'])['D15'].transform('std')
#test['D15_to_std_addr2'] = test['D15'] / test.groupby(['addr2'])['D15'].transform('std')

train[['P_emaildomain_1', 'P_emaildomain_2', 'P_emaildomain_3']] = train['P_emaildomain'].str.split('.', expand=True)
train[['R_emaildomain_1', 'R_emaildomain_2', 'R_emaildomain_3']] = train['R_emaildomain'].str.split('.', expand=True)
test[['P_emaildomain_1', 'P_emaildomain_2', 'P_emaildomain_3']] = test['P_emaildomain'].str.split('.', expand=True)
test[['R_emaildomain_1', 'R_emaildomain_2', 'R_emaildomain_3']] = test['R_emaildomain'].str.split('.', expand=True)

many_null_cols = [col for col in train.columns if train[col].isnull().sum() / train.shape[0] > 0.9]
many_null_cols_test = [col for col in test.columns if test[col].isnull().sum() / test.shape[0] > 0.9]
big_top_value_cols = [col for col in train.columns if train[col].value_counts(dropna=False, normalize=True).values[0] > 0.9]
big_top_value_cols_test = [col for col in test.columns if test[col].value_counts(dropna=False, normalize=True).values[0] > 0.9]
cols_to_drop = list(set(many_null_cols + many_null_cols_test + big_top_value_cols + big_top_value_cols_test + one_value_cols+ one_value_cols_test))
len(cols_to_drop)
cols_to_drop.remove('isFraud')
train = train.drop(cols_to_drop, axis=1)
test = test.drop(cols_to_drop, axis=1)

In [8]:
train_run = train_run.ix[:,train.columns] 
test_run = test_run.ix[:, test.columns]

In [9]:
train['R_emaildomain_2'].nunique(), train_run['R_emaildomain_2'].nunique()

(9, 9)

In [10]:
train['R_emaildomain_2'].unique(), train_run['R_emaildomain_2'].unique()

(array([nan, 'com', 'net', 'es', 'de', 'edu', 'fr', None, 'co', 'rr'],
       dtype=object),
 array([nan, 'com', 'net', 'es', 'de', 'edu', 'fr', None, 'co', 'rr'],
       dtype=object))

In [11]:
set(train['R_emaildomain_2'].unique())-set(train_run['R_emaildomain_2'].unique()), set(train_run['R_emaildomain_2'].unique())-set(train['R_emaildomain_2'].unique())

(set(), set())

In [12]:
set(train['DeviceInfo'].unique())-set(train_run['DeviceInfo'].unique()), set(train_run['DeviceInfo'].unique())-set(train['DeviceInfo'].unique())

(set(), set())

In [13]:
train.head()

Unnamed: 0,TransactionAmt_to_std_card4,D15_to_std_card1,D15_to_std_addr1,id_02_to_mean_card1,TransactionAmt_to_mean_card4,D15_to_mean_card4,id_02_to_std_card1,D15_to_mean_addr1,D15_to_mean_card1,id_02_to_std_card4,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,P_emaildomain,R_emaildomain,C1,C2,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,...,V338,V339,id_01,id_02,id_03,id_04,id_05,id_06,id_09,id_10,id_11,id_12,id_13,id_14,id_15,id_16,id_17,id_19,id_20,id_28,id_29,id_30,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo,id_02_to_mean_card4,TransactionAmt_to_mean_card1,D15_to_std_card4,TransactionAmt_to_std_card1,P_emaildomain_1,P_emaildomain_2,R_emaildomain_1,R_emaildomain_2
0,0.170233,0.0,0.0,,0.257761,0.0,,0.0,0.0,,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,315.0,87.0,19.0,,,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.19464,0.0,0.184566,,,,
1,0.114212,0.0,0.0,,0.219054,0.0,,0.0,0.0,,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,87.0,,gmail.com,,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.123777,0.0,0.062995,gmail,com,,
2,0.258544,1.851016,1.486472,,0.44307,1.865915,,1.611525,2.518583,,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,330.0,87.0,287.0,outlook.com,,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.60815,1.541448,0.589241,outlook,com,,
3,0.196917,0.520531,0.576638,,0.377679,0.720057,,0.686169,0.550272,,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,476.0,87.0,,yahoo.com,,2.0,5.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,1.0,0.0,25.0,1.0,112.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.405133,0.558392,0.259447,yahoo,com,,
4,0.196917,,,0.764773,0.377679,,1.753301,,,0.425884,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,420.0,87.0,,gmail.com,,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,70787.0,,,,,,,100.0,NotFound,,-480.0,New,NotFound,166.0,542.0,144.0,New,NotFound,Android 7.0,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M,0.373295,0.515612,,0.882933,gmail,com,,


In [14]:
train_run.head()

Unnamed: 0,TransactionAmt_to_std_card4,D15_to_std_card1,D15_to_std_addr1,id_02_to_mean_card1,TransactionAmt_to_mean_card4,D15_to_mean_card4,id_02_to_std_card1,D15_to_mean_addr1,D15_to_mean_card1,id_02_to_std_card4,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,P_emaildomain,R_emaildomain,C1,C2,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,...,V338,V339,id_01,id_02,id_03,id_04,id_05,id_06,id_09,id_10,id_11,id_12,id_13,id_14,id_15,id_16,id_17,id_19,id_20,id_28,id_29,id_30,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo,id_02_to_mean_card4,TransactionAmt_to_mean_card1,D15_to_std_card4,TransactionAmt_to_std_card1,P_emaildomain_1,P_emaildomain_2,R_emaildomain_1,R_emaildomain_2
0,0.170233,0.0,0.0,,0.257761,0.0,,0.0,0.0,,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,315.0,87.0,19.0,,,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.19464,0.0,0.184566,,,,
1,0.114212,0.0,0.0,,0.219054,0.0,,0.0,0.0,,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,87.0,,gmail.com,,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.123777,0.0,0.062995,gmail,com,,
2,0.258544,1.851016,1.486472,,0.44307,1.865915,,1.611525,2.518583,,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,330.0,87.0,287.0,outlook.com,,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.60815,1.541448,0.589241,outlook,com,,
3,0.196917,0.520531,0.576638,,0.377679,0.720057,,0.686169,0.550272,,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,476.0,87.0,,yahoo.com,,2.0,5.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,1.0,0.0,25.0,1.0,112.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.405133,0.558392,0.259447,yahoo,com,,
4,0.196917,,,0.764773,0.377679,,1.753301,,,0.425884,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,420.0,87.0,,gmail.com,,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,70787.0,,,,,,,100.0,NotFound,,-480.0,New,NotFound,166.0,542.0,144.0,New,NotFound,Android 7.0,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M,0.373295,0.515612,,0.882933,gmail,com,,


In [15]:
test.equals(test_run)

True

In [16]:
train.equals(train_run)

True

In [44]:
train_run['DeviceInfo'].nunique()

1787

In [45]:
train['DeviceInfo'].nunique()

1787

In [48]:
train['DeviceInfo'].min()

0

In [49]:
train_run['DeviceInfo'].min()

0

In [22]:
cat_cols = ['id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29',
            'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo', 'ProductCD', 'card4', 'card6', 'M4','P_emaildomain',
            'R_emaildomain', 'card1', 'card2', 'card3',  'card5', 'addr1', 'addr2', 'M1', 'M2', 'M3', 'M5', 'M6', 'M7', 'M8', 'M9',
            'P_emaildomain_1', 'P_emaildomain_2', 'P_emaildomain_3', 'R_emaildomain_1', 'R_emaildomain_2', 'R_emaildomain_3']
for col in cat_cols:
    if col in train.columns:
        le = LabelEncoder()
        le.fit(list(train[col].astype(str).values) + list(test[col].astype(str).values))
        train[col] = le.transform(list(train[col].astype(str).values))
        test[col] = le.transform(list(test[col].astype(str).values))

In [23]:
for col in cat_cols:
    if col in train_run.columns:
        le = LabelEncoder()
        le.fit(list(train_run[col].astype(str).values) + list(test_run[col].astype(str).values))
        train_run[col] = le.transform(list(train_run[col].astype(str).values))
        test_run[col] = le.transform(list(test_run[col].astype(str).values))

In [55]:
train.head() 

Unnamed: 0,TransactionAmt_to_std_card4,D15_to_std_card1,D15_to_std_addr1,id_02_to_mean_card1,TransactionAmt_to_mean_card4,D15_to_mean_card4,id_02_to_std_card1,D15_to_mean_addr1,D15_to_mean_card1,id_02_to_std_card4,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,P_emaildomain,R_emaildomain,C1,C2,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,...,V338,V339,id_01,id_02,id_03,id_04,id_05,id_06,id_09,id_10,id_11,id_12,id_13,id_14,id_15,id_16,id_17,id_19,id_20,id_28,id_29,id_30,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo,id_02_to_mean_card4,TransactionAmt_to_mean_card1,D15_to_std_card4,TransactionAmt_to_std_card1,P_emaildomain_1,P_emaildomain_2,R_emaildomain_1,R_emaildomain_2
0,0.170233,0.0,0.0,,0.257761,0.0,,0.0,0.0,,2987000,0,86400,68.5,4,4248,501,50,1,42,1,215,80,19.0,32,32,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,...,,,,,,,,,,,,2,55,28,3,2,127,568,547,2,2,86,136,6,461,4,2,2,2,2,2,2740,,0.19464,0.0,0.184566,25,7,25,7
1,0.114212,0.0,0.0,,0.219054,0.0,,0.0,0.0,,2987001,0,86401,29.0,4,9979,304,50,2,2,1,225,80,,16,32,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,,,,,,,,,,,,2,55,28,3,2,127,568,547,2,2,86,136,6,461,4,2,2,2,2,2,2740,,0.123777,0.0,0.062995,15,2,25,7
2,0.258544,1.851016,1.486472,,0.44307,1.865915,,1.611525,2.518583,,2987002,0,86469,59.0,4,11850,390,50,4,66,2,230,80,287.0,36,32,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,,,,,,,,,,,,2,55,28,3,2,127,568,547,2,2,86,136,6,461,4,2,2,2,2,2,2740,,0.60815,1.541448,0.589241,28,2,25,7
3,0.196917,0.520531,0.576638,,0.377679,0.720057,,0.686169,0.550272,,2987003,0,86499,50.0,4,8796,467,50,2,17,2,376,80,,55,32,2.0,5.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,1.0,0.0,25.0,1.0,112.0,...,,,,,,,,,,,,2,55,28,3,2,127,568,547,2,2,86,136,6,461,4,2,2,2,2,2,2740,,0.405133,0.558392,0.259447,44,2,25,7
4,0.196917,,,0.764773,0.377679,,1.753301,,,0.425884,2987004,0,86506,50.0,1,11687,414,50,2,2,1,320,80,,16,32,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,70787.0,,,,,,,100.0,1,55,7,1,1,65,438,44,1,1,7,162,3,268,3,1,0,1,1,1,1565,0.373295,0.515612,,0.882933,15,2,25,7


In [56]:
train_run.head()

Unnamed: 0,TransactionAmt_to_std_card4,D15_to_std_card1,D15_to_std_addr1,id_02_to_mean_card1,TransactionAmt_to_mean_card4,D15_to_mean_card4,id_02_to_std_card1,D15_to_mean_addr1,D15_to_mean_card1,id_02_to_std_card4,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,P_emaildomain,R_emaildomain,C1,C2,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,...,V338,V339,id_01,id_02,id_03,id_04,id_05,id_06,id_09,id_10,id_11,id_12,id_13,id_14,id_15,id_16,id_17,id_19,id_20,id_28,id_29,id_30,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo,id_02_to_mean_card4,TransactionAmt_to_mean_card1,D15_to_std_card4,TransactionAmt_to_std_card1,P_emaildomain_1,P_emaildomain_2,R_emaildomain_1,R_emaildomain_2
0,0.170233,0.0,0.0,,0.257761,0.0,,0.0,0.0,,2987000,0,86400,68.5,4,4248,501,50,1,42,1,215,80,19.0,32,32,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,...,,,,,,,,,,,,2,55,28,3,2,127,568,547,2,2,86,136,6,461,4,2,2,2,2,2,2740,,0.19464,0.0,0.184566,25,7,25,7
1,0.114212,0.0,0.0,,0.219054,0.0,,0.0,0.0,,2987001,0,86401,29.0,4,9979,304,50,2,2,1,225,80,,16,32,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,,,,,,,,,,,,2,55,28,3,2,127,568,547,2,2,86,136,6,461,4,2,2,2,2,2,2740,,0.123777,0.0,0.062995,15,2,25,7
2,0.258544,1.851016,1.486472,,0.44307,1.865915,,1.611525,2.518583,,2987002,0,86469,59.0,4,11850,390,50,4,66,2,230,80,287.0,36,32,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,,,,,,,,,,,,2,55,28,3,2,127,568,547,2,2,86,136,6,461,4,2,2,2,2,2,2740,,0.60815,1.541448,0.589241,28,2,25,7
3,0.196917,0.520531,0.576638,,0.377679,0.720057,,0.686169,0.550272,,2987003,0,86499,50.0,4,8796,467,50,2,17,2,376,80,,55,32,2.0,5.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,1.0,0.0,25.0,1.0,112.0,...,,,,,,,,,,,,2,55,28,3,2,127,568,547,2,2,86,136,6,461,4,2,2,2,2,2,2740,,0.405133,0.558392,0.259447,44,2,25,7
4,0.196917,,,0.764773,0.377679,,1.753301,,,0.425884,2987004,0,86506,50.0,1,11687,414,50,2,2,1,320,80,,16,32,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,70787.0,,,,,,,100.0,1,55,7,1,1,65,438,44,1,1,7,162,3,268,3,1,0,1,1,1,1565,0.373295,0.515612,,0.882933,15,2,25,7


In [70]:
col = 'D15_to_std_card1'
(train[col] + train_run[col]).sum()

inf

In [82]:
(train.isnull().sum() != train_run.isnull().sum()).sum()

0

In [83]:
gc.collect()

2609

In [65]:
total = 0
for col in train.columns.values:
    m = (train[col] - train_run[col]).sum()
    total =+ m
total

0

In [64]:
(train.isnull().sum() == train_run.isnull().sum()).sum()

370

In [42]:
type(train.iloc[4, 2]),  type(train_run.iloc[4, 2])

(numpy.float64, numpy.float64)

In [43]:
np.nan == np.nan

False

In [26]:
train.equals(train_run)

True

In [None]:
cat_cols = ['id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29',
            'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo', 'ProductCD', 'card4', 'card6', 'M4','P_emaildomain',
            'R_emaildomain', 'card1', 'card2', 'card3',  'card5', 'addr1', 'addr2', 'M1', 'M2', 'M3', 'M5', 'M6', 'M7', 'M8', 'M9', 'P_emaildomain_1', 'P_emaildomain_2', 'P_emaildomain_3', 'R_emaildomain_1', 'R_emaildomain_2', 'R_emaildomain_3']
cat_cols_set = set(cat_cols)

In [None]:
print(df_set - cat_cols_set)
print(cat_cols_set - df_set)

In [None]:
for col in list(df_set):
    if col in train.columns.values:
        print(col, train[col].nunique())

In [None]:
for col in list(df_set):
    print('---------------------', col, '------------------------------------')
    print(train[col].unique(), train[col].nunique())
    print((train.shape[0] - train[col].isnull().sum())/train[col].nunique(), '\t')

In [None]:
train[['id_33_1', 'id_33_2']] = train['id_33'].str.split('x', expand=True).astype(np.float64)
train[['id_33_1', 'id_33_2']].head()

In [None]:
train['id_33_3'] = train['id_33_1'] * train['id_33_2']
train['id_33_3'].head(20)
train[['id_33_1', 'id_33_2', 'id_33_3']].dtypes

In [None]:
train[list(df_set)].head(30)

In [None]:
train['P_emaildomain'].replace(to_replace='gmail', value='gmail.com', inplace=True)
test['P_emaildomain'].replace(to_replace='gmail', value='gmail.com', inplace=True)

train[['P_emaildomain_1', 'P_emaildomain_2', 'P_emaildomain_3']] = train['P_emaildomain'].str.split('.', expand=True)
train[['R_emaildomain_1', 'R_emaildomain_2', 'R_emaildomain_3']] = train['R_emaildomain'].str.split('.', expand=True)
test[['P_emaildomain_1', 'P_emaildomain_2', 'P_emaildomain_3']] = test['P_emaildomain'].str.split('.', expand=True)
test[['R_emaildomain_1', 'R_emaildomain_2', 'R_emaildomain_3']] = test['R_emaildomain'].str.split('.', expand=True)

In [None]:
col = 'P_emaildomain_2'
print(train[col].unique())
print(train_run[col].unique())

In [None]:
col = 'P_emaildomain_2'
print(train[col].nunique())
print(train_run[col].nunique())

In [None]:
col = 'P_emaildomain_2'
print(set(train_run[col].unique()) - set(train[col].unique()))
print(set(train[col].unique()) - set(train_run[col].unique()))

In [None]:
pemaildomin2list = list(train_run['P_emaildomain_2'])
pemaildominlsit = []
for i, c in enumerate(pemaildomin2list):
    if c is None:
        print(list(train_run[['P_emaildomain']].iloc[i, :]))
#         pemaildominlsit.append(list(train[['P_emaildomain']].iloc[i, :])[0])

set(pemaildominlsit)

In [None]:
count = 0
list_same = []
for col in train_run.columns.values:
    if list(train[col].unique()) == list(train_run[col].unique()):
        print(col)
        if train_run[col].nunique() != train[col].nunique():
            print('nunique is different')
            print('train_run n uneque:', train_run[col].nunique())
            print('train n uneque:', train[col].nunique())
            print('----------------')
#         list_same.append(col)
        count += 1
        
print(count)
# list_same

In [54]:
count = 0
list_notsame = []
for col in train_run.columns.values:
    if set(train[col].unique()) != set(train_run[col].unique()):
        print('----', col, '----')
        print('train_run - train', set(train_run[col].unique()) - set(train[col].unique()))
        print('train - train_run', set(train[col].unique()) - set(train_run[col].unique()))        
        if train_run[col].nunique() != train[col].nunique():
            print('nunique is different')
            print('train_run n uneque:', train_run[col].nunique())
            print('train n uneque:', train[col].nunique())
            list_notsame.append(col)
            count += 1
        print('----------------')
       
        
        
print(count)
list_notsame

---- TransactionAmt_to_std_card4 ----
train_run - train {nan}
train - train_run {nan}
----------------
---- D15_to_std_card1 ----
train_run - train {nan}
train - train_run {nan}
----------------
---- D15_to_std_addr1 ----
train_run - train {nan}
train - train_run {nan}
----------------
---- id_02_to_mean_card1 ----
train_run - train {nan}
train - train_run {nan}
----------------
---- TransactionAmt_to_mean_card4 ----
train_run - train {nan}
train - train_run {nan}
----------------
---- D15_to_mean_card4 ----
train_run - train {nan}
train - train_run {nan}
----------------
---- id_02_to_std_card1 ----
train_run - train {nan}
train - train_run {nan}
----------------
---- D15_to_mean_addr1 ----
train_run - train {nan}
train - train_run {nan}
----------------
---- D15_to_mean_card1 ----
train_run - train {nan}
train - train_run {nan}
----------------
---- id_02_to_std_card4 ----
train_run - train {nan}
train - train_run {nan}
----------------
---- dist1 ----
train_run - train {nan}
train -

In [None]:
col = 'card2'
print(type(train[col].unique()[0]))
print(type(train_run[col].unique()[0]))

In [None]:
train['R_emaildomain_2'].replace(to_replace=[None], value=np.nan, inplace=True)
train['R_emaildomain_2'].unique()

In [None]:
for col in train_run.columns.values:
    train_run[col].replace(to_replace=[None], value=np.nan, inplace=True)

count = 0
train_run = train_run.replace(to_replace=[None], value=np.nan)
list_notsame = []
for col in train_run.columns.values:
    if list(train[col].unique()) != list(train_run[col].unique()):
        print('----', col, '----')
        print('train_run - train', set(train_run[col].unique()) - set(train[col].unique()))
        print('train - train_run', set(train[col].unique()) - set(train_run[col].unique()))        
        if train_run[col].nunique() != train[col].nunique():
            print('nunique is different')
            print('train_run n uneque:', train_run[col].nunique())
            print('train n uneque:', train[col].nunique())

        print('----------------')
        list_notsame.append(col)
        count += 1
        
print(count)
list_notsame

In [None]:
print(train[list_same].isnull().any())

In [None]:
col = 'card2'

diff = list(set(train_run[col].unique()) - set(train[col].unique()))
diff[0]

In [None]:
train_run_kai = train_run.replace(to_replace='None', value=np.nan)

count = 0
list_notsame = []
for col in train_run_kai.columns.values:
    if list(train[col].unique()) != list(train_run_kai[col].unique()):
        print('----', col, '----')
        print('train_run_kai - train', set(train_run__kai[col].unique()) - set(train[col].unique()))
        print('train - train_run_kai', set(train[col].unique()) - set(train_run_kai[col].unique()))        
        if train_run_kai[col].nunique() != train[col].nunique():
            print('nunique is different')
            print('train_run_kai n uneque:', train_run_kai[col].nunique())
            print('train n uneque:', train[col].nunique())

        print('----------------')
        list_notsame.append(col)
        count += 1
        
print(count)

In [None]:
col = 'card2'

set(train[col].unique()) - set(train_run[col].unique())

In [None]:
test.equals(test_run)

In [None]:
count = 0
for col in test_run.columns.values:
    if list(test[col].unique()) != list(test_run[col].unique()):
        count +=1

count

In [None]:
test.shape

In [None]:
test_run.shape

In [None]:
cat_cols = ['id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29',
            'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo', 'ProductCD', 'card4', 'card6', 'M4','P_emaildomain',
            'R_emaildomain', 'card1', 'card2', 'card3',  'card5', 'addr1', 'addr2', 'M1', 'M2', 'M3', 'M5', 'M6', 'M7', 'M8', 'M9', 'P_emaildomain_1', 'P_emaildomain_2', 'P_emaildomain_3', 'R_emaildomain_1', 'R_emaildomain_2', 'R_emaildomain_3']

for col in cat_cols:
    if col in train_run.columns:
        le = LabelEncoder()
        le.fit(list(train_run[col].astype(str).values) + list(test_run[col].astype(str).values))
        train_run[col] = le.transform(list(train_run[col].astype(str).values))
        test_run[col] = le.transform(list(test_run[col].astype(str).values))
        
X_run = train_run.sort_values('TransactionDT').drop(['isFraud', 'TransactionDT', 'TransactionID'], axis=1)
y_run = train_run.sort_values('TransactionDT')['isFraud']
X_test_run = test_run.sort_values('TransactionDT').drop(['TransactionDT', 'TransactionID'], axis=1)

In [None]:
del train_run
gc.collect()

In [None]:
cat_cols = ['id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29',
            'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo', 'ProductCD', 'card4', 'card6', 'M4','P_emaildomain',
            'R_emaildomain', 'card1', 'card2', 'card3',  'card5', 'addr1', 'addr2', 'M1', 'M2', 'M3', 'M5', 'M6', 'M7', 'M8', 'M9',
            'P_emaildomain_1', 'P_emaildomain_2', 'P_emaildomain_3', 'R_emaildomain_1', 'R_emaildomain_2', 'R_emaildomain_3']
for col in cat_cols:
    if col in train.columns:
        le = LabelEncoder()
        le.fit(list(train[col].astype(str).values) + list(test[col].astype(str).values))
        train[col] = le.transform(list(train[col].astype(str).values))
        test[col] = le.transform(list(test[col].astype(str).values))

In [None]:
# %% [code]
X = train.sort_values('TransactionDT').drop(['isFraud', 'TransactionDT', 'TransactionID'], axis=1)
y = train.sort_values('TransactionDT')['isFraud']
X_test = test.sort_values('TransactionDT').drop(['TransactionDT', 'TransactionID'], axis=1)
del train

In [None]:
display(X_run.head())
print(X_run['P_emaildomain_1'].unique())
X_run['P_emaildomain_1'].nunique()

In [None]:
X_run.dtypes

In [None]:
X.head()
print(X['P_emaildomain_1'].unique())
X['P_emaildomain_1'].nunique()

In [None]:
X.dtypes

In [None]:
col = 'P_emaildomain_2'
print(set(X_run[col].unique()) - set(X[col].unique()))
print(set(X[col].unique()) - set(X_run[col].unique()))

In [None]:
X_run.shape

In [None]:
X.shape

In [None]:
count = 0
for col in X_run.columns.values:
    if list(X[col].unique()) != list(X_run[col].unique()):
        count +=1

count

In [None]:
train_ori.ix[:,train_comp.columns].head(10)

In [None]:
train_comp.head(10)

In [None]:
for dl in ['id_18', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27','P_emaildomain_3', 'R_emaildomain_3']:
    cat_cols.remove(dl)

In [None]:
train_ori[cat_cols].head(20)

In [None]:
train_comp[cat_cols].head(20)

In [None]:
train_ori['id_12'].unique()

In [None]:
train_comp['id_12'].unique()

In [None]:
train_ori['id_12'][0] == np.nan

In [None]:
np.isnan(train_ori["id_12"][0])

In [None]:
train_ori[cat_cols].head(20).replace(to_replace=[None], value=np.nan)

In [None]:
train_comp[cat_cols].equals(train_ori[cat_cols].replace(to_replace=[None], value=np.nan))

In [None]:
print(X.shape)
print(X_test.shape)
print(y.shape)

In [None]:
print(X_comp.shape)
print(X_test_comp.shape)
print(y_comp.shape)

In [None]:
X.equals(X_comp)

In [None]:
columns_X = set(X.columns.values)
columns_X_comp = set(X_comp.columns.values)
columns_X-columns_X_comp

In [None]:
columns_X_comp-columns_X

In [None]:
X_droped = X.drop(['D15_to_mean_addr2', 'D15_to_std_addr2'], axis=1)
print(X_droped.shape)
columns_X_droped = set(X_droped.columns.values)
columns_X_droped - columns_X_comp

In [None]:
X_droped.equals(X_comp)

In [None]:
X_comp.equals(X_droped.replace(to_replace=[None], value=np.nan))

In [None]:
X_comp[cat_cols].equals(X_droped[cat_cols].replace(to_replace=[None], value=np.nan))

In [None]:
X_comp_numeric = X_comp[list(set(X_comp.columns.values) - set(cat_cols))]

In [None]:
X_droped_numeric = X_droped[list(set(X_comp.columns.values) - set(cat_cols))].ix[:,X_comp_numeric.columns.values]

In [None]:
X_comp_numeric.head()

In [None]:
X_droped_numeric.head()

In [None]:
X_comp_numeric.equals(X_droped_numeric)

In [None]:
X_comp_columns = list(X_comp.columns.values)
X_droped.ix[:,X_comp_columns].head(10)

In [None]:
X_comp.head(10)

In [None]:
X_droped.ix[:,X_comp_columns].drop(['P_emaildomain','R_emaildomain','P_emaildomain_1', 'P_emaildomain_2', 
               'R_emaildomain_1', 'R_emaildomain_2'], axis=1).equals(X_comp.drop(['P_emaildomain','R_emaildomain','P_emaildomain_1', 'P_emaildomain_2', 
               'R_emaildomain_1', 'R_emaildomain_2'], axis=1))

In [None]:
X_comp['P_emaildomain_1'].head()

In [None]:
X_1 = X_droped.ix[:,X_comp_columns].drop(['P_emaildomain','R_emaildomain','P_emaildomain_1', 'P_emaildomain_2', 
               'R_emaildomain_1', 'R_emaildomain_2'], axis=1)
X_2 = X_comp.drop(['P_emaildomain','R_emaildomain','P_emaildomain_1', 'P_emaildomain_2', 
               'R_emaildomain_1', 'R_emaildomain_2'], axis=1) 

In [None]:
X_1.tail(10)

In [None]:
X_2.tail(10)

In [None]:
list(X_1.dtypes == X_2.dtypes)

In [None]:
print(False in list(X_1.dtypes == X_2.dtypes))

In [None]:
X_1.equals(X_2)