In [1]:
%load_ext autoreload
%autoreload 2
from glob import glob
import gc
import os
import sys
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_numeric_features, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.ml_utils import save_feature, get_cnt_feature, get_dummie_feature, get_label_feature
from func.time_utils import date_add_days, diff_of_days, diff_of_times
from func.parallel_utils import get_parallel_arg_list
from joblib import delayed, Parallel
from kaggle_utils import reduce_mem_usage, move_feature
from ieee_utils import get_os_release_date

In [2]:
# Column-name constants shared by the cells below.
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET]

# Train and test transaction tables, each indexed by transaction id, then
# stacked row-wise into one frame.
base_train = read_pkl_gzip('../input/base_train.gz').set_index(COLUMN_ID)
base_test = read_pkl_gzip('../input/base_test.gz').set_index(COLUMN_ID)
base = pd.concat([base_train, base_test])

# Attach a user id so that e.g. differences from a user's first date can be taken.
# The variant whose feature names were created with the raw `predicted_user_id`
# name left in place is card_addr_pemail_M (0902).
# df_same = pd.read_csv('../output/same_user_pattern/0902__same_user_id__card_addr_pemail_M.csv').set_index(COLUMN_ID)
col_user = 'user_id_card_addr'
df_same = pd.read_csv('../output/same_user_pattern/0903__same_user_id__card_addr.csv').set_index(COLUMN_ID)
base[col_user] = df_same['predicted_user_id']

# Identity tables, indexed by transaction id.
train_iden = pd.read_csv('../input/train_identity.csv', index_col=COLUMN_ID)
test_iden = pd.read_csv('../input/test_identity.csv', index_col=COLUMN_ID)

In [3]:
# START_DATE = '2017-11-04'
START_DATE = '2017-12-01'
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')
# TransactionDT is interpreted as a number of seconds counted from START_DATE.
# 14400 s (4 hours) is subtracted from every timestamp
# (presumably a timezone shift -- TODO confirm).
# Vectorised: one timedelta conversion for the whole column instead of a
# Python-level lambda call per row.
base['datetime'] = startdate + pd.to_timedelta(base['TransactionDT'] - 14400, unit='s')
# Calendar date (datetime.date objects) of each shifted timestamp.
base['date'] = base['datetime'].dt.date

In [4]:
def trans_DT(df):
    """Add a 'Regist_date' column derived from the 'date' and 'D1' columns.

    For each row, 'D1' days are subtracted from 'date' via `date_add_days`
    when D1 < 999999; otherwise (including NaN, for which the comparison is
    False) the date is passed through with an offset of 0. The result is
    stored as a string. The frame is modified in place and returned.
    """
    regist_dates = []
    for tx_date, d1 in tqdm(df[['date', 'D1']].values):
        offset = -1*d1 if d1 < 999999 else 0
        regist_dates.append(str(date_add_days(tx_date, offset)))
    df['Regist_date'] = regist_dates
    return df
base = trans_DT(base)

100%|██████████| 1097231/1097231 [00:04<00:00, 241564.59it/s]


In [5]:
# Ordered (substring, brand) rules for collapsing raw device names. They are
# applied one after another to the *current* device_name, so earlier rules
# take precedence over later ones.
_DEVICE_NAME_RULES = [
    ('sm', 'samsung'),
    ('samsung', 'samsung'),
    ('gt-', 'samsung'),
    ('moto g', 'motorola'),
    ('moto', 'motorola'),
    ('lg-', 'lg'),
    ('rv:', 'rv'),
    ('huawei', 'huawei'),
    ('blade', 'zte'),
    ('linux', 'linux'),
    ('xt', 'sony'),
    ('htc', 'htc'),
    ('asus', 'asus'),
    ('hi', 'huawei'),
    ('ale-', 'huawei'),
    ('-l', 'huawei'),
    # Add
    ('ios', 'ios'),
    ('android', 'android'),
]

# Ordered (substring, country) rules matched against the lower-cased DeviceInfo;
# the first match wins.
_DEVICE_COUNTRY_RULES = [
    ('sm', 'Korea'),
    ('samsung', 'Korea'),
    ('gt-', 'Korea'),
    ('lg', 'Korea'),
    ('moto g', 'China'),
    ('moto', 'China'),
    ('rv:', 'RV'),
    ('rv', 'RV'),
    ('hi', 'China'),
    ('redmi', 'China'),
    ('huawei', 'China'),
    ('ale-', 'China'),
    ('-l', 'China'),
    ('blade', 'China'),
    ('zte', 'China'),
    ('tride', 'China'),
    ('linux', 'US'),
    ('xt', 'Japan'),
    ('htc', 'China'),
    ('asus', 'China'),
    ('build', 'Build'),
    ('window', 'US'),
    ('ios', 'US'),
    ('mac', 'US'),
    ('nexus', 'US'),
    ('lenovo', 'China'),
    ('pixel', 'US'),
]

# Fallback (substring, country) rules matched against id_31 when DeviceInfo
# matched nothing.
_BROWSER_COUNTRY_RULES = [
    ('samsun', 'Korea'),
    ('google', 'US'),
    ('line', 'Korea'),
]


def _device_country(device_info, browser):
    """Return the country label for one row: DeviceInfo rules first, then id_31, else 'Unk'."""
    for token, country in _DEVICE_COUNTRY_RULES:
        if token in device_info:
            return country
    for token, country in _BROWSER_COUNTRY_RULES:
        if token in browser:
            return country
    return 'Unk'


def _split_pair(series, sep):
    """Split a string column on `sep` and return a frame that always has columns 0 and 1.

    `reindex` guarantees column 1 exists (all missing) even when no value
    contains the separator, instead of raising KeyError on lookup.
    """
    return series.str.split(sep, expand=True).reindex(columns=[0, 1])


def id_split(df, rare_threshold=200):
    """Clean the identity columns and derive device / OS / screen features.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'DeviceInfo', 'id_23', 'id_29', 'id_30', 'id_31',
        'id_33' and 'id_34'.
    rare_threshold : int, default 200
        Device names occurring fewer than this many times are replaced by
        "others".

    Returns
    -------
    pandas.DataFrame
        `df` with the cleaned columns and the new columns 'device_name',
        'device_version', 'device_country', 'OS_id_30', 'version_id_30',
        'screen_width', 'screen_height' and 'fixed_DeviceInfo'.
    """
    # Assign the filled column back: a chained `df[col].fillna(..., inplace=True)`
    # acts on a temporary and is not guaranteed to update `df`.
    for col in ('DeviceInfo', 'id_29', 'id_30', 'id_31'):
        df[col] = df[col].fillna('#')

    df['id_29'] = df['id_29'].str.lower()
    df['id_30'] = df['id_30'].str.lower().str.replace('_', '.', regex=False)
    # Any value containing 'mac os x' necessarily contains 'mac', so the
    # replacement needs no extra guard.
    df['id_30'] = df['id_30'].str.replace('mac os x', 'mac', regex=False)

    df['DeviceInfo'] = df['DeviceInfo'].str.lower()
    device_parts = _split_pair(df['DeviceInfo'], '/')
    df['device_name'] = device_parts[0]
    df['device_version'] = device_parts[1]
    # Placeholder so the column keeps its position; the real value is filled in below.
    df['device_country'] = 'Unk'

    df['OS_id_30'] = df['id_30'].str.split(' ').str[0]
    df['version_id_30'] = df['id_30'].map(lambda x: x.split(' ')[1] if x.count(' ') else '0')

    resolution = _split_pair(df['id_33'], 'x')
    df['screen_width'] = resolution[0].astype('float')
    df['screen_height'] = resolution[1].astype('float')

    # Keep only the part after the ':' separator.
    df['id_34'] = _split_pair(df['id_34'], ':')[1]
    df['id_23'] = _split_pair(df['id_23'], ':')[1]

    for pattern, brand in _DEVICE_NAME_RULES:
        matches = df['device_name'].str.contains(pattern, na=False, regex=False)
        df.loc[matches, 'device_name'] = brand
    # NOTE(review): unlike the rules above this writes to DeviceInfo, not
    # device_name, which makes the country lookup below return 'Build' for
    # these rows. Kept as-is; confirm it is intentional.
    is_build = df['device_name'].str.contains('build', na=False, regex=False)
    df.loc[is_build, 'DeviceInfo'] = 'build'

    # Iterate the two columns directly instead of a row-wise apply with
    # positional x[0] / x[1] lookups on a label-indexed row.
    df['device_country'] = [
        _device_country(device_info, browser)
        for device_info, browser in zip(df['DeviceInfo'], df['id_31'])
    ]

    name_counts = df['device_name'].value_counts()
    rare_names = name_counts[name_counts < rare_threshold].index
    df.loc[df['device_name'].isin(rare_names), 'device_name'] = "others"
    df['device_name'] = df['device_name'].fillna('#')
    df['device_version'] = df['device_version'].fillna('0')
    df['fixed_DeviceInfo'] = df['device_name'] + '__' + df['device_version']
    gc.collect()

    return df

In [6]:
# Stack the train and test identity tables, then derive the device/OS/screen
# columns on the combined frame.
data = id_split(pd.concat([train_iden, test_iden], axis=0))

In [8]:
# Ordered (substring, OS) rule tables; within each table the first match wins.
# Stage 1: strong hints in the browser string (id_31).
_BROWSER_OS_BROWSER_FIRST = [
    ('mobile', 'ios'),
    ('safari', 'mac'),
    ('ios', 'ios'),
    ('android', 'android'),
]
# Stage 2: hints in the cleaned device name.
_BROWSER_OS_DEVICE = [
    ('window', 'windows'),
    ('mac', 'mac'),
    ('ios', 'ios'),
    ('samsun', 'android'),
    ('trid', 'windows'),
    ('moto', 'android'),
    ('huaw', 'android'),
    ('lg', 'android'),
    ('sony', 'android'),
    ('htc', 'android'),
    ('zte', 'android'),
    ('linux', 'linux'),
    ('rv', 'windows'),
]
# Stage 3: remaining hints in the browser string.
_BROWSER_OS_BROWSER_LAST = [
    ('chrome', 'windows'),
    ('firefox', 'windows'),
    ('edge', 'windows'),
    ('ie', 'windows'),
    ('window', 'windows'),
    ('desktop', 'windows'),
    ('opera', 'windows'),
    ('samsun', 'android'),
    ('google', 'android'),
    ('line', 'android'),
]


def _first_match(text, rules):
    """Return the label of the first rule whose substring occurs in `text`, else None."""
    for token, label in rules:
        if token in text:
            return label
    return None


def classify_browser_os(device_name, browser):
    """Guess the OS from the browser string (id_31) and the cleaned device name.

    A browser string containing both 'mobile' and 'fire' is 'android'.
    Otherwise the three rule tables are tried in order and the first hit is
    returned; 'other_browser_os' when nothing matches.
    """
    if 'mobile' in browser and 'fire' in browser:
        return 'android'
    stages = (
        (browser, _BROWSER_OS_BROWSER_FIRST),
        (device_name, _BROWSER_OS_DEVICE),
        (browser, _BROWSER_OS_BROWSER_LAST),
    )
    for text, rules in stages:
        label = _first_match(text, rules)
        if label is not None:
            return label
    return 'other_browser_os'


# Iterate the two columns directly instead of a row-wise apply with positional
# x[0] / x[1] lookups on a label-indexed row.
data['browser_os'] = [
    classify_browser_os(device_name, browser)
    for device_name, browser in zip(data['device_name'], data['id_31'])
]

In [9]:
#========================================================================
# OS Date
#========================================================================
def _windows_release_date(ver, ver_map):
    """Return the release date for a Windows version string, or NaN if unresolved.

    An exact key match is tried first, then keyword fallbacks. The '8.1'
    rule is checked before the plain '8' rule so that it is reachable.
    Unresolved versions other than the '0' placeholder are printed.
    """
    if ver in ver_map:
        return ver_map[ver]
    keyword_rules = (
        (('x',), 'xp'),
        (('v',), 'vista'),
        (('7',), '7'),
        (('8', '1'), '8.1'),
        (('8',), '8'),
        (('10',), '10'),
    )
    for tokens, key in keyword_rules:
        if all(token in ver for token in tokens):
            return ver_map[key]
    if ver != '0':
        print(ver)
    return np.nan


def _dotted_version_candidates(family, ver):
    """Return dotted spellings to try for a 3-5 character version string.

    `family` is one of 'mac', 'ios' or 'android'; the candidates differ per
    family and length. An empty list means no candidate is tried.
    """
    size = len(ver)
    if size == 5:
        if family == 'android':
            return []
        return [ver[:2] + '.' + ver[2:4] + '.' + ver[4]]
    if size == 4:
        three_part = ver[:2] + '.' + ver[2] + '.' + ver[3]
        if family == 'mac':
            return [ver[:2] + '.' + ver[2:], three_part]
        if family == 'ios':
            return [three_part]
        return []
    if size == 3:
        if family == 'mac':
            return [ver[:2] + '.' + ver[2]]
        return [ver[0] + '.' + ver[1] + '.' + ver[2]]
    return []


def _versioned_release_date(family, ver, ver_map):
    """Return the release date for a mac / ios / android version, or NaN if unresolved.

    Tries the version as-is (for android also with '.' replaced by '_'),
    then the dotted candidates. Unresolved versions other than the '0'
    placeholder are printed.
    """
    keys = [ver]
    if family == 'android':
        keys.append(ver.replace('.', '_'))
    keys.extend(_dotted_version_candidates(family, ver))
    for key in keys:
        if key in ver_map:
            return ver_map[key]
    if ver != '0':
        print(ver)
    return np.nan


def lookup_os_release_date(os_name, ver, os_map):
    """Resolve the release date of `os_name` / `ver` from `os_map` (NaN if unknown).

    `os_name` must be a key of `os_map`; `os_map[os_name]` maps version
    strings to release dates.
    """
    ver_map = os_map[os_name]
    ver = ver.replace('_', '.').replace(' ', '')
    if 'window' in os_name:
        return _windows_release_date(ver, ver_map)
    for family, token in (('mac', 'mac'), ('ios', 'ios'), ('android', 'andro')):
        if token in os_name:
            return _versioned_release_date(family, ver, ver_map)
    print(os_name, ver)
    return np.nan


os_map = get_os_release_date()
o_list = []  # OS names that have no entry in os_map
v_list = []  # versions seen together with those OS names
list_os_date = []
data['OS_id_30'] = data['OS_id_30'].map(lambda x: x.lower())
data['version_id_30'] = data['version_id_30'].map(lambda x: x.lower())

# The loop variable is named `os_name` so it does not shadow the imported `os` module.
for os_name, ver in data[['OS_id_30', 'version_id_30']].values:
    if os_name in os_map:
        date = lookup_os_release_date(os_name, ver, os_map)
    else:
        date = np.nan
        o_list.append(os_name)
        v_list.append(ver)
    list_os_date.append(date)
print(set(o_list))
print(set(v_list))
data['os_release_date'] = list_os_date

{'other', '#', 'func', 'linux'}
{'0'}


In [10]:
#========================================================================
# Browser Date
#========================================================================
# Lookup from browser string to its release date, then map it onto id_31.
brow_date = pd.read_csv('../input/0903_ieee__browser_release_date.csv', index_col='browser')['ReleaseDate']
data['browser_release_date'] = data['id_31'].map(brow_date)

In [11]:
# Bring the transaction-level columns onto the identity rows (left join on the index).
data_dt = data.join(base, how='left')

# Earliest and latest transaction timestamp of each user, looked up per row.
user_min_dt, user_max_dt = (
    data_dt.groupby(col_user)['datetime'].agg(how) for how in ('min', 'max')
)
for target_col, per_user in (('first_datetime', user_min_dt), ('last_datetime', user_max_dt)):
    data_dt[target_col] = data_dt[col_user].map(per_user)

In [12]:
#========================================================================
# FE OS Browser Date
#========================================================================
# For each release-date column, the number of whole days from the release
# date to: the transaction timestamp, the registration date, and the user's
# first / last transaction timestamp.
data_dt['Regist_date'] = pd.to_datetime(data_dt['Regist_date'])
for release_col in ('os_release_date', 'browser_release_date'):
    data_dt[release_col] = pd.to_datetime(data_dt[release_col])
    diff_specs = (
        (f'diff__TransactionDT-{release_col}', 'datetime'),
        (f'diff__{release_col}-Regist_date', 'Regist_date'),
        (f'diff__{release_col}-first_TransactionDT', 'first_datetime'),
        (f'diff__{release_col}-last_TransactionDT', 'last_datetime'),
    )
    for diff_col, later_col in diff_specs:
        data_dt[diff_col] = (data_dt[later_col] - data_dt[release_col]).dt.days

# Days between the OS release and the browser release.
data_dt['diff__os_release_date-browser_release_date'] = (
    data_dt['os_release_date'] - data_dt['browser_release_date']
).dt.days

In [13]:
#========================================================================
# OS
#========================================================================
# Ordered (substring, OS) rule tables; within each table the first match wins.
# Hints in the cleaned device name.
_DEVICE_OS_BY_NAME = [
    ('window', 'windows'),
    ('mac', 'mac'),
    ('ios', 'ios'),
    ('samsun', 'android'),
    ('trid', 'windows'),
    ('moto', 'android'),
    ('huaw', 'android'),
    ('lg', 'android'),
    ('sony', 'android'),
    ('htc', 'android'),
    ('zte', 'android'),
    ('linux', 'linux'),
]
# Browser-string (id_31) hints that name a platform directly.
_DEVICE_OS_BROWSER_STRONG = [
    ('ios', 'ios'),
    ('android', 'android'),
    ('google', 'android'),
    ('sam', 'android'),
    ('line', 'android'),
]
# Hints in the OS string (id_30).
_DEVICE_OS_BY_ID30 = [
    ('android', 'android'),
    ('ios', 'ios'),
    ('windows', 'windows'),
    ('mac', 'mac'),
]
# Browser-string hints that only name a browser family.
_DEVICE_OS_BROWSER_WEAK = [
    ('mobile', 'ios'),
    ('safari', 'mac'),
    ('chrome', 'windows'),
    ('chr', 'windows'),
    ('opera', 'windows'),
    ('ie', 'windows'),
    ('edge', 'windows'),
    ('desktop', 'windows'),
    ('fire', 'windows'),
]


def _first_match(text, rules):
    """Return the label of the first rule whose substring occurs in `text`, else None."""
    for token, label in rules:
        if token in text:
            return label
    return None


def classify_device_os(device_name, browser, os_text):
    """Guess the device OS from the device name, browser string and OS string.

    Precedence: device-name rules; an 'rv' device name ('android' when the
    browser string contains 'mobile', else 'windows'); then, in order,
    strong browser hints for a '#' device name, the OS string, weak browser
    hints for a '#' device name, and strong then weak browser hints for a
    device name containing 'ther'. Returns 'other_device_os' when nothing
    matches.
    """
    label = _first_match(device_name, _DEVICE_OS_BY_NAME)
    if label is not None:
        return label
    if 'rv' in device_name:
        return 'android' if 'mobile' in browser else 'windows'

    name_missing = '#' in device_name
    name_other = 'ther' in device_name
    stages = (
        (name_missing, browser, _DEVICE_OS_BROWSER_STRONG),
        (True, os_text, _DEVICE_OS_BY_ID30),
        (name_missing, browser, _DEVICE_OS_BROWSER_WEAK),
        (name_other, browser, _DEVICE_OS_BROWSER_STRONG),
        (name_other, browser, _DEVICE_OS_BROWSER_WEAK),
    )
    for applies, text, rules in stages:
        if not applies:
            continue
        label = _first_match(text, rules)
        if label is not None:
            return label
    return 'other_device_os'


# Iterate the three columns directly instead of a row-wise apply with
# positional x[0] / x[1] / x[2] lookups on a label-indexed row.
data_dt['device_os'] = [
    classify_device_os(device_name, browser, os_text)
    for device_name, browser, os_text in zip(data_dt['device_name'], data_dt['id_31'], data_dt['id_30'])
]

In [14]:
# Ordered (substring, browser family) rules for id_31; the first match wins.
# NOTE(review): the 'samusung' spelling is kept byte-for-byte because the
# label ends up in feature values; fix it only together with its consumers.
_BROWSER_NAME_RULES = [
    ('generic', 'generic'),
    ('mobile', 'safari'),
    ('safari', 'safari'),
    ('chrome', 'chrome'),
    ('firefox', 'firefox'),
    ('edge', 'ie'),
    ('ie', 'ie'),
    ('window', 'ie'),
    ('android', 'android'),
    ('desktop', 'desktop'),
    ('ios', 'ios'),
    ('samsun', 'samusung'),
    ('google', 'google'),
    ('opera', 'opera'),
]

# Ordered (substring, OS) rule tables for the browser-side OS guess.
# Stage 1: strong hints in the browser string (id_31).
_BROWSER_OS_BROWSER_FIRST = [
    ('mobile', 'ios'),
    ('safari', 'mac'),
    ('ios', 'ios'),
    ('android', 'android'),
]
# Stage 2: hints in the cleaned device name.
_BROWSER_OS_DEVICE = [
    ('window', 'windows'),
    ('mac', 'mac'),
    ('ios', 'ios'),
    ('samsun', 'android'),
    ('trid', 'windows'),
    ('moto', 'android'),
    ('huaw', 'android'),
    ('lg', 'android'),
    ('sony', 'android'),
    ('htc', 'android'),
    ('zte', 'android'),
    ('linux', 'linux'),
    ('rv', 'windows'),
]
# Stage 3: remaining hints in the browser string.
_BROWSER_OS_BROWSER_LAST = [
    ('chrome', 'windows'),
    ('firefox', 'windows'),
    ('edge', 'windows'),
    ('ie', 'windows'),
    ('window', 'windows'),
    ('desktop', 'windows'),
    ('opera', 'windows'),
    ('samsun', 'android'),
    ('google', 'android'),
    ('line', 'android'),
]


def _first_match(text, rules):
    """Return the label of the first rule whose substring occurs in `text`, else None."""
    for token, label in rules:
        if token in text:
            return label
    return None


def classify_browser_name(browser):
    """Map a browser string (id_31) to a browser family, else 'other_browser_name'."""
    label = _first_match(browser, _BROWSER_NAME_RULES)
    return 'other_browser_name' if label is None else label


def classify_browser_os(device_name, browser):
    """Guess the OS from the browser string (id_31) and the cleaned device name.

    A browser string containing both 'mobile' and 'fire' is 'android'.
    Otherwise the three rule tables are tried in order and the first hit is
    returned; 'other_browser_os' when nothing matches.
    """
    if 'mobile' in browser and 'fire' in browser:
        return 'android'
    stages = (
        (browser, _BROWSER_OS_BROWSER_FIRST),
        (device_name, _BROWSER_OS_DEVICE),
        (browser, _BROWSER_OS_BROWSER_LAST),
    )
    for text, rules in stages:
        label = _first_match(text, rules)
        if label is not None:
            return label
    return 'other_browser_os'


data_dt['browser_name'] = data_dt['id_31'].map(classify_browser_name)

# Iterate the two columns directly instead of a row-wise apply with positional
# x[0] / x[1] lookups on a label-indexed row.
data_dt['browser_os'] = [
    classify_browser_os(device_name, browser)
    for device_name, browser in zip(data_dt['device_name'], data_dt['id_31'])
]

In [15]:
#========================================================================
# FE OS Match Device and Browser
#========================================================================
# 1 when the device-side and browser-side OS guesses agree, or when the
# device OS is 'mac' and the browser is 'chrome'; 0 otherwise.
# Computed column-wise instead of a row-wise apply with positional x[0] / x[1]
# / x[2] lookups on a label-indexed row.
same_os = data_dt['device_os'] == data_dt['browser_os']
chrome_on_mac = (data_dt['device_os'] == 'mac') & (data_dt['browser_name'] == 'chrome')
data_dt['os_match_device_browser'] = (same_os | chrome_on_mac).astype('int64')

In [16]:
#========================================================================
# FE: combine device country with the device / browser OS
#========================================================================
# String crosses joined with '-'. Built with column-wise concatenation
# instead of a row-wise apply with positional x[0] / x[1] lookups on a
# label-indexed row.
data_dt['device_country__os_match_device_browser'] = (
    data_dt['device_country'] + '-' + data_dt['os_match_device_browser'].map(str)
)
data_dt['device_country__browser_os'] = data_dt['device_country'] + '-' + data_dt['browser_os']
data_dt['device_country__device_os'] = data_dt['device_country'] + '-' + data_dt['device_os']
data_dt['device_country__device_os__browser_os'] = (
    data_dt['device_country'] + '-' + data_dt['device_os'] + '-' + data_dt['browser_os']
)

In [17]:
#========================================================================
# FE: combine device width/height with the other device information
#========================================================================
def _bucket_screen_size(value, floors, fallback=400):
    """Return the first floor in `floors` (descending) that `value` reaches.

    Negative values (the -1 missing marker) map to -1; anything else below
    every floor maps to `fallback`.
    """
    for floor in floors:
        if value >= floor:
            return floor
    return -1 if value < 0 else fallback


# Assign the filled columns back: a chained `fillna(..., inplace=True)` on a
# column selection is not guaranteed to update the frame.
data_dt['screen_width'] = data_dt['screen_width'].fillna(-1)
data_dt['screen_height'] = data_dt['screen_height'].fillna(-1)
data_dt['screen_type'] = (data_dt['screen_height'] > data_dt['screen_width'])*1
# +10 keeps the denominator away from zero for the -1 missing marker.
data_dt['screen_ratio'] = (data_dt['screen_height'] / (data_dt['screen_width']+10)).map(lambda x: np.round(x, 2))

data_dt['round_screen_width'] = data_dt['screen_width'].map(
    lambda x: _bucket_screen_size(x, (5000, 4000, 3000, 2000, 1600, 1200, 800))
)
data_dt['round_screen_height'] = data_dt['screen_height'].map(
    lambda x: _bucket_screen_size(x, (3000, 2000, 1600, 1200, 1000, 800, 600))
)

# Cross every device/browser label with the screen features as '<label>-<value>'
# strings. The nesting order reproduces the original column order.
# Columns are iterated directly instead of a row-wise apply with positional
# x[0] / x[1] lookups on a label-indexed row.
screen_label_cols = ['device_country', 'device_name', 'device_os', 'browser_os']
screen_value_groups = [
    ['screen_width', 'screen_height', 'screen_type'],
    ['round_screen_width', 'round_screen_height'],
    ['screen_ratio'],
]
for value_cols in screen_value_groups:
    for label_col in screen_label_cols:
        for value_col in value_cols:
            data_dt[f'{label_col}__{value_col}'] = [
                label + '-' + str(value)
                for label, value in zip(data_dt[label_col], data_dt[value_col])
            ]

In [18]:
#========================================================================
# FE: combine the cleansed device / browser information with the id columns
#========================================================================
cols_interact = [
    'id_12',
    'id_15',
    'id_16',
    'id_23',
    'id_27',
    'id_28',
    'id_29',
    'id_32',
    'id_34',
    'id_35',
    'id_36',
    'id_37',
    'id_38',
]

# For every id column, build '<label>-<id value>' crosses with each of the four
# label columns. Columns are iterated directly instead of a row-wise apply
# with positional x[0] / x[1] lookups on a label-indexed row, which is also
# much cheaper than building a Series per row.
for col in tqdm(cols_interact):
    for label_col in ('device_country', 'device_name', 'device_os', 'browser_os'):
        data_dt[f'{label_col}__{col}'] = [
            label + '-' + str(value)
            for label, value in zip(data_dt[label_col], data_dt[col])
        ]

100%|██████████| 13/13 [05:57<00:00, 28.78s/it]


In [19]:
#========================================================================
# FE Categorical Encoding 
#========================================================================

cols_categorical = get_categorical_features(data_dt, ignore_list=COLUMNS_IGNORE)
df_cat = data_dt[cols_categorical].copy()
for col in tqdm(cols_categorical):
    # Cardinality is measured before filling, so missing values are not
    # counted as a category.
    num = df_cat[col].nunique()
    # Fill on a separate frame rather than with a chained
    # `df_cat[col].fillna(..., inplace=True)`, which is not guaranteed to
    # update df_cat; the raw column is dropped at the end of the iteration anyway.
    col_frame = df_cat[col].fillna('#').to_frame()
    cols_cat = [col]
    # Encoders per cardinality: more than 15 -> count + label; 3 to 15 ->
    # count + label + dummies; 2 or fewer -> label only.
    if num>15:
        encoders = (get_cnt_feature, get_label_feature)
    elif num>2:
        encoders = (get_cnt_feature, get_label_feature, get_dummie_feature)
    else:
        encoders = (get_label_feature,)
    encoded = [encode(col_frame, cols_cat) for encode in encoders]
    for feature in encoded:
        df_cat = df_cat.join(feature)
    df_cat = df_cat.drop(col, axis=1)

100%|██████████| 107/107 [04:30<00:00,  1.86s/it]


In [20]:
# Append the encoded categorical features and report the shape before and after.
data_dt_cat = data_dt.join(df_cat, how='left')
for frame in (data_dt, data_dt_cat):
    print(frame.shape)

(286140, 156)
(286140, 783)


In [21]:
#========================================================================
# FE Aggregation Time
#========================================================================
#========================================================================
# When an aggregation uses predicted_user_id, make it identifiable which
# user_id it was built from.
#========================================================================
train_idx, test_idx = base_train.index, base_test.index
dir_save, fname = 'check_trush', 'iden'

def get_new_columns(name, aggs):
    """Build flat column names for a ``{column: [agg, ...]}`` specification.

    Each name is ``name + '_' + column + '_' + agg``. Names are ordered by
    column first and then by aggregation, following the order in ``aggs``.
    """
    new_columns = []
    for source_col, funcs in aggs.items():
        for func in funcs:
            new_columns.append(name + '_' + source_col + '_' + func)
    return new_columns

cols_num = get_numeric_features(data_dt_cat, ignore_list=COLUMNS_IGNORE)
cols_cat = get_categorical_features(data_dt_cat, ignore_list=COLUMNS_IGNORE)

#========================================================================
# Parallel
#   get_parallel_arg_list splits the numeric columns for n_jobs=60; one
#   DataFrame slice is collected per returned column chunk.
#========================================================================
arg_list = get_parallel_arg_list(n_jobs=60, arg_list=cols_num)
arg_df_list = []
for arg_cols_num in tqdm(arg_list):
    tmp = data_dt_cat.loc[:, arg_cols_num]
    print(tmp.shape)
    arg_df_list += [tmp]

 12%|█▏        | 7/60 [00:02<01:42,  1.94s/it]

(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)


 33%|███▎      | 20/60 [00:03<00:38,  1.04it/s]

(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)


 63%|██████▎   | 38/60 [00:03<00:10,  2.10it/s]

(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)


100%|██████████| 60/60 [00:03<00:00, 17.53it/s]

(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 11)
(286140, 19)





In [23]:
# Preview the first rows of the joined frame (original columns + encoded features).
data_dt_cat.head()

Unnamed: 0_level_0,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,id_10,...,browser_os__id_38_ios-T_dummie,browser_os__id_38_linux-F_dummie,browser_os__id_38_linux-T_dummie,browser_os__id_38_mac-F_dummie,browser_os__id_38_mac-T_dummie,browser_os__id_38_other_browser_os-F_dummie,browser_os__id_38_other_browser_os-T_dummie,browser_os__id_38_other_browser_os-nan_dummie,browser_os__id_38_windows-F_dummie,browser_os__id_38_windows-T_dummie
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987004,0.0,70787.0,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
2987008,-5.0,98945.0,,,0.0,-5.0,,,,,...,1,0,0,0,0,0,0,0,0,0
2987010,-5.0,191631.0,0.0,0.0,0.0,0.0,,,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
2987011,-5.0,221832.0,,,0.0,-6.0,,,,,...,0,0,0,0,0,0,0,0,0,1
2987016,0.0,7460.0,0.0,0.0,1.0,0.0,,,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0


In [None]:
#========================================================================
# Feature Save 1
#   Select the columns to save: names starting with 'id_' that are not
#   listed in COLUMNS_IGNORE.
#========================================================================

cols_feature = [
    name for name in data_dt_cat.columns
    if name not in COLUMNS_IGNORE and name.startswith('id_')
]

#========================================================================
# This should be parallelized as well; run serially it is extremely slow.
#========================================================================
def parallel_save_feature(df, tmp_base_train, tmp_base_test, col):
    """Save one column of ``df`` as a train feature and as a test feature.

    ``df`` is joined to each base frame on the index, column ``col`` is taken
    from the result, and it is written with ``save_feature`` (prefix '501',
    notebook-level ``dir_save``). The head of the train column is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding column ``col``.
    tmp_base_train, tmp_base_test : pandas.DataFrame
        Frames whose index defines the train / test rows.
    col : str
        Name of the column to save.
    """
    train_feature = tmp_base_train.join(df)[col]
    test_feature = tmp_base_test.join(df)[col]
    print(train_feature.head())
    for feature, is_train in ((train_feature, True), (test_feature, False)):
        save_feature(feature.to_frame(), '501', dir_save, is_train=is_train,
                     auto_type=True, list_ignore=COLUMNS_IGNORE)
    
# One task per selected column; each task receives a single-column slice of
# data_dt_cat plus the train / test base frames.
save_tasks = [
    delayed(parallel_save_feature)(data_dt_cat[[col]], base_train, base_test, col)
    for col in cols_feature
]
Parallel(n_jobs=60)(save_tasks)

Exception ignored in: <function _before_at_fork_weak_calls at 0x7f06fcce6ea0>
Traceback (most recent call last):
  File "/home/yryrgogo/anaconda3/lib/python3.7/logging/__init__.py", line 265, in _before_at_fork_weak_calls
    _at_fork_weak_calls('acquire')
  File "/home/yryrgogo/anaconda3/lib/python3.7/logging/__init__.py", line 257, in _at_fork_weak_calls
    method()
  File "/home/yryrgogo/anaconda3/lib/python3.7/logging/__init__.py", line 854, in acquire
    self.lock.acquire()
KeyboardInterrupt: 
Exception ignored in: <function _before_at_fork_weak_calls at 0x7f06fcce6ea0>
Traceback (most recent call last):
  File "/home/yryrgogo/anaconda3/lib/python3.7/logging/__init__.py", line 265, in _before_at_fork_weak_calls
    _at_fork_weak_calls('acquire')
  File "/home/yryrgogo/anaconda3/lib/python3.7/logging/__init__.py", line 257, in _at_fork_weak_calls
    method()
  File "/home/yryrgogo/anaconda3/lib/python3.7/logging/__init__.py", line 854, in acquire
    self.lock.acquire()
Keyboard

In [None]:
#========================================================================
# FE Aggregation Time
#========================================================================
#========================================================================
# When an aggregation uses predicted_user_id, make it clear which user_id
# definition it is based on.
#========================================================================
train_idx, test_idx = base_train.index, base_test.index
# dir_save: directory passed to save_feature; fname: prefix for feature names.
dir_save, fname = 'org_use', 'iden'
# Grouping keys for the aggregations. Only the predicted user id is active.
# Commented-out candidates: 'device_country', 'device_name', 'screen_type',
# 'round_screen_width', 'round_screen_height'.
list_key = [col_user]

def get_new_columns(name, aggs):
    """Build flat column names for a ``{column: [agg, ...]}`` specification.

    Each name is ``name + '_' + column + '_' + agg``. Names are ordered by
    column first and then by aggregation, following the order in ``aggs``.
    """
    new_columns = []
    for source_col, funcs in aggs.items():
        for func in funcs:
            new_columns.append(name + '_' + source_col + '_' + func)
    return new_columns

cols_num = get_numeric_features(data_dt_cat, ignore_list=COLUMNS_IGNORE)
cols_cat = get_categorical_features(data_dt_cat, ignore_list=COLUMNS_IGNORE)

#========================================================================
# Parallel
#   Build one (DataFrame chunk, column list) pair per parallel task.
#   Grouping keys are removed from every column chunk with an
#   order-preserving filter. A set difference would shuffle the columns
#   differently on every run. arg_list itself is filtered so that it stays
#   in sync with arg_df_list: a key must not be passed on as a column to
#   aggregate.
#========================================================================
arg_list = [
    [name for name in chunk if name not in list_key]
    for chunk in get_parallel_arg_list(n_jobs=60, arg_list=cols_num)
]
arg_df_list = []
for arg_cols_num in tqdm(arg_list):
    # Key columns first, then the columns to aggregate for this chunk.
    tmp = data_dt_cat[list_key + arg_cols_num]
    print(tmp.shape)
    arg_df_list.append(tmp)

In [None]:
#========================================================================
# Agg Feature Save 2
#========================================================================
def parallel_agg(df, agg_cols):
    """Save per-key aggregation features for every column in ``agg_cols``.

    For each column and each grouping key in the notebook-level ``list_key``,
    the column's mean / max / min / std per key value is computed on ``df``.
    The result is mapped back onto the rows of ``base_train`` / ``base_test``
    through the key and written with ``save_feature`` (prefix '501',
    notebook-level ``dir_save``).

    Parameters
    ----------
    df : pandas.DataFrame
        Chunk holding the key column(s) plus the columns to aggregate. Its key
        columns are joined to ``base_train`` / ``base_test`` on the index.
    agg_cols : list
        Names of the columns of ``df`` to aggregate.

    Returns
    -------
    tuple of (list, list)
        ``(error_keys, error_cols)``. ``error_keys`` holds keys that were
        skipped because ``df[key]`` is not a single column. ``error_cols``
        holds the columns that were not aggregated for those keys.
    """
    error_keys = []
    error_cols = []

    # The key-augmented base frames depend only on the key, so they are built
    # once per key rather than once per (column, key) pair.
    key_bases = {}
    for key in list_key:
        key_values = df[key]
        if key_values.ndim != 1:
            # A duplicated column name makes df[key] a DataFrame. Grouping on
            # it would fail, so record the key and skip it.
            error_keys.append(key)
            error_cols += agg_cols
            continue
        key_bases[key] = (base_train.join(key_values), base_test.join(key_values))

    for col in agg_cols:
        aggs = {col: ['mean', 'max', 'min', 'std']}

        for key, (key_train, key_test) in key_bases.items():
            df_agg = df.groupby(key).agg(aggs)
            df_agg.columns = get_new_columns(fname+'_'+key+'_', aggs)
            df_agg = df_agg.reset_index()

            # Column-on-column merge: the result gets a fresh integer index,
            # while how='left' keeps the row order of the base frame.
            base_train_agg = key_train.merge(df_agg, on=key, how='left')
            base_test_agg = key_test.merge(df_agg, on=key, how='left')
            del df_agg

            print(base_train_agg.shape, base_test_agg.shape)
            # Save everything except the ignored columns, the key itself and
            # 'D1' (presumably a base-frame column, not a new feature; TODO confirm).
            cols_feature = [
                name for name in base_train_agg.columns
                if name not in COLUMNS_IGNORE and name != key and name != 'D1'
            ]
            save_feature(base_train_agg[cols_feature], '501', dir_save, is_train=True, auto_type=True, list_ignore=COLUMNS_IGNORE)
            save_feature(base_test_agg[cols_feature],  '501', dir_save, is_train=False, auto_type=True, list_ignore=COLUMNS_IGNORE)

            # Release the merged frames before the next iteration.
            del base_train_agg, base_test_agg
            gc.collect()

    return error_keys, error_cols

In [None]:
# One task per (DataFrame chunk, column list) pair. `err` collects whatever
# each parallel_agg call returns, in task order.
agg_tasks = [
    delayed(parallel_agg)(chunk_df, chunk_cols)
    for chunk_df, chunk_cols in zip(arg_df_list, arg_list)
]
err = Parallel(n_jobs=60)(agg_tasks)

In [34]:
trn_idx = base_train.index
tes_idx = base_test.index
# Every column whose name contains 'id_31' (the raw column and its encodings).
cols = [col for col in data_dt_cat.columns if col.count('id_31')]
# data_dt_cat (286,140 rows per the shape printed earlier) may not contain
# every TransactionID of base_train / base_test. .loc with missing labels
# raises KeyError on pandas >= 1.0, so reindex is used instead. It yields
# all-NaN rows for the missing ids, matching the old .loc behaviour.
tmp_train = data_dt_cat.reindex(index=trn_idx, columns=cols)
tmp_test = data_dt_cat.reindex(index=tes_idx, columns=cols)

In [35]:
# Compare how often each id_31 value occurs in train and in test. The join is
# a left join on the train counts, so values absent from test show NaN.
id31_counts_train = tmp_train['id_31'].value_counts().to_frame('train')
id31_counts_test = tmp_test['id_31'].value_counts().to_frame('test')
id31_counts_train.join(id31_counts_test)

Unnamed: 0,train,test
chrome 63.0,22000,168.0
mobile safari 11.0,13423,10232.0
mobile safari generic,11474,
ie 11.0 for desktop,9030,5173.0
safari generic,8195,
chrome 62.0,7182,101.0
chrome 65.0,6871,243.0
chrome 64.0,6711,111.0
chrome 63.0 for android,5806,187.0
chrome generic,4778,
