In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import datetime as dt

import platform
from matplotlib import font_manager, rc
#matplotlib 한글깨짐 지원
path = "c:/Windows/Fonts/malgun.ttf"
if platform.system() == 'Darwin':
    rc('font', family='AppleGothic')
elif platform.system() == 'Windows':
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
else:
    print('Unknown system...')
rc('axes', unicode_minus=False)  

## Read Data

In [192]:
import os
df_train = pd.read_csv(os.path.abspath("../input")+'/X_train.csv', encoding='cp949')
df_test = pd.read_csv(os.path.abspath("../input")+'/X_test.csv', encoding='cp949')
y_train = pd.read_csv(os.path.abspath("../input")+'/y_train.csv').age
IDtest = df_test.custid.unique()

df_train.head()

Unnamed: 0,custid,sales_month,sales_day,sales_dayofweek,sales_time,str_nm,goodcd,brd_nm,corner_nm,pc_nm,part_nm,team_nm,buyer_nm,import_flg,tot_amt,dis_amt,net_amt,inst_mon,inst_fee
0,0,6,25,일,1212,무역점,2116050008000,에스티로더,수입종합화장품,화장품,명품잡화,잡화가용팀,화장품,1,90000,9000,81000,3,0
1,0,6,25,일,1242,무역점,4125440008000,시슬리,수입종합화장품,화장품,명품잡화,잡화가용팀,화장품,1,39000,3900,35100,1,0
2,0,8,26,토,1810,본점,2116052008000,크리니크,수입종합화장품,화장품,잡화파트,잡화가용팀,화장품,1,175000,17500,157500,3,0
3,0,8,26,토,1830,본점,4106430119900,듀퐁,수입의류,명품토탈,잡화파트,잡화가용팀,수입명품,1,455000,45500,409500,3,0
4,0,9,3,일,1802,무역점,2139141008000,랑콤,수입종합화장품,화장품,명품잡화,잡화가용팀,화장품,0,100000,10000,90000,3,0


## 전처리

In [193]:
# 일시

df_train['hour'] = df_train['sales_time']//100
df_train['minute'] = df_train['sales_time']%100
df_train = df_train.rename({'sales_month':'month', 'sales_day':'day'}, axis='columns')
df_train['year'] = 2017

df_train.loc[df_train['month']>12,'year'] = 2018
df_train.loc[df_train['month']>12,'month'] = df_train.month%12

dt = pd.DataFrame(pd.to_datetime(df_train[['year', 'month', 'day', 'hour','minute']]))
dt.columns = ['dt']

df_train = pd.merge(df_train, dt, left_on = df_train.index, right_on=dt.index)

df_test['hour'] = df_test['sales_time']//100
df_test['minute'] = df_test['sales_time']%100
df_test = df_test.rename({'sales_month':'month', 'sales_day':'day'}, axis='columns')
df_test['year'] = 2017

df_test.loc[df_test['month']>12,'year'] = 2018
df_test.loc[df_test['month']>12,'month'] = df_test.month%12

dt_t = pd.DataFrame(pd.to_datetime(df_test[['year', 'month', 'day', 'hour','minute']]))
dt_t.columns = ['dt']

df_test = pd.merge(df_test, dt_t, left_on = df_test.index, right_on=dt_t.index)

del df_train['key_0']
del df_train['hour']
del df_train['minute']

del df_test['key_0']
del df_test['hour']
del df_test['minute']

In [194]:
# 지역
df_train['city']= df_train['str_nm'].replace(['무역점','본점'],'강남구').replace(
                '천호점','강동구').replace('신촌점','서대문구')
df_test['city']= df_test['str_nm'].replace(['무역점','본점'],'강남구').replace(
                '천호점','강동구').replace('신촌점','서대문구')

In [195]:
# 환불여부
df_train['refund'] = 0
df_train.loc[df_train['tot_amt'] < 0,'refund'] = 1
df_test['refund'] = 0
df_test.loc[df_test['tot_amt'] < 0,'refund'] = 1

In [196]:
# 이름 비슷한거 통일
df_train.part_nm = df_train.part_nm.replace('여성캐쥬얼','여성캐주얼').replace('스포츠캐쥬얼','스포츠캐주얼').replace(
                    '가정용품파트','가정용품').replace('생식품파트','생식품').replace('공산품파트','공산품').replace(
                    '로얄부틱','로얄부띠끄').replace('잡화파트','잡화')
df_test.part_nm = df_test.part_nm.replace('여성캐쥬얼','여성캐주얼').replace('스포츠캐쥬얼','스포츠캐주얼').replace(
                    '가정용품파트','가정용품').replace('생식품파트','생식품').replace('공산품파트','공산품').replace(
                    '로얄부틱','로얄부띠끄').replace('잡화파트','잡화')

In [197]:
# 지점명을 영어로 변경
df_train['str_nm'] = df_train['str_nm'].apply(lambda x : 'MUYEOK_BRANCH' if x == '무역점'
                                                         else 'SHINCHON_BRANCH' if x == '신촌점'
                                                         else 'HEAD_OFFICE' if x == '본점'
                                                         else 'CHUNHO_BRANCH')

df_test['str_nm'] = df_test['str_nm'].apply(lambda x : 'MUYEOK_BRANCH' if x == '무역점'
                                                         else 'SHINCHON_BRANCH' if x == '신촌점'
                                                         else 'HEAD_OFFICE' if x == '본점'
                                                         else 'CHUNHO_BRANCH')

In [198]:
# 요일명을 영어로 변경 - lgbm에서 한글 피처 명이 있으면 안됨
df_train['sales_dayofweek'] = df_train['sales_dayofweek'].apply(lambda x : 'MONDAY' if x == '월'
                                                         else 'TUESDAY' if x == '화'
                                                         else 'WEDNESDAY' if x == '수'
                                                         else 'THURSDAY' if x == '목'
                                                         else 'FRIDAY' if x == '금'
                                                         else 'SATURDAY' if x == '토'
                                                         else 'SUNDAY')

df_test['sales_dayofweek'] = df_test['sales_dayofweek'].apply(lambda x : 'MONDAY' if x == '월'
                                                         else 'TUESDAY' if x == '화'
                                                         else 'WEDNESDAY' if x == '수'
                                                         else 'THURSDAY' if x == '목'
                                                         else 'FRIDAY' if x == '금'
                                                         else 'SATURDAY' if x == '토'
                                                         else 'SUNDAY')

In [199]:
# 1달 초,중,말 분리
df_train['month_group'] = pd.cut(df_train['day'], bins = [1, 10, 20, 31], 
                              right = False, labels = ['Cho', 'Jung', 'Mal'])

df_test['month_group'] = pd.cut(df_test['day'], bins = [1, 10, 20, 31], 
                              right = False, labels = ['Cho', 'Jung', 'Mal'])

In [200]:
# 계절 분리
df_train['sales_season'] = df_train['month'].apply(lambda x : 'SPRING' if (x>=3) and (x<=5)
                                                  else 'SUMMER' if (x>=6) and (x<=8)
                                                  else 'FALL' if (x>=9) and (x<=11)
                                                  else 'WINTER')

df_test['sales_season'] = df_test['month'].apply(lambda x : 'SPRING' if (x>=3) and (x<=5)
                                                  else 'SUMMER' if (x>=6) and (x<=8)
                                                  else 'FALL' if (x>=9) and (x<=11)
                                                  else 'WINTER')

In [201]:
# 분기 분리
df_train['quarter'] = pd.cut(df_train['month'], bins = [1, 4, 7, 10, 12], 
                              right = False, labels = [1, 2, 3, 4]).astype('category')
df_test['quarter'] = pd.cut(df_test['month'], bins = [1, 4, 7, 10, 12], 
                              right = False, labels = [1, 2, 3, 4]).astype('category')

In [202]:
# 시간대 구분 (회사원 기준)
df_train['time_split'] = df_train['sales_time'].apply(lambda x : 'MORNINGTIME' if (x>=700) and (x<=1130)
                                               else 'LUNCHTIME' if (x>1130) and (x<=1300)
                                               else 'AFTERNOON' if (x>1300) and (x<=1900)
                                               else 'RESTTIME')

df_test['time_split'] = df_test['sales_time'].apply(lambda x : 'MORNINGTIME' if (x>=700) and (x<=1130)
                                               else 'LUNCHTIME' if (x>1130) and (x<=1300)
                                               else 'AFTERNOON' if (x>1300) and (x<=1900)
                                               else 'RESTTIME')

# Feature

* 1등 피쳐

In [47]:
train = df_train
test = df_test

In [48]:
# 환불금액
train['refund_tot_amt'] = train['tot_amt'].apply(lambda x : -x if x<0 else 0)
train['refund_dis_amt'] = train['dis_amt'].apply(lambda x : -x if x<0 else 0)
train['refund_net_amt'] = train['net_amt'].apply(lambda x : -x if x<0 else 0)

test['refund_tot_amt'] = test['tot_amt'].apply(lambda x : -x if x<0 else 0)
test['refund_dis_amt'] = test['dis_amt'].apply(lambda x : -x if x<0 else 0)
test['refund_net_amt'] = test['net_amt'].apply(lambda x : -x if x<0 else 0)

In [49]:
train['refund_tot_amt_log'] = np.log1p(train['refund_tot_amt'])
train['refund_dis_amt_log'] = np.log1p(train['refund_dis_amt'])
train['refund_net_amt_log'] = np.log1p(train['refund_net_amt'])

test['refund_tot_amt_log'] = np.log1p(test['refund_tot_amt'])
test['refund_dis_amt_log'] = np.log1p(test['refund_dis_amt'])
test['refund_net_amt_log'] = np.log1p(train['refund_net_amt'])

In [50]:
# 구매금액
train['tot_amt>=0'] = train['tot_amt'].apply(lambda x : x if x>=0 else 0)
train['dis_amt>=0'] = train['dis_amt'].apply(lambda x : x if x>=0 else 0)
train['net_amt>=0'] = train['net_amt'].apply(lambda x : x if x>=0 else 0)

test['tot_amt>=0'] = test['tot_amt'].apply(lambda x : x if x>=0 else 0)
test['dis_amt>=0'] = test['dis_amt'].apply(lambda x : x if x>=0 else 0)
test['net_amt>=0'] = test['net_amt'].apply(lambda x : x if x>=0 else 0)

In [51]:
train['tot_amt_log>=0'] = np.log1p(train['tot_amt>=0'])
train['dis_amt_log>=0'] = np.log1p(train['dis_amt>=0'])
train['net_amt_log>=0'] = np.log1p(train['net_amt>=0'])

test['tot_amt_log>=0'] = np.log1p(test['tot_amt>=0'])
test['dis_amt_log>=0'] = np.log1p(test['dis_amt>=0'])
test['net_amt_log>=0'] = np.log1p(test['net_amt>=0'])

In [52]:
# 구매비율
train['net_amt_ratio'] = train['net_amt>=0'] / train['tot_amt>=0']
train['dis_amt_ratio'] = train['dis_amt>=0'] / train['tot_amt>=0']
train['dis_net_ratio'] = train['dis_amt>=0'] / train['net_amt>=0']

test['net_amt_ratio'] = test['net_amt>=0'] / test['tot_amt>=0']
test['dis_amt_ratio'] = test['dis_amt>=0'] / test['tot_amt>=0']
test['dis_net_ratio'] = test['dis_amt>=0'] / test['net_amt>=0']

In [53]:
# 백화점 영업시간
train['sales_time_open'] = train['sales_time'].apply(lambda x : 1 if (x > 1030) and (x < 2000) else 0)
test['sales_time_open'] = test['sales_time'].apply(lambda x : 1 if (x > 1030) and (x < 2000) else 0)

In [54]:
# 주말 분리
train['weekend'] = train['sales_dayofweek'].apply(lambda x : 1 if (x=='토') or (x=='일') else 0)
test['weekend'] = test['sales_dayofweek'].apply(lambda x : 1 if (x=='토') or (x=='일') else 0)

In [55]:
# 할부 여부
train['inst'] = train['inst_mon'].apply(lambda x : 1 if x>1 else 0)
test['inst'] = test['inst_mon'].apply(lambda x : 1 if x>1 else 0)

In [56]:
# 방문주기 : 몇일 간격으로 구매가 이뤄졌는지
train['diff_time'] = train.groupby('custid')['dt'].diff().fillna('00:00:00').astype(str)
train['diff_time'] = train['diff_time'].str.split(' days').apply(lambda x: 0 if x[0] == '00:00:00' else x[0]).astype(int)

test['diff_time'] = test.groupby('custid')['dt'].diff().fillna('00:00:00').astype(str)
test['diff_time'] = test['diff_time'].str.split(' days').apply(lambda x: 0 if x[0] == '00:00:00' else x[0]).astype(int)

In [57]:
# 쇼핑간격 : 같은 날 몇분 간격으로 구매가 이뤄졌는지
train['day1'] = train['year'].astype(str) + '_' + train['month'].astype(str) + '_' + train['day'].astype(str)
train['shopping_gap_open'] = train.query('sales_time_open == 1').groupby(['custid', 'day1'])['dt'].diff().dt.total_seconds().fillna(0).div(60).astype(int)
train['shopping_gap_open'] = train['shopping_gap_open'].fillna(0)

test['day1'] = test['year'].astype(str) + '_' + test['month'].astype(str) + '_' + test['day'].astype(str)
test['shopping_gap_open'] = test.query('sales_time_open == 1').groupby(['custid', 'day1'])['dt'].diff().dt.total_seconds().fillna(0).div(60).astype(int)
test['shopping_gap_open'] = test['shopping_gap_open'].fillna(0)

In [58]:
# 내점 시 구매 간격
train['shopping_gap'] = train.groupby(['custid', 'day'])['dt'].diff().dt.total_seconds().fillna(0).div(60).astype(int)
train['shopping_gap'] = train['shopping_gap'].fillna(0)

test['shopping_gap'] = test.groupby(['custid', 'day'])['dt'].diff().dt.total_seconds().fillna(0).div(60).astype(int)
test['shopping_gap'] = test['shopping_gap'].fillna(0)

In [59]:
# 무이자 혜택 카드 사용 여부 / 롯데백화점 무이자 조건 참고함.
train['card_friendly'] = train.query('net_amt >= 50000 and inst_mon > 1')['inst_fee'].apply(lambda x : 1 if x == 0 else 0)
test['card_friendly'] = test.query('net_amt >= 50000 and inst_mon > 1')['inst_fee'].apply(lambda x : 1 if x == 0 else 0)
train['card_friendly'] = train['card_friendly'].fillna(0)
test['card_friendly'] = test['card_friendly'].fillna(0)

In [60]:
# 내점시 구매건수 1건
train['1shopping'] = train['shopping_gap'].apply(lambda x : 1 if x == 0 else 0)
test['1shopping'] = test['shopping_gap'].apply(lambda x : 1 if x == 0 else 0)

In [63]:
# 연령대별 브랜드 선호도에 따른 가중치
y = pd.read_csv(os.path.abspath("../input")+'/y_train.csv', encoding='cp949')
df = pd.merge(train, y, on = 'custid')
train['age_group'] = df['age'].apply(lambda x : 'twenty' if (x>=20) & (x<30)
                                     else 'thirty' if (x>=30) & (x<40)
                                     else 'forty' if (x>=40) & (x<50)
                                     else 'fifty' if (x>=50) & (x<60)
                                     else 'sixty')

twenty_prefer_brd = train[train['age_group'] == 'twenty'].brd_nm.value_counts().index[1:].to_list()
thirty_prefer_brd = train[train['age_group'] == 'thirty'].brd_nm.value_counts().index[1:].to_list()
forty_prefer_brd = train[train['age_group'] == 'forty'].brd_nm.value_counts().index[1:].to_list()
fifty_prefer_brd = train[train['age_group'] == 'fifty'].brd_nm.value_counts().index[1:].to_list()
sixty_prefer_brd = train[train['age_group'] == 'sixty'].brd_nm.value_counts().index[1:].to_list()

def prefer_brd(x, list):
    for i in range(len(list)):
        if(x == list[i]):
            return len(list)-i

train['20_weight'] = train['brd_nm'].apply(lambda x: prefer_brd(x, twenty_prefer_brd)).fillna(0)
train['30_weight'] = train['brd_nm'].apply(lambda x: prefer_brd(x, thirty_prefer_brd)).fillna(0)
train['40_weight'] = train['brd_nm'].apply(lambda x: prefer_brd(x, forty_prefer_brd)).fillna(0)
train['50_weight'] = train['brd_nm'].apply(lambda x: prefer_brd(x, fifty_prefer_brd)).fillna(0)
train['60_weight'] = train['brd_nm'].apply(lambda x: prefer_brd(x, sixty_prefer_brd)).fillna(0)

test['20_weight'] = test['brd_nm'].apply(lambda x: prefer_brd(x, twenty_prefer_brd)).fillna(0)
test['30_weight'] = test['brd_nm'].apply(lambda x: prefer_brd(x, thirty_prefer_brd)).fillna(0)
test['40_weight'] = test['brd_nm'].apply(lambda x: prefer_brd(x, forty_prefer_brd)).fillna(0)
test['50_weight'] = test['brd_nm'].apply(lambda x: prefer_brd(x, fifty_prefer_brd)).fillna(0)
test['60_weight'] = test['brd_nm'].apply(lambda x: prefer_brd(x, sixty_prefer_brd)).fillna(0)

train['weight_sum'] = train['20_weight'] + train['30_weight'] + train['40_weight'] + train['50_weight'] + train['60_weight']
test['weight_sum'] = test['20_weight'] + test['30_weight'] + test['40_weight'] + test['50_weight'] + test['60_weight']

In [64]:
agg_dict = {
    # 기존 컬럼
    'custid':['count'],
    'sales_time':['min', 'max', 'std'],
    'goodcd':['nunique'],
    'import_flg':['mean', 'sum', 'nunique'],
    'tot_amt':['min', 'max', 'mean', 'sum', 'std', 'skew'],
    'dis_amt':['min', 'max', 'mean', 'sum', 'std', 'skew'],
    'net_amt':['min', 'max', 'mean', 'sum', 'std', 'skew'],
    'inst_mon':['min', 'max','mean', 'sum', 'std', 'skew'],
    'inst_fee':['mean', 'sum', 'nunique'],
    'sales_dayofweek':['nunique'],
    'brd_nm': [('brd_nm_nunique', lambda x: x.nunique()), 
               ('brd_nm_ratio', lambda x: x.nunique()/x.count())],
   
    # 추가 가공 컬럼
    'tot_amt>=0':['min', 'max', 'mean', 'sum', 'std', 'skew'],
    'dis_amt>=0':['min', 'max', 'mean', 'sum', 'std', 'skew'],
    'net_amt>=0':['min', 'max', 'mean', 'sum', 'std', 'skew'],
    'refund_tot_amt':['min', 'max', 'mean', 'sum', 'std', 'skew'],
    'refund_dis_amt':['min', 'max', 'mean', 'sum', 'std', 'skew'],
    'refund_net_amt':['min', 'max', 'mean', 'sum', 'std', 'skew'],
    'refund_tot_amt_log':['min', 'max', 'mean', 'sum', 'std', 'skew'],
    'refund_dis_amt_log':['min', 'max', 'mean', 'sum', 'std', 'skew'],
    'refund_net_amt_log':['min', 'max', 'mean', 'sum', 'std', 'skew'],
    'tot_amt_log>=0':['min', 'max', 'mean', 'sum', 'std', 'skew'],
    'dis_amt_log>=0':['min', 'max', 'mean', 'sum', 'std', 'skew'],
    'net_amt_log>=0':['min', 'max', 'mean', 'sum', 'std', 'skew'],
    'net_amt_ratio':['min', 'max', 'mean', 'sum', 'std', 'skew'],
    'dis_amt_ratio':['min', 'max', 'mean', 'sum', 'std', 'skew'],
    'dis_net_ratio':['min', 'max', 'mean', 'sum', 'std', 'skew'],
    'sales_time_open':['mean', 'sum'],
    'weekend':['mean', 'sum'],
    'inst':['mean', 'sum'],
    'diff_time':['min', 'max', 'mean', 'sum', 'std', 'skew'],
    'shopping_gap_open':['min', 'max','mean', 'sum', 'std', 'skew'],
    'card_friendly':['mean', 'sum'],
    'shopping_gap':['min', 'max', 'mean', 'sum', 'std', 'skew'],
}

In [67]:
features = []
features_te = []

In [68]:
f = train.groupby('custid').agg(agg_dict)
f = f.fillna(0)
f.columns = [('_').join(column).upper() for column in f.columns.ravel()]

# 카드 무이자 혜택 사용한 고객 비율
f['cf_cc_ratio'] = f['CARD_FRIENDLY_MEAN'] / f['CUSTID_COUNT']

# 구매 비율
f['tot_dis_log_ratio'] = f['DIS_AMT_LOG>=0_SUM'] / f['TOT_AMT_LOG>=0_SUM']
f['tot_net_log_ratio'] = f['NET_AMT_LOG>=0_SUM'] / f['TOT_AMT_LOG>=0_SUM']
f['dis_net_log_ratio'] = f['DIS_AMT_LOG>=0_SUM'] / f['NET_AMT_LOG>=0_SUM']

# 백화점 영업시간에 쇼핑한 비율
f['open_gap_sum_ratio'] = (f['SHOPPING_GAP_OPEN_SUM'] / f['SHOPPING_GAP_SUM']).fillna(0)

f = f.reset_index()
features.append(f); display(f)





f_te = test.groupby('custid').agg(agg_dict)
f_te = f_te.fillna(0)
f_te.columns = [('_').join(column).upper() for column in f_te.columns.ravel()]

f_te['cf_cc_ratio'] = f_te['CARD_FRIENDLY_MEAN'] / f_te['CUSTID_COUNT']

f_te['tot_dis_log_ratio'] = (f_te['DIS_AMT_LOG>=0_SUM'] / f_te['TOT_AMT_LOG>=0_SUM']).fillna(0)
f_te['tot_net_log_ratio'] = (f_te['NET_AMT_LOG>=0_SUM'] / f_te['TOT_AMT_LOG>=0_SUM']).fillna(0)
f_te['dis_net_log_ratio'] = (f_te['DIS_AMT_LOG>=0_SUM'] / f_te['NET_AMT_LOG>=0_SUM']).fillna(0)

f_te['open_gap_sum_ratio'] = (f_te['SHOPPING_GAP_OPEN_SUM'] / f_te['SHOPPING_GAP_SUM']).fillna(0)

f_te = f_te.reset_index()
features_te.append(f_te); display(f_te)

  f.columns = [('_').join(column).upper() for column in f.columns.ravel()]


Unnamed: 0,custid,CUSTID_COUNT,SALES_TIME_MIN,SALES_TIME_MAX,SALES_TIME_STD,GOODCD_NUNIQUE,IMPORT_FLG_MEAN,IMPORT_FLG_SUM,IMPORT_FLG_NUNIQUE,TOT_AMT_MIN,...,SHOPPING_GAP_MAX,SHOPPING_GAP_MEAN,SHOPPING_GAP_SUM,SHOPPING_GAP_STD,SHOPPING_GAP_SKEW,cf_cc_ratio,tot_dis_log_ratio,tot_net_log_ratio,dis_net_log_ratio,open_gap_sum_ratio
0,0,11,1212,1930,266.443786,7,0.636364,7,2,39000,...,48,12.545455,138,18.635255,1.079753,0.082645,0.804362,0.991048,0.811628,1.000000
1,2,11,1050,1920,352.567002,7,0.090909,1,2,-1416000,...,30,4.545455,50,10.357255,2.127388,0.024793,0.623121,0.993722,0.627057,1.000000
2,3,30,1113,1943,274.230199,22,0.000000,0,1,-621000,...,132480,8756.433333,262693,33276.132785,3.660680,0.020000,0.493403,0.996948,0.494913,0.001382
3,4,4,1403,1450,21.087121,4,0.250000,1,2,49000,...,40,11.750000,47,19.120234,1.830569,0.062500,0.372814,0.997857,0.373614,1.000000
4,5,32,1100,1912,314.365407,22,0.187500,6,2,20335,...,261042,17710.375000,566732,64290.471182,3.707887,0.013672,0.556639,0.993894,0.560058,0.000314
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21582,29995,76,1023,1913,231.999093,57,0.105263,8,2,-1206000,...,349870,35142.578947,2670836,82327.740203,2.514594,0.005367,0.611293,0.996477,0.613454,0.000402
21583,29996,19,1240,1912,268.480890,9,0.105263,2,2,-205000,...,40660,2142.736842,40712,9327.382537,4.358897,0.005540,0.601835,0.993031,0.606059,0.001277
21584,29997,20,1250,1730,133.563153,13,0.050000,1,2,9000,...,280,34.800000,696,63.256870,3.352420,0.007500,0.185006,0.997598,0.185451,1.000000
21585,29998,13,1303,1903,182.203007,12,0.000000,0,1,-55000,...,482390,37118.000000,482534,133787.587193,3.605551,0.041420,0.412812,0.997257,0.413947,0.000298


  f_te.columns = [('_').join(column).upper() for column in f_te.columns.ravel()]


Unnamed: 0,custid,CUSTID_COUNT,SALES_TIME_MIN,SALES_TIME_MAX,SALES_TIME_STD,GOODCD_NUNIQUE,IMPORT_FLG_MEAN,IMPORT_FLG_SUM,IMPORT_FLG_NUNIQUE,TOT_AMT_MIN,...,SHOPPING_GAP_MAX,SHOPPING_GAP_MEAN,SHOPPING_GAP_SUM,SHOPPING_GAP_STD,SHOPPING_GAP_SKEW,cf_cc_ratio,tot_dis_log_ratio,tot_net_log_ratio,dis_net_log_ratio,open_gap_sum_ratio
0,30001,27,1643,1910,66.812810,20,0.074074,2,2,-64700,...,84900,3156.555556,85227,16336.599268,5.196145,0.013717,0.398740,0.997672,0.399671,0.003837
1,30002,102,1050,1930,246.124194,48,0.245098,25,2,-410000,...,352607,20241.147059,2064597,67065.780300,3.803796,0.004902,0.529315,0.996861,0.530981,0.000715
2,30003,47,1243,1930,224.386309,28,0.106383,5,2,-138000,...,260930,28833.127660,1355157,65888.446922,2.464504,0.003169,0.301344,0.997593,0.302071,0.000143
3,30005,1,1903,1903,0.000000,1,1.000000,1,1,57000,...,0,0.000000,0,0.000000,0.000000,1.000000,0.726468,0.995316,0.729887,0.000000
4,30007,6,1540,1933,159.075663,6,0.166667,1,2,29000,...,13,3.833333,23,6.013873,1.075087,0.027778,0.483311,0.996820,0.484853,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14375,49988,4,1220,1800,285.116935,4,0.250000,1,2,52000,...,10,5.000000,20,5.773503,0.000000,0.187500,0.741166,0.995568,0.744465,1.000000
14376,49990,1,1503,1503,0.000000,1,1.000000,1,1,213000,...,0,0.000000,0,0.000000,0.000000,0.000000,0.755837,0.995819,0.759011,0.000000
14377,49992,2,1740,1853,79.903066,2,0.500000,1,2,53000,...,0,0.000000,0,0.000000,0.000000,0.000000,0.738496,0.995522,0.741818,0.000000
14378,49993,4,1630,1833,84.472382,3,0.000000,0,1,20000,...,43,30.750000,123,20.548723,-1.971170,0.000000,0.197185,0.998761,0.197430,1.000000


In [69]:
# 계절별 피처 생성
f = train.groupby(['custid', 'sales_season']).agg(agg_dict).unstack()
f = f.fillna(0)
f.columns = ['Season_'+ str(column) for column in f.columns]

f['cf_cc_ratio_spring'] = f["Season_('card_friendly', 'sum', 'SPRING')"] / f["Season_('custid', 'count', 'SPRING')"]
f['tot_dis_ratio_spring'] = (f["Season_('dis_amt>=0', 'sum', 'SPRING')"] / f["Season_('tot_amt>=0', 'sum', 'SPRING')"]).fillna(0)
f['tot_net_ratio_spring'] = (f["Season_('net_amt>=0', 'sum', 'SPRING')"] / f["Season_('tot_amt>=0', 'sum', 'SPRING')"]).fillna(0)
f['dis_net_ratio_spring'] = (f["Season_('dis_amt>=0', 'sum', 'SPRING')"] / f["Season_('net_amt>=0', 'sum', 'SPRING')"]).fillna(0)
f['open_gap_sum_ratio_spring'] = (f["Season_('shopping_gap_open', 'sum', 'SPRING')"] / f["Season_('shopping_gap', 'sum', 'SPRING')"]).fillna(0)

f['cf_cc_ratio_summer'] = f["Season_('card_friendly', 'sum', 'SUMMER')"] / f["Season_('custid', 'count', 'SUMMER')"]
f['tot_dis_ratio_summer'] = (f["Season_('dis_amt>=0', 'sum', 'SUMMER')"] / f["Season_('tot_amt>=0', 'sum', 'SUMMER')"]).fillna(0)
f['tot_net_ratio_summer'] = (f["Season_('net_amt>=0', 'sum', 'SUMMER')"] / f["Season_('tot_amt>=0', 'sum', 'SUMMER')"]).fillna(0)
f['dis_net_ratio_summer'] = (f["Season_('dis_amt>=0', 'sum', 'SUMMER')"] / f["Season_('net_amt>=0', 'sum', 'SUMMER')"]).fillna(0)
f['open_gap_sum_ratio_summer'] = (f["Season_('shopping_gap_open', 'sum', 'SUMMER')"] / f["Season_('shopping_gap', 'sum', 'SUMMER')"]).fillna(0)

f['cf_cc_ratio_fall'] = f["Season_('card_friendly', 'sum', 'FALL')"] / f["Season_('custid', 'count', 'FALL')"]
f['tot_dis_ratio_fall'] = (f["Season_('dis_amt>=0', 'sum', 'FALL')"] / f["Season_('tot_amt>=0', 'sum', 'FALL')"]).fillna(0)
f['tot_net_ratio_fall'] = (f["Season_('net_amt>=0', 'sum', 'FALL')"] / f["Season_('tot_amt>=0', 'sum', 'FALL')"]).fillna(0)
f['dis_net_ratio_fall'] = (f["Season_('dis_amt>=0', 'sum', 'FALL')"] / f["Season_('net_amt>=0', 'sum', 'FALL')"]).fillna(0)
f['open_gap_sum_ratio_fall'] = (f["Season_('shopping_gap_open', 'sum', 'FALL')"] / f["Season_('shopping_gap', 'sum', 'FALL')"]).fillna(0)

f['cf_cc_ratio_winter'] = f["Season_('card_friendly', 'sum', 'WINTER')"] / f["Season_('custid', 'count', 'WINTER')"]
f['tot_dis_ratio_winter'] = (f["Season_('dis_amt>=0', 'sum', 'WINTER')"] / f["Season_('tot_amt>=0', 'sum', 'WINTER')"]).fillna(0)
f['tot_net_ratio_winter'] = (f["Season_('net_amt>=0', 'sum', 'WINTER')"] / f["Season_('tot_amt>=0', 'sum', 'WINTER')"]).fillna(0)
f['dis_net_ratio_winter'] = (f["Season_('dis_amt>=0', 'sum', 'WINTER')"] / f["Season_('net_amt>=0', 'sum', 'WINTER')"]).fillna(0)
f['open_gap_sum_ratio_winter'] = (f["Season_('shopping_gap_open', 'sum', 'WINTER')"] / f["Season_('shopping_gap', 'sum', 'WINTER')"]).fillna(0)


f = f.reset_index()
features.append(f)


f_te = test.groupby(['custid', 'sales_season']).agg(agg_dict).unstack()
f_te = f_te.fillna(0)
f_te.columns = ['Season_'+ str(column) for column in f_te.columns]

f_te['cf_cc_ratio_spring'] = f_te["Season_('card_friendly', 'sum', 'SPRING')"] / f_te["Season_('custid', 'count', 'SPRING')"]
f_te['tot_dis_ratio_spring'] = (f_te["Season_('dis_amt>=0', 'sum', 'SPRING')"] / f_te["Season_('tot_amt>=0', 'sum', 'SPRING')"]).fillna(0)
f_te['tot_net_ratio_spring'] = (f_te["Season_('net_amt>=0', 'sum', 'SPRING')"] / f_te["Season_('tot_amt>=0', 'sum', 'SPRING')"]).fillna(0)
f_te['dis_net_ratio_spring'] = (f_te["Season_('dis_amt>=0', 'sum', 'SPRING')"] / f_te["Season_('net_amt>=0', 'sum', 'SPRING')"]).fillna(0)
f_te['open_gap_sum_ratio_spring'] = (f_te["Season_('shopping_gap_open', 'sum', 'SPRING')"] / f_te["Season_('shopping_gap', 'sum', 'SPRING')"]).fillna(0)

f_te['cf_cc_ratio_summer'] = f_te["Season_('card_friendly', 'sum', 'SUMMER')"] / f_te["Season_('custid', 'count', 'SUMMER')"]
f_te['tot_dis_ratio_summer'] = (f_te["Season_('dis_amt>=0', 'sum', 'SUMMER')"] / f_te["Season_('tot_amt>=0', 'sum', 'SUMMER')"]).fillna(0)
f_te['tot_net_ratio_summer'] = (f_te["Season_('net_amt>=0', 'sum', 'SUMMER')"] / f_te["Season_('tot_amt>=0', 'sum', 'SUMMER')"]).fillna(0)
f_te['dis_net_ratio_summer'] = (f_te["Season_('dis_amt>=0', 'sum', 'SUMMER')"] / f_te["Season_('net_amt>=0', 'sum', 'SUMMER')"]).fillna(0)
f_te['open_gap_sum_ratio_summer'] = (f_te["Season_('shopping_gap_open', 'sum', 'SUMMER')"] / f_te["Season_('shopping_gap', 'sum', 'SUMMER')"]).fillna(0)

f_te['cf_cc_ratio_fall'] = f_te["Season_('card_friendly', 'sum', 'FALL')"] / f_te["Season_('custid', 'count', 'FALL')"]
f_te['tot_dis_ratio_fall'] = (f_te["Season_('dis_amt>=0', 'sum', 'FALL')"] / f_te["Season_('tot_amt>=0', 'sum', 'FALL')"]).fillna(0)
f_te['tot_net_ratio_fall'] = (f_te["Season_('net_amt>=0', 'sum', 'FALL')"] / f_te["Season_('tot_amt>=0', 'sum', 'FALL')"]).fillna(0)
f_te['dis_net_ratio_fall'] = (f_te["Season_('dis_amt>=0', 'sum', 'FALL')"] / f_te["Season_('net_amt>=0', 'sum', 'FALL')"]).fillna(0)
f_te['open_gap_sum_ratio_fall'] = (f_te["Season_('shopping_gap_open', 'sum', 'FALL')"] / f_te["Season_('shopping_gap', 'sum', 'FALL')"]).fillna(0)

f_te['cf_cc_ratio_winter'] = f_te["Season_('card_friendly', 'sum', 'WINTER')"] / f_te["Season_('custid', 'count', 'WINTER')"]
f_te['tot_dis_ratio_winter'] = (f_te["Season_('dis_amt>=0', 'sum', 'WINTER')"] / f_te["Season_('tot_amt>=0', 'sum', 'WINTER')"]).fillna(0)
f_te['tot_net_ratio_winter'] = (f_te["Season_('net_amt>=0', 'sum', 'WINTER')"] / f_te["Season_('tot_amt>=0', 'sum', 'WINTER')"]).fillna(0)
f_te['dis_net_ratio_winter'] = (f_te["Season_('dis_amt>=0', 'sum', 'WINTER')"] / f_te["Season_('net_amt>=0', 'sum', 'WINTER')"]).fillna(0)
f_te['open_gap_sum_ratio_winter'] = (f_te["Season_('shopping_gap_open', 'sum', 'WINTER')"] / f_te["Season_('shopping_gap', 'sum', 'WINTER')"]).fillna(0)

f_te = f_te.reset_index()
features_te.append(f_te);f_te

Unnamed: 0,custid,"Season_('custid', 'count', 'FALL')","Season_('custid', 'count', 'SPRING')","Season_('custid', 'count', 'SUMMER')","Season_('custid', 'count', 'WINTER')","Season_('sales_time', 'min', 'FALL')","Season_('sales_time', 'min', 'SPRING')","Season_('sales_time', 'min', 'SUMMER')","Season_('sales_time', 'min', 'WINTER')","Season_('sales_time', 'max', 'FALL')",...,cf_cc_ratio_fall,tot_dis_ratio_fall,tot_net_ratio_fall,dis_net_ratio_fall,open_gap_sum_ratio_fall,cf_cc_ratio_winter,tot_dis_ratio_winter,tot_net_ratio_winter,dis_net_ratio_winter,open_gap_sum_ratio_winter
0,30001,3.0,9.0,9.0,6.0,1820.0,1653.0,1650.0,1643.0,1910.0,...,0.333333,0.001197,0.998803,0.001198,1.000000,0.166667,0.015864,0.984136,0.016120,1.000000
1,30002,27.0,27.0,26.0,22.0,1050.0,1150.0,1113.0,1120.0,1930.0,...,0.407407,0.041677,0.958323,0.043490,0.000829,0.681818,0.041784,0.958216,0.043606,0.000411
2,30003,24.0,10.0,6.0,7.0,1310.0,1243.0,1510.0,1400.0,1840.0,...,0.083333,0.015545,0.984455,0.015790,0.000170,0.285714,0.000000,1.000000,0.000000,0.000190
3,30005,0.0,0.0,1.0,0.0,0.0,0.0,1903.0,0.0,0.0,...,,0.000000,0.000000,0.000000,0.000000,,0.000000,0.000000,0.000000,0.000000
4,30007,0.0,3.0,0.0,3.0,0.0,1540.0,0.0,1643.0,0.0,...,,0.000000,0.000000,0.000000,0.000000,0.333333,0.027526,0.972474,0.028305,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14375,49988,3.0,0.0,0.0,1.0,1220.0,0.0,0.0,1800.0,1240.0,...,0.666667,0.050000,0.950000,0.052632,1.000000,1.000000,0.050000,0.950000,0.052632,0.000000
14376,49990,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1503.0,0.0,...,,0.000000,0.000000,0.000000,0.000000,0.000000,0.050000,0.950000,0.052632,0.000000
14377,49992,0.0,1.0,1.0,0.0,0.0,1853.0,1740.0,0.0,0.0,...,,0.000000,0.000000,0.000000,0.000000,,0.000000,0.000000,0.000000,0.000000
14378,49993,0.0,0.0,0.0,4.0,0.0,0.0,0.0,1630.0,0.0,...,,0.000000,0.000000,0.000000,0.000000,0.000000,0.024471,0.975529,0.025084,1.000000


In [70]:
# 분기별 피처 생성
f = train.groupby(['custid', 'quarter']).agg(agg_dict).unstack()
f = f.fillna(0)
f.columns = ['Quarter_'+ str(column) for column in f.columns]
f = f.reset_index()
features.append(f)


f_te = test.groupby(['custid', 'quarter']).agg(agg_dict).unstack()
f_te = f_te.fillna(0)
f_te.columns = ['Quarter_'+ str(column) for column in f_te.columns]
f_te = f_te.reset_index()
features_te.append(f_te);f_te

Unnamed: 0,custid,"Quarter_('custid', 'count', 1)","Quarter_('custid', 'count', 2)","Quarter_('custid', 'count', 3)","Quarter_('custid', 'count', 4)","Quarter_('sales_time', 'min', 1)","Quarter_('sales_time', 'min', 2)","Quarter_('sales_time', 'min', 3)","Quarter_('sales_time', 'min', 4)","Quarter_('sales_time', 'max', 1)",...,"Quarter_('shopping_gap', 'sum', 3)","Quarter_('shopping_gap', 'sum', 4)","Quarter_('shopping_gap', 'std', 1)","Quarter_('shopping_gap', 'std', 2)","Quarter_('shopping_gap', 'std', 3)","Quarter_('shopping_gap', 'std', 4)","Quarter_('shopping_gap', 'skew', 1)","Quarter_('shopping_gap', 'skew', 2)","Quarter_('shopping_gap', 'skew', 3)","Quarter_('shopping_gap', 'skew', 4)"
0,30001,4,11,9,0,1720.0,1653.0,1650.0,0.0,1843.0,...,100,0,23.094011,25595.392808,12.692955,0.000000,0.000000,3.316623,0.683052,0.000000
1,30002,22,26,28,22,1120.0,1150.0,1113.0,1050.0,1810.0,...,132973,573720,118543.907638,42557.457483,18364.464293,58791.148200,2.204495,5.099011,4.100932,2.042788
2,30003,9,9,20,5,1510.0,1243.0,1310.0,1403.0,1930.0,...,441873,88187,110949.695811,6.220486,54604.158256,39417.742373,1.348328,1.510651,2.236558,2.236068
3,30005,0,1,0,0,0.0,1903.0,0.0,0.0,0.0,...,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,30007,6,0,0,0,1540.0,0.0,0.0,0.0,1933.0,...,0,0,6.013873,0.000000,0.000000,0.000000,1.075087,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14375,49988,1,0,0,3,1800.0,0.0,0.0,1220.0,1800.0,...,0,20,0.000000,0.000000,0.000000,5.773503,0.000000,0.000000,0.000000,-1.732051
14376,49990,1,0,0,0,1503.0,0.0,0.0,0.0,1503.0,...,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
14377,49992,0,1,1,0,0.0,1853.0,1740.0,0.0,0.0,...,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
14378,49993,4,0,0,0,1630.0,0.0,0.0,0.0,1833.0,...,0,0,20.548723,0.000000,0.000000,0.000000,-1.971170,0.000000,0.000000,0.000000


In [71]:
# month_group별 피처 생성
f = train.groupby(['custid', 'month_group']).agg(agg_dict).unstack()
f = f.fillna(0)
f.columns = ['Month_group_'+ str(column) for column in f.columns]
f = f.reset_index()
features.append(f)

f_te = test.groupby(['custid', 'month_group']).agg(agg_dict).unstack()
f_te = f_te.fillna(0)
f_te.columns = ['Month_group_'+ str(column) for column in f_te.columns]
f_te = f_te.reset_index()
features_te.append(f_te);f_te

Unnamed: 0,custid,"Month_group_('custid', 'count', 'Cho')","Month_group_('custid', 'count', 'Jung')","Month_group_('custid', 'count', 'Mal')","Month_group_('sales_time', 'min', 'Cho')","Month_group_('sales_time', 'min', 'Jung')","Month_group_('sales_time', 'min', 'Mal')","Month_group_('sales_time', 'max', 'Cho')","Month_group_('sales_time', 'max', 'Jung')","Month_group_('sales_time', 'max', 'Mal')",...,"Month_group_('shopping_gap', 'mean', 'Mal')","Month_group_('shopping_gap', 'sum', 'Cho')","Month_group_('shopping_gap', 'sum', 'Jung')","Month_group_('shopping_gap', 'sum', 'Mal')","Month_group_('shopping_gap', 'std', 'Cho')","Month_group_('shopping_gap', 'std', 'Jung')","Month_group_('shopping_gap', 'std', 'Mal')","Month_group_('shopping_gap', 'skew', 'Cho')","Month_group_('shopping_gap', 'skew', 'Jung')","Month_group_('shopping_gap', 'skew', 'Mal')"
0,30001,5,10,11,1703.0,1700.0,1643.0,1910.0,1843.0,1740.0,...,9.727273,50,85070,107,14.142136,26841.770537,12.190906,0.883883,3.162275,1.126517
1,30002,24,58,20,1150.0,1050.0,1223.0,1903.0,1930.0,1903.0,...,35222.200000,261786,1098367,704444,39359.449146,67809.508594,88816.066414,3.853733,4.016644,2.919578
2,30003,29,8,10,1243.0,1403.0,1250.0,1910.0,1923.0,1930.0,...,26062.700000,701350,393180,260627,49403.209240,97277.899977,82404.493861,2.006612,1.936742,3.162278
3,30005,1,0,0,1903.0,0.0,0.0,1903.0,0.0,0.0,...,0.000000,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,30007,0,3,3,0.0,1643.0,1540.0,0.0,1933.0,1843.0,...,4.333333,0,10,13,0.000000,5.773503,7.505553,0.000000,1.732051,1.732051
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14375,49988,4,0,0,1220.0,0.0,0.0,1800.0,0.0,0.0,...,0.000000,20,0,0,5.773503,0.000000,0.000000,0.000000,0.000000,0.000000
14376,49990,0,0,1,0.0,0.0,1503.0,0.0,0.0,1503.0,...,0.000000,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
14377,49992,1,1,0,1740.0,1853.0,0.0,1740.0,1853.0,0.0,...,0.000000,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
14378,49993,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [72]:
# 주말여부별 피처 생성
f = train.groupby(['custid', 'weekend']).agg(agg_dict).unstack()
f = f.fillna(0)
f.columns = ['Weekend_'+ str(column) for column in f.columns]
f = f.reset_index()
features.append(f)

f_te = test.groupby(['custid', 'weekend']).agg(agg_dict).unstack()
f_te = f_te.fillna(0)
f_te.columns = ['Weekend_'+ str(column) for column in f_te.columns]
f_te = f_te.reset_index()
features_te.append(f_te);f_te

Unnamed: 0,custid,"Weekend_('custid', 'count', 0)","Weekend_('sales_time', 'min', 0)","Weekend_('sales_time', 'max', 0)","Weekend_('sales_time', 'std', 0)","Weekend_('goodcd', 'nunique', 0)","Weekend_('import_flg', 'mean', 0)","Weekend_('import_flg', 'sum', 0)","Weekend_('import_flg', 'nunique', 0)","Weekend_('tot_amt', 'min', 0)",...,"Weekend_('shopping_gap_open', 'std', 0)","Weekend_('shopping_gap_open', 'skew', 0)","Weekend_('card_friendly', 'mean', 0)","Weekend_('card_friendly', 'sum', 0)","Weekend_('shopping_gap', 'min', 0)","Weekend_('shopping_gap', 'max', 0)","Weekend_('shopping_gap', 'mean', 0)","Weekend_('shopping_gap', 'sum', 0)","Weekend_('shopping_gap', 'std', 0)","Weekend_('shopping_gap', 'skew', 0)"
0,30001,27,1643,1910,66.812810,20,0.074074,2,2,-64700,...,15.400633,1.093679,0.370370,10.0,0,84900,3156.555556,85227,16336.599268,5.196145
1,30002,102,1050,1930,246.124194,48,0.245098,25,2,-410000,...,33.977775,5.782726,0.500000,51.0,0,352607,20241.147059,2064597,67065.780300,3.803796
2,30003,47,1243,1930,224.386309,28,0.106383,5,2,-138000,...,9.104746,3.298368,0.148936,7.0,0,260930,28833.127660,1355157,65888.446922,2.464504
3,30005,1,1903,1903,0.000000,1,1.000000,1,1,57000,...,0.000000,0.000000,1.000000,1.0,0,0,0.000000,0,0.000000,0.000000
4,30007,6,1540,1933,159.075663,6,0.166667,1,2,29000,...,6.013873,1.075087,0.166667,1.0,0,13,3.833333,23,6.013873,1.075087
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14375,49988,4,1220,1800,285.116935,4,0.250000,1,2,52000,...,5.773503,0.000000,0.750000,3.0,0,10,5.000000,20,5.773503,0.000000
14376,49990,1,1503,1503,0.000000,1,1.000000,1,1,213000,...,0.000000,0.000000,0.000000,0.0,0,0,0.000000,0,0.000000,0.000000
14377,49992,2,1740,1853,79.903066,2,0.500000,1,2,53000,...,0.000000,0.000000,0.000000,0.0,0,0,0.000000,0,0.000000,0.000000
14378,49993,4,1630,1833,84.472382,3,0.000000,0,1,20000,...,20.548723,-1.971170,0.000000,0.0,0,43,30.750000,123,20.548723,-1.971170


In [73]:
# 요일별 피처 생성
f = train.groupby(['custid', 'sales_dayofweek']).agg(agg_dict).unstack()
f = f.fillna(0)
f.columns = ['Weekday_'+ str(column) for column in f.columns]
f = f.reset_index()
features.append(f)

f_te = test.groupby(['custid', 'sales_dayofweek']).agg(agg_dict).unstack()
f_te = f_te.fillna(0)
f_te.columns = ['Weekday_'+ str(column) for column in f_te.columns]
f_te = f_te.reset_index()
features_te.append(f_te);f_te

Unnamed: 0,custid,"Weekday_('custid', 'count', 'FRIDAY')","Weekday_('custid', 'count', 'MONDAY')","Weekday_('custid', 'count', 'SATURDAY')","Weekday_('custid', 'count', 'SUNDAY')","Weekday_('custid', 'count', 'THURSDAY')","Weekday_('custid', 'count', 'TUESDAY')","Weekday_('custid', 'count', 'WEDNESDAY')","Weekday_('sales_time', 'min', 'FRIDAY')","Weekday_('sales_time', 'min', 'MONDAY')",...,"Weekday_('shopping_gap', 'std', 'THURSDAY')","Weekday_('shopping_gap', 'std', 'TUESDAY')","Weekday_('shopping_gap', 'std', 'WEDNESDAY')","Weekday_('shopping_gap', 'skew', 'FRIDAY')","Weekday_('shopping_gap', 'skew', 'MONDAY')","Weekday_('shopping_gap', 'skew', 'SATURDAY')","Weekday_('shopping_gap', 'skew', 'SUNDAY')","Weekday_('shopping_gap', 'skew', 'THURSDAY')","Weekday_('shopping_gap', 'skew', 'TUESDAY')","Weekday_('shopping_gap', 'skew', 'WEDNESDAY')"
0,30001,0.0,3.0,5.0,15.0,0.0,3.0,1.0,0.0,1700.0,...,0.000000,49008.377855,0.000000,0.000000,1.457863,0.608581,0.821782,0.000000,1.732051,0.000000
1,30002,23.0,4.0,26.0,4.0,8.0,19.0,18.0,1113.0,1520.0,...,8.626165,93103.010159,72953.880702,4.397775,2.000000,2.642911,0.822699,0.278841,2.905287,4.242636
2,30003,19.0,3.0,10.0,0.0,6.0,5.0,4.0,1250.0,1243.0,...,134653.733376,4.774935,44068.334342,2.141352,1.732051,1.778927,0.000000,0.968249,0.827604,2.000000
3,30005,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,30007,2.0,0.0,1.0,3.0,0.0,0.0,0.0,1923.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.732051,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14375,49988,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.000000,5.773503,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
14376,49990,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
14377,49992,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
14378,49993,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.000000,0.000000,20.548723,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-1.971170


In [74]:
# sales_time_open 별 피처
f = train.groupby(['custid', 'sales_time_open']).agg(agg_dict).unstack()
f = f.fillna(0)
f.columns = ['Sales_time_open_'+ str(column) for column in f.columns]
f = f.reset_index()
features.append(f)

f_te = test.groupby(['custid', 'sales_time_open']).agg(agg_dict).unstack()
f_te = f_te.fillna(0)
f_te.columns = ['Sales_time_open_'+ str(column) for column in f_te.columns]
f_te = f_te.reset_index()
features_te.append(f_te);f_te

Unnamed: 0,custid,"Sales_time_open_('custid', 'count', 0)","Sales_time_open_('custid', 'count', 1)","Sales_time_open_('sales_time', 'min', 0)","Sales_time_open_('sales_time', 'min', 1)","Sales_time_open_('sales_time', 'max', 0)","Sales_time_open_('sales_time', 'max', 1)","Sales_time_open_('sales_time', 'std', 0)","Sales_time_open_('sales_time', 'std', 1)","Sales_time_open_('goodcd', 'nunique', 0)",...,"Sales_time_open_('shopping_gap', 'max', 0)","Sales_time_open_('shopping_gap', 'max', 1)","Sales_time_open_('shopping_gap', 'mean', 0)","Sales_time_open_('shopping_gap', 'mean', 1)","Sales_time_open_('shopping_gap', 'sum', 0)","Sales_time_open_('shopping_gap', 'sum', 1)","Sales_time_open_('shopping_gap', 'std', 0)","Sales_time_open_('shopping_gap', 'std', 1)","Sales_time_open_('shopping_gap', 'skew', 0)","Sales_time_open_('shopping_gap', 'skew', 1)"
0,30001,0.0,27.0,0.0,1643.0,0.0,1910.0,0.0,66.812810,0.0,...,0.0,84900.0,0.0,3156.555556,0.0,85227.0,0.0,16336.599268,0.0,5.196145
1,30002,0.0,102.0,0.0,1050.0,0.0,1930.0,0.0,246.124194,0.0,...,0.0,352607.0,0.0,20241.147059,0.0,2064597.0,0.0,67065.780300,0.0,3.803796
2,30003,0.0,47.0,0.0,1243.0,0.0,1930.0,0.0,224.386309,0.0,...,0.0,260930.0,0.0,28833.127660,0.0,1355157.0,0.0,65888.446922,0.0,2.464504
3,30005,0.0,1.0,0.0,1903.0,0.0,1903.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000
4,30007,0.0,6.0,0.0,1540.0,0.0,1933.0,0.0,159.075663,0.0,...,0.0,13.0,0.0,3.833333,0.0,23.0,0.0,6.013873,0.0,1.075087
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14375,49988,0.0,4.0,0.0,1220.0,0.0,1800.0,0.0,285.116935,0.0,...,0.0,10.0,0.0,5.000000,0.0,20.0,0.0,5.773503,0.0,0.000000
14376,49990,0.0,1.0,0.0,1503.0,0.0,1503.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000
14377,49992,0.0,2.0,0.0,1740.0,0.0,1853.0,0.0,79.903066,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000
14378,49993,0.0,4.0,0.0,1630.0,0.0,1833.0,0.0,84.472382,0.0,...,0.0,43.0,0.0,30.750000,0.0,123.0,0.0,20.548723,0.0,-1.971170


In [75]:
# str_nm별 피처
f = train.groupby(['custid', 'str_nm']).agg(agg_dict).unstack()
f = f.fillna(0)
f.columns = ['Str_nm_'+ str(column) for column in f.columns]
f = f.reset_index()
features.append(f)

f_te = test.groupby(['custid', 'str_nm']).agg(agg_dict).unstack()
f_te = f_te.fillna(0)
f_te.columns = ['Str_nm_'+ str(column) for column in f_te.columns]
f_te = f_te.reset_index()
features_te.append(f_te);f_te

Unnamed: 0,custid,"Str_nm_('custid', 'count', 'CHUNHO_BRANCH')","Str_nm_('custid', 'count', 'HEAD_OFFICE')","Str_nm_('custid', 'count', 'MUYEOK_BRANCH')","Str_nm_('custid', 'count', 'SHINCHON_BRANCH')","Str_nm_('sales_time', 'min', 'CHUNHO_BRANCH')","Str_nm_('sales_time', 'min', 'HEAD_OFFICE')","Str_nm_('sales_time', 'min', 'MUYEOK_BRANCH')","Str_nm_('sales_time', 'min', 'SHINCHON_BRANCH')","Str_nm_('sales_time', 'max', 'CHUNHO_BRANCH')",...,"Str_nm_('shopping_gap', 'sum', 'MUYEOK_BRANCH')","Str_nm_('shopping_gap', 'sum', 'SHINCHON_BRANCH')","Str_nm_('shopping_gap', 'std', 'CHUNHO_BRANCH')","Str_nm_('shopping_gap', 'std', 'HEAD_OFFICE')","Str_nm_('shopping_gap', 'std', 'MUYEOK_BRANCH')","Str_nm_('shopping_gap', 'std', 'SHINCHON_BRANCH')","Str_nm_('shopping_gap', 'skew', 'CHUNHO_BRANCH')","Str_nm_('shopping_gap', 'skew', 'HEAD_OFFICE')","Str_nm_('shopping_gap', 'skew', 'MUYEOK_BRANCH')","Str_nm_('shopping_gap', 'skew', 'SHINCHON_BRANCH')"
0,30001,0.0,24.0,3.0,0.0,0.0,1650.0,1643.0,0.0,0.0,...,50.0,0.0,0.0,17327.688357,16.502525,0.000000,0.0,4.898973,-0.090858,0.000000
1,30002,0.0,39.0,63.0,0.0,0.0,1120.0,1050.0,0.0,0.0,...,703516.0,0.0,0.0,85123.862341,51644.194353,0.000000,0.0,2.731583,5.465295,0.000000
2,30003,0.0,0.0,0.0,47.0,0.0,0.0,0.0,1243.0,0.0,...,0.0,1355157.0,0.0,0.000000,0.000000,65888.446922,0.0,0.000000,0.000000,2.464504
3,30005,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1903.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
4,30007,0.0,0.0,3.0,3.0,0.0,0.0,1540.0,1643.0,0.0,...,10.0,13.0,0.0,0.000000,5.773503,7.505553,0.0,0.000000,1.732051,1.732051
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14375,49988,0.0,4.0,0.0,0.0,0.0,1220.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.773503,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
14376,49990,0.0,0.0,1.0,0.0,0.0,0.0,1503.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
14377,49992,0.0,2.0,0.0,0.0,0.0,1740.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
14378,49993,0.0,0.0,0.0,4.0,0.0,0.0,0.0,1630.0,0.0,...,0.0,123.0,0.0,0.000000,0.000000,20.548723,0.0,0.000000,0.000000,-1.971170


In [76]:
# 할부비율별피쳐
f = train.groupby(['custid', 'inst_mon']).agg(agg_dict).unstack()
f = f.fillna(0)
f.columns = ['Inst_mon_'+ str(column) for column in f.columns]
f = f.reset_index()
features.append(f)

f_te = test.groupby(['custid', 'inst_mon']).agg(agg_dict).unstack()
f_te = f_te.fillna(0)
f_te.columns = ['Inst_mon_'+ str(column) for column in f_te.columns]
f_te = f_te.reset_index()
features_te.append(f_te);f_te

Unnamed: 0,custid,"Inst_mon_('custid', 'count', 1)","Inst_mon_('custid', 'count', 2)","Inst_mon_('custid', 'count', 3)","Inst_mon_('custid', 'count', 4)","Inst_mon_('custid', 'count', 5)","Inst_mon_('custid', 'count', 6)","Inst_mon_('custid', 'count', 7)","Inst_mon_('custid', 'count', 8)","Inst_mon_('custid', 'count', 9)",...,"Inst_mon_('shopping_gap', 'skew', 3)","Inst_mon_('shopping_gap', 'skew', 4)","Inst_mon_('shopping_gap', 'skew', 5)","Inst_mon_('shopping_gap', 'skew', 6)","Inst_mon_('shopping_gap', 'skew', 7)","Inst_mon_('shopping_gap', 'skew', 8)","Inst_mon_('shopping_gap', 'skew', 9)","Inst_mon_('shopping_gap', 'skew', 10)","Inst_mon_('shopping_gap', 'skew', 11)","Inst_mon_('shopping_gap', 'skew', 12)"
0,30001,17.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.802818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,30002,48.0,0.0,54.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.302450,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,30003,39.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.865544,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,30005,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,30007,5.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14375,49988,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14376,49990,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14377,49992,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14378,49993,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [77]:
# time_split 별 피처 생성
f = train.groupby(['custid', 'time_split']).agg(agg_dict).unstack()
f = f.fillna(0)
f.columns = ['Time_split_'+ str(column) for column in f.columns]
f = f.reset_index()
features.append(f)

f_te = test.groupby(['custid', 'time_split']).agg(agg_dict).unstack()
f_te = f_te.fillna(0)
f_te.columns = ['Time_split_'+ str(column) for column in f_te.columns]
f_te = f_te.reset_index()
features_te.append(f_te);f_te

Unnamed: 0,custid,"Time_split_('custid', 'count', 'AFTERNOON')","Time_split_('custid', 'count', 'LUNCHTIME')","Time_split_('custid', 'count', 'MORNINGTIME')","Time_split_('custid', 'count', 'RESTTIME')","Time_split_('sales_time', 'min', 'AFTERNOON')","Time_split_('sales_time', 'min', 'LUNCHTIME')","Time_split_('sales_time', 'min', 'MORNINGTIME')","Time_split_('sales_time', 'min', 'RESTTIME')","Time_split_('sales_time', 'max', 'AFTERNOON')",...,"Time_split_('shopping_gap', 'sum', 'MORNINGTIME')","Time_split_('shopping_gap', 'sum', 'RESTTIME')","Time_split_('shopping_gap', 'std', 'AFTERNOON')","Time_split_('shopping_gap', 'std', 'LUNCHTIME')","Time_split_('shopping_gap', 'std', 'MORNINGTIME')","Time_split_('shopping_gap', 'std', 'RESTTIME')","Time_split_('shopping_gap', 'skew', 'AFTERNOON')","Time_split_('shopping_gap', 'skew', 'LUNCHTIME')","Time_split_('shopping_gap', 'skew', 'MORNINGTIME')","Time_split_('shopping_gap', 'skew', 'RESTTIME')"
0,30001,26.0,0.0,0.0,1.0,1643.0,0.0,0.0,1910.0,1850.0,...,0.0,20.0,16647.858724,0.000000,0.000000,0.000000,5.099012,0.000000,0.0,0.00000
1,30002,64.0,30.0,4.0,4.0,1303.0,1133.0,1050.0,1903.0,1900.0,...,309554.0,60.0,66275.440738,55904.297443,154772.333369,5.773503,4.418212,2.726686,2.0,0.00000
2,30003,37.0,4.0,0.0,6.0,1310.0,1243.0,0.0,1903.0,1853.0,...,0.0,260974.0,61988.130826,5.000000,0.000000,106520.634175,2.275048,2.000000,0.0,2.44949
3,30005,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1903.0,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000
4,30007,4.0,0.0,0.0,2.0,1540.0,0.0,0.0,1923.0,1843.0,...,0.0,10.0,6.500000,0.000000,0.000000,7.071068,2.000000,0.000000,0.0,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14375,49988,1.0,3.0,0.0,0.0,1800.0,1220.0,0.0,0.0,1800.0,...,0.0,0.0,0.000000,5.773503,0.000000,0.000000,0.000000,-1.732051,0.0,0.00000
14376,49990,1.0,0.0,0.0,0.0,1503.0,0.0,0.0,0.0,1503.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000
14377,49992,2.0,0.0,0.0,0.0,1740.0,0.0,0.0,0.0,1853.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000
14378,49993,4.0,0.0,0.0,0.0,1630.0,0.0,0.0,0.0,1833.0,...,0.0,0.0,20.548723,0.000000,0.000000,0.000000,-1.971170,0.000000,0.0,0.00000


In [78]:
# 요일별 비율
f = train.groupby('custid')['sales_dayofweek'].agg([
    ('Weekday_mon_ratio', lambda x: np.mean(x.isin(['MONDAY']))),
    ('Weekday_tue_ratio', lambda x: np.mean(x.isin(['TUESDAY']))),
    ('Weekday_wed_ratio', lambda x: np.mean(x.isin(['WEDNESDAY']))),
    ('Weekday_thu_ratio', lambda x: np.mean(x.isin(['THURSDAY']))),
    ('Weekday_fri_ratio', lambda x: np.mean(x.isin(['FRIDAY']))),
    ('Weekday_sat_ratio', lambda x: np.mean(x.isin(['SATURDAY']))),
    ('Weekday_sun_ratio', lambda x: np.mean(x.isin(['SUNDAY'])))
]).reset_index()
features.append(f)

f_te = test.groupby('custid')['sales_dayofweek'].agg([
    ('Weekday_mon_ratio', lambda x: np.mean(x.isin(['MONDAY']))),
    ('Weekday_tue_ratio', lambda x: np.mean(x.isin(['TUESDAY']))),
    ('Weekday_wed_ratio', lambda x: np.mean(x.isin(['WEDNESDAY']))),
    ('Weekday_thu_ratio', lambda x: np.mean(x.isin(['THURSDAY']))),
    ('Weekday_fri_ratio', lambda x: np.mean(x.isin(['FRIDAY']))),
    ('Weekday_sat_ratio', lambda x: np.mean(x.isin(['SATURDAY']))),
    ('Weekday_sun_ratio', lambda x: np.mean(x.isin(['SUNDAY'])))
]).reset_index()
features_te.append(f_te); f_te

Unnamed: 0,custid,Weekday_mon_ratio,Weekday_tue_ratio,Weekday_wed_ratio,Weekday_thu_ratio,Weekday_fri_ratio,Weekday_sat_ratio,Weekday_sun_ratio
0,30001,0.111111,0.111111,0.037037,0.000000,0.000000,0.185185,0.555556
1,30002,0.039216,0.186275,0.176471,0.078431,0.225490,0.254902,0.039216
2,30003,0.063830,0.106383,0.085106,0.127660,0.404255,0.212766,0.000000
3,30005,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000
4,30007,0.000000,0.000000,0.000000,0.000000,0.333333,0.166667,0.500000
...,...,...,...,...,...,...,...,...
14375,49988,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000
14376,49990,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000
14377,49992,0.000000,0.000000,0.000000,0.500000,0.000000,0.000000,0.500000
14378,49993,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000


In [79]:
# month_group별 비율
f = train.groupby('custid')['month_group'].agg([
    ('Month_Cho_ratio', lambda x: np.mean(x.isin(['Cho']))),
    ('Month_Jung_ratio', lambda x: np.mean(x.isin(['Jung']))),
    ('Month_Mal_ratio', lambda x: np.mean(x.isin(['Mal'])))
]).reset_index()
features.append(f)

f_te = test.groupby('custid')['month_group'].agg([
    ('Month_Cho_ratio', lambda x: np.mean(x.isin(['Cho']))),
    ('Month_Jung_ratio', lambda x: np.mean(x.isin(['Jung']))),
    ('Month_Mal_ratio', lambda x: np.mean(x.isin(['Mal'])))
]).reset_index()
features_te.append(f_te); f_te

Unnamed: 0,custid,Month_Cho_ratio,Month_Jung_ratio,Month_Mal_ratio
0,30001,0.185185,0.370370,0.407407
1,30002,0.235294,0.568627,0.196078
2,30003,0.617021,0.170213,0.212766
3,30005,1.000000,0.000000,0.000000
4,30007,0.000000,0.500000,0.500000
...,...,...,...,...
14375,49988,1.000000,0.000000,0.000000
14376,49990,0.000000,0.000000,1.000000
14377,49992,0.500000,0.500000,0.000000
14378,49993,0.000000,0.000000,0.000000


In [80]:
# str_nm별 비율
f = train.groupby('custid')['str_nm'].agg([
    ('Str_nm_mu_ratio', lambda x: np.mean(x.isin(['MUYEOK_BRANCH']))),
    ('Str_nm_shin_ratio', lambda x: np.mean(x.isin(['SHINCHON_BRANCH']))),
    ('Str_nm_bon_ratio', lambda x: np.mean(x.isin(['HEAD_OFFICE']))),
    ('Str_nm_chun_ratio', lambda x: np.mean(x.isin(['CHUNHO_BRACH'])))
]).reset_index()
features.append(f)

f_te = test.groupby('custid')['str_nm'].agg([
    ('Str_nm_mu_ratio', lambda x: np.mean(x.isin(['MUYEOK_BRANCH']))),
    ('Str_nm_shin_ratio', lambda x: np.mean(x.isin(['SHINCHON_BRANCH']))),
    ('Str_nm_bon_ratio', lambda x: np.mean(x.isin(['HEAD_OFFICE']))),
    ('Str_nm_chun_ratio', lambda x: np.mean(x.isin(['CHUNHO_BRACH'])))
]).reset_index()
features_te.append(f_te); f_te

Unnamed: 0,custid,Str_nm_mu_ratio,Str_nm_shin_ratio,Str_nm_bon_ratio,Str_nm_chun_ratio
0,30001,0.111111,0.0,0.888889,0.0
1,30002,0.617647,0.0,0.382353,0.0
2,30003,0.000000,1.0,0.000000,0.0
3,30005,0.000000,1.0,0.000000,0.0
4,30007,0.500000,0.5,0.000000,0.0
...,...,...,...,...,...
14375,49988,0.000000,0.0,1.000000,0.0
14376,49990,1.000000,0.0,0.000000,0.0
14377,49992,0.000000,0.0,1.000000,0.0
14378,49993,0.000000,1.0,0.000000,0.0


In [81]:
# year별 비율
f = train.groupby('custid')['year'].agg([
    ('2017_ratio', lambda x: np.mean(x.isin([2017]))),
    ('2018_ratio', lambda x: np.mean(x.isin([2018]))),
]).reset_index()
features.append(f)

f_te = test.groupby('custid')['year'].agg([
    ('2017_ratio', lambda x: np.mean(x.isin([2017]))),
    ('2018_ratio', lambda x: np.mean(x.isin([2018]))),
]).reset_index()
features_te.append(f_te); f_te

Unnamed: 0,custid,2017_ratio,2018_ratio
0,30001,0.740741,0.259259
1,30002,0.647059,0.352941
2,30003,0.808511,0.191489
3,30005,1.000000,0.000000
4,30007,0.000000,1.000000
...,...,...,...
14375,49988,0.750000,0.250000
14376,49990,0.000000,1.000000
14377,49992,0.500000,0.500000
14378,49993,0.000000,1.000000


In [82]:
# time_split별 비율
f = train.groupby('custid')['time_split'].agg([
    ('morningtime_ratio', lambda x: np.mean(x.isin(['MORNINGTIME']))),
    ('lunchtime_ratio', lambda x: np.mean(x.isin(['LUNCHTIME']))),
    ('afternoon_ratio', lambda x: np.mean(x.isin(['AFTERNOON']))),
    ('resttime_ratio', lambda x: np.mean(x.isin(['RESTTIME']))),
]).reset_index()
features.append(f)

f_te = test.groupby('custid')['time_split'].agg([
    ('morningtime_ratio', lambda x: np.mean(x.isin(['MORNINGTIME']))),
    ('lunchtime_ratio', lambda x: np.mean(x.isin(['LUNCHTIME']))),
    ('afternoon_ratio', lambda x: np.mean(x.isin(['AFTERNOON']))),
    ('resttime_ratio', lambda x: np.mean(x.isin(['RESTTIME']))),
]).reset_index()
features_te.append(f_te); f_te

Unnamed: 0,custid,morningtime_ratio,lunchtime_ratio,afternoon_ratio,resttime_ratio
0,30001,0.000000,0.000000,0.962963,0.037037
1,30002,0.039216,0.294118,0.627451,0.039216
2,30003,0.000000,0.085106,0.787234,0.127660
3,30005,0.000000,0.000000,0.000000,1.000000
4,30007,0.000000,0.000000,0.666667,0.333333
...,...,...,...,...,...
14375,49988,0.000000,0.750000,0.250000,0.000000
14376,49990,0.000000,0.000000,1.000000,0.000000
14377,49992,0.000000,0.000000,1.000000,0.000000
14378,49993,0.000000,0.000000,1.000000,0.000000


In [84]:
# 내점별 구매건수
train['day1'] = pd.to_datetime(train['day1'], format = '%Y_%m_%d')
f = train.groupby(['custid', pd.Grouper(key = 'day1', freq='1d')])['month'].agg(['count'])
f = f.groupby('custid')['count'].agg([('per_visit_cmean', 'mean'),
                                     ('per_visit_cmax', 'max'),
                                     ('per_visit_cstd', 'std')])
f = f.fillna(0)
features.append(f)

test['day1'] = pd.to_datetime(test['day1'], format = '%Y_%m_%d')
f_te = test.groupby(['custid', pd.Grouper(key = 'day1', freq='1d')])['month'].agg(['count'])
f_te = f_te.groupby('custid')['count'].agg([('per_visit_cmean', 'mean'),
                                     ('per_visit_cmax', 'max'),
                                     ('per_visit_cstd', 'std')])
f_te = f_te.fillna(0)
features_te.append(f_te); f_te

Unnamed: 0_level_0,per_visit_cmean,per_visit_cmax,per_visit_cstd
custid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
30001,2.700000,6,1.418136
30002,3.517241,11,2.429296
30003,1.880000,7,1.536229
30005,1.000000,1,0.000000
30007,1.500000,2,0.577350
...,...,...,...
49988,2.000000,3,1.414214
49990,1.000000,1,0.000000
49992,1.000000,1,0.000000
49993,4.000000,4,0.000000


In [85]:
# 고객별 내점 1번당 1건 구매한 횟수
a = train.groupby(['custid', 'day1'])['shopping_gap'].sum()
a = pd.DataFrame(a)
a['1shopping'] = a['shopping_gap'].apply(lambda x : 1 if x==0 else 0)
f = a.groupby('custid')['1shopping'].sum()

f = f.fillna(0)
features.append(f)

a = test.groupby(['custid', 'day1'])['shopping_gap'].sum()
a = pd.DataFrame(a)
a['1shopping'] = a['shopping_gap'].apply(lambda x : 1 if x==0 else 0)
f_te = a.groupby('custid')['1shopping'].sum()

f_te = f_te.fillna(0)
features_te.append(f_te);f_te

custid
30001     3
30002     3
30003    10
30005     1
30007     2
         ..
49988     1
49990     1
49992     2
49993     0
49994     1
Name: 1shopping, Length: 14380, dtype: int64

In [86]:
# 최애브랜드 사용횟수
def g(x):
    y = x.value_counts()
    y = y.iloc[0]
    return y

f = train.groupby('custid')['brd_nm'].agg([('love_brd_count', g)]).reset_index()
f = f.fillna(0)
features.append(f)

f_te = test.groupby('custid')['brd_nm'].agg([('love_brd_count', g)]).reset_index()
f_te = f_te.fillna(0);f_te
features_te.append(f_te); f_te

Unnamed: 0,custid,love_brd_count
0,30001,4
1,30002,14
2,30003,6
3,30005,1
4,30007,1
...,...,...
14375,49988,1
14376,49990,1
14377,49992,1
14378,49993,2


In [87]:
# 고객별 구매가 이루어진 날에 구매를 한 브랜드에서 환불이 일어난 총횟수
f = train.groupby(['custid', 'day1', 'brd_nm'])['refund'].agg([('Y/N', 'sum')]).groupby('custid')['Y/N'].agg([('same_brd_refund', 'sum')])
f = f.fillna(0)
features.append(f)

f_te = test.groupby(['custid', 'day1', 'brd_nm'])['refund'].agg([('Y/N', 'sum')]).groupby('custid')['Y/N'].agg([('same_brd_refund', 'sum')])
f_te = f_te.fillna(0)
features_te.append(f_te); f_te

Unnamed: 0_level_0,same_brd_refund
custid,Unnamed: 1_level_1
30001,1
30002,8
30003,4
30005,0
30007,0
...,...
49988,0
49990,0
49992,0
49993,0


In [88]:
# 연령대별 브랜드 선호도에 따른 가중치 비율
f = train.groupby('custid')['20_weight', '30_weight', '40_weight', '50_weight', '60_weight', 'weight_sum'].sum()

f['20_weight_ratio'] = f['20_weight'] / f['weight_sum']
f['30_weight_ratio'] = f['30_weight'] / f['weight_sum'] 
f['40_weight_ratio'] = f['40_weight'] / f['weight_sum'] 
f['50_weight_ratio'] = f['50_weight'] / f['weight_sum'] 
f['60_weight_ratio'] = f['60_weight'] / f['weight_sum'] 

f = f.fillna(0)
features.append(f)

f_te = test.groupby('custid')['20_weight', '30_weight', '40_weight', '50_weight', '60_weight', 'weight_sum'].sum()

f_te['20_weight_ratio'] = f_te['20_weight'] / f_te['weight_sum']
f_te['30_weight_ratio'] = f_te['30_weight'] / f_te['weight_sum'] 
f_te['40_weight_ratio'] = f_te['40_weight'] / f_te['weight_sum'] 
f_te['50_weight_ratio'] = f_te['50_weight'] / f_te['weight_sum'] 
f_te['60_weight_ratio'] = f_te['60_weight'] / f_te['weight_sum'] 

f_te = f_te.fillna(0)
features_te.append(f_te); f_te

  f = train.groupby('custid')['20_weight', '30_weight', '40_weight', '50_weight', '60_weight', 'weight_sum'].sum()
  f_te = test.groupby('custid')['20_weight', '30_weight', '40_weight', '50_weight', '60_weight', 'weight_sum'].sum()


Unnamed: 0_level_0,20_weight,30_weight,40_weight,50_weight,60_weight,weight_sum,20_weight_ratio,30_weight_ratio,40_weight_ratio,50_weight_ratio,60_weight_ratio
custid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
30001,30038.0,33371.0,31011.0,30122.0,21971.0,146513.0,0.205019,0.227768,0.211660,0.205593,0.149959
30002,136159.0,151101.0,148895.0,130531.0,98467.0,665153.0,0.204703,0.227167,0.223851,0.196242,0.148037
30003,58428.0,60961.0,56333.0,49867.0,32837.0,258426.0,0.226092,0.235893,0.217985,0.192964,0.127065
30005,1579.0,1699.0,1645.0,1492.0,1168.0,7583.0,0.208229,0.224054,0.216933,0.196756,0.154029
30007,9201.0,9656.0,9429.0,8212.0,6044.0,42542.0,0.216280,0.226976,0.221640,0.193033,0.142071
...,...,...,...,...,...,...,...,...,...,...,...
49988,4733.0,6048.0,5576.0,4278.0,3421.0,24056.0,0.196749,0.251413,0.231792,0.177835,0.142210
49990,1582.0,1706.0,1653.0,1500.0,1198.0,7639.0,0.207095,0.223328,0.216390,0.196361,0.156827
49992,2679.0,2985.0,2910.0,2724.0,2116.0,13414.0,0.199717,0.222529,0.216938,0.203071,0.157746
49993,2543.0,3273.0,2908.0,2174.0,1491.0,12389.0,0.205263,0.264186,0.234724,0.175478,0.120349


In [89]:
# str_nm별 브랜드 구매 정보들
a = set(train['brd_nm'].unique()) - set(test['brd_nm'].unique())
b = set(test['brd_nm'].unique()) - set(train['brd_nm'].unique())
train['brd_nm_share'] = train['brd_nm'].apply(lambda x : 'etc' if x in a else x)
test['brd_nm_share'] = test['brd_nm'].apply(lambda x : 'etc' if x in b else x)

f = train.groupby(['custid', 'str_nm'])['brd_nm_share'].value_counts().unstack().unstack()
f = f.fillna(0)
f.columns = ['Str_nm_Brd_nm_'+ str(column) for column in f.columns];
f = f.reset_index()
features.append(f)

f_te = test.groupby(['custid', 'str_nm'])['brd_nm_share'].value_counts().unstack().unstack()
f_te = f_te.fillna(0)
f_te.columns = ['Str_nm_Brd_nm_'+ str(column) for column in f_te.columns];
f_te = f_te.reset_index()
features_te.append(f_te);f_te

Unnamed: 0,custid,"Str_nm_Brd_nm_('012베네통', 'CHUNHO_BRANCH')","Str_nm_Brd_nm_('012베네통', 'HEAD_OFFICE')","Str_nm_Brd_nm_('012베네통', 'MUYEOK_BRANCH')","Str_nm_Brd_nm_('012베네통', 'SHINCHON_BRANCH')","Str_nm_Brd_nm_('1492', 'CHUNHO_BRANCH')","Str_nm_Brd_nm_('1492', 'HEAD_OFFICE')","Str_nm_Brd_nm_('1492', 'MUYEOK_BRANCH')","Str_nm_Brd_nm_('1492', 'SHINCHON_BRANCH')","Str_nm_Brd_nm_('1492마일즈', 'CHUNHO_BRANCH')",...,"Str_nm_Brd_nm_('휴먼앤휴먼', 'MUYEOK_BRANCH')","Str_nm_Brd_nm_('휴먼앤휴먼', 'SHINCHON_BRANCH')","Str_nm_Brd_nm_('흙침대', 'CHUNHO_BRANCH')","Str_nm_Brd_nm_('흙침대', 'HEAD_OFFICE')","Str_nm_Brd_nm_('흙침대', 'MUYEOK_BRANCH')","Str_nm_Brd_nm_('흙침대', 'SHINCHON_BRANCH')","Str_nm_Brd_nm_('희원상사', 'CHUNHO_BRANCH')","Str_nm_Brd_nm_('희원상사', 'HEAD_OFFICE')","Str_nm_Brd_nm_('희원상사', 'MUYEOK_BRANCH')","Str_nm_Brd_nm_('희원상사', 'SHINCHON_BRANCH')"
0,30001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,30002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,30003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,30005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,30007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14375,49988,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14376,49990,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14377,49992,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14378,49993,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


* word2vec

In [90]:
# 구매 정보 관련
train['customer_info'] = train['brd_nm'].astype(str) + '_' + train['corner_nm'].astype(str) + '_' + train['pc_nm'].astype(str) + '_' + train['part_nm'].astype(str)
test['customer_info'] = test['brd_nm'].astype(str) + '_' + test['corner_nm'].astype(str) + '_' + test['pc_nm'].astype(str) + '_' + test['part_nm'].astype(str)

train_data = list(train.groupby('custid')['customer_info'].unique())
test_data = list(test.groupby('custid')['customer_info'].unique())

In [91]:
import random
def oversample(x, n):
    lst = []
    for i in x:
        tmp = []
        for j in range(n):
            random.shuffle(i)
            tmp += list(i)
            lst.append(tmp)
    return lst

In [93]:
from gensim.models import word2vec
w2v_input = oversample(train_data, 10)
w2v = word2vec.Word2Vec(sentences = w2v_input, vector_size = 100, window = 5, min_count = 1, sg = 1)

In [95]:
from tqdm import tqdm
train_mean_vector = []
for words in tqdm(train_data):
    tmp = np.zeros(100)
    cnt = 0
    for word in words:
        try:
            tmp += w2v[word]
            cnt += 1
        except:
            pass
    tmp /= cnt
    train_mean_vector.append(tmp)
train_mean_vector = np.array(train_mean_vector)

  tmp /= cnt
100%|█████████████████████████████████████████████████████████████████████████| 21587/21587 [00:00<00:00, 60627.54it/s]


In [96]:
test_mean_vector = []
for words in tqdm(test_data):
    tmp = np.zeros(100)
    cnt = 0
    for word in words:
        try:
            tmp += w2v[word]
            cnt += 1
        except:
            pass
    tmp /= cnt
    test_mean_vector.append(tmp)
test_mean_vector = np.array(test_mean_vector)

  tmp /= cnt
100%|█████████████████████████████████████████████████████████████████████████| 14380/14380 [00:00<00:00, 59540.66it/s]


In [97]:
# 구매시간 정보
train['customer_info_time'] = train['str_nm'].astype(str) + '_' + train['sales_dayofweek'].astype(str) + '_' + train['sales_season'].astype(str) + '_' + train['time_split'].astype(str)
test['customer_info_time'] = test['str_nm'].astype(str) + '_' + test['sales_dayofweek'].astype(str) + '_' + test['sales_season'].astype(str) + '_' + test['time_split'].astype(str)

train_data = list(train.groupby('custid')['customer_info_time'].unique())
test_data = list(test.groupby('custid')['customer_info_time'].unique())

In [98]:
from gensim.models import word2vec
w2v_input = oversample(train_data, 10)
w2v = word2vec.Word2Vec(sentences = w2v_input, vector_size = 100, window = 5, min_count = 1, sg = 1)

In [99]:
train_mean_vector1 = []
for words in tqdm(train_data):
    tmp = np.zeros(100)
    cnt = 0
    for word in words:
        try:
            tmp += w2v[word]
            cnt += 1
        except:
            pass
    tmp /= cnt
    train_mean_vector1.append(tmp)
train_mean_vector1 = np.array(train_mean_vector1)

  tmp /= cnt
100%|█████████████████████████████████████████████████████████████████████████| 21587/21587 [00:00<00:00, 60800.36it/s]


In [100]:
test_mean_vector1 = []
for words in tqdm(test_data):
    tmp = np.zeros(100)
    cnt = 0
    for word in words:
        try:
            tmp += w2v[word]
            cnt += 1
        except:
            pass
    tmp /= cnt
    test_mean_vector1.append(tmp)
test_mean_vector1 = np.array(test_mean_vector1)

  tmp /= cnt
100%|█████████████████████████████████████████████████████████████████████████| 14380/14380 [00:00<00:00, 68669.92it/s]


In [101]:
data = pd.DataFrame({'custid': train.custid.unique()})

for f in features :
    data = pd.merge(data, f, how='left', on='custid')
    
data = data.fillna(0)

In [102]:
data_te = pd.DataFrame({'custid': test.custid.unique()})
for f in features_te :
    data_te = pd.merge(data_te, f, how='left', on='custid')
    
data_te = data_te.fillna(0)

In [103]:
data = data.replace([np.inf, -np.inf], 0)

data_te = data_te.replace([np.inf, -np.inf], 0)

In [104]:
numeric_columns = data.dtypes[data.dtypes != 'object'].index.tolist()

In [105]:
# non-skew에만 처리를 하기 위해서 numeric_columns에서 skew와 non-skew로 구분
skewlist = []
numeric_columns_ns = []

for i in numeric_columns:
    if 'SKEW' in i:
        skewlist.append(i)
    else:
        numeric_columns_ns.append(i)

In [106]:
# non-skew standard scaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
data[numeric_columns] = scaler.fit_transform(data[numeric_columns])
data_te[numeric_columns] = scaler.transform(data_te[numeric_columns])

In [107]:
train_mean_vector = pd.DataFrame(train_mean_vector)
data = pd.concat([data[numeric_columns_ns], data[skewlist], train_mean_vector], axis=1)

test_mean_vector = pd.DataFrame(test_mean_vector)
test_mean_vector.index = data_te.index
data_te = pd.concat([data_te[numeric_columns_ns], data_te[skewlist], test_mean_vector], axis=1)

In [108]:
train_mean_vector1 = pd.DataFrame(train_mean_vector1)
train_mean_vector1.columns = train_mean_vector1.columns.astype(str) + "_time"
data = pd.concat([data, train_mean_vector1], axis=1)

test_mean_vector1 = pd.DataFrame(test_mean_vector1)
test_mean_vector1.index = data_te.index
test_mean_vector1.columns = test_mean_vector1.columns.astype(str) + "_time"
data_te = pd.concat([data_te, test_mean_vector1], axis=1)

In [109]:
y_train = pd.read_csv(os.path.abspath("../input")+'/y_train.csv', encoding='cp949')
data.shape, data_te.shape, y_train.shape

((21587, 13835), (14380, 13835), (21587, 2))

In [110]:
data.columns = data.columns.astype(str)
data_te.columns = data_te.columns.astype(str)

In [112]:
import os
data.to_csv('1등피쳐_train.csv')
data_te.to_csv('1등피쳐_test.csv')