In [90]:
# Data Wrangling
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

# Visualization
import matplotlib.pylab as plt
import seaborn as sns
%matplotlib inline

# Preprocessing & Feature Engineering
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold

# Modeling
from catboost import CatBoostClassifier

# Evaluation
from sklearn.model_selection import cross_val_score

# Utility
import os
import time
import random
import warnings; warnings.filterwarnings("ignore")
from tqdm import tqdm
from typing import List ,Dict, Tuple
import platform
import sys
import sklearn

# 한글 폰트 설정
from statsmodels import robust
from matplotlib import font_manager, rc
%matplotlib inline

In [91]:
print(f"- os: {platform.platform()}")
print(f"- python: {sys.version}")
print(f"- pandas: {pd.__version__}")
print(f"- numpy: {np.__version__}")
print(f"- sklearn: {sklearn.__version__}")

- os: Windows-10-10.0.19041-SP0
- python: 3.8.5 (default, Sep  3 2020, 21:29:08) [MSC v.1916 64 bit (AMD64)]
- pandas: 1.3.4
- numpy: 1.22.1
- sklearn: 1.0.1


In [92]:
DATA_PATH = "data/"
SUBMIT_PATH = "data/submission/"
SEED = 42

In [136]:
train = pd.read_csv(f'{DATA_PATH}train.csv')
test = pd.read_csv(f'{DATA_PATH}test.csv')

d_code = pd.read_csv(f'{DATA_PATH}속성_D_코드.csv')
h_code = pd.read_csv(f'{DATA_PATH}속성_H_코드.csv')
l_code = pd.read_csv(f'{DATA_PATH}속성_L_코드.csv')

train.shape, test.shape

((501951, 35), (46404, 34))

## preprocessing & engineering

### person_rn, contents_rn 활용

In [137]:
train['person_contents_mul'] = train['person_rn'] * train['contents_rn']
test['person_contents_mul'] = test['person_rn'] * test['contents_rn']

In [138]:
train['person_contents_sum'] = train['person_rn'] + train['contents_rn']
test['person_contents_sum'] = test['person_rn'] + test['contents_rn']

### contents_open_dt 관련
target encoding

In [139]:
train['contents_open_dt'] = pd.to_datetime(train['contents_open_dt'])
test['contents_open_dt'] = pd.to_datetime(test['contents_open_dt'])

In [140]:
train['contents_open_hour'] = train['contents_open_dt'].dt.hour
test['contents_open_hour'] = test['contents_open_dt'].dt.hour

In [141]:
train_hour = train.groupby('contents_open_hour').target.sum() / train.groupby('contents_open_hour').target.size()
train['contents_open_hour'] = train['contents_open_hour'].apply(lambda x: train_hour[x])
test['contents_open_hour'] = test['contents_open_hour'].apply(lambda x: train_hour[x])

### 전처리
- 코드표 결합
- 같은 범주 일치 확인

In [142]:
d_code.columns = ['attribute_d', 'attribute_d_d', 'attribute_d_s', 'attribute_d_m', 'attribute_d_l']
h_code.columns = ['attribute_h', 'attribute_h_m', 'attribute_h_l']
l_code.columns = ['attribute_l', 'attribute_l_d', 'attribute_l_s', 'attribute_l_m', 'attribute_l_l']

In [143]:
def merge_codes(df : pd.DataFrame, 
                df_code : pd.DataFrame,
                col : str) -> pd.DataFrame:
    df = df.copy()
    df_code = df_code.copy()
    df_code = df_code.add_prefix(f"{col}_")
    df_code.columns.values[0] = col
    return pd.merge(df, df_code, how="left", on=col)

In [144]:
def preprocess_data(
                    df : pd.DataFrame, 
                    is_train : bool = True, 
                    cols_merge : List[Tuple[str, pd.DataFrame]] = [], 
                    cols_equi : List[Tuple[str, str]] = [] ,
                    cols_drop : List[str] = ['id', 'person_prefer_f', 'person_prefer_g', 'contents_open_dt']
                    ) -> Tuple[pd.DataFrame, np.ndarray]:
    df = df.copy()

    y_data = None
    if is_train:
        y_data = df['target'].to_numpy()
        df = df.drop(columns='target')

    for col, df_code in cols_merge:
        df = merge_codes(df, df_code, col)

    cols = df.select_dtypes(bool).columns.tolist()
    df[cols] = df[cols].astype(int)

    for col1, col2 in cols_equi:
        df[f'{col1}_{col2}'] = (df[col1] == df[col2]).astype(int)
        
    df = df.drop(columns=cols_drop)
    return (df, y_data)

In [145]:
# 소분류 중분류 대분류 속성코드 merge 컬럼명 및 데이터 프레임 리스트
cols_merge = [
              ('person_prefer_d_1', d_code),
              ('person_prefer_d_2', d_code),
              ('person_prefer_d_3', d_code),
              ('contents_attribute_d', d_code),
              ('person_prefer_h_1', h_code),
              ('person_prefer_h_2', h_code),
              ('person_prefer_h_3', h_code),
              ('contents_attribute_h', h_code),
              ('contents_attribute_l', l_code),
]

# 회원 속성과 콘텐츠 속성의 동일한 코드 여부에 대한 컬럼명 리스트
cols_equi = [

    ('contents_attribute_c', 'person_prefer_c'),
    ('contents_attribute_e', 'person_prefer_e'),

    ('person_prefer_d_1_attribute_d_s', 'contents_attribute_d_attribute_d_s'),
    ('person_prefer_d_1_attribute_d_m', 'contents_attribute_d_attribute_d_m'),
    ('person_prefer_d_2', 'contents_attribute_d'),
    ('person_prefer_d_2_attribute_d_d', 'contents_attribute_d_attribute_d_d'),
    ('person_prefer_d_2_attribute_d_s', 'contents_attribute_d_attribute_d_s'),
    ('person_prefer_d_2_attribute_d_m', 'contents_attribute_d_attribute_d_m'),
    ('person_prefer_d_2_attribute_d_l', 'contents_attribute_d_attribute_d_l'),
    ('person_prefer_d_3', 'contents_attribute_d'),
    ('person_prefer_d_3_attribute_d_d', 'contents_attribute_d_attribute_d_d'),
    ('person_prefer_d_3_attribute_d_s', 'contents_attribute_d_attribute_d_s'),
    ('person_prefer_d_3_attribute_d_m', 'contents_attribute_d_attribute_d_m'),
    ('person_prefer_d_3_attribute_d_l', 'contents_attribute_d_attribute_d_l'),

    ('person_prefer_h_2', 'contents_attribute_h'),
    ('person_prefer_h_2_attribute_h_m', 'contents_attribute_h_attribute_h_m'),
    ('person_prefer_h_2_attribute_h_l', 'contents_attribute_h_attribute_h_l'),
    ('person_prefer_h_3', 'contents_attribute_h'),
    ('person_prefer_h_3_attribute_h_m', 'contents_attribute_h_attribute_h_m'),
    ('person_prefer_h_3_attribute_h_l', 'contents_attribute_h_attribute_h_l'),

]

# 학습에 필요없는 컬럼 리스트
cols_drop = ['id', 'person_prefer_f', 'person_prefer_g', 'contents_rn', 'contents_open_dt', 'person_rn']

In [146]:
x_train, y_train = preprocess_data(train, cols_merge=cols_merge, cols_equi=cols_equi, cols_drop=cols_drop)
x_test, _ = preprocess_data(test, is_train=False, cols_merge=cols_merge, cols_equi=cols_equi, cols_drop=cols_drop)
x_train.shape, y_train.shape, x_test.shape

((501951, 79), (501951,), (46404, 79))

### 대중소세 코드 값 가지고 비교

In [147]:
# D : 대-중-소-세-d 일치여부 score
x_train['person_D_code1_score'] = (x_train['d_l_match_yn'] + x_train['person_prefer_d_1_attribute_d_m_contents_attribute_d_attribute_d_m'] \
                                + x_train['person_prefer_d_1_attribute_d_s_contents_attribute_d_attribute_d_s'] + x_train['d_m_match_yn'] + x_train['d_s_match_yn'])
x_train['person_D_code2_score'] = (x_train['person_prefer_d_2_attribute_d_l_contents_attribute_d_attribute_d_l'] + x_train['person_prefer_d_2_attribute_d_m_contents_attribute_d_attribute_d_m'] \
                                + x_train['person_prefer_d_2_attribute_d_s_contents_attribute_d_attribute_d_s'] + x_train['person_prefer_d_2_attribute_d_d_contents_attribute_d_attribute_d_d']\
                                + x_train['person_prefer_d_2_contents_attribute_d'])
x_train['person_D_code3_score'] = (x_train['person_prefer_d_3_attribute_d_l_contents_attribute_d_attribute_d_l'] + x_train['person_prefer_d_3_attribute_d_m_contents_attribute_d_attribute_d_m'] \
                                + x_train['person_prefer_d_3_attribute_d_s_contents_attribute_d_attribute_d_s'] + x_train['person_prefer_d_3_attribute_d_d_contents_attribute_d_attribute_d_d']\
                                + x_train['person_prefer_d_3_contents_attribute_d'])

x_test['person_D_code1_score'] =( x_test['d_l_match_yn'] + x_test['person_prefer_d_1_attribute_d_m_contents_attribute_d_attribute_d_m'] \
                                + x_test['person_prefer_d_1_attribute_d_s_contents_attribute_d_attribute_d_s'] + x_test['d_m_match_yn'] + x_test['d_s_match_yn'])
x_test['person_D_code2_score'] = (x_test['person_prefer_d_2_attribute_d_l_contents_attribute_d_attribute_d_l'] + x_test['person_prefer_d_2_attribute_d_m_contents_attribute_d_attribute_d_m'] \
                                + x_test['person_prefer_d_2_attribute_d_s_contents_attribute_d_attribute_d_s'] + x_test['person_prefer_d_2_attribute_d_d_contents_attribute_d_attribute_d_d']\
                                + x_test['person_prefer_d_2_contents_attribute_d'])
x_test['person_D_code3_score'] = (x_test['person_prefer_d_3_attribute_d_l_contents_attribute_d_attribute_d_l'] + x_test['person_prefer_d_3_attribute_d_m_contents_attribute_d_attribute_d_m'] \
                                + x_test['person_prefer_d_3_attribute_d_s_contents_attribute_d_attribute_d_s'] + x_test['person_prefer_d_3_attribute_d_d_contents_attribute_d_attribute_d_d']\
                                + x_test['person_prefer_d_3_contents_attribute_d'])

In [148]:
# H : 대-중-d 일치여부 score
x_train['person_H_code1_score'] = (x_train['h_l_match_yn'] + x_train['h_m_match_yn'] + x_train['h_s_match_yn'])
x_train['person_H_code2_score'] = (x_train['person_prefer_h_2_attribute_h_l_contents_attribute_h_attribute_h_l'] + x_train['person_prefer_h_2_attribute_h_m_contents_attribute_h_attribute_h_m']\
                                 + x_train['person_prefer_h_2_contents_attribute_h'])
x_train['person_H_code3_score'] = (x_train['person_prefer_h_3_attribute_h_l_contents_attribute_h_attribute_h_l'] + x_train['person_prefer_h_3_attribute_h_m_contents_attribute_h_attribute_h_m']\
                                 + x_train['person_prefer_h_3_contents_attribute_h'])

x_test['person_H_code1_score'] = (x_test['h_l_match_yn'] + x_test['h_m_match_yn']  + x_test['h_s_match_yn'])
x_test['person_H_code2_score'] = (x_test['person_prefer_h_2_attribute_h_l_contents_attribute_h_attribute_h_l'] + x_test['person_prefer_h_2_attribute_h_m_contents_attribute_h_attribute_h_m']\
                                 + x_test['person_prefer_h_2_contents_attribute_h'])
x_test['person_H_code3_score'] = (x_test['person_prefer_h_3_attribute_h_l_contents_attribute_h_attribute_h_l'] + x_test['person_prefer_h_3_attribute_h_m_contents_attribute_h_attribute_h_m']\
                                 + x_test['person_prefer_h_3_contents_attribute_h'])

In [149]:
x_train['content_L_code_sum'] = x_train['contents_attribute_l_attribute_l_l'] + x_train['contents_attribute_l_attribute_l_m'] \
                                + x_train['contents_attribute_l_attribute_l_s'] + x_train['contents_attribute_l_attribute_l_d']

x_test['content_L_code_sum'] = x_test['contents_attribute_l_attribute_l_l'] + x_test['contents_attribute_l_attribute_l_m'] \
                               + x_test['contents_attribute_l_attribute_l_s'] + x_test['contents_attribute_l_attribute_l_d']

In [150]:
x_train['D_H_1_mul'] = x_train['person_D_code1_score'] * x_train['person_H_code1_score']
x_train['D_H_2_mul'] = x_train['person_D_code2_score'] * x_train['person_H_code2_score']
x_train['D_H_3_mul'] = x_train['person_D_code3_score'] * x_train['person_H_code3_score']
x_train['D_H_1_sum'] = x_train['person_D_code1_score'] + x_train['person_H_code1_score']
x_train['D_H_2_sum'] = x_train['person_D_code2_score'] + x_train['person_H_code2_score']
x_train['D_H_3_sum'] = x_train['person_D_code3_score'] + x_train['person_H_code3_score']

x_test['D_H_1_mul'] = x_test['person_D_code1_score'] * x_test['person_H_code1_score']
x_test['D_H_2_mul'] = x_test['person_D_code2_score'] * x_test['person_H_code2_score']
x_test['D_H_3_mul'] = x_test['person_D_code3_score'] * x_test['person_H_code3_score']
x_test['D_H_1_sum'] = x_test['person_D_code1_score'] + x_test['person_H_code1_score']
x_test['D_H_2_sum'] = x_test['person_D_code2_score'] + x_test['person_H_code2_score']
x_test['D_H_3_sum'] = x_test['person_D_code3_score'] + x_test['person_H_code3_score']

In [151]:
x_train['DD_12_diff'] = x_train['person_D_code1_score'] - x_train['person_D_code2_score']
x_train['DD_13_diff'] = x_train['person_D_code1_score'] - x_train['person_D_code3_score']
x_train['DD_23_diff'] = x_train['person_D_code2_score'] - x_train['person_D_code3_score']

x_test['DD_12_diff'] = x_test['person_D_code1_score'] - x_test['person_D_code2_score']
x_test['DD_13_diff'] = x_test['person_D_code1_score'] - x_test['person_D_code3_score']
x_test['DD_23_diff'] = x_test['person_D_code2_score'] - x_test['person_D_code3_score']

In [152]:
x_train['HH_12_diff'] = x_train['person_H_code1_score'] - x_train['person_H_code2_score']
x_train['HH_13_diff'] = x_train['person_H_code1_score'] - x_train['person_H_code3_score']
x_train['HH_23_diff'] = x_train['person_H_code2_score'] - x_train['person_H_code3_score']

x_test['HH_12_diff'] = x_test['person_H_code1_score'] - x_test['person_H_code2_score']
x_test['HH_13_diff'] = x_test['person_H_code1_score'] - x_test['person_H_code3_score']
x_test['HH_23_diff'] = x_test['person_H_code2_score'] - x_test['person_H_code3_score']

In [153]:
x_train['DDD_sum'] = x_train['person_D_code1_score'] + x_train['person_D_code2_score'] + x_train['person_D_code3_score']
x_train['HHH_sum'] = x_train['person_H_code1_score'] + x_train['person_H_code2_score'] + x_train['person_H_code3_score']

x_test['DDD_sum'] = x_test['person_D_code1_score'] + x_test['person_D_code2_score'] + x_test['person_D_code3_score']
x_test['HHH_sum'] = x_test['person_H_code1_score'] + x_test['person_H_code2_score'] + x_test['person_H_code3_score']

In [154]:
x_train['person_contents_e_diff'] = x_train['person_prefer_e'] - x_train['contents_attribute_e']
x_test['person_contents_e_diff'] = x_test['person_prefer_e'] - x_test['contents_attribute_e']

In [155]:
x_train['D_E_1_mul'] = x_train['person_D_code1_score'] * x_train['person_contents_e_diff']
x_train['D_E_2_mul'] = x_train['person_D_code2_score'] * x_train['person_contents_e_diff']
x_train['D_E_3_mul'] = x_train['person_D_code3_score'] * x_train['person_contents_e_diff']
x_train['D_E_1_sum'] = x_train['person_D_code1_score'] + x_train['person_contents_e_diff']
x_train['D_E_2_sum'] = x_train['person_D_code2_score'] + x_train['person_contents_e_diff']
x_train['D_E_3_sum'] = x_train['person_D_code3_score'] + x_train['person_contents_e_diff']

x_test['D_E_1_mul'] = x_test['person_D_code1_score'] * x_test['person_contents_e_diff']
x_test['D_E_2_mul'] = x_test['person_D_code2_score'] * x_test['person_contents_e_diff']
x_test['D_E_3_mul'] = x_test['person_D_code3_score'] * x_test['person_contents_e_diff']
x_test['D_E_1_sum'] = x_test['person_D_code1_score'] + x_test['person_contents_e_diff']
x_test['D_E_2_sum'] = x_test['person_D_code2_score'] + x_test['person_contents_e_diff']
x_test['D_E_3_sum'] = x_test['person_D_code3_score'] + x_test['person_contents_e_diff']

In [156]:
x_train['H_E_1_mul'] = x_train['person_H_code1_score'] * x_train['person_contents_e_diff']
x_train['H_E_2_mul'] = x_train['person_H_code2_score'] * x_train['person_contents_e_diff']
x_train['H_E_3_mul'] = x_train['person_H_code3_score'] * x_train['person_contents_e_diff']
x_train['H_E_1_sum'] = x_train['person_H_code1_score'] + x_train['person_contents_e_diff']
x_train['H_E_2_sum'] = x_train['person_H_code2_score'] + x_train['person_contents_e_diff']
x_train['H_E_3_sum'] = x_train['person_H_code3_score'] + x_train['person_contents_e_diff']

x_test['H_E_1_mul'] = x_test['person_H_code1_score'] * x_test['person_contents_e_diff']
x_test['H_E_2_mul'] = x_test['person_H_code2_score'] * x_test['person_contents_e_diff']
x_test['H_E_3_mul'] = x_test['person_H_code3_score'] * x_test['person_contents_e_diff']
x_test['H_E_1_sum'] = x_test['person_H_code1_score'] + x_test['person_contents_e_diff']
x_test['H_E_2_sum'] = x_test['person_H_code2_score'] + x_test['person_contents_e_diff']
x_test['H_E_3_sum'] = x_test['person_H_code3_score'] + x_test['person_contents_e_diff']

In [157]:
x_train['L_E_mul'] = x_train['content_L_code_sum'] * x_train['person_contents_e_diff']
x_train['L_E_sum'] = x_train['content_L_code_sum'] + x_train['person_contents_e_diff']

x_test['L_E_mul'] = x_test['content_L_code_sum'] * x_test['person_contents_e_diff']
x_test['L_E_sum'] = x_test['content_L_code_sum'] + x_test['person_contents_e_diff']

### J assemble

In [158]:
data = pd.concat([x_train, train.target],axis=1)

In [159]:
a= pd.pivot_table(data, values='target',index='contents_attribute_j',
    columns='contents_attribute_j_1',
    aggfunc='sum', fill_value=0)

In [160]:
b= pd.pivot_table(data, values='target',index='contents_attribute_j',
    columns='contents_attribute_j_1',
    aggfunc='count', fill_value=0)

In [161]:
a/b

contents_attribute_j_1,1,2,3,4,5,6,8,9,10
contents_attribute_j,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0.615385,0.093927,0.32543,0.530409,0.527519,,,,
2,,,,,,0.487139,0.437956,0.431964,0.472427


In [162]:
x_train['j_assemble']=x_train['contents_attribute_j'] * x_train['contents_attribute_j_1']
x_test['j_assemble']=x_test['contents_attribute_j'] * x_test['contents_attribute_j_1']

### A assemble

In [163]:
a= pd.pivot_table(data, values='target',index='person_attribute_a',
    columns='person_attribute_a_1',
    aggfunc='sum', fill_value=0)

In [164]:
b= pd.pivot_table(data, values='target',index='person_attribute_a',
    columns='person_attribute_a_1',
    aggfunc='count', fill_value=0)

In [165]:
a/b

person_attribute_a_1,0,1,2,3,4,5,6,7
person_attribute_a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0.519501,0.499953,0.498492,0.488989,0.485181,0.497021,0.516074,0.567246
2,0.491229,0.481594,0.474026,0.45526,0.454665,0.482408,0.482432,0.504021


In [166]:
x_train['a_assemble']=x_train['person_attribute_a'].astype(str) + x_train['person_attribute_a_1'].astype(str) + '_' + x_train['contents_attribute_a'].astype(str)
x_test['a_assemble']=x_test['person_attribute_a'].astype(str) + x_test['person_attribute_a_1'].astype(str) +'_' + x_test['contents_attribute_a'].astype(str)

### D assemble

- contents_d 와 person_d의 관계를 보기 위해 pivot_table을 이용하여 target encoding을 만듦

D_1_L assemble

In [36]:
a= pd.pivot_table(data, values='target',index='person_prefer_d_1_attribute_d_l',
    columns='contents_attribute_d_attribute_d_l',
    aggfunc='sum', fill_value=0)

In [37]:
b= pd.pivot_table(data, values='target',index='person_prefer_d_1_attribute_d_l',
    columns='contents_attribute_d_attribute_d_l',
    aggfunc='count', fill_value=0)

In [38]:
ab=(a/b).fillna(0)
ab

contents_attribute_d_attribute_d_l,1,216,377,482,522,618,744,864,926,1235,1258
person_prefer_d_1_attribute_d_l,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.577416,0.348087,0.292436,0.275927,0.326954,0.283027,0.431737,0.313196,0.372367,0.170455,0.25
216,0.5249,0.598649,0.259259,0.321429,0.505848,0.379648,0.394518,0.465668,0.472571,0.27381,0.0
377,0.501129,0.286765,0.505151,0.307004,0.415954,0.330265,0.383857,0.246269,0.347168,0.089286,0.0
482,0.48742,0.291429,0.293035,0.625022,0.239437,0.323583,0.342105,0.196429,0.332207,0.172414,0.0
522,0.560093,0.493617,0.33829,0.255814,0.660348,0.415686,0.556684,0.430233,0.476839,0.333333,0.0
618,0.498507,0.362729,0.315197,0.349638,0.319444,0.579576,0.470266,0.440068,0.467452,0.373239,0.0
744,0.527941,0.353226,0.24196,0.292419,0.364179,0.431117,0.5602,0.43191,0.444589,0.393782,0.0
864,0.476625,0.487385,0.277108,0.263158,0.571429,0.408696,0.517885,0.589229,0.501591,0.38806,0.0
926,0.420474,0.396458,0.183044,0.215447,0.247984,0.345951,0.406093,0.414907,0.502301,0.280255,0.0
1235,0.537815,0.582278,0.307692,0.0,0.285714,0.60396,0.520833,0.611111,0.507317,0.564444,0.0


In [39]:
d_1_l_target = []
for i in range(11):
    d_1_l_target.append(list(ab.iloc[i,:] + ab.iloc[:,i]))
d_1_l_target= pd.DataFrame(d_1_l_target, columns=list(ab.columns))
d_1_l_target.index = list(ab.columns)

In [40]:
x_train['d_1_l_target']=0
x_test['d_1_l_target']=0

D_2_L assemble

In [41]:
a= pd.pivot_table(data, values='target',index='person_prefer_d_2_attribute_d_l',
    columns='contents_attribute_d_attribute_d_l',
    aggfunc='sum', fill_value=0)

In [42]:
b= pd.pivot_table(data, values='target',index='person_prefer_d_2_attribute_d_l',
    columns='contents_attribute_d_attribute_d_l',
    aggfunc='count', fill_value=0)

In [43]:
ab=(a/b).fillna(0)
ab

contents_attribute_d_attribute_d_l,1,216,377,482,522,618,744,864,926,1235,1258
person_prefer_d_2_attribute_d_l,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.576027,0.396947,0.332855,0.372775,0.34847,0.279459,0.450208,0.297393,0.373357,0.180828,0.333333
216,0.534165,0.610883,0.248031,0.32,0.579869,0.396226,0.403592,0.485549,0.472514,0.229167,0.0
377,0.519663,0.291284,0.491415,0.578765,0.373377,0.296623,0.384468,0.245614,0.332158,0.160714,0.0
482,0.493294,0.330769,0.47032,0.587237,0.297872,0.340094,0.385852,0.153846,0.364078,0.222222,0.0
522,0.563205,0.447115,0.39313,0.392157,0.671146,0.453744,0.558659,0.5,0.456716,0.333333,0.0
618,0.505607,0.360759,0.350323,0.421795,0.330677,0.568209,0.479619,0.501027,0.47754,0.331288,0.0
744,0.518487,0.356498,0.282209,0.352239,0.377309,0.460793,0.548083,0.512167,0.447857,0.38785,0.0
864,0.514042,0.492147,0.327869,0.225806,0.588235,0.51306,0.505654,0.581687,0.518349,0.436364,0.0
926,0.450721,0.42648,0.212121,0.246454,0.256881,0.359005,0.416403,0.422455,0.501685,0.281188,0.0
1235,0.487179,0.579439,0.275862,0.0,0.333333,0.565217,0.495726,0.530612,0.438298,0.592593,0.0


In [176]:
d_2_l_target = []
for i in range(11):
    d_2_l_target.append(list(ab.iloc[i,:] + ab.iloc[:,i]))
d_2_l_target= pd.DataFrame(d_2_l_target, columns=list(ab.columns))
d_2_l_target.index = list(ab.columns)

In [177]:
x_train['d_2_l_target']=0
x_test['d_2_l_target']=0

D_3_L assemble

In [178]:
a= pd.pivot_table(data, values='target',index='person_prefer_d_3_attribute_d_l',
    columns='contents_attribute_d_attribute_d_l',
    aggfunc='sum', fill_value=0)

In [179]:
b= pd.pivot_table(data, values='target',index='person_prefer_d_3_attribute_d_l',
    columns='contents_attribute_d_attribute_d_l',
    aggfunc='count', fill_value=0)

In [180]:
ab=(a/b).fillna(0)
ab

contents_attribute_d_attribute_d_l,1,216,377,482,522,618,744,864,926,1235,1258
person_prefer_d_3_attribute_d_l,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.57355,0.448115,0.408043,0.500075,0.329302,0.301267,0.436716,0.330151,0.371168,0.177515,0.333333
216,0.540398,0.580759,0.311321,0.357143,0.618221,0.353846,0.45098,0.455128,0.491622,0.214286,0.0
377,0.534091,0.351351,0.48716,0.421739,0.531599,0.316505,0.420891,0.263736,0.380074,0.25,0.0
482,0.569832,0.382353,0.34957,0.603416,0.365854,0.439141,0.394495,0.321429,0.409884,0.142857,0.0
522,0.553817,0.477089,0.357143,0.25641,0.658374,0.414747,0.563327,0.404255,0.463566,0.333333,0.0
618,0.504097,0.372781,0.338006,0.359055,0.318408,0.560235,0.501558,0.485666,0.485141,0.371069,0.0
744,0.50182,0.348794,0.272066,0.348837,0.369021,0.500905,0.537848,0.446735,0.480244,0.341137,0.0
864,0.478873,0.60719,0.375,0.176471,0.702703,0.46472,0.52057,0.58593,0.515181,0.375,0.0
926,0.460313,0.437612,0.202077,0.255814,0.256513,0.357879,0.48025,0.440427,0.498098,0.301887,0.0
1235,0.514706,0.633333,0.346154,0.333333,0.307692,0.59,0.544643,0.461538,0.5,0.570652,0.0


In [181]:
d_3_l_target = []
for i in range(11):
    d_3_l_target.append(list(ab.iloc[i,:] + ab.iloc[:,i]))
d_3_l_target= pd.DataFrame(d_3_l_target, columns=list(ab.columns))
d_3_l_target.index = list(ab.columns)

In [183]:
x_train['d_3_l_target']=0
x_test['d_3_l_target']=0

In [184]:
for i in tqdm(range(x_train.shape[0])):
    x_train.loc[i,'d_1_l_target'] = d_1_l_target.loc[x_train.person_prefer_d_1_attribute_d_l[i],x_train.contents_attribute_d_attribute_d_l[i]]
    x_train.loc[i,'d_2_l_target'] = d_2_l_target.loc[x_train.person_prefer_d_2_attribute_d_l[i],x_train.contents_attribute_d_attribute_d_l[i]]
    x_train.loc[i,'d_3_l_target'] = d_3_l_target.loc[x_train.person_prefer_d_3_attribute_d_l[i],x_train.contents_attribute_d_attribute_d_l[i]]

100%|█████████████████████████████████████████████████████████████████████████| 501951/501951 [49:28<00:00, 169.08it/s]


In [185]:
for i in tqdm(range(x_test.shape[0])):
    x_test.loc[i,'d_1_l_target'] = d_1_l_target.loc[x_test.person_prefer_d_1_attribute_d_l[i],x_test.contents_attribute_d_attribute_d_l[i]]
    x_test.loc[i,'d_2_l_target'] = d_2_l_target.loc[x_test.person_prefer_d_2_attribute_d_l[i],x_test.contents_attribute_d_attribute_d_l[i]]
    x_test.loc[i,'d_3_l_target'] = d_3_l_target.loc[x_test.person_prefer_d_3_attribute_d_l[i],x_test.contents_attribute_d_attribute_d_l[i]]

100%|██████████████████████████████████████████████████████████████████████████| 46404/46404 [00:32<00:00, 1428.47it/s]


---

In [168]:
# 제곱했을 때 성능이 좋아 사용함
x_train['d_1_l_target'] = x_train['d_1_l_target'] **2
x_train['d_2_l_target'] = x_train['d_2_l_target'] **2
x_train['d_3_l_target'] = x_train['d_3_l_target'] **2

x_test['d_1_l_target'] = x_test['d_1_l_target'] **2
x_test['d_2_l_target'] = x_test['d_2_l_target'] **2
x_test['d_3_l_target'] = x_test['d_3_l_target'] **2

In [171]:
x_train.shape, x_test.shape

((501951, 120), (46404, 120))

In [172]:
x_train

Unnamed: 0,d_l_match_yn,d_m_match_yn,d_s_match_yn,h_l_match_yn,h_m_match_yn,h_s_match_yn,person_attribute_a,person_attribute_a_1,person_attribute_b,person_prefer_c,...,H_E_1_sum,H_E_2_sum,H_E_3_sum,L_E_mul,L_E_sum,j_assemble,a_assemble,d_1_l_target,d_2_l_target,d_3_l_target
0,1,1,1,0,0,0,1,4,3,5,...,4,5,4,27336,6838,20,14_3,1.433521,1.492711,1.349122
1,0,0,0,1,1,0,1,3,4,1,...,2,1,1,0,6834,5,13_3,0.762106,0.866970,0.977157
2,0,0,0,1,0,0,2,0,3,5,...,0,-1,0,-6782,6781,20,20_1,0.629744,1.327229,0.887617
3,0,0,0,1,0,0,2,0,2,5,...,1,0,0,0,6834,5,20_3,0.551123,0.573028,0.527987
4,1,1,1,0,0,0,1,3,4,5,...,0,0,0,0,6834,20,13_1,1.433521,1.492711,1.349122
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501946,0,0,0,1,0,0,1,1,2,2,...,-1,-2,-2,-6102,3049,5,11_3,0.628597,0.679103,0.691360
501947,1,1,0,1,0,0,1,6,2,1,...,1,3,1,0,2398,20,16_3,1.333635,1.327229,1.315839
501948,1,1,1,1,0,0,1,7,4,1,...,-2,-3,-3,-9801,3264,5,17_1,1.333635,1.327229,1.315839
501949,1,0,0,1,0,0,1,1,2,1,...,1,0,1,0,3968,5,11_2,1.333635,1.327229,1.315839


---

## person 과 contents의 관계

person과 contents의 관계를 보기 위해서는 train-data에 있는 person의 종류와 test-data에 있는 종류를 포함하고 있어야 한다. \
person_rn의 unique값은 포함하지 않는 갯수가 많기 때문에 다른 유의미한 피처로 피쳐를 만들어 주겠다.\
**test-data의 값이 train-data에 포함만 한다면 nunique값이 크더라도 문제가 되지 않는다. 그래서 최대한 세밀하게 나눌 것**

#### # nan값을 해결하기 위해서 다른 조합으로 person과 contents 를 구성한 뒤 nan값이 있는 행의 위치 파악 후 대체하는 방식으로 진행
- nan을 모두 해결하기 위해 4번 반복하여 진행

#### person

In [173]:
len(set(test.person_rn.unique())-set(train.person_rn.unique()))
# 포함하지 않는 person의 종류가 너무 많다. 
# 군집을 활용해서 person_num을 따로 만들어 줌.

26386

In [174]:
# d_l + h_l
x_train['person_num']=x_train.person_prefer_d_1_attribute_d_l.astype(str) + '-' + x_train.person_prefer_h_1_attribute_h_l.astype(str)

x_test['person_num']=x_test.person_prefer_d_1_attribute_d_l.astype(str) + '-' + x_test.person_prefer_h_1_attribute_h_l.astype(str)

In [175]:
set(x_test.person_num.unique()) - set(x_train.person_num.unique())

set()

In [176]:
x_train.person_num.nunique(),x_test.person_num.nunique()

(190, 183)

#### contents

In [177]:
len(set(test.contents_rn.unique())-set(train.contents_rn.unique()))
# 포함하지 않는 contents의 종류가 너무 많다. 
# 군집을 활용해서 contents_num을 따로 만들어 줌.
# test의 종류에 포함하는 가장 많은 unique를 가진 조합을 일일이 찾았다.

27826

In [178]:
# d_l + h_l
x_train['contents_num']=x_train.contents_attribute_d_attribute_d_l.astype(str) + '-' + x_train.contents_attribute_h_attribute_h_l.astype(str)

x_test['contents_num']=x_test.contents_attribute_d_attribute_d_l.astype(str) + '-' + x_test.contents_attribute_h_attribute_h_l.astype(str)

In [179]:
set(x_test.contents_num.unique()) - set(x_train.contents_num.unique())

set()

In [180]:
x_train.contents_num.nunique(),x_test.contents_num.nunique()

(171, 169)

#### target값으로 rate를 구함
- target encoding을 사용하려 했으나 과적합으로 인식되어 제거

In [181]:
x_train = pd.concat([x_train,train.target],axis=1)

In [182]:
train_rating = x_train.groupby(['person_num','contents_num'])['target'].agg([
                                            ('구매건수', np.size)]).reset_index()

In [183]:
x_train = pd.merge(x_train,train_rating,how='left',on=['person_num','contents_num'])
x_test = pd.merge(x_test,train_rating,how='left',on=['person_num','contents_num'])

In [184]:
x_test.iloc[:,-3:].info()
# 350개의 nan값

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46404 entries, 0 to 46403
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   person_num    46404 non-null  object 
 1   contents_num  46404 non-null  object 
 2   구매건수          46054 non-null  float64
dtypes: float64(1), object(2)
memory usage: 1.4+ MB


#### person_2

In [185]:
# d_l + e
x_train['person_num_2']=x_train.person_prefer_d_1_attribute_d_l.astype(str)  + '-' + x_train.person_prefer_e.astype(str) 

x_test['person_num_2']=x_test.person_prefer_d_1_attribute_d_l.astype(str)  + '-' + x_test.person_prefer_e.astype(str) 

In [186]:
set(x_test.person_num_2.unique()) - set(x_train.person_num_2.unique())

set()

In [187]:
x_train.person_num_2.nunique(),x_test.person_num_2.nunique()

(120, 109)

#### contents_2

In [188]:
# d_l + e
x_train['contents_num_2']=x_train.contents_attribute_d_attribute_d_l.astype(str) + '-' + x_train.contents_attribute_e.astype(str)

x_test['contents_num_2']=x_test.contents_attribute_d_attribute_d_l.astype(str) + '-' + x_test.contents_attribute_e.astype(str)

In [189]:
set(x_test.contents_num_2.unique()) - set(x_train.contents_num_2.unique())

{'618-10'}

In [190]:
x_train.contents_num_2.nunique(),x_test.contents_num_2.nunique()

(116, 107)

#### target값으로 rate를 구함

In [191]:
train_rating_2 = x_train.groupby(['person_num_2','contents_num_2'])['target'].agg([
                                            ('구매건수_2', np.size)]).reset_index()

In [192]:
train_rating_2.head()

Unnamed: 0,person_num_2,contents_num_2,구매건수_2
0,1-0,1-0,20
1,1-0,1-1,159
2,1-0,1-10,20
3,1-0,1-11,23
4,1-0,1-2,337


In [193]:
# 구매건수가 안맞을 수 있기 때문에 나누기 2 정도 해줌
train_rating_2.구매건수_2 = (train_rating_2.구매건수_2 /2).astype(int)

In [194]:
x_train = pd.merge(x_train,train_rating_2,how='left',on=['person_num_2','contents_num_2'])
x_test = pd.merge(x_test,train_rating_2,how='left',on=['person_num_2','contents_num_2'])

In [195]:
a = x_test[x_test.구매건수.isnull()].index
x_test.loc[a, '구매건수'] = x_test.loc[a,'구매건수_2']

In [196]:
x_test.iloc[:,-6:].info()
# 20개의 nan값

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46404 entries, 0 to 46403
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   person_num      46404 non-null  object 
 1   contents_num    46404 non-null  object 
 2   구매건수            46383 non-null  float64
 3   person_num_2    46404 non-null  object 
 4   contents_num_2  46404 non-null  object 
 5   구매건수_2          46244 non-null  float64
dtypes: float64(2), object(4)
memory usage: 3.5+ MB


#### person_3

In [197]:
# h_l + e
x_train['person_num_3']=x_train.person_prefer_h_1_attribute_h_l.astype(str)  + '-' + x_train.person_prefer_e.astype(str) 

x_test['person_num_3']=x_test.person_prefer_h_1_attribute_h_l.astype(str)  + '-' + x_test.person_prefer_e.astype(str) 

In [198]:
set(x_test.person_num_3.unique()) - set(x_train.person_num_3.unique())

set()

In [199]:
x_train.person_num_3.nunique(),x_test.person_num_3.nunique()

(224, 206)

#### contents_3

In [200]:
# h_l + e
x_train['contents_num_3']=x_train.contents_attribute_h_attribute_h_l.astype(str) + '-' + x_train.contents_attribute_e.astype(str)

x_test['contents_num_3']=x_test.contents_attribute_h_attribute_h_l.astype(str) + '-' + x_test.contents_attribute_e.astype(str)

In [201]:
set(x_test.contents_num_3.unique()) - set(x_train.contents_num_3.unique())

set()

In [202]:
x_train.contents_num_3.nunique(),x_test.contents_num_3.nunique()

(203, 190)

#### target값으로 rate를 구함

In [203]:
train_rating_3 = x_train.groupby(['person_num_3','contents_num_3'])['target'].agg([
                                            ('구매건수_3', np.size)]).reset_index()

In [204]:
train_rating_3.head()

Unnamed: 0,person_num_3,contents_num_3,구매건수_3
0,1-0,149-1,1
1,1-0,149-10,1
2,1-0,149-3,20
3,1-0,149-4,38
4,1-0,149-5,31


In [205]:
x_train = pd.merge(x_train,train_rating_3,how='left',on=['person_num_3','contents_num_3'])
x_test = pd.merge(x_test,train_rating_3,how='left',on=['person_num_3','contents_num_3'])

In [206]:
a = x_test[x_test.구매건수.isnull()].index
x_test.loc[a, '구매건수'] = x_test.loc[a,'구매건수_3']

In [207]:
x_test.iloc[:,-9:].info()
# 13개 nan값

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46404 entries, 0 to 46403
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   person_num      46404 non-null  object 
 1   contents_num    46404 non-null  object 
 2   구매건수            46391 non-null  float64
 3   person_num_2    46404 non-null  object 
 4   contents_num_2  46404 non-null  object 
 5   구매건수_2          46244 non-null  float64
 6   person_num_3    46404 non-null  object 
 7   contents_num_3  46404 non-null  object 
 8   구매건수_3          46039 non-null  float64
dtypes: float64(3), object(6)
memory usage: 4.5+ MB


#### person_4

In [208]:
# h_l + c
x_train['person_num_4']=x_train.person_prefer_h_1_attribute_h_l.astype(str)  + '-' + x_train.person_prefer_c.astype(str) 

x_test['person_num_4']=x_test.person_prefer_h_1_attribute_h_l.astype(str)  + '-' + x_test.person_prefer_c.astype(str) 

In [209]:
set(x_test.person_num_4.unique()) - set(x_train.person_num_4.unique())

set()

In [210]:
x_train.person_num_4.nunique(),x_test.person_num_4.nunique()

(95, 93)

#### contents_4

In [211]:
# h_l + c
x_train['contents_num_4']=x_train.contents_attribute_h_attribute_h_l.astype(str) + '-' + x_train.contents_attribute_c.astype(str)

x_test['contents_num_4']=x_test.contents_attribute_h_attribute_h_l.astype(str) + '-' + x_test.contents_attribute_c.astype(str)

In [212]:
set(x_test.contents_num_4.unique()) - set(x_train.contents_num_4.unique())

set()

In [213]:
x_train.contents_num_4.nunique(),x_test.contents_num_4.nunique()

(68, 67)

#### target값으로 rate를 구함

In [214]:
train_rating_4 = x_train.groupby(['person_num_4','contents_num_4'])['target'].agg([
                                            ('구매건수_4', np.size)]).reset_index()

In [215]:
train_rating_4.head()

Unnamed: 0,person_num_4,contents_num_4,구매건수_4
0,1-1,149-1,224
1,1-1,149-2,4
2,1-1,149-3,64
3,1-1,149-4,2
4,1-1,169-1,571


In [216]:
# 구매건수가 안맞을 수 있기 때문에 나누기 3 정도 해줌
train_rating_4.구매건수_4 = (train_rating_4.구매건수_4 /3).astype(int)

In [217]:
x_train = pd.merge(x_train,train_rating_4,how='left',on=['person_num_4','contents_num_4'])
x_test = pd.merge(x_test,train_rating_4,how='left',on=['person_num_4','contents_num_4'])

In [218]:
a = x_test[x_test.구매건수.isnull()].index
x_test.loc[a, '구매건수'] = x_test.loc[a,'구매건수_4']

In [219]:
x_test.iloc[:,-12:].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46404 entries, 0 to 46403
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   person_num      46404 non-null  object 
 1   contents_num    46404 non-null  object 
 2   구매건수            46402 non-null  float64
 3   person_num_2    46404 non-null  object 
 4   contents_num_2  46404 non-null  object 
 5   구매건수_2          46244 non-null  float64
 6   person_num_3    46404 non-null  object 
 7   contents_num_3  46404 non-null  object 
 8   구매건수_3          46039 non-null  float64
 9   person_num_4    46404 non-null  object 
 10  contents_num_4  46404 non-null  object 
 11  구매건수_4          46330 non-null  float64
dtypes: float64(4), object(8)
memory usage: 5.6+ MB


---

In [220]:
#2개의 nan값은 그냥 mean으로 채워줌
x_test.구매건수.fillna(x_train.구매건수.mean(),inplace=True)

In [225]:
drop_train =['person_num', 'contents_num', 'target', 'person_num_2',
       'contents_num_2', '구매건수_2', 'person_num_3', 'contents_num_3', '구매건수_3',
       'person_num_4', 'contents_num_4', '구매건수_4']
drop_test = ['person_num', 'contents_num', '구매건수', 'person_num_2',
       'contents_num_2', '구매건수_2', 'person_num_3', 'contents_num_3', '구매건수_3',
       'person_num_4', 'contents_num_4', '구매건수_4']

In [226]:
x_train.drop(drop_train,axis=1,inplace=True)
x_test.drop(drop_test,axis=1,inplace=True)

In [227]:
x_train.shape, x_test.shape

((501951, 121), (46404, 120))

### 범주형 칼럼 리스트

In [92]:
num_features = ['content_L_code_sum','L_E_mul', 'L_E_sum','구매건수']

In [93]:
target_features =['d_1_l_target','d_2_l_target','d_3_l_target']

In [94]:
cat_features = x_train.columns[x_train.nunique() > 2].tolist()
cat_features = list(set(cat_features) - set(num_features) - set(target_features))

### 가우스-랭크 스케일링

In [95]:
import numpy as np
from joblib import Parallel, delayed
from scipy.interpolate import interp1d
from scipy.special import erf, erfinv
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted

class GaussRankScaler(BaseEstimator, TransformerMixin):
    """Transform features by scaling each feature to a normal distribution.
    Parameters
        ----------
        epsilon : float, optional, default 1e-4
            A small amount added to the lower bound or subtracted
            from the upper bound. This value prevents infinite number
            from occurring when applying the inverse error function.
        copy : boolean, optional, default True
            If False, try to avoid a copy and do inplace scaling instead.
            This is not guaranteed to always work inplace; e.g. if the data is
            not a NumPy array, a copy may still be returned.
        n_jobs : int or None, optional, default None
            Number of jobs to run in parallel.
            ``None`` means 1 and ``-1`` means using all processors.
        interp_kind : str or int, optional, default 'linear'
           Specifies the kind of interpolation as a string
            ('linear', 'nearest', 'zero', 'slinear', 'quadratic', 'cubic',
            'previous', 'next', where 'zero', 'slinear', 'quadratic' and 'cubic'
            refer to a spline interpolation of zeroth, first, second or third
            order; 'previous' and 'next' simply return the previous or next value
            of the point) or as an integer specifying the order of the spline
            interpolator to use.
        interp_copy : bool, optional, default False
            If True, the interpolation function makes internal copies of x and y.
            If False, references to `x` and `y` are used.
        Attributes
        ----------
        interp_func_ : list
            The interpolation function for each feature in the training set.
        """

    def __init__(self, epsilon=1e-4, copy=True, n_jobs=None, interp_kind='linear', interp_copy=False):
        self.epsilon = epsilon
        self.copy = copy
        self.interp_kind = interp_kind
        self.interp_copy = interp_copy
        self.fill_value = 'extrapolate'
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        """Fit interpolation function to link rank with original data for future scaling
        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The data used to fit interpolation function for later scaling along the features axis.
        y
            Ignored
        """
        X = check_array(X, copy=self.copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite=True)

        self.interp_func_ = Parallel(n_jobs=self.n_jobs)(delayed(self._fit)(x) for x in X.T)
        return self

    def _fit(self, x):
        x = self.drop_duplicates(x)
        rank = np.argsort(np.argsort(x))
        bound = 1.0 - self.epsilon
        factor = np.max(rank) / 2.0 * bound
        scaled_rank = np.clip(rank / factor - bound, -bound, bound)
        return interp1d(
            x, scaled_rank, kind=self.interp_kind, copy=self.interp_copy, fill_value=self.fill_value)

    def transform(self, X, copy=None):
        """Scale the data with the Gauss Rank algorithm
        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The data used to scale along the features axis.
        copy : bool, optional (default: None)
            Copy the input X or not.
        """
        check_is_fitted(self, 'interp_func_')

        copy = copy if copy is not None else self.copy
        X = check_array(X, copy=copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite=True)

        X = np.array(Parallel(n_jobs=self.n_jobs)(delayed(self._transform)(i, x) for i, x in enumerate(X.T))).T
        return X

    def _transform(self, i, x):
        return erfinv(self.interp_func_[i](x))

    def inverse_transform(self, X, copy=None):
        """Scale back the data to the original representation
        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data used to scale along the features axis.
        copy : bool, optional (default: None)
            Copy the input X or not.
        """
        check_is_fitted(self, 'interp_func_')

        copy = copy if copy is not None else self.copy
        X = check_array(X, copy=copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite=True)

        X = np.array(Parallel(n_jobs=self.n_jobs)(delayed(self._inverse_transform)(i, x) for i, x in enumerate(X.T))).T
        return X

    def _inverse_transform(self, i, x):
        inv_interp_func = interp1d(self.interp_func_[i].y, self.interp_func_[i].x, kind=self.interp_kind,
                                   copy=self.interp_copy, fill_value=self.fill_value)
        return inv_interp_func(erf(x))

    @staticmethod
    def drop_duplicates(x):
        is_unique = np.zeros_like(x, dtype=bool)
        is_unique[np.unique(x, return_index=True)[1]] = True
        return x[is_unique]

In [96]:
scaler = GaussRankScaler()
x_train[num_features] = pd.DataFrame(scaler.fit_transform(x_train[num_features]))
x_test[num_features] = pd.DataFrame(scaler.transform(x_test[num_features]))

x_train[num_features].columns = num_features
x_test[num_features].columns = num_features

### 학습 파라미터

In [97]:
is_holdout = False
n_splits = 5
iterations = 3000
patience = 100

cv = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)

In [98]:
x_train[num_features] = x_train[num_features].astype(float)
x_train[cat_features] = x_train[cat_features].astype(str)

x_test[num_features] = x_test[num_features].astype(float)
x_test[cat_features] = x_test[cat_features].astype(str)

### 학습

In [141]:
scores = []
models = []

for tri, vai in cv.split(x_train):
    print("="*50)
    preds = []

    model = CatBoostClassifier(iterations=iterations, 
                               random_state=SEED,
                               #task_type="GPU",
                               eval_metric="F1",
                               cat_features=cat_features,
                               one_hot_max_size=4)
    model.fit(x_train.iloc[tri], y_train[tri], 
            eval_set=[(x_train.iloc[vai], y_train[vai])], 
            early_stopping_rounds=patience,
            verbose=100
        )
    
    models.append(model)
    scores.append(model.get_best_score()["validation"]["F1"])
    if is_holdout:
        break    

Learning rate set to 0.086395
0:	learn: 0.6264016	test: 0.6276883	best: 0.6276883 (0)	total: 2.69s	remaining: 2h 14m 14s
100:	learn: 0.6656540	test: 0.6887130	best: 0.6888372 (97)	total: 5m 47s	remaining: 2h 46m 1s
200:	learn: 0.6739496	test: 0.6963631	best: 0.6963841 (199)	total: 11m 16s	remaining: 2h 37m 2s
300:	learn: 0.6780798	test: 0.6984130	best: 0.6985309 (294)	total: 16m 42s	remaining: 2h 29m 53s
400:	learn: 0.6807623	test: 0.6992166	best: 0.6994820 (394)	total: 22m 28s	remaining: 2h 25m 38s
500:	learn: 0.6827867	test: 0.7006053	best: 0.7006513 (499)	total: 27m 32s	remaining: 2h 17m 21s
600:	learn: 0.6848511	test: 0.7005639	best: 0.7008799 (570)	total: 33m 5s	remaining: 2h 12m 6s
700:	learn: 0.6862534	test: 0.7011576	best: 0.7013679 (684)	total: 38m 45s	remaining: 2h 7m 8s
800:	learn: 0.6875419	test: 0.7020456	best: 0.7020456 (800)	total: 44m 13s	remaining: 2h 1m 25s
900:	learn: 0.6889196	test: 0.7024394	best: 0.7026571 (868)	total: 50m 13s	remaining: 1h 56m 59s
1000:	learn: 0.

---

### cv 결과 확인

In [120]:
print(scores)
print(np.mean(scores))

[0.6941032634251858, 0.6931561936259771, 0.6906621008861878, 0.6916580262041664, 0.6902584941867027]
0.691967615665644


### threshold 정의

In [183]:
threshold = 0.375

### threshold값 변경에 따른 검증점수 확인 및 추론

In [123]:
pred_list = []
scores = []
for i,(tri, vai) in enumerate( cv.split(x_train) ):
    pred = models[i].predict_proba(x_train.iloc[vai])[:, 1]
    pred = np.where(pred >= threshold , 1, 0)
    score = f1_score(y_train[vai],pred)
    scores.append(score)
    pred = models[i].predict_proba(x_test)[:, 1]
    pred_list.append(pred)
print(scores)
print(np.mean(scores))

[0.7183292814156469, 0.7164586924231714, 0.7154945273631841, 0.7168591518943898, 0.7139016163930254]
0.7162086538978836


### 산술평균 앙상블

In [148]:
pred_list_T = pd.DataFrame(pred_list).T

In [150]:
pred_list_T.to_csv(f"{SUBMIT_PATH}0123_sh_70435_proba.csv", index=False)