In [117]:
import os
import sys
import platform, psutil
import pandas as pd
import numpy as np
import catboost
from catboost import CatBoostClassifier
import optuna
from optuna.samplers import TPESampler

In [118]:
# 개발환경(OS) 및 라이브러리 버전
print(f"- os: {platform.platform()}")
print(f"- Process information: {platform.processor()}")
print(f"- Process Architecture: {platform.machine()}")
print(f"- RAM Size: {str(round(psutil.virtual_memory().total / (1024.0 **3)))} GB")
print()
print(f"- python: {sys.version}")
print(f"- pandas: {pd.__version__}")
print(f"- numpy: {np.__version__}")
print(f"- sklearn: {sklearn.__version__}")
print(f"- catboost: {catboost.__version__}")
print(f"- optuna: {optuna.__version__}")

- os: Windows-10-10.0.19041-SP0
- Process information: Intel64 Family 6 Model 142 Stepping 10, GenuineIntel
- Process Architecture: AMD64
- RAM Size: 16 GB

- python: 3.7.6 (default, Jan  8 2020, 20:23:39) [MSC v.1916 64 bit (AMD64)]
- pandas: 1.1.2
- numpy: 1.19.2
- sklearn: 0.23.2
- catboost: 1.0.4
- optuna: 2.10.0


# 전처리

In [1]:
from enum import Enum
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier

In [2]:
class StrEnum(str, Enum):
    def _generate_next_value_(name, start, count, last_values):
        return name

    def __repr__(self):
        return self.value

    def __str__(self):
        return self.value

In [3]:
class CodeD(StrEnum):
    L = "속성 D 대분류코드"
    M = "속성 D 중분류코드"
    S = "속성 D 소분류코드"
    XS = "속성 D 세분류코드"

In [4]:
class CodeH(StrEnum):
    L = "속성 H 대분류코드"
    M = "속성 H 중분류코드"

In [5]:
class CodeL(StrEnum):
    L = "속성 L 대분류코드"
    M = "속성 L 중분류코드"
    S = "속성 L 소분류코드"
    XS = "속성 L 세분류코드"

In [6]:
def load_data():
    train = pd.read_csv("./data/train.csv", encoding='UTF8')
    test = pd.read_csv("./data/test.csv", encoding='UTF8')
    return train, test

In [7]:
def load_code():
    codeD = pd.read_csv("./data/속성_D_코드.csv", index_col=0, encoding='UTF8').T.to_dict()
    codeH = pd.read_csv("./data/속성_H_코드.csv", index_col=0, encoding='UTF8').T.to_dict()
    codeL = pd.read_csv("./data/속성_L_코드.csv", index_col=0, encoding='UTF8').T.to_dict()
    return codeD, codeH, codeL

In [8]:
def add_subcode(df, codeD, codeH, codeL):
    df = df.copy()
    
    df['person_prefer_d_1_l'] = df['person_prefer_d_1'].apply(lambda x: codeD[x][CodeD.L])
    df['person_prefer_d_1_m'] = df['person_prefer_d_1'].apply(lambda x: codeD[x][CodeD.M])
    df['person_prefer_d_1_s'] = df['person_prefer_d_1'].apply(lambda x: codeD[x][CodeD.S])
    df['person_prefer_d_1_xs'] = df['person_prefer_d_1'].apply(lambda x: codeD[x][CodeD.XS])
    
    df['person_prefer_d_2_l'] = df['person_prefer_d_2'].apply(lambda x: codeD[x][CodeD.L])
    df['person_prefer_d_2_m'] = df['person_prefer_d_2'].apply(lambda x: codeD[x][CodeD.M])
    df['person_prefer_d_2_s'] = df['person_prefer_d_2'].apply(lambda x: codeD[x][CodeD.S])
    df['person_prefer_d_2_xs'] = df['person_prefer_d_2'].apply(lambda x: codeD[x][CodeD.XS])
    
    df['person_prefer_d_3_l'] = df['person_prefer_d_3'].apply(lambda x: codeD[x][CodeD.L])
    df['person_prefer_d_3_m'] = df['person_prefer_d_3'].apply(lambda x: codeD[x][CodeD.M])
    df['person_prefer_d_3_s'] = df['person_prefer_d_3'].apply(lambda x: codeD[x][CodeD.S])
    df['person_prefer_d_3_xs'] = df['person_prefer_d_3'].apply(lambda x: codeD[x][CodeD.XS])

    df['contents_attribute_d_l'] = df['contents_attribute_d'].apply(lambda x: codeD[x][CodeD.L])
    df['contents_attribute_d_m'] = df['contents_attribute_d'].apply(lambda x: codeD[x][CodeD.M])
    df['contents_attribute_d_s'] = df['contents_attribute_d'].apply(lambda x: codeD[x][CodeD.S])
    df['contents_attribute_d_xs'] = df['contents_attribute_d'].apply(lambda x: codeD[x][CodeD.XS])
    
    df['person_prefer_h_1_l'] = df['person_prefer_h_1'].apply(lambda x: codeH[x][CodeH.L])
    df['person_prefer_h_1_m'] = df['person_prefer_h_1'].apply(lambda x: codeH[x][CodeH.M])

    df['person_prefer_h_2_l'] = df['person_prefer_h_2'].apply(lambda x: codeH[x][CodeH.L])
    df['person_prefer_h_2_m'] = df['person_prefer_h_2'].apply(lambda x: codeH[x][CodeH.M])
    
    df['person_prefer_h_3_l'] = df['person_prefer_h_3'].apply(lambda x: codeH[x][CodeH.L])
    df['person_prefer_h_3_m'] = df['person_prefer_h_3'].apply(lambda x: codeH[x][CodeH.M])
    
    df['contents_attribute_h_l'] = df['contents_attribute_h'].apply(lambda x: codeH[x][CodeH.L])
    df['contents_attribute_h_m'] = df['contents_attribute_h'].apply(lambda x: codeH[x][CodeH.M])
    
    df['contents_attribute_l_l'] = df['contents_attribute_l'].apply(lambda x: codeL[x][CodeL.L])
    df['contents_attribute_l_m'] = df['contents_attribute_l'].apply(lambda x: codeL[x][CodeL.M])
    df['contents_attribute_l_s'] = df['contents_attribute_l'].apply(lambda x: codeL[x][CodeL.S])
    df['contents_attribute_l_xs'] = df['contents_attribute_l'].apply(lambda x: codeL[x][CodeL.XS])
    
    return df

In [9]:
# person d와 contents d의 매치 여부에 따라 새로운 컬럼 생성

def add_D_match(df):
    df = df.copy()
    df['d_1_l_match_yn'] = df['person_prefer_d_1_l'] == df['contents_attribute_d_l']
    df['d_1_m_match_yn'] = df['person_prefer_d_1_m'] == df['contents_attribute_d_m']
    df['d_1_s_match_yn'] = df['person_prefer_d_1_s'] == df['contents_attribute_d_s']
    df['d_1_xs_match_yn'] = df['person_prefer_d_1_xs'] == df['contents_attribute_d_xs']
    df['d_1_match_yn'] = df['person_prefer_d_1'] == df['contents_attribute_d']
    
    df['d_2_l_match_yn'] = df['person_prefer_d_2_l'] == df['contents_attribute_d_l']
    df['d_2_m_match_yn'] = df['person_prefer_d_2_m'] == df['contents_attribute_d_m']
    df['d_2_s_match_yn'] = df['person_prefer_d_2_s'] == df['contents_attribute_d_s']
    df['d_2_xs_match_yn'] = df['person_prefer_d_2_xs'] == df['contents_attribute_d_xs']
    df['d_2_match_yn'] = df['person_prefer_d_2'] == df['contents_attribute_d']
    
    df['d_3_l_match_yn'] = df['person_prefer_d_3_l'] == df['contents_attribute_d_l']
    df['d_3_m_match_yn'] = df['person_prefer_d_3_m'] == df['contents_attribute_d_m']
    df['d_3_s_match_yn'] = df['person_prefer_d_3_s'] == df['contents_attribute_d_s']
    df['d_3_xs_match_yn'] = df['person_prefer_d_3_xs'] == df['contents_attribute_d_xs']
    df['d_3_match_yn'] = df['person_prefer_d_3'] == df['contents_attribute_d']
    return df

In [10]:
# person h와 contents h의 매치 여부에 따라 새로운 컬럼 생성

def add_H_match(df):
    df = df.copy()
    df['h_1_l_match_yn'] = df['person_prefer_h_1_l'] == df['contents_attribute_h_l']
    df['h_1_m_match_yn'] = df['person_prefer_h_1_m'] == df['contents_attribute_h_m']
    df['h_1_match_yn'] = df['person_prefer_h_1'] == df['contents_attribute_h']
    
    df['h_2_l_match_yn'] = df['person_prefer_h_2_l'] == df['contents_attribute_h_l']
    df['h_2_m_match_yn'] = df['person_prefer_h_2_m'] == df['contents_attribute_h_m']
    df['h_2_match_yn'] = df['person_prefer_h_2'] == df['contents_attribute_h']
    
    df['h_3_l_match_yn'] = df['person_prefer_h_3_l'] == df['contents_attribute_h_l']
    df['h_3_m_match_yn'] = df['person_prefer_h_3_m'] == df['contents_attribute_h_m']
    df['h_3_match_yn'] = df['person_prefer_h_3'] == df['contents_attribute_h']
    return df

In [11]:
# person e와 contents e의 차이에 대한 새로운 컬럼 생성

def add_E_abs(df):
    df = df.copy()
    df["e_abs"] = df.apply(lambda x: abs(x.person_prefer_e - x.contents_attribute_e), axis=1)
    return df

In [12]:
# person a와 contents a의 일치 여부에 대한 새로운 컬럼 형성

def encode_A(x):
    if x.contents_attribute_a == 3:
        return 2
    if x.person_attribute_a == x.contents_attribute_a:
        return 1
    return 0

def add_A_match(df):
    df = df.copy()
    df["a_match_ynx"] = df.apply(lambda x: encode_A(x), axis=1)
    
    return df


In [13]:
# person c와 contents c의 일치 여부에 대한 새로운 컬럼 형성

def add_C_match(df):
    df = df.copy()
    df['c_match_yn'] = df['person_prefer_c'] == df['contents_attribute_c']
    return df

In [14]:
def preprocess_bool(df):
    df = df.copy()
    for col in df.select_dtypes(include='bool').columns:
        df[col] = df[col].astype(int)
        
    return df

In [15]:
def preprocess_add_match_columns(df):
    df = (df.copy()
           .pipe(add_D_match)
           .pipe(add_H_match)
           .pipe(add_A_match)
           .pipe(add_C_match)
           .pipe(add_E_abs))
    return df

In [16]:
def preprocess(df):
    df = preprocess_add_match_columns(df)
    df = preprocess_bool(df)
    return df

In [17]:
def init_data():
    train, test = load_data()
    codeD, codeH, codeL = load_code()
    
    train = add_subcode(train, codeD, codeH, codeL)
    test = add_subcode(test, codeD, codeH, codeL)
    
    train_preprocessed = preprocess(train)
    test_preprocessed = preprocess(test)
    return train_preprocessed, test_preprocessed

In [18]:
train, test = init_data()

In [19]:
df_by_contents_rn = train.groupby('contents_rn').filter(lambda x: len(x) > 1)

In [20]:
df_by_person_rn = train.groupby('person_rn').filter(lambda x: len(x) > 1)

In [21]:
contents_attributes = ['contents_attribute_i', 'contents_attribute_a', 'contents_attribute_j_1', 'contents_attribute_j', 'contents_attribute_c', 'contents_attribute_k', 'contents_attribute_l', 'contents_attribute_d', 'contents_attribute_m', 'contents_attribute_e', 'contents_attribute_h']
contents_rn_not_mathced = []

for contents_rn in set(df_by_contents_rn['contents_rn']):
    df = df_by_contents_rn[df_by_contents_rn['contents_rn'] == contents_rn]
    for contents_attribute in contents_attributes:
        if len(set(df[contents_attribute])) != 1:
            print(f'contents rn({contents_rn}) 안맞음 : {contents_attribute}')
            contents_rn_not_mathced.append(contents_rn)

contents rn(262795) 안맞음 : contents_attribute_l
contents rn(584221) 안맞음 : contents_attribute_l
contents rn(64544) 안맞음 : contents_attribute_l
contents rn(68370) 안맞음 : contents_attribute_l
contents rn(68446) 안맞음 : contents_attribute_l
contents rn(70260) 안맞음 : contents_attribute_l
contents rn(81058) 안맞음 : contents_attribute_l
contents rn(344737) 안맞음 : contents_attribute_l
contents rn(83700) 안맞음 : contents_attribute_l
contents rn(105560) 안맞음 : contents_attribute_l
contents rn(111232) 안맞음 : contents_attribute_l
contents rn(115453) 안맞음 : contents_attribute_l
contents rn(115454) 안맞음 : contents_attribute_l
contents rn(643370) 안맞음 : contents_attribute_l
contents rn(644714) 안맞음 : contents_attribute_l
contents rn(654322) 안맞음 : contents_attribute_l
contents rn(657364) 안맞음 : contents_attribute_l
contents rn(657768) 안맞음 : contents_attribute_l
contents rn(133871) 안맞음 : contents_attribute_l
contents rn(659463) 안맞음 : contents_attribute_l
contents rn(136035) 안맞음 : contents_attribute_l
contents rn(138762)

In [22]:
contents_rn_not_mathced

[262795,
 584221,
 64544,
 68370,
 68446,
 70260,
 81058,
 344737,
 83700,
 105560,
 111232,
 115453,
 115454,
 643370,
 644714,
 654322,
 657364,
 657768,
 133871,
 659463,
 136035,
 138762,
 404740,
 427239,
 428819,
 437370,
 701844,
 468740,
 736713,
 485167,
 485698,
 486410,
 489505,
 496088]

### 날짜를 Y/M/D로 나누기

In [23]:
from datetime import datetime

def preprocessing_contents_open_dt(data):
    data['contents_open_dt'] = data['contents_open_dt'].astype('str')
    DATE = data['contents_open_dt'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
    
    DATE = pd.DataFrame(DATE)
    DATE = DATE.rename(columns = {'contents_open_dt': 'date'})
    
    DATE['Y'] = DATE['date'].apply(lambda x: x.timetuple()[0])
    DATE['M'] = DATE['date'].apply(lambda x: x.timetuple()[1])
    DATE['D'] = DATE['date'].apply(lambda x: x.timetuple()[2])
    DATE['id'] = data['id']
    
    data = data.merge(DATE, on = 'id', how = 'left')
    data = data.drop(columns = ['date', 'contents_open_dt'])
    return data

train = preprocessing_contents_open_dt(train)
test = preprocessing_contents_open_dt(test)

### 컬럼 제거

prefer_f, prefer_g : 임포턴스가 0이다

id : 임포턴스가 너무 높다

In [24]:
train.columns

Index(['id', 'd_l_match_yn', 'd_m_match_yn', 'd_s_match_yn', 'h_l_match_yn',
       'h_m_match_yn', 'h_s_match_yn', 'person_attribute_a',
       'person_attribute_a_1', 'person_attribute_b', 'person_prefer_c',
       'person_prefer_d_1', 'person_prefer_d_2', 'person_prefer_d_3',
       'person_prefer_e', 'person_prefer_f', 'person_prefer_g',
       'person_prefer_h_1', 'person_prefer_h_2', 'person_prefer_h_3',
       'contents_attribute_i', 'contents_attribute_a',
       'contents_attribute_j_1', 'contents_attribute_j',
       'contents_attribute_c', 'contents_attribute_k', 'contents_attribute_l',
       'contents_attribute_d', 'contents_attribute_m', 'contents_attribute_e',
       'contents_attribute_h', 'person_rn', 'contents_rn', 'target',
       'person_prefer_d_1_l', 'person_prefer_d_1_m', 'person_prefer_d_1_s',
       'person_prefer_d_1_xs', 'person_prefer_d_2_l', 'person_prefer_d_2_m',
       'person_prefer_d_2_s', 'person_prefer_d_2_xs', 'person_prefer_d_3_l',
       'person_pr

In [25]:
cols = ['person_prefer_f','person_prefer_g','id','d_l_match_yn', 'd_m_match_yn', 'd_s_match_yn', 'h_l_match_yn','h_m_match_yn', 'h_s_match_yn']

train = train.drop(cols,axis = 1)

In [26]:
train

Unnamed: 0,person_attribute_a,person_attribute_a_1,person_attribute_b,person_prefer_c,person_prefer_d_1,person_prefer_d_2,person_prefer_d_3,person_prefer_e,person_prefer_h_1,person_prefer_h_2,...,h_2_match_yn,h_3_l_match_yn,h_3_m_match_yn,h_3_match_yn,a_match_ynx,c_match_yn,e_abs,Y,M,D
0,1,4,3,5,275,370,369,8,4,95,...,0,0,0,0,2,0,4,2020,1,17
1,1,3,4,1,114,181,175,4,131,101,...,0,1,0,0,2,1,0,2020,6,18
2,2,0,3,5,464,175,452,3,54,263,...,0,1,0,0,0,0,1,2020,7,8
3,2,0,2,5,703,705,704,3,72,227,...,0,0,0,0,2,0,0,2020,1,13
4,1,3,4,5,275,370,369,4,214,210,...,0,0,0,0,1,0,0,2020,3,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501946,1,1,2,2,1192,935,1228,3,59,4,...,0,0,0,0,2,0,2,2020,3,13
501947,1,6,2,1,118,113,110,4,105,142,...,1,1,0,0,2,1,0,2020,1,20
501948,1,7,4,1,147,46,145,4,59,127,...,0,0,0,0,1,1,3,2020,8,5
501949,1,1,2,1,46,147,145,4,251,49,...,0,1,0,0,0,1,0,2020,6,15


In [27]:
cols = ['person_prefer_f','person_prefer_g','id','d_l_match_yn', 'd_m_match_yn', 'd_s_match_yn', 'h_l_match_yn','h_m_match_yn', 'h_s_match_yn']

test = test.drop(cols,axis = 1)

In [28]:
test

Unnamed: 0,person_attribute_a,person_attribute_a_1,person_attribute_b,person_prefer_c,person_prefer_d_1,person_prefer_d_2,person_prefer_d_3,person_prefer_e,person_prefer_h_1,person_prefer_h_2,...,h_2_match_yn,h_3_l_match_yn,h_3_m_match_yn,h_3_match_yn,a_match_ynx,c_match_yn,e_abs,Y,M,D
0,1,1,2,1,857,851,1227,4,263,56,...,0,0,0,0,2,1,1,2020,12,1
1,2,0,2,2,683,1086,662,2,258,263,...,1,0,0,0,2,0,2,2020,12,17
2,2,3,2,1,514,790,1233,0,177,170,...,0,1,0,0,2,1,4,2020,12,10
3,1,2,2,5,114,181,175,4,177,170,...,0,1,0,0,2,0,1,2020,12,3
4,1,6,4,5,1082,1078,1056,5,178,177,...,1,0,0,0,1,0,1,2020,12,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46399,2,0,4,1,147,46,145,4,2,4,...,0,0,0,0,2,1,1,2020,12,17
46400,2,0,4,1,176,120,159,4,86,31,...,0,0,0,0,2,1,0,2020,12,29
46401,2,0,3,1,145,46,147,5,288,279,...,0,1,0,0,2,1,1,2020,12,10
46402,1,3,3,2,145,46,147,6,288,279,...,0,1,0,0,1,0,1,2020,12,7


### content_rn에 따른 속성 맞춰주기 

같은 컨텐츠는 같은 컨텐츠 속성을 갖고 있을 것으므로 이를 맞춰주겠다.

contents_attribute_l에만 존재

In [29]:
#다른 컨텐츠 속성을 갖고 있는 contents_rn

contents_rn_not_mathced

[262795,
 584221,
 64544,
 68370,
 68446,
 70260,
 81058,
 344737,
 83700,
 105560,
 111232,
 115453,
 115454,
 643370,
 644714,
 654322,
 657364,
 657768,
 133871,
 659463,
 136035,
 138762,
 404740,
 427239,
 428819,
 437370,
 701844,
 468740,
 736713,
 485167,
 485698,
 486410,
 489505,
 496088]

In [30]:
train.loc[(train['contents_rn'] == 584221)]['contents_attribute_l']

40723     234
91078     234
91079     234
91102     234
161273    234
161364    234
161366    234
469925     36
Name: contents_attribute_l, dtype: int64

In [31]:
train.loc[(train['contents_rn'] == 64544)]['contents_attribute_l']

171732    245
199663    245
223251     38
Name: contents_attribute_l, dtype: int64

In [32]:
index_l = train.loc[(train['contents_rn'] == 584221)]['contents_attribute_l'].index

In [33]:
index_l

Int64Index([40723, 91078, 91079, 91102, 161273, 161364, 161366, 469925], dtype='int64')

In [34]:
train.loc[(train['contents_rn'] == 584221)]['contents_attribute_l'].index

Int64Index([40723, 91078, 91079, 91102, 161273, 161364, 161366, 469925], dtype='int64')

In [35]:
from scipy.stats import mode

int(mode(train[train['contents_rn'] == 584221]['contents_attribute_l'])[0])

234

In [36]:
for i in contents_rn_not_mathced:
    if len(train[train['contents_rn'] == i]['contents_attribute_l']) == 2:
        print(i)
    else:
        index_l = train.loc[(train['contents_rn'] == i)]['contents_attribute_l'].index
        mode_num = int(mode(train[train['contents_rn'] == i]['contents_attribute_l'])[0])
        train['contents_attribute_l'][index_l] = mode_num

262795
70260
115453
644714
657768
428819
468740
736713
486410
489505
496088


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a Data

In [37]:
train.loc[(train['contents_rn'] == 485698)]['contents_attribute_l']

301763    359
301768    359
348920    359
Name: contents_attribute_l, dtype: int64

### catboost 인코딩

In [38]:
train.columns

Index(['person_attribute_a', 'person_attribute_a_1', 'person_attribute_b',
       'person_prefer_c', 'person_prefer_d_1', 'person_prefer_d_2',
       'person_prefer_d_3', 'person_prefer_e', 'person_prefer_h_1',
       'person_prefer_h_2', 'person_prefer_h_3', 'contents_attribute_i',
       'contents_attribute_a', 'contents_attribute_j_1',
       'contents_attribute_j', 'contents_attribute_c', 'contents_attribute_k',
       'contents_attribute_l', 'contents_attribute_d', 'contents_attribute_m',
       'contents_attribute_e', 'contents_attribute_h', 'person_rn',
       'contents_rn', 'target', 'person_prefer_d_1_l', 'person_prefer_d_1_m',
       'person_prefer_d_1_s', 'person_prefer_d_1_xs', 'person_prefer_d_2_l',
       'person_prefer_d_2_m', 'person_prefer_d_2_s', 'person_prefer_d_2_xs',
       'person_prefer_d_3_l', 'person_prefer_d_3_m', 'person_prefer_d_3_s',
       'person_prefer_d_3_xs', 'contents_attribute_d_l',
       'contents_attribute_d_m', 'contents_attribute_d_s',
       'c

In [39]:
train_encode_str=train.applymap(str)

In [40]:
test_encode_str=test.applymap(str)

In [41]:
train.columns

Index(['person_attribute_a', 'person_attribute_a_1', 'person_attribute_b',
       'person_prefer_c', 'person_prefer_d_1', 'person_prefer_d_2',
       'person_prefer_d_3', 'person_prefer_e', 'person_prefer_h_1',
       'person_prefer_h_2', 'person_prefer_h_3', 'contents_attribute_i',
       'contents_attribute_a', 'contents_attribute_j_1',
       'contents_attribute_j', 'contents_attribute_c', 'contents_attribute_k',
       'contents_attribute_l', 'contents_attribute_d', 'contents_attribute_m',
       'contents_attribute_e', 'contents_attribute_h', 'person_rn',
       'contents_rn', 'target', 'person_prefer_d_1_l', 'person_prefer_d_1_m',
       'person_prefer_d_1_s', 'person_prefer_d_1_xs', 'person_prefer_d_2_l',
       'person_prefer_d_2_m', 'person_prefer_d_2_s', 'person_prefer_d_2_xs',
       'person_prefer_d_3_l', 'person_prefer_d_3_m', 'person_prefer_d_3_s',
       'person_prefer_d_3_xs', 'contents_attribute_d_l',
       'contents_attribute_d_m', 'contents_attribute_d_s',
       'c

In [42]:
target=train.loc[:,'target']

In [43]:
pip install --upgrade category_encoders

Requirement already up-to-date: category_encoders in c:\users\jlee0\anaconda3\lib\site-packages (2.3.0)
Note: you may need to restart the kernel to use updated packages.


X_train: 명목형, 순서형 변수 모두 합친 x_feature(target 제외)

In [44]:
X_train = train_encode_str

In [45]:
X_train.drop(['target'],inplace=True, axis=1)

In [46]:
len(X_train.columns)

82

In [47]:
X_test=test_encode_str

In [48]:
len(X_test.columns)

82

In [49]:
import category_encoders as ce
# cols에 기재된 column에 대해서만 encoding 진행됨
CBE_encoder = ce.cat_boost.CatBoostEncoder(cols=['person_attribute_a',
       'person_attribute_a_1', 'person_attribute_b', 'person_prefer_c',
       'person_prefer_d_1', 'person_prefer_d_2', 'person_prefer_d_3',
       'person_prefer_e', 'person_prefer_h_1', 'person_prefer_h_2',
       'person_prefer_h_3', 'contents_attribute_i', 'contents_attribute_a',
       'contents_attribute_j_1', 'contents_attribute_j',
       'contents_attribute_c', 'contents_attribute_k', 'contents_attribute_l',
       'contents_attribute_d', 'contents_attribute_m', 'contents_attribute_e',
       'contents_attribute_h', 'person_rn', 'contents_rn',
       'person_prefer_d_1_l', 'person_prefer_d_1_m', 'person_prefer_d_1_s',
       'person_prefer_d_1_xs', 'person_prefer_d_2_l', 'person_prefer_d_2_m',
       'person_prefer_d_2_s', 'person_prefer_d_2_xs', 'person_prefer_d_3_l',
       'person_prefer_d_3_m', 'person_prefer_d_3_s', 'person_prefer_d_3_xs',
       'contents_attribute_d_l', 'contents_attribute_d_m',
       'contents_attribute_d_s', 'contents_attribute_d_xs',
       'person_prefer_h_1_l', 'person_prefer_h_1_m', 'person_prefer_h_2_l',
       'person_prefer_h_2_m', 'person_prefer_h_3_l', 'person_prefer_h_3_m',
       'contents_attribute_h_l', 'contents_attribute_h_m',
       'contents_attribute_l_l', 'contents_attribute_l_m',
       'contents_attribute_l_s', 'contents_attribute_l_xs', 'd_1_l_match_yn',
       'd_1_m_match_yn', 'd_1_s_match_yn', 'd_1_xs_match_yn', 'd_1_match_yn',
       'd_2_l_match_yn', 'd_2_m_match_yn', 'd_2_s_match_yn', 'd_2_xs_match_yn',
       'd_2_match_yn', 'd_3_l_match_yn', 'd_3_m_match_yn', 'd_3_s_match_yn',
       'd_3_xs_match_yn', 'd_3_match_yn', 'h_1_l_match_yn', 'h_1_m_match_yn',
       'h_1_match_yn', 'h_2_l_match_yn', 'h_2_m_match_yn', 'h_2_match_yn',
       'h_3_l_match_yn', 'h_3_m_match_yn', 'h_3_match_yn', 'a_match_ynx',
       'c_match_yn', 'e_abs', 'Y', 'M', 'D'])

test_cbe는 target value 없어서 train data로 split

# 모델링 - LGBM

LGBM으로 기본 모델링 후 이를 성능을 향상 시키겠다

In [56]:
import sklearn 
from sklearn.model_selection import StratifiedKFold , KFold,GridSearchCV
from sklearn.metrics import f1_score 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score

In [57]:
from sklearn.model_selection import train_test_split 
x_train, x_test, y_train, y_test = train_test_split(X_train, target, test_size=.3)

In [58]:
from lightgbm import LGBMClassifier

#optuna로 뽑은 최적 파라미터 사용
lgbm = LGBMClassifier(objective="binary", reg_alpha= 1.987904330777592e-05, reg_lambda= 0.028054003730936226, max_depth= 11, num_leaves= 141, colsample_bytree= 0.5109126733153162, subsample= 0.9787092394351908, subsample_freq= 8, min_child_samples= 95, max_bin= 469)

In [59]:
from sklearn.pipeline import Pipeline

scaler = CBE_encoder
base_model = lgbm
pipe = Pipeline([('scaler',scaler),('base_model',base_model)])

In [61]:
pipe.fit(x_train, y_train)
print('train_set score : ', pipe.score(x_train,y_train))
print('test_set score : ', pipe.score(x_test,y_test))

train_set score :  0.7785977544718455
test_set score :  0.6480682135125443


In [62]:
result=pipe.predict_proba(X_test)[:,1]
result_lgbm=pd.DataFrame(result)

In [63]:
result_lgbm

Unnamed: 0,0
0,0.466087
1,0.378593
2,0.337272
3,0.238087
4,0.366112
...,...
46399,0.585523
46400,0.627200
46401,0.574791
46402,0.582383


# 랜덤포레스트

In [66]:
from sklearn.model_selection import train_test_split 
x_train, x_test, y_train, y_test = train_test_split(X_train, target, test_size=.3,random_state=42)

In [67]:
scaler = CBE_encoder
rf = RandomForestClassifier(n_estimators = 106, max_depth = 75, 
                            min_samples_split = 54, min_samples_leaf = 146, 
                            max_samples= 0.6166889653236419, max_leaf_nodes= 200)
pipe_rf = Pipeline([('scaler',scaler),('base_model',rf)])
pipe_rf.fit(x_train,y_train)

Pipeline(steps=[('scaler',
                 CatBoostEncoder(cols=['person_attribute_a',
                                       'person_attribute_a_1',
                                       'person_attribute_b', 'person_prefer_c',
                                       'person_prefer_d_1', 'person_prefer_d_2',
                                       'person_prefer_d_3', 'person_prefer_e',
                                       'person_prefer_h_1', 'person_prefer_h_2',
                                       'person_prefer_h_3',
                                       'contents_attribute_i',
                                       'contents_attribute_a',
                                       'contents_attribute_j_1',
                                       'contents_...
                                       'contents_attribute_h', 'person_rn',
                                       'contents_rn', 'person_prefer_d_1_l',
                                       'person_prefer_d_1_m',
         

In [68]:
pred_rf = pipe_rf.predict_proba(X_test)[:,1]

result_rf=pd.DataFrame(pred_rf)

In [69]:
result_rf

Unnamed: 0,0
0,0.440198
1,0.404950
2,0.361323
3,0.360771
4,0.365312
...,...
46399,0.588303
46400,0.628416
46401,0.594461
46402,0.594600


# 모델링 - Catboost

In [70]:
train, test = init_data()

In [71]:
train['contents_open_dt'] =  pd.to_datetime(train['contents_open_dt'], format='%Y-%m-%d %H:%M:%S')
test['contents_open_dt'] =  pd.to_datetime(test['contents_open_dt'], format='%Y-%m-%d %H:%M:%S')

In [72]:
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()
train["contents_open_dt"] = min_max_scaler.fit_transform(train[["contents_open_dt"]])
test["contents_open_dt"] = min_max_scaler.fit_transform(test[["contents_open_dt"]])

In [73]:
train['contents_open_dt'] = train['contents_open_dt'].astype('float') 
test['contents_open_dt'] = test['contents_open_dt'].astype('float') 

In [74]:
cols = ['person_prefer_f','person_prefer_g','id']

train = train.drop(cols,axis = 1)
test = test.drop(cols,axis = 1)

In [75]:
cols = ['d_l_match_yn', 'd_m_match_yn', 'd_s_match_yn', 'h_l_match_yn',
       'h_m_match_yn', 'h_s_match_yn']

train = train.drop(cols,axis = 1)
target = train.loc[:, 'target']
train.drop(['target'], inplace=True, axis=1)

test = test.drop(cols,axis = 1)

In [76]:
# cols에 기재된 column에 대해서만 encoding 진행됨
cat_features=['person_attribute_a',
       'person_prefer_c',
       'person_prefer_d_1', 'person_prefer_d_2', 'person_prefer_d_3',
       'person_prefer_h_1', 'person_prefer_h_2',
       'person_prefer_h_3', 'contents_attribute_i', 'contents_attribute_a',
       'contents_attribute_j_1', 'contents_attribute_j',
       'contents_attribute_c', 'contents_attribute_k', 'contents_attribute_l',
       'contents_attribute_d', 'contents_attribute_m',
       'contents_attribute_h', 'person_rn', 'contents_rn',
       'person_prefer_d_1_l', 'person_prefer_d_1_m', 'person_prefer_d_1_s',
       'person_prefer_d_1_xs', 'person_prefer_d_2_l', 'person_prefer_d_2_m',
       'person_prefer_d_2_s', 'person_prefer_d_2_xs', 'person_prefer_d_3_l',
       'person_prefer_d_3_m', 'person_prefer_d_3_s', 'person_prefer_d_3_xs',
       'contents_attribute_d_l', 'contents_attribute_d_m',
       'contents_attribute_d_s', 'contents_attribute_d_xs',
       'person_prefer_h_1_l', 'person_prefer_h_1_m', 'person_prefer_h_2_l',
       'person_prefer_h_2_m', 'person_prefer_h_3_l', 'person_prefer_h_3_m',
       'contents_attribute_h_l', 'contents_attribute_h_m',
       'contents_attribute_l_l', 'contents_attribute_l_m',
       'contents_attribute_l_s', 'contents_attribute_l_xs', 'd_1_l_match_yn',
       'd_1_m_match_yn', 'd_1_s_match_yn', 'd_1_xs_match_yn', 'd_1_match_yn',
       'd_2_l_match_yn', 'd_2_m_match_yn', 'd_2_s_match_yn', 'd_2_xs_match_yn',
       'd_2_match_yn', 'd_3_l_match_yn', 'd_3_m_match_yn', 'd_3_s_match_yn',
       'd_3_xs_match_yn', 'd_3_match_yn', 'h_1_l_match_yn', 'h_1_m_match_yn',
       'h_1_match_yn', 'h_2_l_match_yn', 'h_2_m_match_yn', 'h_2_match_yn',
       'h_3_l_match_yn', 'h_3_m_match_yn', 'h_3_match_yn', 'a_match_ynx',
       'c_match_yn', 'e_abs']

In [80]:
## 본학습

from sklearn.model_selection import KFold, StratifiedKFold
from catboost import CatBoostClassifier, Pool, cv

SEED = 42

cv = KFold(n_splits=7, shuffle=True)

models = []
cv_scores = []

params = {
          'bagging_temperature': 0.0133,
          'max_depth': 8,
          'random_strength': 51,
        #   'colsample_bylevel': 0.9445,
          'l2_leaf_reg': 7.486e-06,
          'min_child_samples': 44,
          'max_bin': 427,
          'od_type': 'IncToDec'
          }


for train_idx, valid_idx in cv.split(train, target):
    x_train_fold, x_valid_fold = train.iloc[train_idx], train.iloc[valid_idx]
    y_train_fold, y_valid_fold = target[train_idx], target[valid_idx]
    
    # model = CatBoostClassifier(task_type="GPU", cat_features=cat_features, eval_metric='F1', one_hot_max_size=6, random_state=SEED, od_type="IncToDec", max_bin=427, min_child_samples=44, l2_leaf_reg=7.45e-06)
    model = CatBoostClassifier(iterations=3000, cat_features=cat_features, eval_metric='F1', one_hot_max_size=5, random_state=SEED, **params)
    
    model.fit(x_train_fold, y_train_fold, eval_set=[(x_valid_fold, y_valid_fold)] , early_stopping_rounds=50)
    models.append(model)
    
    cv_scores.append(model.get_best_score()['validation']['F1'])
    

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

0:	learn: 0.6672278	test: 0.6656423	best: 0.6656423 (0)	total: 191ms	remaining: 9m 32s
1:	learn: 0.5397235	test: 0.5576168	best: 0.6656423 (0)	total: 2.95s	remaining: 1h 13m 47s
2:	learn: 0.5900566	test: 0.5970804	best: 0.6656423 (0)	total: 5.86s	remaining: 1h 37m 34s
3:	learn: 0.5907248	test: 0.5948342	best: 0.6656423 (0)	total: 7.85s	remaining: 1h 37m 58s
4:	learn: 0.5958231	test: 0.6005109	best: 0.6656423 (0)	total: 10.4s	remaining: 1h 43m 35s
5:	learn: 0.6044849	test: 0.6074104	best: 0.6656423 (0)	total: 12.5s	remaining: 1h 43m 38s
6:	learn: 0.6070912	test: 0.6100218	best: 0.6656423 (0)	total: 13.6s	remaining: 1h 36m 54s
7:	learn: 0.6039258	test: 0.6063580	best: 0.6656423 (0)	total: 15.6s	remaining: 1h 37m 30s
8:	learn: 0.6045733	test: 0.6095196	best: 0.6656423 (0)	total: 18.2s	remaining: 1h 41m
9:	learn: 0.6077597	test: 0.6119596	best: 0.6656423 (0)	total: 20.7s	remaining: 1h 42m 58s
10:	learn: 0.6133200	test: 0.6177150	best: 0.6656423 (0)	total: 23s	remaining: 1h 44m 2s
11:	learn

37:	learn: 0.6177749	test: 0.6331020	best: 0.6430617 (11)	total: 1m 34s	remaining: 2h 2m 22s
38:	learn: 0.6178326	test: 0.6336890	best: 0.6430617 (11)	total: 1m 38s	remaining: 2h 4m 18s
39:	learn: 0.6180233	test: 0.6337790	best: 0.6430617 (11)	total: 1m 39s	remaining: 2h 3m 15s
40:	learn: 0.6183036	test: 0.6344422	best: 0.6430617 (11)	total: 1m 44s	remaining: 2h 5m 8s
41:	learn: 0.6175999	test: 0.6340564	best: 0.6430617 (11)	total: 1m 46s	remaining: 2h 5m 20s
42:	learn: 0.6165738	test: 0.6336662	best: 0.6430617 (11)	total: 1m 50s	remaining: 2h 6m 15s
43:	learn: 0.6169923	test: 0.6334573	best: 0.6430617 (11)	total: 1m 53s	remaining: 2h 6m 38s
44:	learn: 0.6166960	test: 0.6331949	best: 0.6430617 (11)	total: 1m 54s	remaining: 2h 5m 42s
45:	learn: 0.6151370	test: 0.6313414	best: 0.6430617 (11)	total: 1m 56s	remaining: 2h 5m 12s
46:	learn: 0.6152205	test: 0.6310229	best: 0.6430617 (11)	total: 2m	remaining: 2h 6m 1s
47:	learn: 0.6156447	test: 0.6324394	best: 0.6430617 (11)	total: 2m 4s	remai

In [81]:
preds = []
for model in models:
    pred = model.predict_proba(test)[:, 1]
    preds.append(pred)

# 앙상블

In [96]:
result1 = np.array(result_lgbm)
result2 = np.array(result_rf)
result3 = np.mean(preds, axis=0) # catboost
result3 = result3.reshape(46404, 1)

In [97]:
result = (result1 + result2 + result3) / 3

In [98]:
result

array([[0.46878647],
       [0.4221174 ],
       [0.39836031],
       ...,
       [0.5595245 ],
       [0.56210162],
       [0.58240072]])

In [99]:
threshold = 0.37

def transform_to_answer(pred):
    pred = pred >= threshold
    return pred.astype('float')

In [104]:
result_sub = transform_to_answer(result)

# 제출

In [113]:
#submission = pd.read_csv("./data/sample_submission.csv")


submission['target'] = result_sub
submission.to_csv('final_submission.csv', index=False)