In [7]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PowerTransformer 
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from category_encoders import TargetEncoder, BinaryEncoder
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.metrics import mean_squared_error
from sklearn import set_config
from sklearn.linear_model import LinearRegression, Ridge, Lasso
import optuna
from optuna.distributions import CategoricalDistribution, IntDistribution, FloatDistribution
from optuna.integration import OptunaSearchCV, ShapleyImportanceEvaluator
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.impute import SimpleImputer
plt.rc('font', family='malgun gothic')

In [8]:
X_train = pd.read_csv('X_train.csv', encoding='cp949').drop(columns='ID')
y_train = pd.read_csv('y_train.csv', encoding='cp949').Salary

X_test = pd.read_csv('X_test.csv', encoding='cp949')
test_id = X_test.ID
X_test = X_test.drop(columns='ID')

data = pd.concat([X_train, X_test]).reset_index().drop(columns='index')

data.loc[data[data['직종']=='호텔/콘도/리조트'].index, '직종'] = '기타 직종'

# 피쳐 정리

In [9]:
g_d = []
for i in data['출신대학']:
    if '여자' in i:
        g_d.append('yes')
    else :
        g_d.append('no')
data['여대'] = g_d

null_list = data[data['근무경력']==0].index
data['근무형태'][null_list] = data['근무형태'][null_list].fillna('경력없음')
data['근무형태'] = data['근무형태'].fillna('missing')


data['대학성적'] = data.대학성적.fillna(60)

In [10]:
# 자격증 보기 쉽게 변환
data['자격증'] = data['자격증'].str.replace('無', 'x')
data['자격증'] = data['자격증'].str.replace('有', 'o')

# 근무 형태 처리
null_list = data[data['근무경력']==0].index
data['근무형태'][null_list] = data['근무형태'][null_list].fillna('경력없음')
data['근무형태'] = data['근무형태'].fillna('missing')

#
data['근무형태'] = data['근무형태'].str.replace(',', ' ')
data['근무형태'] = data['근무형태'].str.strip()

#
hyung_list = []
for i in data['근무형태']:
    if i[:3] == '정규직' and '해외취업' in i:
        i = '정규직(해외o)'
    elif i[:3] == '정규직' and '해외취업' not in i:
        i = '정규직(해외x)'
    hyung_list.append(i)    
data['근무형태'] = hyung_list

#
hyun_list2 = []
for i in data['근무형태']:
    if i[:3] == '계약직':
        i = '계약직'
    elif i in ['인턴','파견직']:
        i = '계약직'
    elif i in ['해외취업','병역특례']:
        i = '기타'    
    hyun_list2.append(i)   
data['근무형태'] = hyun_list2

# 어학 시험 전처리
data['어학시험'] = data['어학시험'].fillna('없음')


# 대학 전공 변환
col_list = []
abc = []
for i in data.대학전공:
    if '(' in i:
        for j in range(len(i)):
            if i[j] == '(':
                a = j
            elif i[j] == ')':
                b = j
        last = i[b+1:]
        first = i[:a]
        i = first + last
        abc.append(i)
    col_list.append(i)    
data['대학전공'] = col_list

#
data['대학전공'] = data['대학전공'].str.replace(',',' ')
data['대학전공'] = data['대학전공'].str.replace('/', ' ')
data['대학전공'] = data['대학전공'].str.replace('.',' ')
data['대학전공'] = data['대학전공'].str.strip()
data['대학전공'] = data['대학전공'].str.replace(' ','')

#
col = []
for i in data['대학전공']:
    if i[-1]=='과':
        i = i[:-1]
    col.append(i)    
data['대학전공'] = col

#
col2 = []
for i in data['대학전공']:
    if i[-1]!='학':
        i = i+'학'
    col2.append(i)    
data['대학전공'] = col2


# 근무경력 처리
month_list = []
for i in data['근무경력'].str.split('년'):
    if len(i) == 1:
        i = int(i[0][0])
    else:
        i = int(i[0])*12 + int(i[1].split('개월')[0].strip())
    month_list.append(i)    
data['근무경력'] = month_list


# 세부직종 처리
def full(job):
    value = data[data['세부직종']==job]['직무태그'].value_counts().index[0]
    data.loc[data[data['세부직종']==job]['직무태그'].index, '직무태그'] = data[data['세부직종']==job]['직무태그'].fillna(value)
    
for i in data.세부직종.value_counts().index:
    full(i)
    
# 어학시험 처리
language1 = []
for i in data['어학시험']:
    if i == ' ':
        i = '없음'
    language1.append(i) 
data['어학시험'] = language1

#
language3 = []
for i in data['어학시험']:
    if i[:5] == 'TOEFL':
        i = '토플'
    language3.append(i) 
data['어학시험'] = language3

#
language4 = []
for i in data['어학시험']:
    if i in '기타':
        i = '기타'
    language4.append(i)
data['어학시험'] = language4


# 근무지역 처리
place_col = []
for i in data.근무지역.str.split(','):
    place_col.append(sorted(list(set(i))))
    
plc_col = []
for i in place_col:
    result = ' '.join(map(str, i))
    result = result.rstrip()
    result = result.lstrip()
    plc_col.append(result)    
data['근무지역'] = plc_col

# 직무태그 처리  

data['직무태그'] = data['직무태그'].str.replace(' ','')

job_list = []
for i in data['직무태그'].str.split(','):
    job_list.append(sorted(i))    
job_col = []

for i in job_list:
    result = ' '.join(map(str, i))
    result = result.rstrip()
    result = result.lstrip()
    job_col.append(result)   
data['직무태그'] = job_col


a=[]
for i in data.어학시험:
    if i != '없음':
        i = '있음'
    a.append(i)
data['어학시험'] = a

a=[]
for i in data.대학성적:
    if i > 70:
        i = 'A'
    elif i == 70:
        i = 'B'
    elif i >= 60:
        i = 'C'
    else:
        i = 'D'
    a.append(i)
data['대학성적'] = a


a = []
for i in data.직무태그.str.split(' '):
    i = i[0]
    a.append(i)
data['첫직무태그'] = a

data['직무태그'] = data['직무태그'].str.replace(' ',',')

In [11]:
X_train = data[:16570]
X_test = data[16570:]

X_test = X_test.reset_index()
X_test.drop(columns='index', inplace=True)

numeric_features = data.select_dtypes('number').columns.to_list()
categorical_features = data.select_dtypes('object').columns.to_list()

In [12]:
# # 스태킹시 활용하기 위해 파일 저장

# X_train.to_csv('X_train2.csv', encoding='cp949')
# X_test.to_csv('X_test2.csv', encoding='cp949')

# K-Fold 5

In [13]:
scores = []  # CV 결과 저장
oof_pred = np.zeros(X_test.shape[0])  # OOF 저장
kfold = KFold(n_splits=5, shuffle=True, random_state=0) # K-Folds cross-validator # 셔플 안하면 원래 데이터 순서대로.

for train_index, valid_index in kfold.split(X_train, y_train): 
    # 학습/검증 데이터 분할
    train_x, valid_x = X_train.iloc[train_index], X_train.iloc[valid_index]
    train_y, valid_y = y_train.iloc[train_index], y_train.iloc[valid_index]

    # 범주형피처 직접 처리와 Early stopping을 사용하여 CatBoost 모델링
    model = CatBoostRegressor(cat_features=categorical_features, verbose=False, random_state=0)
    model.fit(train_x, train_y,
              eval_set=[(valid_x,valid_y)],
              early_stopping_rounds=100,
             )

    # CV 스코어 계산 및 저장
    rmse = np.sqrt(mean_squared_error(valid_y, model.predict(valid_x)))
    scores.append(rmse)

    # OOF 예측값 저장
    oof_pred += model.predict(X_test) / kfold.get_n_splits()
    
scores = np.array(scores)
print("CV scores: ", scores)
print("CV mean = %.2f" % scores.mean(), "with std = %.2f" % scores.std())

CV scores:  [833.56560936 814.81911517 832.89778947 803.97517589 808.73486582]
CV mean = 818.80 with std = 12.28


In [14]:
# submission 화일 생성
filename = f'[4]catboost_modelD_{scores.mean():.2f}.csv'
pd.DataFrame({'ID':test_id, 'Salary':oof_pred}).to_csv(filename, index=False)