In [1]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PowerTransformer 
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from category_encoders import TargetEncoder, BinaryEncoder
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.metrics import mean_squared_error
from sklearn import set_config
from sklearn.linear_model import LinearRegression, Ridge, Lasso
import optuna
from optuna.distributions import CategoricalDistribution, IntDistribution, FloatDistribution
from optuna.integration import OptunaSearchCV, ShapleyImportanceEvaluator
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
plt.rc('font', family='malgun gothic')

In [2]:
X_train = pd.read_csv('X_train.csv', encoding='cp949').drop(columns='ID')
y_train = pd.read_csv('y_train.csv', encoding='cp949').Salary

X_test = pd.read_csv('X_test.csv', encoding='cp949')
test_id = X_test.ID
X_test = X_test.drop(columns='ID')

data = pd.concat([X_train, X_test]).reset_index().drop(columns='index')

data.loc[data[data['직종']=='호텔/콘도/리조트'].index, '직종'] = '기타 직종'

# 피쳐 정리

In [3]:
# 대학성적 중앙값으로 처리
imp = SimpleImputer(strategy='median')
data['대학성적'] = imp.fit_transform(data['대학성적'].values.reshape(-1,1))

# 자격증 보기 쉽게 변환
data['자격증'] = data['자격증'].str.replace('無', 'x')
data['자격증'] = data['자격증'].str.replace('有', 'o')

# 근무 형태 처리
null_list = data[data['근무경력']==0].index
data['근무형태'][null_list] = data['근무형태'][null_list].fillna('경력없음')
data['근무형태'] = data['근무형태'].fillna('missing')

#
data['근무형태'] = data['근무형태'].str.replace(',', ' ')
data['근무형태'] = data['근무형태'].str.strip()
data['근무형태'] = data.근무형태.str.replace(' ',',')



# 어학 시험 전처리
data['어학시험'] = data['어학시험'].fillna('없음')


# 대학 전공 변환
col_list = []
abc = []
for i in data.대학전공:
    if '(' in i:
        for j in range(len(i)):
            if i[j] == '(':
                a = j
            elif i[j] == ')':
                b = j
        last = i[b+1:]
        first = i[:a]
        i = first + last
        abc.append(i)
    col_list.append(i)    
data['대학전공'] = col_list

#
data['대학전공'] = data['대학전공'].str.replace(',',' ')
data['대학전공'] = data['대학전공'].str.replace('/', ' ')
data['대학전공'] = data['대학전공'].str.replace('.',' ')
data['대학전공'] = data['대학전공'].str.strip()
data['대학전공'] = data['대학전공'].str.replace(' ','')

#
col = []
for i in data['대학전공']:
    if i[-1]=='과':
        i = i[:-1]
    col.append(i)    
data['대학전공'] = col

#
col2 = []
for i in data['대학전공']:
    if i[-1]!='학':
        i = i+'학'
    col2.append(i)    
data['대학전공'] = col2


# 세부직종 처리
def full(job):
    value = data[data['세부직종']==job]['직무태그'].value_counts().index[0]
    data.loc[data[data['세부직종']==job]['직무태그'].index, '직무태그'] = data[data['세부직종']==job]['직무태그'].fillna(value)
    
for i in data.세부직종.value_counts().index:
    full(i)
    
# 어학시험 처리
language1 = []
for i in data['어학시험']:
    if i == ' ':
        i = '없음'
    language1.append(i) 
data['어학시험'] = language1

#
language3 = []
for i in data['어학시험']:
    if i[:5] == 'TOEFL':
        i = '토플'
    language3.append(i) 
data['어학시험'] = language3

#
language4 = []
for i in data['어학시험']:
    if i in '기타':
        i = '기타'
    language4.append(i)
data['어학시험'] = language4


# 직무태그 처리  

data['직무태그'] = data['직무태그'].str.replace(' ','')

job_list = []
for i in data['직무태그'].str.split(','):
    job_list.append(sorted(i))    
job_col = []

for i in job_list:
    result = ' '.join(map(str, i))
    result = result.rstrip()
    result = result.lstrip()
    job_col.append(result)
    
data['직무태그'] = job_col

# 대학전공 New 처리

In [4]:
data['대학전공'] = data['대학전공'].str.replace('복수','')
data['대학전공'] = data['대학전공'].str.replace('전공','')
data['대학전공'] = data['대학전공'].str.replace('졸업','')
data['대학전공'] = data['대학전공'].str.replace('중퇴','')

In [5]:
a = []
for i in data['대학전공']:
    if i[-1] == '학':
        if i[-2:] == '공학':
            pass
        else:
            i = i[:-1]
        a.append(i)
data['대학전공'] = a

from konlpy.tag import Okt

okt = Okt()

a = []
for i in data.대학전공:
    a.append(okt.morphs(i))
data['대학전공'] = a

col = []
for i in a:
    result = ' '.join(map(str, i))
    result = result.rstrip()
    result = result.lstrip()
    col.append(result)
    
data['대학전공'] = col

data['대학전공'] = data['대학전공'].str.strip()
data['대학전공'] = data['대학전공'].str.replace(' ',',')

In [6]:
# 단어별로 구분하여 Bow 방식 후 PCA

from sklearn.feature_extraction.text import CountVectorizer

sentences = data['대학전공']

vectorizer = CountVectorizer()

features = vectorizer.fit_transform(sentences)
vocab = vectorizer.get_feature_names()
new = pd.DataFrame(features.toarray(), columns = vocab)

X_train_New = new[:16570]
X_test_New = new[16570:]

max_d = num_d = new.shape[1]
pca = PCA(n_components=max_d).fit(X_train_New)
cumsum = np.cumsum(pca.explained_variance_ratio_)
num_d = np.argmax(cumsum >= 0.95) + 1
if num_d == 1: num_d = max_d
pca = PCA(n_components=num_d, random_state=0)  
X_train_pca = pca.fit_transform(X_train_New)
X_test_pca = pca.transform(X_test_New)

train = pd.DataFrame(X_train_pca)
test = pd.DataFrame(X_test_pca)

all_pca = pd.concat([train, test]).reset_index().drop(columns = 'index')
data = pd.concat([data, all_pca], axis=1).drop(columns='대학전공')

# 근무경력 New 처리

In [7]:
# 년 / 개월 구분하여 열 생성
for i in range(len(data.근무경력)):
    if len(data.근무경력[i]) == 3 or len(data.근무경력[i]) == 4:
        data.근무경력[i] = '0년 ' + data.근무경력[i]

we = data.근무경력.str.split(' ',expand=True)
we.columns = ['년','개월']

# year, month로 구분하여 열 생성
we = data.근무경력.str.split(' ',expand=True)
we.columns = ['년','개월']

we_y = we.년.str.split('년', expand=True) 
we_y.columns = ['년','삭제']
we_y.drop('삭제',axis=1,inplace=True)
we_y = we_y.astype(int) 

we_m = we.개월.str.split('개월', expand=True)
we_m.columns = ['개월','삭제']
we_m.drop('삭제',axis=1,inplace=True)
we_m = we_m.astype(int)

data['근무경력_y'] = we_y['년']
data['근무경력_m'] = we_m['개월']

X_train = data.iloc[:X_train.shape[0],:].reset_index(drop=True)
X_test = data.iloc[X_train.shape[0]:,:].reset_index(drop=True)

In [8]:
a = []

for i in data['근무경력_y']:
    if i==0: 
        b = '경력없음' 
    elif i <=2:
        b = '하'
    elif i <=4:
        b = '중'
    elif i <=6:
        b = '중중'
    else:
        b = '상'
    a.append(b)
data['숙련도'] = a

data.drop(columns='근무경력', inplace=True)

# 직무태그 New 처리

In [9]:
data['직무태그'] = data['직무태그'].str.strip()
data['직무태그'] = data['직무태그'].str.replace(' ',',')

a = []
for i in data.직무태그.str.split(','):
    b = len(i)
    a.append(b)
data['직무태그_갯수'] = a

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

sentences = data['직무태그']

vectorizer = CountVectorizer()

features = vectorizer.fit_transform(sentences)
vocab = vectorizer.get_feature_names()
new = pd.DataFrame(features.toarray(), columns = vocab)

X_train_New = new[:16570]
X_test_New = new[16570:]

max_d = num_d = new.shape[1]
pca = PCA(n_components=max_d).fit(X_train_New)
cumsum = np.cumsum(pca.explained_variance_ratio_)
num_d = np.argmax(cumsum >= 0.8) + 1
if num_d == 1: num_d = max_d
pca = PCA(n_components=num_d, random_state=0)  
X_train_pca = pca.fit_transform(X_train_New)
X_test_pca = pca.transform(X_test_New)

train = pd.DataFrame(X_train_pca)
test = pd.DataFrame(X_test_pca)

all_pca = pd.concat([train, test]).reset_index().drop(columns = 'index')
data = pd.concat([data, all_pca], axis=1).drop(columns='직무태그')

# 근무지역 New 처리

In [11]:
a = []

for i in data.근무지역.str.split(','):
    b = i[0]
    a.append(b)
data['첫희망지역'] = a

In [12]:
data['근무지역'] = data['근무지역'].str.replace(',',' ')
data['근무지역'] = data['근무지역'].str.strip()
data['근무지역'] = data['근무지역'].str.replace(' ',',')

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

sentences = data['근무지역']

vectorizer = CountVectorizer()

features = vectorizer.fit_transform(sentences)
vocab = vectorizer.get_feature_names()
new = pd.DataFrame(features.toarray(), columns = vocab)

X_train_New = new[:16570]
X_test_New = new[16570:]

max_d = num_d = new.shape[1]
pca = PCA(n_components=max_d).fit(X_train_New)
cumsum = np.cumsum(pca.explained_variance_ratio_)
num_d = np.argmax(cumsum >= 0.9) + 1
if num_d == 1: num_d = max_d
pca = PCA(n_components=num_d, random_state=0)  
X_train_pca = pca.fit_transform(X_train_New)
X_test_pca = pca.transform(X_test_New)
print(X_train_pca.shape)

train = pd.DataFrame(X_train_pca)
test = pd.DataFrame(X_test_pca)

all_pca = pd.concat([train, test]).reset_index().drop(columns = 'index')
data = pd.concat([data, all_pca], axis=1).drop(columns='근무지역')

(16570, 13)


# 근무형태 PCA

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

sentences = data['근무형태']

vectorizer = CountVectorizer()

features = vectorizer.fit_transform(sentences)
vocab = vectorizer.get_feature_names()
new = pd.DataFrame(features.toarray(), columns = vocab)

X_train_New = new[:16570]
X_test_New = new[16570:]

max_d = num_d = new.shape[1]
pca = PCA(n_components=max_d).fit(X_train_New)
cumsum = np.cumsum(pca.explained_variance_ratio_)
num_d = np.argmax(cumsum >= 0.95) + 1
if num_d == 1: num_d = max_d
pca = PCA(n_components=num_d, random_state=0)  
X_train_pca = pca.fit_transform(X_train_New)
X_test_pca = pca.transform(X_test_New)

train = pd.DataFrame(X_train_pca)
test = pd.DataFrame(X_test_pca)

all_pca = pd.concat([train, test]).reset_index().drop(columns = 'index')
data = pd.concat([data, all_pca], axis=1).drop(columns='근무형태')

In [15]:
data.columns = [i for i in range(data.columns.size)]

In [16]:
X_train = data[:16570]
X_test = data[16570:]

X_test = X_test.reset_index()
X_test.drop(columns='index', inplace=True)

numeric_features = data.select_dtypes('number').columns.to_list()
categorical_features = data.select_dtypes('object').columns.to_list()

In [17]:
# # 스태킹시 활용하기 위해 파일 저장

# X_train.to_csv('X_train1.csv', encoding='cp949')
# X_test.to_csv('X_test1.csv', encoding='cp949')

# K-Fold 5

In [18]:
scores = []  # CV 결과 저장
oof_pred = np.zeros(X_test.shape[0])  # OOF 저장
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

for train_index, valid_index in kfold.split(X_train, y_train): 
    # 학습/검증 데이터 분할
    train_x, valid_x = X_train.iloc[train_index], X_train.iloc[valid_index]
    train_y, valid_y = y_train.iloc[train_index], y_train.iloc[valid_index]

    # 범주형피처 직접 처리와 Early stopping을 사용하여 CatBoost 모델링
    model = CatBoostRegressor(cat_features=categorical_features, verbose=False, random_state=0)
    model.fit(train_x, train_y,
              eval_set=[(valid_x,valid_y)],
              early_stopping_rounds=100,
             )

    # CV 스코어 계산 및 저장
    rmse = np.sqrt(mean_squared_error(valid_y, model.predict(valid_x)))
    scores.append(rmse)

    # OOF 예측값 저장
    oof_pred += model.predict(X_test) / kfold.get_n_splits()
    
scores = np.array(scores)
print("CV scores: ", scores)
print("CV mean = %.2f" % scores.mean(), "with std = %.2f" % scores.std())

CV scores:  [830.36470025 816.26249952 830.26305555 798.15747085 806.10600119]
CV mean = 816.23 with std = 12.85


In [19]:
filename = f'[3]catboost_modelC_{scores.mean():.2f}_{scores.std():.2f}.csv'
pd.DataFrame({'ID':test_id, 'Salary':oof_pred}).to_csv(filename, index=False)