In [1]:
# Mount Google Drive at /content/drive (Colab-only API); the data folder used by
# this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Install

In [2]:
!pip install ftfy
!pip install kozip

Collecting ftfy
  Downloading ftfy-6.1.3-py3-none-any.whl (53 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/53.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.4/53.4 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ftfy
Successfully installed ftfy-6.1.3
Collecting kozip
  Downloading kozip-1.1.4-py3-none-any.whl (44.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.2/44.2 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: kozip
Successfully installed kozip-1.1.4


# package

In [3]:
import pandas as pd
import numpy as np
import re
import ftfy
from datetime import datetime
import joblib
import json
import copy
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from kozip import KoZIP
import warnings
warnings.simplefilter('ignore')

In [4]:
# Date of this run formatted as YYYYMMDD text.
today = datetime.today().strftime("%Y%m%d")
# Seed value intended for stochastic steps.
random_seed = 42

# Function

In [5]:
def fixEncoding(text:str)->str:
    """Repair broken text with ftfy and flatten whitespace.

    Args:
        text: the string to clean.

    Returns:
        The output of ``ftfy.fix_text`` with every single whitespace character
        (space, tab, newline, ...) replaced by one plain space. Runs of
        whitespace are not collapsed.
    """
    repaired = ftfy.fix_text(text)
    return re.sub(r'\s', ' ', repaired)

In [6]:
# Earlier, Excel-only version of read_selected. It is disabled by being wrapped in a
# string literal, so this cell only evaluates (and echoes) that string and defines nothing.
# NOTE(review): consider deleting this cell; the active definition is in a later cell.
'''
def read_selected(filePath:str,
                  dtypesMapper:dict=None,
                  textCols:list[str]=None,
                  sheet_number:int=0)->pd.DataFrame:

    if dtypesMapper:
        df = pd.read_excel(
            filePath,
            engine='openpyxl',
            usecols=[k for k, v in dtypesMapper.items()],
            dtype=dtypesMapper,
            sheet_name=sheet_number
            # low_memory=False
            )
    else:
        raise("You should specify parameter [cols] and [dtypesMapeer]")

    if textCols:
        for col in textCols:
            df[col] = df[col].apply(lambda x: fixEncoding(x))

    df = df.loc[:, [k for k, v in dtypesMapper.items()]]
    return df
'''

'\ndef read_selected(filePath:str,\n                  dtypesMapper:dict=None,\n                  textCols:list[str]=None,\n                  sheet_number:int=0)->pd.DataFrame:\n\n    if dtypesMapper:\n        df = pd.read_excel(\n            filePath,\n            engine=\'openpyxl\',\n            usecols=[k for k, v in dtypesMapper.items()],\n            dtype=dtypesMapper,\n            sheet_name=sheet_number\n            # low_memory=False\n            )\n    else:\n        raise("You should specify parameter [cols] and [dtypesMapeer]")\n\n    if textCols:\n        for col in textCols:\n            df[col] = df[col].apply(lambda x: fixEncoding(x))\n\n    df = df.loc[:, [k for k, v in dtypesMapper.items()]]\n    return df\n'

In [7]:
def read_selected(filePath:str,
                  dtypesMapper:dict=None,
                  textCols:list[str]=None,
                  sheet_number:int=0)->pd.DataFrame:
    """Load selected, typed columns from a CSV file or an Excel sheet.

    The file is first parsed as CSV; if that fails for any ordinary reason
    (for example because it is an .xlsx workbook) it is read with
    ``pd.read_excel`` (openpyxl engine) instead.

    Args:
        filePath: path of the file to read.
        dtypesMapper: ``{column name: dtype}``. Its keys select the columns to
            load and fix the column order of the result. Required.
        textCols: optional column names whose string values are passed through
            ``fixEncoding``. Non-string cells (e.g. missing values) are left as is.
        sheet_number: sheet to read; only used by the Excel fallback.

    Returns:
        A DataFrame holding exactly the columns of ``dtypesMapper``, in that order.

    Raises:
        ValueError: if ``dtypesMapper`` is missing or empty.
    """
    if not dtypesMapper:
        # The original `raise("...")` raised a TypeError because a str is not an exception.
        raise ValueError("You should specify parameter [dtypesMapper]")

    selected = list(dtypesMapper)

    try:
        df = pd.read_csv(
            filePath,
            usecols=selected,
            dtype=dtypesMapper
        )
    except Exception:
        # `Exception` rather than a bare except, so Ctrl-C / SystemExit still propagate.
        df = pd.read_excel(
            filePath,
            engine='openpyxl',
            usecols=selected,
            dtype=dtypesMapper,
            sheet_name=sheet_number
        )

    if textCols:
        for col in textCols:
            # fixEncoding only accepts strings; skip NaN and other non-text cells.
            df[col] = df[col].apply(lambda x: fixEncoding(x) if isinstance(x, str) else x)

    return df.loc[:, selected]

In [8]:
# Single shared KoZIP instance; zipcoding() looks it up as a global.
kozip = KoZIP()

def zipcoding(address:str, depth_number:str="2") :
    """Best-effort lookup through the shared ``kozip`` instance.

    Args:
        address: value handed to ``kozip.ZIPtoAddr``. The callers in this
            notebook pass postal-code columns.
        depth_number: forwarded unchanged as the ``depth`` argument.
            NOTE(review): the default is the string "2" while one caller
            passes the integer 4 - confirm which type KoZIP expects.

    Returns:
        The first element of the lookup result, or the sentinel string
        "No Address" when the lookup raises or yields nothing indexable.
    """
    try :
        return kozip.ZIPtoAddr(address, depth=depth_number)[0]
    except Exception :
        # Deliberate best-effort: any ordinary failure maps to the sentinel, but
        # KeyboardInterrupt / SystemExit are no longer swallowed as with a bare except.
        return "No Address"

In [9]:
def month_split(df) :
    """Convert each row's '근속기간' text into a total number of months.

    Every value is split once at '년'; the part before it is the year count and
    the part after it, minus its last two characters, is the month count.

    Args:
        df: DataFrame with a '근속기간' column of strings.

    Returns:
        A list of ints (years * 12 + months), one per row in row order.
    """
    def _to_months(tenure_text) :
        years, remainder = tenure_text.split('년')
        return int(years)*12 + int(remainder[:-2])

    return [_to_months(row['근속기간']) for _, row in df.iterrows()]

In [10]:
def cal_diff(date1, date2):
    """Return the number of calendar months from ``date1`` to ``date2``.

    Both arguments are 'YYYYMMDD' strings. Only the year and month fields are
    compared (the day is parsed but ignored), so the result is negative when
    ``date2`` falls in an earlier month than ``date1``.
    """
    first, last = (datetime.strptime(stamp, "%Y%m%d") for stamp in (date1, date2))
    return 12 * (last.year - first.year) + (last.month - first.month)

In [11]:
# Rank order of the job titles handled by position_sort (1 = lowest).
position_dict = {
    '과장' : 1,
    '차장' : 2,
    '부부장' : 3,
    '부장' : 4,
    '수석부장' : 5
}

def position_sort(data) :
    """Return, per row, the job title held before a promotion in the reference year.

    The reference year is the '년도' value of the first row. For a row whose
    '최근승진일' starts with that year:
      * if '직전승진일' is present, the result is '직전승진직급';
      * otherwise it is the title ranked one step below '최근승진직급' in
        ``position_dict``.
    Every other row (no promotion date, or promoted in another year) keeps its
    '직무'. Finally the title '감사역' is reported as '차장'.

    Args:
        data: DataFrame with the columns named above; its index is ignored.

    Returns:
        A list of titles, one per row in row order.
    """
    frame = data.reset_index(drop=True)

    def _title_before_promotion(row) :
        promoted_on = frame['최근승진일'][row]
        if pd.isna(promoted_on) or promoted_on[0:4] != frame['년도'][0] :
            return frame['직무'][row]
        if pd.notna(frame['직전승진일'][row]) :
            return frame['직전승진직급'][row]
        lower_rank = position_dict[frame['최근승진직급'][row]] - 1
        return [title for title, rank in position_dict.items() if rank == lower_rank][0]

    titles = [_title_before_promotion(row) for row in range(len(frame))]
    return ['차장' if title == '감사역' else title for title in titles]

# Dataset

In [12]:
# Root folder on the mounted Drive; the input, model and output paths used by
# later cells are all built from it.
data_path = '/content/drive/My Drive/Colab Notebooks/KB캐피탈/2023'

In [13]:
# Column -> dtype mapping for the personnel sheet. read_selected loads only these
# columns and returns them in this order.
# NOTE(review): the np.int32 columns presumably never contain blanks; pandas cannot
# load missing values into a NumPy integer dtype - confirm against the workbook.
Mapper = {
    '사번' : str,
    '성명' : str,
    '성별' : str,
    '연령' : str,
    '연락처' : str,
    '이메일' : str,
    '생년월일' : str,
    '직무' : str,
    '연차' : np.int32,
    '근무본부' : str,
    '근무부서' : str,
    '근무지역' : str,
    '최종학력' : str,
    '신입경력' : str,
    '근속기간' : str,
    '수상여부' : np.int32,
    '전년도평균교육점수' : np.float32,
    '입사일' : str,
    '퇴사일' : str,
    '최근승진일' : str,
    '최근승진직급' : str,
    '직전승진일' : str,
    '직전승진직급' : str,
    '전년도KPI통과여부' : np.float32,
    '년도' : str,
    '전년도평가점수' : np.float32,
    '전년도평가등급' : str,
    '전전년도평가점수' : np.float32,
    '전전년도평가등급' : str,
    '전전전년도평가점수' : np.float32,
    '전전전년도평가등급' : str,
    '최종이동일' : str,
    '전년도총교육시간' : np.float32,
    '전년도총이수교육횟수' : np.float32
}

# Personnel records: sheet index 0 of the promotion workbook; '사번' is passed
# through fixEncoding.
pm = read_selected(filePath = f'{data_path}/result/input/promotion_inference_data.xlsx',
                   dtypesMapper=Mapper,
                   textCols=['사번'],
                   sheet_number=0)

In [14]:
# Find the largest year present in pm['년도'] and keep it as text (e.g. '2023').
# NOTE(review): the name `max` shadows Python's built-in max(). Later cells read this
# variable, so a rename has to be applied across the whole notebook at once.
max = 0
for item in pm['년도'].unique() :
  if max <= int(item) :
    max = int(item)

max = str(max)

In [15]:
# ===================================================
# Revised 231218
# This mapping cannot be used as is if the multi-rater evaluation data changes.
#
# Column -> dtype mapping for the multi-rater evaluation sheet: identifiers, the
# number of raters, the average rating of questions 1..24 and the overall average.
# The former `if max == '2023' / else` branches were exact duplicates, so a single
# definition is used for every year; reintroduce a branch on `max` here once a
# year needs a different sheet layout.
Mapper = {
    '사번' : str,
    '이름' : str,
    '평가년도' : str,
    '평가회차' : str,
    '평가인원' : np.int32,
    **{f'문항{k}평균평점' : np.float32 for k in range(1, 25)},
    '전체평균평점' : np.float32
}

# Evaluation records: sheet index 1 of the promotion workbook.
# NOTE(review): `eval` shadows the Python built-in; later cells read this name.
eval = read_selected(filePath = f'{data_path}/result/input/promotion_inference_data.xlsx',
                     dtypesMapper=Mapper,
                     textCols=['사번'],
                     sheet_number=1)

In [16]:
# Column -> dtype mapping for the department reference workbook (every column as text).
Mapper = {
    'ID' : str,
    '부서' : str,
    '통합ID' : str,
    '통합부서' : str,
    '근무지_우편번호' : str,
    '부서인원수' : str,
    '근무지역' : str,
    '본부' : str
}

# Department table; no textCols and the default sheet (index 0) are used.
org_sort = read_selected(filePath = f'{data_path}/result/input/KB부서정보_전처리_검토사항_수정.xlsx',
                      dtypesMapper=Mapper)

In [17]:
# Column -> dtype mapping for the address columns of the recommendation workbook.
Mapper = {
    '사번' : str,
    '거주지' : str,
    '거주지_우편번호' : str,
    '직책' : str
}

# Per-employee address table: sheet index 0; '사번' is passed through fixEncoding.
addr = read_selected(filePath = f'{data_path}/result/input/recommendation_inference_data.xlsx',
                      dtypesMapper=Mapper,
                      textCols=['사번'],
                      sheet_number=0)

In [18]:
# Only the employee-number column is needed from the recommendation result file.
Mapper = {
    '사번': str
}

# Employee numbers listed in the intermediate recommendation CSV.
recom = read_selected(filePath = f'{data_path}/result/input/process/temp_infer_result_recom.csv',
                   dtypesMapper=Mapper,
                   textCols=['사번'])

In [19]:
# Revised 231201
# Load the persisted scaler and classifier from the model folder.
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code;
# only load artifacts from a trusted location.
scaler = joblib.load(f'{data_path}/result/model/scaler_promotion.pkl')
# pca = joblib.load(f'{data_path}/result/model/pca_promotion.pkl')
model = joblib.load(f'{data_path}/result/model/model_promotion.pkl')

# Preprocess

In [20]:
# Keep only the rows of the latest year (`max`) and renumber them from 0.
pm = pm.loc[pm['년도'] == max].reset_index(drop=True)
# '연령' is text with a one-character suffix; drop it and keep the integer part.
pm['연령'] = pm['연령'].apply(lambda age_text : int(age_text[:-1]))
# Tenure text converted to a total number of months.
pm['근속개월'] = month_split(pm)

In [21]:
# Revised 231121
# Attach the address columns to every personnel row (left join on employee number).
pm = pd.merge(pm, addr, on=['사번'], how='left')
pm['거주지_주소'] = pm['거주지_우편번호'].apply(lambda x : zipcoding(x, 4))

# Where the postal-code lookup failed, fall back to the raw address text.
# A boolean mask with .loc replaces the former chained assignment
# (pm[col][i] = ...), which is not guaranteed to write into `pm`.
unresolved = pm['거주지_주소'] == 'No Address'
pm.loc[unresolved, '거주지_주소'] = pm.loc[unresolved, '거주지']

pm = pm.drop(columns=['거주지', '거주지_우편번호'])

In [22]:
# Revised 231121
# Independent deep copy of the personnel table as it stands here, taken before
# the following cells filter rows out of `pm`.
all_data = pm.copy(deep=True)

In [23]:
# Revised 231121
# Keep only the rows that have no resignation date ('퇴사일').
# A boolean filter replaces the former positional loop + in-place drop, which
# raised KeyError when the cell was run a second time (the index then has gaps).
pm = pm.loc[pm['퇴사일'].isna()]

In [24]:
# ===============================
# Revised 231218
def _decile_grades(scores):
    """Grade each score from 1 to 10 against the deciles of `scores` itself.

    The nine cut points are the 10th..90th nearest-rank percentiles. A score's
    grade is 1 plus the number of cut points it reaches (score >= cut point),
    which is the result of the former ten-way if/elif ladder.
    NOTE(review): a NaN score is graded 10 here, whereas the ladder produced no
    grade for it - the scores are presumably never missing; confirm.
    """
    values = np.asarray(scores)
    # `method=` is the current name of the deprecated `interpolation=` keyword.
    cut_points = np.percentile(values, np.arange(10, 100, 10), method='nearest')
    return np.searchsorted(cut_points, values, side='right') + 1


if max == '2023' :
  pm_23 = pm.loc[pm['직무'].str.contains('과장|차장|부부장|부장|수석부장')].reset_index(drop=True)
  # Applied to the 2023 inference only; not applied when predicting for 2024.
  # 1 when the latest promotion date falls in 2023, otherwise 0.
  pm_23['승진적합여부'] = pm_23['최근승진일'].apply(lambda x : 0 if pd.isna(x) else (1 if x[0:4]=='2023' else 0))
  # Remove '과장' rows promoted in 2023 and '수석부장' rows not promoted in 2023
  # (a missing promotion date counts as "not 2023").
  pm_23 = pm_23.drop(pm_23.loc[(pm_23['직무']=='과장') & (pm_23['최근승진일'].str[0:4] == '2023')].index, axis=0)
  pm_23 = pm_23.drop(pm_23.loc[(pm_23['직무']=='수석부장') & (pm_23['최근승진일'].str[0:4] != '2023')].index, axis=0)

  # This code cannot be used as is if the multi-rater evaluation data changes.
  # One evaluation row per employee, taken from evaluation year 2022.
  eval_23 = eval.loc[eval['평가년도']=='2022'].drop_duplicates(subset='사번').reset_index(drop=True)
  eval_23['다면평가등급'] = _decile_grades(eval_23['전체평균평점'])
  merged_23 = pd.merge(pm_23, eval_23, on='사번', how='left')

  # Applied to the 2023 inference only; not applied when predicting for 2024.
  # For rows flagged as promoted whose '연차' is 1, replace '연차' with the number of
  # years between '년도' and the previous promotion (or the hire date if there is none).
  for i in range(len(merged_23)) :
      if (merged_23['연차'][i] == 1) & (merged_23['승진적합여부'][i] == 1) :
          if pd.notna(merged_23['직전승진일'][i]) :
              year = int(merged_23['년도'][i]) - int(merged_23['직전승진일'][i][0:4])
          else :
              year = int(merged_23['년도'][i]) - int(merged_23['입사일'][i][0:4])
          # .loc writes into merged_23 itself; the former merged_23['연차'][i] = ...
          # was a chained assignment.
          merged_23.loc[i, '연차'] = year

else :
  pm_23 = pm.loc[pm['직무'].str.contains('과장|차장|부부장|부장')].reset_index(drop=True)
  # One evaluation row per employee, taken from evaluation year 2023.
  eval_24 = eval.loc[eval['평가년도']=='2023'].drop_duplicates(subset='사번').reset_index(drop=True)
  eval_24['다면평가등급'] = _decile_grades(eval_24['전체평균평점'])
  merged_23 = pd.merge(pm_23, eval_24, on='사번', how='left')

# Inference

In [25]:
# Work on an independent copy so merged_23 keeps its original (text) values.
inference = copy.deepcopy(merged_23)

# Missing numeric inputs are treated as 0, missing evaluation grades as 'N'.
zero_fill_cols = ['전년도KPI통과여부', '전년도평균교육점수', '전년도총교육시간', '전년도총이수교육횟수',
                  '전년도평가점수', '전전년도평가점수', '전전전년도평가점수', '다면평가등급']
grade_cols = ['전년도평가등급', '전전년도평가등급', '전전전년도평가등급']

inference[zero_fill_cols] = inference[zero_fill_cols].fillna(0)
inference[grade_cols] = inference[grade_cols].fillna('N')

# Text category -> integer code, per column.
# NOTE(review): values missing from a mapping are left unchanged (still text).
grade_codes = {'N' : 0, 'C' : 1, 'B' : 2, 'A' : 3, 'S' : 4}
category_codes = {
    '최종학력' : {'고등학교' : 1, '전문대학' : 2, '대학교' : 3, '대학원(석사)' : 4},
    '신입경력' : {'신입' : 0, '경력' : 1},
    '성별' : {'여' : 0, '남' : 1},
    **{col : grade_codes for col in grade_cols}
}

# Assign the result back instead of calling replace(..., inplace=True) on a column
# selection, which is not guaranteed to modify `inference` itself.
for col, codes in category_codes.items() :
    inference[col] = inference[col].replace(codes)

In [26]:
# Revised 231201
# Revised 231218
# ===============================
pca_list = ['다면평가 주성분_1', '다면평가 주성분_2', '다면평가 주성분_3', '다면평가 주성분_4', '다면평가 주성분_5']

# The 15 base features, in the column order handed to the persisted `scaler`.
base_cols = ['성별', '연령', '연차', '최종학력', '신입경력', '수상여부', '전년도평균교육점수', '전년도KPI통과여부',
             '전년도평가점수', '전전년도평가점수', '전전전년도평가점수',
             '전년도총교육시간', '전년도총이수교육횟수', '근속개월', '다면평가등급']
# Average rating of questions 1..24 followed by the overall average.
survey_cols = [f'문항{k}평균평점' for k in range(1, 25)] + ['전체평균평점']

# The 2023 and non-2023 branches were the same pipeline written twice; the single
# real difference is kept below.
# 2023: applied to the 2023 inference only, needs revising for the 2024 prediction.
# Other years: can be finalized once the multi-rater evaluation form is fixed.
# NOTE(review): for 2023 the PCA input excludes '전체평균평점', for other years it
# includes it. This reproduces the previous behavior; confirm which variant matches
# the features `model` was trained on.
pca_input_cols = survey_cols[:-1] if max == '2023' else survey_cols

inf_data = inference[base_cols + survey_cols].fillna(0)

# Base features: transformed with the scaler loaded from disk.
inf_data[base_cols] = pd.DataFrame(scaler.transform(inf_data[base_cols]),
                                   index=inf_data.index, columns=base_cols)

# Survey features: standardized with statistics fitted on this inference batch.
sc_scaler = StandardScaler()
inf_data[survey_cols] = pd.DataFrame(sc_scaler.fit_transform(inf_data[survey_cols]),
                                     index=inf_data.index, columns=survey_cols)

# Five principal components, also fitted on this batch. random_state makes the
# result repeatable when scikit-learn selects its randomized SVD solver.
pca = PCA(n_components=5, random_state=random_seed)
train_pca = pca.fit_transform(inf_data[pca_input_cols].values)
inf_data[pca_list] = pd.DataFrame(data=train_pca, index=inf_data.index, columns=pca_list)

# Model input: the scaled base features followed by the five components.
inf_data = inf_data[base_cols + pca_list]

In [27]:
# Removed 231201
# Earlier variant that called scaler.transform / pca.transform on inf_data. It is
# kept only as a string literal, so this cell evaluates (and echoes) that string
# and changes nothing.
'''
pca_list = ['다면평가 주성분_1', '다면평가 주성분_2', '다면평가 주성분_3', '다면평가 주성분_4', '다면평가 주성분_5']

inf_scaled = scaler.transform(inf_data)
inf_data = pd.DataFrame(inf_scaled, index=inf_data.index, columns=inf_data.columns)

inf_pca = pca.transform(inf_data[inf_data.columns[15:-1]])
inf_data[pca_list] = pd.DataFrame(data = inf_pca, columns=pca_list)[pca_list]
inf_data.drop(inf_data.columns[15:-5], axis=1, inplace=True)
'''

"\npca_list = ['다면평가 주성분_1', '다면평가 주성분_2', '다면평가 주성분_3', '다면평가 주성분_4', '다면평가 주성분_5']\n\ninf_scaled = scaler.transform(inf_data)\ninf_data = pd.DataFrame(inf_scaled, index=inf_data.index, columns=inf_data.columns)\n\ninf_pca = pca.transform(inf_data[inf_data.columns[15:-1]])\ninf_data[pca_list] = pd.DataFrame(data = inf_pca, columns=pca_list)[pca_list]\ninf_data.drop(inf_data.columns[15:-5], axis=1, inplace=True)\n"

In [28]:
# Take column 1 of the classifier's predict_proba output for every row and store it
# on merged_23 as a new 'predict_proba' column.
predict_proba = model.predict_proba(inf_data)[:,1]
merged_23['predict_proba'] = predict_proba

In [29]:
# Linearly rescale the scores into the range [0.5, 0.98] using the minimum and
# maximum of this batch, overwriting the raw values.
# NOTE(review): after this step the column is no longer a probability, and each
# value depends on which other rows are in the batch.
sca = MinMaxScaler((0.5, 0.98)).fit_transform(merged_23['predict_proba'].values.reshape(-1, 1))
merged_23['predict_proba'] = sca

In [30]:
# Move the last column to position 1, directly after the first column.
# NOTE(review): presumably the last column is 'predict_proba' at this point; that
# holds only when the cells are run once, in order.
merged_23 = merged_23[merged_23.columns[0:1].tolist() + merged_23.columns[-1:].tolist() + merged_23.columns[1:-1].tolist()]

In [31]:
# Drop every column from position 36 up to (but excluding) the last one, in place.
# NOTE(review): 36 is a positional magic number tied to the current column order;
# selecting the columns to keep by name would be safer.
merged_23.drop(merged_23.columns[36:-1], axis=1, inplace=True)

# Similarity

In [32]:
# Model features plus the job title; the title (last column) only selects peers.
sim_data = pd.concat([inf_data, merged_23[['직무']]], axis=1)
# Build the value matrix once; calling sim_data.values inside the loops rebuilt
# the whole array on every access.
sim_values = sim_data.values

cs_list = []

# For every employee, find the other employee with the same job title whose feature
# vector has the highest cosine similarity (on ties, the later row wins).
# The running best is kept in `best_score`, not `max`, so the year string stored in
# the global `max` is no longer overwritten by this cell.
# NOTE(review): when an employee has no peer, or no peer with a similarity >= 0, the
# result is row 0's employee number with a score of 0 - confirm this is intended.
for i in range(len(sim_data)) :
  best_score = 0
  best_idx = 0
  peer_index = sim_data[sim_data['직무']==sim_data['직무'][i]].index
  for index in peer_index :
    if i != index :
      cs = cosine_similarity(sim_values[i][:-1].reshape(1, -1), sim_values[index][:-1].reshape(1, -1))[0][0]
      if best_score <= cs :
        best_score = cs
        best_idx = index
  cs_list.append([merged_23['사번'][i], merged_23['사번'][best_idx], best_score])

cs_data = pd.DataFrame(cs_list, columns=['사번', '유사사원', '유사도'])

In [33]:
# Append the '유사사원' and '유사도' columns side by side; rows are matched on the index.
merged_23 = pd.concat([merged_23, cs_data[['유사사원', '유사도']]], axis=1)

In [34]:
# Left-join the department table on the department name to bring in its postal code
# and head count.
# NOTE(review): if '부서' is not unique in org_sort this join duplicates rows - verify.
merged_23 = pd.merge(merged_23, org_sort[['부서', '근무지_우편번호', '부서인원수']], left_on=['근무부서'], right_on=['부서'], how='left')
# Resolve the office postal code with zipcoding's default depth.
merged_23['근무부서_주소'] = merged_23['근무지_우편번호'].apply(lambda x : zipcoding(x))
# Remove the join helper columns again.
merged_23.drop(['부서', '근무지_우편번호'], axis=1, inplace=True)

# Original

In [35]:
# Employee numbers of the promotion candidates and of the recommendation result.
pm_id = merged_23['사번'].tolist()
recom_id = recom['사번'].tolist()

# Flag, for every row of the full personnel table, whether the employee appears in
# each list. Series.isin replaces the former per-row `in list` loop, which scanned
# the whole list for every row and relied on a 0..n-1 index.
all_data['승진대상자유무'] = all_data['사번'].isin(pm_id)
all_data['추천대상자유무'] = all_data['사번'].isin(recom_id)

# Export

In [36]:
# Write the promotion inference result and the flagged full personnel table as CSV
# files (without the index column) into the output folder.
merged_23.to_csv(f'{data_path}/result/output/infer_result_promotion.csv', index=False)
all_data.to_csv(f'{data_path}/result/output/kb_personal_data.csv', index=False)