In [None]:
# Mount Google Drive at /content/drive (Colab-only); data_path further down points inside this mount.
from google.colab import drive
drive.mount('/content/drive')

# Install

In [None]:
!pip install ftfy
!pip install haversine
!pip install catboost

# Packages

In [None]:
import pandas as pd
import numpy as np
import re
import ftfy
from haversine import haversine
from datetime import datetime
from urllib.request import urlopen
from urllib import parse
from urllib.request import Request
from urllib.error import HTTPError
import json
import copy
import joblib
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.simplefilter('ignore')

In [None]:
# Run date formatted as YYYYMMDD and a fixed seed value.
# NOTE(review): neither name is referenced again in the cells shown here.
today = datetime.today().strftime("%Y%m%d")
random_seed = 42

# Functions

In [None]:
def fixEncoding(text:str)->str:
    """Repair mis-encoded text and flatten its whitespace.

    Args:
        text: Cell value to clean. Non-string values (e.g. the NaN/None that
            pandas uses for missing cells) are returned unchanged instead of
            raising, so the function can be applied to a whole column.

    Returns:
        The text after ``ftfy.fix_text`` with every whitespace character
        (tabs, newlines, ...) replaced by a single space each, or the input
        itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    text = ftfy.fix_text(text)
    # One-for-one replacement: runs of whitespace are NOT collapsed.
    text = re.sub(r'\s', ' ', text)

    return text

In [None]:
def read_selected(filePath:str,
                  dtypesMapper:dict=None,
                  textCols:list[str]=None)->pd.DataFrame:
    """Read only the mapped columns of a CSV file, with explicit dtypes.

    Args:
        filePath: Path (or file-like object) handed to ``pd.read_csv``.
        dtypesMapper: ``{column name: dtype}``. Its keys select the columns to
            load and also fix the column order of the returned frame.
        textCols: Optional columns to clean with ``fixEncoding``.

    Returns:
        DataFrame holding exactly the ``dtypesMapper`` columns, in key order.

    Raises:
        ValueError: If ``dtypesMapper`` is missing or empty.
    """
    if not dtypesMapper:
        # The original `raise("...")` raised TypeError (a str is not an exception).
        raise ValueError("You should specify parameter [dtypesMapper]")

    selected_cols = list(dtypesMapper)
    df = pd.read_csv(filePath, usecols=selected_cols, dtype=dtypesMapper)

    for col in textCols or []:
        df[col] = df[col].apply(fixEncoding)

    # read_csv keeps the file's column order; re-order to the mapper's order.
    return df.loc[:, selected_cols]

# Dataset

In [None]:
# Root folder on the mounted Drive; every input, model and output path below is built from it.
data_path = '/content/drive/My Drive/Colab Notebooks/KB캐피탈/2023'

In [None]:
# Column -> dtype schema for inf_data.csv. Keys are inserted in the same order as
# before because read_selected returns the columns in this key order.
Mapper = {'사번': str}

# float32 columns whose names are six-digit codes.
Mapper.update(dict.fromkeys([
    '010900', '011100', '011200', '011700', '012340', '012900', '013500', '013600',
    '013900', '013910', '014000', '014100', '014102', '014200', '014300', '014400',
    '015120', '015900', '016200', '017800', '018800', '019900', '035000', '035800',
    '035900', '036100', '037100', '037800', '038000', '038100', '038200', '038400',
    '038500', '038600', '038700', '039300', '039400', '039800', '040100', '040900',
    '041300', '042220', '042800', '046600', '046700', '046800', '050500', '051000',
    '051200', '051500', '051600', '052500', '052700', '052800', '053400', '053500',
    '054310', '055110', '055111', '056110', '077100',
], np.float32))

# Employee profile columns, grouped into runs that share a dtype.
Mapper.update(dict.fromkeys(['성명', '성별'], str))
Mapper['연령'] = np.int32
Mapper.update(dict.fromkeys(
    ['근속기간', '현근무지', '현근무부서', '학력', '전공', '거주지', '거주지_우편번호'], str))
Mapper.update(dict.fromkeys(['직무관련자격증', '수상여부'], np.int32))
Mapper.update(dict.fromkeys(
    ['근무부서코드', '근무부서', '근무부서고과평균', '직책',
     '이동희망부서1', '이동희망부서2', '이동희망부서3', '이동희망시기', '최종이동일',
     'major_code', 'emp_address'], str))
Mapper.update(dict.fromkeys(['emp_lat', 'emp_long'], np.float32))
Mapper.update(dict.fromkeys(
    ['근속개월', '부서근속기간', '경영자역량강화', '디지털역량강화',
     '조직가치공유', '직무역량강화', '핵심인재육성'], np.int32))

# Load the employee table with the schema above; only the '사번' column is passed through fixEncoding.
raw_data = read_selected(filePath = f'{data_path}/result/input/process/inf_data.csv',
                      dtypesMapper=Mapper,
                      textCols=['사번'])

In [None]:
# Column -> dtype schema for org_data.csv (rebinds the name used for the employee schema).
# Pair order is kept because read_selected returns the columns in this key order.
Mapper = dict([
    ('ID', str),
    ('부서', str),
    ('통합ID', str),
    ('통합부서', str),
    ('근무지', str),
    ('근무지_우편번호', str),
    ('item_lat', np.float32),
    ('item_long', np.float32),
    ('부서인원수', str),
    ('근무지역', str),
    ('본부', str),
])

# Load the organisation table with the schema above; only the 'ID' column is passed through fixEncoding.
org_data = read_selected(filePath = f'{data_path}/result/input/process/org_data.csv',
                      dtypesMapper=Mapper,
                      textCols=['ID'])

In [None]:
# Load the fitted classifier and the lookup tables saved next to it.
# NOTE(review): joblib.load unpickles arbitrary objects — only load files from a trusted location.
# Usage visible below: `mapping` goes from a department code to the class label/column of the model,
# `recom_dict` from a department code to a display name, `prep_dict` is applied to '현근무부서' values.
model = joblib.load(f'{data_path}/result/model/model_recom.pkl')
mapping = joblib.load(f'{data_path}/result/model/mapping_recom.pkl')
prep_dict = joblib.load(f'{data_path}/result/model/prep_dict.pkl')
recom_dict = joblib.load(f'{data_path}/result/model/recom_dict.pkl')

# Inference

In [None]:
# Work on independent copies so raw_data keeps its original string values.
merge_data = raw_data.copy()
result_data = raw_data.copy()

# Encode the categorical columns as integers. The result is assigned back instead of
# calling `.replace(..., inplace=True)` on the selected column: that form mutates a
# temporary object and silently does nothing under pandas Copy-on-Write.
# Values absent from a mapping (e.g. the '감사역' title filtered out later) are left as they are.
result_data['성별'] = result_data['성별'].replace({'남자' : 1, '여자' : 0})
result_data['직책'] = result_data['직책'].replace({'팀원' : 1, '팀장' : 2, '소장' : 3})
result_data['학력'] = result_data['학력'].replace({'고등학교' : 1, '전문대학' : 2, '대학교' : 3, '대학원(석사)' : 4})

# Keep only the first two characters of the major code, as an integer.
result_data['major_code'] = result_data['major_code'].apply(lambda x : int(x[0:2]))

In [None]:
# Model input columns in order: the employee id (dropped before prediction),
# the six-digit code columns, then the encoded profile features.
fe_cols = (
    ['사번']
    + ['010900', '011100', '011200', '011700', '012340', '012900', '013500', '013600',
       '013900', '013910', '014000', '014100', '014102', '014200', '014300', '014400',
       '015120', '015900', '016200', '017800', '018800', '019900', '035000', '035800',
       '035900', '036100', '037100', '037800', '038000', '038100', '038200', '038400',
       '038500', '038600', '038700', '039300', '039400', '039800', '040100', '040900',
       '041300', '042220', '042800', '046600', '046700', '046800', '050500', '051000',
       '051200', '051500', '051600', '052500', '052700', '052800', '053400', '053500',
       '054310', '055110', '055111', '056110', '077100']
    + ['성별', '연령', '학력', '직무관련자격증', '수상여부', '직책', 'major_code',
       '근속개월', '부서근속기간',
       '경영자역량강화', '디지털역량강화', '조직가치공유', '직무역량강화', '핵심인재육성']
)

In [None]:
# Employees whose title is '감사역' are set aside as `outlier` (keeping their original
# index labels) and removed from both the encoded and the raw table, which are re-indexed 0..n-1.
is_auditor = result_data['직책'] == '감사역'
outlier = result_data.loc[is_auditor]
result_data = result_data.loc[~is_auditor].reset_index(drop=True)
raw_data = raw_data.loc[raw_data['직책'] != '감사역'].reset_index(drop=True)

In [None]:
# Feature matrix for the set-aside employees: their title is overwritten with 1,
# the id column is removed and everything is cast to float64.
out_data = (
    outlier[fe_cols]
    .assign(**{'직책': 1})
    .drop(columns=['사번'])
    .astype('float64')
)

In [None]:
# Feature matrix passed to the model: every fe_cols column except the id, as float64.
inf_data = result_data[fe_cols].drop(columns=['사번']).astype('float64')

In [None]:
# Class probabilities for every employee row of inf_data (one column per model class).
predict_proba = model.predict_proba(inf_data)

In [None]:
# Probability table keyed by employee id.
# NOTE(review): columns are assigned positionally, i.e. this assumes the key order of
# `mapping` matches the column order of predict_proba — confirm.
score_cols = list(mapping.keys())
proba = result_data[['사번']].copy()
proba[score_cols] = predict_proba
# Columns whose name is a recom_dict key are renamed to the mapped value; others are kept.
proba = proba.rename(columns=lambda name: recom_dict.get(name, name))

# Distance Filtering

In [None]:
# Department code for each model class, resolved once. The original repeated this
# reverse lookup of `mapping` for every (employee, class) pair.
class_dept_codes = [
    [code for code, label in mapping.items() if label == cls][0]
    for cls in model.classes_
]

# One [employee id, department code] row per employee and class: employees in proba
# order, classes in model.classes_ order.
data_list = [
    [emp_id, dept_code]
    for emp_id in proba['사번']
    for dept_code in class_dept_codes
]

recom_table = pd.DataFrame(data_list, columns=['사번', '부서코드'])

In [None]:
# Join each employee's current department ('현근무부서') to org_data on the department name.
# Left join: employees without a matching '부서' keep NaN in the org columns.
# NOTE(review): if '부서' is not unique in org_data this join duplicates employee rows — confirm.
temp = raw_data[['사번', '현근무지', '현근무부서', 'emp_lat', 'emp_long']]
dist = pd.merge(temp, org_data, left_on='현근무부서', right_on='부서', how='left')

In [None]:
# Missing department coordinates fall back to those of the first org_data row whose
# postal code is '16488'; the columns then become dep_lat / dep_long (current department).
fallback_site = org_data[org_data['근무지_우편번호']=='16488']
dist['item_lat'] = dist['item_lat'].fillna(fallback_site['item_lat'].values[0])
dist['item_long'] = dist['item_long'].fillna(fallback_site['item_long'].values[0])
dist = dist.rename(columns={'item_lat' : 'dep_lat', 'item_long' : 'dep_long'})

In [None]:
# Coordinates of each candidate department: org rows that have both a postal code and
# an integrated id, reduced to one row per '통합ID'.
has_site = pd.notna(org_data['근무지_우편번호']) & pd.notna(org_data['통합ID'])
candidate_coords = (
    org_data[has_site][['통합ID', 'item_lat', 'item_long']]
    .drop_duplicates()
    .drop_duplicates(subset='통합ID')
)
recom_table = pd.merge(recom_table, candidate_coords, left_on='부서코드', right_on='통합ID', how='left')

# Home and current-department coordinates of the employee.
employee_coords = dist[['사번', 'emp_lat', 'emp_long', 'dep_lat', 'dep_long']]
recom_table = pd.merge(recom_table, employee_coords, on='사번', how='left')

In [None]:
# Haversine distances in km from the employee's home to
#   - the candidate department  -> 'distance'
#   - the current department    -> 'dep_distance'
coord_cols = ['emp_lat', 'emp_long', 'item_lat', 'item_long', 'dep_lat', 'dep_long']
distance = []
dep_distance = []

for emp_lat, emp_long, item_lat, item_long, dep_lat, dep_long in zip(
        *(recom_table[col].to_numpy() for col in coord_cols)):
    if pd.isna(emp_lat):
        # No home coordinates: both distances are unknown. The original appended NaN
        # only to `distance`, so the two lists ended up with different lengths and the
        # column assignment below raised as soon as one employee had no coordinates.
        distance.append(np.nan)
        dep_distance.append(np.nan)
        continue

    start = (emp_lat, emp_long)
    distance.append(haversine(start, (item_lat, item_long), unit='km'))
    dep_distance.append(haversine(start, (dep_lat, dep_long), unit='km'))

recom_table['distance'] = distance
recom_table['dep_distance'] = dep_distance
recom_table.drop(coord_cols, axis=1, inplace=True)

In [None]:
# Attach every raw employee column to each (employee, candidate department) row.
recom_table = pd.merge(recom_table, raw_data, on='사번', how='left')

# Infer result

In [None]:
# Score table: one row per employee, one column per department code. The column for a
# code is the predict_proba column whose position is that code's value in `mapping`.
df_recom = pd.DataFrame(result_data['사번'], columns=['사번'])

for dept_code, class_col in mapping.items() :
    df_recom[dept_code] = predict_proba[:, class_col]

In [None]:
# Employees with 60+ months in their department (5 years or more): set the score of
# their current department to 0 so it cannot be recommended again.
filter_0 = recom_table[recom_table['부서근속기간']>=60]
long_tenure_ids = set(filter_0['사번'])  # built once instead of calling unique() per row

for i, emp_id in enumerate(df_recom['사번']) :
    if emp_id not in long_tenure_ids :
        continue
    now_dep = dist[dist['사번']==emp_id]['통합ID'].unique().tolist()[0]
    if now_dep in df_recom.columns :
        # .loc writes into df_recom itself; the chained form `df_recom[col][i] = 0`
        # may write to a temporary and is a no-op under pandas Copy-on-Write.
        df_recom.loc[i, now_dep] = 0

In [None]:
# For every employee row, order the department columns by that row's score (descending)
# and keep the names of the first three.
score_frame = df_recom[df_recom.columns[1:]]
predict_rank = [
    score_frame.sort_values(by=row, axis=1, ascending=False).columns[0:3].tolist()
    for row in range(len(df_recom))
]

df_recom[['predict_1', 'predict_2', 'predict_3']] = predict_rank

# Post-Filtering

In [None]:
# Employees with at least 3 years in their department whose current workplace is 40 km
# or more from home: workplaces within 40 km of home become candidates for the
# recommendation list. (Employees at 60+ months are excluded here.)
dept_tenure = recom_table['부서근속기간']
tenure_in_window = (dept_tenure >= 36) & (dept_tenure < 60)
current_is_far = recom_table['dep_distance'] >= 40
candidate_is_near = recom_table['distance'] < 40
filter_1 = recom_table[tenure_in_window & current_is_far & candidate_is_near]

In [None]:
# For every employee in filter_1 choose one nearby department to force into the top 3:
#   output_1 - the nearby department with the highest model score (0 when that score is 0)
#   output_2 - the nearby department closest to the employee's home
filtering_dict = {}

for emp_id in filter_1['사번'].unique() :  # loop variable no longer shadows builtin `id`
    emp_rows = filter_1[filter_1['사번']==emp_id]
    dep = emp_rows['부서코드'].tolist()
    temp = df_recom[df_recom['사번']==emp_id]
    current_top3 = temp[['predict_1', 'predict_2', 'predict_3']].values[0].tolist()

    # Sort the nearby departments by this employee's score, once.
    ranked = temp[dep].sort_values(by=temp.index[0], axis=1, ascending=False)
    output_1 = ranked.columns[0] if ranked.values[0][0] != 0 else 0

    # The closest department is looked up among this employee's own rows. The original
    # searched all of filter_1 for the minimum distance value, so a row of another
    # employee with the same distance could be picked.
    output_2 = emp_rows[emp_rows['distance']==emp_rows['distance'].min()]['부서코드'].values[0]

    if output_1 != 0 and output_1 not in current_top3 :
        filtering_dict[emp_id] = output_1
    elif output_1 in current_top3 and output_2 not in current_top3 :
        filtering_dict[emp_id] = output_2

In [None]:
# Put the forced department into the top 3 of the selected employees. predict_3 is
# replaced first; predict_2 / predict_1 are only touched when the lower slot already
# holds that department.
for i, emp_id in enumerate(df_recom['사번']) :
    if emp_id not in filtering_dict :
        continue
    forced_dep = filtering_dict[emp_id]
    # .at writes into df_recom itself; the chained form `df_recom[col][i] = value`
    # may write to a temporary and is a no-op under pandas Copy-on-Write.
    if df_recom.at[i, 'predict_3'] != forced_dep :
        df_recom.at[i, 'predict_3'] = forced_dep
    elif df_recom.at[i, 'predict_2'] != forced_dep :
        df_recom.at[i, 'predict_2'] = forced_dep
    else :
        df_recom.at[i, 'predict_1'] = forced_dep

In [None]:
# Home-to-department distance for each of the three recommended departments, per employee.
distance_three = []

for i in range(len(df_recom)) :
    emp_rows = recom_table[recom_table['사번']==df_recom['사번'][i]]
    distance_three.append([
        emp_rows[emp_rows['부서코드']==df_recom[f'predict_{rank}'][i]]['distance'].values[0]
        for rank in (1, 2, 3)
    ])

In [None]:
# Store the three distances next to the three recommendations.
df_recom[['distance_1', 'distance_2', 'distance_3']] = distance_three

In [None]:
# Result table: recommendations and distances joined with the raw employee columns.
inf_result = pd.merge(df_recom[['사번', 'predict_1', 'predict_2', 'predict_3', 'distance_1', 'distance_2', 'distance_3']], raw_data, on=['사번'], how='left')

In [None]:
# Rename every column whose name is a recom_dict key to the mapped value; other columns keep their name.
inf_result.rename(columns = lambda x : recom_dict[x] if x in recom_dict.keys() else x, inplace=True)

In [None]:
# Translate the recommended department codes through recom_dict (KeyError on an unknown code).
for rank_col in ('predict_1', 'predict_2', 'predict_3') :
    inf_result[rank_col] = inf_result[rank_col].apply(lambda code : recom_dict[code])

In [None]:
# Map every desired-transfer department id found in the three '이동희망부서' columns to
# the department name in org_data; ids without a match map to NaN.
hope_dict = {}

hope_ids = list(set(inf_result['이동희망부서1'].unique().tolist() + inf_result['이동희망부서2'].unique().tolist() + inf_result['이동희망부서3'].unique().tolist()))

for dept_id in hope_ids :  # loop variable no longer shadows builtin `id`
    matched = org_data[org_data['ID']==dept_id]  # filtered once per id
    hope_dict[dept_id] = matched['부서'].values[0] if len(matched) != 0 else np.nan

In [None]:
# Hard-coded names for five department ids; these overwrite whatever the org_data lookup produced.
hope_dict.update({
    '014800' : '청주지점',
    '029200' : '총무부',
    '012100' : 'HR부',
    '077400' : '투자금융지원팀',
    '013300' : '영남채권지점',
})

In [None]:
# Replace the desired-transfer department ids by their names via hope_dict.
for hope_col in ('이동희망부서1', '이동희망부서2', '이동희망부서3') :
    inf_result[hope_col] = inf_result[hope_col].apply(lambda dept_id : hope_dict[dept_id])

# Similarity

In [None]:
# Independent copy of the result table, so the similarity step can rewrite '현근무부서' without touching inf_result.
co_data = copy.deepcopy(inf_result)

In [None]:
# Translate the current department through prep_dict and then recom_dict, so it can be
# compared with the recommended department names. Values missing from either
# dictionary become the string "None".
co_data['현근무부서'] = (
    co_data['현근무부서']
    .apply(lambda dept : prep_dict.get(dept, "None"))
    .apply(lambda dept : recom_dict.get(dept, "None"))
)

In [None]:
# For each employee and each of the three recommended departments, find the current
# member of that department whose feature vector has the highest cosine similarity to
# the employee's. '감사부' is searched among the set-aside `outlier` employees, every
# other department among co_data.
# Changes from the original:
#   - the running maximum no longer rebinds the builtin `max`;
#   - when a department has no candidate, the similar employee is NaN (similarity 0)
#     instead of silently reporting the employee at index 0 (or raising KeyError for '감사부');
#   - the `i != index` test is dropped for '감사부': outlier labels are a different index
#     than co_data positions, and outlier employees are never part of co_data.
cs_list = []
feature_matrix = inf_data.values

for i in range(len(co_data)) :
    emp_vec = feature_matrix[i].reshape(1, -1)
    for j in range(1, 4) :
        recom_dep = co_data[f'predict_{j}'][i]
        best_score = 0
        best_idx = None

        if recom_dep == '감사부' :
            for index in outlier[outlier['현근무부서']==recom_dep].index :
                cosine = cosine_similarity(emp_vec, out_data.loc[index].values.reshape(1, -1))[0][0]
                if best_score <= cosine :
                    best_score = cosine
                    best_idx = index
            similar_emp = outlier['사번'][best_idx] if best_idx is not None else np.nan

        else :
            for index in co_data[co_data['현근무부서']==recom_dep].index :
                if index == i :
                    continue  # never compare an employee with themselves
                cosine = cosine_similarity(emp_vec, feature_matrix[index].reshape(1, -1))[0][0]
                if best_score <= cosine :
                    best_score = cosine
                    best_idx = index
            similar_emp = co_data['사번'][best_idx] if best_idx is not None else np.nan

        cs_list.append([co_data['사번'][i], recom_dep, similar_emp, best_score])

cs_data = pd.DataFrame(cs_list, columns=['사번', '추천부서', '유사사원', '유사도'])

In [None]:
# Address of each employee's current department: the '근무지' of the first org_data row
# with that department name. A single lookup table replaces the per-row scan of
# org_data, and a department missing from org_data now yields NaN instead of IndexError.
address_by_dept = (
    org_data.dropna(subset=['부서'])
    .drop_duplicates(subset='부서')
    .set_index('부서')['근무지']
)
address = inf_result['현근무부서'].map(address_by_dept).tolist()

inf_result['근무부서_주소'] = address

In [None]:
# Current commute distance: the dep_distance of the employee's first recom_table row.
inf_result['현재통근거리'] = inf_result['사번'].apply(
    lambda emp_id : recom_table[recom_table['사번']==emp_id]['dep_distance'].values[0]
)

# Remove the columns that are not part of the exported result.
unused_cols = ['emp_address', 'emp_lat', 'emp_long', '현근무지', '거주지_우편번호',
               '근무부서코드', '근무부서', '근무부서고과평균', 'major_code']
inf_result = inf_result.drop(columns=unused_cols)

# Tag

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every department column of `proba` to the range [0.5, 0.98]; MinMaxScaler
# scales each column independently. The score columns are selected by name (everything
# except '사번') rather than by position: the original `columns[1:]` slice relied on
# '사번' being the first column, which is no longer true after this cell has run once,
# so re-running the cell failed.
score_cols = proba.columns.drop('사번')
sca = MinMaxScaler((0.5, 0.98)).fit_transform(proba[score_cols])
proba_temp = pd.DataFrame(sca, columns=score_cols, index=proba.index)
proba_temp['사번'] = proba['사번']
proba = proba_temp.copy()

In [None]:
# Build the tag columns for every (employee, recommended department) row of cs_data.
last = []        # "#과거 소속 부서" when the employee's column for that department is non-zero
per_hope = []    # rank tag when the department equals one of the employee's desired departments
proba_list = []  # rescaled model score of the department for that employee

hope_tags = [
    ('이동희망부서1', "#본인 희망부서 1순위"),
    ('이동희망부서2', "#본인 희망부서 2순위"),
    ('이동희망부서3', "#본인 희망부서 3순위"),
]

for i in range(len(cs_data)) :
    emp_id = cs_data['사번'][i]
    recom_dep = cs_data['추천부서'][i]
    emp_row = inf_result[inf_result['사번']==emp_id]

    last.append("#과거 소속 부서" if emp_row[recom_dep].values[0] != 0 else np.nan)

    # The first matching preference wins; later ones are not evaluated.
    hope_tag = np.nan
    for hope_col, tag in hope_tags :
        if recom_dep == emp_row[hope_col].values[0] :
            hope_tag = tag
            break
    per_hope.append(hope_tag)

    proba_list.append(proba[proba['사번']==emp_id][recom_dep].values[0])

# Row-major flattening: relies on cs_data holding three consecutive rows per inf_result row.
cs_data['distance'] = inf_result[['distance_1', 'distance_2', 'distance_3']].values.reshape(-1)
cs_data['#과거부서유무'] = last
cs_data['#이동희망부서'] = per_hope
cs_data['부서적합률'] = proba_list

In [None]:
# List the departments in org_data (rows with a postal code) that belong to each
# recommended integrated department, padded with NaN to seven columns.
# The unused `max = 0` of the original was removed: it only rebound the builtin `max`.
real_dep = []
temp_db = org_data[pd.notna(org_data['근무지_우편번호'])].reset_index(drop=True)

for i in range(len(cs_data)) :
    dep = temp_db[temp_db['통합부서']==cs_data['추천부서'][i]]['부서'].unique().tolist()
    # No padding when there are already seven or more entries. With more than seven the
    # assignment below fails on the column count, as it did before.
    dep.extend([np.nan] * (7 - len(dep)))
    real_dep.append(dep)

cs_data[['세부부서1', '세부부서2', '세부부서3', '세부부서4', '세부부서5', '세부부서6', '세부부서7']] = real_dep

In [None]:
# Regroup the per-row scores of cs_data three at a time (one group per employee).
# Relies on cs_data holding three consecutive rows per inf_result row, in the same order.
inf_result[['proba_1', 'proba_2', 'proba_3']] = cs_data['부서적합률'].values.reshape(-1, 3)

In [None]:
# '본부' of the first org_data row whose '통합부서' equals the recommended department
# (IndexError when there is no such row).
cs_data['본부'] = cs_data['추천부서'].apply(lambda x : org_data[org_data['통합부서']==x]['본부'].values[0])

In [None]:
# Added 231124
# '근무지' of the first org_data row whose '통합부서' equals the recommended department
# (IndexError when there is no such row).
cs_data['추천부서_주소'] = [org_data[org_data['통합부서']==cs_data['추천부서'][i]]['근무지'].values[0] for i in range(len(cs_data))]

# Export

In [None]:
# On hold -> the past-department flag could be changed to "해당 됨" / "해당 안됨"
# (applicable / not applicable); the desired-department tag is to stay displayed.
# Write the per-recommendation tag table and the per-employee result table as CSV (no index column).
cs_data.to_csv(f'{data_path}/result/output/3rd_personal_tag_recom.csv', index=False)
inf_result.to_csv(f'{data_path}/result/input/process/temp_infer_result_recom.csv', index=False)