In [1]:
# Mount Google Drive at /content/drive (Colab-only import); the data paths below live under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Install

In [2]:
# Install the two third-party packages imported in the next cell.
# NOTE(review): versions are not pinned, so a later re-run may install a different release.
!pip install ftfy
!pip install haversine



# Package

In [3]:
import pandas as pd
import numpy as np
import re
import ftfy
from haversine import haversine
from datetime import datetime
from urllib.request import urlopen
from urllib import parse
from urllib.request import Request
from urllib.error import HTTPError
import json
import copy
import joblib
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.simplefilter('ignore')

In [4]:
# Run date formatted as YYYYMMDD.
# NOTE(review): neither `today` nor `random_seed` is referenced in the visible cells — confirm they are still needed.
today = datetime.today().strftime("%Y%m%d")
random_seed = 42

# Function

In [5]:
def fixEncoding(text:str)->str:
    """Repair broken encoding in `text` and turn every whitespace character into a space.

    Parameters
    ----------
    text : str
        Raw cell value. Anything that is not a `str` (for example the float
        NaN that stands for an empty cell, or None) is returned unchanged
        instead of raising.

    Returns
    -------
    str
        `ftfy.fix_text(text)` with each whitespace character (tab, newline,
        ...) replaced by a single space. Runs of whitespace are not collapsed.
    """
    # Missing cells reach this function as NaN when it is applied over a
    # column; neither ftfy nor re can process a float, so pass them through.
    if not isinstance(text, str):
        return text

    text = ftfy.fix_text(text)
    text = re.sub(r'\s', ' ', text)

    return text

In [6]:
def read_selected(filePath:str,
                  dtypesMapper:dict=None,
                  textCols:list[str]=None)->pd.DataFrame:
    """Read only the mapped columns of a CSV file, typed and ordered as in `dtypesMapper`.

    Parameters
    ----------
    filePath : str
        Path (or anything `pd.read_csv` accepts) of the CSV file.
    dtypesMapper : dict
        Maps column name -> dtype. Its keys select the columns to load and
        fix the column order of the returned frame. Required despite the
        `None` default.
    textCols : list[str], optional
        Columns whose values are passed through `fixEncoding` after loading.

    Returns
    -------
    pd.DataFrame
        The selected columns, in the key order of `dtypesMapper`.

    Raises
    ------
    ValueError
        If `dtypesMapper` is None or empty.
    """
    if not dtypesMapper:
        # The previous `raise("...")` tried to raise a plain str, which Python
        # rejects with an unrelated TypeError; raise a real exception instead.
        raise ValueError("You should specify parameter [dtypesMapper]")

    selected_cols = list(dtypesMapper)
    df = pd.read_csv(
        filePath,
        usecols=selected_cols,
        dtype=dtypesMapper
        )

    if textCols:
        for col in textCols:
            df[col] = df[col].apply(fixEncoding)

    # read_csv keeps the file's column order; re-order to the mapper's order.
    return df.loc[:, selected_cols]

# Dataset

In [7]:
# Root folder on the mounted Drive; every input and output CSV path below is built from it.
data_path = '/content/drive/My Drive/Colab Notebooks/KB캐피탈/2023'

In [8]:
# Columns to load from temp_infer_result_recom.csv and the dtype each one is parsed as.
# The key order also becomes the column order of `inf_data` (see read_selected).
Mapper = {
    '사번' : str,
    'predict_1' : str,
    'predict_2' : str,
    'predict_3' : str,
    'distance_1' : np.float32,
    'distance_2' : np.float32,
    'distance_3' : np.float32,
    'proba_1' : np.float32,
    'proba_2' : np.float32,
    'proba_3' : np.float32,
    '성명' : str,
    '성별' : str,
    '연령' : np.int32,
    '근속기간' : str,
    '현근무부서' : str,
    '학력' : str,
    '전공' : str,
    '거주지' : str,
    '직무관련자격증' : np.int32,
    '수상여부' : np.int32,
    '직책' : str,
    '이동희망부서1' : str,
    '이동희망부서2' : str,
    '이동희망부서3' : str,
    '이동희망시기' : str,
    '최종이동일' : str,
    '경영자역량강화' : np.int32,
    '디지털역량강화' : np.int32,
    '조직가치공유' : np.int32,
    '직무역량강화' : np.int32,
    '핵심인재육성' : np.int32,
    '근무부서_주소' : str,
    '현재통근거리' : np.float32,
    '부서근속기간' : np.int32
    }

# Only '사번' is additionally cleaned with fixEncoding (via textCols).
inf_data = read_selected(filePath = f'{data_path}/result/input/process/temp_infer_result_recom.csv',
                      dtypesMapper=Mapper,
                      textCols=['사번'])

In [9]:
# Columns to load from 3rd_personal_tag_recom.csv and their dtypes.
# NOTE: `Mapper` is rebound here; the previous cell's mapping is no longer available.
Mapper = {
    '사번' : str,
    '추천부서' : str,
    '유사사원' : str,
    '유사도' : np.float32,
    'distance' : np.float32,
    '#과거부서유무' : str,
    '#이동희망부서' : str,
    '부서적합률' : np.float32
    }

# Only '사번' is additionally cleaned with fixEncoding (via textCols).
cs_data = read_selected(filePath = f'{data_path}/result/output/3rd_personal_tag_recom.csv',
                      dtypesMapper=Mapper,
                      textCols=['사번'])

In [10]:
# Columns to load from org_data.csv and their dtypes.
# NOTE: `Mapper` is rebound here; the previous cell's mapping is no longer available.
Mapper = {
    'ID' : str,
    '부서' : str,
    '통합ID' : str,
    '통합부서' : str,
    '근무지' : str,
    '근무지_우편번호' : str,
    'item_lat' : np.float32,
    'item_long' : np.float32,
    '부서인원수' : str,
    '근무지역' : str,
    '본부' : str
    }

# Only 'ID' is additionally cleaned with fixEncoding (via textCols).
org_data = read_selected(filePath = f'{data_path}/result/input/process/org_data.csv',
                      dtypesMapper=Mapper,
                      textCols=['ID'])

In [11]:
# Columns to load from infer_result_promotion.csv and their dtypes.
# NOTE: `Mapper` is rebound here; the previous cell's mapping is no longer available.
Mapper = {
    '사번': str,
    'predict_proba': np.float32
}

# Only '사번' is additionally cleaned with fixEncoding (via textCols).
pro_data = read_selected(filePath = f'{data_path}/result/output/infer_result_promotion.csv',
                   dtypesMapper=Mapper,
                   textCols=['사번'])

# Tag

In [12]:
# '부서근속기간' floor-divided by 12 is embedded in the tag text, one tag per row.
tenure_years = inf_data['부서근속기간'] // 12
inf_data['#부서근속기간'] = [f'#현재팀 {years}년 근무' for years in tenure_years]

In [13]:
# Revised 2023-11-24
# For every row, gather the '부서' values of org_data whose '통합부서' equals one of the
# row's three predictions; if the row's '현근무부서' is not among them, tag the row.
moving_list = []

for row_idx in range(len(inf_data)):
    candidate_depts = []
    for rank in (1, 2, 3):
        predicted_group = inf_data[f'predict_{rank}'][row_idx]
        in_group = org_data['통합부서'] == predicted_group
        candidate_depts.extend(org_data.loc[in_group, '부서'].unique().tolist())

    stays_put = inf_data['현근무부서'][row_idx] in candidate_depts
    moving_list.append(np.nan if stays_put else '부서이동 가능성 높음')

inf_data['#이동가능성'] = moving_list

In [14]:
# For each listed column, write "#<column>" into a new column of that name on rows whose
# value is strictly above the column median; all other rows get NaN (no tag).
# The median is computed once per column — previously it was recomputed inside the
# lambda for every single row, and the statement was copy-pasted per column.
for _col in ['경영자역량강화', '디지털역량강화', '조직가치공유', '직무역량강화', '핵심인재육성']:
    _median = inf_data[_col].median()
    _tag = f'#{_col}'
    inf_data[_tag] = inf_data[_col].apply(lambda x: _tag if x > _median else np.nan)

In [15]:
# Rows with '현재통근거리' >= 40 get the long-commute tag; every other row
# (including NaN, which compares False) gets the easy-commute tag.
is_long_commute = inf_data['현재통근거리'] >= 40
inf_data['#현재통근'] = is_long_commute.map({True: '#통근 거리 40km 이상', False: '#통근 용이'})

In [16]:
# Revised 2023-12-14
# Rows of pro_data with predict_proba >= 0.8 get the tag text; every other row gets NaN.
pro_data['#승진가능성'] = pro_data['predict_proba'].apply(lambda x : "승진 가능성 높음" if x>=0.8 else np.nan)

# Drop a '#승진가능성' column left on inf_data by an earlier run of this cell. Without this,
# re-running the cell makes the merge emit '#승진가능성_x' / '#승진가능성_y' and the later
# column selections fail with a KeyError.
inf_data = pd.merge(
    inf_data.drop(columns=['#승진가능성'], errors='ignore'),
    pro_data[['사번', '#승진가능성']],
    on=['사번'],
    how='left',
)

In [17]:
# Keep only org_data rows that have both '근무지_우편번호' and '본부', re-indexed from 0.
has_zip_code = org_data['근무지_우편번호'].notna()
has_headquarters = org_data['본부'].notna()
now_dep = org_data[has_zip_code & has_headquarters].reset_index(drop=True)

In [18]:
# Revised 2023-11-24
# For every row of now_dep, list the inf_data rows that name its '통합부서' in predict_1..3,
# recording the first (lowest) prediction rank at which it appears.
dep_list = []
for dept_name, group_name in zip(now_dep['부서'], now_dep['통합부서']):
    names_group = (
        (inf_data['predict_1'] == group_name)
        | (inf_data['predict_2'] == group_name)
        | (inf_data['predict_3'] == group_name)
    )
    matched = inf_data[names_group]

    for emp_id, first, second, third in zip(
        matched['사번'], matched['predict_1'], matched['predict_2'], matched['predict_3']
    ):
        rank = next(
            (pos for pos, predicted in enumerate((first, second, third), start=1)
             if predicted == group_name),
            None,
        )
        if rank is not None:
            dep_list.append([dept_name, group_name, emp_id, rank])

tag_data = pd.DataFrame(dep_list, columns=['부서', '통합부서', '사번', '추천부서순위'])

In [19]:
# Left-join three sources onto tag_data, so every tag_data row is kept:
#   1. cs_data, matched on (통합부서 == 추천부서, 사번)
#   2. pro_data's '#승진가능성', matched on 사번
#   3. the per-employee tags already computed on inf_data, matched on 사번
cs_cols = ['사번', '추천부서', 'distance', '#과거부서유무', '#이동희망부서', '부서적합률']
inf_tag_cols = ['사번', '#부서근속기간', '#이동가능성', '#경영자역량강화', '#디지털역량강화', '#조직가치공유', '#직무역량강화', '#핵심인재육성']

tag_result = (
    tag_data
    .merge(cs_data[cs_cols], left_on=['통합부서', '사번'], right_on=['추천부서', '사번'], how='left')
    .merge(pro_data[['사번', '#승진가능성']], on=['사번'], how='left')
    .merge(inf_data[inf_tag_cols], on=['사번'], how='left')
)

In [20]:
# Rows with 'distance' >= 40 get the long-commute tag; every other row gets the easy-commute tag.
# NOTE(review): a NaN distance (possible after the left merge) compares False and is
# therefore tagged '#통근 용이' — confirm that is intended.
is_long_commute = tag_result['distance'] >= 40
tag_result['#통근'] = is_long_commute.map({True: '#통근 거리 40km 이상', False: '#통근 용이'})

In [21]:
# Split inf_data into the per-employee tag table and the main attribute table.
first_tag_cols = [
    '사번', '#부서근속기간', '#경영자역량강화', '#디지털역량강화', '#조직가치공유',
    '#직무역량강화', '#핵심인재육성', '#현재통근', '#승진가능성', '#이동가능성',
]
main_result_cols = [
    '사번', 'predict_1', 'predict_2', 'predict_3',
    'distance_1', 'distance_2', 'distance_3',
    'proba_1', 'proba_2', 'proba_3',
    '성명', '성별', '연령', '근속기간', '현근무부서', '학력', '전공', '직책',
    '이동희망부서1', '이동희망부서2', '이동희망부서3', '이동희망시기', '최종이동일',
    '근무부서_주소', '현재통근거리', '부서근속기간',
]

first_tag = inf_data[first_tag_cols]
main_result = inf_data[main_result_cols]

In [22]:
# Add '본부' to main_result: the '본부' of the first org_data row whose '부서' equals the
# employee's '현근무부서'. The 부서 -> 본부 table is built once instead of filtering
# org_data again for every row.
hq_by_dept = (
    org_data.dropna(subset=['부서'])
    .drop_duplicates(subset=['부서'], keep='first')
    .set_index('부서')['본부']
)

# A department missing from org_data used to surface as a bare IndexError from
# `.values[0]`; still fail, but say which departments are missing.
unmatched = main_result.loc[~main_result['현근무부서'].isin(hq_by_dept.index), '현근무부서']
if not unmatched.empty:
    raise ValueError(
        f"현근무부서 not found in org_data['부서']: {unmatched.astype(str).unique().tolist()}"
    )

# main_result is a column subset of inf_data; take an explicit copy so the new column is
# not assigned onto a slice (SettingWithCopyWarning, currently hidden by the warnings filter).
main_result = main_result.copy()
main_result['본부'] = main_result['현근무부서'].map(hq_by_dept)

# Export

In [23]:
# Revised 2023-11-24
# Fix the column order (and, for tag_result, the column subset) written to the export files.
first_tag_export_cols = [
    '사번', '#승진가능성', '#이동가능성', '#부서근속기간', '#현재통근',
    '#경영자역량강화', '#디지털역량강화', '#조직가치공유', '#직무역량강화', '#핵심인재육성',
]
tag_result_export_cols = [
    '부서', '통합부서', '사번', '#승진가능성', '#이동가능성', '추천부서',
    '추천부서순위', '부서적합률', '#이동희망부서', 'distance', '#부서근속기간', '#통근',
]

first_tag = first_tag[first_tag_export_cols]
tag_result = tag_result[tag_result_export_cols]

In [24]:
# Write the three result tables under {data_path}/result/output, without the index column.
first_tag.to_csv(f'{data_path}/result/output/1st_all_tag_recom.csv', index=False)
tag_result.to_csv(f'{data_path}/result/output/2nd_team_tag_recom.csv', index=False)
main_result.to_csv(f'{data_path}/result/output/infer_result_recom.csv', index=False)