In [None]:
# Mount Google Drive at /content/drive (uses the google.colab helper, so this cell is Colab-specific).
from google.colab import drive
drive.mount('/content/drive')

# Install

In [None]:
# Install the two third-party packages imported in the next cell (ftfy, kozip); versions are not pinned.
!pip install ftfy
!pip install kozip

# Package

In [None]:
import pandas as pd
import numpy as np
import re
import ftfy
from datetime import datetime
import joblib
import json
import copy
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from kozip import KoZIP
import warnings
warnings.simplefilter('ignore')

In [None]:
# Run date formatted as a YYYYMMDD string.
today = datetime.today().strftime("%Y%m%d")
# Seed value; not referenced by any of the cells visible in this notebook section.
random_seed = 42

# Function

In [None]:
def fixEncoding(text:str)->str:
    """Clean one text cell: run ``ftfy.fix_text`` and flatten whitespace.

    Args:
        text: The value to clean. Anything that is not a ``str`` (for example
            the NaN/None that pandas uses for a missing cell) is returned
            unchanged instead of raising inside ftfy.

    Returns:
        The text after ``ftfy.fix_text`` with every single whitespace
        character (tab, newline, ...) replaced by one space; non-string
        input is returned as-is.
    """
    # Missing cells are not strings; pass them through untouched.
    if not isinstance(text, str):
        return text

    text = ftfy.fix_text(text)
    # One-for-one replacement: runs of whitespace are NOT collapsed.
    text = re.sub(r'\s', ' ', text)

    return text

In [None]:
def read_selected(filePath:str,
                  dtypesMapper:dict=None,
                  textCols:list[str]=None)->pd.DataFrame:
    """Load only the columns listed in ``dtypesMapper`` from a CSV or Excel file.

    The file is first parsed as CSV; if that fails for any ordinary reason it
    is re-read as an Excel workbook with the ``openpyxl`` engine.

    Args:
        filePath: Path of the file to read.
        dtypesMapper: Mapping of column name -> dtype. Its keys select the
            columns to load and fix the column order of the result.
        textCols: Optional column names whose string cells are passed through
            ``fixEncoding``. Non-string cells (e.g. NaN) are left untouched.

    Returns:
        A DataFrame holding exactly the ``dtypesMapper`` columns, in the
        mapping's key order.

    Raises:
        ValueError: If ``dtypesMapper`` is missing or empty.
    """
    if not dtypesMapper:
        # Raising a plain string is itself a TypeError; raise a real exception.
        raise ValueError("You should specify parameter [dtypesMapper]")

    columns = list(dtypesMapper)

    try:
        df = pd.read_csv(
            filePath,
            usecols=columns,
            dtype=dtypesMapper,
        )
    except Exception:
        # Not readable as CSV -> try Excel. `Exception` (not a bare except)
        # keeps Ctrl-C / kernel interrupts from being swallowed.
        df = pd.read_excel(
            filePath,
            engine='openpyxl',
            usecols=columns,
            dtype=dtypesMapper,
        )

    if textCols:
        for col in textCols:
            # Missing cells are NaN, not str; only clean real strings.
            df[col] = df[col].apply(
                lambda x: fixEncoding(x) if isinstance(x, str) else x
            )

    # Re-select so the column order follows the mapping, not the file.
    return df.loc[:, columns]

In [None]:
# Module-level KoZIP instance shared by zipcoding() below.
kozip = KoZIP()

def zipcoding(address) :
    """Return the first result of ``kozip.ZIPtoAddr(address, depth="2")``.

    Args:
        address: Value handed to ``ZIPtoAddr`` (presumably a ZIP code, going
            by the method name -- confirm against the kozip documentation).

    Returns:
        Element 0 of the lookup result, or the string ``"No Address"`` when
        the lookup raises or yields nothing to index.
    """
    try:
        return kozip.ZIPtoAddr(address, depth="2")[0]
    except Exception:
        # Best-effort lookup: any ordinary failure maps to the sentinel, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return "No Address"

In [None]:
def month_split(df) :
    """Convert each '근속기간' string of ``df`` into a total number of months.

    Each value is split on '년'; the part before it is read as whole years and
    the part after it, minus its last two characters, as months.

    Args:
        df: DataFrame with a '근속기간' column of strings such as '3년 5개월'.

    Returns:
        A list of ints (years * 12 + months), one per row, in row order.
    """
    def _to_months(tenure):
        years, remainder = tenure.split('년')
        # Drop the two trailing unit characters before parsing the months.
        return int(years) * 12 + int(remainder[:-2])

    return [_to_months(tenure) for tenure in df['근속기간']]

In [None]:
def cal_diff(date1, date2):
    """Return the calendar-month difference between two YYYYMMDD strings.

    Only the year and month fields are compared; the day of month is parsed
    but does not affect the result.

    Args:
        date1: Start date as a 'YYYYMMDD' string.
        date2: End date as a 'YYYYMMDD' string.

    Returns:
        Months from ``date1`` to ``date2`` (negative when ``date2`` is earlier).
    """
    fmt = "%Y%m%d"
    begin, finish = (datetime.strptime(stamp, fmt) for stamp in (date1, date2))
    return 12 * (finish.year - begin.year) + (finish.month - begin.month)

In [None]:
# Rank order of the position titles; a larger number is a higher rank.
position_dict = {
    '과장' : 1,
    '차장' : 2,
    '부부장' : 3,
    '부장' : 4,
    '수석부장' : 5
}

def position_sort(data) :
    """Return, per row, the position title to use for that row.

    Rule applied to each row of ``data`` (after resetting its index):

    * no '최근승진일', or its first four characters differ from the FIRST
      row's '년도' value -> the row's '직무' is used;
    * otherwise, if '직전승진일' is present -> '직전승진직급' is used;
    * otherwise -> the title ranked one step below '최근승진직급' in
      ``position_dict``.

    Finally every '감사역' in the result is replaced by '차장'.

    NOTE(review): every row is compared against ``data['년도'][0]`` (row 0),
    not its own '년도' -- confirm the frame always holds a single year.
    NOTE(review): a '최근승진직급' of '과장' has no lower rank in
    ``position_dict`` and raises IndexError on the last branch.

    Args:
        data: DataFrame with columns '최근승진일', '년도', '직무',
            '직전승진일', '직전승진직급' and '최근승진직급'.

    Returns:
        A list of position titles, one per row, in row order.
    """
    frame = data.reset_index(drop=True)

    def _resolve(idx):
        promoted_on = frame['최근승진일'][idx]
        # Guard: never promoted, or promoted in a different year.
        if pd.isna(promoted_on) or not (promoted_on[0:4] == frame['년도'][0]):
            return frame['직무'][idx]
        if not pd.isna(frame['직전승진일'][idx]):
            return frame['직전승진직급'][idx]
        # No earlier promotion recorded: step one rank down from the new title.
        lower_rank = position_dict[frame['최근승진직급'][idx]] - 1
        return [title for title, rank in position_dict.items() if rank == lower_rank][0]

    resolved = [_resolve(idx) for idx in range(len(frame))]
    return ['차장' if title == '감사역' else title for title in resolved]

# Dataset

In [None]:
# Base folder on the mounted Drive. It already ends with '/', so the f-strings below
# that append '/result/...' produce paths containing a double slash.
data_path = '/content/drive/My Drive/Colab Notebooks/KB캐피탈/2023/'

In [None]:
# Column -> dtype schema for the promotion inference result.
# read_selected loads only these columns and returns them in this order.
# NOTE(review): the np.int32 columns assume the file has no missing values in them -- confirm.
Mapper = {
    '사번': str,
    'predict_proba': np.float32,
    '성명' : str,
    '성별' : str,
    '연령' : np.int32,
    '연락처' : str,
    '이메일' : str,
    '생년월일' : str,
    '직무' : str,
    '연차' : np.int32,
    '근무본부' : str,
    '근무부서' : str,
    '근무지역' : str,
    '최종학력' : str,
    '신입경력' : str,
    '근속기간' : str,
    '수상여부' : np.int32,
    '전년도평균교육점수' : np.float32,
    '입사일' : str,
    '퇴사일' : str,
    '최근승진일' : str,
    '최근승진직급' : str,
    '직전승진일' : str,
    '직전승진직급' : str,
    '전년도KPI통과여부' : np.float32,
    '년도' : str,
    '전년도평가점수' : np.float32,
    '전년도평가등급' : str,
    '전전년도평가점수' : np.float32,
    '전전년도평가등급' : str,
    '전전전년도평가점수' : np.float32,
    '전전전년도평가등급' : str,
    '최종이동일' : str,
    '전년도총교육시간' : np.float32,
    '전년도총이수교육횟수' : np.float32,
    '근속개월' : np.int32,
    '다면평가등급' : np.float32,
}

# Load the inference output; the '사번' column is additionally passed through fixEncoding.
inf_data = read_selected(filePath = f'{data_path}/result/output/infer_result_promotion.csv',
                   dtypesMapper=Mapper,
                   textCols=['사번'])

In [None]:
# Column -> dtype schema for the department master table (every column is read as str).
# Note: this rebinds the global name `Mapper` used by the previous cell.
Mapper = {
    'ID' : str,
    '부서' : str,
    '통합ID' : str,
    '통합부서' : str,
    '비고' : str,
    '근무지_우편번호' : str,
    '부서인원수' : str,
    '근무지역' : str,
    '본부' : str
}

# The source is an .xlsx workbook: read_selected first attempts read_csv and then
# falls back to read_excel (openpyxl) when that attempt raises.
org_sort = read_selected(filePath = f'{data_path}/result/input/KB부서정보_전처리_검토사항_수정.xlsx',
                      dtypesMapper=Mapper)

In [None]:
# Column -> dtype schema for the recommendation result: '사번' plus three prediction columns.
# The column order matters: a later cell reads predict_1..predict_3 by position (1..3).
Mapper = {
    '사번': str,
    'predict_1' : str,
    'predict_2' : str,
    'predict_3' : str
}

# Load the recommendation output; the '사번' column is additionally passed through fixEncoding.
recom_data = read_selected(filePath = f'{data_path}/result/input/process/temp_infer_result_recom.csv',
                   dtypesMapper=Mapper,
                   textCols=['사번'])

# Tag

In [None]:
# Keep only the inference columns that the tag-building cells below read.
tag_data = inf_data[['사번', '근무부서', 'predict_proba', '연차', '직무', '최종학력', '신입경력', '근속개월', '수상여부', '전년도평균교육점수', '전년도KPI통과여부', '전년도평가등급', '다면평가등급']]

In [None]:
# Replace every missing value with 0 so the numeric comparisons below never see NaN.
# (fillna also returns a new frame, so the column assignments below do not touch inf_data.)
tag_data = tag_data.fillna(0)

# 75th-percentile cut-offs, computed once up front instead of once per row inside each lambda.
# The education and peer-review cut-offs ignore rows whose value is 0 (i.e. was missing).
tenure_q75 = tag_data['근속개월'].quantile(.75)
edu_score_q75 = tag_data.loc[tag_data['전년도평균교육점수'] != 0, '전년도평균교육점수'].quantile(.75)
peer_grade_q75 = tag_data.loc[tag_data['다면평가등급'] != 0, '다면평가등급'].quantile(.75)

# '#<직무> <연차>년차' label, built column-wise so it does not depend on a 0..n-1 index.
tag_data['연차'] = '#' + tag_data['직무'] + " " + tag_data['연차'].astype(str) + "년차"
tag_data['#근속개월'] = tag_data['근속개월'].apply(lambda x : "#근속기간 김" if x >= tenure_q75 else "#근속기간 짧음")
tag_data['#수상여부'] = tag_data['수상여부'].apply(lambda x : "#수상" if x==1 else np.nan)
tag_data['#전년도KPI통과여부'] = tag_data['전년도KPI통과여부'].apply(lambda x : "#KPI 통과" if x==1 else np.nan)
# Grades S/A become a tag; B, C and the 0 placeholder become NaN; any other value is left as-is.
tag_data['#전년도평가등급'] = tag_data['전년도평가등급'].replace({"S" : "#고과점수 높음", "A" : "#고과점수 높음", "B" : np.nan, "C" : np.nan, 0 : np.nan})
tag_data['#전년도평균교육점수'] = tag_data['전년도평균교육점수'].apply(lambda x : "#교육점수 높음" if x >= edu_score_q75 else np.nan)
tag_data['#다면평가등급'] = tag_data['다면평가등급'].apply(lambda x : "#다면평가점수 높음" if x >= peer_grade_q75 else np.nan)
# This tag (like the transfer tag built later) carries no leading '#'.
tag_data['#승진가능성'] = tag_data['predict_proba'].apply(lambda x : "승진 가능성 높음" if x>=0.8 else np.nan)
tag_data['신입경력'] = '#' + tag_data['신입경력']

In [None]:
# Remove the raw probability column from the tag table.
# In-place drop: re-running this cell alone raises KeyError because the column is already gone.
tag_data.drop(['predict_proba'], axis=1, inplace=True)

In [None]:
# Revised 231124
# Build the transfer tag: an employee is tagged when they have recommendations,
# their current department is not one of the departments belonging to the three
# recommended '통합부서' values, and their own department has a '통합부서' value.

# Hoisted out of the loop: the set of employee IDs that have a recommendation row.
recommended_ids = set(recom_data['사번'].unique())

moving_list = []

for emp_id, current_dept in zip(tag_data['사번'], tag_data['근무부서']):
    if emp_id not in recommended_ids:
        moving_list.append(np.nan)
        continue

    # Columns 1..3 of the first matching row hold predict_1..predict_3.
    recommended_groups = recom_data[recom_data['사번'] == emp_id].values[0][1:4]
    recommended_depts = []
    for group in recommended_groups:
        recommended_depts += org_sort.loc[org_sort['통합부서'] == group, '부서'].unique().tolist()

    if current_dept in recommended_depts:
        moving_list.append(np.nan)
        continue

    own_group = org_sort.loc[org_sort['부서'] == current_dept, '통합부서']
    # A department missing from org_sort is treated like one without a '통합부서'
    # value (no tag) instead of raising IndexError.
    if own_group.empty or pd.isna(own_group.iloc[0]):
        moving_list.append(np.nan)
    else:
        moving_list.append("부서이동 가능성 높음")

tag_data['#이동가능성'] = moving_list

In [None]:
# Keep the identifier columns plus the derived tag columns; the raw numeric inputs
# ('근속개월', '수상여부', ...) are dropped here.
tag_data = tag_data[['사번', '근무부서', '연차', '직무', '최종학력', '신입경력', '#승진가능성', '#이동가능성', '#근속개월', '#수상여부', '#전년도평균교육점수', '#전년도KPI통과여부',
                     '#전년도평가등급', '#다면평가등급']]

In [None]:
# Give the two overwritten source columns the same '#' prefix as the other tag columns.
tag_data.rename(columns={'연차' : '#연차', '신입경력' : '#신입경력'}, inplace=True)

# Export

In [None]:
# Promotion-likelihood and transfer-likelihood are the exceptions (placed first);
# the remaining tags are ordered by tag importance.
# Note: '최종학력' is not in this list, so it is dropped from the exported table.
tag_data = tag_data[['사번', '근무부서', '직무', '#승진가능성', '#이동가능성', '#연차', '#전년도평가등급', '#다면평가등급', '#전년도평균교육점수', '#근속개월', '#수상여부', '#전년도KPI통과여부', '#신입경력']]

In [None]:
# Write the final tag table to the output folder without the index column.
tag_data.to_csv(f'{data_path}/result/output/tag_data_promotion.csv', index=False)