In [1]:
# !pip install --upgrade git+https://github.com/tooha289/DataAnalysisLibrary.git

In [2]:
from tqdm import tqdm
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from scipy.stats import chi2_contingency
from DataAnalysis import eda

from matplotlib import font_manager

In [3]:
# Let pandas show up to 500 rows before truncating displayed frames/series.
pd.set_option('display.max_rows', 500)

In [4]:
# Use malgun.ttf (a Korean-capable font shipped with Windows) so Korean labels
# render in matplotlib figures. The file only exists on Windows, so guard the
# lookup: without the check, get_name() fails with FileNotFoundError elsewhere.
font_path = "C:/Windows/Fonts/malgun.ttf"
if os.path.exists(font_path):
    font_family = font_manager.FontProperties(fname=font_path).get_name()
    plt.rcParams["font.family"] = font_family
else:
    # Keep matplotlib's current font family; Korean glyphs may not render.
    font_family = plt.rcParams["font.family"]
    print(f"Font file not found, keeping the default font family: {font_path}")
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams["axes.unicode_minus"] = False

In [5]:
# Create the output folder(s) when they do not exist yet.
RECALL_PATH = './recalls/'
for path in (RECALL_PATH,):
    if os.path.exists(path):
        print(f"폴더가 이미 존재합니다: {path}")
        continue
    os.makedirs(path)
    print(f"폴더가 생성되었습니다: {path}")

폴더가 이미 존재합니다: ./recalls/


# 데이터 load

In [6]:
# Load the resume, recruitment and apply_train CSV files from ../data.
# `apply` is later joined to both other tables on resume_seq / recruitment_seq.
resume = pd.read_csv("../data/resume.csv")
recruitment = pd.read_csv("../data/recruitment.csv")
apply = pd.read_csv("../data/apply_train.csv")

# 전처리

In [7]:
# Keep only the key column and the features used by the rest of the notebook.
# `.copy()` makes each result an independent frame, so assigning columns on it
# later cannot raise SettingWithCopyWarning.
used_cols = ['resume_seq', 'degree', 'job_code_seq1']
resume = resume[used_cols].copy()

used_cols = ['recruitment_seq', 'address_seq1', 'education', 'major_task', 'qualifications']
recruitment = recruitment[used_cols].copy()

## address_seq1의 na값을 3으로 처리

In [8]:
# Inspect the distinct address codes; the displayed result includes NaN (missing values).
recruitment['address_seq1'].unique()

array([ 3., 20.,  5.,  9., 11.,  1., nan, 13.,  2.])

In [9]:
# Replace missing address_seq1 values with the code 3. Only that column is
# listed in the fill mapping, so every other column is left untouched.
recruitment = recruitment.fillna({'address_seq1':3}, axis=0)

## 타입 변경

In [10]:
# Feature columns treated as categorical on the resume side and on the posting side.
res_categorical_cols = ['degree', 'job_code_seq1']
rec_categorical_cols = ['address_seq1', 'education', 'major_task', 'qualifications']
# Cast those columns to pandas' `category` dtype, assigning back into the same frames.
resume[res_categorical_cols] = resume[res_categorical_cols].astype('category')
recruitment[rec_categorical_cols] = recruitment[rec_categorical_cols].astype('category')

In [11]:
# Check the resulting dtypes and non-null counts of the posting table.
recruitment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6695 entries, 0 to 6694
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   recruitment_seq  6695 non-null   object  
 1   address_seq1     6695 non-null   category
 2   education        6695 non-null   category
 3   major_task       6695 non-null   category
 4   qualifications   6695 non-null   category
dtypes: category(4), object(1)
memory usage: 79.6+ KB


## 데이터 병합

In [12]:
# Attach every application row to its resume features. The inner join keeps
# only resume_seq values that occur in both tables.
merged_df = pd.merge(resume, apply, on='resume_seq', how='inner')
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57946 entries, 0 to 57945
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   resume_seq       57946 non-null  object  
 1   degree           57946 non-null  category
 2   job_code_seq1    57946 non-null  category
 3   recruitment_seq  57946 non-null  object  
dtypes: category(2), object(2)
memory usage: 1.4+ MB


In [13]:
# Add the posting features to each (resume, posting) application pair. The
# inner join keeps only recruitment_seq values that occur in both tables.
merged_df = pd.merge(merged_df, recruitment, on='recruitment_seq', how='inner')
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57946 entries, 0 to 57945
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   resume_seq       57946 non-null  object  
 1   degree           57946 non-null  category
 2   job_code_seq1    57946 non-null  category
 3   recruitment_seq  57946 non-null  object  
 4   address_seq1     57946 non-null  category
 5   education        57946 non-null  category
 6   major_task       57946 non-null  category
 7   qualifications   57946 non-null  category
dtypes: category(6), object(2)
memory usage: 1.7+ MB


## 함수 정의

In [44]:
# Execute the project's helper scripts; %run loads the names they define into
# this notebook's namespace so later cells can call them.
# NOTE(review): the helpers called further down (create_score_dataframe,
# calculate_ratio_df_with_hue, create_ensemble_submission_2col,
# calculate_precision_at_k_dacon) are not defined in this notebook and
# presumably come from these files — confirm.
# EDA helpers
%run ../functions/eda.py
# Metric helpers
%run ../functions/metric.py
# Submission helpers
%run ../functions/submission.py

# 인기도 가중치

In [15]:
# Scaler instances created once and reused: `mms` performs the per-resume
# min-max scaling further down; `stds` is not referenced in the cells shown here.
mms = MinMaxScaler()
stds = StandardScaler()

# 모델 결과 생성

## idx 디코딩

### 디코딩 딕셔너리 로드

In [16]:
# Directory holding the index-to-ID lookup files read in the next cells.
DATA_PATH = '../data/'

In [17]:
# Space-separated lookup file without a header: column 0 becomes the key and
# column 1 the value of a plain dict, used later to decode resume indices.
resume_decode = (
    pd.read_csv(f"{DATA_PATH}idx_2_resume.txt", delimiter=" ", header=None)
    .set_index(0)[1]
    .to_dict()
)

In [18]:
# Space-separated lookup file without a header: column 0 becomes the key and
# column 1 the value of a plain dict, used later to decode posting indices.
# os.path.join builds the path without a doubled separator, whether or not
# DATA_PATH ends with '/'.
recruitment_decode = pd.read_csv(os.path.join(DATA_PATH, "idx_2_recruitment.txt"), delimiter=" ", header=None)
recruitment_decode = recruitment_decode.set_index(0).to_dict()[1]

## CF 모델

In [19]:
# Dataset tag embedded in the CF prediction file names and in the exported CSV name.
DATA_SET = "JOB"

In [20]:
# Read the CF model's two result files (pred_prob_* and pred_idx_*), both
# space-separated and without a header row.
CF_FOLDER_PATH = '../CF/results/'
cf_proba, cf_rating_idx = (
    pd.read_csv(f'{CF_FOLDER_PATH}pred_{kind}_{DATA_SET}.txt', delimiter=' ', header=None)
    for kind in ('prob', 'idx')
)

In [21]:
# Combine the CF probability table and index table into one score frame.
# NOTE(review): create_score_dataframe is defined outside this notebook; the
# displayed result suggests it yields resume_idx / recruitment_idx / cf_score
# columns, one row per scored pair — confirm.
cf_score = create_score_dataframe(cf_proba, cf_rating_idx, "cf")
cf_score.head(20)

Unnamed: 0,resume_idx,recruitment_idx,cf_score
0,0,357,1.0
1,0,2832,0.838635
2,0,156,0.731138
3,0,5164,0.700984
4,0,4670,0.672044
5,0,6228,0.596687
6,0,3641,0.567364
7,0,4675,0.560605
8,0,1965,0.547928
9,0,3349,0.529323


## LTOCF 모델

In [22]:
# Location and shared file-name stem of the LT-OCF result files read below.
LTOCF_FOLDER_PATH = '../LT-OCF/results/'
LTOCF_FILE_NAME = 'JOB_360_0.0030__319'

In [23]:
# Read the three LT-OCF result files (proba_*, rating_*, user_*), all
# space-separated and without a header row.
lt_proba, lt_rating_idx, lt_user = (
    pd.read_csv(f'{LTOCF_FOLDER_PATH}{prefix}_{LTOCF_FILE_NAME}.txt', delimiter=' ', header=None)
    for prefix in ('proba', 'rating', 'user')
)

In [24]:
# Put the user column(s) in front of the probability table and of the
# rating-index table by concatenating column-wise.
lt_proba_df = pd.concat([lt_user, lt_proba], axis=1)
lt_idx_df = pd.concat([lt_user, lt_rating_idx], axis=1)

In [25]:
# Build the LT-OCF score frame with the same helper used for the CF results.
# NOTE(review): create_score_dataframe is defined outside this notebook — confirm
# it expects the user column(s) first, as supplied here.
ltocf_score = create_score_dataframe(lt_proba_df, lt_idx_df, "ltocf")
ltocf_score.head()

Unnamed: 0,resume_idx,recruitment_idx,ltocf_score
0,0,357,0.999983
1,0,156,0.999955
2,0,1965,0.999934
3,0,1030,0.999892
4,0,2832,0.999884


In [26]:
# Relabel the columns 0..n-1. Both inputs were read with header=None, so after
# the column-wise concat the integer labels restart at 0 and are duplicated.
lt_idx_df.columns = range(lt_idx_df.shape[1])

# 모델 결과 + 인기도 가중치

In [27]:
def create_weight_df(origin_df, popularity_df, target_cols, grouping_col, limit_ratio=0.005):
    """Attach one ``<target_col>_weight`` column per target column to ``popularity_df``.

    For every name in ``target_cols`` a ratio table is requested from
    ``calculate_ratio_df_with_hue`` (defined outside this cell). Its last row and
    last column are dropped, the index is turned back into a column, and the
    table is melted into long form with the value column
    ``f"{target_col}_weight"``. That long table is inner-joined onto the running
    result on ``[grouping_col, target_col]``.

    Args:
        origin_df: Frame passed through to ``calculate_ratio_df_with_hue``.
        popularity_df: Frame to enrich; it is copied, not modified.
        target_cols: Iterable of column names to compute weights for.
        grouping_col: Column name used as the melted variable and as a join key.
        limit_ratio: Passed through to ``calculate_ratio_df_with_hue``.

    Returns:
        ``(enriched_frame, long_tables)``, where ``long_tables`` holds the melted
        weight table of each target column, in ``target_cols`` order.
    """
    enriched = popularity_df.copy()
    long_tables = []
    for col in target_cols:
        # Only the second table returned by the helper is used.
        _unused, ratio_table = calculate_ratio_df_with_hue(origin_df, col, grouping_col, limit_ratio)

        # Drop the trailing row/column (presumably totals — confirm in eda.py).
        # reset_index() is expected to expose `col` as a regular column for melt.
        trimmed = ratio_table.iloc[:-1, :-1].reset_index()
        long_table = trimmed.melt(id_vars=col, var_name=grouping_col, value_name=f'{col}_weight')
        long_tables.append(long_table)

        # Inner join: rows without a matching (grouping_col, col) weight are dropped.
        enriched = enriched.merge(long_table, how='inner', on=[grouping_col, col])
    return enriched, long_tables

## CF 인기도 가중치 적용

In [28]:
# Translate the model's integer indices back to the original IDs. Using the
# dicts' __getitem__ keeps the KeyError for any index missing from a lookup.
cf_score['resume_seq'] = cf_score['resume_idx'].map(resume_decode.__getitem__)
cf_score['recruitment_seq'] = cf_score['recruitment_idx'].map(recruitment_decode.__getitem__)
cf_score.head()

Unnamed: 0,resume_idx,recruitment_idx,cf_score,resume_seq,recruitment_seq
0,0,357,1.0,U05833,R00585
1,0,2832,0.838635,U05833,R01455
2,0,156,0.731138,U05833,R03943
3,0,5164,0.700984,U05833,R00304
4,0,4670,0.672044,U05833,R04100


In [29]:
# Attach the resume-side features (degree, job_code_seq1) to every scored pair.
# Note: this rebinds cf_score, so re-running the cell merges a second time.
cf_score = cf_score.merge(resume, how='inner', on='resume_seq')
cf_score.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 169640 entries, 0 to 169639
Data columns (total 7 columns):
 #   Column           Non-Null Count   Dtype   
---  ------           --------------   -----   
 0   resume_idx       169640 non-null  int64   
 1   recruitment_idx  169640 non-null  int64   
 2   cf_score         169640 non-null  float64 
 3   resume_seq       169640 non-null  object  
 4   recruitment_seq  169640 non-null  object  
 5   degree           169640 non-null  category
 6   job_code_seq1    169640 non-null  category
dtypes: category(2), float64(1), int64(2), object(2)
memory usage: 8.1+ MB


### job_code_seq1

#### recruitment_feature : job_code_seq1 가중치

In [30]:
# Add the posting-side features to each scored pair, then derive one weight
# column per posting feature, keyed by the resume's job_code_seq1. The ratio
# tables are computed from merged_df (the resume/apply/recruitment join above);
# limit_ratio=0 overrides the function's default of 0.005.
cf_score_weight = cf_score.merge(recruitment, how='inner', on='recruitment_seq')
cf_job_weight, weight_dfs = create_weight_df(merged_df, cf_score_weight, ['address_seq1', 'education',
                                                      'major_task', 'qualifications'], 'job_code_seq1', limit_ratio=0)

In [31]:
# Collect the names of all columns containing "_weight" (added by create_weight_df).
weight_columns = cf_job_weight.filter(like="_weight").columns
weight_columns

Index(['address_seq1_weight', 'education_weight', 'major_task_weight',
       'qualifications_weight'],
      dtype='object')

In [32]:
# Row-wise sum of the square roots of the individual weight columns.
cf_job_weight['sqrt_weight_sum'] = np.sqrt(cf_job_weight[weight_columns]).sum(axis=1)

In [33]:
# Min-max scale both signals separately within each resume_idx group
# (MinMaxScaler's default output range is [0, 1]). The shared scaler `mms` is
# re-fitted on every group; reshape(-1, 1) supplies the 2-D input fit_transform
# expects and the trailing reshape(-1) flattens the result back to 1-D.
cf_job_weight['s_sqrt_weight_sum'] = cf_job_weight.groupby('resume_idx')['sqrt_weight_sum'].transform(
    lambda x: (mms.fit_transform(x.values.reshape(-1,1))).reshape(-1))
cf_job_weight['scf_score'] = cf_job_weight.groupby('resume_idx')['cf_score'].transform(
    lambda x: (mms.fit_transform(x.values.reshape(-1,1))).reshape(-1))

#### RECALL 계산

##### 가중치 인기도

In [45]:
# Blend the scaled CF score with the scaled weight sum at several weightings.
# NOTE(review): create_ensemble_submission_2col is defined outside this notebook.
# The printed lines and later lookups suggest start/end/step are percentages for
# the first column and that results are keyed by strings like '0.92_0.08' — confirm.
weight_scores = create_ensemble_submission_2col(cf_job_weight,
                                                'scf_score', 's_sqrt_weight_sum',
                                                start=92, end=101, step=8)

0.92 : 0.08
1.00 : 0.00


In [46]:
# Key of the blend that is evaluated and exported in the following cells.
ratio= '0.92_0.08'

In [47]:
# Compare the selected blend against the '1.00_0.00' result (all weight on the
# first column) with k=5.
# NOTE(review): calculate_precision_at_k_dacon is defined outside this notebook —
# confirm the argument order (prediction vs. reference) and the metric's meaning.
calculate_precision_at_k_dacon(weight_scores[ratio], weight_scores['1.00_0.00'], 5)

0.9545390238151671

In [49]:
# Save the selected blend without the index column. os.path.join builds the
# path without a doubled separator, whether or not RECALL_PATH ends with '/'.
weight_scores[ratio].to_csv(os.path.join(RECALL_PATH, f'weighted_{DATA_SET}_{ratio}.csv'), index=False)