In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
FILE_PATH = './data'
SAVE_PATH = './data'

# [훈련 데이터 생성]

## apply_train 파일 로드

In [3]:
apply = pd.read_csv(f'{FILE_PATH}/apply_train.csv')
apply.head()

Unnamed: 0,resume_seq,recruitment_seq
0,U05833,R03838
1,U06456,R02144
2,U07807,R01877
3,U04842,R02463
4,U08336,R00112


## 딕셔너리 치환 (인코딩, 디코딩)

### Resume 딕셔너리 생성

In [4]:
resume_2_idx = { res : i for i, res in enumerate(apply['resume_seq'].unique())}
[(key, value) for key, value in resume_2_idx.items()][0:10]

[('U05833', 0),
 ('U06456', 1),
 ('U07807', 2),
 ('U04842', 3),
 ('U08336', 4),
 ('U03753', 5),
 ('U01045', 6),
 ('U00825', 7),
 ('U07438', 8),
 ('U01917', 9)]

In [5]:
resume_2_idx.__len__()

8482

In [6]:
idx_2_resume = { i : res for res, i in resume_2_idx.items()}
[(key, value) for key, value in idx_2_resume.items()][0:10]

[(0, 'U05833'),
 (1, 'U06456'),
 (2, 'U07807'),
 (3, 'U04842'),
 (4, 'U08336'),
 (5, 'U03753'),
 (6, 'U01045'),
 (7, 'U00825'),
 (8, 'U07438'),
 (9, 'U01917')]

In [7]:
idx_2_resume.__len__()

8482

In [8]:
np.savetxt(f'{SAVE_PATH}/idx_2_resume.txt', np.array(pd.DataFrame(idx_2_resume.items()).values), fmt='%s')

### Recruitment 딕셔너리 생성

In [9]:
recruitment_2_idx = { rec : i for i, rec in enumerate(apply['recruitment_seq'].unique())}
[(key, value) for key, value in recruitment_2_idx.items()][0:10]

[('R03838', 0),
 ('R02144', 1),
 ('R01877', 2),
 ('R02463', 3),
 ('R00112', 4),
 ('R05448', 5),
 ('R03428', 6),
 ('R04768', 7),
 ('R02907', 8),
 ('R01028', 9)]

In [10]:
recruitment_2_idx.__len__()

6695

In [11]:
idx_2_recruitment = { i : rec for rec, i in recruitment_2_idx.items()}
[(key, value) for key, value in idx_2_recruitment.items()][0:10]

[(0, 'R03838'),
 (1, 'R02144'),
 (2, 'R01877'),
 (3, 'R02463'),
 (4, 'R00112'),
 (5, 'R05448'),
 (6, 'R03428'),
 (7, 'R04768'),
 (8, 'R02907'),
 (9, 'R01028')]

In [12]:
idx_2_recruitment.__len__()

6695

In [13]:
np.savetxt(f'{SAVE_PATH}/idx_2_recruitment.txt', np.array(pd.DataFrame(idx_2_recruitment.items()).values), fmt='%s')

## idx 인코딩 작업

In [14]:
apply['resume_idx'] = apply['resume_seq'].apply(lambda x : resume_2_idx[x])
apply['recruitment_idx'] = apply['recruitment_seq'].apply(lambda x : recruitment_2_idx[x])

In [15]:
apply.tail()

Unnamed: 0,resume_seq,recruitment_seq,resume_idx,recruitment_idx
57941,U02270,R03430,864,3691
57942,U02640,R04987,4358,6176
57943,U08238,R01342,8370,4461
57944,U01296,R06363,1157,3888
57945,U05748,R03090,5662,896


## 제출용 학습 데이터(원본) 생성

In [16]:
origin = apply.groupby('resume_idx')['recruitment_idx'].agg(list).reset_index()
origin = origin['recruitment_idx'].apply(lambda x: ' '.join(map(str,x))).to_frame().reset_index()
origin = origin.rename(columns={'index': 'resume_idx'})
origin

Unnamed: 0,resume_idx,recruitment_idx
0,0,0 996 6499 5621
1,1,1 1871 1507 1804 4663 5352 5607 3262 5413 2645...
2,2,2 4100 3926 3986 4016 6527 1019
3,3,3 3852 2022 5787 1029 4715
4,4,4 3596 3575
...,...,...
8477,8477,5820 5695
8478,8478,1274 3309
8479,8479,4012 2584
8480,8480,425 4397


* 제출 파일을 만들기 위한 train, test는 같은 파일을 사용합니다.

In [17]:
np.savetxt(f'{SAVE_PATH}/train.txt', origin.values, fmt='%s')
np.savetxt(f'{SAVE_PATH}/test.txt', origin.values, fmt='%s')

# [모델별 학습]

* 각 모델 실행 시 마다 커널을 재시작 해줍니다.

## BSPM

In [18]:
import os
ROOT_PATH = os.getcwd()
RELATIVE_BSPM_PATH = "./BSPM/bspm"

In [19]:
# Root 작업 경로로 변경
os.chdir(ROOT_PATH)

# 현재 작업 경로 출력
print("현재 작업 경로:", os.getcwd())

# 상대 경로를 절대 경로로 변환
absolute_path = os.path.abspath(RELATIVE_BSPM_PATH)

# 작업 경로 변경
os.chdir(absolute_path)

# 변경된 작업 경로 출력
print("변경된 작업 경로:", os.getcwd())

현재 작업 경로: C:\Users\medici\Dacon_Job-Recommendation-System
변경된 작업 경로: C:\Users\medici\Dacon_Job-Recommendation-System\BSPM\bspm


In [20]:
%run main.py --dataset="JOB" --topks="[20]" --simple_model="bspm" --solver_shr="rk4" \
--K_s=1 --T_s=3.5 --final_sharpening=True --idl_beta=0.3 --factor_dim=960


██████╗ ███████╗██████╗ ███╗   ███╗
██╔══██╗██╔════╝██╔══██╗████╗ ████║
██████╔╝███████╗██████╔╝██╔████╔██║
██╔══██╗╚════██║██╔═══╝ ██║╚██╔╝██║
██████╔╝███████║██║     ██║ ╚═╝ ██║
╚═════╝ ╚══════╝╚═╝     ╚═╝     ╚═╝

[0;30;43mCpp extension not loaded[0m
>>SEED: 2020
[0;30;43mloading [../data/JOB][0m
# of user: 8482
# of item: 6695
57946 interactions for training
57946 interactions for testing
JOB Sparsity : 0.0020408195609592974
JOB is ready to go
{'A_n_fold': 100,
 'A_split': False,
 'K_b': 1,
 'K_idl': 1,
 'K_s': 1,
 'T_b': 1,
 'T_idl': 1,
 'T_s': 3.5,
 'bigdata': False,
 'bpr_batch_size': 2048,
 'dataset': 'JOB',
 'decay': 0.0001,
 'device': device(type='cuda', index=0),
 'dropout': 0,
 'factor_dim': 960,
 'final_sharpening': True,
 'idl_beta': 0.3,
 'keep_prob': 0.6,
 'latent_dim_rec': 64,
 'lightGCN_n_layers': 3,
 'lr': 0.001,
 'multicore': 0,
 'pretrain': 0,
 'sharpening_off': False,
 'solver_blr': 'euler',
 'solver_idl': 'euler',
 'solver_shr': 'rk4',
 't_point_combination'

## LT-OCF

In [None]:
os._exit(00)

In [1]:
import os
ROOT_PATH = os.getcwd()
RELATIVE_LTOCF_PATH = "./LT-OCF/code"

In [2]:
# Root 작업 경로로 변경
os.chdir(ROOT_PATH)

# 현재 작업 경로 출력
print("현재 작업 경로:", os.getcwd())

# 상대 경로를 절대 경로로 변환
absolute_path = os.path.abspath(RELATIVE_LTOCF_PATH)

# 작업 경로 변경
os.chdir(absolute_path)

# 변경된 작업 경로 출력
print("변경된 작업 경로:", os.getcwd())

현재 작업 경로: C:\Users\medici\Dacon_Job-Recommendation-System
변경된 작업 경로: C:\Users\medici\Dacon_Job-Recommendation-System\LT-OCF\code


In [3]:
%run main.py --dataset="JOB" --model="ltocf" --solver="rk4" --adjoint=False \
--K=4 --learnable_time=False --dual_res=False --lr=1e-3 --lr_time=1e-3 \
--decay=1e-4 --topks="[20]" --tensorboard=1 --gpuid=0 \
--epochs=320 --layer=2 --recdim=360 --bpr_batch=2048 --pretrain=0

현재 사용 중인 GPU 디바이스: NVIDIA GeForce RTX 2070

██╗  ████████╗    ██████╗  ██████╗███████╗
██║  ╚══██╔══╝   ██╔═══██╗██╔════╝██╔════╝
██║     ██║█████╗██║   ██║██║     █████╗  
██║     ██║╚════╝██║   ██║██║     ██╔══╝  
███████╗██║      ╚██████╔╝╚██████╗██║     
╚══════╝╚═╝       ╚═════╝  ╚═════╝╚═╝     

Current cuda device  0
>>SEED: 2020
[0;30;43mloading [../data/JOB][0m
57946 interactions for training
57946 interactions for testing
JOB Sparsity : 0.0020408195609592974
JOB is ready to go
{'A_n_fold': 100,
 'A_split': False,
 'K': 4.0,
 'bigdata': False,
 'bpr_batch_size': 2048,
 'decay': 0.0001,
 'dropout': 0,
 'dual_res': False,
 'keep_prob': 0.6,
 'latent_dim_rec': 360,
 'learnable_time': False,
 'lightGCN_n_layers': 2,
 'lr': 0.001,
 'lr_time': 0.001,
 'multicore': 0,
 'pretrain': 0,
 'pretrained_file_name': 'ltocf',
 'solver': 'rk4',
 'test_u_batch_size': 512,
 'time_split': 4}
cores for test: 6
comment: lt-ncf
tensorboard: 1
LOAD: 0
Weight path: ./checkpoints
Test Topks: [20]
usi

## CF

In [None]:
os._exit(0)

In [1]:
import os
ROOT_PATH = os.getcwd()
RELATIVE_SSCF_PATH = "./SSCF/sscf"

In [2]:
# Root 작업 경로로 변경
os.chdir(ROOT_PATH)

# 현재 작업 경로 출력
print("현재 작업 경로:", os.getcwd())

# 상대 경로를 절대 경로로 변환
absolute_path = os.path.abspath(RELATIVE_SSCF_PATH)

# 작업 경로 변경
os.chdir(absolute_path)

# 변경된 작업 경로 출력
print("변경된 작업 경로:", os.getcwd())

현재 작업 경로: C:\Users\medici\Dacon_Job-Recommendation-System
변경된 작업 경로: C:\Users\medici\Dacon_Job-Recommendation-System\SSCF\sscf


In [3]:
%run main.py --dataset="JOB" --test="test" --gamma=0.2 --similarity="pearson"


[gamma] '0.2', [similarity] 'pearson'

reading data...
measuring similarity of users...
measuring user-based recommendations...
measuring similarity of items...
measuring item-based recommendations...
measuring final model recommendations...
measuring performance of final model...

[final model]
recall@20: 0.00000

--------------------------------------------------------------------------------
Total Time: 21.39 seconds


# [앙상블]

In [None]:
import os
os._exit(0)

In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, StandardScaler

## File load

In [3]:
LTOCF_FOLDER_PATH = './LT-OCF/results/'
BSPM_FOLDER_PATH = './BSPM/results/'
SSCF_FOLDER_PATH = './SSCF/results/'

In [4]:
LTOCF_FILE_NAME = 'JOB_360_0.0030__319'
BSPM_FILE_NAME = 'JOB_960'

### LT-OCF

In [5]:
lt_proba = pd.read_csv(f'{LTOCF_FOLDER_PATH}proba_{LTOCF_FILE_NAME}.txt', delimiter=' ', header=None)
lt_rating_idx= pd.read_csv(f'{LTOCF_FOLDER_PATH}rating_{LTOCF_FILE_NAME}.txt', delimiter=' ', header=None)
lt_user = pd.read_csv(f'{LTOCF_FOLDER_PATH}user_{LTOCF_FILE_NAME}.txt', delimiter=' ', header=None)

### BSPM

In [6]:
bspm_proba = pd.read_csv(f'{BSPM_FOLDER_PATH}proba_{BSPM_FILE_NAME}.txt', delimiter=' ', header=None)
bspm_rating_idx= pd.read_csv(f'{BSPM_FOLDER_PATH}rating_{BSPM_FILE_NAME}.txt', delimiter=' ', header=None)
bspm_user = pd.read_csv(f'{BSPM_FOLDER_PATH}user_{BSPM_FILE_NAME}.txt', delimiter=' ', header=None)

### SSCF

In [7]:
sscf_proba = pd.read_csv(f'{SSCF_FOLDER_PATH}pred_prob_list_20.txt', delimiter=' ', header=None)
sscf_rating_idx= pd.read_csv(f'{SSCF_FOLDER_PATH}pred_idx_list_20.txt', delimiter=' ', header=None)

## LT-OCF DF 생성

In [8]:
lt_proba_df = pd.concat([lt_user, lt_proba], axis=1)
lt_proba_df.head()

Unnamed: 0,0,0.1,1,2,3,4,5,6,7,8,...,10,11,12,13,14,15,16,17,18,19
0,0,0.999983,0.999955,0.999934,0.999892,0.999884,0.999884,0.999808,0.999793,0.999778,...,0.999753,0.999709,0.999684,0.999586,0.999557,0.999508,0.999494,0.999261,0.999212,0.999158
1,1,0.999989,0.999978,0.999975,0.99997,0.999968,0.999954,0.99995,0.999945,0.999937,...,0.999923,0.999913,0.99991,0.999898,0.999894,0.999893,0.999893,0.999891,0.999888,0.999884
2,2,0.999956,0.999931,0.999911,0.999906,0.999897,0.99986,0.999852,0.999849,0.999827,...,0.999756,0.99974,0.999729,0.999714,0.999699,0.99967,0.99964,0.999637,0.999632,0.999625
3,3,0.999982,0.999981,0.999965,0.999957,0.999927,0.999923,0.999915,0.999915,0.999885,...,0.999841,0.99983,0.999816,0.999768,0.999749,0.999735,0.999728,0.99971,0.999682,0.999668
4,4,0.999793,0.999759,0.999745,0.999717,0.999625,0.999623,0.999466,0.999358,0.999341,...,0.99919,0.999189,0.99902,0.998515,0.998202,0.998198,0.99803,0.998006,0.997997,0.99795


In [9]:
lt_idx_df = pd.concat([lt_user, lt_rating_idx], axis=1)
lt_idx_df.head()

Unnamed: 0,0,0.1,1,2,3,4,5,6,7,8,...,10,11,12,13,14,15,16,17,18,19
0,0,357,156,1965,1030,2832,5164,4675,4670,6228,...,3195,4050,1051,5279,5358,161,2502,3641,5598,5450
1,1,419,3275,1747,2688,961,2924,5060,1660,3370,...,2574,1746,4350,2881,5096,6250,1749,2650,2554,643
2,2,480,5791,1454,6060,716,3543,4932,4726,5093,...,6213,5851,1790,4745,2895,5260,5226,4475,1672,4244
3,3,3575,534,2219,870,800,1188,5210,2739,3364,...,2390,6163,2264,434,1919,2072,1946,2475,841,913
4,4,5085,5229,3691,3637,1188,2428,4782,1094,1986,...,2893,825,1001,800,5329,2857,3074,594,2934,1385


## BSPM DF 생성

In [10]:
bspm_proba_df = pd.concat([bspm_user, bspm_proba], axis=1)
bspm_proba_df.head()

Unnamed: 0,0,0.1,1,2,3,4,5,6,7,8,...,10,11,12,13,14,15,16,17,18,19
0,0,0.064493,0.051239,0.050869,0.046182,0.040701,0.036274,0.034129,0.033497,0.032029,...,0.030471,0.030239,0.029476,0.029303,0.02788,0.02716,0.025285,0.024519,0.024484,0.023947
1,1,0.080086,0.075567,0.073841,0.072431,0.070084,0.065038,0.063998,0.063467,0.062092,...,0.060555,0.059121,0.058809,0.058299,0.057959,0.057243,0.055875,0.055593,0.053641,0.052832
2,2,0.063191,0.047769,0.041074,0.040986,0.03847,0.038455,0.037921,0.036122,0.035649,...,0.033184,0.031998,0.031256,0.030993,0.030105,0.029818,0.029745,0.029692,0.029385,0.029359
3,3,0.162731,0.153145,0.101066,0.094658,0.093844,0.079816,0.076262,0.07608,0.066872,...,0.06355,0.063116,0.061223,0.05888,0.057382,0.057093,0.056032,0.054265,0.054,0.052819
4,4,0.041308,0.039246,0.037293,0.034734,0.034231,0.029262,0.029196,0.029115,0.029075,...,0.027709,0.027645,0.027118,0.026612,0.025969,0.025766,0.025761,0.025566,0.025356,0.023943


In [11]:
bspm_idx_df = pd.concat([bspm_user, bspm_rating_idx], axis=1)
bspm_idx_df.head()

Unnamed: 0,0,0.1,1,2,3,4,5,6,7,8,...,10,11,12,13,14,15,16,17,18,19
0,0,357,2832,1407,156,1403,5164,2576,4704,3349,...,161,1965,3641,4670,6226,5279,1653,2502,6228,3374
1,1,3275,1709,2396,419,643,2688,3739,3230,2138,...,961,1747,1725,1660,3097,5060,3015,1068,1508,3779
2,2,1454,1790,5791,5260,4726,2147,5246,716,3543,...,5377,1447,5239,6398,5093,2512,4244,2834,1000,4855
3,3,2219,2739,3575,4258,3364,6276,1188,434,6163,...,5308,870,1094,3691,4434,800,5210,5137,2072,2475
4,4,1029,5085,3691,5229,3637,4782,5787,5329,1188,...,2739,3852,783,4652,870,825,1094,2264,594,2893


## LT-OCF, SSCF, BSPM 중복 비교

In [12]:
# 각 행별로 컬럼의 원소를 set에 저장
lt_row_sets = [set(row) for _, row in lt_rating_idx.iloc[:, 1:].iterrows()]
sscf_row_sets = [set(row) for _, row in sscf_rating_idx.iloc[:, 1:].iterrows()]
bspm_row_sets = [set(row) for _, row in bspm_rating_idx.iloc[:, 1:].iterrows()]

### LT-OCF / SSCF

In [13]:
data = []
for i in range(len(lt_row_sets)):
    inter = lt_row_sets[i].intersection(sscf_row_sets[i])
    # print("inter:", f"count:{len(inter)}", inter)
    row = {}
    row['count'] = len(inter)
    row['intersection'] = inter
    data.append(row)
intersections = pd.DataFrame(data)
intersections.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
count,8482.0,,,,10.537963,2.531031,1.0,9.0,11.0,12.0,18.0
intersection,8482.0,8465.0,"{929, 2595, 323, 4101, 646, 88, 10, 3187, 54, ...",2.0,,,,,,,


### LT-OCF / BSPM

In [14]:
data = []
for i in range(len(lt_row_sets)):
    inter = lt_row_sets[i].intersection(bspm_row_sets[i])
    # print("inter:", f"count:{len(inter)}", inter)
    row = {}
    row['count'] = len(inter)
    row['intersection'] = inter
    data.append(row)
intersections = pd.DataFrame(data)
intersections.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
count,8482.0,,,,9.885169,2.524217,0.0,8.0,10.0,12.0,18.0
intersection,8482.0,8465.0,"{3269, 4473, 3179, 5932, 1771, 2606, 3566, 691...",2.0,,,,,,,


### SSCF / BSPM

In [15]:
data = []
for i in range(len(sscf_row_sets)):
    inter = sscf_row_sets[i].intersection(bspm_row_sets[i])
    # print("inter:", f"count:{len(inter)}", inter)
    row = {}
    row['count'] = len(inter)
    row['intersection'] = inter
    data.append(row)
intersections = pd.DataFrame(data)
intersections.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
count,8482.0,,,,12.272459,2.503702,1.0,11.0,12.0,14.0,19.0
intersection,8482.0,8461.0,"{5152, 1600, 5768, 5211, 2507, 4748, 3884, 571...",2.0,,,,,,,


## Score DF 생성

### LT-OCF

In [16]:
num_rows, num_cols = lt_proba_df.shape
data = []
for r in range(num_rows):
    row = {}
    for c in range(num_cols):
        if c==0:
            row['resume_idx'] = lt_proba_df.iloc[r,c]
        else:
            row['recruitment_idx'] = lt_idx_df.iloc[r,c]        
            row['lt_score'] = lt_proba_df.iloc[r,c]
            data.append(row.copy())
lt_score = pd.DataFrame(data)
lt_score.head()

Unnamed: 0,resume_idx,recruitment_idx,lt_score
0,0,357,0.999983
1,0,156,0.999955
2,0,1965,0.999934
3,0,1030,0.999892
4,0,2832,0.999884


In [17]:
lt_score.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169640 entries, 0 to 169639
Data columns (total 3 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   resume_idx       169640 non-null  int64  
 1   recruitment_idx  169640 non-null  int64  
 2   lt_score         169640 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 3.9 MB


### BSPM

In [18]:
num_rows, num_cols = bspm_proba_df.shape
data = []
for r in range(num_rows):
    row = {}
    for c in range(num_cols):
        if c==0:
            row['resume_idx'] = bspm_proba_df.iloc[r,c]
        else:
            row['recruitment_idx'] = bspm_idx_df.iloc[r,c]        
            row['bspm_score'] = bspm_proba_df.iloc[r,c]
            data.append(row.copy())
bspm_score = pd.DataFrame(data)
bspm_score.head()

Unnamed: 0,resume_idx,recruitment_idx,bspm_score
0,0,357,0.064493
1,0,2832,0.051239
2,0,1407,0.050869
3,0,156,0.046182
4,0,1403,0.040701


In [19]:
bspm_score.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169640 entries, 0 to 169639
Data columns (total 3 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   resume_idx       169640 non-null  int64  
 1   recruitment_idx  169640 non-null  int64  
 2   bspm_score       169640 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 3.9 MB


### SSCF

In [20]:
num_rows, num_cols = sscf_proba.shape
data = []
for r in range(num_rows):
    row = {}
    for c in range(num_cols):
        if c==0:
            row['resume_idx'] = sscf_proba.iloc[r,c]
        else:
            row['recruitment_idx'] = sscf_rating_idx.iloc[r,c]        
            row['sscf_score'] = sscf_proba.iloc[r,c]
            data.append(row.copy())
sscf_score = pd.DataFrame(data)
sscf_score.head()

Unnamed: 0,resume_idx,recruitment_idx,sscf_score
0,0,357,1.0
1,0,2832,0.838635
2,0,156,0.731138
3,0,5164,0.700984
4,0,4670,0.672044


In [21]:
sscf_score.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169640 entries, 0 to 169639
Data columns (total 3 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   resume_idx       169640 non-null  int64  
 1   recruitment_idx  169640 non-null  int64  
 2   sscf_score       169640 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 3.9 MB


## 점수 스케일링(MinMax)

### LT-OCF

In [22]:
mms = MinMaxScaler()

In [23]:
lt_score['scaled_lt_score'] = lt_score.groupby('resume_idx')['lt_score'].transform(
    lambda x: (mms.fit_transform(x.values.reshape(-1,1))).reshape(-1))

In [24]:
lt_score.head(20)

Unnamed: 0,resume_idx,recruitment_idx,lt_score,scaled_lt_score
0,0,357,0.999983,1.0
1,0,156,0.999955,0.965208
2,0,1965,0.999934,0.940382
3,0,1030,0.999892,0.888842
4,0,2832,0.999884,0.880195
5,0,5164,0.999884,0.879348
6,0,4675,0.999808,0.786996
7,0,4670,0.999793,0.768952
8,0,6228,0.999778,0.751344
9,0,5483,0.999766,0.736328


### BSPM

In [25]:
bspm_score['scaled_bspm_score'] = bspm_score.groupby('resume_idx')['bspm_score'].transform(
    lambda x: (mms.fit_transform(x.values.reshape(-1,1))).reshape(-1))

In [26]:
bspm_score.head(20)

Unnamed: 0,resume_idx,recruitment_idx,bspm_score,scaled_bspm_score
0,0,357,0.064493,1.0
1,0,2832,0.051239,0.673108
2,0,1407,0.050869,0.663973
3,0,156,0.046182,0.548383
4,0,1403,0.040701,0.413211
5,0,5164,0.036274,0.304027
6,0,2576,0.034129,0.251131
7,0,4704,0.033497,0.235529
8,0,3349,0.032029,0.199326
9,0,1541,0.031028,0.174656


### SSCF

In [27]:
sscf_score['scaled_sscf_score'] = sscf_score.groupby('resume_idx')['sscf_score'].transform(
    lambda x: (mms.fit_transform(x.values.reshape(-1,1))).reshape(-1))

In [28]:
sscf_score.head(20)

Unnamed: 0,resume_idx,recruitment_idx,sscf_score,scaled_sscf_score
0,0,357,1.0,1.0
1,0,2832,0.838635,0.729852
2,0,156,0.731138,0.549886
3,0,5164,0.700984,0.499404
4,0,4670,0.672044,0.450954
5,0,6228,0.596687,0.324796
6,0,3641,0.567364,0.275706
7,0,4675,0.560605,0.264389
8,0,1965,0.547928,0.243166
9,0,3349,0.529323,0.212019


## merged_score

In [40]:
merged_score = pd.merge(lt_score, sscf_score, on=["resume_idx", "recruitment_idx"], how="outer")
merged_score = pd.merge(merged_score, bspm_score, on=["resume_idx", "recruitment_idx"], how="outer")
merged_score = merged_score.fillna(0)
merged_score = merged_score.sort_values('resume_idx').reset_index(drop=True)
merged_score.head(40)

Unnamed: 0,resume_idx,recruitment_idx,lt_score,scaled_lt_score,sscf_score,scaled_sscf_score,bspm_score,scaled_bspm_score
0,0,357,0.999983,1.0,1.0,1.0,0.064493,1.0
1,0,5450,0.999158,0.0,0.447291,0.074686,0.0,0.0
2,0,1403,0.0,0.0,0.0,0.0,0.040701,0.413211
3,0,3349,0.0,0.0,0.529323,0.212019,0.032029,0.199326
4,0,5097,0.0,0.0,0.518098,0.193227,0.0,0.0
5,0,2576,0.0,0.0,0.508563,0.177263,0.034129,0.251131
6,0,2415,0.0,0.0,0.505786,0.172614,0.0,0.0
7,0,5598,0.999212,0.066399,0.0,0.0,0.0,0.0
8,0,974,0.0,0.0,0.453572,0.085201,0.0,0.0
9,0,4415,0.0,0.0,0.410012,0.012276,0.0,0.0


In [42]:
merged_score[merged_score['resume_idx']==0].shape

(34, 8)

## idx 디코딩

### 디코딩 딕셔너리 로드

In [30]:
DATA_PATH = './data/'

In [31]:
resume_decode = pd.read_csv(f"{DATA_PATH}idx_2_resume.txt", delimiter=" ", header=None)
resume_decode.head()

Unnamed: 0,0,1
0,0,U05833
1,1,U06456
2,2,U07807
3,3,U04842
4,4,U08336


In [32]:
resume_decode = resume_decode.set_index(0).to_dict()[1]

In [33]:
recruitment_decode = pd.read_csv(f"{DATA_PATH}/idx_2_recruitment.txt", delimiter=" ", header=None)
recruitment_decode.head()

Unnamed: 0,0,1
0,0,R03838
1,1,R02144
2,2,R01877
3,3,R02463
4,4,R00112


In [34]:
recruitment_decode = recruitment_decode.set_index(0).to_dict()[1]

### 모델스코어 디코딩

In [35]:
merged_score['resume_seq'] = merged_score['resume_idx'].apply(lambda x : resume_decode[x])
merged_score.head()

Unnamed: 0,resume_idx,recruitment_idx,lt_score,scaled_lt_score,sscf_score,scaled_sscf_score,bspm_score,scaled_bspm_score,resume_seq
0,0,357,0.999983,1.0,1.0,1.0,0.064493,1.0,U05833
1,0,5450,0.999158,0.0,0.447291,0.074686,0.0,0.0,U05833
2,0,1403,0.0,0.0,0.0,0.0,0.040701,0.413211,U05833
3,0,3349,0.0,0.0,0.529323,0.212019,0.032029,0.199326,U05833
4,0,5097,0.0,0.0,0.518098,0.193227,0.0,0.0,U05833


In [36]:
merged_score['recruitment_seq'] = merged_score['recruitment_idx'].apply(lambda x : recruitment_decode[x])
merged_score.head()

Unnamed: 0,resume_idx,recruitment_idx,lt_score,scaled_lt_score,sscf_score,scaled_sscf_score,bspm_score,scaled_bspm_score,resume_seq,recruitment_seq
0,0,357,0.999983,1.0,1.0,1.0,0.064493,1.0,U05833,R00585
1,0,5450,0.999158,0.0,0.447291,0.074686,0.0,0.0,U05833,R03651
2,0,1403,0.0,0.0,0.0,0.0,0.040701,0.413211,U05833,R02439
3,0,3349,0.0,0.0,0.529323,0.212019,0.032029,0.199326,U05833,R00353
4,0,5097,0.0,0.0,0.518098,0.193227,0.0,0.0,U05833,R01382


## 비율별 Ensemble 및 저장

### 비율 탐색

In [37]:
for i in range(0,100,10):
    for j in range(0, 101-i, 10):
        left = i / 100
        mid = j / 100
        right = (100 - i - j) / 100
        print(f"{left:.2f}, {mid:.2f}, {right:.2f}")

0.00, 0.00, 1.00
0.00, 0.10, 0.90
0.00, 0.20, 0.80
0.00, 0.30, 0.70
0.00, 0.40, 0.60
0.00, 0.50, 0.50
0.00, 0.60, 0.40
0.00, 0.70, 0.30
0.00, 0.80, 0.20
0.00, 0.90, 0.10
0.00, 1.00, 0.00
0.10, 0.00, 0.90
0.10, 0.10, 0.80
0.10, 0.20, 0.70
0.10, 0.30, 0.60
0.10, 0.40, 0.50
0.10, 0.50, 0.40
0.10, 0.60, 0.30
0.10, 0.70, 0.20
0.10, 0.80, 0.10
0.10, 0.90, 0.00
0.20, 0.00, 0.80
0.20, 0.10, 0.70
0.20, 0.20, 0.60
0.20, 0.30, 0.50
0.20, 0.40, 0.40
0.20, 0.50, 0.30
0.20, 0.60, 0.20
0.20, 0.70, 0.10
0.20, 0.80, 0.00
0.30, 0.00, 0.70
0.30, 0.10, 0.60
0.30, 0.20, 0.50
0.30, 0.30, 0.40
0.30, 0.40, 0.30
0.30, 0.50, 0.20
0.30, 0.60, 0.10
0.30, 0.70, 0.00
0.40, 0.00, 0.60
0.40, 0.10, 0.50
0.40, 0.20, 0.40
0.40, 0.30, 0.30
0.40, 0.40, 0.20
0.40, 0.50, 0.10
0.40, 0.60, 0.00
0.50, 0.00, 0.50
0.50, 0.10, 0.40
0.50, 0.20, 0.30
0.50, 0.30, 0.20
0.50, 0.40, 0.10
0.50, 0.50, 0.00
0.60, 0.00, 0.40
0.60, 0.10, 0.30
0.60, 0.20, 0.20
0.60, 0.30, 0.10
0.60, 0.40, 0.00
0.70, 0.00, 0.30
0.70, 0.10, 0.20
0.70, 0.20, 0.

### 최적의 비율(LTOCF:SSCF:BSPM=61:36:3)

In [None]:
SAVE_PATH = './results/'

In [39]:
left = 0.61
mid = 0.36
right = 0.03

submission_dfs = []

submission_score = merged_score.copy()
submission_score['sum'] = submission_score['scaled_lt_score']*left +\
                        submission_score['scaled_sscf_score']*mid +\
                        submission_score['scaled_bspm_score']*right
submission_score = submission_score[['resume_seq', 'recruitment_seq', 'sum']]
submission_score_5 = submission_score.groupby(['resume_seq']).apply(lambda group: group.nlargest(5, 'sum'))
submission_score_5[['resume_seq', 'recruitment_seq']].to_csv(
    f"{SAVE_PATH}[Ensemble]L{left:.2f}_S{mid:.2f}_B{right:.2f}.csv", index=False)
submission_dfs.append(submission_score_5)