In [1]:
# !pip install --upgrade git+https://github.com/tooha289/DataAnalysisLibrary.git

In [2]:
from tqdm import tqdm
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from scipy.stats import chi2_contingency
from DataAnalysis import eda

from matplotlib import font_manager

In [3]:
# Let pandas render up to 500 rows before truncating displayed frames.
pd.options.display.max_rows = 500

In [4]:
# Register the font stored in malgun.ttf so Korean text renders in figures.
# The file only exists on Windows installations, so keep matplotlib's current
# font family (and say so) instead of raising when it is missing.
font_path = "C:/Windows/Fonts/malgun.ttf"
if os.path.exists(font_path):
    font_family = font_manager.FontProperties(fname=font_path).get_name()
    plt.rcParams["font.family"] = font_family
else:
    font_family = plt.rcParams["font.family"]
    print(f"Font file not found, keeping the default font family: {font_path}")
# Draw minus signs with the ASCII hyphen, which every font provides.
plt.rcParams["axes.unicode_minus"] = False

In [5]:
# Create the output folder when it is missing and report which case applied.
RECALL_PATH = './recalls/'
for path in (RECALL_PATH,):
    if os.path.exists(path):
        print(f"폴더가 이미 존재합니다: {path}")
        continue
    os.makedirs(path)
    print(f"폴더가 생성되었습니다: {path}")

폴더가 이미 존재합니다: ./recalls/


# 데이터 load

In [6]:
# Load the three raw CSV tables from ../data; `apply` is read from apply_train.csv.
resume = pd.read_csv("../data/resume.csv")
recruitment = pd.read_csv("../data/recruitment.csv")
apply = pd.read_csv("../data/apply_train.csv")

## train_test 분리

In [7]:
# Leave-last-out split: for every resume, the final recruitment in its grouped
# list becomes the test pair and all earlier ones become training pairs.
train, test = [], []
apply_train_groupby = apply.groupby('resume_seq')['recruitment_seq'].apply(list)
resume_ids = apply_train_groupby.index.tolist()
applied_lists = apply_train_groupby.values.tolist()
for uid, iids in zip(resume_ids, applied_lists):
    train.extend([uid, iid] for iid in iids[:-1])
    test.append([uid, iids[-1]])

In [8]:
# Turn the [resume_seq, recruitment_seq] pair lists into DataFrames.
# NOTE: this rebinds `apply` to the training pairs only, so re-running the
# split cell afterwards would split the already-reduced data again.
apply = pd.DataFrame(train, columns=['resume_seq', 'recruitment_seq'])
test = pd.DataFrame(test, columns=['resume_seq', 'recruitment_seq'])

# 전처리

In [9]:
# Keep only the columns used by the weighting analysis. Explicit copies are
# taken because later cells assign columns back into these frames; assigning
# into a column slice of the original frame raises SettingWithCopyWarning.
used_cols = ['resume_seq', 'degree', 'job_code_seq1']
resume = resume[used_cols].copy()

used_cols = ['recruitment_seq', 'address_seq1', 'education', 'major_task', 'qualifications']
recruitment = recruitment[used_cols].copy()

## address_seq1의 na값을 3으로 처리

In [10]:
# Inspect the distinct address_seq1 values (NaN included) before imputing.
recruitment['address_seq1'].unique()

array([ 3., 20.,  5.,  9., 11.,  1., nan, 13.,  2.])

In [11]:
# Replace missing address_seq1 values with 3; no other column is filled.
recruitment = recruitment.fillna({'address_seq1':3}, axis=0)

## 타입 변경

In [12]:
# Cast the descriptive columns of both tables to pandas' category dtype.
res_categorical_cols = ['degree', 'job_code_seq1']
rec_categorical_cols = ['address_seq1', 'education', 'major_task', 'qualifications']
for frame, cols in ((resume, res_categorical_cols), (recruitment, rec_categorical_cols)):
    frame[cols] = frame[cols].astype('category')

In [13]:
# Check dtypes and non-null counts after the category conversion.
recruitment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6695 entries, 0 to 6694
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   recruitment_seq  6695 non-null   object  
 1   address_seq1     6695 non-null   category
 2   education        6695 non-null   category
 3   major_task       6695 non-null   category
 4   qualifications   6695 non-null   category
dtypes: category(4), object(1)
memory usage: 79.6+ KB


## 데이터 병합

In [14]:
# Join resume attributes onto the training applications (shared key: resume_seq).
merged_df = pd.merge(resume, apply, how='inner', on='resume_seq')
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49464 entries, 0 to 49463
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   resume_seq       49464 non-null  object  
 1   degree           49464 non-null  category
 2   job_code_seq1    49464 non-null  category
 3   recruitment_seq  49464 non-null  object  
dtypes: category(2), object(2)
memory usage: 1.2+ MB


In [15]:
# Add the recruitment attributes to every application (shared key: recruitment_seq).
merged_df = pd.merge(merged_df, recruitment, how='inner', on='recruitment_seq')
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49464 entries, 0 to 49463
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   resume_seq       49464 non-null  object  
 1   degree           49464 non-null  category
 2   job_code_seq1    49464 non-null  category
 3   recruitment_seq  49464 non-null  object  
 4   address_seq1     49464 non-null  category
 5   education        49464 non-null  category
 6   major_task       49464 non-null  category
 7   qualifications   49464 non-null  category
dtypes: category(6), object(2)
memory usage: 1.4+ MB


## 함수 정의

In [16]:
# Execute the project's helper scripts with %run so the names they define
# become available to the cells that follow.
# eda function
%run ../functions/eda.py
# metric function
%run ../functions/metric.py
# submission function
%run ../functions/submission.py

# 인기도 가중치

In [17]:
# Shared scaler instances. `mms` is refit on each group it is applied to in
# later cells; `stds` is not referenced in the visible part of the notebook.
mms = MinMaxScaler()
stds = StandardScaler()

# 모델 결과 생성

## idx 디코딩

### 디코딩 딕셔너리 로드

In [18]:
# Directory that holds the idx_2_resume / idx_2_recruitment lookup files.
DATA_PATH = '../data/'

In [19]:
# Lookup dict built from the header-less, space-delimited file:
# first column (key) -> second column (value).
resume_decode = (
    pd.read_csv(f"{DATA_PATH}idx_2_resume.txt", delimiter=" ", header=None)
    .set_index(0)[1]
    .to_dict()
)

In [20]:
# Lookup dict built from the header-less, space-delimited file:
# first column (key) -> second column (value).
# DATA_PATH already ends with a separator, so none is added here; this keeps
# the path consistent with the resume lookup and avoids a doubled "//".
recruitment_decode = pd.read_csv(f"{DATA_PATH}idx_2_recruitment.txt", delimiter=" ", header=None)
recruitment_decode = recruitment_decode.set_index(0).to_dict()[1]

## CF 모델

In [21]:
# Dataset tag that is interpolated into the result file names read and written later.
DATA_SET = "JOB_1"

In [22]:
# Read the CF model's result files for DATA_SET (space-delimited, no header).
# Judging by the file names these hold predicted scores and the matching
# item indices -- confirm against the CF export code.
CF_FOLDER_PATH = '../CF/results/'
cf_proba = pd.read_csv(f'{CF_FOLDER_PATH}pred_prob_{DATA_SET}.txt', delimiter=' ', header=None)
cf_rating_idx = pd.read_csv(f'{CF_FOLDER_PATH}pred_idx_{DATA_SET}.txt', delimiter=' ', header=None)

In [23]:
# Build the score table with the project helper; the displayed result has
# resume_idx, recruitment_idx and cf_score columns.
cf_score = create_score_dataframe(cf_proba, cf_rating_idx, "cf")
cf_score.head(20)

Unnamed: 0,resume_idx,recruitment_idx,cf_score
0,0,156,1.0
1,0,4670,0.744136
2,0,3349,0.674452
3,0,2576,0.649263
4,0,4675,0.578998
5,0,5450,0.55209
6,0,5164,0.529899
7,0,5097,0.505583
8,0,2756,0.482113
9,0,6228,0.47724


In [24]:
# Build the CF submission frame with the project helper, passing both lookup
# dicts (presumably used to decode indices into resume_seq / recruitment_seq ids).
sbm_cf = create_submission(cf_rating_idx, resume_decode, recruitment_decode)

In [25]:
# Score the CF submission against the held-out pairs (presumably recall@5 -- see the helper).
recall5_dacon(test, sbm_cf)

0.12827163404857345

## LTOCF 모델

In [26]:
# Folder and shared file-name stem of the LT-OCF result files.
LTOCF_FOLDER_PATH = '../LT-OCF/results/'
LTOCF_FILE_NAME = 'JOB_1_360_0.0031__319_0.3145'

In [28]:
# Read the three LT-OCF result files (proba_, rating_, user_ prefixes);
# all are space-delimited without a header row.
lt_proba = pd.read_csv(f'{LTOCF_FOLDER_PATH}proba_{LTOCF_FILE_NAME}.txt', delimiter=' ', header=None)
lt_rating_idx= pd.read_csv(f'{LTOCF_FOLDER_PATH}rating_{LTOCF_FILE_NAME}.txt', delimiter=' ', header=None)
lt_user = pd.read_csv(f'{LTOCF_FOLDER_PATH}user_{LTOCF_FILE_NAME}.txt', delimiter=' ', header=None)

In [29]:
# Put the user column(s) in front of both tables via a column-wise concat.
lt_proba_df = pd.concat([lt_user, lt_proba], axis=1)
lt_idx_df = pd.concat([lt_user, lt_rating_idx], axis=1)

In [30]:
# Build the score table with the project helper; the displayed result has
# resume_idx, recruitment_idx and ltocf_score columns.
ltocf_score = create_score_dataframe(lt_proba_df, lt_idx_df, "ltocf")
ltocf_score.head()

Unnamed: 0,resume_idx,recruitment_idx,ltocf_score
0,0,156,0.999877
1,0,3195,0.999227
2,0,1051,0.998012
3,0,4670,0.997515
4,0,2576,0.997507


In [31]:
# Relabel the columns 0..n-1. Both concatenated frames were read with
# header=None, so their integer labels presumably overlap after the concat.
lt_idx_df.columns = range(lt_idx_df.shape[1])

In [32]:
# Build the LT-OCF submission frame with the same helper and lookup dicts as for CF.
sbm_ltocf = create_submission(lt_idx_df, resume_decode, recruitment_decode)

In [33]:
# Display the submission ordered by resume_seq (sbm_ltocf itself is not reassigned).
sbm_ltocf.sort_values("resume_seq")

Unnamed: 0,resume_seq,recruitment_seq
25778,U00001,R03777
332,U00001,R01528
8814,U00001,R03811
34260,U00001,R02064
17296,U00001,R02400
...,...,...
41920,U08482,R02524
16474,U08482,R00712
24956,U08482,R01186
7992,U08482,R04602


In [34]:
# Score the LT-OCF submission against the held-out pairs.
recall5_dacon(test, sbm_ltocf)

0.12968639471822682

# 모델 결과 + 인기도 가중치

In [42]:
def create_weight_df(origin_df, popularity_df, target_cols, grouping_col, limit_ratio=0.005):
    """Attach one ``<target_col>_weight`` column per target column to a copy of ``popularity_df``.

    For every name in ``target_cols`` the project helper
    ``calculate_ratio_df_with_hue`` is called on ``origin_df``. The second table
    it returns is stripped of its last row and last column (presumably margin
    totals -- confirm against the helper), reshaped to long format and
    inner-joined onto the running result on ``[grouping_col, target_col]``.

    Args:
        origin_df: Frame forwarded to ``calculate_ratio_df_with_hue``.
        popularity_df: Frame to enrich; a copy is made, the input is not modified.
        target_cols: Iterable of column names to derive weights for.
        grouping_col: Column name used as the melt's ``var_name`` and as a join key.
        limit_ratio: Forwarded unchanged to ``calculate_ratio_df_with_hue``.

    Returns:
        Tuple ``(enriched, weight_tables)``: the enriched copy, and the list of
        long-format weight tables in the order of ``target_cols``.

    Note:
        The join is an inner join, so rows whose ``(grouping_col, target_col)``
        pair has no weight entry are dropped from the result.
    """
    enriched = popularity_df.copy()
    weight_tables = []
    for target_col in target_cols:
        weight_name = f'{target_col}_weight'
        _, hue_ratio = calculate_ratio_df_with_hue(origin_df, target_col, grouping_col, limit_ratio)

        # Drop the trailing row/column, then go wide -> long.
        trimmed = hue_ratio.iloc[:-1, :-1]
        long_weights = trimmed.reset_index().melt(
            id_vars=target_col, var_name=grouping_col, value_name=weight_name)
        weight_tables.append(long_weights)

        enriched = enriched.merge(long_weights, how='inner', on=[grouping_col, target_col])
    return enriched, weight_tables

## CF 인기도 가중치 적용

In [35]:
# Translate the integer indices into their ids through the lookup dicts
# (an index missing from a dict raises KeyError).
cf_score['resume_seq'] = cf_score['resume_idx'].map(resume_decode.__getitem__)
cf_score['recruitment_seq'] = cf_score['recruitment_idx'].map(recruitment_decode.__getitem__)
cf_score.head()

Unnamed: 0,resume_idx,recruitment_idx,cf_score,resume_seq,recruitment_seq
0,0,156,1.0,U05833,R03943
1,0,4670,0.744136,U05833,R04100
2,0,3349,0.674452,U05833,R00353
3,0,2576,0.649263,U05833,R00097
4,0,4675,0.578998,U05833,R02806


In [36]:
# Attach the resume attributes (degree, job_code_seq1) to every scored pair.
cf_score = cf_score.merge(resume, how='inner', on='resume_seq')
cf_score.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 169640 entries, 0 to 169639
Data columns (total 7 columns):
 #   Column           Non-Null Count   Dtype   
---  ------           --------------   -----   
 0   resume_idx       169640 non-null  int64   
 1   recruitment_idx  169640 non-null  int64   
 2   cf_score         169640 non-null  float64 
 3   resume_seq       169640 non-null  object  
 4   recruitment_seq  169640 non-null  object  
 5   degree           169640 non-null  category
 6   job_code_seq1    169640 non-null  category
dtypes: category(2), float64(1), int64(2), object(2)
memory usage: 8.1+ MB


### job_code_seq1

#### recruitment_feature : job_code_seq1 가중치

In [152]:
# Attach recruitment attributes to each CF candidate, then add one
# <column>_weight column per recruitment feature, keyed by job_code_seq1.
# The weights are derived from `merged_df`, the training-application frame.
cf_score_weight = cf_score.merge(recruitment, how='inner', on='recruitment_seq')
cf_job_weight, weight_dfs = create_weight_df(merged_df, cf_score_weight, ['address_seq1', 'education',
                                                      'major_task', 'qualifications'], 'job_code_seq1', limit_ratio=0)

In [153]:
# Names of all columns containing "_weight".
weight_columns = cf_job_weight.filter(like="_weight").columns
weight_columns

Index(['address_seq1_weight', 'education_weight', 'major_task_weight',
       'qualifications_weight'],
      dtype='object')

In [154]:
# Row-wise sum of the square roots of the individual feature weights.
cf_job_weight['sqrt_weight_sum'] = np.sqrt(cf_job_weight[weight_columns]).sum(axis=1)

In [155]:
# Min-max scale each column separately within every resume's candidate rows;
# the shared scaler is refit for each group.
for scaled_col, raw_col in (('s_sqrt_weight_sum', 'sqrt_weight_sum'), ('scf_score', 'cf_score')):
    cf_job_weight[scaled_col] = cf_job_weight.groupby('resume_idx')[raw_col].transform(
        lambda group: mms.fit_transform(group.to_numpy().reshape(-1, 1)).ravel())

In [156]:
# Combined ranking score: plain (unweighted) row-wise sum of the two scaled columns.
cf_job_weight['sws_cf_score'] = cf_job_weight[['s_sqrt_weight_sum', 'scf_score']].sum(axis=1)

In [158]:
# Keep the 5 rows with the highest sws_cf_score per resume, flatten the
# group index, and reduce to the two id columns.
sbm_weight_a = cf_job_weight.groupby('resume_seq').apply(lambda g: g.nlargest(5, 'sws_cf_score'))
sbm_weight_a = sbm_weight_a.reset_index(drop=True)
sbm_weight_a = sbm_weight_a[['resume_seq', 'recruitment_seq']]
sbm_weight_a

Unnamed: 0,resume_seq,recruitment_seq
0,U00001,R05862
1,U00001,R03811
2,U00001,R03777
3,U00001,R06276
4,U00001,R04566
...,...,...
42405,U08482,R04602
42406,U08482,R04021
42407,U08482,R05461
42408,U08482,R03743


In [129]:
# Baseline for comparison: top 5 per resume by the raw cf_score.
# NOTE: this rebinds `sbm_cf`, which an earlier cell built with create_submission.
sbm_cf = cf_score_weight.groupby('resume_seq').apply(lambda g: g.nlargest(5, 'cf_score'))
sbm_cf = sbm_cf.reset_index(drop=True)
sbm_cf = sbm_cf[['resume_seq', 'recruitment_seq']]
sbm_cf.head(10)

Unnamed: 0,resume_seq,recruitment_seq
0,U00001,R03811
1,U00001,R05862
2,U00001,R03777
3,U00001,R04769
4,U00001,R03037
5,U00002,R01103
6,U00002,R02412
7,U00002,R04074
8,U00002,R06216
9,U00002,R01081


In [130]:
# Runs the metric script a second time (it is also run in the helper-loading
# cell) -- presumably to pick up edits made to the file; confirm it is still needed.
%run ../functions/metric.py

In [160]:
# Score the cf_score-only top-5 baseline against the held-out pairs.
recall5_dacon(test, sbm_cf)

0.12827163404857345

#### RECALL 계산

##### 가중치 인기도

In [161]:
# Accumulator for the {'ratio', 'recall'} records of the weighted sweep.
recalls_2 = []

In [166]:
# Build one submission per blend ratio of scf_score vs. s_sqrt_weight_sum;
# the printed log lists ratios from 0.90:0.10 to 1.00:0.00 in 0.01 steps.
weight_scores = create_ensemble_submission_2col(cf_job_weight,
                                                'scf_score', 's_sqrt_weight_sum',
                                                start=90, end=101, step=1)

0.90 : 0.10
0.91 : 0.09
0.92 : 0.08
0.93 : 0.07
0.94 : 0.06
0.95 : 0.05
0.96 : 0.04
0.97 : 0.03
0.98 : 0.02
0.99 : 0.01
1.00 : 0.00


In [167]:
# Evaluate every blended submission at k=5 and record it next to its ratio label.
for ratio, sbm in tqdm(weight_scores.items()):
    data = dict(ratio=ratio, recall=calculate_precision_at_k_dacon(test, sbm, 5))
    recalls_2.append(data)

100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:12<00:00,  1.14s/it]


In [168]:
# Tabulate the sweep, drop exact duplicate records, and rank by recall (best first).
recall_job_2 = pd.DataFrame(recalls_2).drop_duplicates().sort_values('recall', ascending=False)
recall_job_2.head(10)

Unnamed: 0,ratio,recall
13,0.92:0.08,0.129333
9,0.90:0.10,0.129097
12,0.91:0.09,0.128979
14,0.93:0.07,0.128861
16,0.95:0.05,0.128507
15,0.94:0.06,0.128507
19,0.98:0.02,0.12839
10,1.00:0.00,0.128272
17,0.96:0.04,0.128272
18,0.97:0.03,0.128154


In [141]:
# Persist the ranked sweep results. os.path.join is used because RECALL_PATH
# already ends with a separator; concatenating "/" produced a doubled "//".
recall_job_2.to_csv(os.path.join(RECALL_PATH, f'weighted_{DATA_SET}.csv'), index=False)

## LTOCF 인기도 가중치 적용

In [156]:
# Model tag that is interpolated into the result file names written later.
MODEL = "LTOCF"

In [134]:
# Translate the integer indices into their ids through the lookup dicts
# (an index missing from a dict raises KeyError).
ltocf_score['resume_seq'] = ltocf_score['resume_idx'].map(resume_decode.__getitem__)
ltocf_score['recruitment_seq'] = ltocf_score['recruitment_idx'].map(recruitment_decode.__getitem__)
ltocf_score.head()

Unnamed: 0,resume_idx,recruitment_idx,ltocf_score,resume_seq,recruitment_seq
0,0,156,0.999877,U05833,R03943
1,0,3195,0.999227,U05833,R05806
2,0,1051,0.998012,U05833,R02540
3,0,4670,0.997515,U05833,R04100
4,0,2576,0.997507,U05833,R00097


In [135]:
# Attach the resume attributes (degree, job_code_seq1) to every scored pair.
ltocf_score = ltocf_score.merge(resume, how='inner', on='resume_seq')
ltocf_score.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 169640 entries, 0 to 169639
Data columns (total 7 columns):
 #   Column           Non-Null Count   Dtype   
---  ------           --------------   -----   
 0   resume_idx       169640 non-null  int64   
 1   recruitment_idx  169640 non-null  int64   
 2   ltocf_score      169640 non-null  float64 
 3   resume_seq       169640 non-null  object  
 4   recruitment_seq  169640 non-null  object  
 5   degree           169640 non-null  category
 6   job_code_seq1    169640 non-null  category
dtypes: category(2), float64(1), int64(2), object(2)
memory usage: 8.1+ MB


### Degree

In [136]:
# Select the join keys and the two popularity score columns.
# NOTE(review): `degree_weight_pop` is not defined anywhere in the visible
# notebook, so this cell fails on a fresh kernel -- restore the cell that builds it.
degree_weight_pop_view = degree_weight_pop[['degree', 'recruitment_seq',
                                            'score', 'w_score']]

In [137]:
# Left-join the popularity scores per (degree, recruitment_seq); candidate
# pairs without a match get score = w_score = 0.
ltocf_score_pop = ltocf_score.merge(degree_weight_pop_view, how='left', on=['degree', 'recruitment_seq'])
ltocf_score_pop[['score', 'w_score']] = ltocf_score_pop[['score', 'w_score']].fillna(0)
ltocf_score_pop.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 169640 entries, 0 to 169639
Data columns (total 9 columns):
 #   Column           Non-Null Count   Dtype   
---  ------           --------------   -----   
 0   resume_idx       169640 non-null  int64   
 1   recruitment_idx  169640 non-null  int64   
 2   ltocf_score      169640 non-null  float64 
 3   resume_seq       169640 non-null  object  
 4   recruitment_seq  169640 non-null  object  
 5   degree           169640 non-null  int64   
 6   job_code_seq1    169640 non-null  category
 7   score            169640 non-null  float64 
 8   w_score          169640 non-null  float64 
dtypes: category(1), float64(3), int64(3), object(2)
memory usage: 11.8+ MB


#### MinMax Scaling

In [138]:
# ltocf_score_pop['w_score'] = mms.fit_transform(ltocf_score_pop['w_score'].values.reshape(-1,1))
# ltocf_score_pop['ltocf_score'] = mms.fit_transform(ltocf_score_pop['ltocf_score'].values.reshape(-1,1))

In [139]:
# Min-max scale each column separately within every resume's candidate rows;
# the shared scaler is refit for each group.
for scaled_col, raw_col in (('s_score', 'score'), ('sw_score', 'w_score'), ('scf_score', 'ltocf_score')):
    ltocf_score_pop[scaled_col] = ltocf_score_pop.groupby('resume_idx')[raw_col].transform(
        lambda group: mms.fit_transform(group.to_numpy().reshape(-1, 1)).ravel())

#### RECALL 계산

##### 단순인기도

In [140]:
# Accumulator for the {'ratio', 'recall'} records of the simple-popularity sweep.
recalls_1 = []

In [141]:
# Build one submission per blend ratio of scf_score vs. s_score; the printed
# log lists ratios from 0.00:1.00 to 1.00:0.00 in 0.10 steps.
pop_scores = create_ensemble_submission_2col(ltocf_score_pop, 'scf_score', 's_score', start=0, end=101, step=10)

0.00 : 1.00
0.10 : 0.90
0.20 : 0.80
0.30 : 0.70
0.40 : 0.60
0.50 : 0.50
0.60 : 0.40
0.70 : 0.30
0.80 : 0.20
0.90 : 0.10
1.00 : 0.00


In [142]:
# Evaluate every blended submission at k=5 and record it next to its ratio label.
for ratio, sbm in tqdm(pop_scores.items()):
    data = dict(ratio=ratio, recall=calculate_precision_at_k_dacon(test, sbm, 5))
    recalls_1.append(data)

100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:13<00:00,  1.23s/it]


In [143]:
# Display the sweep results without exact duplicates, best recall first.
pd.DataFrame(recalls_1).drop_duplicates().sort_values('recall', ascending=False)

Unnamed: 0,ratio,recall
10,1.00:0.00,0.129686
8,0.80:0.20,0.129215
9,0.90:0.10,0.128154
7,0.70:0.30,0.124617
6,0.60:0.40,0.120137
5,0.50:0.50,0.116482
4,0.40:0.60,0.113417
3,0.30:0.70,0.108229
2,0.20:0.80,0.100094
1,0.10:0.90,0.093964


##### 가중치 인기도

In [144]:
# Reset the accumulator for the weighted-popularity sweep.
recalls_2 = []

In [145]:
# Same sweep as for s_score, but blending scf_score with sw_score.
# NOTE: rebinds `pop_scores`.
pop_scores = create_ensemble_submission_2col(ltocf_score_pop, 'scf_score', 'sw_score', start=0, end=101, step=10)

0.00 : 1.00
0.10 : 0.90
0.20 : 0.80
0.30 : 0.70
0.40 : 0.60
0.50 : 0.50
0.60 : 0.40
0.70 : 0.30
0.80 : 0.20
0.90 : 0.10
1.00 : 0.00


In [146]:
# Evaluate every blended submission at k=5 and record it next to its ratio label.
for ratio, sbm in tqdm(pop_scores.items()):
    data = dict(ratio=ratio, recall=calculate_precision_at_k_dacon(test, sbm, 5))
    recalls_2.append(data)

100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:12<00:00,  1.16s/it]


In [147]:
# Display the sweep results without exact duplicates, best recall first.
pd.DataFrame(recalls_2).drop_duplicates().sort_values('recall', ascending=False)

Unnamed: 0,ratio,recall
10,1.00:0.00,0.129686
8,0.80:0.20,0.128861
9,0.90:0.10,0.128036
7,0.70:0.30,0.125442
6,0.60:0.40,0.12108
5,0.50:0.50,0.117779
4,0.40:0.60,0.115421
3,0.30:0.70,0.108819
2,0.20:0.80,0.101627
1,0.10:0.90,0.095732


### job_code_seq1

In [175]:
# Select the join keys and the two popularity score columns.
# NOTE(review): `job_code_weight_pop` is not defined anywhere in the visible
# notebook, so this cell fails on a fresh kernel -- restore the cell that builds it.
job_code_weight_pop_view = job_code_weight_pop[['job_code_seq1', 'recruitment_seq',
                                            'score', 'w_score']]

In [176]:
# Left-join the popularity scores per (job_code_seq1, recruitment_seq);
# candidate pairs without a match get score = w_score = 0.
# NOTE: rebinds `ltocf_score_pop`, replacing the frame built for the degree analysis.
ltocf_score_pop = ltocf_score.merge(job_code_weight_pop_view, how='left', on=['job_code_seq1', 'recruitment_seq'])
ltocf_score_pop[['score', 'w_score']] = ltocf_score_pop[['score', 'w_score']].fillna(0)
ltocf_score_pop.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 169640 entries, 0 to 169639
Data columns (total 9 columns):
 #   Column           Non-Null Count   Dtype   
---  ------           --------------   -----   
 0   resume_idx       169640 non-null  int64   
 1   recruitment_idx  169640 non-null  int64   
 2   ltocf_score      169640 non-null  float64 
 3   resume_seq       169640 non-null  object  
 4   recruitment_seq  169640 non-null  object  
 5   degree           169640 non-null  category
 6   job_code_seq1    169640 non-null  object  
 7   score            169640 non-null  float64 
 8   w_score          169640 non-null  float64 
dtypes: category(1), float64(3), int64(2), object(3)
memory usage: 11.8+ MB


#### MinMax Scaling

In [177]:
# ltocf_score_pop['sw_score'] = mms.fit_transform(ltocf_score_pop['w_score'].values.reshape(-1,1))
# ltocf_score_pop['scf_score'] = mms.fit_transform(ltocf_score_pop['ltocf_score'].values.reshape(-1,1))

In [151]:
# Min-max scale each column separately within every resume's candidate rows;
# the shared scaler is refit for each group.
for scaled_col, raw_col in (('s_score', 'score'), ('sw_score', 'w_score'), ('scf_score', 'ltocf_score')):
    ltocf_score_pop[scaled_col] = ltocf_score_pop.groupby('resume_idx')[raw_col].transform(
        lambda group: mms.fit_transform(group.to_numpy().reshape(-1, 1)).ravel())

In [178]:
# Summary statistics for every column, transposed so each column is a row.
ltocf_score_pop.describe(include ='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
resume_idx,169640.0,,,,4240.5,2448.549692,0.0,2120.0,4240.5,6361.0,8481.0
recruitment_idx,169640.0,,,,2801.88778,1836.516984,0.0,1186.0,2587.0,4268.0,6693.0
ltocf_score,169640.0,,,,0.994335,0.009979,0.805326,0.993198,0.998103,0.999691,1.0
resume_seq,169640.0,8482.0,U05833,20.0,,,,,,,
recruitment_seq,169640.0,6401.0,R00559,251.0,,,,,,,
degree,169640.0,5.0,4.0,149060.0,,,,,,,
job_code_seq1,169640.0,9.0,재료·화학·섬유·의복,145760.0,,,,,,,
score,169640.0,,,,0.206093,0.143475,0.0,0.116279,0.172414,0.255814,1.0
w_score,169640.0,,,,0.209307,0.153171,0.0,0.118221,0.166541,0.260513,2.800227
sw_score,169640.0,,,,0.074747,0.054699,0.0,0.042218,0.059474,0.093033,1.0


In [179]:
# Preview the first 20 rows to sanity-check the raw and scaled score columns.
ltocf_score_pop.head(20)

Unnamed: 0,resume_idx,recruitment_idx,ltocf_score,resume_seq,recruitment_seq,degree,job_code_seq1,score,w_score,sw_score,scf_score
0,0,156,0.999877,U05833,R03943,4,재료·화학·섬유·의복,0.27907,0.284196,0.10149,0.999367
1,0,3195,0.999227,U05833,R05806,4,재료·화학·섬유·의복,0.209302,0.213147,0.076118,0.996029
2,0,1051,0.998012,U05833,R02540,4,재료·화학·섬유·의복,0.093023,0.094732,0.03383,0.98979
3,0,4670,0.997515,U05833,R04100,4,재료·화학·섬유·의복,0.255814,0.258899,0.092456,0.987237
4,0,2576,0.997507,U05833,R00097,4,재료·화학·섬유·의복,0.302326,0.305971,0.109267,0.987192
5,0,5279,0.997453,U05833,R05335,4,재료·화학·섬유·의복,0.162791,0.165781,0.059203,0.986918
6,0,4675,0.997211,U05833,R02806,4,재료·화학·섬유·의복,0.186047,0.17964,0.064152,0.985672
7,0,5164,0.996898,U05833,R00304,4,재료·화학·섬유·의복,0.255814,0.260513,0.093033,0.984065
8,0,4704,0.996446,U05833,R02097,4,재료·화학·섬유·의복,0.116279,0.117681,0.042026,0.981746
9,0,1030,0.996352,U05833,R05895,4,재료·화학·섬유·의복,0.325581,0.31437,0.112266,0.981259


#### RECALL 계산

##### 단순인기도

In [159]:
# Reset the accumulator for the simple-popularity sweep.
recalls_1 = []

In [160]:
# Build one submission per blend ratio of scf_score vs. s_score, this time
# with step=1 (the printed log advances in 0.01 increments).
pop_scores = create_ensemble_submission_2col(ltocf_score_pop, 'scf_score', 's_score', start=0, end=101, step=1)

0.00 : 1.00
0.01 : 0.99
0.02 : 0.98
0.03 : 0.97
0.04 : 0.96
0.05 : 0.95
0.06 : 0.94
0.07 : 0.93
0.08 : 0.92
0.09 : 0.91
0.10 : 0.90
0.11 : 0.89
0.12 : 0.88
0.13 : 0.87
0.14 : 0.86
0.15 : 0.85
0.16 : 0.84
0.17 : 0.83
0.18 : 0.82
0.19 : 0.81
0.20 : 0.80
0.21 : 0.79
0.22 : 0.78
0.23 : 0.77
0.24 : 0.76
0.25 : 0.75
0.26 : 0.74
0.27 : 0.73
0.28 : 0.72
0.29 : 0.71
0.30 : 0.70
0.31 : 0.69
0.32 : 0.68
0.33 : 0.67
0.34 : 0.66
0.35 : 0.65
0.36 : 0.64
0.37 : 0.63
0.38 : 0.62
0.39 : 0.61
0.40 : 0.60
0.41 : 0.59
0.42 : 0.58
0.43 : 0.57
0.44 : 0.56
0.45 : 0.55
0.46 : 0.54
0.47 : 0.53
0.48 : 0.52
0.49 : 0.51
0.50 : 0.50
0.51 : 0.49
0.52 : 0.48
0.53 : 0.47
0.54 : 0.46
0.55 : 0.45
0.56 : 0.44
0.57 : 0.43
0.58 : 0.42
0.59 : 0.41
0.60 : 0.40
0.61 : 0.39
0.62 : 0.38
0.63 : 0.37
0.64 : 0.36
0.65 : 0.35
0.66 : 0.34
0.67 : 0.33
0.68 : 0.32
0.69 : 0.31
0.70 : 0.30
0.71 : 0.29
0.72 : 0.28
0.73 : 0.27
0.74 : 0.26
0.75 : 0.25
0.76 : 0.24
0.77 : 0.23
0.78 : 0.22
0.79 : 0.21
0.80 : 0.20
0.81 : 0.19
0.82 : 0.18
0.83

In [161]:
# Evaluate every blended submission at k=5 and record it next to its ratio label.
for ratio, sbm in tqdm(pop_scores.items()):
    data = dict(ratio=ratio, recall=calculate_precision_at_k_dacon(test, sbm, 5))
    recalls_1.append(data)

100%|████████████████████████████████████████████████████████████████████████████████| 101/101 [01:59<00:00,  1.18s/it]


In [162]:
# Tabulate the sweep, drop exact duplicate records, and rank by recall (best first).
recall_job_1 = pd.DataFrame(recalls_1).drop_duplicates().sort_values('recall', ascending=False)
recall_job_1.head()

Unnamed: 0,ratio,recall
86,0.86:0.14,0.131219
83,0.83:0.17,0.131101
84,0.84:0.16,0.130983
87,0.87:0.13,0.130983
85,0.85:0.15,0.130865


In [163]:
# Persist the ranked sweep results. os.path.join is used because RECALL_PATH
# already ends with a separator; concatenating "/" produced a doubled "//".
recall_job_1.to_csv(os.path.join(RECALL_PATH, f'simple_{DATA_SET}_{MODEL}.csv'), index=False)

##### 가중치 인기도

In [164]:
# Reset the accumulator for the weighted-popularity sweep.
recalls_2 = []

In [165]:
# Same step=1 sweep, but blending scf_score with sw_score.
# NOTE: rebinds `pop_scores`.
pop_scores = create_ensemble_submission_2col(ltocf_score_pop, 'scf_score', 'sw_score', start=0, end=101, step=1)

0.00 : 1.00
0.01 : 0.99
0.02 : 0.98
0.03 : 0.97
0.04 : 0.96
0.05 : 0.95
0.06 : 0.94
0.07 : 0.93
0.08 : 0.92
0.09 : 0.91
0.10 : 0.90
0.11 : 0.89
0.12 : 0.88
0.13 : 0.87
0.14 : 0.86
0.15 : 0.85
0.16 : 0.84
0.17 : 0.83
0.18 : 0.82
0.19 : 0.81
0.20 : 0.80
0.21 : 0.79
0.22 : 0.78
0.23 : 0.77
0.24 : 0.76
0.25 : 0.75
0.26 : 0.74
0.27 : 0.73
0.28 : 0.72
0.29 : 0.71
0.30 : 0.70
0.31 : 0.69
0.32 : 0.68
0.33 : 0.67
0.34 : 0.66
0.35 : 0.65
0.36 : 0.64
0.37 : 0.63
0.38 : 0.62
0.39 : 0.61
0.40 : 0.60
0.41 : 0.59
0.42 : 0.58
0.43 : 0.57
0.44 : 0.56
0.45 : 0.55
0.46 : 0.54
0.47 : 0.53
0.48 : 0.52
0.49 : 0.51
0.50 : 0.50
0.51 : 0.49
0.52 : 0.48
0.53 : 0.47
0.54 : 0.46
0.55 : 0.45
0.56 : 0.44
0.57 : 0.43
0.58 : 0.42
0.59 : 0.41
0.60 : 0.40
0.61 : 0.39
0.62 : 0.38
0.63 : 0.37
0.64 : 0.36
0.65 : 0.35
0.66 : 0.34
0.67 : 0.33
0.68 : 0.32
0.69 : 0.31
0.70 : 0.30
0.71 : 0.29
0.72 : 0.28
0.73 : 0.27
0.74 : 0.26
0.75 : 0.25
0.76 : 0.24
0.77 : 0.23
0.78 : 0.22
0.79 : 0.21
0.80 : 0.20
0.81 : 0.19
0.82 : 0.18
0.83

In [166]:
# Evaluate every blended submission at k=5 and record it next to its ratio label.
for ratio, sbm in tqdm(pop_scores.items()):
    data = dict(ratio=ratio, recall=calculate_precision_at_k_dacon(test, sbm, 5))
    recalls_2.append(data)

100%|████████████████████████████████████████████████████████████████████████████████| 101/101 [01:59<00:00,  1.18s/it]


In [167]:
# Tabulate the sweep, drop exact duplicate records, and rank by recall (best first).
# NOTE: rebinds `recall_job_2`, which an earlier cell used for the CF sweep.
recall_job_2 = pd.DataFrame(recalls_2).drop_duplicates().sort_values('recall', ascending=False)
recall_job_2.head()

Unnamed: 0,ratio,recall
86,0.86:0.14,0.131337
85,0.85:0.15,0.130983
84,0.84:0.16,0.13063
83,0.83:0.17,0.130512
96,0.96:0.04,0.130276


In [168]:
# Persist the ranked sweep results. os.path.join is used because RECALL_PATH
# already ends with a separator; concatenating "/" produced a doubled "//".
recall_job_2.to_csv(os.path.join(RECALL_PATH, f'weighted_{DATA_SET}_{MODEL}.csv'), index=False)