In [109]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm

In [110]:
rule_match_weights = {
    '薪资匹配度归一化': 0.35,
    '工作经验匹配度归一化': 0.25,
    '教育水平匹配度归一化': 0.25,
    '工作地点匹配度归一化': 0.15
}

rule_sat_weights = {
    '薪资满意度归一化' : 0.2,
    '福利满意度归一化' : 0.2,
    '公司满意度归一化' : 0.6
}
match_weights ={
    '规则匹配度' : 0.4,
    '长文本匹配度归一化' : 0.4,
    '知识图谱匹配度归一化' : 0.2
}
sat_weights ={
    '规则满意度' : 0.4,
    '长文本匹配度归一化' : 0.4,
    '知识图谱匹配度归一化' : 0.2
}

In [111]:
joblist_rule_path = '../../demo3/岗位匹配度.csv'
resume_rule_path = '../../demo3/求职者满意度.csv'
joblist_match_path = '../../demo4/岗位匹配度.csv'
resume_sat_path = '../../demo4/求职者满意度.csv'

bert_path = '../../demo3/BERT匹配度.csv'
graph_path = '../../demo3/知识图谱匹配度.csv'

resume_duplicate_path = '../../demo3/重复简历.csv'

In [112]:
joblist_rule = pd.read_csv(joblist_rule_path, dtype={'招聘信息 ID' :str , '求职者 ID' : str})
resume_rule = pd.read_csv(resume_rule_path, dtype={'招聘信息 ID' :str , '求职者 ID' : str})

bert_inf = pd.read_csv(bert_path, dtype={'招聘信息 ID' :str , '求职者 ID' : str})
graph_inf = pd.read_csv(graph_path, dtype={'招聘信息 ID' :str , '求职者 ID' : str})

resume_duplicate = pd.read_csv(resume_duplicate_path, dtype={'重复样本' :str , '其它重复样本' : str})
resume_duplicate.replace(np.nan,'',inplace=True)
resume_duplicate = resume_duplicate.drop(resume_duplicate[resume_duplicate.其它重复样本 == ''].index)
resume_duplicate = resume_duplicate[['重复样本', '其它重复样本']].copy()

合并三张数据集

In [113]:
def concat_df(resume_duplicate_copy, inf_copy):
    # inf_copy['id'] = inf_copy['求职者 ID']
    # inf_copy.set_index('id', inplace=True)
    i = 0
    dataframe_list = [0] * len(resume_duplicate_copy)
    for row in resume_duplicate_copy.itertuples(index=False, name=None):
        r_id = row[0]
        repeat_ids = row[1].split(',')
        n = len(repeat_ids)
        demo_df = inf_copy.loc[r_id]
        repeat_ids = [item for item in repeat_ids for _ in range(len(demo_df))]
        demo_df = inf_copy.loc[r_id]
        demo_df = pd.concat([demo_df] * n, ignore_index=True)
        demo_df['求职者 ID'] = repeat_ids
        dataframe_list[i] = demo_df
        print(i)
        i += 1
    inf_copy = pd.concat(dataframe_list + [inf_copy])
    return inf_copy.reset_index(drop=True)

In [114]:
joblist_degree = pd.merge(joblist_rule, bert_inf, on = ['招聘信息 ID', '求职者 ID'])
joblist_degree = pd.merge(joblist_degree, graph_inf, on = ['招聘信息 ID', '求职者 ID'])
joblist_degree['规则匹配度'] = 0
joblist_degree['岗位匹配度'] = 0
joblist_degree['id'] = joblist_degree['求职者 ID']
joblist_degree.set_index('id', inplace=True)

resume_degree = pd.merge(resume_rule, bert_inf, on = ['招聘信息 ID', '求职者 ID'])
resume_degree = pd.merge(resume_degree, graph_inf, on = ['招聘信息 ID', '求职者 ID'])
resume_degree['规则满意度'] = 0
resume_degree['求职者满意度'] = 0
resume_degree['id'] = resume_degree['求职者 ID']
resume_degree.set_index('id', inplace=True)

In [115]:
joblist_degree = concat_df(resume_duplicate , joblist_degree)
resume_degree = concat_df(resume_duplicate , resume_degree)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40


In [116]:
# for i in joblist_degree['求职者 ID']:
#     if i in resume_duplicate['重复样本']:
#         print(i)
#         # resume_demo = joblist_degree[joblist_degree['招聘信息 ID'] == i]
#         # for j in resume_duplicate.loc[i,'其它重复样本'].split(','):
#         #     resume_demo['求职者 ID'] == j
#         #     joblist_degree.append(resume_demo)

In [117]:
# # 取消id索引
# joblist_degree = joblist_degree.reset_index(drop=True)
# resume_degree = resume_degree.reset_index(drop=True)

In [118]:
scaler = MinMaxScaler()

def ratings_norm(df):
    for i in df.columns[2:-1]:
        new_name = i + "归一化"
        df[new_name] = scaler.fit_transform(df[[i]])
        df[new_name] = df[new_name].apply(lambda x: 0.0005 if x == 0 else x)
    return df  # 返回归一化后的数据
joblist_degree_group = joblist_degree.groupby("招聘信息 ID", group_keys=True).apply(ratings_norm)

In [119]:
resume_degree_group = resume_degree.groupby('求职者 ID', group_keys=True).apply(ratings_norm)

In [120]:
for key,value in rule_match_weights.items():
    joblist_degree_group['规则匹配度'] += joblist_degree_group[key] * value

In [121]:
for key,value in rule_sat_weights.items():
    resume_degree_group['规则满意度'] += resume_degree_group[key] * value

In [122]:
for key,value in match_weights.items():
    joblist_degree_group['岗位匹配度'] += joblist_degree_group[key] * value

In [123]:
for key,value in sat_weights.items():
    resume_degree_group['求职者满意度'] += resume_degree_group[key] * value

In [129]:
joblist_match = joblist_degree_group[['招聘信息 ID', '求职者 ID', '岗位匹配度']].copy()
resume_match = resume_degree_group[['招聘信息 ID', '求职者 ID', '求职者满意度']].copy()

In [133]:
joblist_match.reset_index(drop=True, inplace= True)
resume_match.reset_index(drop=True, inplace= True)

In [135]:
joblist_match.to_csv(joblist_match_path, index = False)
resume_match.to_csv(resume_sat_path, index = False)