In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
from torch import nn
from torch.utils import data
from d2l import torch as d2l

In [2]:
# 这里使用的ID为0~N-1版本
class Person:
  def __init__(self, id):
    self.id = id
    self.alive = True # 是否存活
    self.offers = []  # 收到的offer
    self.job = -1 # 签约的企业

class Recruit:
  def __init__(self, id, num):
    self.id = id
    self.num = num # 岗位剩余容量
    self.workers = [] # 签约的员工
    
class Pair:
  def __init__(self, per, rec, val):
    self.per = per
    self.rec = rec
    self.pid = per.id
    self.rid = rec.id
    self.val = val
  
  def print(self):
    print(self.pid, self.rid, self.val)

In [3]:
# 属性嵌入，这里的序号是0~N-1
# 用于拼接向量 and 判断行业
a_emb_job = pd.read_csv('./attribute_embedding_position.txt',sep=' ',index_col=0,header=None,na_values=[])
a_emb_per = pd.read_csv('./attribute_embedding_user.txt',sep=' ',index_col=0,header=None,na_values=[])

In [4]:
def is_fw(pid):
  val = fw_list.loc[pid,'是否为废人']
  return val==1

fw_list = pd.read_csv('./fw.txt',sep=' ',index_col=0)

In [5]:
alive = []
for i in a_emb_per.index:
  alive.append(i)
ss = pd.Series(alive)
ss.to_csv('./no_fw.csv')

In [6]:
pca_per_ppd = pd.read_csv('./pca_per.csv',index_col=0,header=None)
pca_job_ppd = pd.read_csv('./pca_job.csv',index_col=0,header=None)

In [7]:
pca_per_myd = pd.read_csv('./pca_per_myd.csv',index_col=0,header=None)
pca_job_myd = pd.read_csv('./pca_job_myd.csv',index_col=0,header=None)

In [8]:
def get_input_ppd(pid,rid):
  per = pca_per_ppd.loc[pid].values
  job = pca_job_myd.loc[rid].values
  all = np.concatenate([per,job],axis=0)
  return all

def get_input_myd(pid,rid):
  per = pca_per_myd.loc[pid].values
  job = pca_job_myd.loc[rid].values
  all = np.concatenate([per,job],axis=0)
  return all

In [9]:
net_ppd = torch.load('./Sigmoid-200.pt')
net_myd = torch.load('./Sigmoid-MYD.pt')

In [10]:
# 硬核条件，这里的序号是0~N-1
hd_job = pd.read_csv('./hardcore_position.csv',index_col=0,header=0,encoding='gbk')
hd_job.index = [i for i in range(hd_job.shape[0])]
hd_per = pd.read_csv('./hardcore_user.csv',index_col=0,header=0,encoding='gbk')
hd_per.index = [i for i in range(hd_per.shape[0])]
hd_job.shape, hd_per.shape

((1575, 4), (10877, 3))

In [11]:
def judge_hangye(pid,rid): # 0~N-1
  # print(pid,rid)
  x = a_emb_per.loc[pid,6:14].values.astype('int')
  y = a_emb_job.loc[rid,6:14].values.astype('int')
  sum = (x*y).sum()
  return sum > 0

def judge_gongzi(pid,rid): # 0~N-1
  p2, p1 = hd_per.iloc[pid,:-1]
  r2, r1 = hd_job.iloc[rid,-2:]
  return max(p1,r1)<=min(p2,r2)

def judge_type(pid,rid):
  pt = hd_per.iloc[pid,-1]
  rt = hd_job.iloc[rid,0]
  # print(pt,rt)
  return pt == rt

def judge(pid,rid):
  if judge_buf[pid,rid]>=0:
    return judge_buf[pid,rid]==1
  # b1 = judge_hangye(pid,rid)
  b2 = judge_gongzi(pid,rid)
  b3 = judge_type(pid,rid)
  if b2 and b3:
    judge_buf[pid,rid] = 1
  else:
    judge_buf[pid,rid] = 0
  # print('build buf')
  return judge_buf[pid,rid]==1

judge_buf = np.zeros(shape=(10877,1575),dtype='int')-1
# judge_buf

In [12]:
judge(1,444)

False

In [34]:
ppd_buf = np.zeros(shape=(10877,1575))-1
myd_buf = np.zeros(shape=(10877,1575))-1
# ppd_buf = {}
# myd_buf = {}
# for i in pca_per_ppd.index:
#   ppd_buf[i] = np.zeros(1575)-1
#   myd_buf[i] = np.zeros(1575)-1

def get_ppd(pid,rid):
  if judge(pid,rid)==False: # 硬性条件不满足为0
    return 0
  elif is_fw(pid):  # 废物为1
    # return 0.1
    ppd = 1 + np.random.random_sample()
  elif ppd_buf[pid][rid]>=0:
    return ppd_buf[pid][rid]
  else:
    ppd = net_ppd(torch.Tensor(get_input_ppd(pid,rid))).item()
    # print(ppd)
    # print(f'ppd({pid},{rid})={ppd}')
  ppd = min(ppd,10)
  ppd = max(ppd,1)/10
  ppd_buf[pid][rid] = ppd
  return ppd

def get_myd(pid,rid):
  if judge(pid,rid)==False: # 硬性条件不满足为0
    return 0
  elif is_fw(pid):  # 废物为1
    # return 0.1
    myd = 1 + np.random.random_sample()
  elif myd_buf[pid][rid]>=0:
    return myd_buf[pid][rid]
  else:
    myd = net_myd(torch.Tensor(get_input_myd(pid,rid))).item()
    # print(myd)
    # print(f'ppd({pid},{rid})={ppd}')
  myd = min(myd,10)
  myd = max(myd,1)/10
  myd_buf[pid][rid] = myd
  return myd

In [35]:
get_myd(1,4)

0.16448439281126828

In [27]:
reid_per = {}
reid_rec = {}

dict_per = pd.read_excel('./user.xlsx')
for i in dict_per.index:
  reid_per[i]=dict_per.loc[i,'求职者 ID']

dict_rec = pd.read_excel('./position.xlsx')
for i in dict_rec.index:
  reid_rec[i]=dict_rec.loc[i,'招聘信息 ID']


In [42]:
persons = []
recruits = []

In [43]:
for i in tqdm(range(hd_per.shape[0])):
# for i in tqdm(range(5000)):
  # id = hd_per.loc[i, '序号']
  tmp = Person(i)
  persons.append(tmp)
  # print(id)

100%|██████████| 10877/10877 [00:00<00:00, 1087623.24it/s]


In [44]:
for i in tqdm(range(hd_job.shape[0])):
# for i in tqdm(range(50)):
  # id = hd_rec.loc[i+1, '序号']
  num = hd_job.loc[i, '岗位需求量']
  tmp = Recruit(i, num)
  recruits.append(tmp)
  # print(id)

100%|██████████| 1575/1575 [00:00<00:00, 143192.20it/s]


In [45]:
def get_cnt_rec():
  sum = 0
  for i in recruits:
    sum += i.num
  return sum

old_cnt_rec = get_cnt_rec()
old_cnt_rec

5520

In [46]:
epoch = 0
total_pair = 0
cnt_rec = get_cnt_rec()

while True:
  epoch += 1
  # 招聘回合
  print(f'epoch {epoch}:')
  
  np.random.shuffle(persons) # 随机优化
  
  for rec in tqdm(recruits):  # 对于每个岗位
    # 发放offer
    if rec.num <= 0:
      continue
    offer_list = [] # 临时数组：<per, rec, val>
    # 计算与每个求职者的匹配度
    for per in persons:
      if per.alive==False:
        continue
      val = get_ppd(per.id, rec.id)
      if val<=0:
        continue
      pair = Pair(per, rec, val)
      offer_list.append(pair)
    # 就地排序 by val
    if len(offer_list)<=0:
      continue
    
    offer_list.sort(reverse=True, key=lambda x:x.val)
    
    # for of in offer_list:
    #   if get_ppd(of.pid,of.rid)==0:
    #     print(f'1.ppd={get_ppd(of.pid,of.rid)}, val={of.val}')
        
    if len(offer_list)>rec.num:
      offer_list = offer_list[:rec.num] # 取TopK
    
    for pair in offer_list:
      if pair.val>0:
        pair.per.offers.append(pair) # 发放offer
  
  # 所有岗位offer发放完毕
    
  # 求职者签约
  for per in persons: # 对于每个求职者
    if per.alive == False:
      continue
    # offer_list2 = [] # 临时数组：Pair
    for of in per.offers:  # 对于每一份offer
      # rec = recruits[rid]
      val = get_myd(of.pid, of.rid)
      of.val = val
    #   pair = Pair(per, rec, val)
    #   offer_list2.append(pair)
    # if len(offer_list2)==0:  # 当前求职者无offer
    #   continue
    
    # offer_list2.sort(reverse=True, key=lambda x:x.val) # 取最高满意度
    per.offers.sort(reverse=True, key=lambda x:x.val)
    # 目标岗位
    for of in per.offers:
      # dest_rec = recruits[k.rid]
      if of.rec.num > 0: # 若未招满
        of.rec.num -= 1
        per.job = of.rid
        per.alive = False
        # per.offers.clear()
        of.rec.workers.append(per.id)
        # if of.pid!=per.id:
        #   print('ID ERR!')
        # 检测非法值
        # if get_myd(per.id,of.rid)==0:
        #   print(f'2.<{per.id},{of.rid}>:myd={get_myd(per.id,of.rid)},ppd={get_ppd(per.id,of.rid)}')
        break
    
    per.offers.clear()
    
  # 删除冗余
  
  new_cnt_rec = get_cnt_rec()
  increase = cnt_rec - new_cnt_rec
  cnt_rec = new_cnt_rec
  total_pair += increase
  # print(net1_cnt, net2_cnt)
  print(f'increase {increase}, total {total_pair}\n')
  
  # break
  
  if increase <= 0:
    break

epoch 1:


100%|██████████| 1575/1575 [1:16:11<00:00,  2.90s/it]


increase 284, total 284

epoch 2:


100%|██████████| 1575/1575 [01:03<00:00, 24.83it/s]


increase 258, total 542

epoch 3:


100%|██████████| 1575/1575 [01:00<00:00, 25.98it/s]


increase 202, total 744

epoch 4:


100%|██████████| 1575/1575 [00:58<00:00, 26.87it/s]


increase 158, total 902

epoch 5:


100%|██████████| 1575/1575 [00:55<00:00, 28.17it/s]


increase 207, total 1109

epoch 6:


100%|██████████| 1575/1575 [00:52<00:00, 29.89it/s]


increase 644, total 1753

epoch 7:


100%|██████████| 1575/1575 [00:26<00:00, 59.40it/s] 


increase 23, total 1776

epoch 8:


100%|██████████| 1575/1575 [00:23<00:00, 65.73it/s] 

increase 0, total 1776






In [52]:
# # 求职者offer情况
# for i in persons:
#   if i.job!=-1:
#     print(f'{i.id},{i.alive},offers:{len(i.offers)}')

In [48]:
st = set()
cnt_pair = 0
with open('./out_4_opt.csv','w') as f:
  for rec in recruits: # 对于每个岗位
  #print(f'岗位{rec.id}, 招到{len(rec.workers)}, 还差{rec.num}' ,end='\t: ')
  #if len(rec.workers)>0:  # 若非空
    for pid in rec.workers: # 对于每个签约求职者
      # if get_ppd(pid,rec.id)!=0:
      #   continue
      #print(f'<{pid},{get_myd(pid,rec.id)}>', end=' ')
      f.write(f'{reid_rec[rec.id]},{reid_per[pid]},{get_ppd(pid,rec.id)},{get_myd(pid,rec.id)}\n')
      cnt_pair += 1
      st.add(pid)
  #print('')
    
print(cnt_pair, len(st))

1776 1776


In [69]:
df_4 = pd.read_csv('./res/out_4_opt.csv')
df_4.head()

Unnamed: 0,招聘信息 ID,求职者 ID,岗位匹配度,求职者满意度
0,1648527394191049984,1480370431885179904,0.124705,0.171858
1,1648527394191049984,1461648125990139904,0.14602,0.149494
2,1648527394191049984,1647579226406249984,0.166603,0.162506
3,1648165203084440064,7538280375117339648,0.132962,0.170169
4,1648165203084440064,7539095847672930304,0.198937,0.102015


In [80]:
df_4 = df_4.sort_values(by=['招聘信息 ID','岗位匹配度'],ascending=[True,False])
df_4

Unnamed: 0,招聘信息 ID,求职者 ID,岗位匹配度,求职者满意度
1775,1374177417123460096,7531884344179869696,0.184767,0.143034
1774,1374181407047419904,7538623650378470400,0.504476,0.556234
1770,1462672316990350080,1562334736783899904,0.707512,0.648837
1769,1462672316990350080,1623885034161299968,0.157067,0.144563
1768,1462672316990350080,1516604454231729920,0.155551,0.195489
...,...,...,...,...
10,1648165203084440064,7539611467086749696,0.111774,0.125105
5,1648165203084440064,7532056125691849728,0.105999,0.104057
2,1648527394191049984,1647579226406249984,0.166603,0.162506
1,1648527394191049984,1461648125990139904,0.146020,0.149494


In [81]:
df_4.to_csv('./res/result4.csv',index=0)

In [49]:
obj = cnt_pair/old_cnt_rec
obj

0.3217391304347826

In [50]:
# 调用实例，使用时可以把函数里的print关掉
# 仅第一次调用神经网络，即建立缓存时会print
a,b = 317,46
get_ppd(a,b),get_myd(a,b)

(0, 0)

In [47]:
# 以下为导出文件

In [59]:
user_map = pd.read_csv("map/user_map.csv")
position_map = pd.read_csv("map/position_map.csv")

In [53]:
ppd_list = []
for p_index,p_row in tqdm(position_map.iterrows()):
    cur = []
    for u_index,u_row in user_map.iterrows():
        ppd = get_ppd(u_index,p_index)
        if ppd>0:
            cur.append((p_row["招聘信息 ID"],u_row["求职者 ID"],ppd))
            
    cur.sort(key=lambda x:x[2],reverse=True)
#             print(cur)
    ppd_list += cur
#     print(ppd_list)

1575it [16:29,  1.59it/s]


In [55]:
ppd_df = pd.DataFrame(data=ppd_list,columns=["招聘信息 ID","求职者 ID","岗位匹配度"])
# ppd_df.sort_values(by=['岗位匹配度'], inplace=True)
ppd_df.to_csv("res/ppd.csv",index=0)

In [66]:
ppd_list = []
myd_list = []
for u_index,u_row in tqdm(user_map.iterrows()):
    cur = []
    for p_index,p_row in position_map.iterrows():
        myd = get_myd(u_index,p_index)
        if myd>0:
            cur.append((u_row["求职者 ID"],p_row["招聘信息 ID"],myd))
            
    cur.sort(key=lambda x:x[2],reverse=True)
    myd_list += cur

10877it [15:38, 11.59it/s]


In [67]:
myd_df = pd.DataFrame(data=myd_list,columns=["求职者 ID","招聘信息 ID","求职者满意度"])
myd_df = myd_df.merge(position_map,on="招聘信息 ID",how="left")
myd_df = myd_df[["求职者 ID","招聘信息 ID","企业名称","求职者满意度"]]
myd_df.rename(columns={'企业名称':'公司名称'},inplace = True)
# myd_df.sort_values(by=['求职者满意度'], inplace=True)
myd_df.to_csv("res/myd.csv",index=0)

In [68]:
myd_df.head()

Unnamed: 0,求职者 ID,招聘信息 ID,公司名称,求职者满意度
0,1648621861036228608,1506097472638943244,视源电子,0.886064
1,1648621861036228608,1506097472638943235,视源电子,0.702926
2,1648621861036228608,1546677229344391168,嘉盛达,0.53361
3,1648621861036228608,1506097472638943236,视源电子,0.425034
4,1648621861036228608,1506097472634748929,视源电子,0.30634
