In [16]:
from sentence_transformers import SentenceTransformer, util
import jieba, nltk, re, cpca, torch, sys, psutil, os
import pandas as pd
import numpy as np
from tqdm import tqdm

In [17]:
# Sentence-embedding model shared by every cell below that calls `model.encode`.
# Alternative multilingual checkpoint, kept for reference:
# model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
# NOTE(review): the recorded output shows this name is not a packaged sentence-transformers
# model, so the library wraps it with MEAN pooling — confirm that is intended.
model = SentenceTransformer('uer/sbert-base-chinese-nli')

No sentence-transformers model found with name C:\Users\vmice/.cache\torch\sentence_transformers\uer_sbert-base-chinese-nli. Creating a new one with MEAN pooling.


In [3]:
# Locations of the raw job-posting and job-hunter tables (relative to the notebook).
job_csv_file = '../datasets/recruitment-info.csv'
hunter_csv_file = '../datasets/hunter-info.csv'

# Both files are read with the GBK code page.
job_data = pd.read_csv(job_csv_file, encoding='GBK')
hunter_data = pd.read_csv(hunter_csv_file, encoding='GBK')

### 计算公式

工作 J 字段（除 ID 和招聘人数）：

{pos_name, job_wage, job_kind, exp_edu, job_years, (pos_keys, cor_ind), cor_addr, skill_keys, job_welfare, rec_require}
{岗位名称， 薪资，    招聘类型， 期望学历， 工作年数， (岗位类型， 公司类型)，公司城市， 技能要求，   福利，       招聘要求}

求职者 H 字段（除 ID、个人信息、简历关键字、报到时间）：

{pos_name, job_wage, job_kind, edu_exps, job_years,      pos_keys,      cor_addr, skill_keys，hunter_soci，hunter_eval，job_exps，project_exps，competition_exps，training_exps，language_exps，cert_exps}
{期望岗位， 期望薪资， 期望类型， 教育经历， 工作年数，       期望行业，      期望城市， 技术能力，  社会属性，    自我评价，    工作经历， 项目经历，               竞赛经历，        培训经历，       语言能力，      证书}

- thre(x, t)，阈值函数，0 if x < t else 1
- Thre(com(x1, x2), t=.5)，比较阈值函数，thre(com(x1, x2), t)

基本因子：岗位名（类型）、招聘类型、学历*、薪资*、工作年数*、公司类型、技能需求、公司城市

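$$base(J,H)=[J(\text{position\_name}),\ J(\text{job\_wage}),\ J(\text{require\_kind}),\ J(\text{require\_edu}),\ J(\text{require\_exp}),\ J(\text{company\_ind}),\ J(\text{skill\_keys}),\ J(\text{company\_addr})]$$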

$$F(J,H)=\mathrm{Thre}[J(\text{position\_name}),\ H(\text{exp\_position})]$$



In [4]:
# Code -> label for the employment kind (0 internship, 1 no restriction, 2 full-time).
# NOTE(review): the original remark "self to fill" suggests this mapping was entered by hand —
# confirm the codes against the dataset.
require_kind_json = { 0: '实习', 1: '不限', 2: '全职'} # self to fill
# Code -> label for the required education level, ordered from lowest to highest.
require_edu_json = { 0: '不限', 1: '技工', 2: '大专', 3: '本科', 4: '硕士', 5: '博士'}
# Proficiency label -> numeric rank; presumably matches the bracketed tag in skill strings
# such as 'python编程软件[SKILLED]' — TODO confirm.
level_json = {'COMMONLY': 1, 'GOOD': 2, 'SKILLED': 3, 'MASTER': 4}

In [5]:
def change_wage(min_wage, max_wage, wage_kind=1):
    """Rescale a wage range according to ``wage_kind``.

    Args:
        min_wage: Lower bound of the range.
        max_wage: Upper bound of the range.
        wage_kind: 0 floor-divides both bounds by 12, 2 multiplies both by 30,
            any other value leaves them untouched. Presumably 0 = yearly and
            2 = daily, both converted to monthly — confirm against the dataset.

    Returns:
        A two-element list ``[min, max]``.
    """
    if wage_kind == 0:
        return [min_wage // 12, max_wage // 12]
    if wage_kind == 2:
        return [min_wage * 30, max_wage * 30]
    return [min_wage, max_wage]

def change_addrs(addrs):
    """Reduce one or more address strings to a single "province + city" string.

    Each address is run through ``cpca.transform``. The first non-empty
    province and the first non-empty city found across the addresses are
    kept, so the two parts may come from different entries.

    Args:
        addrs: A single address or a list of addresses.

    Returns:
        The concatenated province and city, with missing parts omitted and
        '市县' collapsed to '市'.
    """
    candidates = addrs if isinstance(addrs, list) else [addrs]
    province = city = None
    for raw_addr in candidates:
        parsed = cpca.transform([raw_addr])
        if not province and parsed['省'][0]:
            province = parsed['省'][0]
        if not city and parsed['市'][0]:
            city = parsed['市'][0]
    merged = str(province) + str(city)
    # A part that was never found is still None here; strip its string form.
    return merged.replace('None', '').replace('市县', '市')

def change_edus(edus):
    """Extract the bracketed label from each education entry.

    Args:
        edus: Iterable of strings that each contain a ``[...]`` section.

    Returns:
        List with the text between the first '[' and the last ']' of each entry.

    Raises:
        IndexError: If an entry has no bracketed section.
    """
    labels = []
    for entry in edus:
        bracketed = re.findall(r'\[(.*)\]', entry)
        labels.append(bracketed[0])
    return labels

def delete_same_elem(list_like):
    """Return the distinct elements of ``list_like`` as a plain list.

    The work is done by ``np.unique``, so the result is also sorted.
    """
    as_array = np.array(list_like)
    distinct = np.unique(as_array)
    return distinct.tolist()

def change_years(years):
    """Pull every integer out of a years-of-experience string.

    Args:
        years: Text such as '1-3年'.

    Returns:
        The integers in order of appearance, or ``[0]`` when the text
        contains no digits.
    """
    found = [int(token) for token in re.findall(r'(\d+)', years)]
    return found if found else [0]


def change_skills(skills):
    """Split each skill string of the form 'name[LEVEL]' into ``[name, level]``.

    The previous version parsed a hard-coded sample string instead of the
    loop variable, so every entry came back identical. Its pattern also used
    ``[(\\w+)]``, which is a character class rather than a bracketed group.

    Args:
        skills: Iterable of skill strings, e.g. 'python编程软件[SKILLED]'.

    Returns:
        One list per input entry: ``[name, level]`` when a trailing
        ``[LEVEL]`` tag is present, otherwise ``[entry]`` unchanged.
    """
    parsed = []
    for skill in skills:
        # Lazy name group so the tag is taken from the final '[...]' only.
        match = re.fullmatch(r'\s*(.+?)\s*\[(\w+)\]\s*', skill)
        if match:
            parsed.append([match.group(1), match.group(2)])
        else:
            parsed.append([skill])
    return parsed

In [6]:
# Field map used by encode_base_data: normalised field name ->
# (columns read from a job row, columns read from a hunter row).
# The tuple is indexed by obj_type (0 = job, 1 = hunter).
# 'id' comes first because encode_base_data needs the record id before it can
# store any other field.
base_dict = {
    'id': (['job_id'], ['hunter_id']),
    'pos_name': (['position_name'], ['exp_position']),
    'job_wage': (['job_min_wage', 'job_max_wage', 'job_wage_kind'], ['exp_min_wage', 'exp_max_wage']),
    'job_kind': (['require_kind'], ['exp_require_kind']),
    'exp_edu': (['require_edu'], ['education_exps']),
    'job_years': (['require_exp'], ['hunter_exp']),
    'pos_keys': (['position_keys', 'company_ind'], ['exp_industry']),
    'cor_addr': (['company_addr', 'company_full_name'], ['exp_city']),
    'skill_keys': (['skill_keys'], ['skill_exps'])
}


In [7]:
def multi_index_to_one(item_list):
    """Flatten several stringified lists into one list.

    The previous version kept the ``+=`` inside the ``if not result`` branch,
    so the first non-empty item was appended twice and every later item was
    ignored. Each item is now evaluated once and concatenated in order.

    Args:
        item_list: Either a single string (returned wrapped in a list,
            without evaluation) or an iterable of strings that each hold a
            Python list literal such as "['互联网', '大数据']".

    Returns:
        A flat list with the elements of every evaluated item. An empty
        ``item_list`` yields ``[]``.
    """
    if isinstance(item_list, str):
        return [item_list]
    result = []
    for item in item_list:
        # NOTE(security): eval executes arbitrary expressions. The inputs come from
        # CSV cells; use ast.literal_eval if those files are not fully trusted.
        result += eval(item)
    return result

def try_to_eval(item):
    """Evaluate ``item`` as a Python expression, falling back to ``[item]``.

    Args:
        item: Usually a string holding a list literal. Anything that cannot
            be evaluated (plain text, NaN, ...) is returned wrapped in a
            one-element list.

    Returns:
        The evaluated value, or ``[item]`` when evaluation fails.
    """
    try:
        # NOTE(security): eval executes arbitrary expressions. The inputs come from
        # CSV cells; use ast.literal_eval if those files are not fully trusted.
        return eval(item)
    except Exception:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit still propagate.
        return [item]

In [35]:
# Fields whose sentences are also embedded with `model.encode`.
with_encode_items = ['pos_name', 'pos_keys', 'skill_keys']

def encode_base_data(obj, obj_type: int, dict_data: dict):
    """Normalise one job or hunter record and store it in ``dict_data``.

    For every entry of ``base_dict`` the matching columns are read from
    ``obj``, converted by the field-specific helper, and stored as
    ``dict_data[record_id][field]['sentence']``. Fields listed in
    ``with_encode_items`` also get a ``'vector'`` entry from ``model.encode``.

    A field that fails to parse is reported with ``print`` and skipped, so one
    bad column does not drop the whole record. If the id itself cannot be
    read, nothing is stored for this record.

    Args:
        obj: One row of the job or hunter table, indexable by a list of column
            names (presumably a pandas Series from ``DataFrame.iloc`` — confirm).
        obj_type: 0 for a job row, 1 for a hunter row.
        dict_data: Dictionary updated in place with the encoded record.
    """
    # 0: job, 1: hunter
    assert obj_type == 0 or obj_type == 1
    encode_result = {}
    obj_id = None
    for key, value in base_dict.items():
        value = value[obj_type]
        try:
            sentence = obj[value].values.tolist()

            if key == 'id':
                obj_id = str(sentence[0])
                encode_result[obj_id] = {}
            elif key == 'pos_name':
                sentence = try_to_eval(sentence[0])
            elif key == 'job_wage':
                sentence = change_wage(*sentence)
            elif key == 'job_kind':
                try:
                    sentence = require_kind_json[int(sentence[0])]
                except Exception:
                    # Missing or unknown code: fall back to code 1 ('不限').
                    sentence = require_kind_json[1]
            elif key == 'exp_edu':
                try:
                    # Succeeds when the cell holds a numeric education code.
                    sentence = [require_edu_json[sentence[0]]]
                except Exception:
                    # Otherwise treat the cell as a stringified list of entries
                    # with bracketed labels.
                    sentence = change_edus(try_to_eval(sentence[0]))
            elif key == 'job_years':
                try:
                    sentence = change_years(sentence[0])
                except Exception:
                    sentence = [0]
            elif key == 'pos_keys':
                sentence = delete_same_elem(multi_index_to_one(sentence))
            elif key == 'cor_addr':
                sentence = change_addrs(sentence)
            elif key == 'skill_keys':
                sentence = multi_index_to_one(sentence)

            if key != 'id':
                encode_result[obj_id][key] = {}
                encode_result[obj_id][key]['sentence'] = sentence
                if key in with_encode_items:
                    encode_result[obj_id][key]['vector'] = model.encode(sentence)

        except Exception as e:
            print(key, e)
            if obj_id is None:
                # Without an id there is nowhere to store the remaining fields;
                # stop instead of failing again on every later key.
                break

    dict_data.update(encode_result)

In [38]:
# Encoded job records: job id -> {field: {'sentence': ..., 'vector': ...}}.
job_base_dict = {} # key: {sentence: ..., vector: ..., }

# Only the first `size` rows are encoded here; use the commented line for a full run.
# size = job_data.shape[0]
size = 3
for index_ in tqdm(range(size)):
    job = job_data.iloc[index_,:]
    encode_base_data(job, 0, job_base_dict)
# NOTE(review): this prints every embedding vector in full and floods the cell output.
print(job_base_dict)
# print(sys.getsizeof(job_base_dict))

100%|██████████| 3/3 [00:00<00:00, 11.32it/s]

{'1631112859985510400': {'pos_name': {'sentence': ['会计实习生'], 'vector': array([[-1.14678180e+00, -8.65045428e-01,  6.46795869e-01,
         5.73629625e-02,  2.55323976e-01,  9.96494472e-01,
        -5.11009991e-01, -1.35019794e-01,  1.78562868e-02,
        -9.07505974e-02, -1.43027520e+00,  5.47744930e-01,
         1.20052600e+00, -4.10677135e-01, -8.17242324e-01,
        -6.43396229e-02,  2.57326752e-01,  5.72733998e-01,
        -6.02955759e-01, -5.06015003e-01,  3.49679708e-01,
         2.05754176e-01,  7.02525675e-01,  7.26058662e-01,
         1.00935884e-01, -7.93222636e-02, -7.92950094e-01,
         7.16165423e-01,  6.78854942e-01,  2.52983361e-01,
        -2.38565415e-01,  1.24983501e+00,  5.33190191e-01,
         3.58024418e-01, -7.89407432e-01,  6.18569911e-01,
        -6.29451156e-01,  2.26971600e-02, -6.57187283e-01,
         2.40629211e-01, -6.93873286e-01, -1.05203681e-01,
         5.22484481e-01,  2.95888394e-01, -1.01906858e-01,
        -1.75172016e-01, -6.55110836e-01, -3




In [37]:
# Encoded hunter records: hunter id -> {field: {'sentence': ..., 'vector': ...}}.
hunter_base_dict = {} # key: {sentence: ..., vector: ..., }

# Only the first `size` rows are encoded here; use the commented line for a full run.
# size = hunter_data.shape[0]
size = 3
for index_ in tqdm(range(size)):
    hunter = hunter_data.iloc[index_,:]
    encode_base_data(hunter, 1, hunter_base_dict)
# print(hunter_base_dict)
# print(sys.getsizeof(hunter_base_dict))

100%|██████████| 3/3 [00:00<00:00,  6.84it/s]


In [19]:
# pid = os.getpid()
# p = psutil.process(pid)
# info = p.memory_full_info()
# info.uss / 1024. / 1024. / 1024.

In [39]:
def every_multi_score(vector1, vector2, method='mean'):
    """Aggregate the best cosine match of every ``vector1`` row against ``vector2``.

    A cosine-similarity matrix is computed with ``util.cos_sim``. For each row
    (one entry of ``vector1``) the highest similarity to any entry of
    ``vector2`` is taken.

    Args:
        vector1: Embeddings of the first side.
        vector2: Embeddings of the second side.
        method: 'mean' averages the per-row maxima; any other value sums them.

    Returns:
        The aggregated score.
    """
    similarity = util.cos_sim(vector1, vector2).numpy()
    best_per_row = [row.max() for row in similarity]
    if method != 'mean':
        # Dot product with a vector of ones, i.e. the sum of the maxima.
        return np.dot(best_per_row, [1] * len(best_per_row))
    return np.mean(best_per_row)

In [41]:
# Education label -> numeric rank (inverse of require_edu_json).
require_edu_re_json = {'不限': 0, '技工': 1, '大专': 2, '本科': 3, '硕士': 4, '博士': 5}

def get_max_edu_level(sentence):
    """Return the highest education rank among the given labels.

    Args:
        sentence: Iterable of education labels that are keys of
            ``require_edu_re_json``.

    Returns:
        The largest rank found, or 0 when ``sentence`` is empty. Empty
        education lists do occur in the data, and ``np.max`` would raise on
        them. The 0 matches how ``cal_base_score`` treats missing education.

    Raises:
        KeyError: If a label is not in ``require_edu_re_json``.
    """
    level = [require_edu_re_json[edu] for edu in sentence]
    if len(level) == 0:
        return 0
    return np.max(level, keepdims=False)

In [None]:
def calc_base_score(job_data: dict, hunter_data: dict):
    """Score how well one encoded hunter matches one encoded job.

    Both arguments are single records as produced by ``encode_base_data``:
    ``{field: {'sentence': ..., 'vector': ...}}``, where ``'vector'`` exists
    only for the fields in ``with_encode_items``. Note that the parameter
    names shadow the notebook-level DataFrames of the same name.

    Per-field scores are multiplied together, like the ``multi_score`` product
    in the earlier scoring cells, so a hard mismatch (score 0) zeroes the
    total.

    Field rules:
        * embedded fields: ``every_multi_score`` on the vectors. An empty job
          side scores 1. An empty hunter side uses the constants from
          ``cal_base_score`` (0.8 for pos_keys, 0.6 for skill_keys).
        * job_wage: not scored yet (neutral 1.0).
        * job_kind: 1 if both labels are equal, else 0.
        * exp_edu: 1 if the hunter's highest level reaches the job's.
        * job_years: 1 if the hunter's maximum reaches the job's minimum.
        * cor_addr: 1 for the same city, 0.5 for the same province only, 0
          otherwise. NOTE(review): these tiers are provisional; the original
          cell stopped after parsing the addresses.

    Args:
        job_data: Encoded job record.
        hunter_data: Encoded hunter record.

    Returns:
        The product of all field scores as a float.
    """
    # Penalties taken from cal_base_score for a hunter who lists nothing.
    empty_hunter_score = {'pos_keys': 0.8, 'skill_keys': 0.6}
    addr_pattern = r'(.*)省(.*)市'

    base_score = 1.0
    for key, job_item in job_data.items():
        hunter_item = hunter_data.get(key)
        if hunter_item is None:
            # encode_base_data skips fields it could not parse; treat as neutral.
            continue
        sentence1, sentence2 = job_item['sentence'], hunter_item['sentence']
        score = 1.0
        if key in with_encode_items:
            if len(sentence1) == 0:
                score = 1.0
            elif len(sentence2) == 0:
                score = empty_hunter_score.get(key, 1.0)
            else:
                # Vectors exist only for embedded fields, so read them here.
                score = float(every_multi_score(job_item['vector'], hunter_item['vector']))
        elif key == 'job_wage':
            # TODO: wage comparison is not defined yet; keep it neutral.
            score = 1.0
        elif key == 'job_kind':
            score = float(sentence1 == sentence2)
        elif key == 'exp_edu':
            # Guard empty lists locally so this does not depend on the helper.
            required = get_max_edu_level(sentence1) if len(sentence1) else 0
            attained = get_max_edu_level(sentence2) if len(sentence2) else 0
            score = float(required <= attained)
        elif key == 'job_years':
            score = float(np.min(sentence1) <= np.max(sentence2))
        elif key == 'cor_addr':
            if not sentence1 or not sentence2:
                # No address on one side: no constraint, as in cal_base_score.
                score = 1.0
            else:
                parsed1 = re.findall(addr_pattern, sentence1)
                parsed2 = re.findall(addr_pattern, sentence2)
                if parsed1 and parsed2:
                    province1, city1 = parsed1[0]
                    province2, city2 = parsed2[0]
                    if province1 != province2:
                        score = 0.0
                    elif city1 == city2:
                        score = 1.0
                    else:
                        score = 0.5
                else:
                    # Addresses without the 省/市 pattern: compare whole strings.
                    score = float(sentence1 == sentence2)
        base_score *= score
    return base_score

In [21]:
def cal_base_score(base_data):
    """Append a match score to every field of a parsed job/hunter pair.

    Earlier scoring routine. ``base_data`` maps a field name to a list whose
    first element is the job value and second the hunter value. A score is
    appended to each list in place and the same dictionary is returned.

    NOTE(review): ``every_multi_score`` as currently defined passes its
    arguments straight to ``util.cos_sim``, while the recorded output suggests
    the values here are raw strings. This function presumably relied on an
    older version of that helper — confirm before reuse.

    Args:
        base_data: ``{field: [job_value, hunter_value]}``; mutated in place.

    Returns:
        ``base_data`` with the score appended as the third element of each list.
    """
    # multi_score = {}
    for key, value in base_data.items():
        # print(value[0], value[1])
        if key == 'pos_name': # one to multi
            # print(value[0], value[1])
            score = every_multi_score(value[0], value[1])
            # val1 = model.encode(value[0])
            # val2 = model.encode(value[1])
            # score = np.max([util.cos_sim(val1[0], val) for val in val2], keepdims=False)
            # print(score)
        elif key == 'job_wage':
            # Wage ranges are compared as embedded text, not numerically.
            score = util.cos_sim(model.encode(str(value[0])),
                                 model.encode(str(value[1])))
        elif key == 'job_kind':
            score = (1 if value[0] == value[1] else 0)
        elif key == 'exp_edu':
            # score = (1 if value[0] == value[1] else 0)
            # An empty education list counts as rank 0; only the first entry is used.
            val1 = (require_edu_re_json[value[0][0]] if len(value[0]) != 0 else 0)
            val2 = (require_edu_re_json[value[1][0]] if len(value[1]) != 0 else 0)
            score = (1 if val1 <= val2 else 0)
        elif key == 'job_years':
            # A job value of [0] means no experience requirement.
            if len(value[0]) == 1 and value[0][0] == 0: score = 1
            else:
                if len(value[0]) == 1:
                    score = (1 if value[0][0] in value[1] else 0)
                else:
                    score = util.cos_sim(model.encode(str(value[0])),
                                        model.encode(str(value[1])))
        elif key == 'pos_keys':
            # No job keywords: no constraint. No hunter keywords: fixed 0.8.
            if len(value[0]) == 0: score = 1
            elif len(value[1]) == 0: score = 0.8
            else:
                # val1 = model.encode(value[0])
                # val2 = model.encode(value[1])
                # score = util.cos_sim(val1, val2)
                score = every_multi_score(value[0], value[1])
        elif key == 'cor_addr':
            if len(value[0]) == 0 or len(value[1]) == 0: score = 1
            else:
                val1 = model.encode(value[0])
                val2 = model.encode(value[1])
                score = util.cos_sim(val1, val2)
        elif key == 'skill_keys':
            # No job skills: no constraint. No hunter skills: fixed 0.6.
            if len(value[0]) == 0: score = 1
            elif len(value[1]) == 0: score = 0.6
            else:
                # val1 = model.encode(value[0])
                # val2 = model.encode(value[1])
                # score = util.cos_sim(val1, val2)
                score = every_multi_score(value[0], value[1])

        else:
            # Any other field (e.g. 'id') is neutral.
            score = 1

        if isinstance(score, torch.Tensor):
            # Reduce a similarity matrix to its largest entry, floored at 0.
            # print(score)
            score_ = score.numpy()
            score = .0
            for item in score_:
                score = max(score, *item)
            # print(score)
        # multi_score[key] = score
        base_data[key].append(score)
        # print(type(score.int))
        # print(score)
        # multi_score *= score
        # print('total score', multi_score)
    # print(base_data)
    return base_data


In [22]:
# Score job row i against hunter row i for every third row below 80 and print
# the per-field scores plus their product.
# NOTE(review): `base_parse` is not defined in any visible cell, so this cell
# relies on leftover kernel state and fails after Restart & Run All.
for index_ in range(0, 80, 3):
    job = job_data.iloc[index_,:]
    hunter = hunter_data.iloc[index_,:]
    base_parse_data = base_parse(job, hunter)
    base_data = cal_base_score(base_parse_data)
    print(base_data)
    multi_score = 1
    # value[2] is the score appended by cal_base_score.
    for key, value in base_data.items():
        multi_score *= value[2]
    print('total', multi_score)

{'id': [['1631112859985510400'], ['1632947816114552832'], 1], 'pos_name': [['会计实习生'], ['数据分析师', '数据挖掘工程师', '机器学习工程师'], 0.42128882], 'job_wage': [[3500, 7000], [6000, 7000], 0.8101593], 'job_kind': [['全职'], ['全职'], 1], 'exp_edu': [['大专'], ['本科'], 1], 'job_years': [[0], [1], 1], 'pos_keys': [['互联网', '人工智能', '电子商务'], ['不限'], 0.29863697], 'cor_addr': [['广东省深圳市'], ['广东省广州市'], 0.69438136], 'skill_keys': [[], [['python编程软件', 'SKILLED'], ['python编程软件', 'SKILLED'], ['python编程软件', 'SKILLED']], 1]}
total 0.07077697454036762
{'id': [['1613439889204969472'], ['1631284316023685120'], 1], 'pos_name': [['自然语言处理工程师'], ['数据分析师', '其他', '数据挖掘工程师'], 0.51394564], 'job_wage': [[2000, 4000], [3000, 6000], 0.77671427], 'job_kind': [['实习'], ['实习'], 1], 'exp_edu': [['本科'], [], 0], 'job_years': [[0], [0], 1], 'pos_keys': [['互联网', '数据服务', '自然语言'], ['不限'], 0.3878205], 'cor_addr': [['广东省广州市'], ['广东省深圳市'], 0.69438136], 'skill_keys': [['算法研究', '深度学习'], [], 0.6]}
total 0.0
{'id': [['1613434566444449792'], ['16308524511

In [23]:
# Score the first 100 jobs against the first 100 hunters.
# result: job id -> {hunter id: product of the per-field scores}.
# NOTE(review): `base_parse` is not defined in any visible cell. This cell also
# re-encodes text for each of the 10,000 pairs, and the recorded run was
# interrupted; encoding each record once first would avoid that.
result = {}
for i in range(0, 100):
    job = job_data.iloc[i,:]
    for j in range(0, 100):
        hunter = hunter_data.iloc[j,:]
        base_parse_data = base_parse(job, hunter)
        base_data = cal_base_score(base_parse_data)
        # print(base_data)
        multi_score = 1
        for key, value in base_data.items():
            multi_score *= value[2]
        # print('total', multi_score)
        # Add to the job's inner dict. Assigning a fresh dict here kept only
        # the last hunter for each job.
        result.setdefault(job['job_id'], {})[hunter['hunter_id']] = multi_score

KeyboardInterrupt: 

In [112]:
# cal_base_score(base_parse_data)

In [77]:
# Debug view of the raw fields of the last `job` / `hunter` rows left over from
# the loops above (both names are loop variables, not defined in this cell).
# NOTE(security): eval runs on CSV cell contents; acceptable only for trusted files.
print(job['position_name'], change_wage(job['job_min_wage'], job['job_max_wage'], job['job_wage_kind']), job['require_kind'], job['require_edu'], 
      job['require_exp'] , eval(job['position_keys']) + eval(job['company_ind']), job['company_addr'], job['skill_keys'])
print(hunter['exp_position'], change_wage(hunter['exp_min_wage'], hunter['exp_max_wage']), int(hunter['exp_require_kind']), hunter['education_exps'], 
      hunter['hunter_exp'], hunter['exp_industry'], hunter['exp_city'], hunter['skill_exps'])

数据运营专员 [7000, 10000] 2 3 1-3年 ['互联网', '大数据', '互联网', '大数据'] 广东省深圳市 []
['Hadoop大数据开发工程师', '数据挖掘工程师'] [4000, 6000] 2 [] 无经验 [] 广东省广州市天河区 []


In [78]:
# for sentence, embedding in zip(sentences, embeddings):
#     print("Sentence:", sentence)
#     print("Embedding:", embedding.shape)

In [30]:
# Cosine similarity of the first embedding against the next three, to four decimals.
# NOTE(review): `embeddings` is only assigned in later cells of this notebook,
# so this cell depends on out-of-order execution.
for other_index in (1, 2, 3):
    sim = util.cos_sim(embeddings[0], embeddings[other_index])
    print("{0:.4f}".format(sim.tolist()[0][0]))

0.4090
0.3398
-0.0029


In [8]:
# hits = util.semantic_search(queries_embeddings, corpus_embeddings, top_k=2)

In [35]:
def get_seg_words(text : str):
    """Segment ``text`` with jieba and drop punctuation and stop words.

    Args:
        text: Raw text to tokenise.

    Returns:
        The jieba tokens in their original order, without the punctuation
        marks listed below and without NLTK's "chinese" stop words.
    """
    punctuation = (',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%',
                   '，', '。', '：', '；', '？', '（', '）', '【', '】', '！', '￥', ' ', '、', '-')
    tokens = [token for token in jieba.lcut(text) if token not in punctuation]
    stop_words = set(nltk.corpus.stopwords.words("chinese"))
    return [token for token in tokens if token not in stop_words]

In [37]:
# job_text = job_text.replace('。', '. ')
# # res = nltk.tokenize.word_tokenize(job_text)
# print(job_text)

# sentence = nltk.sent_tokenize(job_text)
# print(sentence)
# # job_text = ''.join(re.findall(r'[\u4e00-\u9fa5]', job_text))

# seg_list = jieba.lcut(job_text)
# print(seg_list)

# interpunctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%', 
#                     '，', '。', '：', '；', '？', '（', '）', '【', '】', '！', '￥', ' ']
# seg_list = [word for word in seg_list if word not in interpunctuations]
# print(seg_list)

# stops = set(nltk.corpus.stopwords.words("chinese"))
# seg_list = [word for word in seg_list if word not in stops]
# print(seg_list)
# # text = nltk.Text(seg_list)

# type_tag = nltk.pos_tag(seg_list)
# print(type_tag)

# cut_word = []
# for word in seg_list:
#     cut_word.append(nltk.stem.PorterStemmer().stem(word))
# print(cut_word)

In [1]:
# job_sentences, hunter_sentences

In [39]:
# Embed the two inputs and print their cosine similarity.
# NOTE(review): `job_sentences` and `hunter_sentences` are not assigned in any
# visible cell; this relies on leftover kernel state.
sentences = [job_sentences, hunter_sentences]
embeddings = model.encode(sentences)
sim = util.cos_sim(embeddings[0], embeddings[1])
print("{0:.4f}".format(sim.tolist()[0][0]))

0.4324


In [40]:
# Same comparison on `job_text` / `hunter_text`.
# NOTE(review): neither name is assigned in any visible cell; this relies on
# leftover kernel state. The cell also overwrites `sentences` and `embeddings`.
sentences = [job_text, hunter_text]
embeddings = model.encode(sentences)
sim = util.cos_sim(embeddings[0], embeddings[1])
print("{0:.4f}".format(sim.tolist()[0][0]))

0.5019
