In [36]:
from gensim import corpora, models, similarities
from gensim.models import Word2Vec,keyedvectors 
from gensim.models.word2vec import LineSentence
from gensim.corpora import Dictionary
import jieba.posseg as jp, jieba
import pandas as pd
import numpy as np
from string import punctuation
import re
import sys
from collections import Counter

jieba.enable_paddle()

add_punc='，。、【 】 “”：；（）《》‘’{}？！⑦()、%^>℃：.”“^-——=&#@￥'
all_punc=punctuation+add_punc

Paddle enabled successfully......


In [8]:
model_path='./model/baike_26g_news_13g_novel_229g.model'
job_path='./model/职业分类大典.xlsx'
job_post_path='./model/招聘20210611.xlsx'
test_data_path='./model/sample.txt'
test_input="前台行政"

In [3]:
model=Word2Vec.load(model_path)
word_vectors = model.wv

In [4]:
#jieba精准分词
def jieba_cut(sentence, word_vectors):
    word_list = []
    seg_list = jieba.cut_for_search(sentence)
    for word in seg_list:
        if word != '' and word is not None and word not in all_punc and word in word_vectors.key_to_index:
            word_list.append(word)
    return word_list


In [5]:
# keep chinese only
def keep_all_chinese(file):
    pattern = re.compile(r'[^\u4e00-\u9fa5]')
    chinese=re.sub(pattern,'',file)
    return chinese

In [6]:
def create_job_categories_map(job_dictionary_file):
    job_categories_dic=dict()
    job_categories_describ_list_dic=dict()
    for index, row in job_dictionary_file.iterrows():
        job_id=row['职业代码']
        if row['定义'] and not row['定义'].isspace():
            job_describ= row['定义']+row['职责']+row['大类']+row['中类']+row['小类']+row['细类']
            job_categories_dic[job_id]=row['大类']+row['中类']+row['小类']+row['细类']
            job_describ_word_list=list(set(jieba_cut(keep_all_chinese(job_describ),word_vectors)))
            job_categories_describ_list_dic[job_id]=job_describ_word_list
    
    return job_categories_dic, job_categories_describ_list_dic

In [7]:
def find_job_category(job_name, job_categories_describ_list_dic, job_categories_dic):
    job_name_list=[]
    seg_list = jieba_cut(job_name, word_vectors)
    for word in seg_list:
        if word != '' and word is not None and word not in all_punc and word in word_vectors.key_to_index:
            job_name_list.append(word)
    #1st compare with title
    job_scores=dict()   
    for key in job_categories_dic:
        job_category_name=list(set(jieba_cut(keep_all_chinese(job_categories_dic[key]),word_vectors)))
        job_scores[key]=word_vectors.n_similarity(job_category_name,job_name_list)

    job_scores = sorted(job_scores.items(),key=lambda d:d[1],reverse=True)
    #2nd compare with descriptio
    job_scores_des=dict()   
    for index in range(0,10):
        category_1=job_scores[index][0]
        description = job_categories_describ_list_dic[category_1]
        job_scores_des[category_1]=word_vectors.n_similarity(description,job_name_list)
    # Return top job
    return sorted(job_scores_des.items(),key=lambda d:d[1],reverse=True)

In [12]:
# Load job list
data = pd.read_excel(job_path, keep_default_na=False)
job_dictionary_file = pd.DataFrame(data, columns= ['职业代码','大类','中类','小类','细类','职责','定义'])
job_categories_dic,job_categories_describ_list_dic=create_job_categories_map(job_dictionary_file)

posted_jobs_data = pd.read_excel(job_post_path, keep_default_na=False)
posted_jobs = pd.DataFrame(posted_jobs_data, columns= ['职位名称','职位描述']).values.tolist()

In [29]:
#f = open("result.txt", "a")

for post_job in posted_jobs:
    post_job_descri_chinese=keep_all_chinese(post_job[1])
    if len(post_job_descri_chinese) != 0:
        job_scores_des = find_job_category(post_job_descri_chinese, job_categories_describ_list_dic, job_categories_dic)
        print('post_job: {} category&scores: {} {}'.format(post_job[0], job_categories_dic[job_scores_des[0][0]],job_scores_des[0]))
#f.close()

post_job: 报告厅文员 (MJ000752) category&scores: 专业技术人员工程技术人员测绘工程技术人员地图制图与印刷工程技术人员 (2020204, 0.839751)
post_job: 建筑设计主管/经理 category&scores: 专业技术人员工程技术人员管理（工业）工程技术人员质量管理与可靠性控制工程技术人员 (2023405, 0.9038316)
post_job: 前台行政 category&scores: 办事人员和有关人员行政办公人员行政事务人员秘书 (3010201, 0.8821837)
post_job: 西餐厅前台预定礼仪接待 category&scores: 商业、服务业人员饭店、旅游及健身娱乐场所服务人员健身和娱乐场所服务人员康乐服务员 (4040303, 0.7837907)
post_job: 文员 category&scores: 专业技术人员经济业务人员统计人员统计人员 (2060200, 0.80048907)
post_job: 链家储备经理 7K+ 五险一金 category&scores: 专业技术人员工程技术人员管理（工业）工程技术人员人力资源开发与管理工程技术人员 (2023407, 0.84289706)
post_job: 消防中控员 category&scores: 专业技术人员工程技术人员交通工程技术人员船舶检验工程技术人员 (2021805, 0.86760944)
post_job: 会议弱电工 category&scores: 专业技术人员工程技术人员交通工程技术人员船舶检验工程技术人员 (2021805, 0.8469416)
post_job: 办公室文员,销售助理,销售文员 category&scores: 专业技术人员工程技术人员管理（工业）工程技术人员生产组织与管理工程技术人员 (2023404, 0.86960554)
post_job: 客服+无责底薪3k5+长白班 category&scores: 专业技术人员新闻出版、文化工作人员图书资料与档案业务人员图书资料业务人员 (2120601, 0.7830606)
post_job: 石楼石基市桥置业顾问 category&scores: 商业、服务业人员社会服务和居民生活服务人员社会中介服务人员职业指导员 

KeyboardInterrupt: 

['项',
 '科研',
 '发展',
 '本',
 '管理',
 '的',
 '单位',
 '财物',
 '人财物',
 '拟定',
 '与',
 '鉴定',
 '国家',
 '督促',
 '和',
 '制定',
 '方案',
 '协调',
 '计划',
 '等',
 '质量',
 '及',
 '任务',
 '综合',
 '工作',
 '进度',
 '规划',
 '评审',
 '理论',
 '部门',
 '科研任务',
 '检查',
 '科研项目',
 '研究',
 '申请',
 '督促检查',
 '实验报告',
 '撰写',
 '实验',
 '有关',
 '实施',
 '实践',
 '项目',
 '报告',
 '组织']

In [55]:
job_scores = find_job_category('中介代理人', job_categories_list_dic)
print(job_scores)

[(4070101, 0.6039573), (2070302, 0.60136855), (2070106, 0.58514875), (2070107, 0.581261), (4040103, 0.551987), (2070301, 0.5504004), (4010102, 0.5482718), (2060500, 0.5250774), (4010201, 0.52285856), (4040101, 0.5216352), (3030104, 0.51121795), (3020201, 0.5066504), (4010404, 0.5039185), (2070108, 0.50363135), (4010701, 0.50227976), (1050103, 0.49902308), (1050101, 0.49633983), (2070109, 0.49553373), (2070303, 0.49423128), (4010702, 0.49164355), (1050102, 0.48451138), (2080300, 0.48049617), (2080400, 0.47917122), (4040303, 0.4775648), (4040102, 0.47628507), (4010301, 0.47616917), (4040202, 0.47266734), (1020200, 0.4718039), (4020201, 0.47148836), (2070103, 0.47113985), (4010402, 0.46975932), (1020500, 0.46800572), (2070102, 0.46772945), (4050102, 0.46299592), (4010101, 0.4619871), (4010302, 0.46106672), (4010403, 0.45705837), (4019901, 0.4557836), (3030102, 0.45387965), (3030106, 0.45185488), (3030202, 0.45119402), (1020300, 0.4496955), (4040201, 0.4487658), (3010201, 0.44494724), (401

In [38]:
word_count = Counter()
with open(test_data_path) as fp:
    while True:
        line = fp.readline()
        if not line:
            break
        
        split_content = line.split('@!')
        if len(split_content) < 14:
            line_2 = fp.readline()
            if not line_2:
                break    
            line = line + line_2
            split_content = line.split('@!')
        
        job_title = split_content[10]
        job_title_word_list=list(set(jieba_cut(keep_all_chinese(job_title),word_vectors)))
        
        word_count.update(job_title_word_list)
        
print("{}".format(word_count.most_common(10)))

[('专员', 3), ('人事', 2), ('行政', 2), ('关系', 2), ('员工', 2)]
