In [1]:
import os
from datetime import datetime
from getpass import getpass

import numpy as np
import pandas as pd
from py2neo import Graph, Node, Relationship, NodeMatcher
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [2]:
# Input workbooks: job postings and resumes.
joblist_path = '../../../demo3/Q3_jobList.xlsx'
resume_path = '../../../demo3/Q3_resume.xlsx'

# Output files: the de-duplicated resumes and the list of duplicated resume ids.
resume_duplicate = '../../../demo3/resume_duplicate.xlsx'
resume_duplicate_id = '../../../demo3/重复简历.csv'

# Resume columns that are read as text instead of letting pandas infer a dtype.
resume_type = dict.fromkeys(
    ('id', 'age', 'edutime_days', 'worktime_days', 'traintime_days', 'projecttime_days'),
    str,
)

In [3]:
def clean_data(data):
    """Strip bracket/quote noise and blank out placeholder cells, in place.

    Args:
        data: pandas object supporting ``replace(..., inplace=True)``; it is
            modified in place.

    Returns:
        The same object that was passed in.
    """
    # Remove these characters wherever they occur inside a cell: square and
    # CJK brackets, straight and curly quotes, and a literal backslash
    # followed by "t".
    data.replace(r'\[|\]|【|】|\'|"|“|”|\\t', '', regex=True, inplace=True)
    # Cells that consist entirely of a placeholder (or are missing) become ''.
    for placeholder in ('无', '未填写', np.nan, '无,无,无'):
        data.replace(placeholder, '', inplace=True)
    return data

In [4]:
# Load the resumes, trim surrounding whitespace from every id, and turn
# missing values into empty strings.
# (Original author's note: in Jupyter a "\t" is not necessarily a tab
# character; it may also be a literal string.)
resume_inf = (
    pd.read_excel(resume_path, dtype=resume_type)
    .assign(id=lambda frame: frame['id'].apply(str.strip))
    .replace(np.nan, '')
)

In [5]:
# Vocabulary of every distinct skill name found in `skill_info`.
# Each cell is a comma-separated list of entries; the name is the text before
# the first ':' of an entry, and empty entries are skipped.
# The vocabulary is sorted so that its order (and therefore the ordering that
# `sort_skill` derives from it) is the same on every kernel run instead of
# depending on set iteration order.
# NOTE: a later cell rebinds `resume_skill` to a list of column names, so this
# cell must be re-run before `sort_skill` is used again.
resume_skill = sorted({
    entry.split(':')[0]
    for skills in resume_inf['skill_info']
    for entry in skills.split(',')
    if entry != ""
})

In [6]:
def sort_skill(x):
    """Return the skill names of one `skill_info` cell in a canonical order.

    Args:
        x: comma-separated string of entries; for each entry only the text
            before the first ':' is kept, and empty names are dropped.

    Returns:
        ``str()`` of the list of names, ordered by each name's position in the
        module-level ``resume_skill`` list. Names missing from that list get
        position -1 and therefore come first; equal positions keep their
        input order because the sort is stable.
    """
    def position(name):
        # One scan of the vocabulary per name instead of `in` plus `.index`.
        try:
            return resume_skill.index(name)
        except ValueError:
            return -1

    names = (entry.split(':')[0] for entry in x.split(','))
    # A single stable sort is enough; the original sorted the list twice.
    return str(sorted((name for name in names if name), key=position))

# Canonically ordered skill names per resume, stored as a string.
resume_inf['skill'] = resume_inf['skill_info'].map(sort_skill)

In [7]:
# Keep only the text before the first ':' of each `cert_info` cell.
# NOTE(review): if a cell holds several comma-separated certificates, only the
# first name survives — confirm this is intended.
resume_inf['cert_info_cert'] = resume_inf['cert_info'].apply(lambda cert: cert.partition(':')[0])


计算重复样本

In [8]:
# Drop duplicate resumes.
# Columns ignored when comparing rows; `.remove` raises if one is missing, so
# an unexpected schema fails loudly.
resume_columns = resume_inf.columns.to_list()
for i in ['concatenated', 'cert_info', 'birthday', 'age', 'skill_info']:
    resume_columns.remove(i)

# Rows are compared on every remaining column except the first three.
# NOTE(review): presumably those three are index/id columns — confirm.
duplicates = resume_inf.duplicated(subset=resume_columns[3:], keep=False)
# A bare expression in the middle of a cell is discarded; print it instead.
print(duplicates.value_counts())

# Keep the first row of each duplicate group. `reset_index(drop=True)` gives a
# clean 0..n-1 index without creating an index column that has to be dropped
# again (the original dropped 'level_0', which only exists when the frame
# already has a column called 'index').
resume_inf_d = (
    resume_inf
    .drop_duplicates(subset=resume_columns[3:], keep='first')
    .reset_index(drop=True)
    .drop(columns=['skill', 'cert_info_cert'])
)
resume_inf_d.to_excel(resume_duplicate, index=False)

In [15]:
# Report which resume ids are duplicates of each other.
duplicate_rows = resume_inf[resume_inf.duplicated(subset=resume_columns[3:], keep=False)]
duplicate_groups = duplicate_rows.groupby(by=resume_columns[3:])

# One record per duplicate group: the first id and the remaining ids.
# Records are collected in a list and turned into a frame once, instead of
# growing the frame row by row with `.loc`.
duplicate_records = []
for _, group in duplicate_groups:
    group_ids = group['id'].tolist()
    duplicate_records.append({
        '重复样本': str(group_ids[0]),
        '其它重复样本': str(group_ids[1:]),
    })
resume_duplicates = pd.DataFrame(duplicate_records, columns=['重复样本', '其它重复样本'])

# `str(list)` leaves brackets and quotes in the text; clean_data removes them.
resume_duplicates = clean_data(resume_duplicates)
# Trim surrounding whitespace from the first id of each group.
resume_duplicates['重复样本'] = resume_duplicates['重复样本'].apply(lambda x: x.strip())
resume_duplicates.to_csv(resume_duplicate_id)

制作知识图谱

In [None]:
# Connect to the Neo4j database.
# Address and user can be overridden through NEO4J_URI / NEO4J_USER. The
# password is read from NEO4J_PASSWORD, or prompted for when that variable is
# unset, so that no credential is stored in the notebook.
graph = Graph(
    os.environ.get('NEO4J_URI', 'http://localhost:7474/'),
    auth=(
        os.environ.get('NEO4J_USER', 'neo4j'),
        os.environ.get('NEO4J_PASSWORD') or getpass('Neo4j password: '),
    ),
)
# WARNING: this deletes everything in the connected graph before the build.
graph.delete_all()

In [None]:
# Field tables for the resume graph. Each group holds three parallel lists:
# the DataFrame column, the node label, and the relationship type used to link
# the resume's ID node to the value node.

# Short fields holding a single value.
resume_singleword = ['id', 'username',  'address', 'province', 'county', 'region','sex','job_wanted_status', 'arrivalTime',
                       'birthday','exp', 'political','workNature', 'willSalaryStart', 'willSalaryEnd', 
                       'worktime_days','projecttime_days', 'traintime_days', 'edutime_days', 
                       'is_graduate']
resume_singleword_ch = ['ID', '用户名', '地址', '省', '市', '区', '性别', '求职状态', '上岗时间', '生日', '经验', '政治面貌', '工作性质', '最低薪资', '最高薪资', '工作时间', '项目时间', '训练时间', '教育时间', '应届身份']
resume_singleword_relation = ['ID', '用户名', '居住地', '期望省', '期望市', '期望区', '性别', '求职状态','上岗时间', '生日', '经验', '政治面貌', '是否兼职', '期望最低薪资', '期望最高薪资', '工作时间', '项目时间', '训练时间', '教育时间', '是否应届']

# Short fields holding several values separated by ','.
resume_multiword = ['expectPosition','expectIndustry',
                      'workEp_companies','workEp_industries','workEp_positionNames',
                      'projectEP_companies','projectEP_projectNames','projectEP_roleNames',
                      'trainEp_orgName', 'trainEP_recordName',
                      'eduEp_educationBackgrounds','eduEP_schools', 'eduEP_specialities']
resume_multiword_ch = ['岗位', '行业', '公司', '行业', '岗位', '公司', '项目名称', '项目人员', '公司', '培训项目', '学历', '学校', '专业']
# The relation for 'workEp_companies' was '原单元', a typo for '原单位' (compare
# the neighbouring '原单位行业' and '原单位岗位').
# NOTE(review): update any existing query that used the old label.
resume_multiword_relation = ['期望岗位', '期望行业', '原单位', '原单位行业', '原单位岗位', '项目所属公司', '项目名称', '项目身份', '培训机构', '培训项目', '教育背景', '学校', '专业']

# Skill-like fields: key:value pairs separated by ','.
# NOTE: this rebinds `resume_skill`, which an earlier cell uses as the skill
# vocabulary read by `sort_skill`.
resume_skill = ['com_info', 'cert_info', 'lang_info', 'skill_info', 'keyword_info']
resume_skill_ch = ['竞赛情况', '证书情况', '语言情况', '技能', '关键词情况']
resume_skill_relation = ['竞赛情况', '证书情况', '语言情况', '技能情况', '关键词情况']

# Long fields reduced to keywords separated by ','.
resume_keyword = ['selfEvaluation_keywords','workEP_descriptions_keywords','projectEp_descriptions_keywords','projectEP_achievements_keywords','trainEP_description_keywords']
resume_keyword_ch = ['自我评价关键词', '工作描述关键词', '项目描述关键词', '项目成就关键词', '培训描述关键词']
resume_keyword_relation = ['自我评价', '工作描述', '项目描述', '项目成就', '培训描述']

# The lists are consumed by position, so each group must stay aligned.
assert len(resume_singleword) == len(resume_singleword_ch) == len(resume_singleword_relation)
assert len(resume_multiword) == len(resume_multiword_ch) == len(resume_multiword_relation)
assert len(resume_skill) == len(resume_skill_ch) == len(resume_skill_relation)
assert len(resume_keyword) == len(resume_keyword_ch) == len(resume_keyword_relation)

In [None]:
# Create (or reuse) the node for a single-valued field.
def single_node(data, column_ch, column_relation, node_id):
    """Ensure a node named ``str(data)`` carries ``column_ch`` and link it.

    The node is looked up by its ``name`` property only, whatever its labels.
    A missing node is created with label ``column_ch``; an existing node
    without that label gets the label added. Unless ``column_ch`` is 'ID', a
    ``column_relation`` relationship from ``node_id`` to the node is created.
    """
    value = str(data)
    found = NodeMatcher(graph).match(name=value).first()

    if found is None:
        found = Node(column_ch, name=value)
        graph.create(found)
    elif column_ch not in found.labels:
        found.add_label(column_ch)
        graph.push(found)

    # The ID node is the anchor itself, so it gets no relationship.
    if column_ch != 'ID':
        create_relation(node_id, found, column_relation)

# Create (or reuse) one node per skill name.
def skill_node(data, column_ch, column_relation, node_id):
    """Link ``node_id`` to a node for every skill name in ``data``.

    Args:
        data: comma-separated string of entries; the skill name is the text
            before the first ':' of each entry.
        column_ch: label the skill nodes must carry.
        column_relation: relationship type from ``node_id`` to each skill node.
        node_id: node the relationships start from.

    Nodes are looked up by their ``name`` property only. A missing node is
    created with ``column_ch``; an existing one gets the label added if absent.
    """
    matcher = NodeMatcher(graph)
    for entry in data.split(','):
        name = entry.split(':')[0]
        # '', 'a,,b' or a trailing comma yield empty names; skip them instead
        # of creating a node called '' that unrelated rows would all link to.
        if not name:
            continue
        node_skill = matcher.match(name=name).first()
        if node_skill is None:
            node_skill = Node(column_ch, name=name)
            graph.create(node_skill)
        elif column_ch not in node_skill.labels:
            node_skill.add_label(column_ch)
            graph.push(node_skill)
        create_relation(node_id, node_skill, column_relation)

# Create (or reuse) one node per comma-separated value.
def word_split_node(data, column_ch, column_relation, node_id):
    """Link ``node_id`` to a node for every comma-separated token in ``data``.

    Args:
        data: string of values separated by ','.
        column_ch: label the value nodes must carry.
        column_relation: relationship type from ``node_id`` to each value node.
        node_id: node the relationships start from.

    Nodes are looked up by their ``name`` property only. A missing node is
    created with ``column_ch``; an existing one gets the label added if absent.
    """
    matcher = NodeMatcher(graph)
    for token in data.split(','):
        # Consecutive or trailing commas yield empty tokens; skip them instead
        # of creating a node called ''.
        if not token:
            continue
        node_keyword = matcher.match(name=token).first()
        if node_keyword is None:
            node_keyword = Node(column_ch, name=token)
            graph.create(node_keyword)
        elif column_ch not in node_keyword.labels:
            node_keyword.add_label(column_ch)
            graph.push(node_keyword)
        create_relation(node_id, node_keyword, column_relation)

# Link the ID node to another node.
def create_relation(node_id, node, column_relation):
    """Create a ``column_relation`` relationship from ``node_id`` to ``node``."""
    graph.create(Relationship(node_id, column_relation, node))

def create_node(data, columns, columns_ch, column_relation, type):
    """Build nodes and relationships for the given columns of every row.

    Args:
        data: DataFrame addressed with ``.loc[0..n-1, column]``, so its index
            must run from 0 to ``len(data) - 1``; it must have an 'id' column.
        columns: column names to process.
        columns_ch: node label for each column (same positions as ``columns``).
        column_relation: relationship type for each column (same positions).
        type: 'single', 'splitword' or 'skill'; selects the node builder. Any
            other value creates nothing.

    Cells equal to '' are skipped.
    """
    for row in range(data.shape[0]):
        for position, column in enumerate(columns):
            value = data.loc[row, column]
            if value == '':
                continue
            # Anchor node of this row, looked up by the row's id.
            row_id = data.loc[row, 'id']
            anchor = NodeMatcher(graph).match(name=str(row_id)).first()
            if type == 'single':
                build = single_node
            elif type == 'splitword':
                build = word_split_node
            elif type == 'skill':
                build = skill_node
            else:
                continue
            build(value, columns_ch[position], column_relation[position], anchor)

def main():
    """Load every resume field group into the graph, ID fields first."""
    field_groups = (
        (resume_singleword, resume_singleword_ch, resume_singleword_relation, 'single'),
        (resume_multiword, resume_multiword_ch, resume_multiword_relation, 'splitword'),
        (resume_keyword, resume_keyword_ch, resume_keyword_relation, 'splitword'),
        (resume_skill, resume_skill_ch, resume_skill_relation, 'skill'),
    )
    for group_columns, group_labels, group_relations, group_kind in field_groups:
        create_node(resume_inf_d, group_columns, group_labels, group_relations, group_kind)
main()

创建joblist的节点

In [None]:
joblist_inf = pd.read_excel(joblist_path, dtype={'id': str})

# Data cleaning
# Format each non-missing deadline as YYYY-MM-DD; missing values pass through.
joblist_inf['deadline'] = joblist_inf['deadline'].apply(
    lambda day: day if pd.isna(day) else day.strftime('%Y-%m-%d')
)
# Trim surrounding whitespace from every id.
# (Original author's note: in Jupyter a "\t" is not necessarily a tab
# character; it may also be a literal string.)
joblist_inf['id'] = joblist_inf['id'].apply(str.strip)
# Replace NaN with ''.
joblist_inf = joblist_inf.replace(np.nan, '')

In [None]:
# Field tables for the job-posting graph. Each row is
# (DataFrame column, node label, relationship type); the rows are unzipped
# into the three parallel lists that `create_node` consumes.

# Single-valued fields.
joblist_singleword, joblist_singleword_ch, joblist_singleword_relation = map(list, zip(
    ('id', 'ID', 'ID'),
    ('enterpriseName', '公司', '公司'),
    ('positionName', '岗位', '岗位'),
    ('willNature', '工作性质', '工作性质'),
    ('minimumWage', '最低薪资', '最低薪资'),
    ('maximumWage', '最高薪资', '最高薪资'),
    ('payMethod', '支付方式', '支付方式'),
    ('exp', '经验', '经验要求'),
    ('edu_require', '学历', '学历要求'),
    ('position_count', '人数', '人数'),
    ('workplace', '地址', '工作地点'),
    ('provinceCode', '地理编码', '省级编码'),
    ('cityCode', '地理编码', '市级编码'),
    ('regionCode', '地理编码', '区级编码'),
    ('enter_address', '地址', '具体工作地点'),
    ('fixed_province', '省', '省'),
    ('fixed_city', '市', '市'),
    ('fixed_region', '区', '区'),
    ('deadline', '截止时间', '岗位截止时间'),
    ('eA_shortName', '缩写', '公司缩写'),
    ('eA_econKind', '公司性质', '公司性质'),
    ('job_personScope', '人数', '公司人数'),
    ('job_registCapi', '资本', '公司注册资本'),
    ('job_email', '邮箱', '公司邮箱'),
    ('job_phone', '电话', '公司电话'),
))

# Fields holding several values separated by ','.
joblist_multiword, joblist_multiword_ch, joblist_multiword_relation = map(list, zip(
    ('function', '职责', '职责要求'),
    ('keyword_info', '关键词', '岗位关键词'),
    ('skill_info', '技能要求', '技能要求'),
    ('job_industry', '行业', '公司行业'),
    ('welfare', '福利', '福利待遇'),
))

# Keyword fields.
joblist_keyword, joblist_keyword_ch, joblist_keyword_relation = map(list, zip(
    ('jobRequiredments_keywords', '关键词', '工作要求'),
))

In [None]:
# Create (or reuse) the node for a single-valued field.
# NOTE: this redefines the `single_node` declared in the resume cell.
def single_node(data, column_ch, column_relation, node_id):
    """Ensure a node named ``str(data)`` carries ``column_ch`` and link it.

    The node is looked up by its ``name`` property only, whatever its labels.
    A missing node is created with label ``column_ch``; an existing node
    without that label gets the label added. Unless ``column_ch`` is 'ID', a
    ``column_relation`` relationship from ``node_id`` to the node is created.
    """
    node_name = str(data)
    target = NodeMatcher(graph).match(name=node_name).first()
    if target is None:
        target = Node(column_ch, name=node_name)
        graph.create(target)
    elif column_ch not in target.labels:
        target.add_label(column_ch)
        graph.push(target)

    # The ID node is the anchor itself, so it gets no relationship.
    if column_ch != 'ID':
        create_relation(node_id, target, column_relation)

# Create (or reuse) one node per skill name.
# NOTE: this redefines the `skill_node` declared in the resume cell.
def skill_node(data, column_ch, column_relation, node_id):
    """Link ``node_id`` to a node for every skill name in ``data``.

    Args:
        data: comma-separated string of entries; the skill name is the text
            before the first ':' of each entry.
        column_ch: label the skill nodes must carry.
        column_relation: relationship type from ``node_id`` to each skill node.
        node_id: node the relationships start from.

    Nodes are looked up by their ``name`` property only. A missing node is
    created with ``column_ch``; an existing one gets the label added if absent.
    """
    matcher = NodeMatcher(graph)
    for entry in data.split(','):
        name = entry.split(':')[0]
        # '', 'a,,b' or a trailing comma yield empty names; skip them instead
        # of creating a node called '' that unrelated rows would all link to.
        if not name:
            continue
        node_skill = matcher.match(name=name).first()
        if node_skill is None:
            node_skill = Node(column_ch, name=name)
            graph.create(node_skill)
        elif column_ch not in node_skill.labels:
            node_skill.add_label(column_ch)
            graph.push(node_skill)
        create_relation(node_id, node_skill, column_relation)

# Create (or reuse) one node per comma-separated value.
# NOTE: this redefines the `word_split_node` declared in the resume cell.
def word_split_node(data, column_ch, column_relation, node_id):
    """Link ``node_id`` to a node for every comma-separated token in ``data``.

    Args:
        data: string of values separated by ','.
        column_ch: label the value nodes must carry.
        column_relation: relationship type from ``node_id`` to each value node.
        node_id: node the relationships start from.

    Nodes are looked up by their ``name`` property only. A missing node is
    created with ``column_ch``; an existing one gets the label added if absent.
    """
    matcher = NodeMatcher(graph)
    for token in data.split(','):
        # Consecutive or trailing commas yield empty tokens; skip them instead
        # of creating a node called ''.
        if not token:
            continue
        node_keyword = matcher.match(name=token).first()
        if node_keyword is None:
            node_keyword = Node(column_ch, name=token)
            graph.create(node_keyword)
        elif column_ch not in node_keyword.labels:
            node_keyword.add_label(column_ch)
            graph.push(node_keyword)
        create_relation(node_id, node_keyword, column_relation)

# Link the ID node to another node.
# NOTE: this redefines the `create_relation` declared in the resume cell.
def create_relation(node_id, node, column_relation):
    """Create a ``column_relation`` relationship from ``node_id`` to ``node``."""
    edge = Relationship(node_id, column_relation, node)
    graph.create(edge)

def create_node(data, columns, columns_ch, column_relation, type):
    """Build nodes and relationships for the given columns of every row.

    NOTE: this redefines the `create_node` declared in the resume cell.

    Args:
        data: DataFrame addressed with ``.loc[0..n-1, column]``, so its index
            must run from 0 to ``len(data) - 1``; it must have an 'id' column.
        columns: column names to process.
        columns_ch: node label for each column (same positions as ``columns``).
        column_relation: relationship type for each column (same positions).
        type: 'single', 'splitword' or 'skill'; selects the node builder. Any
            other value creates nothing.

    Cells equal to '' are skipped.
    """
    for row in range(data.shape[0]):
        for position, column in enumerate(columns):
            cell = data.loc[row, column]
            if cell == '':
                continue
            # Anchor node of this row, looked up by the row's id.
            row_id = data.loc[row, 'id']
            anchor = NodeMatcher(graph).match(name=str(row_id)).first()
            if type == 'single':
                build = single_node
            elif type == 'splitword':
                build = word_split_node
            elif type == 'skill':
                build = skill_node
            else:
                continue
            build(cell, columns_ch[position], column_relation[position], anchor)

def main():
    """Load every job-posting field group into the graph, ID fields first.

    NOTE: this redefines the `main` declared in the resume cell.
    """
    field_groups = (
        (joblist_singleword, joblist_singleword_ch, joblist_singleword_relation, 'single'),
        (joblist_multiword, joblist_multiword_ch, joblist_multiword_relation, 'splitword'),
        (joblist_keyword, joblist_keyword_ch, joblist_keyword_relation, 'splitword'),
    )
    for group_columns, group_labels, group_relations, group_kind in field_groups:
        create_node(joblist_inf, group_columns, group_labels, group_relations, group_kind)
main()