In [1]:
import ast

import pandas as pd
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, BertTokenizer, BertModel
import torch, re, json, jieba
from sklearn.cluster import KMeans, DBSCAN
import numpy as np
import matplotlib.pyplot as plt
from pyecharts.charts import Scatter
from pyecharts.charts import Bar
from pyecharts.charts import Geo
from pyecharts.charts import WordCloud
from pyecharts.globals import SymbolType
from pyecharts.globals import ChartType
from pyecharts.charts import Pie
from pyecharts import options as opts
from pyecharts.faker import Faker
from sklearn.manifold import TSNE
from sklearn import preprocessing
import matplotlib as mpl
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA

from analysis_utils import get_position_types

In [2]:
# Matplotlib text setup: use the SimHei font for sans-serif text and keep the
# minus sign rendering as a plain ASCII hyphen.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

In [3]:
# Load the sentence-embedding model, on the GPU when one is available.
MODEL_PATH = '../models/model_zoo/medium'
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer(MODEL_PATH, device=device)

# Sanity check: embed three related sentences and display their pairwise
# cosine-similarity matrix. `test_sentences` ends up holding the embeddings.
test_sentences = model.encode([
    '我想要去加入一个经常旅游的团队，因为我喜欢旅游。',
    '我们团队平时会偶尔去旅游的。',
    '经常去旅游的团队我不喜欢去。',
])
util.cos_sim(test_sentences, test_sentences)

No sentence-transformers model found with name ../models/model_zoo/medium. Creating a new one with MEAN pooling.
Some weights of BertModel were not initialized from the model checkpoint at ../models/model_zoo/medium and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor([[1.0000, 0.6602, 0.3318],
        [0.6602, 1.0000, 0.3214],
        [0.3318, 0.3214, 1.0000]])

In [4]:
# Job-seeker ("hunter") records; the CSV is read with the GBK encoding.
hunter_csv_file = '../datasets/result1-2.csv'
hunter_data = pd.read_csv(hunter_csv_file, encoding='GBK')
print(f'hunter data shape {hunter_data.shape}')

hunter data shape (8690, 22)


In [5]:
# Show the second record as a Series to inspect the available columns.
hunter_data.iloc[1]

序号                                                        2
求职者 ID                                  1579833809761861632
姓名                                                      李先生
性别                                                        男
年龄                                                     23.0
出生地                                                  广东省中山市
社会属性                                                     群众
工作经验                                                      无
期望岗位                                   ['数据分析师', '数据挖掘工程师']
期望薪资                                           [4000, 8000]
期望类型                                                     实习
期望城市                                              广东省深圳市南山区
期望行业                                                 ['不限']
预计到岗                                                   时间待议
工作经历                                 ['广东泰迪智能科技股份有限公司[助教]']
项目经历                                    ['银行客户忠诚度分析[数据分析]']
竞赛经历          ['泰迪杯[一等奖]', '其他[铜牌]', '其他

## 求职信息

### 预期岗位

In [6]:
# The '期望岗位' column stores each list of desired positions as a string.
# Parse it with ast.literal_eval instead of eval: the text comes from a CSV
# file and must only ever be treated as a literal, never executed as code.
position_data_items_ = hunter_data['期望岗位'].unique().tolist()
position_data_items = [
    position_name
    for data_item in position_data_items_
    for position_name in ast.literal_eval(data_item)
]
position_data_dict = get_position_types(position_data_items)

# value_counts() counts identical *lists*; spread each list's row count over
# the individual position names it contains.
position_counter = hunter_data['期望岗位'].value_counts()
position_counter_index = position_counter.index.to_list()
position_counter_count_dict = {}
for position_index in position_counter_index:
    row_count = int(position_counter[position_index])
    for position_name in ast.literal_eval(position_index):
        position_counter_count_dict[position_name] = (
            position_counter_count_dict.get(position_name, 0) + row_count
        )

print(len(position_data_dict), len(position_counter_count_dict))

9 9


In [7]:
# Pie chart of how many records list each desired position.
social_pie = Pie(init_opts=opts.InitOpts(width="800px", height="600px"))
social_pie.add(
    "",
    [[position_name, count] for position_name, count in position_counter_count_dict.items()],
)
social_pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
social_pie.set_global_opts(
    toolbox_opts=opts.ToolboxOpts(is_show=True),
    title_opts={"text": ""},
)
social_pie.render_notebook()

### 薪资需求

In [8]:
# Average expected salary per desired position.
# Each row contributes the mean of its salary list to every position it lists;
# the per-position figure is the mean of those contributions, rounded to 2 dp.
social_counter = hunter_data.loc[:, ['期望岗位', '期望薪资']]
wage_samples_by_position = {}
for exp_type, wage_count in zip(social_counter['期望岗位'], social_counter['期望薪资']):
    # ast.literal_eval parses the stringified lists without executing CSV text.
    position_names = ast.literal_eval(exp_type)
    if not position_names:
        continue
    wage_midpoint = np.mean(ast.literal_eval(wage_count))
    for position_name in position_names:
        wage_samples_by_position.setdefault(position_name, []).append(wage_midpoint)

social_counter_dict = {
    position_name: np.round(np.mean(samples), 2)
    for position_name, samples in wage_samples_by_position.items()
}
social_counter_dict

{'数据分析师': 5027.21,
 '数据挖掘工程师': 5021.19,
 '机器学习工程师': 7705.88,
 '其他': 6469.7,
 '自然语言处理工程师': 6454.55,
 '算法工程师': 14428.57,
 'Hadoop大数据开发工程师': 7132.65,
 '图像处理工程师': 8285.71,
 '计算机视觉工程师': 6750.0}

In [9]:
# position_data_dict, position_counter_count_dict
wage_bar = (
    Bar(init_opts=opts.InitOpts(width="800px", height="600px"))
    .add_xaxis(list(social_counter_dict.keys()))
    .add_yaxis('全部', list(social_counter_dict.values()))
    .set_global_opts(
        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=20)),
        toolbox_opts=opts.ToolboxOpts(is_show=True),
        title_opts={"text": "平均薪资"})
)
wage_bar.render_notebook()

### 社会属性

In [10]:
# Count records per '社会属性' value; non-string cells (e.g. missing) are ignored.
social_counter = hunter_data.loc[:, ['社会属性']]
social_counter_dict = {}
for attribute in social_counter['社会属性']:
    if isinstance(attribute, str):
        social_counter_dict[attribute] = social_counter_dict.get(attribute, 0) + 1

social_counter_dict

{'共青团员': 110, '群众': 15, '党员': 18, '预备党员': 13}

In [11]:
# Pie chart of the '社会属性' tally.
social_pie = Pie(init_opts=opts.InitOpts(width="800px", height="600px"))
social_pie.add("", [[attribute, count] for attribute, count in social_counter_dict.items()])
social_pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
social_pie.set_global_opts(
    toolbox_opts=opts.ToolboxOpts(is_show=True),
    title_opts={"text": ""},
)
social_pie.render_notebook()

### 工作经验

In [12]:
# Count records per '工作经验' value, skipping cells that are not strings.
work_exp_counter = hunter_data.loc[:, ['工作经验']]
work_exp_counter_dict = {}
for experience in work_exp_counter['工作经验']:
    if not isinstance(experience, str):
        continue
    work_exp_counter_dict.setdefault(experience, 0)
    work_exp_counter_dict[experience] += 1

work_exp_counter_dict

{'无': 133, '1年': 15, '4年': 4, '3年': 3, '10年以上': 1}

In [13]:
# Pie chart of the '工作经验' tally.
work_exp_pie = Pie(init_opts=opts.InitOpts(width="800px", height="600px"))
work_exp_pie.add("", [[experience, count] for experience, count in work_exp_counter_dict.items()])
work_exp_pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
work_exp_pie.set_global_opts(
    toolbox_opts=opts.ToolboxOpts(is_show=True),
    title_opts={"text": ""},
)
work_exp_pie.render_notebook()

### 期望类型

In [14]:
# Count records per '期望类型' value; only string cells are tallied.
exp_counter = hunter_data.loc[:, ['期望类型']]
exp_counter_dict = {}
for job_kind in exp_counter['期望类型']:
    if isinstance(job_kind, str):
        exp_counter_dict[job_kind] = exp_counter_dict.get(job_kind, 0) + 1

exp_counter_dict

{'实习': 9, '全职': 138, '无': 9}

In [15]:
# Pie chart of whatever tally exp_counter_dict currently holds
# (here: the '期望类型' counts from the preceding cell).
exp_pie = Pie(init_opts=opts.InitOpts(width="800px", height="600px"))
exp_pie.add("", [[label, count] for label, count in exp_counter_dict.items()])
exp_pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
exp_pie.set_global_opts(
    toolbox_opts=opts.ToolboxOpts(is_show=True),
    title_opts={"text": ""},
)
exp_pie.render_notebook()

### 期望行业

In [16]:
# Count how many records list each desired industry.
# '期望行业' holds a stringified list per row, so one record can add to
# several industries.
exp_counter = hunter_data.loc[:, ['期望行业']]
exp_counter_dict = {}
for raw_industries in exp_counter['期望行业']:
    # ast.literal_eval (not eval): CSV text is parsed as a literal, never run.
    for exp_type in ast.literal_eval(raw_industries):
        exp_counter_dict[exp_type] = exp_counter_dict.get(exp_type, 0) + 1

exp_counter_dict

{'不限': 75,
 '互联网': 8603,
 '数据服务': 18,
 '电子商务': 8,
 '游戏': 10,
 '计算机软件': 7,
 'O2O': 2,
 '媒体': 5,
 '金融': 15,
 '在线教育': 3,
 '信息安全': 4,
 '医疗健康': 1,
 '通信设备': 1,
 '人力资源服务': 1}

In [17]:
# Pie chart of whatever tally exp_counter_dict currently holds
# (here: the '期望行业' counts from the preceding cell).
exp_pie = Pie(init_opts=opts.InitOpts(width="800px", height="600px"))
exp_pie.add("", [[label, count] for label, count in exp_counter_dict.items()])
exp_pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
exp_pie.set_global_opts(
    toolbox_opts=opts.ToolboxOpts(is_show=True),
    title_opts={"text": ""},
)
exp_pie.render_notebook()

### 知识储备

In [18]:
# Count occurrences of each skill name across all records.
exp_counter = hunter_data.loc[:, ['技能类型']]
exp_counter_dict = {}
for raw_skills in exp_counter['技能类型']:
    # ast.literal_eval (not eval): CSV text is parsed as a literal, never run.
    for exp_type in ast.literal_eval(raw_skills):
        # Strip the bracketed annotation and lowercase the name so that
        # differently-cased spellings are merged into one key.
        exp_type = re.sub(r'\[.*\]', '', exp_type).lower()
        exp_counter_dict[exp_type] = exp_counter_dict.get(exp_type, 0) + 1

len(exp_counter_dict)

95

In [19]:
# Pie chart of whatever tally exp_counter_dict currently holds
# (here: the '技能类型' counts). The legend is hidden.
exp_pie = Pie(init_opts=opts.InitOpts(width="1000px", height="600px"))
exp_pie.add("", [[label, count] for label, count in exp_counter_dict.items()])
exp_pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
exp_pie.set_global_opts(
    legend_opts=opts.LegendOpts(is_show=False),
    toolbox_opts=opts.ToolboxOpts(is_show=True),
    title_opts={"text": ""},
)
exp_pie.render_notebook()

In [20]:
# Count occurrences of each language name across all records.
exp_counter = hunter_data.loc[:, ['语言类型']]
exp_counter_dict = {}
for raw_languages in exp_counter['语言类型']:
    # ast.literal_eval (not eval): CSV text is parsed as a literal, never run.
    for exp_type in ast.literal_eval(raw_languages):
        # Strip the bracketed annotation and lowercase the name so that
        # differently-cased spellings are merged into one key.
        exp_type = re.sub(r'\[.*\]', '', exp_type).lower()
        exp_counter_dict[exp_type] = exp_counter_dict.get(exp_type, 0) + 1

len(exp_counter_dict)

32

In [21]:
# Pie chart of whatever tally exp_counter_dict currently holds
# (here: the '语言类型' counts). The legend is hidden.
exp_pie = Pie(init_opts=opts.InitOpts(width="1000px", height="600px"))
exp_pie.add("", [[label, count] for label, count in exp_counter_dict.items()])
exp_pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
exp_pie.set_global_opts(
    legend_opts=opts.LegendOpts(is_show=False),
    toolbox_opts=opts.ToolboxOpts(is_show=True),
    title_opts={"text": ""},
)
exp_pie.render_notebook()

In [22]:
# Count occurrences of each certificate name across all records.
exp_counter = hunter_data.loc[:, ['证书类型']]
exp_counter_dict = {}
for raw_certificates in exp_counter['证书类型']:
    # ast.literal_eval (not eval): CSV text is parsed as a literal, never run.
    for exp_type in ast.literal_eval(raw_certificates):
        # Strip the bracketed annotation and lowercase the name so that
        # differently-cased spellings are merged into one key.
        exp_type = re.sub(r'\[.*\]', '', exp_type).lower()
        exp_counter_dict[exp_type] = exp_counter_dict.get(exp_type, 0) + 1

len(exp_counter_dict)

81

In [23]:
# Pie chart of whatever tally exp_counter_dict currently holds
# (here: the '证书类型' counts). The legend is hidden.
exp_pie = Pie(init_opts=opts.InitOpts(width="1000px", height="600px"))
exp_pie.add("", [[label, count] for label, count in exp_counter_dict.items()])
exp_pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
exp_pie.set_global_opts(
    legend_opts=opts.LegendOpts(is_show=False),
    toolbox_opts=opts.ToolboxOpts(is_show=True),
    title_opts={"text": ""},
)
exp_pie.render_notebook()

### 工作经历

In [24]:
# Count the bracketed tag of every work-experience entry,
# e.g. '广东泰迪智能科技股份有限公司[助教]' is tallied under '助教'.
exp_counter = hunter_data.loc[:, ['工作经历']]
exp_counter_dict = {}
for raw_entries in exp_counter['工作经历']:
    # ast.literal_eval (not eval): CSV text is parsed as a literal, never run.
    for entry in ast.literal_eval(raw_entries):
        tag_match = re.search(r'\[(.*?)\]', entry)
        if tag_match is None:
            # Entry has no '[...]' tag; skip it rather than raise IndexError.
            continue
        exp_type = tag_match.group(1)
        exp_counter_dict[exp_type] = exp_counter_dict.get(exp_type, 0) + 1

len(exp_counter_dict)

32

In [25]:
# Pie chart of whatever tally exp_counter_dict currently holds
# (here: the '工作经历' tag counts). The legend is hidden.
exp_pie = Pie(init_opts=opts.InitOpts(width="800px", height="600px"))
exp_pie.add("", [[label, count] for label, count in exp_counter_dict.items()])
exp_pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
exp_pie.set_global_opts(
    legend_opts=opts.LegendOpts(is_show=False),
    toolbox_opts=opts.ToolboxOpts(is_show=True),
    title_opts={"text": ""},
)
exp_pie.render_notebook()

### 项目经历

In [26]:
# Count the bracketed tag of every project-experience entry,
# e.g. '银行客户忠诚度分析[数据分析]' is tallied under '数据分析'.
exp_counter = hunter_data.loc[:, ['项目经历']]
exp_counter_dict = {}
for raw_entries in exp_counter['项目经历']:
    # ast.literal_eval (not eval): CSV text is parsed as a literal, never run.
    for entry in ast.literal_eval(raw_entries):
        tag_match = re.search(r'\[(.*?)\]', entry)
        if tag_match is None:
            # Entry has no '[...]' tag; skip it rather than raise IndexError.
            continue
        exp_type = tag_match.group(1)
        exp_counter_dict[exp_type] = exp_counter_dict.get(exp_type, 0) + 1

len(exp_counter_dict)

59

In [27]:
# Pie chart of whatever tally exp_counter_dict currently holds
# (here: the '项目经历' tag counts). The legend is hidden.
exp_pie = Pie(init_opts=opts.InitOpts(width="1200px", height="600px"))
exp_pie.add("", [[label, count] for label, count in exp_counter_dict.items()])
exp_pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
exp_pie.set_global_opts(
    legend_opts=opts.LegendOpts(is_show=False),
    toolbox_opts=opts.ToolboxOpts(is_show=True),
    title_opts={"text": ""},
)
exp_pie.render_notebook()

### 培训经历

In [28]:
# Count the bracketed tag of every training-experience entry.
exp_counter = hunter_data.loc[:, ['培训经历']]
exp_counter_dict = {}
for raw_entries in exp_counter['培训经历']:
    # ast.literal_eval (not eval): CSV text is parsed as a literal, never run.
    for entry in ast.literal_eval(raw_entries):
        tag_match = re.search(r'\[(.*?)\]', entry)
        if tag_match is None:
            # Entry has no '[...]' tag; skip it rather than raise IndexError.
            continue
        exp_type = tag_match.group(1)
        exp_counter_dict[exp_type] = exp_counter_dict.get(exp_type, 0) + 1

len(exp_counter_dict)

33

In [29]:
# Pie chart of whatever tally exp_counter_dict currently holds
# (here: the '培训经历' tag counts). The legend is hidden.
exp_pie = Pie(init_opts=opts.InitOpts(width="800px", height="600px"))
exp_pie.add("", [[label, count] for label, count in exp_counter_dict.items()])
exp_pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
exp_pie.set_global_opts(
    legend_opts=opts.LegendOpts(is_show=False),
    toolbox_opts=opts.ToolboxOpts(is_show=True),
    title_opts={"text": ""},
)
exp_pie.render_notebook()

### 学历

In [30]:
# Classify each record by one degree label taken from its education entries
# and count records per label. Records with no usable entry are skipped.
exp_counter = hunter_data.loc[:, ['教育经历']]
exp_counter_dict = {}
# Checked in this order; the first label present in a record wins.
# NOTE(review): labels above '硕士' (e.g. a doctorate) are not ranked here —
# confirm whether the dataset contains any and extend the tuple if so.
degree_priority = ('硕士', '本科', '大专')
for raw_entries in exp_counter['教育经历']:
    # ast.literal_eval (not eval): CSV text is parsed as a literal, never run.
    edu_types = []
    for entry in ast.literal_eval(raw_entries):
        tag_match = re.search(r'\[(.*?)\]', entry)
        if tag_match is not None:  # entries without a '[...]' tag are ignored
            edu_types.append(tag_match.group(1))
    if not edu_types:
        continue
    # Fall back to the last listed label when none of the ranked degrees is
    # present. The original got this result implicitly from the leftover
    # inner-loop variable; it is now spelled out.
    exp_type = next(
        (degree for degree in degree_priority if degree in edu_types),
        edu_types[-1],
    )
    exp_counter_dict[exp_type] = exp_counter_dict.get(exp_type, 0) + 1

exp_counter_dict

{'本科': 91, '硕士': 3, '大专': 2}

In [31]:
# Pie chart of whatever tally exp_counter_dict currently holds
# (here: the degree counts derived from '教育经历'). The legend is hidden.
exp_pie = Pie(init_opts=opts.InitOpts(width="1000px", height="600px"))
exp_pie.add("", [[label, count] for label, count in exp_counter_dict.items()])
exp_pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
exp_pie.set_global_opts(
    legend_opts=opts.LegendOpts(is_show=False),
    toolbox_opts=opts.ToolboxOpts(is_show=True),
    title_opts={"text": ""},
)
exp_pie.render_notebook()

### 期望行业

In [32]:
# Count how many records list each desired industry ('期望行业' holds a
# stringified list per row).
pos_type_counter = hunter_data.loc[:, '期望行业'].values

pos_type_counter_dict = {}
for pos_types in pos_type_counter:
    # ast.literal_eval (not eval): CSV text is parsed as a literal, never run.
    for pos_type in ast.literal_eval(pos_types):
        pos_type_counter_dict[pos_type] = pos_type_counter_dict.get(pos_type, 0) + 1

len(pos_type_counter_dict)

14

In [33]:
# Pie chart of the desired-industry tally in pos_type_counter_dict.
# The legend is hidden.
exp_pie = Pie(init_opts=opts.InitOpts(width="800px", height="600px"))
exp_pie.add("", [[industry, count] for industry, count in pos_type_counter_dict.items()])
exp_pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
exp_pie.set_global_opts(
    legend_opts=opts.LegendOpts(is_show=False),
    toolbox_opts=opts.ToolboxOpts(is_show=True),
    title_opts={"text": ""},
)
exp_pie.render_notebook()

### 期望城市

In [39]:
# Count records per top-level region of the desired city.
# The region is the shortest prefix of the address ending in 省, 区 or 市,
# e.g. '广东省深圳市南山区' is tallied under '广东省'.
addr_counter = hunter_data.loc[:, '期望城市']
addr_counter_dict = {}
for position_addr in addr_counter:
    if not isinstance(position_addr, str):
        continue  # missing value
    region_match = re.search(r'(.*?(省|区|市))', position_addr)
    if region_match is None:
        # Address contains none of 省/区/市; skip it rather than raise IndexError.
        continue
    position_addr = region_match.group(1)
    # The Guangxi autonomous region is renamed to '广西省' before counting.
    if '广西壮族自治区' in position_addr:
        position_addr = '广西省'
    addr_counter_dict[position_addr] = addr_counter_dict.get(position_addr, 0) + 1

addr_counter_dict

{'广东省': 138,
 '重庆市': 2,
 '湖北省': 2,
 '广西省': 1,
 '北京市': 5,
 '天津市': 2,
 '四川省': 1,
 '湖南省': 1,
 '辽宁省': 1,
 '河北省': 1,
 '陕西省': 1,
 '山西省': 1}

In [40]:
# Map of China showing how many job seekers want to work in each region.
# The title previously read "公司地点" (company location), which mislabels
# this chart: the data is the job seekers' desired cities ('期望城市').
c = (
    Geo(init_opts=opts.InitOpts(width="800px", height="600px"))
    .add_schema(maptype="china")
    .add(
        "城市",
        [[region, count] for region, count in addr_counter_dict.items()],
    )
    .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    .set_global_opts(
        visualmap_opts=opts.VisualMapOpts(),
        toolbox_opts=opts.ToolboxOpts(is_show=True),
        title_opts=opts.TitleOpts(title="期望城市"),
    )
)
c.render_notebook()