-
Notifications
You must be signed in to change notification settings - Fork 1
/
generate_wordcloud.py
139 lines (105 loc) · 4.38 KB
/
generate_wordcloud.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from get_position_description import get_all_jobs_description
from common import check_contain_chinese, add_words, set_show_Chinese, get_job_type_from_position_info_xlsx
import jieba
from openpyxl import load_workbook
from read_position_info import get_salary_list, get_district, get_business_zones, get_industry_field
# Module-level setup, executed on import: call the project helper imported
# from `common` (presumably configures matplotlib so Chinese titles render;
# confirm in common.set_show_Chinese).
set_show_Chinese()
# Font file handed to every WordCloud in this module via `font_path`.
font = './font/SourceHanSerifSC-Regular.otf'
def filter_not_it_job(x):
    """Token filter intended for non-IT job descriptions.

    A token is kept only when it is printable, is not pure whitespace,
    is not a number and is longer than one character.

    :param x: a single token (str)
    :return: True if the token should be kept, False otherwise
    """
    # Reject whitespace-only tokens and tokens with non-printable characters.
    if x.isspace() or not x.isprintable():
        return False
    # Reject anything that any of the str number predicates recognises.
    if x.isnumeric() or x.isdecimal() or x.isdigit():
        return False
    # Single characters (and the empty string) carry no useful signal.
    return len(x) > 1
def filter_it_job(x):
    """Token filter intended for IT job descriptions.

    Tokens listed in ``add_words`` are always kept. Any other token is kept
    only when it is printable, is not pure whitespace, is not a number, is
    longer than one character and ``check_contain_chinese`` reports False
    for it.

    :param x: a single token (str)
    :return: True if the token should be kept, False otherwise
    """
    # Custom vocabulary bypasses every other check.
    if x in add_words:
        return True
    if x.isspace() or not x.isprintable():
        return False
    if x.isnumeric() or x.isdecimal() or x.isdigit():
        return False
    if len(x) <= 1:
        return False
    # Only reached for tokens that passed all the generic checks above.
    return not check_contain_chinese(x)
def simple_filter(x):
    """Generic token filter.

    Keeps tokens that are printable, not pure whitespace, not numeric and
    longer than one character.

    :param x: a single token (str)
    :return: True if the token should be kept, False otherwise
    """
    is_visible_text = x.isprintable() and not x.isspace()
    is_number = x.isnumeric() or x.isdecimal() or x.isdigit()
    return is_visible_text and not is_number and len(x) > 1
def is_it_job(job_type):
    """Tell whether a job type counts as an IT job.

    Job types containing Chinese are treated as non-IT jobs (e.g. product,
    operations); IT jobs are the ones named like Python, C++, Java, etc.
    HR-style jobs are not considered for now.

    :param job_type: job type name
    :return: True when ``check_contain_chinese(job_type)`` is falsy
    """
    return not check_contain_chinese(job_type)
def filter_fun(job_type):
    """Return the token-filter function to use for a job type.

    IT and non-IT postings tokenize differently: IT descriptions are
    dominated by English terms such as Linux, Git or NoSQL, so the two kinds
    of posting were meant to use different filters (``filter_it_job`` vs
    ``filter_not_it_job``, chosen via ``is_it_job``).

    That per-type selection is deliberately disabled for now: every job type
    gets ``filter_it_job``.

    :param job_type: job type name (currently not consulted)
    :return: the ``filter_it_job`` function
    """
    # For now always hand back filter_it_job, regardless of job_type.
    return filter_it_job
def generate_skill_wordcloud(xlsx_file):
    """Build, display and save the skill (tech stack) word cloud.

    Reads every job description from ``xlsx_file``, segments the combined
    text with jieba, filters the tokens with the filter chosen by
    ``filter_fun`` and renders the survivors as a word cloud. The figure is
    shown with matplotlib and then written to
    ``./img_wordcloud/<job_type>_skill_wordcloud.png``.

    :param xlsx_file: path to the position-info workbook
    :return: None
    """
    job_type = get_job_type_from_position_info_xlsx(xlsx_file)
    descriptions = get_all_jobs_description(xlsx_file)
    text = ''.join(descriptions)

    # Register the custom vocabulary with jieba before cutting. A plain loop
    # is used because this is done purely for its side effect.
    for word in add_words:
        jieba.add_word(word)

    tokens = jieba.cut(text, cut_all=False, HMM=True)
    keep = filter_fun(job_type)
    # str.title() folds case variants of a term into one spelling
    # (note it also turns e.g. "NoSQL" into "Nosql").
    text = ' '.join(token.title() for token in tokens if keep(token))

    wc = WordCloud(
        collocations=False,
        font_path=font,
        width=1400,
        height=1400,
        margin=2,
        max_words=1000,
    ).generate(text)

    plt.imshow(wc)
    plt.title('%s 技术栈词云图' % job_type, fontsize=20)
    plt.axis("off")
    plt.show()
    wc.to_file('./img_wordcloud/%s_skill_wordcloud.png' % job_type)
def generate_salary_worlcloud(xlsx_file):
    """Build, display and save the salary word cloud.

    Joins the salary strings read from ``xlsx_file`` with spaces and renders
    them as a word cloud, using a custom token pattern so that a range such
    as ``15k-25k`` is counted as one token. The figure is shown with
    matplotlib and then written to
    ``./img_wordcloud/<job_type>_salary_wordcloud.png``.

    The misspelled function name is kept so existing callers keep working.

    :param xlsx_file: path to the position-info workbook
    :return: None
    """
    job_type = get_job_type_from_position_info_xlsx(xlsx_file)
    salary_list = get_salary_list(xlsx_file)
    text = ' '.join(salary_list)

    # The range alternative must come first and must not require a leading
    # space: otherwise the very first range in the text (which has no space
    # before it) is split into two plain words by the generic alternative,
    # and every later range is captured with a stray leading space.
    regexp = r"(\d+k-\d+k|\w[\w']+)"

    wc = WordCloud(
        collocations=False,
        font_path=font,
        width=1400,
        height=1400,
        margin=2,
        max_words=1000,
        regexp=regexp,
    ).generate(text)

    plt.imshow(wc)
    plt.title('%s 工资词云图' % job_type, fontsize=20)
    plt.axis("off")
    plt.show()
    wc.to_file('./img_wordcloud/%s_salary_wordcloud.png' % job_type)
def generate_district_wordcloud(xlsx_file):
    """Build, display and save the company-location word cloud.

    Combines the districts and business zones read from ``xlsx_file`` into
    one space-separated text and renders it as a word cloud. The figure is
    shown with matplotlib and then written to
    ``./img_wordcloud/<job_type>_district_wordcloud.png``.

    :param xlsx_file: path to the position-info workbook
    :return: None
    """
    job_type = get_job_type_from_position_info_xlsx(xlsx_file)

    # Business zones are appended to the district list in place.
    places = get_district(xlsx_file)
    places.extend(get_business_zones(xlsx_file))
    text = ' '.join(places)

    cloud = WordCloud(
        collocations=False,
        font_path=font,
        width=1400,
        height=1400,
        margin=2,
        max_words=200,
    )
    wc = cloud.generate(text)

    title = '%s 公司区域分布词云图' % job_type
    plt.imshow(wc)
    plt.title(title, fontsize=20)
    plt.axis("off")
    plt.show()

    out_path = './img_wordcloud/%s_district_wordcloud.png' % job_type
    wc.to_file(out_path)
def generate_industry_field_wordcloud(xlsx_file):
    """Build, display and save the industry-field word cloud.

    Joins the industry-field entries read from ``xlsx_file`` with spaces and
    renders them as a word cloud. The figure is shown with matplotlib and
    then written to
    ``./img_wordcloud/<job_type>_industry_field_wordcloud.png``.

    :param xlsx_file: path to the position-info workbook
    :return: None
    """
    job_type = get_job_type_from_position_info_xlsx(xlsx_file)
    text = ' '.join(get_industry_field(xlsx_file))

    cloud = WordCloud(
        collocations=False,
        font_path=font,
        width=1400,
        height=1400,
        margin=2,
        max_words=200,
    )
    wc = cloud.generate(text)

    title = '%s 行业词云图' % job_type
    plt.imshow(wc)
    plt.title(title, fontsize=20)
    plt.axis("off")
    plt.show()

    out_path = './img_wordcloud/%s_industry_field_wordcloud.png' % job_type
    wc.to_file(out_path)
if __name__ == '__main__':
    # Script entry point: pick a job type and render charts from its workbook.
    job = 'c++'
    print(job)
    xlsx = './xlsx_file/%s_position_info.xlsx' % job

    # Uncomment whichever charts are wanted for this run.
    # generate_skill_wordcloud(xlsx)
    # generate_salary_worlcloud(xlsx)
    # generate_district_wordcloud(xlsx)
    generate_industry_field_wordcloud(xlsx)

    print('end!!!', job)