-
Notifications
You must be signed in to change notification settings - Fork 1
/
get_position_description.py
50 lines (39 loc) · 1.37 KB
/
get_position_description.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import requests
from common import headers, random_time_sleep
from bs4 import BeautifulSoup
from read_position_info import get_company_ids
import os
def get_job_description(company_id):
    """Fetch the Lagou job-posting page for *company_id* and return its
    description paragraphs.

    Returns a list of paragraph strings on success, or None when the
    request fails (non-200 status) or the page lacks the expected
    markup (e.g. Lagou served an anti-crawler placeholder page).
    """
    job_url = 'https://www.lagou.com/jobs/%s.html' % str(company_id)
    print('crawl:', job_url)
    response = requests.get(job_url, headers=headers, timeout=10)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html5lib')
        # The original indexed find_all(...)[0] directly, which raises
        # IndexError and kills the whole crawl whenever the expected
        # node is missing (layout change or anti-crawl page). Guard it.
        container = soup.find('dd', class_='job_bt')
        if container is None:
            print('unexpected page layout, no job description found:', job_url)
            return None
        # The description paragraphs normally sit inside a nested <div>;
        # fall back to the container itself if that wrapper is absent.
        wrapper = container.find('div')
        paragraphs = (wrapper if wrapper is not None else container).find_all('p')
        return [item.get_text() for item in paragraphs]
    elif response.status_code == 403:
        print('request is forbidden by the server...')
    else:
        print(response.status_code)
def get_all_jobs_description(xlsx_file):
    """Collect the description paragraphs of every position id listed in
    *xlsx_file* and return them as one flat list of strings.

    Sleeps a random 3-7 seconds between requests to avoid being
    rate-limited by the server.
    """
    descriptions = []
    for position_id in get_company_ids(xlsx_file):
        paragraphs = get_job_description(position_id)
        if paragraphs:
            descriptions += paragraphs
        random_time_sleep(3, 7)
    return descriptions
def test_get_all_jobs_description():
    """Smoke-test the crawler against the python position workbook
    located next to this script, printing everything it collects."""
    job_type = 'python'
    xlsx_file = 'xlsx_file/%s_position_info.xlsx' % job_type
    # Resolve the workbook relative to this file so the script works
    # regardless of the current working directory.
    base_dir = os.path.dirname(__file__)
    result = get_all_jobs_description(os.path.join(base_dir, xlsx_file))
    print(result)
# Run the smoke test when this module is executed as a script.
if __name__ == '__main__':
    test_get_all_jobs_description()