-
Notifications
You must be signed in to change notification settings - Fork 0
/
gushiwenwang_all.py
75 lines (60 loc) · 1.94 KB
/
gushiwenwang_all.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import sys
import time
from urllib.parse import urljoin

import requests
from lxml import etree

from utils import header
# Site origin used by parse() to build the absolute URL of the next listing page.
base_url = "https://so.gushiwen.cn"
def parse_head_url(html):
    """Crawl the links listed in the ``son1`` blocks of a fetched page.

    Every ``div.son1`` located directly under a ``div.right`` is inspected;
    the hrefs of its direct ``<a>`` children at positions 1-4 (the first
    link is skipped) are handed to ``get`` as one batch.

    NOTE(review): this file had lost its indentation; the ``get`` call is
    assumed to belong inside the loop because the href list is only bound
    there -- confirm against the original.

    :param html: markup of the page to scan.
    """
    tree = etree.HTML(html)
    for block in tree.xpath('//div[@class="right"]/div[@class="son1"]'):
        hrefs = block.xpath('./child::a/@href')
        get(hrefs[1:5])
def get_leader(url, timeout=10):
    """Download the entry page and pass its markup to ``parse_head_url``.

    :param url: address of the page to download.
    :param timeout: seconds ``requests`` waits for the server before giving
        up. Without a timeout a stalled connection blocks the crawl
        indefinitely.
    :raises Exception: if the response status code is not 200.
    :raises requests.exceptions.RequestException: on connection errors or
        when the timeout expires.
    """
    resp = requests.get(
        url,
        # User-Agent value is supplied by the project's header helper.
        headers={'User-Agent': header.get_ua()},
        timeout=timeout,
    )
    if resp.status_code != 200:
        raise Exception('请求失败!')
    parse_head_url(resp.text)
def parse(html):
    """Print every entry on a listing page, then follow its "next" link.

    For each ``div.sons`` directly under ``div.left`` the function prints
    the entry's name, author, content and tag as extracted by the XPath
    expressions below. It then looks for an ``a.amore`` link, waits half a
    second, and fetches that page through ``get`` (which calls back into
    this function, so pagination recurses one level per page).

    NOTE(review): this file had lost its indentation; the pagination code is
    assumed to run once per page, after the per-entry loop -- confirm
    against the original.

    :param html: markup of the listing page.
    """
    root = etree.HTML(html)
    for div in root.xpath('//div[@class="left"]/div[@class="sons"]'):
        name = div.xpath('.//p[1]//text()')
        author = ' '.join(div.xpath('.//p[2]//text()'))
        content = div.xpath('.//div[@class="contson"]/text()')
        tag = ','.join(div.xpath('./div[last()]/a/text()'))
        print(name, author, content, tag)

    more_links = root.xpath('//a[@class="amore"]/@href')
    if not more_links:
        # No "next" link on this page: stop cleanly instead of raising
        # IndexError from an unconditional [0].
        return

    # urljoin copes with absolute hrefs and hrefs lacking a leading slash,
    # which plain string concatenation would turn into a broken URL.
    next_url = [urljoin(base_url, more_links[0])]
    print(next_url)
    time.sleep(0.5)  # pause between successive page requests
    try:
        get(next_url)
    except Exception as exc:
        # Best-effort crawl: a failure on a later page (bad status, network
        # error, RecursionError on very long listings) ends this chain, but
        # is reported rather than silently discarded.
        print(f'stopped paging at {next_url[0]}: {exc!r}', file=sys.stderr)
def get(urls, timeout=10):
    """Download each URL in order and pass the page markup to ``parse``.

    :param urls: iterable of page addresses to fetch.
    :param timeout: seconds ``requests`` waits for the server before giving
        up. Without a timeout a stalled connection blocks the crawl
        indefinitely.
    :raises Exception: on the first response whose status code is not 200;
        the remaining URLs are not fetched.
    :raises requests.exceptions.RequestException: on connection errors or
        when the timeout expires.
    """
    for url in urls:
        resp = requests.get(
            url,
            # User-Agent value is supplied by the project's header helper.
            headers={'User-Agent': header.get_ua()},
            timeout=timeout,
        )
        if resp.status_code != 200:
            raise Exception('请求失败')
        parse(resp.text)
if __name__ == "__main__":
    # Script entry point: start the crawl from the site's root URL.
    get_leader('https://www.gushiwen.cn')