-
Notifications
You must be signed in to change notification settings - Fork 0
/
test.py
35 lines (29 loc) · 1.43 KB
/
test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2019/3/28 23:54
# @Author : ZyTomorrow
# @Mail : z794672847@gmail.com
# @Github_Url :
import requests
from bs4 import BeautifulSoup
# Request headers sent with every fetch in this script: a desktop Chrome 73
# (Windows 10, x64) User-Agent plus a Referer pointing at the site's
# daily-proxy index page.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/73.0.3683.86 Safari/537.36"
    ),
    "Referer": "http://ip.zdaye.com/dayProxy.html",
}
# Experiment 1 (disabled): inspect the markup that holds the highest page
# number. The title value '最后页' ("last page") is the attribute text used
# by the site, so it is kept verbatim.
# page = requests.get('http://ip.zdaye.com/dayProxy.html', headers=headers)
# max_page = BeautifulSoup(page.content).find_all('a', title='最后页')[0].contents
# print(max_page[0])

# Experiment 2 (disabled): check that the title link of every entry on the
# index page is found correctly. The index fetch is disabled together with
# the code that consumed it: while active, its result was overwritten below
# without ever being read, costing one extra HTTP request per run.
# page = requests.get('http://ip.zdaye.com/dayProxy.html', headers=headers).content
# print(page)
# title_list = BeautifulSoup(page).find_all('div', class_='title')
# for title in title_list:
#     print(title.a['href'])
# print(title_list)

# Experiment 3 (active): check how the text of one detail page splits into
# individual entries.
# A timeout keeps the script from blocking forever on an unresponsive server,
# and raise_for_status() reports HTTP errors directly instead of letting an
# error page be parsed and fail later with an unrelated IndexError.
response = requests.get('http://ip.zdaye.com/dayProxy/ip/293404.html',
                        headers=headers, timeout=10)
response.raise_for_status()
# Name the parser explicitly so the parse result does not depend on which
# optional parser libraries happen to be installed, and so bs4 does not warn
# about having to guess one.
page = BeautifulSoup(response.content, 'html.parser')
# Earlier attempts, kept for reference:
# content = BeautifulSoup(page).find_all('div', class_='cont')  # all of the text sits under <div class='cont'>
# ip_list = str(content.find_all('div', class_='cont')[0]).split("<br/>")[1:-1]  # the data is separated by <br/> line breaks
# Serialise the first <div class='cont'> and split it on <br/>; the [1:-1]
# slice drops the first and last pieces (presumably the fragments carrying
# the surrounding <div> markup -- confirm against the live page).
ip_list = str(page.find_all('div', class_='cont')[0]).split("<br/>")[1:-1]
print(ip_list)