In [None]:
import os
import re
import time
import random
import math
import requests
import json
import datetime
import pandas as pd
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from bs4 import BeautifulSoup

In [None]:
### Helper functions
# Decorator: re-invoke the wrapped function while it returns a falsy value.
def retry_if_no_return(func):
    """Retry ``func`` until it returns a truthy value or the attempts run out.

    The wrapped function is called up to 6 times (one initial call plus
    5 retries), pausing 2 seconds between attempts.

    Args:
        func: Callable to wrap; it is invoked with the wrapper's arguments.

    Returns:
        A wrapper that returns the first truthy result of ``func``, or
        ``None`` when every attempt produced a falsy value.
    """
    # Local import: functools is not among the file's top-level imports.
    import functools

    max_attempts = 6  # one initial call + 5 retries

    @functools.wraps(func)  # keep the wrapped function's name and docstring
    def wrapper(*args, **kwargs):
        for attempt in range(1, max_attempts + 1):
            result = func(*args, **kwargs)
            if result:
                return result
            # Pause before the next attempt; waiting after the last one is useless.
            if attempt < max_attempts:
                time.sleep(2)
        return None
    return wrapper
# Collect the table rows of the list page (each row carries a detail button).
@retry_if_no_return
def get_detail_page_url(driver):
    """Return every ``tr.ant-table-row`` element found on the current page.

    Despite the name, the return value is the list of row elements, not URLs.
    The ``retry_if_no_return`` decorator re-runs the lookup while the list
    is empty.

    Args:
        driver: Selenium WebDriver positioned on the list page.
    """
    # Locate all <tr> elements that contain the detail button.
    return driver.find_elements(By.CSS_SELECTOR, "tr.ant-table-row")
# Go to the next page of the list.
def step_next_page(driver):
    """Click the pagination "next page" control.

    Args:
        driver: Selenium WebDriver positioned on the list page.
    """
    # Locate the <li> by class name plus title attribute and click it.
    # Alternative locators for the same element would be the XPath
    # "//li[@class='ant-pagination-next' and @title='下一页']" or
    # By.CLASS_NAME with "ant-pagination-next".
    driver.find_element(By.CSS_SELECTOR, "li.ant-pagination-next[title='下一页']").click()

# Return from the detail page to the list (home) page.
def step_back_to_home(driver):
    """Click the button whose <span> text is "返回" (back).

    Args:
        driver: Selenium WebDriver positioned on a detail page.
    """
    driver.find_element(By.XPATH,"//button[span='返回']").click()


# Download one image URL to a local file.
def download_image(url,save_path_name,timeout=30):
    """Download ``url`` and write the response body to ``save_path_name``.

    Args:
        url: Address of the file to download.
        save_path_name: Destination file path (opened in binary write mode).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True when the server answered 200 and the file was written,
        False on a request error, a non-200 status, or a file-system error.
    """
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'}
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Network-level failure (connection error, timeout, invalid URL, ...).
        return False
    if r.status_code != 200:
        # Brief random pause before reporting the failed download.
        time.sleep(random.random())
        return False
    try:
        with open(save_path_name, 'wb') as f:
            f.write(r.content)
    except OSError:
        # Destination not writable (missing directory, permissions, ...).
        return False
    return True

# Read the sampling-point code shown on the current detail page.
@retry_if_no_return
def get_code_number(driver):
    """Return the text of the ``.address-logo-text`` element, or None if empty.

    The ``retry_if_no_return`` decorator re-runs the lookup while the text
    is empty.

    Args:
        driver: Selenium WebDriver positioned on a detail page.
    """
    label = driver.find_element(By.CSS_SELECTOR,'.address-logo-text')
    return label.text if label.text else None
    
def create_save_path(folder_path):
    """Create ``folder_path`` (including parents) unless it already exists.

    Prints a notice instead of creating anything when the path exists.

    Args:
        folder_path: Directory path to create.
    """
    if os.path.exists(folder_path):
        print("目录已存在")
        return
    os.makedirs(folder_path)

In [None]:
# Fetch fertilization information for one sampling point.
def get_sf_info(point_id,headers):
    """Request the fertilization record of ``point_id``.

    Args:
        point_id: Sampling-point identifier interpolated into the URL.
        headers: HTTP headers sent with the request.

    Returns:
        The decoded JSON body on HTTP 200, otherwise False (the status
        code is printed).
    """
    resp = requests.get(
        f"https://sanpu.iarrp.cn/api/ssp/dccy/ldtj/sfyl/{point_id}",
        headers=headers,
    )
    if resp.status_code != 200:
        print("请求失败，状态码：", resp.status_code)
        return False
    return resp.json()
# Fetch attachment (image/video) information for one sampling point.
def get_img_info(point_id,headers):
    """Request the image and video links associated with ``point_id``.

    Args:
        point_id: Sampling-point identifier; it is passed through ``int()``
            before being sent, so a non-numeric value raises ValueError.
        headers: HTTP headers sent with the request.

    Returns:
        The decoded JSON body on HTTP 200, otherwise False (the status
        code is printed).
    """
    # The endpoint takes a JSON body whose "glbh" field is the point id as a string.
    body = json.dumps({"glbh": f"{int(point_id)}"})
    resp = requests.post(
        "https://sanpu.iarrp.cn/api/ssp/fjwj3/list",
        headers=headers,
        data=body,
    )
    if resp.status_code != 200:
        print("请求失败，状态码：", resp.status_code)
        return False
    return resp.json()
# Fetch site-condition information for one sampling point.
def get_ldtj_info(point_id,headers):
    """Request the site-condition record of ``point_id``.

    Args:
        point_id: Sampling-point identifier interpolated into the URL.
        headers: HTTP headers sent with the request.

    Returns:
        The decoded JSON body on HTTP 200, otherwise False (the status
        code is printed).
    """
    resp = requests.get(
        f"https://sanpu.iarrp.cn/api/ssp/dccy/ldtj/{point_id}",
        headers=headers,
    )
    if resp.status_code != 200:
        print("请求失败，状态码：", resp.status_code)
        return False
    return resp.json()
# Fetch the basic information of one sampling point.
def get_base_info(point_id,headers):
    """Request the basic record of ``point_id``.

    Args:
        point_id: Sampling-point identifier interpolated into the URL.
        headers: HTTP headers sent with the request.

    Returns:
        The decoded JSON body on HTTP 200, otherwise False (the status
        code is printed).
    """
    resp = requests.get(
        f"https://sanpu.iarrp.cn/api/ssp/dcyd/{point_id}",
        headers=headers,
    )
    if resp.status_code != 200:
        print("请求失败，状态码：", resp.status_code)
        return False
    return resp.json()
# Fetch soil-sample-bag information for one sampling point.
def get_ctd_info(point_id,headers):
    """Request the soil-sample-bag records of ``point_id``.

    Args:
        point_id: Sampling-point identifier interpolated into the URL.
        headers: HTTP headers sent with the request.

    Returns:
        The decoded JSON body on HTTP 200, otherwise False (the status
        code is printed).
    """
    resp = requests.get(
        f"https://sanpu.iarrp.cn/api/ssp/dccy/ctd/ydbh/{point_id}",
        headers=headers,
    )
    if resp.status_code != 200:
        print("请求失败，状态码：", resp.status_code)
        return False
    return resp.json()
# Read the cookie string of the page currently open in the browser.
def get_cookie():
    """Return ``document.cookie`` of the current page.

    Uses the module-level ``driver`` to run the JavaScript in the browser.
    """
    return driver.execute_script("""
    return document.cookie;
    """)
# Refresh the Cookie header from the live browser session.
def update_cookie(headers):
    """Overwrite ``headers["Cookie"]`` with the browser's current cookie string.

    Args:
        headers: Header mapping to update; it is modified in place.

    Returns:
        The same ``headers`` object, for convenient reassignment.
    """
    headers["Cookie"] = get_cookie()
    return headers
# Fetch the number of already-reviewed sampling points.
def get_audit_num(headers):
    """Request the count of reviewed records.

    The query uses the fixed parameter ``sjdm=52`` (presumably a region
    code; confirm against the API).

    Args:
        headers: HTTP headers sent with the request.

    Returns:
        The ``result`` field of the JSON body on HTTP 200, otherwise False
        (the status code is printed).
    """
    resp = requests.get(
        "https://sanpu.iarrp.cn/api/ssp/dcyd/sj/zj/ysh/sl?sjdm=52",
        headers=headers,
    )
    if resp.status_code != 200:
        print("请求失败，状态码：", resp.status_code)
        return False
    return resp.json()['result']
# Fetch the number of sampling points still awaiting review.
def get_wait_num(headers):
    """Request the count of pending-review records.

    The query uses the fixed parameter ``sjdm=52`` (presumably a region
    code; confirm against the API).

    Args:
        headers: HTTP headers sent with the request.

    Returns:
        The ``result`` field of the JSON body on HTTP 200, otherwise False
        (the status code is printed).
    """
    resp = requests.get(
        "https://sanpu.iarrp.cn/api/ssp/dcyd/sj/zj/dsh/sl?sjdm=52",
        headers=headers,
    )
    if resp.status_code != 200:
        print("请求失败，状态码：", resp.status_code)
        return False
    return resp.json()['result']
# Fetch the sampling-point codes listed on one result page.
def get_set_page_num(url,headers):
    """Return the ``ydbh`` value of every record on the page at ``url``.

    Args:
        url: Full URL of one page of a paginated list endpoint.
        headers: HTTP headers sent with the request.

    Returns:
        A list with the ``ydbh`` field of each entry in
        ``result.records`` on HTTP 200, otherwise False (the status code
        is printed).
    """
    resp = requests.get(url, headers=headers)
    if resp.status_code != 200:
        print("请求失败，状态码：", resp.status_code)
        return False
    records = resp.json()['result']['records']
    return [record['ydbh'] for record in records]

# Collect the codes of all reviewed sampling points.
def get_page_number(total,headers):
    """Walk the reviewed-records list page by page and gather every code.

    Pages hold 50 records each, so ``ceil(total / 50)`` pages are requested,
    with a random sub-second pause after each request.

    Args:
        total: Number of reviewed records.
        headers: HTTP headers sent with each request.

    Returns:
        A list of the codes from all pages, in page order.
    """
    pages = math.ceil(total / 50)
    codes = []
    for page in range(1, pages + 1):
        page_url = f"https://sanpu.iarrp.cn/api/ssp/dcyd/sj/zj/ysh/page?pageNum={page}&pageSize=50&xzqdm=52"
        codes.extend(get_set_page_num(page_url, headers))
        time.sleep(random.random())
    return codes

# Collect the codes of all sampling points awaiting review.
def get_wait_page_number(total,headers):
    """Walk the pending-review list page by page and gather every code.

    Pages hold 50 records each, so ``ceil(total / 50)`` pages are requested,
    with a random sub-second pause after each request.

    Args:
        total: Number of pending-review records.
        headers: HTTP headers sent with each request.

    Returns:
        A list of the codes from all pages, in page order.
    """
    pages = math.ceil(total / 50)
    codes = []
    for page in range(1, pages + 1):
        page_url = f"https://sanpu.iarrp.cn/api/ssp/dcyd/sj/zj/dsh/page?pageNum={page}&pageSize=50&xzqdm=52"
        codes.extend(get_set_page_num(page_url, headers))
        time.sleep(random.random())
    return codes
    

In [None]:
# Configure the browser: attach Selenium to a Chrome instance through its
# remote-debugging address (debuggerAddress).
# Example commands for starting Chrome with remote debugging enabled:
# chrome.exe --remote-debugging-port=9999 --user-data-dir="D:\Selenium\AutomationProfile"
# chrome.exe --remote-debugging-port=9999 --user-data-dir="D:\Program Files\ChromeDir"
# NOTE(review): the example commands use port 9999 but the code below attaches
# to port 9998 — confirm which port is intended.
options = webdriver.ChromeOptions()
options.add_experimental_option("debuggerAddress", "localhost:9998")
driver = webdriver.Chrome(options=options)

# Implicit wait of 60 seconds for element lookups.
driver.implicitly_wait(60)

In [None]:
# Output directory that the exported files below are written into.
save_path = r"F:\collection_spb_info\李相楹"

In [None]:
# HTTP request headers sent with every API call.
# NOTE(review): the Authorization token and Cookie are hard-coded session
# credentials; consider loading them from the environment and rotating them
# if this file has been shared. The Cookie entry is overwritten at runtime
# by update_cookie().
headers = {
    "Accept": "application/json, text/plain, */*",
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Authorization": "eyJhbGciOiJIUzUxMiJ9.eyJ1c2VyX3Rva2VuOiI6InVzZXJfdG9rZW46NTcwY2ViYjktMDE1Yi00OThiLTkzZjUtNjc0YjMwMjY1ZTJhIn0.h0kqz0ehJR1bKh01k5NYFHkssxk2RqJY40jA0xNxO4YXJ4DFhyFuW77L1kInCEwyRkOXX2hdGwoupKdptlKeJQ",
    "Connection": "keep-alive",
    "Cookie": "HWWAFSESID=51dcc73c594961e824; HWWAFSESTIME=1711330630059",
    "Host": "sanpu.iarrp.cn",
    "Referer": "https://sanpu.iarrp.cn/",
    "Sec-Ch-Ua": '"Google Chrome";v="123", "Not:A-Brand";v="8", "Chromium";v="123"',
    "Sec-Ch-Ua-Mobile": "?0",
    "Sec-Ch-Ua-Platform": '"Windows"',
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
}


# Check the headers and update them

In [None]:

# Replace the Cookie header with the cookie string of the live browser session.
headers = update_cookie(headers)

In [None]:
# Collect the codes of both reviewed and pending-review sampling points for
# the current account (they are written to a file in a later cell).
# Refresh the Cookie header first.
headers = update_cookie(headers)
# Reviewed codes: the count request decides how many pages are fetched.
audit_point_list = get_page_number(get_audit_num(headers),headers)
# Pending-review codes, fetched the same way.
wait_point_list = get_wait_page_number(get_wait_num(headers),headers)
# Reviewed codes first, then pending-review codes.
all_point = audit_point_list + wait_point_list

In [None]:
# Write every sampling-point code to point_id.txt, one code per line.
# Make sure the output directory exists first: open() does not create it.
os.makedirs(save_path, exist_ok=True)
with open(os.path.join(save_path,'point_id.txt'),'w',encoding='utf-8') as f:
    for one_id in all_point:
        f.write(one_id+'\n')

In [None]:
# Display the number of codes: all, pending review, reviewed.
len(all_point),len(wait_point_list),len(audit_point_list)

In [None]:
# Smoke test: fetch the basic information of the first sampling point
# and display the raw response.
base_info = get_base_info(all_point[0],headers)
base_info

In [None]:
# Fetch the basic information of every sampling point.
# Refresh the Cookie header from the live browser session first.
headers = update_cookie(headers)
base_info_list = []
for one_point in tqdm(all_point):
    response = get_base_info(one_point,headers)
    # Random sub-second pause between requests.
    time.sleep(random.random())
    # get_base_info returns False on a non-200 status; subscripting that
    # would raise TypeError and abort the whole crawl, so skip the point.
    if not response:
        print("跳过样点：", one_point)
        continue
    one_point_base_info = response['result']
    base_info_list.append(one_point_base_info)


In [None]:
# Build a DataFrame from the collected basic-information records and display it.
df_base_info = pd.DataFrame(base_info_list)
df_base_info

In [None]:
# Save the basic information to base_info.xlsx (without the index column).
df_base_info.to_excel(os.path.join(save_path,'base_info.xlsx'),index=False)

In [None]:
# Smoke test: fetch the image information of the first sampling point
# and display the raw response.
img_info = get_img_info(all_point[0],headers)
img_info

In [None]:
# Collect the image/video link records of every sampling point.
# Refresh the Cookie header from the live browser session first.
headers = update_cookie(headers)
# Domain prepended to each link; loop-invariant, so defined once.
prefix = 'https://sanpu.iarrp.cn/'
img_info_list = []
for one_point in tqdm(all_point):
    response = get_img_info(one_point,headers)
    # Random sub-second pause between requests.
    time.sleep(random.random())
    # get_img_info returns False on a non-200 status; subscripting that
    # would raise TypeError and abort the whole crawl, so skip the point.
    if not response:
        print("跳过样点：", one_point)
        continue
    # A missing (None) result is treated as "no attachments".
    one_point_img_info = response['result'] or []
    # Prefix the 'wjlj' field of every record with the domain.
    updated_list = [{**item, 'wjlj': f"{prefix}{item['wjlj']}"} for item in one_point_img_info]
    img_info_list+=updated_list

In [None]:
# Build a DataFrame from the collected attachment records and display it.
df_img_info = pd.DataFrame(img_info_list)
df_img_info

In [None]:
# Save the media information to img_info.xlsx (without the index column).
df_img_info.to_excel(os.path.join(save_path, 'img_info.xlsx'), index=False)

In [None]:
# Smoke test: fetch the site-condition information of the first sampling
# point and display the raw response.
ldtj_info = get_ldtj_info(all_point[0],headers)
ldtj_info

In [None]:
# Fetch the site-condition information of every sampling point.
ldtj_info_list = []
for one_point in tqdm(all_point):
    response = get_ldtj_info(one_point,headers)
    # get_ldtj_info returns False on a non-200 status; subscripting that
    # would raise TypeError and abort the whole crawl, so skip the point.
    if not response:
        print("跳过样点：", one_point)
        continue
    one_point_ldtj_info = response['result']
    ldtj_info_list.append(one_point_ldtj_info)
    # Random pause (currently disabled)
    # time.sleep(random.random())

In [None]:
# Build a DataFrame from the collected site-condition records and display it.
df_ldtj_info = pd.DataFrame(ldtj_info_list)
df_ldtj_info

In [None]:
# Save the site-condition information to ldtj_info.xlsx (without the index column).
df_ldtj_info.to_excel(os.path.join(save_path,'ldtj_info.xlsx'),index=False)

In [None]:
# Smoke test: fetch the soil-sample-bag information of the first sampling
# point and display the raw response.
ctd_info = get_ctd_info(all_point[0],headers)
ctd_info

In [None]:
# Fetch the soil-sample-bag records of every sampling point.
ctd_info_list = []
for one_point in tqdm(all_point):
    response = get_ctd_info(one_point,headers)
    # get_ctd_info returns False on a non-200 status; subscripting that
    # would raise TypeError and abort the whole crawl, so skip the point.
    if not response:
        print("跳过样点：", one_point)
        continue
    # A missing (None) result is treated as "no records".
    one_point_ctd_info = response['result'] or []
    ctd_info_list+=one_point_ctd_info
    # Random pause (currently disabled)
    # time.sleep(random.random())

In [None]:
# Build a DataFrame from the collected soil-sample-bag records and display it.
df_ctd_info = pd.DataFrame(ctd_info_list)
df_ctd_info

In [None]:
# Save the soil-sample-bag information to ctd_info.xlsx (without the index column).
df_ctd_info.to_excel(os.path.join(save_path, 'ctd_info.xlsx'), index=False)

In [None]:
# Smoke test: fetch the fertilization information of the first sampling
# point and display the raw response.
sf_info = get_sf_info(all_point[0],headers)
sf_info

TODO

In [None]:
# Fetch the fertilization records of every sampling point.
sf_info_list = []
for one_point in tqdm(all_point):
    response = get_sf_info(one_point,headers)
    # get_sf_info returns False on a non-200 status; subscripting that
    # would raise TypeError and abort the whole crawl, so skip the point.
    if not response:
        print("跳过样点：", one_point)
        continue
    # A missing (None) result is treated as "no records".
    one_point_sf_info = response['result'] or []
    sf_info_list+=one_point_sf_info
    # Random pause (currently disabled)
    # time.sleep(random.random())

In [None]:
# Build a DataFrame from the collected fertilization records and display it.
df_sf_info = pd.DataFrame(sf_info_list)
df_sf_info

In [None]:
# Save the fertilization information to sf_info.xlsx (without the index column).
df_sf_info.to_excel(os.path.join(save_path, 'sf_info.xlsx'), index=False)

In [None]:
# Display the request headers currently in use.
headers

In [None]:
# Ad-hoc check: request the first page of the pending-review list and print it.
import requests
import json

# Request URL
url = "https://sanpu.iarrp.cn/api/ssp/dcyd/sj/zj/dsh/page?pageNum=1&pageSize=50&xzqdm=52"

# Request headers (self-assignment: reuses the existing global unchanged)
headers = headers



# Send the GET request
response = requests.get(url, headers=headers)

# Check the response status code
if response.status_code == 200:
    # Parse the response body
    data = response.json()
    print(data)
else:
    print("请求失败，状态码：", response.status_code)


In [None]:
# Ad-hoc check: request the attachment list of one hard-coded point id.
import requests
import json

# Request URL
url = "https://sanpu.iarrp.cn/api/ssp/fjwj3/list"
headers = update_cookie(headers)

# JSON data of the request
# Replace the data dict below as needed
data = {
    "glbh": "5227270101000090"
}

# Convert the dict to a JSON-formatted string
payload = json.dumps(data)

# Send the POST request
# NOTE(review): update_cookie(headers) is called a second time here although
# headers was refreshed just above.
response = requests.post(url, headers=update_cookie(headers), data=payload)

# Check the response status code
if response.status_code == 200:
    # Parse the response body (rebinds `data`, which held the request dict)
    data = response.json()
    print(1)
    print(data)
else:
    print(2)
    print("请求失败，状态码：", response.status_code)


In [None]:
# Ad-hoc check: request the attachment list with a full header set that
# includes Content-Type and Origin.
import requests
import json

# Request URL
url = "https://sanpu.iarrp.cn/api/ssp/fjwj3/list"

# Request headers
# NOTE(review): this rebinds the global `headers` and embeds a hard-coded
# Authorization token and Cookie; consider loading them from the environment.
headers = {
    'Authorization': 'eyJhbGciOiJIUzUxMiJ9.eyJ1c2VyX3Rva2VuOiI6InVzZXJfdG9rZW46NTcwY2ViYjktMDE1Yi00OThiLTkzZjUtNjc0YjMwMjY1ZTJhIn0.h0kqz0ehJR1bKh01k5NYFHkssxk2RqJY40jA0xNxO4YXJ4DFhyFuW77L1kInCEwyRkOXX2hdGwoupKdptlKeJQ',
    'Accept': 'application/json, text/plain, */*',
    'Accept-Encoding': 'gzip, deflate, br, zstd',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Content-Type': 'application/json;charset=UTF-8',
    'Cookie': 'HWWAFSESID=74d6aa0b5af7d5b0b6; HWWAFSESTIME=1711415484123',
    'Host': 'sanpu.iarrp.cn',
    'Origin': 'https://sanpu.iarrp.cn',
    'Referer': 'https://sanpu.iarrp.cn/',
    'Sec-Ch-Ua': '"Google Chrome";v="123", "Not:A-Brand";v="8", "Chromium";v="123"',
    'Sec-Ch-Ua-Mobile': '?0',
    'Sec-Ch-Ua-Platform': '"Windows"',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
}

# JSON data of the request
payload = {
    "glbh": "5227220101000001"
}

# Convert the dict to a JSON-formatted string
payload_json = json.dumps(payload)

# Send the POST request (the Cookie header is replaced by the live browser cookie)
response = requests.post(url, headers=update_cookie(headers), data=payload_json)

# Check the response status code
if response.status_code == 200:
    # Parse the response body
    data = response.json()
    print(data)
else:
    print("请求失败，状态码：", response.status_code)
