In [None]:
import os
import re
import time
import random
import math
import requests
import json
import pandas as pd
from tqdm import tqdm
from datetime import datetime
from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

In [None]:
# Load the environment variables this notebook relies on.
load_dotenv()

# Endpoint URLs, each read from the environment.
XJYPCX_URL = os.getenv('XJYPCX_URL')        # sample query
XJ_PAGE_URL = os.getenv('XJ_PAGE_URL')      # sample review
XJ_STATUS_URL = os.getenv('XJ_STATUS_URL')  # status query


# Request headers. Each value comes from the environment variable whose name
# is the header name upper-cased with "-" replaced by "_"
# (e.g. "Sec-Ch-Ua" -> SEC_CH_UA).
headers = {
    header_name: os.getenv(header_name.upper().replace("-", "_"))
    for header_name in (
        "Accept",
        "Accept-Encoding",
        "Accept-Language",
        "Authorization",
        "Connection",
        "Cookie",
        "Host",
        "Referer",
        "Sec-Ch-Ua",
        "Sec-Ch-Ua-Mobile",
        "Sec-Ch-Ua-Platform",
        "Sec-Fetch-Dest",
        "Sec-Fetch-Mode",
        "Sec-Fetch-Site",
        "User-Agent",
    )
}


In [None]:
# Read the cookies of the page currently open in the browser
def get_cookie():
    """Return ``document.cookie`` of the current page.

    The value is obtained by executing JavaScript through the module-level
    ``driver``; whatever ``driver.execute_script`` returns is passed back
    unchanged.
    """
    # JavaScript evaluated inside the page
    js_snippet = """
    return document.cookie;
    """
    return driver.execute_script(js_snippet)
# Refresh the Cookie header
def update_cookie(headers):
    """Overwrite the ``Cookie`` entry of ``headers`` with the current page cookies.

    The mapping is modified in place and the same object is returned.
    """
    # get_cookie() reads the cookies of the page currently open in the browser
    headers["Cookie"] = get_cookie()
    return headers
# Get the number of samples in each status
def get_status_number(headers, yplx, xzqdm):
    """Query the number of samples in each status.

    Sends a GET request to ``XJ_STATUS_URL`` with ``yplx`` and ``xzqdm`` as
    query parameters.

    Args:
        headers: HTTP headers sent with the request.
        yplx: Sample type code, sent as the ``yplx`` query parameter.
        xzqdm: Administrative division code, sent as the ``xzqdm`` query
            parameter.

    Returns:
        The ``result`` field of the JSON response when the status code is
        200; otherwise prints the status code and returns ``False``.
    """
    # Let requests assemble and URL-encode the query string rather than
    # formatting it by hand.
    query = {"yplx": yplx, "xzqdm": xzqdm}
    response = requests.get(XJ_STATUS_URL, headers=headers, params=query)
    if response.status_code != 200:
        print("请求失败，状态码：", response.status_code)
        return False
    return response.json()['result']
# Fetch the records of every page for one area / sample type / review status
def get_one_page_data(headers, qxdm, yplx, shzt):
    """Fetch all records for the given area, sample type and review status.

    Despite its name, this walks every page the server reports, POSTing to
    ``XJ_PAGE_URL`` with a page size of 50.

    Args:
        headers: HTTP headers sent with every request.
        qxdm: Area code; sent as the ``xzqdm`` field of the JSON body.
        yplx: Sample type code.
        shzt: Review status code.

    Returns:
        A list with the ``records`` of all pages, in page order.
    """
    url = XJ_PAGE_URL
    payload = {
        "pageNum": 1,
        "pageSize": 50,
        "xzqdm": str(qxdm),
        "yplx": str(yplx),
        "shzt": str(shzt),
        "expressions": [],
    }
    # The first response carries both the page count and page 1's records,
    # so page 1 does not need to be requested a second time.
    first_page = requests.post(url=url, headers=headers, json=payload).json()['result']
    total_pages = first_page['pages']
    result_list = list(first_page['records']) if total_pages >= 1 else []
    for page_no in range(2, total_pages + 1):
        payload['pageNum'] = page_no
        page_response = requests.post(url=url, headers=headers, json=payload)
        result_list += page_response.json()['result']['records']
    return result_list
def get_all_data(headers, xzqdm):
    """Fetch every record for the given administrative division.

    Issues paginated GET requests (50 records per page) against
    ``XJYPCX_URL``.

    Args:
        headers: HTTP headers sent with every request.
        xzqdm: Administrative division code.

    Returns:
        A list with the ``records`` of all pages, in page order. If the
        first request does not return HTTP 200, the status code is printed
        and ``None`` is returned.
    """
    url = XJYPCX_URL
    params = {
        'pageNum': 1,           # page number
        'pageSize': 50,         # records per page
        'xzqdm': str(xzqdm),    # administrative division code
    }
    response = requests.get(url, headers=headers, params=params)
    if response.status_code != 200:
        # Report the failure instead of silently falling through.
        print("请求失败，状态码：", response.status_code)
        return None

    # The first response carries both the page count and page 1's records,
    # so page 1 does not need to be requested a second time.
    first_page = response.json()['result']
    total_pages = first_page['pages']
    result_list = list(first_page['records']) if total_pages >= 1 else []
    for page_no in range(2, total_pages + 1):
        params['pageNum'] = page_no
        page_response = requests.get(url, headers=headers, params=params)
        result_list += page_response.json()['result']['records']
    return result_list
# Get today's date
def get_today_date():
    """Return the current local date as a ``YYYYMMDD`` string."""
    return f"{datetime.today():%Y%m%d}"
# Locate a file by a fragment of its name
def find_file_with_string(path, string):
    """Return the path of the first file under ``path`` whose name contains ``string``.

    The tree is traversed with ``os.walk``; the first match encountered is
    returned as ``os.path.join(directory, filename)``. Returns ``None`` when
    no file name contains ``string``.
    """
    matches = (
        os.path.join(folder, filename)
        for folder, _subdirs, filenames in os.walk(path)
        for filename in filenames
        if string in filename
    )
    return next(matches, None)
# Delete files
def delete_files(file_path):
    """Delete the given files, printing the outcome for each one.

    Args:
        file_path: An iterable of file paths, or a single path given as
            ``str``, ``bytes`` or ``os.PathLike``. A falsy value (``None``,
            empty list, empty string) prints a notice and does nothing.

    Returns:
        None. Errors raised while deleting a file are caught and printed so
        the remaining files are still processed.
    """
    if not file_path:
        print("列表为空！")
        return

    # A bare path is itself iterable (one character at a time); wrap it so it
    # is handled as one file instead of many one-character "paths".
    if isinstance(file_path, (str, bytes, os.PathLike)):
        file_path = [file_path]

    for path in file_path:
        try:
            if os.path.exists(path):
                os.remove(path)
                print(f"文件 {path} 删除成功！")
            else:
                print(f"文件 {path} 不存在！")
        except Exception as e:
            # Best effort: report the problem and continue with the next file.
            print(f"删除文件 {path} 时出错: {e}")

In [None]:
# Configure the browser: the "debuggerAddress" option points Selenium at a
# Chrome instance listening on that address (presumably one started
# beforehand with remote debugging enabled — see the example commands).
# chrome.exe --remote-debugging-port=9999 --user-data-dir="D:\Selenium\AutomationProfile"
# chrome.exe --remote-debugging-port=9999 --user-data-dir="D:\Program Files\ChromeDir"
# NOTE(review): the example commands use port 9999 while debuggerAddress
# below is localhost:9998 — confirm which port is intended.
options = webdriver.ChromeOptions()
options.add_experimental_option("debuggerAddress", "localhost:9998")
driver = webdriver.Chrome(options=options)

# Implicit wait of 60 for element lookups (Selenium interprets the value as seconds).
driver.implicitly_wait(60)

In [None]:
# Directory the exported files are written to
save_path = r"F:\collection_spb_info\XJSH\SB"
# Create the directory (and any missing parents). exist_ok=True makes this a
# no-op when it already exists and avoids the check-then-create race.
os.makedirs(save_path, exist_ok=True)

In [None]:

# Copy the cookies of the page currently open in the browser into the request headers
headers = update_cookie(headers)

In [None]:
# Check the per-status sample counts (example).
# yplx=1: surface layer, yplx=2: profile, yplx=3: "water aggregate"
# (literal translation of the original note — confirm the exact term)
get_status_number(headers=headers,yplx=1,xzqdm='522623')

In [None]:
# 获取指定区域、样品类型、审核状态数据
# get_one_page_data(headers=headers,qxdm='522701',yplx='1',shzt='0')

In [None]:
# 获取指定区域的全部数据
# get_all_data(headers=headers,xzqdm='522701')

In [None]:
# Fetch every record for administrative division code 510104
result_list = get_all_data(headers=headers,xzqdm='510104')

In [None]:
# Preview only the first few records; echoing the whole list would flood the cell output
result_list[:5]

In [None]:
# Build a DataFrame holding the basic info from the fetched records
df_base_info =pd.DataFrame(result_list)


In [None]:
# Preview the first rows instead of echoing the whole frame
df_base_info.head()

In [None]:
# Spot-check one randomly sampled row
df_base_info.sample(1)

In [None]:
# Export the table to save_path; the file name embeds today's date and the record count
df_base_info.to_excel(
    os.path.join(
        save_path,
        f'all_info_{get_today_date()}_{len(result_list)}.xlsx',
    ),
    index=False,
)