In [1]:
import os
import re
import time
import random
import math
import requests
import json
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

In [2]:
# 读取需要的环境变量
load_dotenv()
# link info
SF_URL = os.getenv('SF_URL')
LDTJ_URL = os.getenv('LDTJ_URL')
JBXX_URL = os.getenv('JBXX_URL')
CTD_URL = os.getenv('CTD_URL')
IMG_URL = os.getenv('IMG_URL')
PM_URL = os.getenv('PM_URL')
PM_FC_URL = os.getenv('PM_FC_URL')
DSH_URL = os.getenv('DSH_URL')
YSH_URL = os.getenv('YSH_URL')
DSH_PAGE = os.getenv('DSH_PAGE')
YSH_PAGE = os.getenv('YSH_PAGE')
PAGE_TAIL = os.getenv('PAGE_TAIL')
BASE_NAME = os.getenv('BASE_NAME')

# GJ LINK
CITY_CODE_URL = os.getenv('CITY_CODE_URL')
QCZS_URL = os.getenv('QCZS_URL')
# headers info 
img_headers = {
    "Accept": os.getenv("ACCEPT"),
    "Accept-Encoding": os.getenv("ACCEPT_ENCODING"),
    "Accept-Language": os.getenv("ACCEPT_LANGUAGE"),
    "Authorization": os.getenv("AUTHORIZATION"),
    "Connection": os.getenv("CONNECTION"),
    "Content-Length": os.getenv("CONTENT_LENGTH"),
    "Content-Type": os.getenv("CONTENT_TYPE"),
    "Cookie": os.getenv("COOKIE"),
    "Host": os.getenv("HOST"),
    "origin": os.getenv("ORIGIN"),
    "Referer": os.getenv("REFERER"),
    "Sec-Ch-Ua": os.getenv("SEC_CH_UA"),
    "Sec-Ch-Ua-Mobile": os.getenv("SEC_CH_UA_MOBILE"),
    "Sec-Ch-Ua-Platform": os.getenv("SEC_CH_UA_PLATFORM"),
    "Sec-Fetch-Dest": os.getenv("SEC_FETCH_DEST"),
    "Sec-Fetch-Mode": os.getenv("SEC_FETCH_MODE"),
    "Sec-Fetch-Site": os.getenv("SEC_FETCH_SITE"),
    "User-Agent": os.getenv("USER_AGENT")
}
headers = {
    "Accept": os.getenv("ACCEPT"),
    "Accept-Encoding": os.getenv("ACCEPT_ENCODING"),
    "Accept-Language": os.getenv("ACCEPT_LANGUAGE"),
    "Authorization": os.getenv("AUTHORIZATION"),
    "Connection": os.getenv("CONNECTION"),
    "Cookie": os.getenv("COOKIE"),
    "Host": os.getenv("HOST"),
    "Referer": os.getenv("REFERER"),
    "Sec-Ch-Ua": os.getenv("SEC_CH_UA"),
    "Sec-Ch-Ua-Mobile": os.getenv("SEC_CH_UA_MOBILE"),
    "Sec-Ch-Ua-Platform": os.getenv("SEC_CH_UA_PLATFORM"),
    "Sec-Fetch-Dest": os.getenv("SEC_FETCH_DEST"),
    "Sec-Fetch-Mode": os.getenv("SEC_FETCH_MODE"),
    "Sec-Fetch-Site": os.getenv("SEC_FETCH_SITE"),
    "User-Agent": os.getenv("USER_AGENT")}


In [3]:
### 辅助函数
# 装饰器，如果函数没有返回值，则一直重试
def retry_if_no_return(func):
    def wrapper(*args, **kwargs):
        # 定义重试次数
        retry_times = 5
        while retry_times>=0:
            result = func(*args, **kwargs)
            if result:
                return result
            # 如果函数没有返回值，可以在这里添加日志或等待时间
            time.sleep(2)
            retry_times -= 1
    return wrapper

# 遍历url列表下载图片
@retry_if_no_return
def download_image(url,save_path_name,headers):
    headers = headers
    try:
        r = requests.get(url, headers=headers)
        if r.status_code == 200:
            with open(save_path_name, 'wb') as f:
                f.write(r.content)
                return True
    except:
        return False
    time.sleep(random.random())

In [4]:
# GJ_INFO
# 获取市州、区县编码
def get_city_code(code,headers):
    """获取市州、区县编码"""
    url = CITY_CODE_URL
    # 请求的payload
    params = {
        'code': f'{code}',
    }

    # 发送GET请求
    response = requests.get(url, headers=headers, params=params)
    # 检查响应状态码
    if response.status_code == 200:
        # 解析响应内容
        data = response.json()
        return data
    else:
        print("请求失败，状态码：", response.status_code)
        return False
# 获取基本信息
def get_gj_info_total(code,headers,sjdm=None,ydlb=None,sfsty=None,sfstypxy=None,sfzdbb=None,cylx=None,
                sfjxzcyd=None,sfdbnpyd=None,sfqmpk=None,sfyjd=None,sfgjswy=None,sfsjswy=None,
                sftzy=None,sfttcpyd=None,sfwryxcyd=None,tdlylx=None):
    result_list = []
    """获取基本信息"""
    url = QCZS_URL
    params = {
        'pageNum': 1, # 页码
        'pageSize': 50, # 每页显示数量
        'sjdm': sjdm, # 省市区编码
        'ydlb': ydlb, # 类别
        'sfsty': sfsty, # 水团
        'sfstypxy': sfstypxy, # 水团平行
        'sfzdbb': sfzdbb, # 整段标本
        'cylx': cylx, # 采样类型
        'sfjxzcyd': sfjxzcyd, # 机械组成样点
        'sfdbnpyd': sfdbnpyd, # 地理标志点
        'sfqmpk': sfqmpk, # 全面铺开
        'sfyjd': sfyjd, # 盐碱地
        'sfgjswy': sfgjswy, # 生物点
        'sfsjswy': sfsjswy, # 省级生物点
        'sftzy': sftzy, # 土钻样
        'sfttcpyd': sfttcpyd, # 土特产
        'sfwryxcyd': sfwryxcyd, # xc污染
        'tdlylx': tdlylx, # TD利用类型
        'keyword': '',
        'xzqdm': f'{code}', # 行政区划代码
    }
        # 发送GET请求
    response = requests.get(url, headers=headers, params=params)
    # 检查响应状态码
    if response.status_code == 200:
        # 解析响应内容
        data = response.json()
        total = data['result']['total']
            # 计算有多少页
        page_num = math.ceil(total / 50)
        # 循环获取每一页的编码
        for i in range(1, page_num + 1):
            params['pageNum'] = i
            response = requests.get(url=url, headers=headers, params=params)
            data = response.json()
            result_list += data['result']['records']
        return result_list
    else:
        print("请求失败，状态码：", response.status_code)
        return False



In [5]:
# 获取施肥信息
def get_sf_info(point_id,headers):
    """获取施肥信息"""
    url = f"{SF_URL}{point_id}"
    # 发送GET请求
    response = requests.get(url, headers=headers)
    # 检查响应状态码
    if response.status_code == 200:
        # 解析响应内容
        data = response.json()
        return data
    else:
        print("请求失败，状态码：", response.status_code)
        return False
# 获取图片信息
def get_img_info(point_id,headers):
    """获取图片和视频链接"""
    url = IMG_URL
    # 请求的JSON数据
    # 请根据实际情况替换下面的data字典
    data = {
        "glbh": f"{int(point_id)}"
    }
    # 将字典转换为JSON格式的字符串
    payload = json.dumps(data)
    # 发送POST请求
    response = requests.post(url, headers=headers, data=payload)
    # 检查响应状态码
    if response.status_code == 200:
        # 解析响应内容
        data = response.json()
        return data
    else:
        print("请求失败，状态码：", response.status_code)
        return False
# 获取立地条件信息
def get_ldtj_info(point_id,headers):
    """获取立地条件信息"""
    url = f"{LDTJ_URL}{point_id}"
    # 发送GET请求
    response = requests.get(url, headers=headers)
    # 检查响应状态码
    if response.status_code == 200:
        # 解析响应内容
        data = response.json()
        return data
    else:
        print("请求失败，状态码：", response.status_code)
        return False
# 获取样点基本信息
def get_base_info(point_id,headers):
    """获取基本信息"""
    url = f"{JBXX_URL}{point_id}"
        # 发送GET请求
    response = requests.get(url, headers=headers)
    # 检查响应状态码
    if response.status_code == 200:
        # 解析响应内容
        data = response.json()
        return data
    else:
        print("请求失败，状态码：", response.status_code)
        return False
# 获取采土袋信息
def get_ctd_info(point_id,headers):
    """获取采土袋信息"""
    url = f"{CTD_URL}{point_id}"
    # 发送GET请求
    response = requests.get(url, headers=headers)
    # 检查响应状态码
    if response.status_code == 200:
        # 解析响应内容
        data = response.json()
        return data
    else:
        print("请求失败，状态码：", response.status_code)
        return False
    
# 获取剖面信息
def get_pm_info(point_id,headers):
    """获取剖面信息"""
    url = f"{PM_URL}{point_id}"
    # 发送GET请求
    response = requests.get(url, headers=headers)
    # 检查响应状态码
    if response.status_code == 200:
        # 解析响应内容
        data = response.json()
        return data
    else:
        print("请求失败，状态码：", response.status_code)
        return False
# 获取剖面发生层信息
def get_pmfc_info(point_id,headers):
    """获取剖面发生层信息"""
    url = f"{PM_FC_URL}{point_id}"
    # 发送GET请求
    response = requests.get(url, headers=headers)
    # 检查响应状态码
    if response.status_code == 200:
        # 解析响应内容
        data = response.json()
        return data
    else:
        print("请求失败，状态码：", response.status_code)
        return False
# 获取当前页面cookie信息
def get_cookie():
    """获取当前页面cookie信息"""
    # 使用JavaScript获取Cookie
    cookie_script = """
    return document.cookie;
    """
    cookie_value = driver.execute_script(cookie_script)
    return cookie_value
# 更新cookie信息
def update_cookie(headers):
    """更新cookie信息"""
    # 获取当前页面cookie信息
    cookie_value = get_cookie()
    # 更新headers
    headers.update({"Cookie": cookie_value})
    return headers
# 获取已审核数量
def get_audit_num(headers):
    """获取已审核数量"""
    url = YSH_URL
    # 发送GET请求
    response = requests.get(url, headers=headers)
    # 检查响应状态码
    if response.status_code == 200:
        # 解析响应内容
        data = response.json()
        return data['result']
    else:
        print("请求失败，状态码：", response.status_code)
        return False
# 获取待审核数量
def get_wait_num(headers):
    """获取待审核数量"""
    url = DSH_URL
    # 发送GET请求
    response = requests.get(url, headers=headers)
    # 检查响应状态码
    if response.status_code == 200:
        # 解析响应内容
        data = response.json()
        return data['result']
    else:
        print("请求失败，状态码：", response.status_code)
        return False
# 获取指定页面的编码
def get_set_page_num(url,headers):
    """获取指定页面的编码"""
    response = requests.get(url, headers=headers)
    # 检查响应状态码
    if response.status_code == 200:
        # 解析响应内容
        data = response.json()
        info_list = data['result']['records']
        # 循环获取编码
        temp_dict = [{_['ydbh']:_['ydlb']} for _ in info_list]
        return temp_dict
    else:
        print("请求失败，状态码：", response.status_code)
        return False

# 获取已审核的编码
def get_page_number(total,headers):
    """获取当前传入页面的编码,默认每一页50"""
    # 计算有多少页
    page_num = math.ceil(total / 50)
    # 循环获取每一页的编码
    result_list = []
    for i in range(1, page_num + 1):
        url = f"{YSH_PAGE}{i}{PAGE_TAIL}"
        result_list+=get_set_page_num(url,headers)
        time.sleep(random.random())
    return result_list

# 获取待审核的编码
def get_wait_page_number(total,headers):
    """获取当前传入页面的编码,默认每一页50"""
    # 计算有多少页
    page_num = math.ceil(total / 50)
    # 循环获取每一页的编码
    temp_list = []
    for i in range(1, page_num + 1):
        url = f"{DSH_PAGE}{i}{PAGE_TAIL}"
        temp_list+=get_set_page_num(url,headers)
        time.sleep(random.random())
    return temp_list
    
# 随机休眠
def random_sleep():
    """随机休眠"""
    time.sleep(random.random())
    return

In [6]:
# 配置浏览器
# chrome.exe --remote-debugging-port=9999 --user-data-dir="D:\Selenium\AutomationProfile"
# chrome.exe --remote-debugging-port=9999 --user-data-dir="D:\Program Files\ChromeDir"
options = webdriver.ChromeOptions()
options.add_experimental_option("debuggerAddress", "localhost:9998")
driver = webdriver.Chrome(options=options)

driver.implicitly_wait(60)

In [7]:
# 保存路径
save_path = r"F:\collection_spb_info\GJ\QZ"
# 如果没有该文件夹，则创建
if not os.path.exists(save_path):
    os.makedirs(save_path)

# 检查heraders并更新

In [8]:

headers = update_cookie(headers)

In [9]:
headers

{'Accept': 'application/json, text/plain, */*',
 'Accept-Encoding': 'gzip, deflate, br, zstd',
 'Accept-Language': 'zh-CN,zh;q=0.9',
 'Authorization': 'eyJhbGciOiJIUzUxMiJ9.eyJ1c2VyX3Rva2VuOiI6InVzZXJfdG9rZW46MDljNGM3ZGQtYjc0ZS00N2M0LTlhNTQtMGViZmJmY2RlNmJhIn0.MNnInpNwW9ElDUgGHIsLBvPLAywZLCphRxp79RtWT83HOlBTaPNqCnhLKfJ4OO4gsVcPhIhg9sBrxVTFh4x9vg',
 'Connection': 'keep-alive',
 'Cookie': 'HWWAFSESID=ca78a7387d9f59d2fb; HWWAFSESTIME=1721118780204',
 'Host': 'sanpu.iarrp.cn',
 'Referer': 'https://sanpu.iarrp.cn/',
 'Sec-Ch-Ua': 'Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"',
 'Sec-Ch-Ua-Mobile': '?0',
 'Sec-Ch-Ua-Platform': '"Windows"',
 'Sec-Fetch-Dest': 'empty',
 'Sec-Fetch-Mode': 'cors',
 'Sec-Fetch-Site': 'same-origin',
 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'}

In [10]:
# 查看区县编码
get_city_code(5201,headers)

{'success': True,
 'message': None,
 'code': 200,
 'result': [{'ctime': None,
   'utime': None,
   'id': '3304',
   'parentId': '1028',
   'code': '520102',
   'name': '南明区',
   'fullname': None,
   'level': 3,
   'centerx': 106.715963,
   'centery': 26.573743,
   'sfspsdx': None,
   'sfqmpk': '1'},
  {'ctime': None,
   'utime': None,
   'id': '3305',
   'parentId': '1028',
   'code': '520103',
   'name': '云岩区',
   'fullname': None,
   'level': 3,
   'centerx': 106.713397,
   'centery': 26.58301,
   'sfspsdx': None,
   'sfqmpk': '1'},
  {'ctime': None,
   'utime': None,
   'id': '3306',
   'parentId': '1028',
   'code': '520111',
   'name': '花溪区',
   'fullname': None,
   'level': 3,
   'centerx': 106.670791,
   'centery': 26.410464,
   'sfspsdx': None,
   'sfqmpk': '1'},
  {'ctime': None,
   'utime': None,
   'id': '3307',
   'parentId': '1028',
   'code': '520112',
   'name': '乌当区',
   'fullname': None,
   'level': 3,
   'centerx': 106.762123,
   'centery': 26.630928,
   'sfspsdx': No

In [11]:
# 获取选择条件的编码
# 更新headers
headers = update_cookie(headers=headers)
# get info 
info_list = get_gj_info_total(code=520181,headers=headers)


In [104]:
# 保存基本信息
df_base_info =pd.DataFrame(info_list)
df_base_info.to_excel(os.path.join(save_path,f'base_info{len(info_list)}.xlsx'),index=False)

In [105]:
df_base_info.head(1)

Unnamed: 0,ctime,utime,id,bsm,ysdm,ydbh,ydlb,zldwdm,zldwmc,xzdm,...,gjjshsj,bz,sfspsdx,sfqmpk,gjlycjCy,zdbbzt,appVersion,sfwryxcyd,sfwrxcyd2,sfgd
0,2023-08-10 10:28:53,2024-04-07 11:23:08,a88866d8-f089-4e02-93cc-79d964f56f0f,,,5201810101000001,0,520181100206,中山村,520181100,...,,,0,1,,,1.47.2,0,0,0


In [106]:
# 区分样点类别
pm_list = df_base_info[df_base_info['ydlb']=='1']['ydbh'].to_list()
bc_list = df_base_info[df_base_info['ydlb']=='0']['ydbh'].to_list()
# 分类型打印样点信息
print(f"剖面样点数：{len(pm_list)}",f"表层样点数：{len(bc_list)}")
# 检查总数
df_base_info.shape[0]==len(pm_list)+len(bc_list)

剖面样点数：103 表层样点数：1091


True

In [107]:
# 更新替换信息
loop_all_point = bc_list+pm_list

In [108]:
# 测试一个点位的图片信息
img_info = get_img_info(loop_all_point[0],img_headers)
img_info

{'success': True,
 'message': None,
 'code': 200,
 'result': [{'ctime': '2024-03-28 12:56:39',
   'utime': '2024-03-28 12:56:39',
   'id': '1773212585837039618',
   'glbh': '5201810101000001',
   'wjbh': None,
   'wjmc': '1697078531744.mp4',
   'wjlx': None,
   'wjfl': '1400',
   'wjlj': 'ssp-dccy/2024-03-28/520181/88ea2df2-a091-41bb-b82f-25847b795a21.mp4',
   'jd': 106.44865,
   'wd': 26.494167,
   'jdu': None,
   'cjsj': '2023-10-12 10:42:24',
   'bz': None,
   'url': None,
   'rawUrl': None},
  {'ctime': '2024-03-28 12:56:39',
   'utime': '2024-03-28 12:56:39',
   'id': '1773212585837039619',
   'glbh': '5201810101000001',
   'wjbh': None,
   'wjmc': '1697078566062.mp4',
   'wjlx': None,
   'wjfl': '1400',
   'wjlj': 'ssp-dccy/2024-03-28/520181/aaa364b1-6610-42b1-b10d-e4f1cfe7e43f.mp4',
   'jd': 106.44865,
   'wd': 26.494167,
   'jdu': None,
   'cjsj': '2023-10-12 10:42:59',
   'bz': None,
   'url': None,
   'rawUrl': None},
  {'ctime': '2024-03-28 12:56:39',
   'utime': '2024-03-28

In [112]:
# 获取图片链接信息
# 更新headers
img_headers = update_cookie(img_headers)
img_info_list = []
for one_point in tqdm(loop_all_point):
    one_point_img_info = get_img_info(one_point,img_headers)['result']
    # 为链接增加域名
    prefix = BASE_NAME
    updated_list = [{**item, 'wjlj': f"{prefix}{item['wjlj']}"} for item in one_point_img_info]
    # 随机休眠
    # time.sleep(random.random())
    img_info_list+=updated_list

100%|██████████| 1194/1194 [03:57<00:00,  5.02it/s]


In [113]:
df_img_info = pd.DataFrame(img_info_list)
df_img_info.head(1)

Unnamed: 0,ctime,utime,id,glbh,wjbh,wjmc,wjlx,wjfl,wjlj,jd,wd,jdu,cjsj,bz,url,rawUrl
0,2024-03-28 12:56:39,2024-03-28 12:56:39,1773212585837039618,5201810101000001,,1697078531744.mp4,,1400,https://sanpu.iarrp.cn/ssp-dccy/2024-03-28/520...,106.44865,26.494167,,2023-10-12 10:42:24,,,


In [114]:
# 保存媒体信息
df_img_info.to_excel(os.path.join(save_path, f'img_info_{len(loop_all_point)}.xlsx'), index=False)

In [115]:
# 测试一个点位的立地调查信息
ldtj_info = get_ldtj_info(loop_all_point[0],headers)
ldtj_info

{'success': True,
 'message': None,
 'code': 200,
 'result': {'ctime': '2024-03-28 12:56:39',
  'utime': '2024-03-28 20:04:41',
  'id': '1773212585807679490',
  'ydbh': '5201810101000001',
  'zldwmc': '中山村',
  'cyjd': 106.44840860965174,
  'cywd': 26.493917787733082,
  'hbgd': 1236.0,
  'cysj': '2023-10-12 12:30:46',
  'tqqk': '03',
  'dcz': '闫黔昆',
  'dcdw': '有色一总队',
  'qslx': 'W',
  'qscd': 'S',
  'jyclfd': 'F',
  'jycljj': 'C',
  'dblsfd': 'N',
  'dblsdx': None,
  'dbybfd': 'N',
  'dbybhd': None,
  'dblxkd': None,
  'dblxcd': None,
  'dblxfd': 'N',
  'dblxjj': None,
  'dblxsl': None,
  'trsh': '0',
  'ddx': 'PT',
  'zdx': 'LH',
  'xdx': 'AF',
  'dxbw': 'LS',
  'dxbwqt': None,
  'pd': 'II',
  'px': 'W',
  'pxn': '02',
  'my': '20',
  'myqt': None,
  'mz': 'LG',
  'mzqt': None,
  'zblx': '11',
  'zbzwysz': '农作物',
  'zbfgd': None,
  'zbqmfgd': None,
  'zbgmfgd': None,
  'zbcbfgd': None,
  'tdlylx': '0101',
  'tdlylxqt': None,
  'tdlylxbg': '[]',
  'sftsncp': '0',
  'tsncpzwlx': None,
  

In [116]:
# 获取立地条件信息
# 更新headers
headers = update_cookie(headers)
ldtj_info_list = []
for one_point in tqdm(loop_all_point):
    one_point_ldtj_info = get_ldtj_info(one_point,headers)['result']
    ldtj_info_list.append(one_point_ldtj_info)
    # 随机休眠
    # time.sleep(random.random())

100%|██████████| 1194/1194 [03:34<00:00,  5.57it/s]


In [117]:
ldtj_info_list,len(ldtj_info_list)

([{'ctime': '2024-03-28 12:56:39',
   'utime': '2024-03-28 20:04:41',
   'id': '1773212585807679490',
   'ydbh': '5201810101000001',
   'zldwmc': '中山村',
   'cyjd': 106.44840860965174,
   'cywd': 26.493917787733082,
   'hbgd': 1236.0,
   'cysj': '2023-10-12 12:30:46',
   'tqqk': '03',
   'dcz': '闫黔昆',
   'dcdw': '有色一总队',
   'qslx': 'W',
   'qscd': 'S',
   'jyclfd': 'F',
   'jycljj': 'C',
   'dblsfd': 'N',
   'dblsdx': None,
   'dbybfd': 'N',
   'dbybhd': None,
   'dblxkd': None,
   'dblxcd': None,
   'dblxfd': 'N',
   'dblxjj': None,
   'dblxsl': None,
   'trsh': '0',
   'ddx': 'PT',
   'zdx': 'LH',
   'xdx': 'AF',
   'dxbw': 'LS',
   'dxbwqt': None,
   'pd': 'II',
   'px': 'W',
   'pxn': '02',
   'my': '20',
   'myqt': None,
   'mz': 'LG',
   'mzqt': None,
   'zblx': '11',
   'zbzwysz': '农作物',
   'zbfgd': None,
   'zbqmfgd': None,
   'zbgmfgd': None,
   'zbcbfgd': None,
   'tdlylx': '0101',
   'tdlylxqt': None,
   'tdlylxbg': '[]',
   'sftsncp': '0',
   'tsncpzwlx': None,
   'tsncpzwlx

In [118]:
df_ldtj_info = pd.DataFrame(ldtj_info_list)
df_ldtj_info.head(1)

Unnamed: 0,ctime,utime,id,ydbh,zldwmc,cyjd,cywd,hbgd,cysj,tqqk,...,glswylszl,rzlstjzb,hlsbctrhhfd,hlsbctrhhzl,cyhydgs,cbfmc,cbfzjhm,cbflxfs,hydgs,bz
0,2024-03-28 12:56:39,2024-03-28 20:04:41,1773212585807679490,5201810101000001,中山村,106.448409,26.493918,1236.0,2023-10-12 12:30:46,3,...,,,0,0,,李显钟,522502196705020839,13639102726,9.0,所有采样点全部在同一地块内，景观照定位有误


In [None]:
# 过滤NONE值
ldtj_info_list = [x for x in ldtj_info_list if x is not None]  # 过滤掉 None 值
df_ldtj_info = pd.DataFrame(ldtj_info_list)
df_ldtj_info.head(1)


In [119]:
# 保存立地条件信息
df_ldtj_info.to_excel(os.path.join(save_path,f'ldtj_info_{len(loop_all_point)}.xlsx'),index=False)

In [120]:
# 测试一个点位的采土袋信息
ctd_info = get_ctd_info(loop_all_point[0],headers)
ctd_info

{'success': True,
 'message': None,
 'code': 200,
 'result': [{'ctime': '2023-10-15 09:17:26',
   'utime': '2024-03-28 12:56:39',
   'id': '1713363420395442178',
   'ctdbh': '520181010100000110',
   'ydbh': '5201810101000001',
   'ydlb': '0',
   'cylx': '1',
   'yplx': '1',
   'fscxh': None,
   'ypzl': 6185.0,
   'jszt': '2',
   'jshzt': '2'},
  {'ctime': '2023-10-15 09:17:26',
   'utime': '2024-03-28 12:56:39',
   'id': '1713363420395442179',
   'ctdbh': '520181010100000120',
   'ydbh': '5201810101000001',
   'ydlb': '0',
   'cylx': '1',
   'yplx': '4',
   'fscxh': None,
   'ypzl': None,
   'jszt': '1',
   'jshzt': None}],
 'timestamp': 1719545173089,
 'elapse': 0}

In [121]:
# 获取采土袋信息
# 更新headers
headers = update_cookie(headers)
ctd_info_list = []
for one_point in tqdm(loop_all_point):
    one_point_ctd_info = get_ctd_info(one_point,headers)['result']
    ctd_info_list+=one_point_ctd_info
    # 随机休眠
    # time.sleep(random.random())

100%|██████████| 1194/1194 [04:14<00:00,  4.69it/s]


In [122]:
df_ctd_info = pd.DataFrame(ctd_info_list)
df_ctd_info.head(1)

Unnamed: 0,ctime,utime,id,ctdbh,ydbh,ydlb,cylx,yplx,fscxh,ypzl,jszt,jshzt
0,2023-10-15 09:17:26,2024-03-28 12:56:39,1713363420395442178,520181010100000110,5201810101000001,0,1,1,,6185.0,2,2


In [123]:
# 保存采土袋信息
df_ctd_info.to_excel(os.path.join(save_path, f'ctd_info_{len(loop_all_point)}.xlsx'), index=False)

In [124]:
# 测试一个点位的施肥信息
sf_info = get_sf_info(loop_all_point[0],headers)
sf_info

{'success': True,
 'message': None,
 'code': 200,
 'result': [{'ctime': '2024-03-28 12:56:39',
   'utime': '2024-03-28 12:56:39',
   'id': '1773212585816068097',
   'ydbh': '5201810101000001',
   'jd': '第一季',
   'fllx': '化学氮肥',
   'flmc': '尿素',
   'swyl': 11.0,
   'hlzb': 15,
   'yshl': 1.65,
   'zwlx': '12',
   'zwlxqt': None,
   'jfzb': 0,
   'zfzb': 100},
  {'ctime': '2024-03-28 12:56:39',
   'utime': '2024-03-28 12:56:39',
   'id': '1773212585816068098',
   'ydbh': '5201810101000001',
   'jd': '第一季',
   'fllx': '化学氮肥',
   'flmc': '三元复合（混）肥',
   'swyl': 25.0,
   'hlzb': 15,
   'yshl': 3.75,
   'zwlx': '12',
   'zwlxqt': None,
   'jfzb': 100,
   'zfzb': 0},
  {'ctime': '2024-03-28 12:56:39',
   'utime': '2024-03-28 12:56:39',
   'id': '1773212585816068099',
   'ydbh': '5201810101000001',
   'jd': '第一季',
   'fllx': '磷肥',
   'flmc': '三元复合（混）肥',
   'swyl': 25.0,
   'hlzb': 15,
   'yshl': 3.75,
   'zwlx': '12',
   'zwlxqt': None,
   'jfzb': 100,
   'zfzb': 0},
  {'ctime': '2024-03-28 12:

In [125]:
# 获取施肥信息
# 更新headers
headers = update_cookie(headers)
sf_info_list = []
for one_point in tqdm(loop_all_point):
    one_point_sf_info = get_sf_info(one_point,headers)['result']
    sf_info_list+=one_point_sf_info
    # 随机休眠
    # time.sleep(random.random())

100%|██████████| 1194/1194 [04:12<00:00,  4.72it/s]


In [126]:
df_sf_info = pd.DataFrame(sf_info_list)
df_sf_info.head(1)

Unnamed: 0,ctime,utime,id,ydbh,jd,fllx,flmc,swyl,hlzb,yshl,zwlx,zwlxqt,jfzb,zfzb
0,2024-03-28 12:56:39,2024-03-28 12:56:39,1773212585816068097,5201810101000001,第一季,化学氮肥,尿素,11.0,15.0,1.65,12,,0.0,100.0


In [127]:
# 保存施肥信息
df_sf_info.to_excel(os.path.join(save_path, f'sf_info_{len(loop_all_point)}.xlsx'), index=False)

# 剖面信息

In [128]:
pm_point_id = pm_list

In [129]:
# 测试剖面点位的信息
pm_info = get_pm_info(pm_point_id[0],headers)
pm_info

{'success': True,
 'message': None,
 'code': 200,
 'result': {'ctime': '2023-11-28 12:26:46',
  'utime': '2024-03-26 20:12:31',
  'id': '1729356131350974466',
  'ydbh': '5201810101100005',
  'pmzp': None,
  'yxtchd': 115,
  'gzchd': 15.0,
  'tthd': 115,
  'ttgx': None,
  'ttgxqt': None,
  'fscs': 6,
  'fsxjs': '该土壤由于石灰岩风化的坡残积物发育的土壤，经水耕熟化后，形成了明显的耕作层和犁底层，耕作层有机质积累明显。由于水分周期性移动产生锈色斑纹，形成氧化还原层。底部由于水分过于饱和铁锰以低价形态存在，形成大量锰的聚集，使得土层底部及母质层形成青灰色的潜育层。',
  'scxnps': '该土种质地较适中，适耕性广，宜耕期长，耕作质量较好。但磷素可能缺乏，作物苗期生长受阻，中期供肥平稳。改良利用上要平衡施肥，增施有机肥，因底部水分较高，加强基本农田建设，注意防涝排水。',
  'zjzxyj': '同意',
  'fsxtg': '人为土',
  'fsxyg': '人为水成土',
  'fsxtl': '水稻土',
  'fsxyl': '潜育水稻土',
  'fsxts': '青灰泥田',
  'fsxtz': '黄青灰泥田',
  'xtfltg': '人为土',
  'xtflyg': '水耕人为土',
  'xtfltl': '潜育水耕人为土',
  'xtflyl': '普通潜育水耕人为土',
  'sfydxscx': '0',
  'dxscxsd': None,
  'tbcd': '[]',
  'tbjhwzw': None,
  'tbjhwzp': None,
  'tkd1jd': 106.38195901,
  'tkd1wd': 26.53899607,
  'tkd2jd': 106.3830406,
  'tkd2wd': 26.54447353,
  'tkd3jd': 106.37681825,
  'tkd3wd':

In [130]:
# 获取剖面信息
# 更新headers
headers = update_cookie(headers)
pm_info_list = []
for one_point in tqdm(pm_point_id):
    one_point_pm_info = get_pm_info(one_point,headers)['result']
    pm_info_list.append(one_point_pm_info)
    # 随机休眠
    time.sleep(random.random())

100%|██████████| 103/103 [02:04<00:00,  1.21s/it]


In [131]:
df_pm_info = pd.DataFrame(pm_info_list)
df_pm_info.head(1)

Unnamed: 0,ctime,utime,id,ydbh,pmzp,yxtchd,gzchd,tthd,ttgx,ttgxqt,...,tkd2wd,tkd3jd,tkd3wd,tkd4jd,tkd4wd,tkd5jd,tkd5wd,tkd6jd,tkd6wd,bz
0,2023-11-28 12:26:46,2024-03-26 20:12:31,1729356131350974466,5201810101100005,,115,15.0,115,,,...,26.544474,106.376818,26.544005,,,,,,,经野外实地踏勘，布设样点及周边无国家下发的漂洗水稻土，因此采样剖面土壤类型选择电子围栏内典型...


In [58]:
# 过滤NONE值
pm_info_list = [x for x in pm_info_list if x is not None]  # 过滤掉 None 值
df_pm_info = pd.DataFrame(pm_info_list)
df_pm_info.head(1)


Unnamed: 0,ctime,utime,id,ydbh,pmzp,yxtchd,gzchd,tthd,ttgx,ttgxqt,...,tkd2wd,tkd3jd,tkd3wd,tkd4jd,tkd4wd,tkd5jd,tkd5wd,tkd6jd,tkd6wd,bz
0,2024-05-29 11:52:48,2024-05-29 11:52:48,1795664565456965633,5201210101100001,,92,32.0,92,,,...,26.947319,106.993796,26.95197,106.996625,26.943678,,,,,


In [132]:
# 保存剖面信息
df_pm_info.to_excel(os.path.join(save_path, f'pm_info_{len(pm_info_list)}.xlsx'), index=False)

In [133]:
# 测试剖面点位发生层信息
pm_fc_info = get_pmfc_info(pm_point_id[0],headers)
pm_fc_info

{'success': True,
 'message': None,
 'code': 200,
 'result': [{'ctime': '2023-11-28 12:26:46',
   'utime': '2024-01-17 21:03:26',
   'id': '1729356131355168770',
   'ydbh': '5201810101100005',
   'xh': 1,
   'fschdsj': 0,
   'fschdxj': 15,
   'fscmc': '耕作层',
   'fscfh': 'Ap1',
   'bjmxd': 'C',
   'bjgdxz': 'S',
   'spywrtbs': '1',
   'ywrtsd': '2.5Y',
   'ywrtmd': 6,
   'ywrtcd': 3,
   'sngtsd': None,
   'sngtmd': None,
   'sngtcd': None,
   'rtbssd': None,
   'rtbsmd': None,
   'rtbscd': None,
   'gxfd': 'M',
   'gxcx': 'F',
   'gxxz': '3,4',
   'zd': '3',
   'jgxz': 'I',
   'jgdx': 'GRFI',
   'jgdxqt': None,
   'jgfycd': 'VS',
   'tnlsfd': '0',
   'tnlsdx': None,
   'tnlsxz': None,
   'tnlsfhcd': None,
   'tnlszl': 0.0,
   'jcx': '3',
   'bwfd': 'N',
   'bwdx': None,
   'bwwz': None,
   'bwzcwz': None,
   'bwzcwzqt': None,
   'jmfd': 'N',
   'jmwz': None,
   'jmzcwz': None,
   'jmzcwzqt': None,
   'jmytrjzdbd': None,
   'kzlzjhfd': 'N',
   'kzlzjhzl': None,
   'kzlzjhdx': None,
   'k

In [134]:
# 获取剖面发生层信息
# 更新headers
headers = update_cookie(headers)
pm_fc_info_list = []
for one_point in tqdm(pm_point_id):
    one_point_pm_fc_info = get_pmfc_info(one_point,headers)['result']
    pm_fc_info_list+=one_point_pm_fc_info
    # 随机休眠
    time.sleep(random.random())

100%|██████████| 103/103 [01:10<00:00,  1.46it/s]


In [135]:
df_pm_fc_info = pd.DataFrame(pm_fc_info_list)
df_pm_fc_info.head(1)

Unnamed: 0,ctime,utime,id,ydbh,xh,fschdsj,fschdxj,fscmc,fscfh,bjmxd,...,trdwyxqk,shfy,ytfy,jhfy,trsjfy,glscytrtj,glsywlstj,glswylszl,rzlstjzb,bz
0,2023-11-28 12:26:46,2024-01-17 21:03:26,1729356131355168770,5201810101100005,1,0,15,耕作层,Ap1,C,...,,N,,,AC,,,,,


In [None]:
# 过滤NONE值
pm_fc_info_list = [x for x in pm_fc_info_list if x is not None]  # 过滤掉 None 值
df_pm_fc_info = pd.DataFrame(pm_info_list)
df_pm_fc_info.head(1)


In [136]:
# 保存剖面发生层信息
df_pm_fc_info.to_excel(os.path.join(save_path, f'pm_fc_info_{len(pm_fc_info_list)}.xlsx'), index=False)

# TODO

In [None]:
# 合并更新后的信息表和原始表
excel_name_list = set([_.split('_')[0] for _ in  os.listdir(save_path) if _.endswith('.xlsx')])

In [None]:

for one_table in tqdm(excel_name_list):
    # 初始化空DataFrame
    result_df = pd.DataFrame()
    temp_df_list = []
    for one_excel in [_ for _ in os.listdir(save_path) if _.endswith('.xlsx')]:
        if one_excel.split('_')[0] == one_table:
            temp_df_list.append(pd.read_excel(os.path.join(save_path, one_excel)))
    result_df = pd.concat(temp_df_list)
    # 转换内容为字符串
    result_df = result_df.astype(str)
    result_df.to_excel(os.path.join(save_path, one_table + '_info.xlsx'),index=False)

In [None]:
# 连接两个表

In [None]:
df_ldtj_info.columns

In [None]:
df_pm_info.columns

In [None]:
df_img_info.columns

In [None]:
df_img_info.rename(columns={'glbh':'ydbh'},inplace=True)

In [None]:
df_img_info.columns

In [None]:
result_df = pd.merge(df_ldtj_info,df_pm_info,on='ydbh',how='left')

In [None]:
result_df_2 = pd.merge(result_df,df_img_info,on='ydbh',how='left')

In [None]:
result_df_2.to_excel(os.path.join(save_path,'result2.xlsx'),index=False)