In [36]:
import pandas as pd
import os
import requests
import json
import re
from bs4 import BeautifulSoup
import tenacity

In [38]:
@tenacity.retry(
    wait=tenacity.wait_random(1, 5),
    stop=tenacity.stop_after_attempt(3),
    # When every attempt has failed, return "" instead of raising RetryError so
    # callers keep the original "empty string on failure" contract.
    retry_error_callback=lambda retry_state: "",
)
def get_html_text(url):
    """Download ``url`` and return the decoded response body.

    The request is attempted up to 3 times with a random 1-5 second pause
    between attempts. Any ``Exception`` raised while fetching (connection
    error, timeout, non-2xx status, ...) triggers the next attempt; the
    exception must propagate out of this function for tenacity to see it,
    which is why there is no try/except here.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page text, or ``""`` if all attempts failed.
    """
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    # Decode with the charset detected from the body rather than the one
    # declared in the HTTP headers.
    r.encoding = r.apparent_encoding
    return r.text

# Smoke test: fetch one product's parameter page and show the length of the returned text.
len(get_html_text(url="https://detail.zol.com.cn/1396/1395230/param.shtml"))

150087

In [None]:
# pd.read_html downloads the page and parses its <table> elements into a list of DataFrames.
table_info = pd.read_html("https://detail.zol.com.cn/1396/1395230/param.shtml")

In [None]:
type(table_info)

In [None]:
type(table_info[0])

In [None]:
table_info[0]

In [None]:
len(table_info)

In [None]:
# orient="list" maps each column label to the list of that column's values.
table_info[0].to_dict(orient="list")

In [None]:
# orient="split" returns separate "index", "columns" and "data" entries.
table_info[0].to_dict(orient="split")

In [None]:
@tenacity.retry(wait=tenacity.wait_random(1, 5), stop=tenacity.stop_after_attempt(3))
def parse_page(url, unique_name):
    """Scrape the spec tables of one product page and save them as JSON.

    Every table found at ``url`` is read as (label, value) pairs taken from
    its first two columns, skipping the first row. The pairs are merged into
    one dict, which is written to ``./data/{uid}_{name}.json`` (``/`` in the
    name is replaced by ``_``) and returned.

    Args:
        url: Address of the product's parameter page.
        unique_name: ``"<uid>_<name>"``; split on the first underscore.

    Returns:
        dict with ``"uid"``, ``"name"`` and one entry per spec label.

    Raises:
        tenacity.RetryError: if 3 attempts in a row fail (for example the
            page cannot be downloaded or ``unique_name`` has no underscore).
    """
    uid, name = unique_name.split("_", maxsplit=1)
    info_dict = {"uid": uid, "name": name}

    for table in pd.read_html(url):
        data = table.to_dict(orient="list")
        # Only tables exposing columns 0 (label) and 1 (value) hold spec rows;
        # anything else is skipped instead of aborting the whole page.
        if 0 not in data or 1 not in data:
            continue
        # Skip the first row of each table.
        for k, v in zip(data[0][1:], data[1][1:]):
            # Empty cells come back as NaN (a float), which has no str methods.
            if pd.isna(k):
                continue
            v = "" if pd.isna(v) else str(v)
            info_dict[str(k).strip()] = v.replace("纠错", "").replace(">", "").strip()

    # "/" would be interpreted as a directory separator in the file name.
    safe_name = name.replace("/", "_")
    os.makedirs("./data", exist_ok=True)
    # Context manager guarantees the file is flushed and closed.
    with open(f"./data/{uid}_{safe_name}.json", "w", encoding="utf-8") as fp:
        json.dump(info_dict, fp, ensure_ascii=False, indent=2)
    return info_dict

In [None]:
# unique_name must have the form "<uid>_<name>": parse_page splits it on the first "_",
# so a bare "apple" fails to unpack into (uid, name).
parse_page("https://detail.zol.com.cn/1396/1395230/param.shtml", "1395230_apple")

In [None]:
# Fetch the first phone list page and parse it with the html5lib parser.
start_url = "https://detail.zol.com.cn/cell_phone_index/subcate57_list_1.html"
html_text = get_html_text(start_url)
body = BeautifulSoup(html_text, "html5lib")

In [None]:
li_list = body.select("#J_PicMode li")
for li in li_list:
    # Not every <li> under #J_PicMode has the expected <a>/<h3> children
    # (indexing [0] on such an entry raises "list index out of range"), so skip those.
    anchors = li.select("a")
    titles = li.select("h3")
    if not anchors or not titles:
        continue

    # href looks like /cell_phone/index1395230.shtml
    a_url = anchors[0].get("href", "")
    match = re.search(r"/cell_phone/index(\d+)\.shtml", a_url)
    if match is None:
        continue
    uid = match.group(1)
    # e.g. https://detail.zol.com.cn/1396/1395230/param.shtml
    param_url = f"https://detail.zol.com.cn/1396/{uid}/param.shtml"

    h3 = titles[0]
    # Drop nested <span> elements so only the product title text remains.
    for span in h3.select("span"):
        span.decompose()
    name = h3.get_text(strip=True)
    unique_name = f"{uid}_{name}"
    parse_page(param_url, unique_name)

In [34]:
print(len(li_list))

50


In [33]:
# Take the href of the first element with class "next"; it is a site-relative path,
# so the host is prepended to build the absolute URL of the next list page.
next_page_url = body.select(".next")[0]["href"]
next_page_url = f"https://detail.zol.com.cn{next_page_url}"
next_page_url

'https://detail.zol.com.cn/cell_phone_index/subcate57_0_list_1_0_1_2_0_2.html'

In [39]:
# Module-level record of failures: run_main appends the param-page URL of every
# product whose parse_page call raised.
fail = {
    "item_url": [],
}

def _parse_list_item(li):
    """Return ``(param_url, unique_name)`` for one product ``<li>`` of a list page.

    Raises a lookup error (IndexError / KeyError / AttributeError) when the
    ``<li>`` lacks the expected anchor, product id or ``<h3>`` title.
    """
    # href looks like /cell_phone/index1395230.shtml
    a_url = li.select("a")[0]["href"]
    uid = re.search(r"/cell_phone/index(\d+)\.shtml", a_url).group(1)
    # e.g. https://detail.zol.com.cn/1396/1395230/param.shtml
    param_url = f"https://detail.zol.com.cn/1396/{uid}/param.shtml"

    h3 = li.select("h3")[0]
    # Drop nested <span> elements so only the product title text remains.
    for span in h3.select("span"):
        span.decompose()
    name = h3.get_text(strip=True)
    return param_url, f"{uid}_{name}"


def run_main(start_url):
    """Crawl the product list starting at ``start_url`` and follow "next" links.

    For every product on each list page, ``parse_page`` is called to scrape
    and save its specs. Products whose scrape fails are recorded in
    ``fail["item_url"]``. Crawling stops when a page cannot be downloaded,
    when no next-page link is found, or when a page URL repeats.

    Args:
        start_url: URL of the first list page to process.

    Returns:
        None.
    """
    page_url = start_url
    visited = set()
    # Iterate instead of recursing once per page: no growing call stack, and
    # each page's parsed tree can be released before the next one is loaded.
    while page_url not in visited:
        visited.add(page_url)
        print("start:", page_url)

        html_text = get_html_text(page_url)
        if not html_text:
            # An empty string means the download failed; report that explicitly
            # rather than letting it look like "no next page".
            print("无法获取列表页", page_url)
            return
        body = BeautifulSoup(html_text, "html5lib")

        for li in body.select("#J_PicMode li"):
            try:
                param_url, unique_name = _parse_list_item(li)
            except Exception as e:
                print("解析当前项有误", e)
                print(li)
                continue

            print(unique_name)
            try:
                parse_page(param_url, unique_name)
            except Exception as e:
                print("无法获取当前产品的信息", unique_name, e)
                fail["item_url"].append(param_url)

        # Move on to the next page.
        try:
            next_href = body.select(".next")[0]["href"]
        except Exception as e:
            print(e)
            print("检查出错原因, 或者是没有下一页了")
            return
        page_url = f"https://detail.zol.com.cn{next_href}"

In [40]:
# Start the crawl from this list page.
# NOTE(review): the output recorded below begins with ".../subcate57_list_1.html",
# not this URL, so it appears to come from an earlier run — re-run to refresh it.
start_url = "https://detail.zol.com.cn/cell_phone_index/subcate57_0_list_1_0_1_2_0_28.html"
run_main(start_url)

start: https://detail.zol.com.cn/cell_phone_index/subcate57_list_1.html
1395230_OPPO Find N（8GB/256GB/5G版）
1392811_iQOO 9 Pro（12GB/256GB/5G版）
1330794_华为P50 Pocket（8GB/256GB）
1364781_三星Galaxy S22（8GB/128GB/5G版）
1342492_苹果iPhone 13 Pro Max（128GB/全网通/5G版）
1383992_一加10 Pro（8GB/256GB/5G版）
1368418_荣耀Magic3（8GB/128GB/全网通/5G版）
1366979_Redmi K50
1337287_华为Mate40 Pro（8GB/256GB/全网通/5G版/玻璃版）
1357542_魅族18 Pro（8GB/256GB/全网通/5G版）
1340278_Redmi K40（12GB/256GB/全网通/5G版）
1395715_Redmi K50 电竞版（12GB/128GB/5G版）
1342489_苹果iPhone 13（128GB/全网通/5G版）
1395020_荣耀60 SE（8GB/128GB/5G版）
1349131_华为P50 Pro（8GB/256GB/全网通/麒麟9000）
1394397_Moto Edge X30（8GB/128GB/全网通/5G版）
1342491_苹果iPhone 13 Pro（128GB/全网通/5G版）
1366619_三星Galaxy S22 Ultra（12GB/256GB/5G版）
1396078_荣耀Magic V（12GB/256GB/5G版）
1394387_荣耀60（8GB/128GB/全网通/5G版）
1331417_苹果iPhone 12（4GB/64GB/全网通/5G版）
1341561_苹果iPhone SE 3
1394391_荣耀60 Pro（8GB/256GB/全网通/5G版）
1371912_小米12 Pro（8GB/256GB/5G版）
1383833_OPPO Reno7（8GB/128GB/全网通/5G版）
1395333_荣耀X30（8GB/128GB/5G版）
解析当前项有误 list in