In [11]:
import pandas as pd
import os
import requests
import json
import re
import math
from bs4 import BeautifulSoup
import tenacity

In [2]:
def _give_up_with_empty_text(retry_state):
    """Fallback for get_html_text once every attempt has failed.

    Reports the call arguments and the last exception, then returns "" so
    callers keep seeing the empty-string failure value.
    """
    print("fail request:", retry_state.args or retry_state.kwargs, retry_state.outcome.exception())
    return ""


@tenacity.retry(
    wait=tenacity.wait_random(1, 5),
    stop=tenacity.stop_after_attempt(3),
    retry_error_callback=_give_up_with_empty_text,
)
def get_html_text(url):
    """Download ``url`` and return the response body as text.

    Any exception raised while requesting (connection error, timeout, HTTP
    error status) is left to propagate to the retry decorator, which tries up
    to 3 times with a random 1-5 second pause between attempts. The previous
    bare ``except`` swallowed every error, so the decorator never retried.

    Args:
        url: Address of the page or API endpoint to fetch.

    Returns:
        The decoded response text, or "" when all attempts failed.
    """
    r = requests.get(url, timeout=30)
    # Turn 4xx/5xx responses into exceptions so they are retried as well
    r.raise_for_status()
    # Decode with the encoding detected from the body rather than the header value
    r.encoding = r.apparent_encoding
    return r.text

# Smoke test: fetch one brand page and show how many characters came back.
probe_url = "https://m.samrugs.com/pinpai/pp679.html"
len(get_html_text(url=probe_url))

54619

In [5]:
# Download and parse the page that lists all categories.
html = get_html_text("https://m.samrugs.com/pinpai/daquan/")
body = BeautifulSoup(html, "html5lib")

# Every anchor under #classifyPop has a scheme-less href, so "https:" is prepended.
urls = ["https:" + a["href"] for a in body.select("#classifyPop a")]
print(urls)

['https://m.samrugs.com/pinpai/baihuodq/', 'https://m.samrugs.com/pinpai/leqiyinxiangdq/', 'https://m.samrugs.com/pinpai/yulexuexidq/', 'https://m.samrugs.com/pinpai/lipindq/', 'https://m.samrugs.com/pinpai/wujindq/', 'https://m.samrugs.com/pinpai/chujudq/', 'https://m.samrugs.com/pinpai/diydq/', 'https://m.samrugs.com/pinpai/jiajuriyongdq/', 'https://m.samrugs.com/pinpai/bangongdq/', 'https://m.samrugs.com/pinpai/weishengchuwudq/', 'https://m.samrugs.com/pinpai/chuweiweiyudq/', 'https://m.samrugs.com/pinpai/jiatingbaojiandq/', 'https://m.samrugs.com/pinpai/jiafangdq/', 'https://m.samrugs.com/pinpai/canjudq/', 'https://m.samrugs.com/pinpai/buyiruanshidq/', 'https://m.samrugs.com/pinpai/jiajudq/', 'https://m.samrugs.com/pinpai/dengjuzhaomingdq/', 'https://m.samrugs.com/pinpai/lingshidq/', 'https://m.samrugs.com/pinpai/jiancaidq/', 'https://m.samrugs.com/pinpai/shengxiandq/', 'https://m.samrugs.com/pinpai/zhuangxiudq/', 'https://m.samrugs.com/pinpai/meishidq/', 'https://m.samrugs.com/pin

In [7]:
# Save the starting API link for every category.
# category_map maps each category page URL to its brand count, category name
# and the first-page URL of the paginated listing API.
category_map = dict()
for url in urls:
    try:
        html = get_html_text(url)
        # The page source embeds the category id, e.g. "classid: 464,"
        classid = re.search(r"classid: (\d+)", html).group(1)
        start_url = f"https://api.phb123.com/e/extend/api/index.php?cate={classid}&page=1&m=ppdq&c=pplist&type=2"
        html = get_html_text(start_url)
        data = json.loads(html)
        brand_count = data["data"]["brand_count"]
        cate = data["data"]["cate"]
        category_map[url] = {"brand_count": brand_count, "cate": cate, "start_url": start_url}
    except Exception as e:
        # Best effort: report the category that failed and carry on with the rest
        print("fail:", url, e)

# Use a context manager so the file is flushed and closed deterministically
with open("category_map.json", "w", encoding="utf-8") as f:
    json.dump(category_map, f, ensure_ascii=False, indent=2)

In [10]:
# Total number of brands over all categories.
# 169182 -- this is no small project
sum(int(entry["brand_count"]) for entry in category_map.values())

169182

In [None]:
# Download and parse a single category page.
html = get_html_text("https://m.samrugs.com/pinpai/zhubaodq/")
body = BeautifulSoup(html, "html5lib")

# Collect the target of every "a.btn" anchor, prefixing "https:" to each href.
brand_urls = ["https:" + a["href"] for a in body.select("a.btn")]


In [None]:
# Request for paginated listing data
# https://api.phb123.com/e/extend/api/index.php?cate=464&page=1&m=ppdq&c=pplist&type=2
# Request for a single page's data
# https://m.samrugs.com/pinpai/pp11753.html

In [15]:
@tenacity.retry(stop=tenacity.stop_after_attempt(3), wait=tenacity.wait_random(1, 5))
def parse_page_url(page_url):
    """Fetch one page of the listing API and build the brand page URLs it names.

    Args:
        page_url: URL of a listing API page that returns JSON.

    Returns:
        A list with one "https://m.samrugs.com/pinpai/pp<id>.html" URL per
        entry found under ``["data"]["data"]`` in the response.

    Any exception (for example a JSON decode error on an empty response) makes
    the decorator retry, up to 3 attempts with a random 1-5 second wait.
    """
    payload = json.loads(get_html_text(page_url))
    return [
        "https://m.samrugs.com/pinpai/pp{}.html".format(entry["id"])
        for entry in payload["data"]["data"]
    ]


@tenacity.retry(wait=tenacity.wait_random(1, 5), stop=tenacity.stop_after_attempt(3))
def parse_brand_url(brand_url):
    """Scrape one brand page into a flat dict.

    Args:
        brand_url: URL of the brand page to fetch and parse.

    Returns:
        A dict with "url" (the input URL), "name" (text of the first
        ".base-info-tit" element), "industry" (list of link texts under
        ".brand-hy") and one extra key per ".base-row" element, taken from
        the text before and after its first full-width colon.

    Raises:
        tenacity.RetryError: when all 3 attempts fail, e.g. with IndexError
            because the page has no ".base-info-tit" element.
    """
    html = get_html_text(brand_url)
    body = BeautifulSoup(html, "html5lib")

    data = {"industry": [], "url": brand_url}

    # Indexing [0] raises on a page without a title, which triggers a retry
    data["name"] = body.select(".base-info-tit")[0].get_text(strip=True)

    for item in body.select(".base-row"):
        # Split on the first full-width colon only, so a value that contains
        # another colon is kept whole instead of raising ValueError
        key, sep, val = item.get_text(strip=True).partition("：")
        if not sep:
            # A row without a separator has no key/value pair to record
            continue
        data[key] = val

    for a in body.select(".brand-hy a"):
        data["industry"].append(a.get_text(strip=True))
    return data

# Spot-check the parser on one known brand page and display the resulting dict.
sample_brand_url = "https://m.samrugs.com/pinpai/pp11753.html"
parse_brand_url(sample_brand_url)

{'industry': ['布艺软饰',
  '美食',
  '户外',
  '家庭保健',
  '百货',
  '家居日用',
  '手表',
  '珠宝',
  '饰品',
  '女装',
  '办公',
  '厨具',
  '美发',
  '娱乐/学习'],
 'url': 'https://m.samrugs.com/pinpai/pp11753.html',
 'name': '施华洛世奇/Swarovski',
 '品牌源地': '欧洲',
 '创立时间': '1895'}

In [16]:
# Crawl every brand of a single category and dump the records to JSON.
item = {
    "brand_count": "479",
    "cate": "乐器音像",
    "start_url": "https://api.phb123.com/e/extend/api/index.php?cate=525&page=1&m=ppdq&c=pplist&type=2"
}
# Number of brands per listing API page (the sample output shows 10 URLs per page)
PAGE_SIZE = 10
pages = math.ceil(int(item["brand_count"]) / PAGE_SIZE)

result = []
for page_no in range(1, pages+1):
    # Derive each page URL from the first-page URL by swapping the page number
    page_url = item["start_url"].replace("page=1", "page={}".format(page_no))
    print(page_no, page_url)
    try:
        brand_urls = parse_page_url(page_url)
    except Exception as e:
        # Best effort: a failed listing page is reported and skipped
        print("fail page_url:", page_url, e)
        continue
    for brand_url in brand_urls:
        print(brand_url)
        try:
            result.append(parse_brand_url(brand_url))
        except Exception as e:
            # Best effort: a failed brand page is reported and skipped
            print("fail brand_url:", brand_url, e)
            continue

# Use a context manager so the file is flushed and closed deterministically
with open("乐器音像.json", "w", encoding="utf-8") as f:
    json.dump(result, f, ensure_ascii=False, indent=2)

1 https://api.phb123.com/e/extend/api/index.php?cate=525&page=1&m=ppdq&c=pplist&type=2
https://m.samrugs.com/pinpai/pp8530.html
https://m.samrugs.com/pinpai/pp19668.html
https://m.samrugs.com/pinpai/pp28685.html
https://m.samrugs.com/pinpai/pp169137.html
https://m.samrugs.com/pinpai/pp28333.html
https://m.samrugs.com/pinpai/pp48238.html
https://m.samrugs.com/pinpai/pp28655.html
https://m.samrugs.com/pinpai/pp28633.html
https://m.samrugs.com/pinpai/pp28406.html
https://m.samrugs.com/pinpai/pp168558.html
2 https://api.phb123.com/e/extend/api/index.php?cate=525&page=2&m=ppdq&c=pplist&type=2
https://m.samrugs.com/pinpai/pp135672.html
https://m.samrugs.com/pinpai/pp19676.html
https://m.samrugs.com/pinpai/pp28376.html
https://m.samrugs.com/pinpai/pp172344.html
https://m.samrugs.com/pinpai/pp168917.html
https://m.samrugs.com/pinpai/pp19703.html
https://m.samrugs.com/pinpai/pp69234.html
https://m.samrugs.com/pinpai/pp28431.html
https://m.samrugs.com/pinpai/pp122209.html
https://m.samrugs.com/p