In [1]:
import asyncio
import json
import math
from typing import Dict, List
import httpx

from nested_lookup import nested_lookup
from parsel import Selector

# Shared HTTPX async client with headers that resemble a web browser.
# It is created once when this cell runs and is used by scrape_search() for
# every request (first page and the concurrent follow-up pages).
# NOTE(review): nothing in the visible cells closes this client — consider
# `await client.aclose()` once scraping is finished.
client = httpx.AsyncClient(
    http2=True,
    follow_redirects=True,
    limits=httpx.Limits(max_connections=3),  # keep this low to avoid being blocked
    headers={
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9",
        "Cache-Control": "no-cache",
    },
)

# From previous chapter:
def parse_nextjs(html: str) -> dict:
    """Extract the Next.js data cache embedded in a page's HTML.

    The payload is looked up in ``<script id="__NEXT_DATA__">`` first. When that
    element is absent or empty, ``<script data-name="query">`` is used instead;
    its body is treated as a JavaScript assignment (``name = {...};``), so
    everything up to the first ``=`` and any trailing ``;`` is stripped before
    decoding.

    Args:
        html: raw HTML document text.

    Returns:
        The decoded JSON payload.

    Raises:
        ValueError: if neither script element carries any text (typical for a
            block/captcha page), or if the text is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    selector = Selector(html)
    data = selector.css("script#__NEXT_DATA__::text").get()
    if not data:
        data = selector.css("script[data-name=query]::text").get()
        if not data:
            # Previously this fell through to `None.split(...)` and crashed with an
            # unhelpful AttributeError; fail with an explicit, catchable error instead.
            raise ValueError(
                "could not find Next.js data in page: neither script#__NEXT_DATA__ "
                "nor script[data-name=query] is present"
            )
        # keep only the right-hand side of the `name = {...};` assignment
        data = data.split("=", 1)[-1].strip().strip(";")
    return json.loads(data)


async def scrape_search(url: str, max_pages: int = 25) -> List[Dict]:
    """Scrape StockX search results across paginated result pages.

    The first page is fetched on its own to learn how many pages exist. The
    remaining pages (up to ``max_pages`` in total) are then requested
    concurrently through the shared module-level ``client``.

    Args:
        url: search URL of the first results page.
        max_pages: upper bound on the total number of pages scraped,
            first page included.

    Returns:
        The ``node`` of every result edge found. First-page items come first;
        items of later pages are appended in the order their responses
        complete, which is not necessarily page order.

    Raises:
        AssertionError: if any page responds with a non-200 status code.
    """
    print(f"scraping first search page: {url}")
    first_page = await client.get(url)
    assert first_page.status_code == 200, "scrape was blocked"  # this should be retried, handled etc.

    # parse first page for product search data and total amount of pages:
    data = parse_nextjs(first_page.text)
    _first_page_results = nested_lookup("results", data)[0]
    _paging_info = _first_page_results["pageInfo"]
    # note: pageCount can be missing (hence .get, not []) but we can calculate it ourselves
    total_pages = _paging_info.get("pageCount") or math.ceil(_paging_info["total"] / _paging_info["limit"])
    total_pages = min(total_pages, max_pages)

    product_previews = [edge["node"] for edge in _first_page_results["edges"]]

    # then scrape other pages concurrently:
    print(f"  scraping remaining {max(total_pages - 1, 0)} search pages")
    _other_pages = [  # create GET task for each page url
        # copy_set_param adds/replaces the `page` query parameter correctly whether or
        # not the URL already has a query string (plain "&page=N" concatenation did not)
        asyncio.create_task(client.get(first_page.url.copy_set_param("page", str(page))))
        for page in range(2, total_pages + 1)
    ]
    try:
        for next_done in asyncio.as_completed(_other_pages):  # run all tasks concurrently
            response = await next_done
            # same check as for the first page, so a blocked page is reported as such
            # instead of surfacing later as a parsing error
            assert response.status_code == 200, "scrape was blocked"
            data = parse_nextjs(response.text)
            _page_results = nested_lookup("results", data)[0]
            product_previews.extend(edge["node"] for edge in _page_results["edges"])
    finally:
        # if a page failed, do not leave the other requests running in the background;
        # cancel() is a no-op for tasks that have already finished
        for task in _other_pages:
            task.cancel()
    return product_previews



In [2]:
import pandas as pd
import json2csv

# NOTE(review): `fields` is not referenced by any of the visible cells; presumably
# it was meant to select which columns end up in the CSV — confirm or remove.
fields = ['title','brand','description']


async def main():
    """Scrape two pages of "nike dunk low" search results, print them and save them as CSV."""
    search_url = "https://stockx.com/search?s=nike+dunk+low&sort=deadstock_sold"
    products = await scrape_search(search_url, max_pages=2)
    print(json.dumps(products, indent=2))

    # normalize the product records into a table and write it without the index column
    products_table = pd.json_normalize(products)
    products_table.to_csv('stockx_test_50_most_sold_dunks.csv', index=False)
    
    

# Run the scrape when this cell is executed.
# NOTE(review): a bare top-level `await` is a syntax error in a plain Python script;
# this presumably relies on the notebook kernel's top-level await support. If this
# code is moved to a .py file, use `asyncio.run(main())` instead.
if __name__ == "__main__":
    await main()

scraping first search page: https://stockx.com/search?s=nike+dunk+low&sort=deadstock_sold
  scraping remaining 1 search pages
[
  {
    "__typename": "Product",
    "id": "796882e1-685f-4bbb-8720-075b76db84dd",
    "name": "Photon Dust (Women's)",
    "urlKey": "nike-dunk-low-photon-dust-w",
    "title": "Nike Dunk Low Photon Dust (Women's)",
    "brand": "Nike",
    "description": "The women's Nike Dunk Low Photon Dust (W) is made of white leather with Photon Dust leather overlays and Swooshes. A woven Nike label on the tongue and embroidered Nike branding completes the design.\n\nThe women's Nike Dunk Low Photon Dust (W) released in May of 2021 and retailed for $100.",
    "model": "Nike Dunk Low",
    "condition": "New",
    "productCategory": "sneakers",
    "browseVerticals": [
      "sneakers"
    ],
    "listingType": "STANDARD",
    "favorite": null,
    "media": {
      "thumbUrl": "https://images.stockx.com/images/Nike-Dunk-Low-Photon-Dust-W-Product.jpg?fit=fill&bg=FFFFFF&w=1