In [1]:
# Imports
import requests
import time
import html
import json

from typing import Optional

# NOTE: this notebook has not been run end-to-end yet; treat the code below as untested.

In [None]:
# Constants
# Upper bound used by main() when slicing each category's list of
# subcategory URLs; main() treats None as "no limit". Despite the name, the
# visible code applies it to the number of subcategories, not to result pages.
PAGES_PER_SUBCATEGORY = 1
# Free-text location that main() interpolates into the nearby-stores URL.
LOCATION = "Salt Lake City"
# Top-level category names to scrape; main() looks them up in the dict
# returned by get_categories(), whose keys are lower-cased category names.
CATEGORIES = ["grocery"]

In [5]:
def _collect_subcategory_urls(category: dict) -> list:
    """Collect canonical URLs of the subcategories nested under ``category``.

    Children of type ``"Tag"`` are ignored. A child that has children of its
    own contributes its (non-Tag) grandchildren; a child without children
    contributes itself. Nodes that carry no ``seo_data`` / ``canonical_url``
    are skipped instead of raising.
    """
    urls = []
    for child in category.get("children") or []:
        if child.get("type") == "Tag":
            continue

        grandchildren = child.get("children")
        # Descend one level when possible, otherwise treat the child as a leaf.
        candidates = grandchildren if grandchildren else [child]

        for node in candidates:
            if node.get("type") == "Tag":
                continue
            url = (node.get("seo_data") or {}).get("canonical_url")
            if url is not None:
                urls.append(url)
    return urls


# Get all categories
def get_categories() -> dict:
    """Fetch the category taxonomy and index it by lower-cased category name.

    Returns:
        A dict mapping each category name (lower-cased) to
        ``{"url": <canonical url>, "categories": [<subcategory urls>]}``.

    Raises:
        KeyError: if the response does not have the expected
            ``slots -> 100 -> content -> taxonomy_nodes`` layout, or a
            top-level category lacks ``name`` / ``seo_data.canonical_url``.
    """
    response = requests.get(
        "https://redoak.target.com/content-publish/pages/v1?url=/c/-/-/N-4nav",
        # Same timeout get_product() uses, so a stalled connection cannot hang the run.
        timeout=10,
    )
    taxonomy_nodes = response.json()["slots"]["100"]["content"]["taxonomy_nodes"]

    categories_object = {}
    # The first three and the last three nodes are dropped; the original
    # author marked them as "wrong categories".
    for category in taxonomy_nodes[3:-3]:
        category_name = category["name"].lower()
        categories_object[category_name] = {
            "url": category["seo_data"]["canonical_url"],
            "categories": _collect_subcategory_urls(category),
        }

    return categories_object

In [6]:
# Get product attributes
def get_product_attributes(soup: "BeautifulSoup") -> dict:
    """Extract the specification table of a product page into a dict.

    The annotation is a string on purpose: ``BeautifulSoup`` is not imported
    in this notebook, and an unquoted annotation is evaluated when the
    function is defined, which made this cell fail with ``NameError`` on a
    fresh kernel.

    Args:
        soup: parsed product page; only ``find`` / ``find_all`` / ``.text``
            are used.

    Returns:
        A dict mapping the lower-cased parameter title (last character
        stripped, presumably a trailing colon) to the parameter's own text
        (first character stripped, presumably a leading space). Empty when
        the ``specAndDescript`` container or its expected children are
        missing.
    """
    product_attributes = {}

    # Container holding both the specification and the description.
    spec_and_desc_parent = soup.find("div", attrs={"id": "specAndDescript"})
    if spec_and_desc_parent is None:
        return product_attributes

    rows = spec_and_desc_parent.find_all("div", recursive=False)
    if not rows:
        return product_attributes

    columns = rows[0].find_all("div", recursive=False)
    if not columns:
        return product_attributes

    # The first column is the specification; its last two child divs are
    # notices rather than parameters, so they are dropped.
    product_parameters = columns[0].find_all("div", recursive=False)[:-2]

    for parameter in product_parameters:
        title = parameter.find("b")
        # Look up the same direct-child div that is read below; the original
        # checked a recursive match and then dereferenced a direct-child one.
        desc = parameter.find("div", recursive=False)
        if title is None or desc is None:
            continue

        # Only the div's own text node, not text of nested tags.
        raw_value = desc.find(text=True, recursive=False)
        if raw_value is None:
            continue

        key = title.text[:-1].lower()
        product_attributes[key] = raw_value[1:]

    return product_attributes

In [None]:
def get_product(
    product_url: str,
    visitor_id: str,
    store_id: int,
    api_key: str = "ff457966e64d5e877fdbad070f276d18ecec4a01",
) -> Optional[dict]:
    """Fetch the product record for ``product_url`` from the PDP endpoint.

    Args:
        product_url: product page URL; the product id (``tcin``) is the text
            after the last ``-`` of the URL with its query string removed.
        visitor_id: sent as the ``visitor_id`` query parameter.
        store_id: sent as the ``pricing_store_id`` query parameter.
        api_key: value of the ``key`` query parameter. Defaults to the key
            that was previously hard-coded, so it can be swapped without
            editing this function.

    Returns:
        The ``data.product`` object of the JSON response, or ``None`` when
        the response has no such object (including ``"data": null``).
    """
    product_id = product_url.split("?")[0].split("-")[-1]
    api_endpoint = "https://redsky.target.com/redsky_aggregations/v1/web/pdp_client_v1"
    params = {
        "key": api_key,
        "tcin": product_id,
        "store_id": "none",
        "has_store_id": "false",
        "pricing_store_id": store_id,
        "has_pricing_store_id": "true",
        "scheduled_delivery_store_id": "none",
        "has_scheduled_delivery_store_id": "false",
        "has_financing_options": "true",
        "visitor_id": visitor_id,
        "has_size_context": "true",
    }

    headers = {
        "accept": "application/json",
        "accept-language": "en-US,en;q=0.9,pl;q=0.8",
        "cache-control": "no-cache",
        "pragma": "no-cache",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-site",
        "sec-gpc": "1",
    }

    product_json = requests.get(
        api_endpoint, params=params, headers=headers, timeout=10
    ).json()

    if not isinstance(product_json, dict):
        return None

    # `.get("data", {})` would still hand back None for `"data": null`, so
    # normalise a missing *or* null payload to an empty dict before drilling in.
    data = product_json.get("data") or {}
    return data.get("product")

In [None]:
def _fetch_listing_page(params: dict) -> tuple:
    """Request one listing page; return ``(buy_urls, typed_metadata)``."""
    search = requests.get(
        "https://redsky.target.com/redsky_aggregations/v1/web/plp_search_v1",
        params=params,
        timeout=10,
    ).json()["data"]["search"]

    buy_urls = [
        product["item"]["enrichment"]["buy_url"] for product in search["products"]
    ]
    return buy_urls, search["search_response"]["typed_metadata"]


# Get urls for all products for category
def iter_pages(
    category_url: str,
    store_id: str,
    visitor_id: str,
    api_key: str = "ff457966e64d5e877fdbad070f276d18ecec4a01",
) -> list:
    """Collect the ``buy_url`` of every product listed in a category.

    Pages are requested one after another, using the number of URLs gathered
    so far as the next ``offset``, until the reported ``current_page``
    reaches ``total_pages``. Paging also stops when a page comes back empty,
    because the offset could not advance and the loop would never end.

    Args:
        category_url: category URL; the category id is the text after its
            last ``-``.
        store_id: sent as ``pricing_store_id``.
        visitor_id: sent as ``visitor_id``.
        api_key: value of the ``key`` query parameter. Defaults to the key
            that was previously hard-coded; it may need updating over time.

    Returns:
        List of product URLs in the order they were returned.

    Raises:
        KeyError: if a response lacks the expected ``data.search`` layout.
    """
    category_id = category_url.split("-")[-1]

    params = {
        "key": api_key,
        "category": category_id,
        "channel": "WEB",
        "count": 24,
        "default_purchasability_filter": "false",
        "include_sponsored": "true",
        # Plain (unencoded) values: requests percent-encodes `params` itself,
        # so the previously pre-encoded strings were sent double-encoded
        # (e.g. "%2F" went out as "%252F").
        "page": f"/c/{category_id}",
        "platform": "desktop",
        "offset": 0,
        "pricing_store_id": store_id,
        "useragent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36"
        ),
        "visitor_id": visitor_id,
    }

    products_urls, metadata = _fetch_listing_page(params)
    current_page = int(metadata["current_page"])
    last_page = metadata["total_pages"]
    print(f"Page {current_page}/{last_page}")

    while current_page < last_page:
        params["offset"] = len(products_urls)
        page_urls, metadata = _fetch_listing_page(params)

        if not page_urls:
            # Nothing new means the offset cannot move forward any more.
            print(f"No products returned at offset {params['offset']}, stopping")
            break

        products_urls.extend(page_urls)
        current_page = int(metadata["current_page"])
        print(f"Page {current_page}/{last_page}")
        # Be polite to the API between consecutive page requests.
        time.sleep(1)

    print(f"Found {len(products_urls)} products in {category_url}")
    return products_urls

In [None]:
def main():
    """Scrape every product of the configured categories.

    Steps:
      1. Visit the home page to obtain the ``visitorId`` cookie.
      2. Load the category tree via ``get_categories()``.
      3. Look up the nearest store for ``LOCATION`` and take its id.
      4. For each name in ``CATEGORIES``, walk up to
         ``PAGES_PER_SUBCATEGORY`` of its subcategories (all of them when
         the constant is ``None``), list their products with
         ``iter_pages()`` and fetch each one with ``get_product()``.

    Returns:
        List of the product objects that were fetched successfully.

    Raises:
        KeyError: if the home page does not set a ``visitorId`` cookie.
    """
    # Send request to generate cookies; the session is closed once the
    # visitor id has been read.
    with requests.session() as session:
        session.get("https://www.target.com", timeout=10)
        visitor_id = session.cookies["visitorId"]

    # Get categories
    categories = get_categories()

    # Get nearby stores
    nearby_stores = requests.get(
        f"https://redsky.target.com/v3/stores/nearby/{LOCATION}?"
        f"key={visitor_id}&limit=1&within=100&unit=mile",
        timeout=10,
    ).json()

    # Get store id
    store_id = nearby_stores[0]["locations"][0]["location_id"]

    all_objects = []
    for top_level_category in CATEGORIES:
        category = categories.get(top_level_category)
        if category is None:
            print(f"Category {top_level_category!r} not found, skipping")
            continue

        # Slicing with None keeps the whole list, so no separate branch is needed.
        subcategories = category["categories"][:PAGES_PER_SUBCATEGORY]

        for subcategory in subcategories:
            print(f"Scraping subcategory: {subcategory}")
            products = iter_pages(subcategory, store_id, visitor_id)
            for index, product_url in enumerate(products):
                time.sleep(1)
                product = get_product(product_url, visitor_id, store_id)

                if product is not None:
                    # Previously appended the undefined name `product_details`,
                    # which raised NameError on the first successful fetch.
                    all_objects.append(product)
                    print(
                        f"Product {product_url} [{index + 1}/{len(products)}]"
                    )
                else:
                    print(
                        f"Failed to fetch product {product_url} [{index + 1}/{len(products)}]"
                    )

    return all_objects
