## 1.1 Collect Products 

We begin by collecting product category information from the DigiKey website (https://www.digikey.com/). This information will later be matched against relevant articles to support the analysis.

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import time
import json

#### Target site URL

In [2]:
# Root URL of the DigiKey site: sent as the Referer header and prepended to
# the relative category URLs found in the page source.
BASE_URL = "https://www.digikey.com"

#### Request headers for fetching the site

In [3]:
# Browser-like HTTP headers sent with the products-page request.
# Presumably needed because the site rejects clients that use default
# library headers (the "barriers") — TODO confirm.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    # NOTE(review): this advertises Brotli ("br"); Requests decodes Brotli
    # responses only when a Brotli library is installed — confirm that it is,
    # otherwise the response text may be unreadable.
    "Accept-Encoding": "gzip, deflate, br",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Connection": "keep-alive",
    # The site root is reported as the referring page.
    "Referer": BASE_URL,
}

#### Assign a main category to each product category

In [4]:
def determine_main_category(name):
    """Map a product category name to one of the listed main categories.

    The lower-cased name is checked for keywords by plain substring
    containment (so 'wire' also matches e.g. 'wireless'). Rules are tried in
    order and the first rule with a matching keyword wins; a name that
    matches no rule is classified as 'Other'.

    Args:
        name: Product category name (string).

    Returns:
        The main category label as a string.
    """
    # Ordered (keywords, main category) rules; order matters because the
    # first hit is returned.
    keyword_rules = (
        (('anti-static', 'esd', 'clean room'), 'Anti-Static, ESD, Clean Room Products'),
        (('audio', 'microphone', 'speaker', 'amplifier'), 'Audio Products'),
        (('battery', 'batteries'), 'Battery Products'),
        (('cable', 'wire', 'connector'), 'Cables & Connectors'),
        (('capacitor',), 'Capacitors'),
    )

    lowered = name.lower()
    for keywords, main_category in keyword_rules:
        for keyword in keywords:
            if keyword in lowered:
                return main_category
    return 'Other'

#### Parse the JSON embedded in the page source for product information

In [5]:
def parse_json_data(html_content):
    """Extract product category records from the page source.

    Scans ``html_content`` for label / productCount / url entries that
    appear directly next to each other in that order. An entry is kept only
    when its URL starts with '/en/products/' and its product count (commas
    ignored) is a positive integer.

    Args:
        html_content: Page source as a string.

    Returns:
        A list of dicts with the keys 'name', 'url' (prefixed with
        BASE_URL), 'category' (from determine_main_category) and
        'Products' (int), in order of appearance. Duplicates are not removed.
    """
    entry_pattern = r'"label":"([^"]+)","productCount":"([^"]+)","url":"([^"]+)"'
    categories = []

    for label, count_str, url in re.findall(entry_pattern, html_content):
        # Only relative links into the product catalogue are of interest.
        if not url.startswith('/en/products/'):
            continue

        # Counts may carry thousands separators, e.g. "1,234".
        digits = count_str.replace(',', '')
        if not digits.isdigit():
            continue

        count = int(digits)
        if count <= 0:
            continue

        categories.append({
            'name': label,
            'url': BASE_URL + url,
            'category': determine_main_category(label),
            'Products': count
        })

    return categories

#### Scrape product categories

In [6]:
def scrape_main_categories(timeout=30):
    """Fetch the DigiKey products page and return its unique categories.

    Requests BASE_URL + "/en/products" with HEADERS, parses the response
    text with parse_json_data and drops entries whose URL has already been
    seen (the first occurrence wins and the original order is kept).

    Args:
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection cannot block the notebook indefinitely.

    Returns:
        A list of category dicts as produced by parse_json_data, or an
        empty list if anything fails. Failures are printed rather than
        raised, so the function stays best-effort.
    """
    try:
        response = requests.get(BASE_URL + "/en/products", headers=HEADERS, timeout=timeout)
        response.raise_for_status()
        categories = parse_json_data(response.text)

        # Remove duplicates by URL: dicts keep insertion order and
        # setdefault never replaces an existing key, so the first entry
        # seen for each URL is the one retained.
        unique_by_url = {}
        for cat in categories:
            unique_by_url.setdefault(cat['url'], cat)
        return list(unique_by_url.values())

    except Exception as exc:
        # Report the failure instead of swallowing it silently; callers
        # still receive the empty list they got before.
        print("Failed to scrape DigiKey categories: {}: {}".format(type(exc).__name__, exc))
        return []

### Run and Save Product List

The saved file contains the raw product information extracted from the DigiKey website. It is the input for the later steps that clean and structure the product list.

In [7]:
# Run method
categories = scrape_main_categories()

# scrape_main_categories() returns [] when scraping fails; stop here rather
# than overwrite a previously saved raw file with an empty list.
if not categories:
    raise RuntimeError("No product categories were scraped; Products_List_Raw.json was not written.")

# Save output
with open('./intermediate_data/Products_List_Raw.json', 'w', encoding='utf-8') as f:
    json.dump(categories, f, indent=2)
