In [None]:
import requests
import xml.etree.ElementTree as ET
import csv
import time
import random
import csv
import json

# URL of the XML sitemap
sitemap_url = 'https://www.trade-point.co.uk/static/sitemap.xml'

# Download the XML file. Without a timeout requests waits indefinitely on a
# stalled connection, which would hang the whole cell.
response = requests.get(sitemap_url, timeout=30)
response.raise_for_status()  # Abort on 4xx/5xx instead of parsing an error page

# Parse the XML content
tree = ET.ElementTree(ET.fromstring(response.content))
root = tree.getroot()

# Extract all <loc> URLs and clean them.
# Sitemap elements live in the sitemaps.org namespace, so it must be mapped
# explicitly for findall() to match them.
namespace = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
urls = [
    loc.text.strip().replace('"', '')
    for loc in root.findall('.//ns:loc', namespace)
    # An empty <loc/> has text None; skip it rather than crash on .strip()
    if loc.text and loc.text.strip()
]

# Save cleaned URLs to a CSV file (one URL per row under a 'URL' header).
# The encoding is pinned to UTF-8 because the file is read back as UTF-8.
csv_file = 'sitemap_urls.csv'
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['URL'])  # Write header row
    writer.writerows([url] for url in urls)

print(f"Saved {len(urls)} cleaned URLs to {csv_file}")


In [None]:
# Import necessary modules
from selenium import webdriver 
from browsermobproxy import Server
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from chromedriver_py import binary_path 
# Import statements as before

# Block until an element matching (by, selector) is present in the DOM
def wait_for_element_presence(driver, by, selector, timeout=10):
    """Wait for an element to be present and return it.

    Args:
        driver: The WebDriver instance to poll.
        by: Locator strategy (e.g. ``By.CSS_SELECTOR``).
        selector: Locator value interpreted according to ``by``.
        timeout: Maximum number of seconds to wait (default 10).

    Returns:
        Whatever ``WebDriverWait.until`` returns for the presence condition,
        i.e. the located element.

    Raises:
        selenium.common.exceptions.TimeoutException: if the condition is not
            met before ``timeout`` elapses (raised by ``WebDriverWait.until``).
    """
    waiter = WebDriverWait(driver, timeout)
    locator = (by, selector)
    element_is_present = EC.presence_of_element_located(locator)
    return waiter.until(element_is_present)

# Main Function
# Drives a headless Chrome through a BrowserMob proxy, visits every URL listed
# in sitemap_urls.csv and stores the captured HAR logs in network_logs_all.har.
if __name__ == "__main__":
    import os  # local import: only this block needs it

    # Directory containing the browsermob-proxy launcher. Set the
    # BROWSERMOB_PROXY_BIN environment variable to override the default
    # instead of editing the notebook.
    path_to_browsermobproxy = os.environ.get(
        "BROWSERMOB_PROXY_BIN",
        "/home/wlodzimierrr/tools/browsermob-proxy-2.1.4/bin/",
    )
    server = Server(
        os.path.join(path_to_browsermobproxy, "browsermob-proxy"),
        options={'port': 8090},
    )
    server.start()

    # Everything after server.start() runs inside try/finally so the proxy
    # process is stopped even when proxy or driver creation fails.
    driver = None
    try:
        proxy = server.create_proxy(params={"trustAllServers": "true"})

        options = webdriver.ChromeOptions()
        options.add_argument('headless')
        options.add_argument("--ignore-certificate-errors")
        options.add_argument("--proxy-server={0}".format(proxy.proxy))

        service = Service(executable_path=binary_path)
        driver = webdriver.Chrome(service=service, options=options)

        # Initialize an empty list to store HAR entries
        network_logs = []

        # Read URLs from CSV file
        with open('sitemap_urls.csv', 'r', newline='', encoding='utf-8') as csvfile:
            reader = csv.reader(csvfile)
            # The first row is the 'URL' header, not a page to visit
            next(reader, None)
            # Ignore blank lines so row[0] cannot raise IndexError
            urls = [row[0] for row in reader if row and row[0].strip()]

        # Iterate over each URL from the sitemap
        for url in urls:
            time.sleep(10)  # fixed pause between page loads
            print(f"Navigating to: {url}")

            # Start capturing HAR data for the current URL
            proxy.new_har(url)

            # Load the URL in the browser
            driver.get(url)

            # Check if the "Load More" button exists
            try:
                load_more_element = wait_for_element_presence(driver, By.CSS_SELECTOR, '[data-test-id="load-more-products"]')

                # Scroll into view if necessary
                actions = ActionChains(driver)
                actions.move_to_element(load_more_element).perform()
                time.sleep(1)

                # Click the "Load More" button using JavaScript to bypass interception
                driver.execute_script("arguments[0].click();", load_more_element)

                # Wait for additional content to load
                time.sleep(5)

            except TimeoutException:
                # Handle the case where "Load More" button doesn't exist
                print("No 'Load More' button found. Proceeding with the current page content.")

            # Capture the HAR data after the page has fully loaded
            har_entry = proxy.har

            # Append the HAR entry to the list
            network_logs.append(har_entry)

        # Write the list of HAR entries to a file. Mode "w" (not "a") is used
        # because appending a second JSON document on a re-run makes the file
        # unparseable by json.load.
        with open("network_logs_all.har", "w", encoding="utf-8") as f:
            json.dump(network_logs, f, ensure_ascii=False)

    finally:
        if driver is not None:
            driver.quit()
        server.stop()


In [None]:
import json
from urllib.parse import urlparse, parse_qs
import csv

# Path to your HAR file
har_file_path = "network_logs_all.har"
csv_file = 'all_tradepoint_category_codes.csv'

all_codes = []

try:
    # Read the HAR File and parse it using JSON
    with open(har_file_path, "r", encoding="utf-8") as f:
        logs = json.load(f)

    # Iterate through the logs and collect every 'filter[category]' value
    # found in the query string of a captured request URL.
    for log in logs:
        network_logs = log['log']['entries']
        for entry in network_logs:
            try:
                url = entry['request']['url']
            except KeyError:
                continue  # If 'request' or 'url' key is missing, skip this entry

            query_params = parse_qs(urlparse(url).query)

            # Find the entry with 'filter[category]' parameter
            if 'filter[category]' in query_params:
                value_param = query_params['filter[category]'][0]
                print(f"URL: {url}")
                print(f"Value of 'filter[category]': {value_param}")
                all_codes.append(value_param)

    # De-duplicate and sort so repeated runs produce the same file.
    all_codes = sorted(set(all_codes))

    # Write header and rows in one pass, only after parsing succeeded, so a
    # failure above never leaves a truncated header-only CSV behind. UTF-8 is
    # pinned because the file is read back with encoding='utf-8'.
    with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['category_code'])  # Write header row
        writer.writerows([code] for code in all_codes)

except FileNotFoundError:
    print(f"Error: HAR file '{har_file_path}' not found.")
except json.JSONDecodeError as e:
    print(f"Error parsing JSON: {e}")
except Exception as e:
    print(f"An error occurred: {e}")


In [3]:
import os  # local import: only needed to read the optional override below

# Application id sent in the Authorization header. It can be overridden via
# the KINGFISHER_ATMOSPHERE_APP_ID environment variable so it can be changed
# without editing the notebook; an unset or empty variable keeps the default.
# NOTE(review): if this id is meant to be private, drop the default below and
# require the environment variable instead.
atmosphere_app_id = (
    os.environ.get('KINGFISHER_ATMOSPHERE_APP_ID')
    or 'kingfisher-LTbGHXKinHaJSV86nEjf0KnO70UOVE6UcYAswLuC'
)

# Request headers sent with every API call.
# NOTE(review): 'x-dtc', 'X-Context-Location' and the 'sec-ch-ua*' values look
# like they were copied from a single browser session - confirm they are
# actually required by the API.
headers = {
    'Accept': 'application/json, text/plain, */*',
    'Accept-Language': 'en-GB,en;q=0.9,en-US;q=0.8,pl;q=0.7,nl;q=0.6',
    'Authorization': f'Atmosphere atmosphere_app_id={atmosphere_app_id}',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Origin': 'https://www.trade-point.co.uk',
    'Pragma': 'no-cache',
    'Referer': 'https://www.trade-point.co.uk/',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'cross-site',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
    'X-Context-Location': '/kitchen/kitchen-cabinets/kitchen-doors.cat',
    'sec-ch-ua': '"Google Chrome";v="125", "Chromium";v="125", "Not.A/Brand";v="24"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'x-dtc': 'sn="v_4_srv_-2D20_sn_3DVN1BO5L1CBBVKHKCI1EI1OPMMJFPCD", pc="-20$218348505_48h39vCMRFHLTDBVQCPNAPAHBUKCANTLTWOTFH-0e0", v="17112790819981S31G3BNLBCJ3II45KG52PONU7NP0H1A", app="d09643a9157f0c88", r="https://www.trade-point.co.uk/kitchen/kitchen-cabinets/kitchen-doors.cat?Brand=GoodHome&Range=Stevia"',
    'x-tenant': 'TPUK',
}

In [6]:
import time
import random
import requests
import csv
import json

def initial_request(category_code, timeout=30):
    """Fetch one product of a category from the Kingfisher products API.

    Sleeps a random 5-10 seconds before the request to throttle the crawl,
    then requests a single-item page (``page[size]=1``) for ``category_code``
    using the module-level ``headers``.

    Args:
        category_code: Value for the ``filter[category]`` query parameter.
        timeout: Seconds passed to ``requests.get`` as its timeout. Without a
            timeout a stalled connection would block forever.

    Returns:
        The decoded JSON payload on success, or ``None`` if the request failed
        or the body was not valid JSON (the error is printed).
    """
    try:
        time.sleep(random.uniform(5, 10))
        response = requests.get(
            f'https://api.kingfisher.com/v2/mobile/products/TPUK?filter[category]={category_code}&include=content&page[size]=1',
            headers=headers,
            timeout=timeout,
        )
        response.raise_for_status()
        return response.json()
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
    # Timeout is checked before ConnectionError because ConnectTimeout derives
    # from both and should be reported as a timeout.
    except requests.exceptions.Timeout as timeout_err:
        print(f"The request timed out: {timeout_err}")
    except requests.exceptions.ConnectionError as conn_err:
        print(f"Connection error occurred: {conn_err}")
    except requests.exceptions.RequestException as err:
        print(f"An error occurred while handling your request: {err}")
    except ValueError as json_err:
        # Older requests versions raise a plain ValueError subclass from
        # response.json() when the body is not valid JSON.
        print(f"An error occurred while handling your request: {json_err}")
    return None

def get_total_page_count(response):
    """Extract the total result count from an API response.

    Args:
        response: Either an already-decoded JSON payload (a ``dict``, which is
            what ``initial_request`` returns) or a response-like object
            exposing ``status_code`` and ``json()``. ``None`` or any falsy
            value is treated as a failed request.

    Returns:
        A 3-tuple ``(total_results, is_valid, data)``:
        * ``(n, True, data)`` when ``meta.paging`` is present; ``n`` is
          ``paging['totalResults']`` (0 if that key is absent).
        * ``(0, False, data)`` when the payload has no paging section.
        * ``(0, False, None)`` for a missing/non-200 response or any error
          while decoding or reading the payload.

        A tuple is returned on every path so callers can always unpack it.
    """
    failure = (0, False, None)

    if not response:
        print("Failed to fetch data or no response")
        return failure

    # A dict is an already-decoded payload; anything else must be a
    # response object whose status is checked before decoding.
    is_payload = isinstance(response, dict)
    if not is_payload and response.status_code != 200:
        print("Failed to fetch data or no response")
        return failure

    try:
        data = response if is_payload else response.json()
        meta = data.get('meta', {})
        paging = meta.get('paging', None)
        if paging is None:
            print("Paging data is missing in the response.")
            return 0, False, data
        total_results = paging.get('totalResults', 0)
        return total_results, True, data
    except json.JSONDecodeError as e:
        print(f"JSON decode error: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
    return failure

def process_category_codes(input_csv, output_csv):
    """Look up every category code from ``input_csv`` and record the useful ones.

    The first row of ``input_csv`` is treated as a header and skipped; the
    first column of each remaining row is used as the category code. For each
    code the API is queried through ``initial_request`` and the result is
    interpreted by ``get_total_page_count``. When the reported count is
    greater than 1, a row ``[page_url, category, subcategory, category_code]``
    is appended to ``output_csv``.

    ``category`` and ``subcategory`` are the names of the second-to-last and
    third-to-last entries of the first product's ``breadcrumbList``.

    Args:
        input_csv: Path of the CSV file holding the category codes.
        output_csv: Path of the CSV file to append results to.
    """
    with open(input_csv, 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # skip the header row
        for code in reader:
            # Blank lines would otherwise raise IndexError on code[0]
            if not code or not code[0].strip():
                continue
            category_code = code[0]
            print(f"checking: {category_code}")
            response = initial_request(category_code)
            count, is_valid, data = get_total_page_count(response)
            if not (is_valid and data):
                continue

            try:
                attributes = data['data'][0]['attributes']
                page_url = attributes['pdpURL']
                breadcrumbs = attributes['breadcrumbList']
                # With fewer than three entries the negative indices below
                # would wrap around or fall off the list.
                if len(breadcrumbs) < 3:
                    raise IndexError("breadcrumb trail too short")
                category = breadcrumbs[-2]['name']
                subcategory = breadcrumbs[-3]['name']
            except (KeyError, IndexError, TypeError) as err:
                # An empty product list or unexpected payload must not abort
                # the whole run; report it and move on to the next code.
                print(f"skipping {category_code}: unexpected product payload ({err!r})")
                continue

            print(f"count: {count} page url: {page_url}")
            if count > 1:
                # Appending per row keeps earlier results if a later request fails
                with open(output_csv, mode='a', newline='', encoding='utf-8') as file:
                    writer = csv.writer(file)
                    writer.writerow([page_url, category, subcategory, category_code])
                    print(f"saved: {category_code}")

# CSV locations: the category codes to check, and where matching rows are appended
input_csv, output_csv = (
    'all_tradepoint_category_codes.csv',
    'sorted_tradepoint_codes.csv',
)

# # Process the category codes
# process_category_codes(input_csv, output_csv)