In [None]:
import xml.etree.ElementTree as ET
from pathlib import Path

def extract_english_links(xml_file):
    """Collect the English alternate-language URLs listed in a sitemap.

    Every ``<url>`` entry is scanned for ``<xhtml:link>`` children whose
    ``hreflang`` attribute is exactly ``'en'``; the ``href`` of each match
    is collected.

    Args:
        xml_file: Path or file object accepted by ``ET.parse``.

    Returns:
        list[str]: Unique English URLs in first-seen document order. Links
        without a usable ``href`` are skipped.
    """
    # Define the XML namespaces
    namespaces = {
        'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9',
        'xhtml': 'http://www.w3.org/1999/xhtml'
    }

    # Parse the XML file
    root = ET.parse(xml_file).getroot()

    english_urls = []
    # The same English URL is typically repeated under every language
    # variant's <url> entry, so track what has already been collected.
    seen = set()

    for url in root.findall('.//ns:url', namespaces):
        for link in url.findall('.//xhtml:link', namespaces):
            if link.get('hreflang') != 'en':
                continue
            href = link.get('href')
            # A link without href would otherwise end up as None in the
            # result and be written out as the literal text "None".
            if not href or href in seen:
                continue
            seen.add(href)
            english_urls.append(href)

    return english_urls

def save_urls_to_file(urls, output_file):
    """Write every entry of ``urls`` to ``output_file``, one per line, as UTF-8.

    The file is opened in ``'w'`` mode, so existing content is replaced.
    """
    with open(output_file, 'w', encoding='utf-8') as sink:
        sink.writelines(f"{entry}\n" for entry in urls)

# Input sitemap and output URL list; both paths are relative to the
# notebook's working directory.
xml_file = '../data/sitemap.xml'  # Replace with your sitemap file path
output_file = '../data/english_urls.txt'

# Extract the hreflang="en" links from the sitemap and write them to
# output_file, one URL per line.
english_urls = extract_english_links(xml_file)
save_urls_to_file(english_urls, output_file)

# Report how many URLs were collected and where they were written.
print(f"Found {len(english_urls)} English URLs and saved them to {output_file}")

In [None]:
import requests
# Fetch a single page through the r.jina.ai endpoint as a manual check of
# the request headers before running the full scrape.
target_url = "https://www.haagen-dazs.ca/en/haagen-dazs/extraaz-cookie-dough-dynamo-ice-cream-single-bars"
url = 'https://r.jina.ai/' + target_url
headers = {
    # NOTE(review): no token follows "Bearer" — presumably redacted before
    # sharing; supply it from the environment rather than hardcoding it.
    'Authorization': 'Bearer',
    'X-Remove-Selector': '.latest-article-slider, #block-views-block-recent-videos-recent-videos, .product-recipes-container',
    'X-Retain-Images': 'none',
    "X-Base": "final",
    'X-Target-Selector': 'main',
    'X-With-Links-Summary': 'true'
}

# requests applies no timeout by default, so without one a stalled
# connection would hang this cell indefinitely.
response = requests.get(url, headers=headers, timeout=20)
print(response.text)


In [None]:
import asyncio
import aiohttp
from pathlib import Path
import logging
from typing import List, Set
from dotenv import load_dotenv
import csv

# Load variables from ../.env (relative to the working directory).
# NOTE(review): nothing in the visible cells reads an environment variable
# afterwards — confirm this call is still needed.
load_dotenv(dotenv_path="../.env")

# for jupyter only: the kernel keeps the root logger between cell runs, and
# basicConfig does nothing while the root logger still has handlers.
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)
    # Close as well as detach so a previous run's log file handle is released.
    handler.close()

# FileHandler does not create missing parent directories.
Path('logs').mkdir(parents=True, exist_ok=True)

# Send every record at INFO or above to both the log file and the cell output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('logs/scraping.log'),
        logging.StreamHandler()
    ]
)


class Scraper:
    """Download pages through the r.jina.ai endpoint and store them as ``.md`` files.

    URLs are read from a text file, placed on an ``asyncio.Queue`` and
    consumed by ``max_concurrent`` worker tasks. Each successful response is
    written to ``output_dir`` under a filename derived from the URL path.
    Outcomes are collected in ``successful_urls``, ``failed_urls`` and
    ``processed_urls`` and written to ``log_dir`` by ``save_processed_urls``.
    """

    def __init__(self, urls_file: str, log_dir: str, output_dir: str, max_concurrent: int):
        """Store the configuration and prepare the output and log directories.

        Args:
            urls_file: Text file with one URL per line.
            log_dir: Directory that receives the result files.
            output_dir: Directory that receives the downloaded ``.md`` files.
            max_concurrent: Number of worker tasks started by ``run``.
        """
        self.urls_file = urls_file
        self.log_dir = Path(log_dir)
        self.output_dir = Path(output_dir)
        self.max_concurrent = max_concurrent
        self.successful_urls: Set[str] = set()
        self.failed_urls: Set[str] = set()
        self.processed_urls: List[tuple] = []  # List of (filename, url) tuples
        self.headers = {
            'X-Remove-Selector': '.latest-article-slider, #block-views-block-recent-videos-recent-videos, .product-recipes-container',
            'X-Retain-Images': 'none',
            'X-Base': 'final',
            'X-Target-Selector': 'main',
            'X-With-Links-Summary': 'true',
        }

        # Create output and log directories if they don't exist; the log
        # directory is needed later by save_processed_urls.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.log_dir.mkdir(parents=True, exist_ok=True)

        # Create a queue for URLs
        self.queue = asyncio.Queue()

    @staticmethod
    def _url_path(url: str) -> str:
        """Return the part of ``url`` after the host, without the leading slash.

        A URL with no slash after the host yields an empty path.
        """
        without_scheme = url.split('://', 1)[-1]
        _host, _sep, path = without_scheme.partition('/')
        return path

    def _filename_for_url(self, url: str) -> str:
        """Return the output filename stem (no extension) used for ``url``.

        Path segments are joined with underscores and sanitized; an empty
        path maps to ``'index'``. Both ``load_urls`` and ``process_url`` use
        this so the "already downloaded" check matches the files written.
        """
        parts = [p for p in self._url_path(url).split('/') if p]
        return self.sanitize_filename("_".join(parts)) or 'index'

    async def load_urls(self):
        """Load URLs from file into queue, skipping those with existing files.

        URLs whose path starts with ``video/`` or ``node/`` are skipped too,
        and a URL listed more than once is queued only once.
        """
        # Get list of existing files
        existing_files = set(f.stem for f in self.output_dir.glob('*.md'))

        with open(self.urls_file, 'r', encoding='utf-8') as f:
            # dict.fromkeys drops repeated URLs while keeping file order.
            urls = list(dict.fromkeys(line.strip() for line in f if line.strip()))

        skipped = 0
        queued = 0

        for url in urls:
            if self._url_path(url).startswith(('video/', 'node/')):
                skipped += 1
                continue

            # Check if file already exists
            if self._filename_for_url(url) in existing_files:
                skipped += 1
                continue

            # Queue URL if file doesn't exist
            await self.queue.put(url)
            queued += 1

        logging.info(f"Loaded {queued} URLs into queue, skipped {skipped} existing files")

    def sanitize_filename(self, filename: str) -> str:
        """Replace each of ``<>:"/\\|?*`` in ``filename`` with an underscore."""
        invalid_chars = '<>:"/\\|?*'
        for char in invalid_chars:
            filename = filename.replace(char, '_')
        return filename

    # The session annotations below are quoted so that they are not
    # evaluated when the class body is executed.
    async def process_url(self, session: "aiohttp.ClientSession", url: str, retries=2, retry_delay=20):
        """Fetch ``url`` through r.jina.ai and save the response body as Markdown.

        Args:
            session: Session used to issue the GET request.
            url: Page URL to fetch.
            retries: Maximum number of attempts.
            retry_delay: Seconds to wait before the next attempt.

        A 200 response is written to ``output_dir`` and ends processing
        immediately. Exceptions and transient statuses (429 and 5xx) are
        retried while attempts remain; any other status fails at once. The
        URL is added to ``failed_urls`` only when no attempt succeeded.
        """
        api_url = f'https://r.jina.ai/{url}'
        for attempt in range(retries):
            can_retry = attempt < retries - 1
            try:
                async with session.get(api_url, headers=self.headers) as response:
                    status = response.status
                    if status == 200:
                        content = await response.text()

                        filename = self._filename_for_url(url)
                        filepath = self.output_dir / f"{filename}.md"

                        # Save content
                        with open(filepath, 'w', encoding='utf-8') as f:
                            f.write(content)

                        self.processed_urls.append((f"{filename}.md", url))
                        self.successful_urls.add(url)
                        # A success supersedes any failure recorded earlier.
                        self.failed_urls.discard(url)
                        logging.info(f"Successfully processed {url}")
                        # Done: without this return the loop would fetch
                        # and write the same page again on the next attempt.
                        return

                transient = status == 429 or status >= 500
                if not (can_retry and transient):
                    self.failed_urls.add(url)
                    logging.error(f"Failed to process {url}: Status {status}")
                    return
                logging.warning(f"Received status {status} for {url}")
            except Exception as e:
                if not can_retry:
                    self.failed_urls.add(url)
                    logging.error(f"Error processing {url}: {str(e)}")
                    logging.error(f"Error type: {type(e).__name__}")
                    if hasattr(e, 'response'):
                        logging.error(f"Response status: {e.response.status if hasattr(e.response, 'status') else 'N/A'}")
                    return

            logging.warning(f"Attempt {attempt+1}/{retries} failed for {url}. Retrying in {retry_delay}s")
            await asyncio.sleep(retry_delay)

    async def worker(self, session: "aiohttp.ClientSession"):
        """Worker to process URLs from queue until the task is cancelled."""
        while True:
            try:
                url = await self.queue.get()
            except asyncio.CancelledError:
                break

            try:
                await self.process_url(session, url)
            except asyncio.CancelledError:
                break
            except Exception as e:
                logging.error(f"Worker error: {str(e)}")
                self.failed_urls.add(url)
            finally:
                # Exactly one task_done per successful get, whatever happened.
                self.queue.task_done()

    async def run(self):
        """Run the scraper: start the workers, wait for the queue to drain, stop them."""
        timeout = aiohttp.ClientTimeout(total=20)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            workers = [
                asyncio.create_task(self.worker(session))
                for _ in range(self.max_concurrent)
            ]

            try:
                # Wait for all URLs to be processed
                await self.queue.join()
            finally:
                # Workers loop forever, so they must be cancelled explicitly,
                # including when the wait above is itself interrupted.
                for w in workers:
                    w.cancel()

                # Wait for workers to finish
                await asyncio.gather(*workers, return_exceptions=True)

    def save_processed_urls(self):
        """Write the run results to ``log_dir``.

        Creates ``processed_urls.csv`` (filename, url), ``successful_urls.txt``
        and ``failed_urls.txt``, replacing any previous versions.
        """
        # newline='' lets the csv module control line endings itself.
        with open(self.log_dir / 'processed_urls.csv', 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['filename', 'url'])
            writer.writerows(self.processed_urls)
            logging.info(f"Saved {len(self.processed_urls)} processed URLs")

        with open(self.log_dir / 'successful_urls.txt', 'w', encoding='utf-8') as f:
            for url in self.successful_urls:
                f.write(f"{url}\n")

        # Save failed URLs
        with open(self.log_dir / 'failed_urls.txt', 'w', encoding='utf-8') as f:
            for url in self.failed_urls:
                f.write(f"{url}\n")

        logging.info(f"Processed {len(self.successful_urls)} URLs successfully")
        logging.info(f"Failed to process {len(self.failed_urls)} URLs")

async def main():
    scraper = Scraper(
        urls_file='../data/english_urls.txt',
        output_dir='../data/site',
        log_dir='./logs',
        max_concurrent=5
    )
    
    await scraper.load_urls()
    await scraper.run()
    scraper.save_processed_urls()

await main()
