In [1]:
import time
import random
import logging
import requests
import json
import pandas as pd
from typing import Optional, List, Dict, Union
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from urllib.parse import quote
import os

In [2]:
class CommonCrawlScraper:
    """Page through a Common Crawl URL index and persist the results as parquet.

    One instance targets a single crawl index (``base_index``). Results for each
    country/pattern pair are written to their own parquet file in the current
    working directory; that file doubles as a cache so an interrupted run can be
    resumed without re-fetching completed patterns.
    """

    # HTTP statuses treated as transient and retried by the session adapter.
    _RETRY_STATUSES = (429, 500, 502, 503, 504)

    def __init__(self, base_index: str = "CC-MAIN-2024-30",
                 max_retries: int = 5, backoff_factor: float = 1.0):
        """
        Create a scraper bound to one crawl index.

        Args:
            base_index: Crawl identifier used to build the index URL.
            max_retries: Maximum automatic retries per request for transient
                failures (connection errors and the statuses in _RETRY_STATUSES).
            backoff_factor: Backoff factor handed to urllib3's Retry to space
                out successive retries.
        """
        self.base_url = f"https://index.commoncrawl.org/{base_index}-index"
        self.session = requests.Session()

        # Without an adapter-level retry policy a single throttled or failed
        # response would end pagination for the whole pattern (fetch_page
        # returns None on any request error).
        retry_policy = Retry(
            total=max_retries,
            backoff_factor=backoff_factor,
            status_forcelist=self._RETRY_STATUSES,
        )
        adapter = HTTPAdapter(max_retries=retry_policy)
        self.session.mount("https://", adapter)
        self.session.mount("http://", adapter)

        # Set up logging
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )

    @staticmethod
    def _cache_filename(country: str, pattern: str) -> str:
        """Return the per-pattern parquet filename used as the on-disk cache."""
        safe_pattern = pattern.replace('*', 'ALL').replace('.', '_')
        return f'commoncrawl_{country.lower()}_{safe_pattern}_results.parquet'

    def fetch_page(self, search_term: str, page: int) -> Optional[List[Dict]]:
        """
        Fetch a single page of results for a search term.

        The response body is read as newline-delimited JSON. Blank lines,
        unparsable lines and lines that are not JSON objects are skipped.

        Args:
            search_term: Term to search for (e.g., "*.gov")
            page: Page number to fetch

        Returns:
            List of result dictionaries, or None if the request failed or the
            server reported the page as invalid.
        """
        encoded_term = quote(search_term)
        url = f"{self.base_url}?url={encoded_term}&output=json&page={page}"
        logging.info(f"Fetching page {page} for {search_term}")
        try:
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            logging.error(f"Error fetching page {page} for {search_term}: {e}")
            return None

        results = []
        for raw_line in response.text.strip().split('\n'):
            line = raw_line.strip()
            if not line:
                # An empty body or stray blank line is not a parse error.
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError as e:
                logging.warning(f"Could not parse line on page {page} for {search_term}: {e}")
                continue
            if not isinstance(data, dict):
                # Callers tag each record with extra keys, so only objects are usable.
                logging.warning(f"Skipping non-object JSON line on page {page} for {search_term}")
                continue
            message = data.get('message')
            if isinstance(message, str) and 'invalid' in message.lower():
                logging.info(f"Reached end of valid pages at page {page} for {search_term}")
                return None
            results.append(data)
        logging.info(f"Page {page} returned {len(results)} results.")
        return results

    def scrape_pattern(self, country: str, pattern: str, start_page: int = 0,
                      delay: float = 1.0) -> pd.DataFrame:
        """
        Scrape all pages for a single search pattern.

        Pages are requested sequentially from ``start_page`` until fetch_page
        returns None or an empty list. A request failure and the end of the
        page range both surface as None, so either one ends the loop and the
        rows gathered so far are returned.

        Args:
            country: Country label (e.g., "United States")
            pattern: Domain pattern to search (e.g., "*.gov")
            start_page: Page number to start from
            delay: Base delay between requests in seconds

        Returns:
            DataFrame containing results for this pattern, with ``country`` and
            ``pattern`` columns added; empty DataFrame if nothing was fetched.
        """
        all_results = []
        page = start_page
        pages_scraped = 0

        logging.info(f"Starting scrape for {country} pattern: {pattern}")
        while True:
            results = self.fetch_page(pattern, page)
            if not results:
                logging.info(f"No more results for {country} pattern {pattern} after page {page}.")
                break
            for result in results:
                result['country'] = country
                result['pattern'] = pattern
            all_results.extend(results)
            pages_scraped += 1
            logging.info(f"Scraped page {page} for {country} pattern {pattern} ({len(results)} results). Total pages scraped: {pages_scraped}")

            # Jittered pause (between delay and 2 * delay) before the next request.
            sleep_time = delay + random.uniform(0, delay)
            logging.info(f"Sleeping for {sleep_time:.2f} seconds before next page request.")
            time.sleep(sleep_time)
            page += 1

        logging.info(f"Finished scraping pattern {pattern} for {country}. Total results: {len(all_results)}")
        return pd.DataFrame(all_results) if all_results else pd.DataFrame()

    def scrape_all(self, search_terms: Dict[str, Union[str, List[str]]],
               delay: float = 1.0, load_cached: bool = True) -> pd.DataFrame:
        """
        Scrape all pages for multiple countries and their patterns.

        Each pattern's results are saved to their own parquet file. When that
        file already exists the pattern is not scraped again; with
        ``load_cached`` enabled its rows are read back so the returned frame,
        the combined output file and the summary cover every pattern rather
        than only those fetched during this call.

        Args:
            search_terms: Dictionary mapping countries to patterns,
                          e.g., {"USA": ["*.gov", "*.fed.us"], "Canada": ["*.gc.ca"]}
            delay: Base delay between requests in seconds.
            load_cached: If True (default), include rows from existing
                         per-pattern files in the result. If False, existing
                         files are skipped without being read.

        Returns:
            DataFrame containing all results (empty if nothing was collected).
        """
        all_dfs = []
        for country, patterns in search_terms.items():
            if isinstance(patterns, str):
                patterns = [patterns]
            for pattern in patterns:
                filename = self._cache_filename(country, pattern)
                if os.path.exists(filename):
                    logging.info(f"File {filename} already exists; skipping scrape for {country} pattern {pattern}")
                    if load_cached:
                        try:
                            cached_df = pd.read_parquet(filename)
                        except Exception as e:
                            # Leave the unreadable file in place for inspection.
                            logging.error(f"Could not load cached results from {filename}: {e}")
                            continue
                        if not cached_df.empty:
                            all_dfs.append(cached_df)
                    continue

                try:
                    df = self.scrape_pattern(country, pattern, delay=delay)
                    if not df.empty:
                        # Append before saving so a failed write does not drop the rows.
                        all_dfs.append(df)
                        df.to_parquet(filename, index=False)
                        logging.info(f"Saved intermediate results for {country} pattern {pattern} to {filename}")
                    else:
                        logging.info(f"No data found for {country} pattern {pattern}.")
                except Exception as e:
                    logging.error(f"Error processing {country} pattern {pattern}: {e}")
                    continue
        if all_dfs:
            final_df = pd.concat(all_dfs, ignore_index=True)
            final_filename = "commoncrawl_all_results.parquet"
            final_df.to_parquet(final_filename, index=False)
            logging.info(f"Scrape complete. Total results: {len(final_df)}. Saved final results to {final_filename}")

            summary = final_df.groupby(['country', 'pattern']).size().reset_index(name='count')
            logging.info("Summary of results by country and pattern:")
            logging.info(summary.to_string(index=False))
            return final_df
        else:
            logging.info("No results were scraped.")
            return pd.DataFrame()

In [3]:
# Search configuration passed to CommonCrawlScraper.scrape_all: maps a country
# label to the wildcard domain pattern(s) to query for that country. A value is
# either a single pattern string or a list of patterns (scrape_all accepts
# both). Entries are grouped by region; dict order is the iteration order.
gov_domains = {
    # North America
    "United States": ["*.gov", "*.mil"],
    "Canada": ["*.gc.ca", "*.canada.ca"],
    "Mexico": "*.gob.mx",
    
    # Caribbean
    "Jamaica": "*.gov.jm",
    "Trinidad and Tobago": "*.gov.tt",
    "Barbados": "*.gov.bb",
    "Bahamas": "*.gov.bs",
    "Dominican Republic": "*.gob.do",
    
    # Central America
    "Costa Rica": "*.go.cr",
    "Panama": "*.gob.pa",
    "Guatemala": "*.gob.gt",
    "El Salvador": "*.gob.sv",
    "Honduras": "*.gob.hn",
    "Nicaragua": "*.gob.ni",
    "Belize": "*.gov.bz",
    
    # South America
    "Brazil": "*.gov.br",
    "Argentina": "*.gob.ar",
    "Chile": "*.gob.cl",
    "Colombia": "*.gov.co",
    "Peru": "*.gob.pe",
    "Venezuela": "*.gob.ve",
    "Ecuador": "*.gob.ec",
    "Bolivia": "*.gob.bo",
    "Paraguay": "*.gov.py",
    "Uruguay": "*.gub.uy",
    "Guyana": "*.gov.gy",
    "Suriname": "*.gov.sr",
    
    # Western Europe
    "United Kingdom": "*.gov.uk",
    "France": "*.gouv.fr",
    "Germany": ["*.bund.de", "*.bayern.de"],
    "Italy": "*.gov.it",
    "Spain": "*.gob.es",
    "Portugal": "*.gov.pt",
    "Netherlands": ["*.overheid.nl", "*.regering.nl"],
    "Belgium": ["*.belgium.be", "*.fed.be"],
    "Ireland": "*.gov.ie",
    "Luxembourg": "*.gouvernement.lu",
    "Monaco": "*.gouv.mc",
    
    # Northern Europe
    "Sweden": ["*.regeringen.se", "*.gov.se"],
    "Norway": "*.regjeringen.no",
    "Denmark": "*.gov.dk",
    "Finland": "*.gov.fi",
    "Iceland": "*.island.is",
    "Estonia": "*.gov.ee",
    "Latvia": "*.gov.lv",
    "Lithuania": "*.gov.lt",
    
    # Eastern Europe
    "Poland": "*.gov.pl",
    "Czech Republic": "*.gov.cz",
    "Slovakia": "*.gov.sk",
    "Hungary": "*.gov.hu",
    "Romania": "*.gov.ro",
    "Bulgaria": "*.government.bg",
    "Moldova": "*.gov.md",
    "Ukraine": "*.gov.ua",
    "Belarus": "*.gov.by",
    
    # Southern Europe
    "Greece": "*.gov.gr",
    "Croatia": "*.gov.hr",
    "Serbia": "*.gov.rs",
    "Slovenia": "*.gov.si",
    "Albania": "*.gov.al",
    "North Macedonia": "*.gov.mk",
    "Bosnia and Herzegovina": "*.gov.ba",
    "Montenegro": "*.gov.me",
    "Malta": "*.gov.mt",
    "Cyprus": "*.gov.cy",
    
    # South Asia
    "India": ["*.gov.in", "*.nic.in"],
    "Pakistan": "*.gov.pk",
    "Bangladesh": "*.gov.bd",
    "Sri Lanka": "*.gov.lk",
    "Nepal": "*.gov.np",
    "Bhutan": "*.gov.bt",
    "Maldives": "*.gov.mv",
    "Afghanistan": "*.gov.af",
    
    # East Asia
    "Japan": "*.go.jp",
    "South Korea": "*.go.kr",
    "North Korea": "*.gov.kp",
    # NOTE(review): the second pattern has a non-ASCII label, which fetch_page
    # percent-encodes via quote(); confirm the index matches hosts in this form
    # rather than their punycode form.
    "China": ["*.gov.cn", "*.政务.cn"],
    "Mongolia": "*.gov.mn",
    "Taiwan": "*.gov.tw",

    # Southeast Asia
    "Indonesia": "*.go.id",
    "Malaysia": "*.gov.my",
    "Singapore": "*.gov.sg",
    "Philippines": "*.gov.ph",
    "Thailand": "*.go.th",
    "Vietnam": "*.gov.vn",
    "Myanmar": "*.gov.mm",
    "Cambodia": "*.gov.kh",
    "Laos": "*.gov.la",
    "Brunei": "*.gov.bn",
    "Timor-Leste": "*.gov.tl",
    
    # Central Asia
    "Kazakhstan": "*.gov.kz",
    "Uzbekistan": "*.gov.uz",
    "Kyrgyzstan": "*.gov.kg",
    "Tajikistan": "*.gov.tj",
    "Turkmenistan": "*.gov.tm",
    
    # Middle East
    "Saudi Arabia": "*.gov.sa",
    "UAE": "*.gov.ae",
    "Iran": "*.gov.ir",
    "Iraq": "*.gov.iq",
    "Israel": "*.gov.il",
    "Jordan": "*.gov.jo",
    "Lebanon": "*.gov.lb",
    "Oman": "*.gov.om",
    "Qatar": "*.gov.qa",
    "Kuwait": "*.gov.kw",
    "Bahrain": "*.gov.bh",
    "Yemen": "*.gov.ye",
    "Syria": "*.gov.sy",
    
    # North Africa
    "Egypt": "*.gov.eg",
    "Morocco": "*.gov.ma",
    "Tunisia": "*.gov.tn",
    "Algeria": "*.gov.dz",
    "Libya": "*.gov.ly",
    "Sudan": "*.gov.sd",
    
    # West Africa
    "Nigeria": "*.gov.ng",
    "Ghana": "*.gov.gh",
    "Senegal": "*.gouv.sn",
    "Ivory Coast": "*.gouv.ci",
    "Mali": "*.gouv.ml",
    "Burkina Faso": "*.gov.bf",
    "Guinea": "*.gov.gn",
    "Sierra Leone": "*.gov.sl",
    "Liberia": "*.gov.lr",
    "Togo": "*.gouv.tg",
    "Benin": "*.gouv.bj",
    "Niger": "*.gouv.ne",
    "Gambia": "*.gov.gm",
    "Guinea-Bissau": "*.gov.gw",
    "Cape Verde": "*.gov.cv",
    
    # East Africa
    "Kenya": "*.go.ke",
    "Tanzania": "*.go.tz",
    "Uganda": "*.go.ug",
    "Ethiopia": "*.gov.et",
    "Rwanda": "*.gov.rw",
    "Burundi": "*.gov.bi",
    "South Sudan": "*.gov.ss",
    "Eritrea": "*.gov.er",
    "Djibouti": "*.gouv.dj",
    "Somalia": "*.gov.so",
    
    # Southern Africa
    "South Africa": "*.gov.za",
    "Namibia": "*.gov.na",
    "Botswana": "*.gov.bw",
    "Zimbabwe": "*.gov.zw",
    "Mozambique": "*.gov.mz",
    "Zambia": "*.gov.zm",
    "Malawi": "*.gov.mw",
    "Angola": "*.gov.ao",
    "Madagascar": "*.gov.mg",
    "Mauritius": "*.gov.mu",
    "Seychelles": "*.gov.sc",
    "Lesotho": "*.gov.ls",
    "Eswatini": "*.gov.sz",
    
    # Oceania
    "Australia": "*.gov.au",
    "New Zealand": "*.govt.nz",
    "Papua New Guinea": "*.gov.pg",
    "Fiji": "*.gov.fj",
    "Solomon Islands": "*.gov.sb",
    "Vanuatu": "*.gov.vu",
    "New Caledonia": "*.gouv.nc",
    "Samoa": "*.gov.ws",
    "Tonga": "*.gov.to",
    "French Polynesia": "*.gouv.pf"
}

In [4]:
# Build a scraper against the default crawl index ("CC-MAIN-2024-30").
scraper = CommonCrawlScraper()

# Run every country/pattern in gov_domains. Patterns whose per-pattern parquet
# file already exists in the working directory are not re-scraped.
df = scraper.scrape_all(gov_domains)

2025-02-17 12:01:41,387 - INFO - File commoncrawl_united states_ALL_gov_results.parquet already exists; skipping scrape for United States pattern *.gov
2025-02-17 12:01:41,388 - INFO - File commoncrawl_united states_ALL_mil_results.parquet already exists; skipping scrape for United States pattern *.mil
2025-02-17 12:01:41,388 - INFO - File commoncrawl_canada_ALL_gc_ca_results.parquet already exists; skipping scrape for Canada pattern *.gc.ca
2025-02-17 12:01:41,389 - INFO - File commoncrawl_canada_ALL_canada_ca_results.parquet already exists; skipping scrape for Canada pattern *.canada.ca
2025-02-17 12:01:41,389 - INFO - File commoncrawl_mexico_ALL_gob_mx_results.parquet already exists; skipping scrape for Mexico pattern *.gob.mx
2025-02-17 12:01:41,389 - INFO - File commoncrawl_jamaica_ALL_gov_jm_results.parquet already exists; skipping scrape for Jamaica pattern *.gov.jm
2025-02-17 12:01:41,390 - INFO - File commoncrawl_trinidad and tobago_ALL_gov_tt_results.parquet already exists; s