In [1]:
import pandas as pd
import requests
import json
import time
from typing import List, Dict, Optional
import logging
from urllib.parse import quote
from urllib.parse import urlparse

In [7]:
import pandas as pd
import requests
import json
import time
from typing import List, Dict, Optional, Union
import logging
from urllib.parse import quote

class CommonCrawlScraper:
    def __init__(self, base_index: str = "CC-MAIN-2024-30"):
        self.base_url = f"https://index.commoncrawl.org/{base_index}-index"
        self.session = requests.Session()
        
        # Set up logging
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        
    def fetch_page(self, search_term: str, page: int) -> Optional[List[Dict]]:
        """
        Fetch a single page of results for a search term.
        
        Args:
            search_term: Term to search for (e.g., "*.gov")
            page: Page number to fetch
            
        Returns:
            List of result dictionaries or None if page is invalid
        """
        encoded_term = quote(search_term)
        url = f"{self.base_url}?url={encoded_term}&output=json&page={page}"
        logging.info(f"Fetching page {page} for {search_term}")
        
        try:
            response = self.session.get(url)
            response.raise_for_status()
            
            results = []
            for line in response.text.strip().split('\n'):
                try:
                    data = json.loads(line)
                    if 'message' in data and 'invalid' in data['message'].lower():
                        logging.info(f"Reached end of valid pages at page {page} for {search_term}")
                        return None
                    results.append(data)
                except json.JSONDecodeError as e:
                    logging.warning(f"Could not parse line on page {page} for {search_term}: {str(e)}")
                    continue
                    
            return results
            
        except requests.exceptions.RequestException as e:
            logging.error(f"Error fetching page {page} for {search_term}: {str(e)}")
            return None
            
    def scrape_pattern(self, country: str, pattern: str, start_page: int = 0, 
                      delay: float = 1.0) -> pd.DataFrame:
        """
        Scrape all pages for a single search pattern.
        
        Args:
            country: Country label (e.g., "United States")
            pattern: Domain pattern to search (e.g., "*.gov")
            start_page: Page number to start from
            delay: Delay between requests in seconds
            
        Returns:
            DataFrame containing results for this pattern
        """
        all_results = []
        page = start_page
        
        while True:
            results = self.fetch_page(pattern, page)
            
            if not results:
                break
                
            # Add country and pattern to each result
            for result in results:
                result['country'] = country
                result['pattern'] = pattern
                
            all_results.extend(results)
            logging.info(f"Got {len(results)} results from page {page} for {country} pattern {pattern}")
            
            time.sleep(delay)
            page += 1
            
        return pd.DataFrame(all_results) if all_results else pd.DataFrame()
    
    def scrape_all(self, search_terms: Dict[str, Union[str, List[str]]], 
                   delay: float = 1.0) -> pd.DataFrame:
        """
        Scrape all pages for multiple countries and their patterns.
        
        Args:
            search_terms: Dictionary mapping countries to patterns
                        e.g., {"USA": ["*.gov", "*.fed.us"], "Canada": ["*.gc.ca"]}
            delay: Delay between requests in seconds
            
        Returns:
            DataFrame containing all results
        """
        all_dfs = []
        
        for country, patterns in search_terms.items():
            # Convert single string pattern to list
            if isinstance(patterns, str):
                patterns = [patterns]
                
            # Process each pattern for the country
            for pattern in patterns:
                logging.info(f"Starting scrape for {country} pattern: {pattern}")
                
                try:
                    df = self.scrape_pattern(country, pattern, delay=delay)
                    
                    if not df.empty:
                        all_dfs.append(df)
                        
                        # Save intermediate results with pattern info
                        safe_pattern = pattern.replace('*', 'ALL').replace('.', '_')
                        filename = f'commoncrawl_{country.lower()}_{safe_pattern}_results.csv'
                        df.to_csv(filename, index=False)
                        logging.info(f"Saved intermediate results for {country} pattern {pattern}")
                        
                except Exception as e:
                    logging.error(f"Error processing {country} pattern {pattern}: {str(e)}")
                    continue
        
        # Combine all results
        if all_dfs:
            final_df = pd.concat(all_dfs, ignore_index=True)
            logging.info(f"Total results: {len(final_df)}")
            
            # Add summary statistics
            logging.info("\nResults by country and pattern:")
            summary = final_df.groupby(['country', 'pattern']).size().reset_index(name='count')
            logging.info(summary)
            
            return final_df
        else:
            return pd.DataFrame()

In [8]:
gov_domains = {
    # North America
    "United States": ["*.gov", "*.mil"],
    "Canada": ["*.gc.ca", "*.canada.ca"],
    "Mexico": "*.gob.mx",
    
    # Caribbean
    "Jamaica": "*.gov.jm",
    "Trinidad and Tobago": "*.gov.tt",
    "Barbados": "*.gov.bb",
    "Bahamas": "*.gov.bs",
    "Dominican Republic": "*.gob.do",
    
    # Central America
    "Costa Rica": "*.go.cr",
    "Panama": "*.gob.pa",
    "Guatemala": "*.gob.gt",
    "El Salvador": "*.gob.sv",
    "Honduras": "*.gob.hn",
    "Nicaragua": "*.gob.ni",
    "Belize": "*.gov.bz",
    
    # South America
    "Brazil": "*.gov.br",
    "Argentina": "*.gob.ar",
    "Chile": "*.gob.cl",
    "Colombia": "*.gov.co",
    "Peru": "*.gob.pe",
    "Venezuela": "*.gob.ve",
    "Ecuador": "*.gob.ec",
    "Bolivia": "*.gob.bo",
    "Paraguay": "*.gov.py",
    "Uruguay": "*.gub.uy",
    "Guyana": "*.gov.gy",
    "Suriname": "*.gov.sr",
    
    # Western Europe
    "United Kingdom": "*.gov.uk",
    "France": "*.gouv.fr",
    "Germany": ["*.bund.de", "*.bayern.de"],
    "Italy": "*.gov.it",
    "Spain": "*.gob.es",
    "Portugal": "*.gov.pt",
    "Netherlands": ["*.overheid.nl", "*.regering.nl"],
    "Belgium": ["*.belgium.be", "*.fed.be"],
    "Ireland": "*.gov.ie",
    "Luxembourg": "*.gouvernement.lu",
    "Monaco": "*.gouv.mc",
    
    # Northern Europe
    "Sweden": ["*.regeringen.se", "*.gov.se"],
    "Norway": "*.regjeringen.no",
    "Denmark": "*gov.dk",
    "Finland": "*.gov.fi",
    "Iceland": "*.island.is",
    "Estonia": "*.gov.ee",
    "Latvia": "*.gov.lv",
    "Lithuania": "*.gov.lt",
    
    # Eastern Europe
    "Poland": "*.gov.pl",
    "Czech Republic": "*.gov.cz",
    "Slovakia": "*.gov.sk",
    "Hungary": "*.gov.hu",
    "Romania": "*.gov.ro",
    "Bulgaria": "*.government.bg",
    "Moldova": "*.gov.md",
    "Ukraine": "*.gov.ua",
    "Belarus": "*.gov.by",
    
    # Southern Europe
    "Greece": "*.gov.gr",
    "Croatia": "*.gov.hr",
    "Serbia": "*.gov.rs",
    "Slovenia": "*.gov.si",
    "Albania": "*.gov.al",
    "North Macedonia": "*.gov.mk",
    "Bosnia and Herzegovina": "*.gov.ba",
    "Montenegro": "*.gov.me",
    "Malta": "*.gov.mt",
    "Cyprus": "*.gov.cy",
    
    # South Asia
    "India": ["*.gov.in", "*.nic.in"],
    "Pakistan": "*.gov.pk",
    "Bangladesh": "*.gov.bd",
    "Sri Lanka": "*.gov.lk",
    "Nepal": "*.gov.np",
    "Bhutan": "*.gov.bt",
    "Maldives": "*.gov.mv",
    "Afghanistan": "*.gov.af",
    
    # East Asia
    "Japan": "*.go.jp",
    "South Korea": "*.go.kr",
    "North Korea": "*.gov.kp",
    "China": ["*.gov.cn", "*.政务.cn"],
    "Mongolia": "*.gov.mn",
    "Taiwan": "*.gov.tw",
    "Singapore":"*.gov.sg",

    # Southeast Asia
    "Indonesia": "*.go.id",
    "Malaysia": "*.gov.my",
    "Singapore": "*.gov.sg",
    "Philippines": "*.gov.ph",
    "Thailand": "*.go.th",
    "Vietnam": "*.gov.vn",
    "Myanmar": "*.gov.mm",
    "Cambodia": "*.gov.kh",
    "Laos": "*.gov.la",
    "Brunei": "*.gov.bn",
    "Timor-Leste": "*.gov.tl",
    
    # Central Asia
    "Kazakhstan": "*.gov.kz",
    "Uzbekistan": "*.gov.uz",
    "Kyrgyzstan": "*.gov.kg",
    "Tajikistan": "*.gov.tj",
    "Turkmenistan": "*.gov.tm",
    
    # Middle East
    "Saudi Arabia": "*.gov.sa",
    "UAE": "*.gov.ae",
    "Iran": "*.gov.ir",
    "Iraq": "*.gov.iq",
    "Israel": "*.gov.il",
    "Jordan": "*.gov.jo",
    "Lebanon": "*.gov.lb",
    "Oman": "*.gov.om",
    "Qatar": "*.gov.qa",
    "Kuwait": "*.gov.kw",
    "Bahrain": "*.gov.bh",
    "Yemen": "*.gov.ye",
    "Syria": "*.gov.sy",
    
    # North Africa
    "Egypt": "*.gov.eg",
    "Morocco": "*.gov.ma",
    "Tunisia": "*.gov.tn",
    "Algeria": "*.gov.dz",
    "Libya": "*.gov.ly",
    "Sudan": "*.gov.sd",
    
    # West Africa
    "Nigeria": "*.gov.ng",
    "Ghana": "*.gov.gh",
    "Senegal": "*.gouv.sn",
    "Ivory Coast": "*.gouv.ci",
    "Mali": "*.gouv.ml",
    "Burkina Faso": "*.gov.bf",
    "Guinea": "*.gov.gn",
    "Sierra Leone": "*.gov.sl",
    "Liberia": "*.gov.lr",
    "Togo": "*.gouv.tg",
    "Benin": "*.gouv.bj",
    "Niger": "*.gouv.ne",
    "Gambia": "*.gov.gm",
    "Guinea-Bissau": "*.gov.gw",
    "Cape Verde": "*.gov.cv",
    
    # East Africa
    "Kenya": "*.go.ke",
    "Tanzania": "*.go.tz",
    "Uganda": "*.go.ug",
    "Ethiopia": "*.gov.et",
    "Rwanda": "*.gov.rw",
    "Burundi": "*.gov.bi",
    "South Sudan": "*.gov.ss",
    "Eritrea": "*.gov.er",
    "Djibouti": "*.gouv.dj",
    "Somalia": "*.gov.so",
    
    # Southern Africa
    "South Africa": "*.gov.za",
    "Namibia": "*.gov.na",
    "Botswana": "*.gov.bw",
    "Zimbabwe": "*.gov.zw",
    "Mozambique": "*.gov.mz",
    "Zambia": "*.gov.zm",
    "Malawi": "*.gov.mw",
    "Angola": "*.gov.ao",
    "Madagascar": "*.gov.mg",
    "Mauritius": "*.gov.mu",
    "Seychelles": "*.gov.sc",
    "Lesotho": "*.gov.ls",
    "Eswatini": "*.gov.sz",
    
    # Oceania
    "Australia": "*.gov.au",
    "New Zealand": "*.govt.nz",
    "Papua New Guinea": "*.gov.pg",
    "Fiji": "*.gov.fj",
    "Solomon Islands": "*.gov.sb",
    "Vanuatu": "*.gov.vu",
    "New Caledonia": "*.gouv.nc",
    "Samoa": "*.gov.ws",
    "Tonga": "*.gov.to"
}

In [9]:
scraper = CommonCrawlScraper()

df = scraper.scrape_all(gov_domains)

2025-02-08 12:24:26,160 - INFO - Starting scrape for United States pattern: *.gov
2025-02-08 12:24:26,162 - INFO - Fetching page 0 for *.gov
2025-02-08 12:24:29,438 - INFO - Got 13208 results from page 0 for United States pattern *.gov
2025-02-08 12:24:30,443 - INFO - Fetching page 1 for *.gov
2025-02-08 12:24:33,715 - INFO - Got 15000 results from page 1 for United States pattern *.gov
2025-02-08 12:24:34,718 - INFO - Fetching page 2 for *.gov
2025-02-08 12:24:37,899 - INFO - Got 15000 results from page 2 for United States pattern *.gov
2025-02-08 12:24:38,905 - INFO - Fetching page 3 for *.gov
2025-02-08 12:24:42,240 - INFO - Got 15000 results from page 3 for United States pattern *.gov
2025-02-08 12:24:43,246 - INFO - Fetching page 4 for *.gov
2025-02-08 12:24:46,459 - INFO - Got 15000 results from page 4 for United States pattern *.gov
2025-02-08 12:24:47,461 - INFO - Fetching page 5 for *.gov
2025-02-08 12:24:51,053 - INFO - Got 15000 results from page 5 for United States pattern 

2025-02-08 12:28:12,293 - INFO - Got 15000 results from page 52 for United States pattern *.gov
2025-02-08 12:28:13,299 - INFO - Fetching page 53 for *.gov
2025-02-08 12:28:16,559 - INFO - Got 15000 results from page 53 for United States pattern *.gov
2025-02-08 12:28:17,563 - INFO - Fetching page 54 for *.gov
2025-02-08 12:28:20,777 - INFO - Got 15000 results from page 54 for United States pattern *.gov
2025-02-08 12:28:21,783 - INFO - Fetching page 55 for *.gov
2025-02-08 12:28:25,059 - INFO - Got 15000 results from page 55 for United States pattern *.gov
2025-02-08 12:28:26,065 - INFO - Fetching page 56 for *.gov
2025-02-08 12:28:29,289 - INFO - Got 15000 results from page 56 for United States pattern *.gov
2025-02-08 12:28:30,297 - INFO - Fetching page 57 for *.gov
2025-02-08 12:28:33,555 - INFO - Got 15000 results from page 57 for United States pattern *.gov
2025-02-08 12:28:34,563 - INFO - Fetching page 58 for *.gov
2025-02-08 12:28:37,913 - INFO - Got 15000 results from page 58 

2025-02-08 12:32:00,100 - INFO - Got 15000 results from page 41 for United States pattern *.mil
2025-02-08 12:32:01,106 - INFO - Fetching page 42 for *.mil
2025-02-08 12:32:57,617 - INFO - Got 15000 results from page 42 for United States pattern *.mil
2025-02-08 12:32:58,628 - INFO - Fetching page 43 for *.mil
2025-02-08 12:33:02,520 - INFO - Got 15000 results from page 43 for United States pattern *.mil
2025-02-08 12:33:03,525 - INFO - Fetching page 44 for *.mil
2025-02-08 12:33:06,833 - INFO - Got 15000 results from page 44 for United States pattern *.mil
2025-02-08 12:33:07,839 - INFO - Fetching page 45 for *.mil
2025-02-08 12:33:10,938 - INFO - Got 15000 results from page 45 for United States pattern *.mil
2025-02-08 12:33:11,944 - INFO - Fetching page 46 for *.mil
2025-02-08 12:33:15,064 - INFO - Got 15000 results from page 46 for United States pattern *.mil
2025-02-08 12:33:16,070 - INFO - Fetching page 47 for *.mil
2025-02-08 12:33:19,203 - INFO - Got 15000 results from page 47 

2025-02-08 12:37:27,168 - INFO - Got 15000 results from page 35 for Canada pattern *.gc.ca
2025-02-08 12:37:28,175 - INFO - Fetching page 36 for *.gc.ca
2025-02-08 12:37:31,977 - INFO - Got 15000 results from page 36 for Canada pattern *.gc.ca
2025-02-08 12:37:32,984 - INFO - Fetching page 37 for *.gc.ca
2025-02-08 12:37:36,278 - INFO - Got 12970 results from page 37 for Canada pattern *.gc.ca
2025-02-08 12:37:37,285 - INFO - Fetching page 38 for *.gc.ca
2025-02-08 12:37:37,381 - ERROR - Error fetching page 38 for *.gc.ca: 400 Client Error: Bad Request for url: https://index.commoncrawl.org/CC-MAIN-2024-30-index?url=%2A.gc.ca&output=json&page=38
2025-02-08 12:37:46,786 - INFO - Saved intermediate results for Canada pattern *.gc.ca
2025-02-08 12:37:46,789 - INFO - Starting scrape for Canada pattern: *.canada.ca
2025-02-08 12:37:46,790 - INFO - Fetching page 0 for *.canada.ca
2025-02-08 12:37:50,457 - INFO - Got 13384 results from page 0 for Canada pattern *.canada.ca
2025-02-08 12:37:51

2025-02-08 12:40:13,823 - INFO - Fetching page 6 for *.gob.do
2025-02-08 12:40:15,316 - INFO - Got 3796 results from page 6 for Dominican Republic pattern *.gob.do
2025-02-08 12:40:16,321 - INFO - Fetching page 7 for *.gob.do
2025-02-08 12:40:16,417 - ERROR - Error fetching page 7 for *.gob.do: 400 Client Error: Bad Request for url: https://index.commoncrawl.org/CC-MAIN-2024-30-index?url=%2A.gob.do&output=json&page=7
2025-02-08 12:40:18,700 - INFO - Saved intermediate results for Dominican Republic pattern *.gob.do
2025-02-08 12:40:18,702 - INFO - Starting scrape for Costa Rica pattern: *.go.cr
2025-02-08 12:40:18,703 - INFO - Fetching page 0 for *.go.cr
2025-02-08 12:40:22,607 - INFO - Got 13896 results from page 0 for Costa Rica pattern *.go.cr
2025-02-08 12:40:23,613 - INFO - Fetching page 1 for *.go.cr
2025-02-08 12:40:27,255 - INFO - Got 15000 results from page 1 for Costa Rica pattern *.go.cr
2025-02-08 12:40:28,257 - INFO - Fetching page 2 for *.go.cr
2025-02-08 12:40:31,611 - I

2025-02-08 12:42:47,021 - INFO - Fetching page 9 for *.gov.br
2025-02-08 12:42:50,915 - INFO - Got 15000 results from page 9 for Brazil pattern *.gov.br
2025-02-08 12:42:51,920 - INFO - Fetching page 10 for *.gov.br
2025-02-08 12:42:56,881 - INFO - Got 15000 results from page 10 for Brazil pattern *.gov.br
2025-02-08 12:42:57,887 - INFO - Fetching page 11 for *.gov.br
2025-02-08 12:43:01,946 - INFO - Got 15000 results from page 11 for Brazil pattern *.gov.br
2025-02-08 12:43:02,952 - INFO - Fetching page 12 for *.gov.br
2025-02-08 12:43:06,564 - INFO - Got 15000 results from page 12 for Brazil pattern *.gov.br
2025-02-08 12:43:07,570 - INFO - Fetching page 13 for *.gov.br
2025-02-08 12:43:11,195 - INFO - Got 15000 results from page 13 for Brazil pattern *.gov.br
2025-02-08 12:43:12,201 - INFO - Fetching page 14 for *.gov.br
2025-02-08 12:43:15,870 - INFO - Got 15000 results from page 14 for Brazil pattern *.gov.br
2025-02-08 12:43:16,872 - INFO - Fetching page 15 for *.gov.br
2025-02-0

2025-02-08 12:47:21,074 - INFO - Fetching page 62 for *.gov.br
2025-02-08 12:47:24,826 - INFO - Got 15000 results from page 62 for Brazil pattern *.gov.br
2025-02-08 12:47:25,831 - INFO - Fetching page 63 for *.gov.br
2025-02-08 12:47:29,703 - INFO - Got 15000 results from page 63 for Brazil pattern *.gov.br
2025-02-08 12:47:30,708 - INFO - Fetching page 64 for *.gov.br
2025-02-08 12:47:34,546 - INFO - Got 15000 results from page 64 for Brazil pattern *.gov.br
2025-02-08 12:47:35,551 - INFO - Fetching page 65 for *.gov.br
2025-02-08 12:47:39,301 - INFO - Got 15000 results from page 65 for Brazil pattern *.gov.br
2025-02-08 12:47:40,307 - INFO - Fetching page 66 for *.gov.br
2025-02-08 12:47:44,886 - INFO - Got 15000 results from page 66 for Brazil pattern *.gov.br
2025-02-08 12:47:45,889 - INFO - Fetching page 67 for *.gov.br
2025-02-08 12:47:50,608 - INFO - Got 15000 results from page 67 for Brazil pattern *.gov.br
2025-02-08 12:47:51,614 - INFO - Fetching page 68 for *.gov.br
2025-02

2025-02-08 12:52:00,909 - INFO - Fetching page 4 for *.gov.co
2025-02-08 12:52:04,465 - INFO - Got 15000 results from page 4 for Colombia pattern *.gov.co
2025-02-08 12:52:05,471 - INFO - Fetching page 5 for *.gov.co
2025-02-08 12:52:09,364 - INFO - Got 15000 results from page 5 for Colombia pattern *.gov.co
2025-02-08 12:52:10,370 - INFO - Fetching page 6 for *.gov.co
2025-02-08 12:52:13,929 - INFO - Got 15000 results from page 6 for Colombia pattern *.gov.co
2025-02-08 12:52:14,934 - INFO - Fetching page 7 for *.gov.co
2025-02-08 12:52:18,386 - INFO - Got 15000 results from page 7 for Colombia pattern *.gov.co
2025-02-08 12:52:19,392 - INFO - Fetching page 8 for *.gov.co
2025-02-08 12:52:22,914 - INFO - Got 15000 results from page 8 for Colombia pattern *.gov.co
2025-02-08 12:52:23,920 - INFO - Fetching page 9 for *.gov.co
2025-02-08 12:52:27,470 - INFO - Got 15000 results from page 9 for Colombia pattern *.gov.co
2025-02-08 12:52:28,476 - INFO - Fetching page 10 for *.gov.co
2025-02

2025-02-08 12:55:40,824 - INFO - Fetching page 3 for *.gob.ec
2025-02-08 12:55:44,544 - INFO - Got 15000 results from page 3 for Ecuador pattern *.gob.ec
2025-02-08 12:55:45,550 - INFO - Fetching page 4 for *.gob.ec
2025-02-08 12:55:48,956 - INFO - Got 15000 results from page 4 for Ecuador pattern *.gob.ec
2025-02-08 12:55:49,961 - INFO - Fetching page 5 for *.gob.ec
2025-02-08 12:55:53,617 - INFO - Got 15000 results from page 5 for Ecuador pattern *.gob.ec
2025-02-08 12:55:54,620 - INFO - Fetching page 6 for *.gob.ec
2025-02-08 12:55:57,794 - INFO - Got 15000 results from page 6 for Ecuador pattern *.gob.ec
2025-02-08 12:55:58,799 - INFO - Fetching page 7 for *.gob.ec
2025-02-08 12:56:02,053 - INFO - Got 15000 results from page 7 for Ecuador pattern *.gob.ec
2025-02-08 12:56:03,059 - INFO - Fetching page 8 for *.gob.ec
2025-02-08 12:56:06,790 - INFO - Got 15000 results from page 8 for Ecuador pattern *.gob.ec
2025-02-08 12:56:07,797 - INFO - Fetching page 9 for *.gob.ec
2025-02-08 12:

2025-02-08 12:58:22,459 - INFO - Got 15000 results from page 7 for United Kingdom pattern *.gov.uk
2025-02-08 12:58:23,465 - INFO - Fetching page 8 for *.gov.uk
2025-02-08 12:58:26,416 - INFO - Got 15000 results from page 8 for United Kingdom pattern *.gov.uk
2025-02-08 12:58:27,422 - INFO - Fetching page 9 for *.gov.uk
2025-02-08 12:58:32,281 - INFO - Got 15000 results from page 9 for United Kingdom pattern *.gov.uk
2025-02-08 12:58:33,287 - INFO - Fetching page 10 for *.gov.uk
2025-02-08 12:58:36,648 - INFO - Got 15000 results from page 10 for United Kingdom pattern *.gov.uk
2025-02-08 12:58:37,655 - INFO - Fetching page 11 for *.gov.uk
2025-02-08 12:58:40,658 - INFO - Got 15000 results from page 11 for United Kingdom pattern *.gov.uk
2025-02-08 12:58:41,663 - INFO - Fetching page 12 for *.gov.uk
2025-02-08 12:58:44,816 - INFO - Got 15000 results from page 12 for United Kingdom pattern *.gov.uk
2025-02-08 12:58:45,820 - INFO - Fetching page 13 for *.gov.uk
2025-02-08 12:58:49,109 - I

2025-02-08 13:02:13,495 - INFO - Fetching page 58 for *.gov.uk
2025-02-08 13:02:17,064 - INFO - Got 15000 results from page 58 for United Kingdom pattern *.gov.uk
2025-02-08 13:02:18,070 - INFO - Fetching page 59 for *.gov.uk
2025-02-08 13:02:22,182 - INFO - Got 15000 results from page 59 for United Kingdom pattern *.gov.uk
2025-02-08 13:02:23,188 - INFO - Fetching page 60 for *.gov.uk
2025-02-08 13:02:26,684 - INFO - Got 15000 results from page 60 for United Kingdom pattern *.gov.uk
2025-02-08 13:02:27,690 - INFO - Fetching page 61 for *.gov.uk
2025-02-08 13:02:31,883 - INFO - Got 15000 results from page 61 for United Kingdom pattern *.gov.uk
2025-02-08 13:02:32,911 - INFO - Fetching page 62 for *.gov.uk
2025-02-08 13:02:36,300 - INFO - Got 15000 results from page 62 for United Kingdom pattern *.gov.uk
2025-02-08 13:02:37,306 - INFO - Fetching page 63 for *.gov.uk
2025-02-08 13:02:44,531 - INFO - Got 15000 results from page 63 for United Kingdom pattern *.gov.uk
2025-02-08 13:02:45,53

2025-02-08 13:06:52,746 - INFO - Got 15000 results from page 108 for United Kingdom pattern *.gov.uk
2025-02-08 13:06:53,752 - INFO - Fetching page 109 for *.gov.uk
2025-02-08 13:07:00,365 - INFO - Got 15000 results from page 109 for United Kingdom pattern *.gov.uk
2025-02-08 13:07:01,372 - INFO - Fetching page 110 for *.gov.uk
2025-02-08 13:07:04,603 - INFO - Got 15000 results from page 110 for United Kingdom pattern *.gov.uk
2025-02-08 13:07:05,608 - INFO - Fetching page 111 for *.gov.uk
2025-02-08 13:07:10,702 - INFO - Got 15000 results from page 111 for United Kingdom pattern *.gov.uk
2025-02-08 13:07:11,705 - INFO - Fetching page 112 for *.gov.uk
2025-02-08 13:07:20,452 - INFO - Got 11831 results from page 112 for United Kingdom pattern *.gov.uk
2025-02-08 13:07:21,458 - INFO - Fetching page 113 for *.gov.uk
2025-02-08 13:07:21,559 - ERROR - Error fetching page 113 for *.gov.uk: 400 Client Error: Bad Request for url: https://index.commoncrawl.org/CC-MAIN-2024-30-index?url=%2A.gov.

2025-02-08 13:08:26,814 - INFO - Starting scrape for Czech Republic pattern: *.gov.cz
2025-02-08 13:08:26,816 - INFO - Fetching page 0 for *.gov.cz
2025-02-08 13:08:27,099 - ERROR - Error fetching page 0 for *.gov.cz: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
2025-02-08 13:08:27,101 - INFO - Starting scrape for Slovakia pattern: *.gov.sk
2025-02-08 13:08:27,102 - INFO - Fetching page 0 for *.gov.sk
2025-02-08 13:08:27,382 - ERROR - Error fetching page 0 for *.gov.sk: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
2025-02-08 13:08:27,383 - INFO - Starting scrape for Hungary pattern: *.gov.hu
2025-02-08 13:08:27,383 - INFO - Fetching page 0 for *.gov.hu
2025-02-08 13:08:27,666 - ERROR - Error fetching page 0 for *.gov.hu: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
2025-02-08 13:08:27,668 - INFO - Starting scrape for Romania pattern: *.gov.ro
2

2025-02-08 13:08:34,878 - INFO - Starting scrape for Singapore pattern: *.gov.sg
2025-02-08 13:08:34,878 - INFO - Fetching page 0 for *.gov.sg
2025-02-08 13:08:35,145 - ERROR - Error fetching page 0 for *.gov.sg: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
2025-02-08 13:08:35,146 - INFO - Starting scrape for Japan pattern: *.go.jp
2025-02-08 13:08:35,146 - INFO - Fetching page 0 for *.go.jp
2025-02-08 13:08:35,416 - ERROR - Error fetching page 0 for *.go.jp: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
2025-02-08 13:08:35,438 - INFO - Starting scrape for South Korea pattern: *.go.kr
2025-02-08 13:08:35,439 - INFO - Fetching page 0 for *.go.kr
2025-02-08 13:08:35,733 - ERROR - Error fetching page 0 for *.go.kr: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
2025-02-08 13:08:35,735 - INFO - Starting scrape for North Korea pattern: *.gov.kp
2025-02

2025-02-08 13:08:42,785 - INFO - Starting scrape for Israel pattern: *.gov.il
2025-02-08 13:08:42,786 - INFO - Fetching page 0 for *.gov.il
2025-02-08 13:08:43,097 - ERROR - Error fetching page 0 for *.gov.il: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
2025-02-08 13:08:43,099 - INFO - Starting scrape for Jordan pattern: *.gov.jo
2025-02-08 13:08:43,099 - INFO - Fetching page 0 for *.gov.jo
2025-02-08 13:08:43,383 - ERROR - Error fetching page 0 for *.gov.jo: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
2025-02-08 13:08:43,385 - INFO - Starting scrape for Lebanon pattern: *.gov.lb
2025-02-08 13:08:43,386 - INFO - Fetching page 0 for *.gov.lb
2025-02-08 13:08:43,681 - ERROR - Error fetching page 0 for *.gov.lb: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
2025-02-08 13:08:43,682 - INFO - Starting scrape for Oman pattern: *.gov.om
2025-02-08 13:

2025-02-08 13:08:50,645 - INFO - Starting scrape for Gambia pattern: *.gov.gm
2025-02-08 13:08:50,648 - INFO - Fetching page 0 for *.gov.gm
2025-02-08 13:08:50,925 - ERROR - Error fetching page 0 for *.gov.gm: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
2025-02-08 13:08:50,926 - INFO - Starting scrape for Guinea-Bissau pattern: *.gov.gw
2025-02-08 13:08:50,926 - INFO - Fetching page 0 for *.gov.gw
2025-02-08 13:08:51,257 - ERROR - Error fetching page 0 for *.gov.gw: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
2025-02-08 13:08:51,262 - INFO - Starting scrape for Cape Verde pattern: *.gov.cv
2025-02-08 13:08:51,265 - INFO - Fetching page 0 for *.gov.cv
2025-02-08 13:08:51,663 - ERROR - Error fetching page 0 for *.gov.cv: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
2025-02-08 13:08:51,665 - INFO - Starting scrape for Kenya pattern: *.go.ke
2025

2025-02-08 13:08:58,977 - INFO - Starting scrape for New Zealand pattern: *.govt.nz
2025-02-08 13:08:58,977 - INFO - Fetching page 0 for *.govt.nz
2025-02-08 13:08:59,249 - ERROR - Error fetching page 0 for *.govt.nz: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
2025-02-08 13:08:59,252 - INFO - Starting scrape for Papua New Guinea pattern: *.gov.pg
2025-02-08 13:08:59,253 - INFO - Fetching page 0 for *.gov.pg
2025-02-08 13:08:59,583 - ERROR - Error fetching page 0 for *.gov.pg: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
2025-02-08 13:08:59,586 - INFO - Starting scrape for Fiji pattern: *.gov.fj
2025-02-08 13:08:59,587 - INFO - Fetching page 0 for *.gov.fj
2025-02-08 13:08:59,869 - ERROR - Error fetching page 0 for *.gov.fj: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
2025-02-08 13:08:59,871 - INFO - Starting scrape for Solomon Islands patter

In [10]:
df.head()

Unnamed: 0,urlkey,timestamp,url,mime,mime-detected,status,digest,length,offset,filename,redirect,country,pattern,languages,encoding,truncated
0,"gov,18f)/robots.txt",20240714194446,https://18f.gov/robots.txt,text/html,text/html,302,6A7BQRQYC732MVDMRP4PVDLINV7DBLFA,905,638196,crawl-data/CC-MAIN-2024-30/segments/1720763514...,https://18f.gsa.gov/robots.txt,United States,*.gov,,,
1,"gov,18f)/sitemap.xml",20240714194447,https://18f.gov/sitemap.xml,text/html,text/html,302,6A7BQRQYC732MVDMRP4PVDLINV7DBLFA,904,2750660,crawl-data/CC-MAIN-2024-30/segments/1720763514...,https://18f.gsa.gov/sitemap.xml,United States,*.gov,,,
2,"gov,18f,ads)/buyers",20240725131945,https://ads.18f.gov/buyers/,text/html,text/html,301,HLNR6AWVWYCU3YAENY3HYHLIPNWN66X7,867,3223529,crawl-data/CC-MAIN-2024-30/segments/1720763858...,https://github.com/18F/ads-bpa,United States,*.gov,,,
3,"gov,18f,ads)/robots.txt",20240725131944,https://ads.18f.gov/robots.txt,text/html,text/html,301,HLNR6AWVWYCU3YAENY3HYHLIPNWN66X7,864,465277,crawl-data/CC-MAIN-2024-30/segments/1720763858...,https://github.com/18F/ads-bpa,United States,*.gov,,,
4,"gov,18f,agile)/",20240725135610,https://agile.18f.gov/,text/html,text/html,301,HLNR6AWVWYCU3YAENY3HYHLIPNWN66X7,853,3051965,crawl-data/CC-MAIN-2024-30/segments/1720763858...,https://guides.18f.gov/agile/,United States,*.gov,,,


In [11]:
def extract_domains(df: pd.DataFrame, url_column: str) -> pd.DataFrame:
    """
    Extract unique domains/subdomains from URLs, removing trailing files and paths.
    
    Args:
        df: DataFrame containing URLs
        url_column: Name of column containing URLs
        
    Returns:
        DataFrame with unique domains and their counts
    """
    def clean_url(url: str) -> str:
        try:
            parsed = urlparse(url)
            return parsed.netloc.lower()
        except:
            return None
    
    domains = df[url_column].apply(clean_url)
    
    domain_counts = domains.value_counts().reset_index()
    domain_counts.columns = ['domain', 'count']
    
    return domain_counts

In [12]:
domains_df = extract_domains(df, 'url')
domains_df

Unnamed: 0,domain,count
0,dime.dot.ca.gov,43569
1,find-and-update.company-information.service.go...,39848
2,www.veterans.gc.ca,35973
3,www.spa.usace.army.mil,34440
4,www.goodfellow.af.mil,34264
...,...,...
44325,cbdn.gob.do,1
44326,www.capgefi.gob.do,1
44327,www.ssmc.gob.cl,1
44328,bishopscanningsparishcouncil.gov.uk,1
