In [10]:
import json
import os
import re
import time
from collections import Counter
from datetime import datetime
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup


In [11]:
class GovUKPolicyScraper:
    def __init__(self, base_dir="policy_data"):
        """
        Set up the HTTP session and make sure the output directory exists.

        Args:
            base_dir (str): Directory under which all downloaded data is
                stored. It is created if it does not exist yet.
        """
        self.base_dir = base_dir
        self.base_url = "https://www.gov.uk"

        # A single shared session, so every request sends the same User-Agent.
        http_session = requests.Session()
        http_session.headers['User-Agent'] = 'PolicyResearch/1.0 (Academic Research)'
        self.session = http_session

        os.makedirs(base_dir, exist_ok=True)
    
    def search_policies(self, search_term=None, page_limit=5, exclude_types=None):
        """
        Search for policy papers on gov.uk
        
        Args:
            search_term (str): Optional keyword to search for
            page_limit (int): Maximum number of pages to scrape
            exclude_types (list): Types of documents to exclude, e.g. ['consultations']
            
        Returns:
            pd.DataFrame: DataFrame containing search results
        """
        all_results = []
        
        # Use only policy-papers, not consultations
        search_url = f"{self.base_url}/search/policy-papers-and-consultations"
        
        params = {
            'keywords': search_term if search_term else '',
            'content_store_document_type': 'policy_papers'
        }
        
        print(f"Starting full extraction process for all policy papers")
        print(f"Scraping all policy papers (up to {page_limit} pages)")
        
        for page in range(1, page_limit + 1):
            params['page'] = page
            try:
                print(f"Scraping page {page}...")
                response = self.session.get(search_url, params=params)
                response.raise_for_status()
                
                soup = BeautifulSoup(response.text, 'html.parser')
                results_container = soup.find('div', id='js-results')
                
                if not results_container:
                    print(f"Could not find results container with id 'js-results' on page {page}")
                    continue
                    
                results_list = results_container.find_all('li', class_='gem-c-document-list__item')
                
                if not results_list:
                    print(f"No results found on page {page}, stopping pagination")
                    break
                
                print(f"Found {len(results_list)} results on page {page}")
                
                page_results = []
                for item in results_list:
                    # Skip consultations if in exclude_types
                    if exclude_types and any(excluded in item.text.lower() for excluded in exclude_types):
                        continue
                        
                    policy_data = self._extract_policy_data(item)
                    if policy_data:
                        # Extract categories
                        policy_url = policy_data['url']
                        categories = self._extract_categories(policy_url)
                        policy_data.update(categories)
                        
                        page_results.append(policy_data)
                        print(f"Extracted: {policy_data['title']}")
                
                all_results.extend(page_results)
                print(f"Total results so far: {len(all_results)}")
                
                # Add a small delay to be respectful to the server
                time.sleep(2)
                
            except requests.exceptions.RequestException as e:
                print(f"Error fetching page {page}: {e}")
                break
        
        policies_df = pd.DataFrame(all_results)
        print(f"Found {len(policies_df)} policies")
        
        # Save the policies data
        if not policies_df.empty:
            csv_path = os.path.join(self.base_dir, "all_policies.csv")
            policies_df.to_csv(csv_path, index=False)
            print(f"Saved policies data to {csv_path}")
        
        return policies_df
    
    # def _extract_policy_data(self, item):
    #     """
    #     Extract policy data from a search result item
        
    #     Args:
    #         item: BeautifulSoup object for a search result
            
    #     Returns:
    #         dict: Dictionary with policy data
    #     """
    #     try:
    #         title_tag = item.find('a', class_='gem-c-document-list__item-title')
    #         if not title_tag:
    #             # Try alternative selector
    #             title_tag = item.select_one('div.gem-c-document-list__item-title a')
    #             if not title_tag:
    #                 return None
                    
    #         title = title_tag.text.strip()
    #         link = title_tag.get('href')
    #         full_link = self.base_url + link if link.startswith('/') else link
            
    #         # Extract metadata
    #         metadata_container = item.find('ul', class_='gem-c-document-list__item-metadata')
    #         metadata = {
    #             'published_date': None,
    #             'department': None,
    #             'type': None
    #         }
            
    #         if metadata_container:
    #             metadata_items = metadata_container.find_all('li')
    #             for meta_item in metadata_items:
    #                 text = meta_item.text.strip()
    #                 if "Published" in text:
    #                     metadata['published_date'] = text.replace("Published: ", "")
    #                 elif "Organisation" in text or "Department" in text or "From" in text:
    #                     metadata['department'] = text.replace("Organisation: ", "").replace("Department: ", "").replace("From: ", "")
    #                 else:
    #                     metadata['type'] = text
            
    #         # Extract description
    #         description_tag = item.find('p', class_='gem-c-document-list__item-description')
    #         description = description_tag.text.strip() if description_tag else None
            
    #         return {
    #             'title': title,
    #             'url': full_link,
    #             'description': description,
    #             **metadata
    #         }
            
    #     except Exception as e:
    #         print(f"Error extracting policy data: {e}")
    #         return None

    
    def _extract_policy_data(self, item):
        """
        Extract policy data from a search result item
        
        Args:
            item: BeautifulSoup object for a search result
            
        Returns:
            dict: Dictionary with policy data
        """
        try:
            title_tag = item.find('a', class_='gem-c-document-list__item-title')
            if not title_tag:
                # Try alternative selector
                title_tag = item.select_one('div.gem-c-document-list__item-title a')
                if not title_tag:
                    return None
                    
            title = title_tag.text.strip()
            link = title_tag.get('href')
            full_link = self.base_url + link if link.startswith('/') else link
            
            # Extract metadata
            metadata_container = item.find('ul', class_='gem-c-document-list__item-metadata')
            metadata = {
                'published_date': None,
                'updated_date': None,
                'department': None,
                'type': None
            }
            
            if metadata_container:
                metadata_items = metadata_container.find_all('li')
                for meta_item in metadata_items:
                    text = meta_item.text.strip()
                    if "Published" in text:
                        date_part = text.replace("Published: ", "").strip()
                        metadata['published_date'] = date_part
                    elif "Organisation" in text or "Department" in text or "From" in text:
                        metadata['department'] = text.replace("Organisation: ", "").replace("Department: ", "").replace("From: ", "")
                    else:
                        metadata['type'] = text
            
            # If dates weren't found in the list metadata, try to get them from the policy page metadata
            if (not metadata['published_date'] or not metadata['updated_date']) and full_link:
                try:
                    # Get the policy detail page
                    response = self.session.get(full_link)
                    response.raise_for_status()
                    
                    detail_soup = BeautifulSoup(response.text, 'html.parser')
                    
                    # Look for the published date in the head metadata
                    published_meta = detail_soup.find('meta', attrs={'name': 'govuk:first-published-at'})
                    if published_meta:
                        published_date = published_meta.get('content')
                        if published_date:
                            # Convert ISO format to more readable format (optional)
                            try:
                                from datetime import datetime
                                dt = datetime.fromisoformat(published_date.replace('Z', '+00:00'))
                                metadata['published_date'] = dt.strftime('%d %B %Y')
                            except Exception:
                                # If date parsing fails, just use the original string
                                metadata['published_date'] = published_date
                    
                    # Look for the updated date in the head metadata
                    updated_meta = detail_soup.find('meta', attrs={'name': 'govuk:updated-at'})
                    if updated_meta:
                        updated_date = updated_meta.get('content')
                        if updated_date:
                            # Convert ISO format to more readable format (optional)
                            try:
                                from datetime import datetime
                                dt = datetime.fromisoformat(updated_date.replace('Z', '+00:00'))
                                metadata['updated_date'] = dt.strftime('%d %B %Y')
                            except Exception:
                                # If date parsing fails, just use the original string
                                metadata['updated_date'] = updated_date
                    
                    # If still no published date, try the public web published date
                    if not metadata['published_date']:
                        public_meta = detail_soup.find('meta', attrs={'name': 'govuk:public-updated-at'})
                        if public_meta:
                            public_date = public_meta.get('content')
                            if public_date:
                                try:
                                    from datetime import datetime
                                    dt = datetime.fromisoformat(public_date.replace('Z', '+00:00'))
                                    metadata['published_date'] = dt.strftime('%d %B %Y')
                                except Exception:
                                    metadata['published_date'] = public_date
                    
                    # Add a small delay to be respectful to the server
                    time.sleep(1)
                
                except Exception as e:
                    print(f"Error fetching policy page for date extraction: {e}")
            
            # Extract description
            description_tag = item.find('p', class_='gem-c-document-list__item-description')
            description = description_tag.text.strip() if description_tag else None
            
            return {
                'title': title,
                'url': full_link,
                'description': description,
                **metadata
            }
            
        except Exception as e:
            print(f"Error extracting policy data: {e}")
            return None
    
    def _extract_categories(self, url):
        """
        Extract categories from a policy page
        
        Args:
            url (str): URL of the policy page
            
        Returns:
            dict: Dictionary with category information
        """
        try:
            response = self.session.get(url)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.text, 'html.parser')
            
            categories = {
                'category': None,
                'subcategory': None
            }
            
            # Try to find categories from breadcrumbs
            breadcrumb = soup.find('nav', class_='govuk-breadcrumbs')
            if breadcrumb:
                breadcrumb_items = breadcrumb.find_all('li', class_='govuk-breadcrumbs__list-item')
                
                # Breadcrumb structure is typically: Home > Category > Subcategory
                if len(breadcrumb_items) >= 2:
                    # Skip "Home", get category
                    category_item = breadcrumb_items[1]
                    category_link = category_item.find('a')
                    if category_link:
                        categories['category'] = category_link.text.strip()
                
                # Get subcategory if available
                if len(breadcrumb_items) >= 3:
                    subcategory_item = breadcrumb_items[2]
                    subcategory_link = subcategory_item.find('a')
                    if subcategory_link:
                        categories['subcategory'] = subcategory_link.text.strip()
            
            # If not found in breadcrumbs, try alternative methods
            if not categories['category']:
                # Look for topic tags
                topic_tags = soup.find_all('a', class_=lambda c: c and 'topic' in (c or '').lower())
                if topic_tags:
                    categories['category'] = topic_tags[0].text.strip()
            
            return categories
            
        except Exception as e:
            print(f"Error extracting categories: {e}")
            return {'category': None, 'subcategory': None}
    
    def download_policy_attachments(self, policies_df):
        """
        Download and organize attachments from policies
        
        Args:
            policies_df (pd.DataFrame): DataFrame with policy data
            
        Returns:
            int: Number of attachments downloaded
        """
        download_count = 0
        
        if policies_df.empty:
            print("No policies to process")
            return download_count
        
        print(f"Downloading attachments for {len(policies_df)} policies...")
        
        for _, policy in policies_df.iterrows():
            try:
                title = policy['title']
                url = policy['url']
                category = policy.get('category', 'Uncategorized')
                subcategory = policy.get('subcategory', 'General')
                
                print(f"Processing: {title}")
                
                # Create folder structure
                category_dir = self._sanitize_filename(category)
                subcategory_dir = self._sanitize_filename(subcategory)
                policy_dir = self._sanitize_filename(title)
                
                full_path = os.path.join(self.base_dir, category_dir, subcategory_dir, policy_dir)
                os.makedirs(full_path, exist_ok=True)
                
                # Get policy detail page
                response = self.session.get(url)
                response.raise_for_status()
                
                soup = BeautifulSoup(response.text, 'html.parser')
                
                # Find attachments
                attachment_sections = soup.find_all(['div', 'section'], class_=lambda c: c and 'attachment' in c)
                if not attachment_sections:
                    print(f"No attachments found for: {title}")
                    continue
                
                # Extract and download attachments
                policy_attachments = []
                for section in attachment_sections:
                    attachment_links = section.find_all('a')
                    for link in attachment_links:
                        href = link.get('href')
                        if not href:
                            continue
                            
                        # Only download document files
                        if re.search(r'\.(pdf|doc|docx|xls|xlsx|ppt|pptx|csv)$', href, re.I):
                            attachment_url = urljoin(self.base_url, href)
                            filename = os.path.basename(href)
                            save_path = os.path.join(full_path, filename)
                            
                            # Download the file
                            print(f"Downloading: {filename}")
                            try:
                                file_response = self.session.get(attachment_url, stream=True)
                                file_response.raise_for_status()
                                
                                with open(save_path, 'wb') as f:
                                    for chunk in file_response.iter_content(chunk_size=8192):
                                        f.write(chunk)
                                
                                policy_attachments.append({
                                    'filename': filename,
                                    'url': attachment_url,
                                    'path': save_path
                                })
                                
                                download_count += 1
                                print(f"Saved to: {save_path}")
                                
                                # Be nice to the server
                                time.sleep(1)
                                
                            except Exception as e:
                                print(f"Error downloading {filename}: {e}")
                
                # Create metadata file with policy information
                metadata = {
                    'title': title,
                    'url': url,
                    'published_date': policy.get('published_date'),
                    'department': policy.get('department'),
                    'category': category,
                    'subcategory': subcategory,
                    'description': policy.get('description'),
                    'attachments': policy_attachments
                }
                
                metadata_path = os.path.join(full_path, 'metadata.json')
                import json
                with open(metadata_path, 'w', encoding='utf-8') as f:
                    json.dump(metadata, f, indent=2, ensure_ascii=False)
                
            except Exception as e:
                print(f"Error processing policy {policy['title']}: {e}")
        
        print(f"Total attachments downloaded: {download_count}")
        return download_count
    
    def categorize_policies(self, policies_df):
        """
        Organize policies by category and subcategory
        
        Args:
            policies_df (pd.DataFrame): DataFrame with policy data
            
        Returns:
            dict: Dictionary with categorized policies
        """
        if policies_df.empty:
            return {}
        
        # Initialize category structure
        categories = {}
        
        # Group by category and subcategory
        for _, policy in policies_df.iterrows():
            category = policy.get('category', 'Uncategorized')
            subcategory = policy.get('subcategory', 'General')
            
            if category not in categories:
                categories[category] = {'subcategories': {}}
            
            if subcategory not in categories[category]['subcategories']:
                categories[category]['subcategories'][subcategory] = {'policies': []}
            
            # Add policy to appropriate subcategory
            categories[category]['subcategories'][subcategory]['policies'].append({
                'title': policy['title'],
                'url': policy['url'],
                'published_date': policy.get('published_date'),
                'department': policy.get('department')
            })
        
        # Calculate counts
        for category, cat_data in categories.items():
            cat_policy_count = 0
            
            for subcategory, subcat_data in cat_data['subcategories'].items():
                policies_count = len(subcat_data['policies'])
                cat_data['subcategories'][subcategory]['count'] = policies_count
                cat_policy_count += policies_count
            
            cat_data['count'] = cat_policy_count
        
        # Save category structure
        import json
        categories_path = os.path.join(self.base_dir, 'category_structure.json')
        with open(categories_path, 'w', encoding='utf-8') as f:
            json.dump(categories, f, indent=2, ensure_ascii=False)
        
        print(f"Category structure saved to {categories_path}")
        
        # Print category summary
        print("\nCategory Summary:")
        print("=" * 40)
        for category, cat_data in categories.items():
            print(f"{category}: {cat_data['count']} policies")
            for subcategory, subcat_data in cat_data['subcategories'].items():
                print(f"  - {subcategory}: {subcat_data['count']} policies")
        
        return categories
    
    def _sanitize_filename(self, name):
        """
        Convert a string to a valid directory name
        
        Args:
            name (str): String to sanitize
            
        Returns:
            str: Sanitized string
        """
        if not name:
            return "Unknown"
            
        # Replace invalid characters
        s = re.sub(r'[\\/*?:"<>|]', '', name)
        # Replace multiple spaces with a single space
        s = re.sub(r'\s+', ' ', s)
        # Trim the name if it's too long
        if len(s) > 75:
            s = s[:75]
        
        return s

In [12]:
def main():
    """Run the whole pipeline: search, categorize, download, then summarize."""
    scraper = GovUKPolicyScraper(base_dir="policy_data")

    # Step 1: search and extract policies.
    # search_term=None means no keyword filter; page_limit is the number of
    # result pages to scrape.
    skipped_types = ["consultation", "open consultation", "closed consultation"]
    policies = scraper.search_policies(
        search_term=None,
        page_limit=1,
        exclude_types=skipped_types,
    )

    # Step 2: categorize the policies.
    categories = scraper.categorize_policies(policies)

    # Step 3: download and organize attachments.
    download_count = scraper.download_policy_attachments(policies)

    summary_lines = (
        "\nScraping Summary:",
        f"Total policies found: {len(policies)}",
        f"Total categories: {len(categories)}",
        f"Total attachments downloaded: {download_count}",
        f"All data has been organized in the '{scraper.base_dir}' directory",
    )
    for line in summary_lines:
        print(line)


if __name__ == "__main__":
    main()

Starting full extraction process for all policy papers
Scraping all policy papers (up to 1 pages)
Scraping page 1...
Found 20 results on page 1
Extracted: Crime and Policing Bill 2025: Delegated powers supplementary memoranda
Extracted: Crime and Policing Bill 2025: ECHR supplementary memoranda
Extracted: Neighbourhood Policing Guarantee performance framework
Extracted: EM on UK/EU TCA Specialised Committee decision (COM(2025)48)
Extracted: Programme of flood and coastal erosion risk management (FCERM) schemes
Extracted: Agenda of the 14th meeting of the Withdrawal Agreement Joint Committee, 29 April 2025
Extracted: Annual report on devolution 2023 to 2024
Extracted: Terrorism (Protection of Premises) Act 2025: factsheets
Extracted: Mpox control and elimination: UK strategy 2025 to 2026
Extracted: The Pall Mall Process Code of Practice for States
Extracted: Cyber governance mapping
Extracted: DSIT cyber security newsletter - April 2025
Extracted: Direction under regulation 2(2) of the 