In [131]:
# 11/9/23

In [132]:
# STEP 1: LIBRARIES AND SETTINGS

In [1]:
# Libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import PyPDF2
from io import BytesIO
from tqdm import tqdm
import time
import csv
from random import randint

# Other Settings
# pd.set_option('display.max.colwidth', None) # max display width

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options

# Headless (No GUI) Mode for Selenium
# Build the Chrome options object that is later passed to webdriver.Chrome(...).
chrome_options = Options()
chrome_options.add_argument('--headless')  # Run in headless mode

# NEW CODE for SELENIUM

### Open Connection and Accept Cookies

In [218]:
# Base URL https://www.regeringen.se/dokument-och-publikationer for the search query
url = "https://www.regeringen.se/dokument-och-publikationer"

# Set up the Selenium WebDriver.
# The path is a raw string: '\c' is not a valid escape sequence, so the plain
# literal only produced the intended backslash by accident and triggers an
# invalid-escape warning on current Python versions.
# NOTE(review): passing the driver path positionally is not accepted by every
# Selenium 4 release -- confirm against the installed Selenium version.
driver = webdriver.Chrome(r'chromedriver-win64\chromedriver.exe', options = chrome_options)  # You can use other web drivers like Firefox if you prefer
driver.maximize_window()

# Open the URL in the web browser
driver.get(url)

# Fixed pause before looking for the cookie banner.
time.sleep(10)

try:
    # The button carries several classes, so it is matched with a CSS selector
    # (".a.b.c") rather than By.CLASS_NAME, which is meant for a single class.
    button = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, ".btn.c-cookie__action.js-cookie-click"))
    )

    # Print confirmation
    print("Button found:", button.text)
    # Click the button
    button.click()
    print("Button clicked")

except Exception as e:
    # Best effort: report the problem and continue without accepting cookies.
    print("An error occurred:", str(e))

Button found: Ja, jag accepterar kakor.
Button clicked


### Prepare Function to Collect Publication Links and Publishing Information

In [219]:
# Notebook-wide accumulators: one for publication URLs, one for the matching
# publishing-info strings. Both start empty and are filled page by page.
all_links, all_publishing_info = [], []

In [220]:
def collect_publications(iteration):
    """Scrape the page currently loaded in ``driver`` and append the results
    to the module-level ``all_links`` and ``all_publishing_info`` lists.

    Parameters
    ----------
    iteration : int
        Page counter supplied by the caller. It is not used by the scraping
        logic; it is kept so existing calls keep working.

    Returns
    -------
    None
        Results are written to ``all_links`` and ``all_publishing_info``.
    """
    # Parse HTML
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Create lists to hold the key data for each publication on this page
    links = []
    publishing_info = []

    # Isolate where the publications are stored
    ul_tags = soup.find_all("ul", class_="list--block cl")

    for ul_tag in ul_tags:
        # Collect publication links, skipping hrefs that start with "/tx"
        for a in ul_tag.find_all("a", href = True):
            link = a["href"]
            if not link.startswith("/tx"):
                links.append("https://www.regeringen.se" + link)

        # Collect publishing info from THIS list only. Searching the whole
        # document here would append every info block once per matching <ul>,
        # duplicating entries and misaligning them with the links.
        for publication_div in ul_tag.find_all('div', class_='block--timeLinks'):
            publishing_info.append(publication_div.get_text(strip = True))

    # Add the scraped data from the current page to the collection of all data
    all_links.extend(links)
    all_publishing_info.extend(publishing_info)

    

### Run Publication Collection Loop and Close Driver

In [221]:
# Number of result pages to walk through, and the page the driver starts on.
last_page = 1464
page_number = 1


while page_number <= last_page:
    # Scrape whatever page the driver is currently showing.
    collect_publications(page_number)

    # The final page has just been scraped; do not click "next" beyond it.
    if page_number == last_page:
        break

    # Next Page
    try:
        next_link = WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "li.nav--pagination__next a.filter-pagination"))
        )

        print("Next link found:", next_link.text)
        next_link.click()
        print("Next link clicked")

        # Sleep to let the new page load (randomised 3-5 s pause)
        time.sleep(randint(3, 5))

        # Increment page number
        page_number += 1
        print(page_number)

        # No re-parsing here: collect_publications() builds its own
        # BeautifulSoup object from driver.page_source on the next iteration.

    except Exception as e:
        # Either the "next" link never appeared within the wait (e.g. we ran
        # out of pages) or the click failed; report it and stop paginating.
        print("An error occurred or ", page_number, " was the last page", str(e))
        break

Next link found: Nästa
Next link clicked
2
Next link found: Nästa
Next link clicked
3
Next link found: Nästa
Next link clicked
4
Next link found: Nästa
Next link clicked
5
Next link found: Nästa
Next link clicked
6
Next link found: Nästa
Next link clicked
7
Next link found: Nästa
Next link clicked
8
Next link found: Nästa
Next link clicked
9
Next link found: Nästa
Next link clicked
10
Next link found: Nästa
Next link clicked
11
Next link found: Nästa
Next link clicked
12
Next link found: Nästa
Next link clicked
13
Next link found: Nästa
Next link clicked
14
Next link found: Nästa
Next link clicked
15
Next link found: Nästa
Next link clicked
16
Next link found: Nästa
Next link clicked
17
Next link found: Nästa
Next link clicked
18
Next link found: Nästa
Next link clicked
19
Next link found: Nästa
Next link clicked
20
Next link found: Nästa
Next link clicked
21
Next link found: Nästa
Next link clicked
22
Next link found: Nästa
Next link clicked
23
Next link found: Nästa
Next link clicked

In [222]:
# Close Driver
# Shut down the browser session once all pages have been collected.
driver.quit()

### Organize Output

In [223]:
# zip() stops at the shorter list, so a count mismatch would otherwise be
# dropped silently and could leave links paired with the wrong info.
if len(all_links) != len(all_publishing_info):
    print(
        f"Warning: {len(all_links)} links but {len(all_publishing_info)} "
        "publishing-info entries; extra items are dropped and rows may be misaligned."
    )

# Combine the lists into pairs using zip
output_data = list(zip(all_links, all_publishing_info))

# Output DF
output_df = pd.DataFrame(output_data, columns=['Link', 'Publishing Info'])

In [224]:
# Remove fully duplicated rows, keeping the first occurrence of each.
# Duplicates were the cause of an earlier error, so verify this step before
# moving on to script mode.
output_df_unique = output_df.drop_duplicates(subset=None, keep='first')

### Save DataFrame to CSV

In [226]:
# Save the de-duplicated DataFrame to a CSV file (the row index is not written)
output_df_unique.to_csv('publication_info.csv', index=False)

In [3]:
# NOTE(review): `article_link_directory` is not defined in any cell shown in
# this notebook; presumably it was loaded in another session -- confirm and
# add the loading step so this cell runs on a fresh kernel.
article_link_directory

Unnamed: 0,Content Links,Publishing Dates
0,https://www.regeringen.se/remisser/2023/11/inb...,Publicerad02 november 2023· Uppdaterad10 novem...
1,https://www.regeringen.se/regeringsuppdrag/202...,Publicerad10 november 2023·Regeringsuppdragfrå...
2,https://www.regeringen.se/remisser/2023/06/rem...,Publicerad27 juni 2023· Uppdaterad10 november ...
3,https://www.regeringen.se/remisser/2023/09/rem...,Publicerad08 september 2023· Uppdaterad10 nove...
4,https://www.regeringen.se/remisser/2023/07/rem...,Publicerad07 juli 2023· Uppdaterad10 november ...
...,...,...
27074,https://www.regeringen.se/informationsmaterial...,Publicerad25 juni 2007·Informationsmaterialfrå...
27075,https://www.regeringen.se/informationsmaterial...,Publicerad21 maj 2007·Informationsmaterialfrån...
27076,https://www.regeringen.se/internationella-mr-g...,Publicerad01 maj 2007·Internationella MR-grans...
27077,https://www.regeringen.se/internationella-mr-g...,Publicerad30 april 2007·Internationella MR-gra...
