# Load Libraries and Set Settings

In [1]:
# Libraries
import csv
import sys
import time
from io import BytesIO
from random import randint
from urllib.parse import urljoin

import pandas as pd
import PyPDF2
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# Other Settings
# Show full cell contents (long URLs / scraped text) instead of truncating them.
# Uses the exact option key; the dotted spelling 'display.max.colwidth' is not a
# real option name and presumably only resolved through pandas' regex key matching.
pd.set_option('display.max_colwidth', None)

# Load Data

In [19]:
# Location of the CSV listing the content pages to scrape
csv_file_path = "publication_info_problem.csv"  # Update with the correct file path

# Read the CSV and relabel its two columns in a single chained expression
existing_data = pd.read_csv(csv_file_path).set_axis(
    ['Content Links', 'Publishing Dates'], axis=1
)

# Work on an independent copy so the loaded frame is left untouched
article_link_directory = existing_data.copy()

# Collect the PDF link from each content page; if a page contains no PDF, grab all of its text instead

### Collect data

In [20]:
# Sentinel stored in place of a link / text when a page yields none
no_content_alert = "NO CONTENT"

# Seconds to wait for the server before abandoning a request (avoids hanging forever)
REQUEST_TIMEOUT_SECONDS = 30

# Inclusive bounds, in seconds, of the random pause between two requests
MIN_DELAY_SECONDS = 1
MAX_DELAY_SECONDS = 3


def _scrape_content_page(link):
    """Fetch one content page and return a ``(pdf_link, page_text)`` pair.

    Exactly one pair is returned for every URL, so the two result lists built
    from it always stay the same length:

    * page has a ``<ul class="list--Block--icons">`` with a link inside ->
      ``(href, "NO CONTENT")``
    * page has no such list, or the list holds no link ->
      ``("NO CONTENT", text of all <p> tags joined by newlines)``
    * non-200 response -> ``("NO CONTENT", "NO CONTENT")``

    Network errors raised by ``requests`` propagate to the caller.
    """
    response = requests.get(link, timeout=REQUEST_TIMEOUT_SECONDS)

    if response.status_code != 200:
        # Nothing usable was returned; never reuse the soup of a previous page.
        print(f"Failed to fetch URL: {link}, Status code: {response.status_code}")
        return no_content_alert, no_content_alert

    soup = BeautifulSoup(response.text, "html.parser")

    # The first <ul class="list--Block--icons"> is where the PDF link is expected
    ul_tag = soup.find('ul', class_='list--Block--icons')
    first_link = ul_tag.find('a', href=True) if ul_tag is not None else None
    if first_link is not None:
        return first_link['href'], no_content_alert

    # No PDF link on this page: fall back to the visible paragraph text
    paragraphs = soup.find_all('p')
    text_data = '\n'.join(p.get_text(strip=True) for p in paragraphs)
    return no_content_alert, text_data


# One entry per content link, kept in the same order as the input column
layered_links = []
text_data_list = []

for link in tqdm(article_link_directory['Content Links'], desc="Step 1: Collecting PDF Links", unit="link"):
    try:
        pdf_link, page_text = _scrape_content_page(link)
    except Exception as e:
        # Best effort: log the failure and keep going with placeholders
        print(f"An error occurred while processing URL: {link}, Error: {str(e)}")
        pdf_link, page_text = no_content_alert, no_content_alert

    layered_links.append(pdf_link)
    text_data_list.append(page_text)

    # Random pause before the next request; randint needs both bounds
    time.sleep(randint(MIN_DELAY_SECONDS, MAX_DELAY_SECONDS))  # Adjust the delay time as needed

Step 1: Collecting PDF Links:  78%|███████▊  | 72/92 [03:22<01:01,  3.05s/link]

Failed to fetch URL: https://www.regeringen.se/rattsliga-dokument/proposition/2014/03/201314184/, Status code: 500


Step 1: Collecting PDF Links: 100%|██████████| 92/92 [04:11<00:00,  2.73s/link]


### Create dataframe to hold scraped text information

In [21]:
# Collect the scraped results into their own frame (one row per content link)
additional_data = pd.DataFrame({'Collected Links': layered_links, 'Outside PDF Text': text_data_list})

# Fail loudly on a row-count mismatch: a column-wise concat would otherwise pad
# the shorter side with NaN and hide the fact that results are missing.
if len(additional_data) != len(article_link_directory):
    raise ValueError(
        f"Scraped {len(additional_data)} results for "
        f"{len(article_link_directory)} content links"
    )

# Attach the new columns by position. Unlike pd.concat(..., axis=1), assign()
# overwrites same-named columns, so re-running this cell does not duplicate them.
article_link_directory = article_link_directory.assign(
    **{column: additional_data[column].to_numpy() for column in additional_data.columns}
)

### Add the site prefix to the collected relative links to make them full URLs

In [22]:
# Base URL that the collected site-relative links are resolved against
prefix = 'https://www.regeringen.se'

def add_prefix(link):
    """Turn a collected href into a full URL on the site given by ``prefix``.

    The 'NO CONTENT' placeholder and non-string values (e.g. NaN for a missing
    cell) are returned unchanged. Every other value is resolved against
    ``prefix`` with ``urljoin``: root-relative hrefs such as '/contentassets/x'
    get the prefix prepended, hrefs without a leading slash get the separator
    added, and hrefs that are already absolute URLs are left as they are.
    """
    if not isinstance(link, str) or link == 'NO CONTENT':
        return link
    return urljoin(prefix, link)

# Build the full-URL column by mapping add_prefix over every collected link
article_link_directory['Full Collected Links'] = (
    article_link_directory['Collected Links'].map(add_prefix)
)

### Remove cookies warning from text

In [23]:
# Boilerplate (cookie notice + navigation labels) to strip from the scraped page text
string_to_remove = "På regeringen.se använder vi kakor för att löpande förbättra webbplatsen. Du väljer själv om du accepterar kakor.Läs om kakor\nHuvudnavigering\nHuvudnavigering\n"

def remove_string(text):
    """Strip the first occurrence of ``string_to_remove`` from ``text``.

    The 'NO CONTENT' placeholder and non-string values (e.g. NaN for a missing
    cell) are returned unchanged instead of raising. Note that the boilerplate
    is removed wherever it first occurs, not only at the very start of the text.
    """
    if not isinstance(text, str) or text == 'NO CONTENT':
        return text
    return text.replace(string_to_remove, '', 1)

# Clean every scraped text cell by mapping remove_string over the column
article_link_directory['Outside PDF Text'] = (
    article_link_directory['Outside PDF Text'].map(remove_string)
)

In [24]:
# Display the combined frame as the cell output to inspect the collected links and text
article_link_directory

Unnamed: 0,Content Links,Publishing Dates,Collected Links,Outside PDF Text,Full Collected Links
0,https://www.regeringen.se/rattsliga-dokument/skrivelse/2006/03/skr.-200506171,"Publicerad22 mars 2006· Uppdaterad02 april 2015·Rättsliga dokument,SkrivelsefrånKlimat- och näringslivsdepartementet",/contentassets/540e1ee83da946328aecfd682f4303e3/vissa-fiskeripolitiska-fragor-skr.-200506171,NO CONTENT,https://www.regeringen.se/contentassets/540e1ee83da946328aecfd682f4303e3/vissa-fiskeripolitiska-fragor-skr.-200506171
1,https://www.regeringen.se/rattsliga-dokument/skrivelse/2006/11/skr.-20060721,"Publicerad14 november 2006· Uppdaterad02 april 2015·Rättsliga dokument,SkrivelsefrånKlimat- och näringslivsdepartementet",/contentassets/5b183cf3f7194c63873738e2998cd578/aterkallelse-av-proposition-200506214-offentliggorande-av-resultat-av-livsmedelskontroll-skr.-20060721,NO CONTENT,https://www.regeringen.se/contentassets/5b183cf3f7194c63873738e2998cd578/aterkallelse-av-proposition-200506214-offentliggorande-av-resultat-av-livsmedelskontroll-skr.-20060721
2,https://www.regeringen.se/rattsliga-dokument/skrivelse/2010/03/skr.-200910187,"Publicerad23 mars 2010· Uppdaterad02 april 2015·Rättsliga dokument,SkrivelsefrånLandsbygds- och infrastrukturdepartementet",/contentassets/ca3d6909196c471597ddbcab449498bc/redovisning-av-fiskeripolitiska-insatser-skr.-200910187,NO CONTENT,https://www.regeringen.se/contentassets/ca3d6909196c471597ddbcab449498bc/redovisning-av-fiskeripolitiska-insatser-skr.-200910187
3,https://www.regeringen.se/rattsliga-dokument/skrivelse/2012/04/skr.-201112124,"Publicerad04 april 2012· Uppdaterad02 april 2015·Rättsliga dokument,SkrivelsefrånKlimat- och näringslivsdepartementet,Landsbygds- och infrastrukturdepartementet",/contentassets/002902dd1126471d8f42dd5017bad411/miljo--klimat--och-energiinsatser-inom-jordbruket-skr.-201112124,NO CONTENT,https://www.regeringen.se/contentassets/002902dd1126471d8f42dd5017bad411/miljo--klimat--och-energiinsatser-inom-jordbruket-skr.-201112124
4,https://www.regeringen.se/rattsliga-dokument/skrivelse/2014/03/skr.-201314158,"Publicerad11 mars 2014· Uppdaterad02 april 2015·Rättsliga dokument,SkrivelsefrånLandsbygds- och infrastrukturdepartementet",/contentassets/afe5e93c7f2b41519c2093a48f146242/riksrevisionens-rapport-om-det-svenska-landsbygdsprogrammet-20072013/,NO CONTENT,https://www.regeringen.se/contentassets/afe5e93c7f2b41519c2093a48f146242/riksrevisionens-rapport-om-det-svenska-landsbygdsprogrammet-20072013/
...,...,...,...,...,...
87,https://www.regeringen.se/rattsliga-dokument/departementsserien-och-promemorior/2004/01/ds-200414/,"Publicerad01 januari 2004· Uppdaterad02 april 2015·Departementsserien och promemorior,Rättsliga dokumentfrånKlimat- och näringslivsdepartementet",/contentassets/9a99922e7e54408294929462b3d9092f/lag-om-andring-i-epizootilagen/,NO CONTENT,https://www.regeringen.se/contentassets/9a99922e7e54408294929462b3d9092f/lag-om-andring-i-epizootilagen/
88,https://www.regeringen.se/rattsliga-dokument/departementsserien-och-promemorior/2004/01/ds-200415/,"Publicerad01 januari 2004· Uppdaterad02 april 2015·Departementsserien och promemorior,Rättsliga dokumentfrånKlimat- och näringslivsdepartementet",/contentassets/dc8c1bf54f3e4c17b08a91706eb907a3/tillstand-vid-forvarv-av-lantbruksegendom-i-glesbygd/,NO CONTENT,https://www.regeringen.se/contentassets/dc8c1bf54f3e4c17b08a91706eb907a3/tillstand-vid-forvarv-av-lantbruksegendom-i-glesbygd/
89,https://www.regeringen.se/rattsliga-dokument/departementsserien-och-promemorior/2004/08/ds-200439/,"Publicerad01 augusti 2004· Uppdaterad02 april 2015·Departementsserien och promemorior,Rättsliga dokumentfrånKlimat- och näringslivsdepartementet",/contentassets/c2216a0082fc4078837c17bde116d9b2/det-gar-langsamt-fram...---jamstalldheten-inom-jord--och-skogsbrukssektorn,NO CONTENT,https://www.regeringen.se/contentassets/c2216a0082fc4078837c17bde116d9b2/det-gar-langsamt-fram...---jamstalldheten-inom-jord--och-skogsbrukssektorn
90,https://www.regeringen.se/rattsliga-dokument/departementsserien-och-promemorior/2005/07/ds-200522/,"Publicerad27 juli 2005· Uppdaterad02 april 2015·Departementsserien och promemorior,Rättsliga dokumentfrånKlimat- och näringslivsdepartementet",/contentassets/a39a1030d800489480e6e0667d29467c/smaskalig-livsmedelsforadling/,NO CONTENT,https://www.regeringen.se/contentassets/a39a1030d800489480e6e0667d29467c/smaskalig-livsmedelsforadling/


### Save data as .csv

In [25]:
# Fail-safe checkpoint: write the current frame to disk, leaving out the row index
article_link_directory.to_csv(
    path_or_buf='article_link_directory_save_step_2.csv',
    index=False,
)