# Load Libraries and Set Settings

In [1]:
# Libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import PyPDF2
from io import BytesIO
from tqdm import tqdm
import time
import csv
import sys
from random import randint

# Other Settings
pd.set_option('display.max.colwidth', None) # max display width

# Load Data

In [7]:
# Load data from CSV file into a DataFrame
csv_file_path = "publication_info.csv"  # Update with the correct file path
existing_data = pd.read_csv(csv_file_path)

# Rename the columns to match your desired names
existing_data.columns = ['Content Links', 'Publishing Dates']

# Append the loaded data to the existing DataFrame
article_link_directory = existing_data.copy()

### Subset the data for quick-time tests

In [5]:
practice_article_link_directory = article_link_directory.head(30)

# COLLECT THE PDF LINKS FROM EACH CONTENT PAGE CONTAINING A PDF; IF A CONTENT PAGE CONTAINS NO PDF, GRAB ALL TEXT INSTEAD

### Collect data

In [11]:
# Initialize an empty list to collect the links
layered_links = []
text_data_list = []

# No content alert
no_content_alert = "NO CONTENT"

for i, link in enumerate(tqdm(practice_article_link_directory['Content Links'], desc="Step 1: Collecting PDF Links", unit="link")):
    try:
        # Send an HTTP GET request to the URL
        response = requests.get(link)

        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            # Parse the HTML content of the page
            soup = BeautifulSoup(response.text, "html.parser")

            # Find the first instance of <ul class="list--Block--icons">
            ul_tag = soup.find('ul', class_='list--Block--icons')

            # Check if the <ul> tag is found
            if ul_tag:
                # Find the first <a> tag with href inside the <ul> tag
                first_link = ul_tag.find('a', href=True)
                # no_content_alert = "NO CONTENT"
                if first_link:
                    layered_links.append(first_link['href'])
                    text_data_list.append(no_content_alert)
                else:
                    layered_links.append(no_content_alert)
            else:
                # If <ul> tag is not found, extract text from <p> tags
                paragraphs = soup.find_all('p')
                text_data = '\n'.join([p.get_text(strip=True) for p in paragraphs])
                text_data_list.append(text_data)
                layered_links.append(no_content_alert)
        else:
            print(f"Failed to fetch URL: {link}, Status code: {response.status_code}")
            # If <ul> tag is not found, extract text from <p> tags
            paragraphs = soup.find_all('p')
            text_data = '\n'.join([p.get_text(strip=True) for p in paragraphs])
            text_data_list.append(text_data)
            layered_links.append(no_content_alert)

    except Exception as e:
        print(f"An error occurred while processing URL: {link}, Error: {str(e)}")
        text_data_list.append(no_content_alert)
        layered_links.append(no_content_alert)

    # Introduce a random delay time before the next request
    time.sleep(1)
    # time.sleep(randint(1, 2))  # Adjust the delay time as needed

    # Add a break statement if the loop index exceeds the expected number of links
    if i + 1 > len(practice_article_link_directory['Content Links']):
        break

Step 1: Collecting PDF Links: 100%|██████████| 30/30 [01:06<00:00,  2.22s/link]


### Create dataframe to hold scraped text information

In [21]:
# Create a DataFrame with the lists and rename columns
additional_data = pd.DataFrame({'Collected Links': layered_links, 'Outside PDF Text': text_data_list})
# Concatenate the new DataFrame with the original DataFrame along the columns axis (axis=1)
practice_article_link_directory = pd.concat([practice_article_link_directory, additional_data], axis=1)

### Add prefix to links to make valid links

In [22]:
# Add prefix to the links in the result_df to complete the links
# Define the prefix to add
prefix = 'https://www.regeringen.se'

# Define a function to conditionally add the prefix
def add_prefix(link):
    if link == 'NO CONTENT':
        return link
    else:
        return f'{prefix}{link}'

# Use the .apply() method with the defined function to add the prefix conditionally
practice_article_link_directory['Full Collected Links'] = practice_article_link_directory['Collected Links'].apply(add_prefix)

### Remove cookies warning from text

In [23]:
# Remove the repetitive string in front of all the rows on the Outside PDF Text
# Define the string to remove
string_to_remove = "På regeringen.se använder vi kakor för att löpande förbättra webbplatsen. Du väljer själv om du accepterar kakor.Läs om kakor\nHuvudnavigering\nHuvudnavigering\n"

# Define a function to conditionally remove the specified string
def remove_string(text):
    if text == 'NO CONTENT':
        return text
    else:
        return text.replace(string_to_remove, '', 1)  # Remove the specified string only from the beginning

# Apply the defined function to 'Outside PDF Text' column
practice_article_link_directory['Outside PDF Text'] = practice_article_link_directory['Outside PDF Text'].apply(remove_string)

### Download data as .csv

In [25]:
# Download practice_article_link_directory as a fail safe
practice_article_link_directory.to_csv('practice_article_link_directory.csv', index=False)

# TEST

In [14]:
import csv

# No content alert
no_content_alert = "NO CONTENT"

# Add new columns to the DataFrame
practice_article_link_directory['Collected Links'] = ""
practice_article_link_directory['Outside PDF Text'] = ""

# Specify the CSV file path
csv_file_path = "output_data.csv"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  practice_article_link_directory['Collected Links'] = ""
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  practice_article_link_directory['Outside PDF Text'] = ""


In [16]:
# Open the CSV file in write mode with a CSV writer
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)

    # Write the header row to the CSV file
    csv_writer.writerow(["Collected Links", "Outside PDF Text"])

    for i, link in enumerate(tqdm(practice_article_link_directory['Content Links'], desc="Step 1: Collecting PDF Links", unit="link")):
        try:
            # Send an HTTP GET request to the URL
            response = requests.get(link)

            # Check if the request was successful (status code 200)
            if response.status_code == 200:
                # Parse the HTML content of the page
                soup = BeautifulSoup(response.text, "html.parser")

                # Find the first instance of <ul class="list--Block--icons">
                ul_tag = soup.find('ul', class_='list--Block--icons')

                # Check if the <ul> tag is found
                if ul_tag:
                    # Find the first <a> tag with href inside the <ul> tag
                    first_link = ul_tag.find('a', href=True)
                    if first_link:
                        practice_article_link_directory.at[i, 'Collected Links'] = first_link['href']
                        practice_article_link_directory.at[i, 'Outside PDF Text'] = no_content_alert
                    else:
                        practice_article_link_directory.at[i, 'Collected Links'] = no_content_alert
                else:
                    # If <ul> tag is not found, extract text from <p> tags
                    paragraphs = soup.find_all('p')
                    text_data = '\n'.join([p.get_text(strip=True) for p in paragraphs])
                    practice_article_link_directory.at[i, 'Outside PDF Text'] = text_data
                    practice_article_link_directory.at[i, 'Collected Links'] = no_content_alert
            else:
                print(f"Failed to fetch URL: {link}, Status code: {response.status_code}")
                # If <ul> tag is not found, extract text from <p> tags
                paragraphs = soup.find_all('p')
                text_data = '\n'.join([p.get_text(strip=True) for p in paragraphs])
                practice_article_link_directory.at[i, 'Outside PDF Text'] = text_data
                practice_article_link_directory.at[i, 'Collected Links'] = no_content_alert

            # Write the data to the CSV file in each iteration
            csv_writer.writerow([practice_article_link_directory.at[i, 'Collected Links'],
                                 practice_article_link_directory.at[i, 'Outside PDF Text']])

        except Exception as e:
            print(f"An error occurred while processing URL: {link}, Error: {str(e)}")
            # If an error occurs, write the available data to the CSV file before continuing
            csv_writer.writerow([no_content_alert, no_content_alert])

        # Introduce a random delay time before the next request
        time.sleep(1)

        # Add a break statement if the loop index is equal to the expected number of links minus 1
        if i == len(practice_article_link_directory['Content Links']) - 1:
            break


Step 1: Collecting PDF Links:  97%|█████████▋| 29/30 [01:16<00:02,  2.64s/link]
