# Load Libraries and Set Settings

In [1]:
# Libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import PyPDF2
from io import BytesIO
from tqdm import tqdm
import time
import csv
import sys
from random import randint

# Other Settings
# Use the registered option name 'display.max_colwidth'; the dotted spelling
# 'display.max.colwidth' only resolved because pandas treats the key as a regex.
# None removes the column-width limit so long cell values are shown in full.
pd.set_option('display.max_colwidth', None)

# Load Data

In [3]:
# Read both input CSV files into DataFrames (pdf_links first, then publication_info)
pdf_links, publication_info = (
    pd.read_csv(csv_name) for csv_name in ('pdf_links.csv', 'publication_info.csv')
)

# Clean Data

### Add prefix to links to make valid links

In [8]:
# Turn the collected links in pdf_links into complete URLs
# Base URL that is prepended to the links
prefix = 'https://www.regeringen.se'

def add_prefix(link):
    """Return `link` as a complete URL by prepending the module-level `prefix`.

    Parameters
    ----------
    link : str
        A link without the site prefix (e.g. '/contentassets/.../file.pdf'),
        an already absolute URL, or the placeholder 'NO CONTENT'.

    Returns
    -------
    `link` unchanged when it is the 'NO CONTENT' placeholder, is not a string
    (e.g. a missing value), or already starts with 'http://' or 'https://';
    otherwise `prefix` followed by `link`.
    """
    # Non-string cells (such as NaN) would otherwise be formatted into a bogus
    # URL like '<prefix>nan', so hand them back untouched.
    if not isinstance(link, str):
        return link
    # Keep the placeholder as-is and avoid double-prefixing absolute URLs.
    if link == 'NO CONTENT' or link.lower().startswith(('http://', 'https://')):
        return link
    return f'{prefix}{link}'

# Map add_prefix over every collected link and store the result in a new column
pdf_links['Full Collected Links'] = pdf_links['Collected Links'].map(add_prefix)

### Remove cookie notice from text

In [12]:
# Cast the 'Outside PDF Text' column to str so that the string-based cleaning
# applied to this column afterwards can be called on every row
pdf_links['Outside PDF Text'] = pdf_links['Outside PDF Text'].astype(str)

In [13]:
# Remove the repeated cookie-notice / navigation text from the 'Outside PDF Text' values
# Exact text to strip
string_to_remove = (
    "På regeringen.se använder vi kakor för att löpande förbättra webbplatsen. "
    "Du väljer själv om du accepterar kakor.Läs om kakor\n"
    "Huvudnavigering\nHuvudnavigering\n"
)

def remove_string(text):
    """Return `text` with the first occurrence of `string_to_remove` deleted.

    The 'NO CONTENT' placeholder is handed back as-is. Only one occurrence is
    removed, wherever it appears in `text`; any later repeats are kept.
    """
    if text != 'NO CONTENT':
        # count=1: drop a single occurrence only
        return text.replace(string_to_remove, '', 1)
    return text

# Run remove_string over every value of the 'Outside PDF Text' column
pdf_links['Outside PDF Text'] = pdf_links['Outside PDF Text'].map(remove_string)

# Combine Data

In [28]:
# Start article_link_directory from an independent copy of publication_info.
# A plain assignment would only create a second name for the same DataFrame,
# so the columns added and renamed afterwards would also change publication_info.
article_link_directory = publication_info.copy()

In [29]:
# Bring the link and text columns of pdf_links into article_link_directory
for column_name in ('Collected Links', 'Outside PDF Text', 'Full Collected Links'):
    article_link_directory[column_name] = pdf_links[column_name]

In [36]:
# Give the 'Link' column its final name, 'Content Links' (modifies article_link_directory in place)
column_renames = {'Link': 'Content Links'}
article_link_directory.rename(columns=column_renames, inplace=True)

In [37]:
# Preview the first row of the combined table
article_link_directory.head(1)

Unnamed: 0,Content Links,Publishing Info,Collected Links,Outside PDF Text,Full Collected Links
0,https://www.regeringen.se/remisser/2023/11/inbjudan-till-larosaten-myndigheter-och-organisationer-att-inkomma-med-synpunkter-till-regeringens-forskningspolitik/,Publicerad02 november 2023· Uppdaterad10 november 2023·RemissfrånUtbildningsdepartementet,/contentassets/c9981cbd1c054e3fa1a2a917d3e22939/inbjudan-till-larosatena-att-inkomma-med-underlag-till-regeringens-forskningspolitik.pdf,NO CONTENT,https://www.regeringen.se/contentassets/c9981cbd1c054e3fa1a2a917d3e22939/inbjudan-till-larosatena-att-inkomma-med-underlag-till-regeringens-forskningspolitik.pdf


# Export Data

In [35]:
# Save article_link_directory to a CSV file as a fail-safe (index=False leaves out the row index)
article_link_directory.to_csv('article_link_directory.csv', index=False)