In [None]:
import os
import requests
from selenium import webdriver
from bs4 import BeautifulSoup
import time

def download_pdfs_from_url(url):
    # Create a new instance of the Firefox driver
    driver = webdriver.Firefox()

    # Load the webpage
    driver.get(url)

    # Get the page source
    page_source = driver.page_source

    # Close the browser
    driver.quit()

    # Parse the page source using BeautifulSoup
    soup = BeautifulSoup(page_source, 'html.parser')

    # Search for all anchor tags with href ending in ".fn.pdf"
    pdf_links = [a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('.fn.pdf')]

    # Base domain
    base_domain = "https://le.utah.gov"

    # Destination folder
    destination_folder = "/Users/yams/Dropbox/Github/fiscal-notes/pdfs/2022/"

    # Ensure the destination folder exists
    if not os.path.exists(destination_folder):
        os.makedirs(destination_folder)

    # Download and save each PDF
    for link in pdf_links:
        full_url = base_domain + link
        response = requests.get(full_url, stream=True)
        filename = os.path.join(destination_folder, link.split('/')[-1])
        
        with open(filename, 'wb') as pdf_file:
            for chunk in response.iter_content(chunk_size=8192):
                pdf_file.write(chunk)

    return len(pdf_links)

# Loop over the desired range of bill numbers
total_pdfs = 0
for bill_number in range(1, 261):  # This will loop from SB0001 to SB0010
    url = f"https://le.utah.gov/~2022/bills/static/HB{bill_number:04}.html"
    total_pdfs += download_pdfs_from_url(url)
    
    # Sleep for 3 seconds
    time.sleep(3)

print(f"Downloaded {total_pdfs} PDFs in total to {destination_folder}")


In [1]:
import os
import csv
import fitz  # PyMuPDF
import re

# Define functions for extraction

def extract_section_content(text, section_title):
    pattern = re.compile(rf"{re.escape(section_title)}(.*?)(?=\\n[A-Z][a-z]+|\\Z)", re.DOTALL)
    matches = pattern.findall(text)
    return matches[0].strip() if matches else None

def extract_data_from_pdf(pdf_path):
    pdf_document = fitz.open(pdf_path)
    text = ""
    for page in pdf_document:
        text += page.get_text("text")
    pdf_document.close()
    
    # Extracting data based on the sections and format we discussed
    geusf = extract_section_content(text, "General, Education, and Uniform School Funds")
    local_gov = extract_section_content(text, "Local Government")
    individuals_businesses = extract_section_content(text, "Individuals & Businesses")
    regulatory_impact = extract_section_content(text, "Regulatory Impact")
    performance_evaluation = extract_section_content(text, "Performance Evaluation")
    
    return [geusf, local_gov, individuals_businesses, regulatory_impact, performance_evaluation]

# Specify the directory containing the PDFs
directory_path = '/Users/yams/Dropbox/Github/fiscal-notes/pdfs/'

# Loop through each PDF and extract data
all_data = []
for filename in os.listdir(directory_path):
    if filename.endswith('.pdf'):
        pdf_path = os.path.join(directory_path, filename)
        extracted_data = extract_data_from_pdf(pdf_path)
        all_data.append(extracted_data)

# Write the extracted data to a CSV
csv_path = 'extracted_data.csv'
with open(csv_path, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["General, Education, and Uniform School Funds", "Local Government", "Individuals & Businesses", "Regulatory Impact", "Performance Evaluation"])
    writer.writerows(all_data)

print(f"Data extracted and saved to {csv_path}")


Data extracted and saved to extracted_data.csv


In [None]:
import os
import csv
import fitz  # PyMuPDF
import re

# Define functions for extraction

def extract_section_content(text, section_title):
    pattern = re.compile(rf"{re.escape(section_title)}(.*?)(?=\\n[A-Z][a-z]+|\\Z)", re.DOTALL)
    matches = pattern.findall(text)
    return matches[0].strip() if matches else None

def extract_data_from_pdf(pdf_path):
    pdf_document = fitz.open(pdf_path)
    text = ""
    for page in pdf_document:
        text += page.get_text("text")
    pdf_document.close()
    
    # Print the first 500 characters of the extracted text for debugging
    print("===== BEGIN EXTRACTED TEXT =====")
    print(text[:500])
    print("===== END EXTRACTED TEXT =====")
    
    # Extracting data based on the sections and format we discussed
    geusf = extract_section_content(text, "General, Education, and Uniform School Funds")
    local_gov = extract_section_content(text, "Local Government")
    individuals_businesses = extract_section_content(text, "Individuals & Businesses")
    regulatory_impact = extract_section_content(text, "Regulatory Impact")
    performance_evaluation = extract_section_content(text, "Performance Evaluation")
    
    return [geusf, local_gov, individuals_businesses, regulatory_impact, performance_evaluation]

# Specify the directory containing the PDFs
directory_path = '/Users/yams/Dropbox/Github/fiscal-notes/pdfs/'

# Loop through each PDF and extract data
all_data = []
for filename in os.listdir(directory_path):
    if filename.endswith('.pdf'):
        pdf_path = os.path.join(directory_path, filename)
        extracted_data = extract_data_from_pdf(pdf_path)
        all_data.append(extracted_data)

# Write the extracted data to a CSV
csv_path = 'extracted_data.csv'
with open(csv_path, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["General, Education, and Uniform School Funds", "Local Government", "Individuals & Businesses", "Regulatory Impact", "Performance Evaluation"])
    writer.writerows(all_data)

print(all_data)    
print(f"Data extracted and saved to {csv_path}")


In [1]:

import fitz  # PyMuPDF

doc = fitz.open('/Users/yams/Dropbox/Github/fiscal-notes/pdfs/HB0013.fn.pdf')
page = doc[0]  # Assuming you're looking at the first page

# Extract blocks of text
blocks = page.get_text("blocks")

# Sort blocks based on vertical position
blocks.sort(key=lambda block: block[1])

for block in blocks:
    print(block[4])  # This will print the text in each block

AttributeError: 'Page' object has no attribute 'getText'

In [2]:
import os
import csv
import fitz  # PyMuPDF
import re

def refined_extract_section_content(text, section_title, end_title):
    """Extract content between two section titles."""
    pattern = re.compile(rf"{re.escape(section_title)}(.*?){re.escape(end_title)}", re.DOTALL)
    matches = pattern.findall(text)
    return matches[0].strip() if matches else None

def refined_extract_data_from_pdf(pdf_path, section_titles):
    """Extract data from a PDF using predefined section titles."""
    pdf_document = fitz.open(pdf_path)
    text = ""
    for page in pdf_document:
        text += page.get_text("text")
    pdf_document.close()
    
    sections_content_refined = {}
    for i, title in enumerate(section_titles[:-1]):
        end_title = section_titles[i+1]
        section_content = refined_extract_section_content(text, title, end_title)
        sections_content_refined[title] = section_content
    
    return sections_content_refined

# Define a list of expected section titles based on our observations
expected_section_titles = [
    "General, Income Tax, and Uniform School Funds",
    "State Government",
    "Expenditures",
    "Individuals & Businesses",
    "Regulatory Impact",
    "Performance Evaluation"
    # Further sections can be added as they are identified
]

# Specify the directory containing the PDFs
directory_path = '/Users/yams/Dropbox/Github/fiscal-notes/pdfs/2022/'

# Modify the extraction loop to include the filename in the extracted data

all_data_refined_with_filename = []

for filename in os.listdir(directory_path):
    if filename.endswith('.pdf'):
        pdf_path = os.path.join(directory_path, filename)
        extracted_data_refined = refined_extract_data_from_pdf(pdf_path, expected_section_titles)
        
        # Add the filename to the extracted data
        extracted_data_refined['Filename'] = filename
        all_data_refined_with_filename.append(extracted_data_refined)

# Update the CSV writing part to include the new "Filename" field

csv_path_with_filename = '/Users/yams/Dropbox/Github/fiscal-notes/extracted_data_UT_2022.csv'

with open(csv_path_with_filename, 'w', newline='') as file:
    # Adding "Filename" to the beginning of the fieldnames list
    fieldnames = ['Filename'] + expected_section_titles[:-1]
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    
    writer.writeheader()
    for data in all_data_refined_with_filename:
        writer.writerow(data)

print(f"Data extracted and saved to {csv_path_with_filename}")


Data extracted and saved to /Users/yams/Dropbox/Github/fiscal-notes/extracted_data_UT_2022.csv


In [1]:
import pandas as pd

# Load the CSV data into a DataFrame
data_df = pd.read_csv('/Users/yams/Dropbox/Github/fiscal-notes/extracted_data_UT_2022.csv')

# Display the first five rows of the DataFrame
print(data_df.head())



           Filename  General, Income Tax, and Uniform School Funds  \
0     HB0084.fn.pdf                                            NaN   
1  SB0150S03.fn.pdf                                            NaN   
2     HB0233.fn.pdf                                            NaN   
3     HB0184.fn.pdf                                            NaN   
4     HB0462.fn.pdf                                            NaN   

                                    State Government  \
0  UCA 36-12-13(2)(c)\nRevenues\nFY 2022\nFY 2023...   
1  UCA 36-12-13(2)(c)\nRevenues\nFY 2022\nFY 2023...   
2  UCA 36-12-13(2)(c)\nRevenues\nFY 2022\nFY 2023...   
3  UCA 36-12-13(2)(c)\nRevenues\nFY 2022\nFY 2023...   
4  UCA 36-12-13(2)(c)\nRevenues\nFY 2022\nFY 2023...   

                                        Expenditures  \
0  FY 2022\nFY 2023\nFY 2024\nGeneral Fund\n$0\n$...   
1  FY 2022\nFY 2023\nFY 2024\nGeneral Fund\n$0\n$...   
2  FY 2022\nFY 2023\nFY 2024\nGeneral Fund\n$0\n$...   
3  FY 2022\nFY 202

In [3]:
import pandas as pd

# Load the CSV data into a DataFrame
data_df = pd.read_csv('/Users/yams/Dropbox/Github/fiscal-notes/extracted_data_UT_2022.csv')

# Display the first few rows of the DataFrame to understand its structure
data_df.head()

# Extract unique values from the "Regulatory Impact" column
unique_regulatory_impacts = data_df["Regulatory Impact"].dropna().unique()

unique_regulatory_impacts

# Count the occurrences of each unique value in the "Regulatory Impact" column
regulatory_impact_counts = data_df["Regulatory Impact"].value_counts()

regulatory_impact_counts

Regulatory Impact
UCA 36-12-13(2)(d)\nEnactment of this legislation likely will not change the regulatory burden for Utah residents or\nbusinesses.                                                                                                           899
UCA 36-12-13(2)(d)\nEnactment of this legislation could result in a small increase in the regulatory burden for Utah residents\nor businesses.                                                                                              124
UCA 36-12-13(2)(d)\nEnactment of this legislation could result in a small reduction in the regulatory burden for Utah\nresidents or businesses.                                                                                              37
UCA 36-12-13(2)(d)\nEnactment of this legislation could result in a medium increase in the regulatory burden for Utah\nresidents or businesses.                                                                                               8
UCA 36-12-13(2)(d)\nEn

In [4]:
import pandas as pd

def categorize_regulatory_impact(statement):
    """Categorize the regulatory impact based on specific phrases."""
    if pd.isna(statement):
        return "Missing Data"
    elif "not change the regulatory burden" in statement:
        return "No change in regulatory burden"
    elif "small increase in the regulatory burden" in statement:
        return "Small increase in regulatory burden"
    elif "small reduction in the regulatory burden" in statement:
        return "Small reduction in regulatory burden"
    elif "medium increase in the regulatory burden" in statement:
        return "Medium increase in regulatory burden"
    elif "medium reduction in the regulatory burden" in statement:
        return "Medium reduction in regulatory burden"
    else:
        return "Other unique statements"

# Load the CSV data into a DataFrame
data_df = pd.read_csv('/Users/yams/Dropbox/Github/fiscal-notes/extracted_data_UT_2022.csv')

# Categorize each statement in the "Regulatory Impact" column
data_df['Impact Category'] = data_df['Regulatory Impact'].apply(categorize_regulatory_impact)

# Count the occurrences of each category
category_counts = data_df['Impact Category'].value_counts()

print(category_counts)


Impact Category
No change in regulatory burden           1100
Small increase in regulatory burden       146
Missing Data                               76
Small reduction in regulatory burden       40
Medium increase in regulatory burden       10
Medium reduction in regulatory burden       2
Other unique statements                     2
Name: count, dtype: int64


In [5]:
def categorize_individuals_businesses(statement):
    """Categorize the impact on individuals & businesses based on specific phrases."""
    if pd.isna(statement):
        return "Missing Data"
    elif "not result in direct expenditures from tax or fee changes" in statement:
        return "No direct expenditures from tax/fee changes"
    elif "decrease costs for" in statement:
        return "Decrease in costs for specific groups"
    elif "savings for certain offenders" in statement:
        return "Savings for certain offenders"
    elif "lead to" in statement and "paying" in statement:
        return "Increase in fines/costs for specific services"
    elif "individuals could pay additional taxes" in statement:
        return "Additional tax implications"
    elif "increase aggregate tax credits" in statement:
        return "Increase in tax credits for specific groups"
    elif "decrease sales tax liability" in statement:
        return "Reduction in tax liability"
    else:
        return "Other unique statements"

# Categorize each statement in the "Individuals & Businesses" column
data_df['IB Impact Category'] = data_df['Individuals & Businesses'].apply(categorize_individuals_businesses)

# Count the occurrences of each category
ib_category_counts = data_df['IB Impact Category'].value_counts()

ib_category_counts



IB Impact Category
No direct expenditures from tax/fee changes      972
Other unique statements                          389
Increase in fines/costs for specific services      9
Increase in tax credits for specific groups        3
Decrease in costs for specific groups              2
Reduction in tax liability                         1
Name: count, dtype: int64

In [6]:

# Filter the DataFrame to get rows where the "IB Impact Category" is "Other unique statements"
other_unique_statements_df = data_df[data_df['IB Impact Category'] == 'Other unique statements']

# Extract the "Individuals & Businesses" column for these rows
other_unique_statements_list = other_unique_statements_df['Individuals & Businesses'].tolist()

other_unique_statements_list[:10]  # Display the first 10 for brevity

# Save the filtered "Other unique statements" to a new CSV file
csv_path_unique_statements = '/Users/yams/Dropbox/Github/fiscal-notes/other_unique_statements.csv'
other_unique_statements_df[['Filename', 'Individuals & Businesses']].to_csv(csv_path_unique_statements, index=False)

csv_path_unique_statements



'/Users/yams/Dropbox/Github/fiscal-notes/other_unique_statements.csv'