In [1]:

import pandas as pd
import re
from bs4 import BeautifulSoup
import requests
import gspread

In [2]:
from scraper_utils import FTP_Connection
import yaml
CONFIG_PATH = 'scraper_config.yaml'

with open(CONFIG_PATH, 'r', encoding='utf-8') as file:
    scraper_config = yaml.safe_load(file)

conn = FTP_Connection(scraper_config['sources']['ftp']['host'])
print('\n'.join(conn.ls('ftp://ftp.legis.state.tx.us/bills/89R/billhistory/house_bills/HB00001_HB00099')))


ftp://ftp.legis.state.tx.us/bills/89R/billhistory/house_bills/HB00001_HB00099/HB 1.xml
ftp://ftp.legis.state.tx.us/bills/89R/billhistory/house_bills/HB00001_HB00099/HB 10.xml
ftp://ftp.legis.state.tx.us/bills/89R/billhistory/house_bills/HB00001_HB00099/HB 11.xml
ftp://ftp.legis.state.tx.us/bills/89R/billhistory/house_bills/HB00001_HB00099/HB 12.xml
ftp://ftp.legis.state.tx.us/bills/89R/billhistory/house_bills/HB00001_HB00099/HB 13.xml
ftp://ftp.legis.state.tx.us/bills/89R/billhistory/house_bills/HB00001_HB00099/HB 14.xml
ftp://ftp.legis.state.tx.us/bills/89R/billhistory/house_bills/HB00001_HB00099/HB 15.xml
ftp://ftp.legis.state.tx.us/bills/89R/billhistory/house_bills/HB00001_HB00099/HB 2.xml
ftp://ftp.legis.state.tx.us/bills/89R/billhistory/house_bills/HB00001_HB00099/HB 20.xml
ftp://ftp.legis.state.tx.us/bills/89R/billhistory/house_bills/HB00001_HB00099/HB 21.xml
ftp://ftp.legis.state.tx.us/bills/89R/billhistory/house_bills/HB00001_HB00099/HB 22.xml
ftp://ftp.legis.state.tx.us/bills/

In [3]:
def get_bill_history_df(conn, scraper_config):
    """
    Get bill history data from FTP server and return as DataFrame
    
    Args:
        conn: FTP_Connection object
        scraper_config: Dictionary containing scraper configuration
        
    Returns:
        DataFrame containing bill history data
    """
    # Build URL from config
    base_url = scraper_config['sources']['ftp']['base_path'].format(
        LegSess=scraper_config['info']['LegSess']
    )
    history_url = f"{base_url}/billhistory/history.xml"
    
    # Get and parse XML
    bill_history_xml = conn.get_data(history_url)
    bill_history_soup = BeautifulSoup(bill_history_xml, 'xml')
        
    # Get bills element
    bills = bill_history_soup.find('bills')

    # Convert bills to list of dicts
    bill_list = []
    for bill in bills.find_all('bill'):
        bill_dict = {
            'id': bill.get('id'),
            'path': bill.get('path'), 
            'timestamp': bill.get('timestamp')
        }
        bill_list.append(bill_dict)
    return pd.DataFrame(bill_list)

bill_df = get_bill_history_df(conn, scraper_config)
bill_df.to_csv('raw_data/bill_history.csv', index=False)


In [None]:
def get_bill_urls(conn, scraper_config):
    """
    Get list of URLs for all bill XML files in house_bills and senate_bills directories.
    
    Args:
        conn: FTP_Connection object
        scraper_config: Dictionary containing scraper configuration
        
    Returns:
        List of URLs for all bill XML files
    """
    # Build base URL from config
    base_url = scraper_config['sources']['ftp']['base_path'].format(
        LegSess=scraper_config['info']['LegSess']
    )
    
    bill_urls = []
    
    # Process both house and senate bills
    for chamber in ['house_bills', 'senate_bills']:
        # Build URL for this chamber
        chamber_url = f"{base_url}/billhistory/{chamber}"
        
        # Get list of bill range folders (HB00001_HB00099 etc)
        range_folders = conn.ls(chamber_url)
        
        # Get bill XML files from each range folder
        for folder_url in range_folders:
            bill_xmls = conn.ls(folder_url)
            print('\n'.join(bill_xmls))
            bill_urls.extend(bill_xmls)
            
    return bill_urls

# Get all bill URLs
bill_urls = get_bill_urls(conn, scraper_config)
print(f"Found {len(bill_urls)} total bill XML files")
print("\nFirst 5 bill URLs:")
print('\n'.join(bill_urls[:5]))


ftp://ftp.legis.state.tx.us/bills/89R/billhistory/house_bills/HB00001_HB00099/HB 1.xml
ftp://ftp.legis.state.tx.us/bills/89R/billhistory/house_bills/HB00001_HB00099/HB 10.xml
ftp://ftp.legis.state.tx.us/bills/89R/billhistory/house_bills/HB00001_HB00099/HB 11.xml
ftp://ftp.legis.state.tx.us/bills/89R/billhistory/house_bills/HB00001_HB00099/HB 12.xml
ftp://ftp.legis.state.tx.us/bills/89R/billhistory/house_bills/HB00001_HB00099/HB 13.xml
ftp://ftp.legis.state.tx.us/bills/89R/billhistory/house_bills/HB00001_HB00099/HB 14.xml
ftp://ftp.legis.state.tx.us/bills/89R/billhistory/house_bills/HB00001_HB00099/HB 15.xml
ftp://ftp.legis.state.tx.us/bills/89R/billhistory/house_bills/HB00001_HB00099/HB 2.xml
ftp://ftp.legis.state.tx.us/bills/89R/billhistory/house_bills/HB00001_HB00099/HB 20.xml
ftp://ftp.legis.state.tx.us/bills/89R/billhistory/house_bills/HB00001_HB00099/HB 21.xml
ftp://ftp.legis.state.tx.us/bills/89R/billhistory/house_bills/HB00001_HB00099/HB 22.xml
ftp://ftp.legis.state.tx.us/bills/

In [6]:

def investigate_xml(url, conn):
    """
    Get XML data from a URL, prettify it, and save it to raw_data directory with _raw suffix
    
    Args:
        url: URL to retrieve XML from
        conn: FTP_Connection object
    """
    # Get XML data
    xml_data = conn.get_data(url)
    
    if xml_data:
        # Parse and prettify XML
        soup = BeautifulSoup(xml_data, 'xml')
        pretty_xml = soup.prettify()
        
        # Generate filename from URL
        filename = url.split('/')[-1].replace('.xml', '_raw.xml')
        
        # Save to raw_data directory
        with open(f'raw_data/{filename}', 'w', encoding='utf-8') as f:
            f.write(pretty_xml)
        print(f"Saved {url} to raw_data/{filename}")
    else:
        print(f"Failed to retrieve data from {url}")

# Test the function with a bill history URL
test_url = "ftp://ftp.legis.state.tx.us/bills/88R/billhistory/house_bills/HB00001_HB00099/HB 2.xml"
investigate_xml(test_url, conn)



Saved ftp://ftp.legis.state.tx.us/bills/88R/billhistory/house_bills/HB00001_HB00099/HB 2.xml to raw_data/HB 2_raw.xml


In [9]:
import json
def parse_bill_xml(conn, url):
    """
    Parse bill XML data from a URL into a standardized dictionary format.
    
    Args:
        conn: FTP_Connection object
        url: URL to retrieve XML from
        
    Returns:
        Dictionary containing parsed bill data
    """
    # Get XML data from URL
    xml_str = conn.get_data(url)
    if not xml_str:
        print(f"Failed to retrieve data from {url}")
        return None
        
    soup = BeautifulSoup(xml_str, 'xml')
    bill_data = {}
    
    # Basic bill info
    bill_history = soup.find('billhistory')
    if bill_history:
        bill_data['bill_id'] = bill_history.get('bill')
        bill_data['last_update'] = bill_history.get('lastUpdate')
    
    # Last action
    last_action = soup.find('lastaction')
    if last_action:
        bill_data['last_action'] = last_action.text.strip()
    
    # Caption
    caption = soup.find('caption')
    if caption:
        bill_data['caption'] = caption.text.strip()
        bill_data['caption_version'] = caption.get('version')
    
    # Authors and co-authors
    bill_data['authors'] = [a.strip() for a in soup.find('authors').text.strip().split('|') if a.strip()]
    bill_data['coauthors'] = [a.strip() for a in soup.find('coauthors').text.strip().split('|') if a.strip()]
    bill_data['sponsors'] = [a.strip() for a in soup.find('sponsors').text.strip().split('|') if a.strip()]
    bill_data['cosponsors'] = [a.strip() for a in soup.find('cosponsors').text.strip().split('|') if a.strip()]
    
    # Subjects
    bill_data['subjects'] = [s.text.strip() for s in soup.find_all('subject')]
    
    # Committee info
    committees = soup.find('committees')
    if committees:
        house_comm = committees.find('house')
        senate_comm = committees.find('senate')
        
        bill_data['house_committee'] = {
            'name': house_comm.get('name'),
            'status': house_comm.get('status'),
            'votes': {
                'aye': int(house_comm.get('ayeVotes', 0)),
                'nay': int(house_comm.get('nayVotes', 0)),
                'absent': int(house_comm.get('absentVotes', 0)),
                'present_not_voting': int(house_comm.get('presentNotVotingVotes', 0))
            }
        }
        
        bill_data['senate_committee'] = {
            'name': senate_comm.get('name'),
            'status': senate_comm.get('status'),
            'votes': {
                'aye': int(senate_comm.get('ayeVotes', 0)), 
                'nay': int(senate_comm.get('nayVotes', 0)),
                'absent': int(senate_comm.get('absentVotes', 0)),
                'present_not_voting': int(senate_comm.get('presentNotVotingVotes', 0))
            }
        }
    
    # Actions
    bill_data['actions'] = []
    for action in soup.find_all('action'):
        action_data = {
            'number': action.find('actionNumber').text.strip(),
            'date': action.find('date').text.strip(),
            'description': action.find('description').text.strip(),
            'timestamp': action.find('actionTimestamp').text.strip() if action.find('actionTimestamp') else None
        }
        bill_data['actions'].append(action_data)
    
    # Bill text URLs
    versions = soup.find_all('version')
    bill_data['versions'] = []
    for version in versions:
        version_data = {
            'description': version.find('versionDescription').text.strip(),
            'urls': {
                'web_html': version.find('WebHTMLURL').text.strip(),
                'web_pdf': version.find('WebPDFURL').text.strip(),
                'ftp_html': version.find('FTPHTMLURL').text.strip(),
                'ftp_pdf': version.find('FTPPDFURL').text.strip()
            }
        }
        bill_data['versions'].append(version_data)
        
    return bill_data

# Test the parser with a bill URL
test_url = "ftp://ftp.legis.state.tx.us/bills/88R/billhistory/house_bills/HB00001_HB00099/HB 2.xml"
parsed_bill = parse_bill_xml(conn, test_url)
print("Parsed bill data:")
print(json.dumps(parsed_bill, indent=2))


Parsed bill data:
{
  "bill_id": "88(R) HB 2",
  "last_update": "6/19/2023 6:39:07 PM",
  "last_action": "05/02/2023 S Referred to Local Government",
  "caption": "Relating to providing property tax relief through the public school finance system and property tax appraisal and administration.",
  "caption_version": "Engrossed",
  "authors": [
    "Meyer",
    "Bonnen",
    "Burrows",
    "Thierry",
    "Raymond"
  ],
  "coauthors": [
    "Allison",
    "Anch\u00eda",
    "Anderson",
    "Ashby",
    "Bailes",
    "Bell, Cecil",
    "Bell, Keith",
    "Buckley",
    "Bumgarner",
    "Burns",
    "Button",
    "Cain",
    "Campos",
    "Canales",
    "Capriglione",
    "Clardy",
    "Cook",
    "Cortez",
    "Craddick",
    "Cunningham",
    "Darby",
    "Dean",
    "DeAyala",
    "Dorazio",
    "Frank",
    "Frazier",
    "Gates",
    "Gerdes",
    "Geren",
    "Gervin-Hawkins",
    "Goldman",
    "Guerra",
    "Guillen",
    "Harless",
    "Harris, Caroline",
    "Harris, Cody",
    "H

In [11]:
def get_raw_bills_data(conn, scraper_config):
    bill_urls = get_bill_urls(conn, scraper_config)
    raw_bills = []
    for url in bill_urls:
        bill_data = parse_bill_xml(conn, url)
        if bill_data:
            raw_bills.append(bill_data)
    return pd.DataFrame(raw_bills)

get_raw_bills_data(conn, scraper_config)

NameError: name 'get_bill_urls' is not defined

In [8]:
# Compare history.xml and history_periodic.xml
history_url = "ftp://ftp.legis.state.tx.us/bills/89R/billhistory/history.xml"
periodic_url = "ftp://ftp.legis.state.tx.us/bills/89R/billhistory/history_periodic.xml"

# Get both files
investigate_xml(history_url, conn)
investigate_xml(periodic_url, conn)

# Load and parse both files
with open('raw_data/history_raw.xml', 'r', encoding='utf-8') as f:
    history_soup = BeautifulSoup(f.read(), 'xml')
    
with open('raw_data/history_periodic_raw.xml', 'r', encoding='utf-8') as f:
    periodic_soup = BeautifulSoup(f.read(), 'xml')

# Get all bill IDs and timestamps from each file
history_bills = {b['id']: b['timestamp'] for b in history_soup.find_all('bill')}
periodic_bills = {b['id']: b['timestamp'] for b in periodic_soup.find_all('bill')}

# Find bills unique to each file
only_in_history = set(history_bills.keys()) - set(periodic_bills.keys())
only_in_periodic = set(periodic_bills.keys()) - set(history_bills.keys())

print("Differences between history.xml and history_periodic.xml:")
print(f"\nNumber of bills in history.xml: {len(history_bills)}")
print(f"Number of bills in history_periodic.xml: {len(periodic_bills)}")

print(f"\nBills only in history.xml: {len(only_in_history)}")
if only_in_history:
    print("Examples:", list(only_in_history)[:5])
    
print(f"\nBills only in history_periodic.xml: {len(only_in_periodic)}")
if only_in_periodic:
    print("Examples:", list(only_in_periodic)[:5])

# Compare timestamps for bills in both files
common_bills = set(history_bills.keys()) & set(periodic_bills.keys())
different_timestamps = {
    bill: (history_bills[bill], periodic_bills[bill])
    for bill in common_bills
    if history_bills[bill] != periodic_bills[bill]
}

print(f"\nBills with different timestamps: {len(different_timestamps)}")
if different_timestamps:
    print("\nExample timestamp differences:")
    for bill, (hist_time, per_time) in list(different_timestamps.items())[:5]:
        print(f"{bill}:")
        print(f"  history.xml: {hist_time}")
        print(f"  periodic.xml: {per_time}")




Saved ftp://ftp.legis.state.tx.us/bills/89R/billhistory/history.xml to raw_data/history_raw.xml
Saved ftp://ftp.legis.state.tx.us/bills/89R/billhistory/history_periodic.xml to raw_data/history_periodic_raw.xml
Differences between history.xml and history_periodic.xml:

Number of bills in history.xml: 724
Number of bills in history_periodic.xml: 787

Bills only in history.xml: 544
Examples: ['HB 3855', 'HB 24', 'HB 806', 'HB 3861', 'SB 1920']

Bills only in history_periodic.xml: 607
Examples: ['HB 957', 'SB 1366', 'HR 428', 'SB 1514', 'SJR 66']

Bills with different timestamps: 180

Example timestamp differences:
HR 50:
  history.xml: 3/6/2025 2:00 AM
  periodic.xml: 3/6/2025 2:00 PM
HCR 26:
  history.xml: 3/6/2025 2:00 AM
  periodic.xml: 3/6/2025 12:00 PM
HR 142:
  history.xml: 3/6/2025 2:00 AM
  periodic.xml: 3/6/2025 12:00 PM
HR 149:
  history.xml: 3/6/2025 2:00 AM
  periodic.xml: 3/6/2025 12:00 PM
HR 212:
  history.xml: 3/6/2025 2:00 AM
  periodic.xml: 3/6/2025 2:00 PM
