In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from datetime import datetime
import unicodedata

base_url = "https://capitol.texas.gov"
urls = {
    "House": f"{base_url}/Committees/MeetingsUpcoming.aspx?Chamber=H",
    "Senate": f"{base_url}/Committees/MeetingsUpcoming.aspx?Chamber=S",
}

# Function to clean text
def normalize_text(text):
    """Return *text* reduced to plain ASCII.

    The string is first decomposed with Unicode NFKD normalization, so
    accented letters split into a base letter plus combining marks; any
    character that still has no ASCII representation is then dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    # 'ignore' silently discards whatever cannot be encoded as ASCII.
    ascii_bytes = decomposed.encode('ASCII', 'ignore')
    return ascii_bytes.decode('utf-8')

# Function to extract weekday
def extract_weekday(date_text):
    """Return the weekday name (e.g. ``"Monday"``) of the first date in *date_text*.

    Dates are recognised in the form ``<month> <day>, <year>``, where the
    month may be a full name ("March") or an abbreviation ("Mar" / "Mar.").
    Any whitespace, including non-breaking spaces that come out of scraped
    HTML, may separate the pieces.

    Every candidate in the text is tried in order, so a look-alike such as
    "Room 12, 2025" does not hide a real date that follows it.

    Args:
        date_text: Free-form text that may contain a date.

    Returns:
        The weekday name produced by ``strftime("%A")``, or ``"Unknown"``
        when the text is empty or contains no parseable date.
    """
    if not date_text:
        return "Unknown"

    # `\s` also matches "\xa0", which a literal space in the pattern would not.
    candidates = re.findall(r"([A-Za-z]+)\.?\s+(\d{1,2}),\s*(\d{4})", date_text)
    for month, day, year in candidates:
        # Rebuild the date with canonical spacing before handing it to strptime.
        canonical = f"{month} {day}, {year}"
        for fmt in ("%B %d, %Y", "%b %d, %Y"):
            try:
                return datetime.strptime(canonical, fmt).strftime("%A")
            except ValueError:
                # Not a month name in this format (or an impossible date);
                # try the next format, then the next candidate.
                continue
    return "Unknown"

# Scrape data

# Seconds to wait on a single HTTP request before giving up, so one stalled
# connection cannot hang the whole run.
REQUEST_TIMEOUT = 30

# Phrase used to locate where a caption begins inside a bill cell.
CAPTION_MARKER = "Relating to"


def fetch_soup(url):
    """Download *url* and return its HTML parsed with BeautifulSoup.

    Raises:
        requests.RequestException: on connection problems, timeouts, or an
            HTTP error status (so error pages are never parsed as content).
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")


def split_author_and_caption(text_parts):
    """Split the text lines of one bill cell into ``(author, caption)``.

    Args:
        text_parts: The cell's text split into lines. The first non-blank
            line is taken to be the bill number; the author and caption
            follow it.

    Returns:
        A ``(author, caption)`` tuple of whitespace-normalised strings.
        * If the text contains "Relating to", everything before its first
          occurrence is the author and everything from it onward is the
          caption, so a multi-line author never leaks into the caption.
        * Otherwise the first line after the bill number is the author and
          the remaining lines, prefixed with "Relating to ", are the caption.
        * ``author`` is ``"Unknown"`` when no author text is present, and
          ``caption`` is ``""`` when there is no caption text at all.
    """
    # Collapse runs of whitespace (str.split also treats "\xa0" as
    # whitespace) and drop lines that end up empty, so blank lines in the
    # HTML cannot shift the author/caption positions.
    lines = [" ".join(part.split()) for part in text_parts]
    lines = [line for line in lines if line]

    # lines[0] is the bill number; the author and caption live in the rest.
    remainder = " ".join(lines[1:])
    if not remainder:
        return "Unknown", ""

    author, marker, tail = remainder.partition(CAPTION_MARKER)
    if marker:
        return author.strip() or "Unknown", (marker + tail).strip()

    # No caption marker anywhere: keep the author to its own line and give
    # the caption the marker prefix, but only when there is caption text.
    caption_body = " ".join(lines[2:])
    caption = f"{CAPTION_MARKER} {caption_body}" if caption_body else ""
    return lines[1], caption


# The guard lets the helpers above be imported without touching the network.
# In a notebook cell or a script `__name__` is "__main__", so the scrape runs
# and its variables remain globals.
if __name__ == "__main__":
    data = []
    for chamber, url in urls.items():
        soup = fetch_soup(url)

        meeting_links = [
            base_url + a["href"]
            for a in soup.find_all("a", href=True)
            if a["href"].endswith(".HTM")
        ]

        for link in meeting_links:
            try:
                meeting_soup = fetch_soup(link)
            except requests.RequestException as exc:
                # One unreachable notice should not discard everything
                # already collected; report it and move on.
                print(f"Skipping {link}: {exc}")
                continue

            paragraphs = meeting_soup.find_all("p")

            committee_name = next(
                (p.text.split(":")[-1].strip() for p in paragraphs if "COMMITTEE:" in p.text),
                "Unknown Committee",
            )
            committee_name = re.sub(r"\s+", " ", committee_name).strip()  # Clean extra spaces

            meeting_day = next(
                (extract_weekday(p.text) for p in paragraphs if "TIME & DATE:" in p.text),
                "Unknown",
            )

            for td in meeting_soup.find_all("td"):
                # href=True skips anchors without an href attribute, which
                # would otherwise raise KeyError on the lookup below.
                bill_link = td.find("a", href=True)
                if not bill_link or "Bill=" not in bill_link["href"]:
                    continue

                bill_number = bill_link.text.strip()
                full_text = td.get_text("\n").strip()
                text_parts = full_text.split("\n")
                bill_author, caption = split_author_and_caption(text_parts)

                # Append Cleaned Data
                data.append([chamber, meeting_day, committee_name, bill_number, bill_author, caption])

    # Convert to DataFrame and Save
    df = pd.DataFrame(data, columns=["Chamber", "Day", "Committee Name", "Bill Number", "Bill Author", "Caption"])
    df["Stance"] = ""  # Empty column left for the reader to fill in.
    df.to_csv("bills.csv", index=False)

    print("✅ Scraper completed! 'bills.csv' saved successfully.")


✅ Scraper completed! 'bills.csv' saved successfully.


In [None]:
# NOTE(review): this cell defines no meeting loop of its own. It reads
# `meeting_soup`, `chamber`, `meeting_day`, `committee_name` and `data` from
# the notebook's global state, so it only processes whichever meeting page
# was fetched last and appends those rows to the existing `data` list.
# It looks like an earlier draft of the scraper's inner loop; confirm before
# running, as it would add duplicate rows.
for td in meeting_soup.find_all("td"):
    # href=True skips anchors that have no href attribute, which would
    # otherwise raise KeyError on the lookup below.
    bill_link = td.find("a", href=True)
    if not bill_link or "Bill=" not in bill_link["href"]:
        continue

    bill_number = bill_link.text.strip()
    full_text = td.get_text("\n").strip()

    # Collapse whitespace (str.split also removes "\r" and "\xa0") and drop
    # blank lines so they cannot shift the author/caption positions.
    text_parts = [" ".join(part.split()) for part in full_text.split("\n")]
    text_parts = [part for part in text_parts if part]

    # text_parts[0] is the bill number; author and caption follow it.
    remainder = " ".join(text_parts[1:])
    author_text, marker, caption_tail = remainder.partition("Relating to")

    if marker:
        # Everything before the first "Relating to" is the author, so a
        # multi-line author never leaks into the caption.
        bill_author = author_text.strip() or "Unknown"
        caption = (marker + caption_tail).strip()
    elif remainder:
        # No marker found: the first line is the author, the rest is the
        # caption, prefixed only when there is caption text to prefix.
        bill_author = text_parts[1]
        caption_body = " ".join(text_parts[2:])
        caption = "Relating to " + caption_body if caption_body else ""
    else:
        bill_author, caption = "Unknown", ""

    # Append data
    data.append([chamber, meeting_day, committee_name, bill_number, bill_author, caption])
