In [None]:
import os
import re

import requests
from bs4 import BeautifulSoup

def clean_text(text):
    """Normalize whitespace in ``text``.

    Every run of whitespace characters is collapsed to a single space, and
    any whitespace left at the start or end of the string is trimmed.
    """
    return re.sub(r'\s+', ' ', text).strip()

def scrape_pittsburgh_data():
    """Scrape the collection-schedule page for its Collection Map text and Holiday tables.

    Returns:
        list[str]: Cleaned text lines in page order: the "Collection Map"
        heading and the ``<p>`` siblings that directly follow it, then, for
        every table whose text contains "Holiday", one ``" | "``-joined line
        per row followed by the ``<p>`` siblings directly after the table.
        The list is empty when the response status is not 200 (a failure
        message is printed in that case).

    Raises:
        requests.exceptions.RequestException: if the request fails or times out.
    """
    url = "https://www.pittsburghpa.gov/Resident-Services/Trash-Recycling/Collection-Schedule-and-Newsletter"
    # Bound the request so a stalled connection cannot hang the kernel forever.
    response = requests.get(url, timeout=30)
    all_data = []

    if response.status_code != 200:
        print(f"Failed to retrieve page: {response.status_code}")
        return all_data

    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract heading for Collection Map
    collection_map_section = soup.find('h2', string=lambda text: text and 'Collection Map' in text)
    if collection_map_section:
        all_data.append(clean_text(collection_map_section.get_text()))

        # Extract the run of paragraphs immediately under the heading;
        # stop at the first sibling that is not a <p>.
        next_sibling = collection_map_section.find_next_sibling()
        while next_sibling and next_sibling.name == 'p':
            all_data.append(clean_text(next_sibling.get_text()))
            next_sibling = next_sibling.find_next_sibling()

    # Extract every table that mentions "Holiday"
    for table in soup.find_all('table'):
        if 'Holiday' not in table.get_text():
            continue

        for row in table.find_all('tr'):
            # Clean first, then filter: a cell holding only whitespace must be
            # dropped, otherwise it leaves an empty " |  | " segment in the row.
            cell_texts = [clean_text(cell.get_text()) for cell in row.find_all(['th', 'td'])]
            row_text = " | ".join(cell_text for cell_text in cell_texts if cell_text)
            if row_text:
                all_data.append(row_text)

        # Extract the run of paragraphs immediately under the table
        table_next_sibling = table.find_next_sibling()
        while table_next_sibling and table_next_sibling.name == 'p':
            all_data.append(clean_text(table_next_sibling.get_text()))
            table_next_sibling = table_next_sibling.find_next_sibling()

    return all_data

# Run the scraper
news_data = scrape_pittsburgh_data()

# Save data to a TXT file, one scraped entry per line.
output_path = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files/pittsburgh_collection_schedule.txt"
# Create the target directory first (as the other scraper cells do) so that
# open() does not fail with FileNotFoundError on a fresh machine.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w", encoding="utf-8") as file:
    file.write("\n".join(news_data))

print("Scraping complete. Data saved to pittsburgh_collection_schedule.txt")

Scraping complete. Data saved to pittsburgh_collection_schedule.txt


In [19]:
import requests
from bs4 import BeautifulSoup
import os

url = "https://www.pittsburghpa.gov/Resident-Services/Trash-Recycling/Commercial-Recycling"

# Fetch the page. The timeout keeps a stalled connection from hanging the
# kernel; raise_for_status() aborts the cell on any 4xx/5xx response.
response = requests.get(url, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')

# Only the first <div id="main-content"> is scraped.
main_content = soup.find('div', {'id': 'main-content'})

output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "commercial_recycling.txt")

if main_content:
    tags_to_scrape = ['h1', 'h2', 'h3', 'p', 'ul', 'li']
    content_tags = main_content.find_all(tags_to_scrape)
    prev_text = ""  # raw (unpadded) text of the most recent heading written

    with open(output_file, 'w', encoding='utf-8') as f:
        for tag in content_tags:
            text = tag.get_text(strip=True)

            if tag.name in ['h1', 'h2', 'h3']:
                # Skip a heading that repeats the previous heading.
                if text == prev_text:
                    continue
                # Remember the raw text BEFORE padding it with newlines. Storing
                # the padded string would never equal a later heading's stripped
                # text, so the duplicate check above would never fire.
                prev_text = text
                text = f"\n{text}\n"
            elif tag.name == 'ul':
                text = "\n"  # Add spacing before unordered lists
            elif tag.name == 'li':
                # An <li> whose enclosing <ul> sits inside another <li> is a sub-bullet.
                parent = tag.find_parent('ul')
                if parent and parent.find_parent('li'):
                    text = '    * ' + text  # Sub-bullet
                else:
                    text = '- ' + text  # Main bullet

            if text:
                f.write(text + '\n')
    print(f"Data successfully saved to {output_file}")
else:
    print("Could not find main content.")


Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/commercial_recycling.txt


In [18]:
import requests
from bs4 import BeautifulSoup
import os

url = "https://www.pittsburghpa.gov/Resident-Services/Trash-Recycling/Curbside-Pick-Up"

# Fetch the page. The timeout keeps a stalled connection from hanging the
# kernel; raise_for_status() aborts the cell on any 4xx/5xx response.
response = requests.get(url, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')

# Only the first <div id="main-content"> is scraped.
main_content = soup.find('div', {'id': 'main-content'})

output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "curbside_pickup.txt")

if main_content:
    tags_to_scrape = ['h1', 'h2', 'h3', 'p', 'ul', 'li']
    content_tags = main_content.find_all(tags_to_scrape)
    prev_text = ""  # raw (unpadded) text of the most recent heading written

    with open(output_file, 'w', encoding='utf-8') as f:
        for tag in content_tags:
            text = tag.get_text(strip=True)

            if tag.name in ['h1', 'h2', 'h3']:
                # Skip a heading that repeats the previous heading.
                if text == prev_text:
                    continue
                # Remember the raw text BEFORE padding it with newlines. Storing
                # the padded string would never equal a later heading's stripped
                # text, so the duplicate check above would never fire.
                prev_text = text
                text = f"\n{text}\n"
            elif tag.name == 'ul':
                text = "\n"  # Add spacing before unordered lists
            elif tag.name == 'li':
                # An <li> whose enclosing <ul> sits inside another <li> is a sub-bullet.
                parent = tag.find_parent('ul')
                if parent and parent.find_parent('li'):
                    text = '    * ' + text  # Sub-bullet
                else:
                    text = '- ' + text  # Main bullet

            if text:
                f.write(text + '\n')
    print(f"Data successfully saved to {output_file}")
else:
    print("Could not find main content.")


Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/curbside_pickup.txt


In [20]:
import requests
from bs4 import BeautifulSoup
import os

url = "https://www.pittsburghpa.gov/Resident-Services/Trash-Recycling/Copy-of-Special-Event-Recycling-1"

# Fetch the page. The timeout keeps a stalled connection from hanging the
# kernel; raise_for_status() aborts the cell on any 4xx/5xx response.
response = requests.get(url, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')

# Only the first <div id="main-content"> is scraped.
main_content = soup.find('div', {'id': 'main-content'})

output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "special_event_recycling.txt")

if main_content:
    tags_to_scrape = ['h1', 'h2', 'h3', 'p', 'ul', 'li']
    content_tags = main_content.find_all(tags_to_scrape)
    prev_text = ""  # raw (unpadded) text of the most recent heading written

    with open(output_file, 'w', encoding='utf-8') as f:
        for tag in content_tags:
            text = tag.get_text(strip=True)

            if tag.name in ['h1', 'h2', 'h3']:
                # Skip a heading that repeats the previous heading.
                if text == prev_text:
                    continue
                # Remember the raw text BEFORE padding it with newlines. Storing
                # the padded string would never equal a later heading's stripped
                # text, so the duplicate check above would never fire.
                prev_text = text
                text = f"\n{text}\n"
            elif tag.name == 'ul':
                text = "\n"  # Add spacing before unordered lists
            elif tag.name == 'li':
                # An <li> whose enclosing <ul> sits inside another <li> is a sub-bullet.
                parent = tag.find_parent('ul')
                if parent and parent.find_parent('li'):
                    text = '    * ' + text  # Sub-bullet
                else:
                    text = '- ' + text  # Main bullet

            if text:
                f.write(text + '\n')
    print(f"Data successfully saved to {output_file}")
else:
    print("Could not find main content.")


Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/special_event_recycling.txt


In [54]:
import requests
from bs4 import BeautifulSoup
import os

urls = [
    "https://www.pittsburghpa.gov/Resident-Services/Trash-Recycling/Drop-Off-Info-Additional-Resources",
    "https://www.pittsburghpa.gov/Resident-Services/Trash-Recycling/Drop-Off-Info-Additional-Resources/Electronic-Waste-and-Household-Hazardous-Waste-Disposal",
    "https://www.pittsburghpa.gov/Resident-Services/Trash-Recycling/Drop-Off-Info-Additional-Resources/Recycling-Drop-Off-Locations",
    "https://www.pittsburghpa.gov/Resident-Services/Trash-Recycling/Drop-Off-Info-Additional-Resources/Waste-Disposal-Resources",
    "https://www.pittsburghpa.gov/Resident-Services/Trash-Recycling/Drop-Off-Info-Additional-Resources/Christmas-Tree-Recycling",
    "https://www.pittsburghpa.gov/Resident-Services/Trash-Recycling/Drop-Off-Info-Additional-Resources/Zero-Waste"
]

output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "drop_off_info.txt")

# Every page is written, in list order, into one output file.
with open(output_file, 'w', encoding='utf-8') as f:
    for url in urls:
        # The timeout keeps a stalled connection from hanging the kernel;
        # raise_for_status() aborts the cell on any 4xx/5xx response.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        if not main_content:
            # Report the skipped page instead of silently dropping it.
            print(f"Could not find main content for {url}")
            continue

        tags_to_scrape = ['h1', 'h2', 'h3', 'p', 'ul', 'li']
        content_tags = main_content.find_all(tags_to_scrape)
        prev_text = ""  # raw (unpadded) text of the last heading written for this page

        for tag in content_tags:
            text = tag.get_text(strip=True)

            if tag.name in ['h1', 'h2', 'h3']:
                # Skip a heading that repeats the previous heading.
                if text == prev_text:
                    continue
                # Remember the raw text BEFORE padding it with newlines. Storing
                # the padded string would never equal a later heading's stripped
                # text, so the duplicate check above would never fire.
                prev_text = text
                text = f"\n{text}\n"
            elif tag.name == 'ul':
                text = "\n"  # Add spacing before unordered lists
            elif tag.name == 'li':
                # An <li> whose enclosing <ul> sits inside another <li> is a sub-bullet.
                parent = tag.find_parent('ul')
                if parent and parent.find_parent('li'):
                    text = '    * ' + text  # Sub-bullet
                else:
                    text = '- ' + text  # Main bullet

            if text:
                f.write(text + '\n')
print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/drop_off_info.txt


In [26]:
import requests
from bs4 import BeautifulSoup
import os

urls = [
    "https://www.pittsburghpa.gov/Resident-Services/Trash-Recycling/Anti-Litter-and-Illegal-Dumping",
    "https://www.pittsburghpa.gov/Resident-Services/Trash-Recycling/Anti-Litter-and-Illegal-Dumping/Frequently-Asked-Questions",
    "https://www.pittsburghpa.gov/Resident-Services/Trash-Recycling/Anti-Litter-and-Illegal-Dumping/Plastic-Bag-Ban",
    "https://www.pittsburghpa.gov/Resident-Services/Trash-Recycling/Anti-Litter-and-Illegal-Dumping/Volunteer-Applications",
    "https://www.pittsburghpa.gov/Resident-Services/Trash-Recycling/Anti-Litter-and-Illegal-Dumping/Additional-Organization-Resources",
    # Trailing spaces inside this literal were removed; they were part of the requested URL.
    "https://www.pittsburghpa.gov/Resident-Services/Trash-Recycling/Anti-Litter-and-Illegal-Dumping/Litter-Index"
]

output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "anti_litter.txt")

# Every page is written, in list order, into one output file.
with open(output_file, 'w', encoding='utf-8') as f:
    for url in urls:
        # The timeout keeps a stalled connection from hanging the kernel;
        # raise_for_status() aborts the cell on any 4xx/5xx response.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        if not main_content:
            # Report the skipped page instead of silently dropping it.
            print(f"Could not find main content for {url}")
            continue

        tags_to_scrape = ['h1', 'h2', 'h3', 'p', 'ul', 'li']
        content_tags = main_content.find_all(tags_to_scrape)
        prev_text = ""  # raw (unpadded) text of the last heading written for this page

        for tag in content_tags:
            text = tag.get_text(strip=True)

            if tag.name in ['h1', 'h2', 'h3']:
                # Skip a heading that repeats the previous heading.
                if text == prev_text:
                    continue
                # Remember the raw text BEFORE padding it with newlines. Storing
                # the padded string would never equal a later heading's stripped
                # text, so the duplicate check above would never fire.
                prev_text = text
                text = f"\n{text}\n"
            elif tag.name == 'ul':
                text = "\n"  # Add spacing before unordered lists
            elif tag.name == 'li':
                # An <li> whose enclosing <ul> sits inside another <li> is a sub-bullet.
                parent = tag.find_parent('ul')
                if parent and parent.find_parent('li'):
                    text = '    * ' + text  # Sub-bullet
                else:
                    text = '- ' + text  # Main bullet

            if text:
                f.write(text + '\n')
print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/anti_litter.txt


In [29]:
import requests
from bs4 import BeautifulSoup
import os

urls = [
    "https://www.pittsburghpa.gov/Resident-Services/311",
    "https://www.pittsburghpa.gov/Resident-Services/311/Contacting-311"
]

output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "Service_311.txt")

# Every page is written, in list order, into one output file.
with open(output_file, 'w', encoding='utf-8') as f:
    for url in urls:
        # The timeout keeps a stalled connection from hanging the kernel;
        # raise_for_status() aborts the cell on any 4xx/5xx response.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        if not main_content:
            # Report the skipped page instead of silently dropping it.
            print(f"Could not find main content for {url}")
            continue

        tags_to_scrape = ['h1', 'h2', 'h3', 'p', 'ul', 'li']
        content_tags = main_content.find_all(tags_to_scrape)
        prev_text = ""  # raw (unpadded) text of the last heading written for this page

        for tag in content_tags:
            text = tag.get_text(strip=True)

            if tag.name in ['h1', 'h2', 'h3']:
                # Skip a heading that repeats the previous heading.
                if text == prev_text:
                    continue
                # Remember the raw text BEFORE padding it with newlines. Storing
                # the padded string would never equal a later heading's stripped
                # text, so the duplicate check above would never fire.
                prev_text = text
                text = f"\n{text}\n"
            elif tag.name == 'ul':
                text = "\n"  # Add spacing before unordered lists
            elif tag.name == 'li':
                # An <li> whose enclosing <ul> sits inside another <li> is a sub-bullet.
                parent = tag.find_parent('ul')
                if parent and parent.find_parent('li'):
                    text = '    * ' + text  # Sub-bullet
                else:
                    text = '- ' + text  # Main bullet

            if text:
                f.write(text + '\n')
print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/Service_311.txt


In [None]:
import requests
from bs4 import BeautifulSoup
import os

urls = [
    "https://www.pittsburghpa.gov/Resident-Services/ADA-Disability-Rights",
    # NOTE(review): the original author marked this URL as "didn't work" --
    # presumably the engage.* page has no div#main-content, in which case the
    # loop below now reports it as skipped. TODO confirm and drop or handle it.
    "https://engage.pittsburghpa.gov/disability-rights-ada",
    "https://www.pittsburghpa.gov/Resident-Services/ADA-Disability-Rights/Disability-Resources-Center",
    "https://www.pittsburghpa.gov/Resident-Services/ADA-Disability-Rights/Statutes-Policies",
]

output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "ADA-Disabiltiy-Rights1.txt")

# Every page is written, in list order, into one output file.
with open(output_file, 'w', encoding='utf-8') as f:
    for url in urls:
        # The timeout keeps a stalled connection from hanging the kernel;
        # raise_for_status() aborts the cell on any 4xx/5xx response.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        if not main_content:
            # Report the skipped page instead of silently dropping it.
            print(f"Could not find main content for {url}")
            continue

        tags_to_scrape = ['h1', 'h2', 'h3', 'p', 'ul', 'li']
        content_tags = main_content.find_all(tags_to_scrape)
        prev_text = ""  # raw (unpadded) text of the last heading written for this page

        for tag in content_tags:
            text = tag.get_text(strip=True)

            if tag.name in ['h1', 'h2', 'h3']:
                # Skip a heading that repeats the previous heading.
                if text == prev_text:
                    continue
                # Remember the raw text BEFORE padding it with newlines. Storing
                # the padded string would never equal a later heading's stripped
                # text, so the duplicate check above would never fire.
                prev_text = text
                text = f"\n{text}\n"
            elif tag.name == 'ul':
                text = "\n"  # Add spacing before unordered lists
            elif tag.name == 'li':
                # An <li> whose enclosing <ul> sits inside another <li> is a sub-bullet.
                parent = tag.find_parent('ul')
                if parent and parent.find_parent('li'):
                    text = '    * ' + text  # Sub-bullet
                else:
                    text = '- ' + text  # Main bullet

            if text:
                f.write(text + '\n')
print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/ADA-Disabiltiy-Rights1.txt


In [41]:
import requests
from bs4 import BeautifulSoup
import os

urls = [
    "https://www.pittsburghpa.gov/Resident-Services/ADA-Disability-Rights/City-County-Task-Force-on-Disability",
    "https://www.pittsburghpa.gov/Resident-Services/ADA-Disability-Rights/Make-an-Accommodations-Request",
    "https://www.pittsburghpa.gov/Resident-Services/ADA-Disability-Rights/CCB-Accessibility-Entrance",
    "https://www.pittsburghpa.gov/Resident-Services/ADA-Disability-Rights/File-a-Grievance",
    "https://www.pittsburghpa.gov/City-Government/Legal-Services/Office-of-Equal-Protection/Office-of-Equal-Protection-News",
    "https://www.pittsburghpa.gov/Resident-Services/ADA-Disability-Rights/FAQs"
]

output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "ADA-Disabiltiy-Rights2.txt")

# Every page is written, in list order, into one output file.
with open(output_file, 'w', encoding='utf-8') as f:
    for url in urls:
        # The timeout keeps a stalled connection from hanging the kernel;
        # raise_for_status() aborts the cell on any 4xx/5xx response.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        if not main_content:
            # Report the skipped page instead of silently dropping it.
            print(f"Could not find main content for {url}")
            continue

        tags_to_scrape = ['h1', 'h2', 'h3', 'p', 'ul', 'li']
        content_tags = main_content.find_all(tags_to_scrape)
        prev_text = ""  # raw (unpadded) text of the last heading written for this page

        for tag in content_tags:
            text = tag.get_text(strip=True)

            if tag.name in ['h1', 'h2', 'h3']:
                # Skip a heading that repeats the previous heading.
                if text == prev_text:
                    continue
                # Remember the raw text BEFORE padding it with newlines. Storing
                # the padded string would never equal a later heading's stripped
                # text, so the duplicate check above would never fire.
                prev_text = text
                text = f"\n{text}\n"
            elif tag.name == 'ul':
                text = "\n"  # Add spacing before unordered lists
            elif tag.name == 'li':
                # An <li> whose enclosing <ul> sits inside another <li> is a sub-bullet.
                parent = tag.find_parent('ul')
                if parent and parent.find_parent('li'):
                    text = '    * ' + text  # Sub-bullet
                else:
                    text = '- ' + text  # Main bullet

            if text:
                f.write(text + '\n')
print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/ADA-Disabiltiy-Rights2.txt


In [45]:
import requests
from bs4 import BeautifulSoup
import os

urls = [
    "https://www.pittsburghpa.gov/Resident-Services/Snow-Removal",
    "https://www.pittsburghpa.gov/Resident-Services/Snow-Removal/Snow-Plow-Tracker",
    "https://www.pittsburghpa.gov/Resident-Services/Snow-Removal/Snow-Ice"
]

output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "Snow_Removal.txt")

# Every page is written, in list order, into one output file.
with open(output_file, 'w', encoding='utf-8') as f:
    for url in urls:
        # The timeout keeps a stalled connection from hanging the kernel;
        # raise_for_status() aborts the cell on any 4xx/5xx response.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        if not main_content:
            # Report the skipped page instead of silently dropping it.
            print(f"Could not find main content for {url}")
            continue

        tags_to_scrape = ['h1', 'h2', 'h3', 'p', 'ul', 'li']
        content_tags = main_content.find_all(tags_to_scrape)
        prev_text = ""  # raw (unpadded) text of the last heading written for this page

        for tag in content_tags:
            text = tag.get_text(strip=True)

            if tag.name in ['h1', 'h2', 'h3']:
                # Skip a heading that repeats the previous heading.
                if text == prev_text:
                    continue
                # Remember the raw text BEFORE padding it with newlines. Storing
                # the padded string would never equal a later heading's stripped
                # text, so the duplicate check above would never fire.
                prev_text = text
                text = f"\n{text}\n"
            elif tag.name == 'ul':
                text = "\n"  # Add spacing before unordered lists
            elif tag.name == 'li':
                # An <li> whose enclosing <ul> sits inside another <li> is a sub-bullet.
                parent = tag.find_parent('ul')
                if parent and parent.find_parent('li'):
                    text = '    * ' + text  # Sub-bullet
                else:
                    text = '- ' + text  # Main bullet

            if text:
                f.write(text + '\n')
print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/Snow_Removal.txt


In [46]:
import requests
from bs4 import BeautifulSoup
import os

urls = [
    "https://www.pittsburghpa.gov/Resident-Services/Road-Maintenance",
    "https://www.pittsburghpa.gov/Resident-Services/Road-Maintenance/Street-Maintenance/Street-Sweeping",
    "https://www.pittsburghpa.gov/Resident-Services/Road-Maintenance/Street-Maintenance/Disaster-Response",
    "https://www.pittsburghpa.gov/Resident-Services/Road-Maintenance/Paving-Schedule",
    "https://www.pittsburghpa.gov/Resident-Services/Road-Maintenance/Paving-Schedule/Pittsburgh-Paving-Program",
    "https://www.pittsburghpa.gov/Resident-Services/Road-Maintenance/Paving-Schedule/Curb-Ramp-Schedule",
    "https://www.pittsburghpa.gov/Resident-Services/Road-Maintenance/Paving-Schedule/PWSA-Paving-Schedule",
    "https://www.pittsburghpa.gov/Resident-Services/Road-Maintenance/Potholes",
    "https://www.pittsburghpa.gov/Resident-Services/Road-Maintenance/Road-Safety",
    "https://www.pittsburghpa.gov/Resident-Services/Road-Maintenance/Road-Safety/Traffic-Calming",
    "https://www.pittsburghpa.gov/Resident-Services/Road-Maintenance/Road-Safety/Autonomous-Technology"
]

output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "Road_Maintenance.txt")

# Every page is written, in list order, into one output file.
with open(output_file, 'w', encoding='utf-8') as f:
    for url in urls:
        # The timeout keeps a stalled connection from hanging the kernel;
        # raise_for_status() aborts the cell on any 4xx/5xx response.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        if not main_content:
            # Report the skipped page instead of silently dropping it.
            print(f"Could not find main content for {url}")
            continue

        tags_to_scrape = ['h1', 'h2', 'h3', 'p', 'ul', 'li']
        content_tags = main_content.find_all(tags_to_scrape)
        prev_text = ""  # raw (unpadded) text of the last heading written for this page

        for tag in content_tags:
            text = tag.get_text(strip=True)

            if tag.name in ['h1', 'h2', 'h3']:
                # Skip a heading that repeats the previous heading.
                if text == prev_text:
                    continue
                # Remember the raw text BEFORE padding it with newlines. Storing
                # the padded string would never equal a later heading's stripped
                # text, so the duplicate check above would never fire.
                prev_text = text
                text = f"\n{text}\n"
            elif tag.name == 'ul':
                text = "\n"  # Add spacing before unordered lists
            elif tag.name == 'li':
                # An <li> whose enclosing <ul> sits inside another <li> is a sub-bullet.
                parent = tag.find_parent('ul')
                if parent and parent.find_parent('li'):
                    text = '    * ' + text  # Sub-bullet
                else:
                    text = '- ' + text  # Main bullet

            if text:
                f.write(text + '\n')
print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/Road_Maintenance.txt


In [47]:
import requests
from bs4 import BeautifulSoup
import os

urls = [
    "https://www.pittsburghpa.gov/Resident-Services/Food-Programs",
    "https://www.pittsburghpa.gov/Resident-Services/Food-Programs/After-School-Feeding-Program",
    "https://www.pittsburghpa.gov/Resident-Services/Food-Programs/Summer-Food-Program",
    "https://www.pittsburghpa.gov/Resident-Services/Food-Programs/BigBurgh",
]

output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "Food_Program.txt")

# Every page is written, in list order, into one output file.
with open(output_file, 'w', encoding='utf-8') as f:
    for url in urls:
        # The timeout keeps a stalled connection from hanging the kernel;
        # raise_for_status() aborts the cell on any 4xx/5xx response.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        if not main_content:
            # Report the skipped page instead of silently dropping it.
            print(f"Could not find main content for {url}")
            continue

        tags_to_scrape = ['h1', 'h2', 'h3', 'p', 'ul', 'li']
        content_tags = main_content.find_all(tags_to_scrape)
        prev_text = ""  # raw (unpadded) text of the last heading written for this page

        for tag in content_tags:
            text = tag.get_text(strip=True)

            if tag.name in ['h1', 'h2', 'h3']:
                # Skip a heading that repeats the previous heading.
                if text == prev_text:
                    continue
                # Remember the raw text BEFORE padding it with newlines. Storing
                # the padded string would never equal a later heading's stripped
                # text, so the duplicate check above would never fire.
                prev_text = text
                text = f"\n{text}\n"
            elif tag.name == 'ul':
                text = "\n"  # Add spacing before unordered lists
            elif tag.name == 'li':
                # An <li> whose enclosing <ul> sits inside another <li> is a sub-bullet.
                parent = tag.find_parent('ul')
                if parent and parent.find_parent('li'):
                    text = '    * ' + text  # Sub-bullet
                else:
                    text = '- ' + text  # Main bullet

            if text:
                f.write(text + '\n')
print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/Food_Program.txt


In [55]:


import requests
from bs4 import BeautifulSoup
import os

urls = [
    "https://www.pittsburghpa.gov/Safety/Animal-Care-Control",
    "https://www.pittsburghpa.gov/Safety/Animal-Care-Control/For-Pet-Owners",
    "https://www.pittsburghpa.gov/Safety/Animal-Care-Control/Dog-Licenses",
    "https://www.pittsburghpa.gov/Safety/Animal-Care-Control/SpayNeuter-Program",
    "https://www.pittsburghpa.gov/Safety/Animal-Care-Control/Stray-Animals",
    "https://www.pittsburghpa.gov/Safety/Animal-Care-Control/Rodent-Baiting-Program",
    "https://www.pittsburghpa.gov/Safety/Animal-Care-Control/Beekeeping-and-Farm-Animal-Permits",
    "https://www.pittsburghpa.gov/Safety/Animal-Care-Control/Living-with-Wildlife",
    "https://www.pittsburghpa.gov/Safety/Animal-Care-Control/Animal-Control-FAQs",
    "https://www.pittsburghpa.gov/Safety/Animal-Care-Control/Animal-Control-FAQs/Contact-Animal-Control"
]

output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "Animal_Care_Control.txt")

# Every page is written, in list order, into one output file.
with open(output_file, 'w', encoding='utf-8') as f:
    for url in urls:
        # The timeout keeps a stalled connection from hanging the kernel;
        # raise_for_status() aborts the cell on any 4xx/5xx response.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        if not main_content:
            # Report the skipped page instead of silently dropping it.
            print(f"Could not find main content for {url}")
            continue

        tags_to_scrape = ['h1', 'h2', 'h3', 'p', 'ul', 'li']
        content_tags = main_content.find_all(tags_to_scrape)
        prev_text = ""  # raw (unpadded) text of the last heading written for this page

        for tag in content_tags:
            text = tag.get_text(strip=True)

            if tag.name in ['h1', 'h2', 'h3']:
                # Skip a heading that repeats the previous heading.
                if text == prev_text:
                    continue
                # Remember the raw text BEFORE padding it with newlines. Storing
                # the padded string would never equal a later heading's stripped
                # text, so the duplicate check above would never fire.
                prev_text = text
                text = f"\n{text}\n"
            elif tag.name == 'ul':
                text = "\n"  # Add spacing before unordered lists
            elif tag.name == 'li':
                # An <li> whose enclosing <ul> sits inside another <li> is a sub-bullet.
                parent = tag.find_parent('ul')
                if parent and parent.find_parent('li'):
                    text = '    * ' + text  # Sub-bullet
                else:
                    text = '- ' + text  # Main bullet

            if text:
                f.write(text + '\n')
print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/Animal_Care_Control.txt


In [51]:


import requests
from bs4 import BeautifulSoup
import os

urls = [
    "https://www.pittsburghpa.gov/Resident-Services/Community-Programming",
    "https://www.pittsburghpa.gov/Resident-Services/Community-Programming/Love-Your-Block",
    "https://www.pittsburghpa.gov/Resident-Services/Community-Programming/Love-Your-Block/Guidelines",
    "https://www.pittsburghpa.gov/Resident-Services/Community-Programming/Love-Your-Block/Apply",
    "https://www.pittsburghpa.gov/Resident-Services/Community-Programming/City-Cuts",
    # NOTE(review): the `_gl=...` query string looks like an analytics tracking
    # parameter copied from a browser -- TODO confirm it can be dropped.
    "https://engage.pittsburghpa.gov/snow-angels?_gl=1*7hj524*_ga*OTI0NDY2NzgyLjE2MDcyMDkxNjg.*_ga_6YKGFYBQNH*MTczMDA3Mjc4OS44NzQuMS4xNzMwMDczNjc2LjAuMC4w",
    "https://www.pittsburghpa.gov/Resident-Services/Community-Programming/Black-Pittsburgh-Matters"
]

output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "Community_Programming.txt")

# Every page is written, in list order, into one output file.
with open(output_file, 'w', encoding='utf-8') as f:
    for url in urls:
        # The timeout keeps a stalled connection from hanging the kernel;
        # raise_for_status() aborts the cell on any 4xx/5xx response.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        if not main_content:
            # Report the skipped page instead of silently dropping it.
            print(f"Could not find main content for {url}")
            continue

        tags_to_scrape = ['h1', 'h2', 'h3', 'p', 'ul', 'li']
        content_tags = main_content.find_all(tags_to_scrape)
        prev_text = ""  # raw (unpadded) text of the last heading written for this page

        for tag in content_tags:
            text = tag.get_text(strip=True)

            if tag.name in ['h1', 'h2', 'h3']:
                # Skip a heading that repeats the previous heading.
                if text == prev_text:
                    continue
                # Remember the raw text BEFORE padding it with newlines. Storing
                # the padded string would never equal a later heading's stripped
                # text, so the duplicate check above would never fire.
                prev_text = text
                text = f"\n{text}\n"
            elif tag.name == 'ul':
                text = "\n"  # Add spacing before unordered lists
            elif tag.name == 'li':
                # An <li> whose enclosing <ul> sits inside another <li> is a sub-bullet.
                parent = tag.find_parent('ul')
                if parent and parent.find_parent('li'):
                    text = '    * ' + text  # Sub-bullet
                else:
                    text = '- ' + text  # Main bullet

            if text:
                f.write(text + '\n')
print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/Community_Programming.txt


In [52]:


import requests
from bs4 import BeautifulSoup
import os

# Pages of the Office of Immigrant and Refugee Affairs section to scrape.
urls = [
    "https://www.pittsburghpa.gov/City-Government/Mayor/Office-of-Immigrant-and-Refugee-Affairs",
    "https://www.pittsburghpa.gov/City-Government/Mayor/Office-of-Immigrant-and-Refugee-Affairs/Welcoming-Communities-Network",
    "https://www.pittsburghpa.gov/City-Government/Mayor/Office-of-Immigrant-and-Refugee-Affairs/Renting-to-Refugees",
    "https://www.pittsburghpa.gov/City-Government/Mayor/Office-of-Immigrant-and-Refugee-Affairs/Immigrant-Refugee-Regional-Data",
    "https://www.pittsburghpa.gov/City-Government/Mayor/Office-of-Immigrant-and-Refugee-Affairs/Latinos-in-Pittsburgh",
    "https://www.pittsburghpa.gov/City-Government/Mayor/Office-of-Immigrant-and-Refugee-Affairs/Newsletter-Signup",
]

# All pages are written, in list order, to one text file.
output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "Office_of_immigrants.txt")

with open(output_file, 'w', encoding='utf-8') as f:
    # Strip stray whitespace and fetch each distinct URL once (order preserved).
    for url in dict.fromkeys(u.strip() for u in urls):
        # A timeout keeps one unresponsive server from hanging the cell forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        if main_content is None:
            # Make the skipped page visible instead of silently dropping it.
            print(f"Warning: no 'main-content' div at {url}; page skipped")
            continue

        last_heading = None  # raw text of the heading written immediately before

        for tag in main_content.find_all(['h1', 'h2', 'h3', 'p', 'ul', 'li']):
            text = tag.get_text(strip=True)

            if tag.name in ('h1', 'h2', 'h3'):
                # Compare against the raw heading text (not the newline-padded
                # form that gets written), otherwise the check can never match.
                if not text or text == last_heading:
                    continue
                last_heading = text
                f.write(f"\n{text}\n\n")
                continue

            if tag.name == 'ul':
                text = "\n"  # blank line before the list; items come from the <li> tags
            elif tag.name == 'li':
                parent = tag.find_parent('ul')
                nested = parent is not None and parent.find_parent('li') is not None
                text = ('    * ' if nested else '- ') + text

            if text:
                f.write(text + '\n')
                # Content separates headings, so an identical later heading is
                # no longer a consecutive duplicate.
                last_heading = None

print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/Office_of_immigrants.txt


In [53]:


import requests
from bs4 import BeautifulSoup
import os

# Pages of the Public Works Forestry Division section to scrape.
urls = [
    "https://www.pittsburghpa.gov/Business-Development/Public-Works/Forestry-Division",
    "https://www.pittsburghpa.gov/Business-Development/Public-Works/Forestry-Division/Pittsburgh-Urban-Forest",
    "https://www.pittsburghpa.gov/Business-Development/Public-Works/Forestry-Division/Special-Initiatives",
    "https://www.pittsburghpa.gov/Business-Development/Public-Works/Forestry-Division/Forestry-Partnerships",
    "https://www.pittsburghpa.gov/Business-Development/Public-Works/Forestry-Division/Forestry-Events",
    "https://www.pittsburghpa.gov/Business-Development/Public-Works/Forestry-Division/Tree-Planting",
    "https://www.pittsburghpa.gov/Business-Development/Public-Works/Forestry-Division/Request-Tree-Work",
    "https://www.pittsburghpa.gov/Business-Development/Public-Works/Forestry-Division/Appealing-Street-Tree-Removal",
    "https://www.pittsburghpa.gov/Business-Development/Public-Works/Forestry-Division/Forestry-Resource-Links",
    "https://www.pittsburghpa.gov/Business-Development/Public-Works/Forestry-Division/Forestry-FAQ",
    "https://www.pittsburghpa.gov/Business-Development/Public-Works/Forestry-Division/Vacant-Lot-Greening",
]

# All pages are written, in list order, to one text file.
output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "Forestry_division.txt")

with open(output_file, 'w', encoding='utf-8') as f:
    # Strip stray whitespace and fetch each distinct URL once (order preserved).
    for url in dict.fromkeys(u.strip() for u in urls):
        # A timeout keeps one unresponsive server from hanging the cell forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        if main_content is None:
            # Make the skipped page visible instead of silently dropping it.
            print(f"Warning: no 'main-content' div at {url}; page skipped")
            continue

        last_heading = None  # raw text of the heading written immediately before

        for tag in main_content.find_all(['h1', 'h2', 'h3', 'p', 'ul', 'li']):
            text = tag.get_text(strip=True)

            if tag.name in ('h1', 'h2', 'h3'):
                # Compare against the raw heading text (not the newline-padded
                # form that gets written), otherwise the check can never match.
                if not text or text == last_heading:
                    continue
                last_heading = text
                f.write(f"\n{text}\n\n")
                continue

            if tag.name == 'ul':
                text = "\n"  # blank line before the list; items come from the <li> tags
            elif tag.name == 'li':
                parent = tag.find_parent('ul')
                nested = parent is not None and parent.find_parent('li') is not None
                text = ('    * ' if nested else '- ') + text

            if text:
                f.write(text + '\n')
                # Content separates headings, so an identical later heading is
                # no longer a consecutive duplicate.
                last_heading = None

print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/Forestry_division.txt


In [58]:


import requests
from bs4 import BeautifulSoup
import os

# Pages of the Recreation & Events "Parks" section to scrape.
urls = [
    "https://www.pittsburghpa.gov/Recreation-Events/Parks",
    "https://www.pittsburghpa.gov/Recreation-Events/Parks/Our-Parks",
    "https://www.pittsburghpa.gov/Recreation-Events/Parks/For-Kids",
    "https://www.pittsburghpa.gov/Recreation-Events/Parks/For-Kids/Rec2Tech",
    "https://www.pittsburghpa.gov/Recreation-Events/Parks/For-Kids/KidSMART-Club",
    "https://www.pittsburghpa.gov/Recreation-Events/Parks/For-Kids/CitiCamp",
    "https://www.pittsburghpa.gov/Recreation-Events/Parks/For-Kids/Tot-Camp",
    "https://www.pittsburghpa.gov/Recreation-Events/Parks/For-Kids/Alphabet-Trail-Tales",
    "https://www.pittsburghpa.gov/Recreation-Events/Parks/For-Kids/Track-Treat",
    "https://www.pittsburghpa.gov/Recreation-Events/Parks/For-Seniors",
    "https://www.pittsburghpa.gov/Recreation-Events/Parks/For-Seniors/Senior-Community-Centers",
    "https://www.pittsburghpa.gov/Recreation-Events/Parks/Dog-Parks",
    # Trailing spaces removed from this URL literal.
    "https://www.pittsburghpa.gov/Recreation-Events/Parks/Dog-Parks/Dog-Park-Rules-Guidelines",
    "https://www.pittsburghpa.gov/Recreation-Events/Parks/Tennis",
    "https://www.pittsburghpa.gov/Recreation-Events/Parks/Tennis/Mellon-Park-Tennis-Center",
    "https://www.pittsburghpa.gov/Recreation-Events/Parks/Tennis/Bob-OConnor-Classic",
    "https://www.pittsburghpa.gov/Recreation-Events/Parks/Tennis/Paul-G.-Sullivan-Clay-Championship",
    "https://www.pittsburghpa.gov/Recreation-Events/Parks/Tennis/Frick-Park-Red-Clay-Jr.-Open",
    "https://www.pittsburghpa.gov/Recreation-Events/Parks/Tennis/Community-Tennis-Associations",
    "https://www.pittsburghpa.gov/Recreation-Events/Parks/Tennis/Tennis-Clinics",
    "https://www.pittsburghpa.gov/Recreation-Events/Parks/Tennis/Tennis-Permit-Rules",
    "https://www.pittsburghpa.gov/Recreation-Events/Parks/Tennis/Pickleball",
    "https://www.pittsburghpa.gov/Recreation-Events/Parks/CitiSports",
    "https://www.pittsburghpa.gov/Recreation-Events/Parks/Outdoor-Activities",
    "https://www.pittsburghpa.gov/Recreation-Events/Parks/Outdoor-Activities/Cycling",
    "https://www.pittsburghpa.gov/Recreation-Events/Parks/Outdoor-Activities/Dek-Hockey",
    "https://www.pittsburghpa.gov/Recreation-Events/Parks/Outdoor-Activities/Disc-Golf",
    "https://www.pittsburghpa.gov/Recreation-Events/Parks/Outdoor-Activities/Lawn-Bowling",
    "https://www.pittsburghpa.gov/Recreation-Events/Parks/Outdoor-Activities/Skate-Parks",
    "https://www.pittsburghpa.gov/Recreation-Events/Parks/Outdoor-Activities/Trails",
    "https://www.pittsburghpa.gov/Recreation-Events/Parks/City-Farms",
    "https://www.pittsburghpa.gov/Recreation-Events/Parks/Community-Garden-Plots",
    "https://www.pittsburghpa.gov/Recreation-Events/Parks/Community-Garden-Plots/Community-Garden-Plots-Form",
]

# All pages are written, in list order, to one text file.
output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "Parks.txt")

with open(output_file, 'w', encoding='utf-8') as f:
    # Strip stray whitespace and fetch each distinct URL once (order preserved).
    for url in dict.fromkeys(u.strip() for u in urls):
        # A timeout keeps one unresponsive server from hanging the cell forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        if main_content is None:
            # Make the skipped page visible instead of silently dropping it.
            print(f"Warning: no 'main-content' div at {url}; page skipped")
            continue

        last_heading = None  # raw text of the heading written immediately before

        for tag in main_content.find_all(['h1', 'h2', 'h3', 'p', 'ul', 'li']):
            text = tag.get_text(strip=True)

            if tag.name in ('h1', 'h2', 'h3'):
                # Compare against the raw heading text (not the newline-padded
                # form that gets written), otherwise the check can never match.
                if not text or text == last_heading:
                    continue
                last_heading = text
                f.write(f"\n{text}\n\n")
                continue

            if tag.name == 'ul':
                text = "\n"  # blank line before the list; items come from the <li> tags
            elif tag.name == 'li':
                parent = tag.find_parent('ul')
                nested = parent is not None and parent.find_parent('li') is not None
                text = ('    * ' if nested else '- ') + text

            if text:
                f.write(text + '\n')
                # Content separates headings, so an identical later heading is
                # no longer a consecutive duplicate.
                last_heading = None

print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/Parks.txt


In [59]:


import requests
from bs4 import BeautifulSoup
import os

# Pages of the Pools, Events and Park-Facilities sections to scrape.
urls = [
    "https://www.pittsburghpa.gov/Recreation-Events/Pools",
    "https://www.pittsburghpa.gov/Recreation-Events/Pools/Oliver-Bath-House",
    "https://www.pittsburghpa.gov/Recreation-Events/Pools/Outdoor-Pools",
    "https://www.pittsburghpa.gov/Recreation-Events/Pools/Spray-Parks",
    "https://www.pittsburghpa.gov/Recreation-Events/Pools/Pool-Fees",
    "https://www.pittsburghpa.gov/Recreation-Events/Pools/Rules",
    "https://www.pittsburghpa.gov/Recreation-Events/Pools/Citiparks-Lifeguard",
    "https://www.pittsburghpa.gov/Recreation-Events/Events",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/Farmers-Markets",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/Farmers-Markets/Farmers-Market-Overview",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/Farmers-Markets/Farmers-Market-Vendor-Registration",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/Farmers-Markets/Carrick-Farmers-Market",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/Farmers-Markets/East-End-Farmers-Market",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/Farmers-Markets/Northside-Farmers-Market",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/Farmers-Markets/Squirrel-Hill-Farmers-Market",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/Dollar-Bank-Cinema-In-The-Park",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/Dollar-Bank-Cinema-In-The-Park/Arsenal-Park",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/Dollar-Bank-Cinema-In-The-Park/Banksville-Park",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/Dollar-Bank-Cinema-In-The-Park/Brookline-Park",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/Dollar-Bank-Cinema-In-The-Park/Flagstaff-Hill-in-Schenley-Park",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/Dollar-Bank-Cinema-In-The-Park/Grandview-Park",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/Dollar-Bank-Cinema-In-The-Park/Highland-Park",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/Dollar-Bank-Cinema-In-The-Park/McBride-Park",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/Dollar-Bank-Cinema-In-The-Park/Ormsby-Park",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/Dollar-Bank-Cinema-In-The-Park/Riverview-Park",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/Dollar-Bank-Cinema-In-The-Park/Schenley-Plaza",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/Dollar-Bank-Cinema-In-The-Park/West-End-Elliott-Overlook",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/Roving-Art-Cart",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/Roving-Art-Cart/Roving-Art-Cart-Registration",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/ARTWorks",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/ARTWorks/ARTWorks-Eligibility-Guidelines-Rules",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/ARTWorks/ARTWorks-Registration",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/Concerts",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/Concerts/Bach-Beethoven-and-Brunch",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/Concerts/Jam-At-Grandview",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/Concerts/Reservoir-of-Jazz",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/Concerts/Stars-at-Riverview-Jazz-Series",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/Footraces",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/Footraces/Brookline-Breeze-5k",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/Footraces/Historic-Hill-5K-Run-Walk-Ride",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/Footraces/Run-Around-The-Square",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/Footraces/The-Great-Race",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/Footraces/Junior-Great-Race",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/Footraces/Greenfield-Glide",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/Footraces/Riverview-Park-5K-Run-Fitness-Walk",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/Holidays-At-City-County-Building",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/Pistons-in-the-Park-Car-Cruise",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/Pittsburgh-Cultural-Activities",
    "https://www.pittsburghpa.gov/Recreation-Events/Events/Summer-Soul-Line-Dancing",
    "https://www.pittsburghpa.gov/Recreation-Events/Park-Facilities",
    "https://www.pittsburghpa.gov/Recreation-Events/Park-Facilities/Schenley-Skating-Rink",
    "https://www.pittsburghpa.gov/Recreation-Events/Park-Facilities/Recreation-Centers",
]

# All pages are written, in list order, to one text file.
output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "Pools_Events_park_facilities.txt")

with open(output_file, 'w', encoding='utf-8') as f:
    # Strip stray whitespace and fetch each distinct URL once (order preserved).
    for url in dict.fromkeys(u.strip() for u in urls):
        # A timeout keeps one unresponsive server from hanging the cell forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        if main_content is None:
            # Make the skipped page visible instead of silently dropping it.
            print(f"Warning: no 'main-content' div at {url}; page skipped")
            continue

        last_heading = None  # raw text of the heading written immediately before

        for tag in main_content.find_all(['h1', 'h2', 'h3', 'p', 'ul', 'li']):
            text = tag.get_text(strip=True)

            if tag.name in ('h1', 'h2', 'h3'):
                # Compare against the raw heading text (not the newline-padded
                # form that gets written), otherwise the check can never match.
                if not text or text == last_heading:
                    continue
                last_heading = text
                f.write(f"\n{text}\n\n")
                continue

            if tag.name == 'ul':
                text = "\n"  # blank line before the list; items come from the <li> tags
            elif tag.name == 'li':
                parent = tag.find_parent('ul')
                nested = parent is not None and parent.find_parent('li') is not None
                text = ('    * ' if nested else '- ') + text

            if text:
                f.write(text + '\n')
                # Content separates headings, so an identical later heading is
                # no longer a consecutive duplicate.
                last_heading = None

print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/Pools_Events_park_facilities.txt


In [61]:


import requests
from bs4 import BeautifulSoup
import os

# Pages of the Special-Events and About-Parks sections to scrape.
urls = [
    "https://www.pittsburghpa.gov/Recreation-Events/Special-Events",
    "https://getinvolved.pittsburghpa.gov/pittsburgh-e-arena",
    "https://getinvolved.pittsburghpa.gov/gingerbread-house-competition",
    "https://www.pittsburghpa.gov/Recreation-Events/Special-Events/Independence-Day-Celebration",
    "https://www.pittsburghpa.gov/Recreation-Events/Special-Events/Black-History-Month",
    "https://www.pittsburghpa.gov/Recreation-Events/Special-Events/Community-Festivals",
    "https://www.pittsburghpa.gov/Recreation-Events/Special-Events/Contact-Special-Events",
    "https://www.pittsburghpa.gov/Recreation-Events/Special-Events/Special-Events-News",
    "https://www.pittsburghpa.gov/Recreation-Events/About-Parks",
    "https://www.pittsburghpa.gov/Recreation-Events/About-Parks/Citiparks-Phone-Directory",
    "https://www.pittsburghpa.gov/Recreation-Events/About-Parks/Parks-Rules",
    "https://www.pittsburghpa.gov/Recreation-Events/About-Parks/Park-Rangers",
    "https://www.pittsburghpa.gov/Recreation-Events/About-Parks/Park-Maintenance",
    "https://www.pittsburghpa.gov/Recreation-Events/About-Parks/Parks-FAQ",
]

# All pages are written, in list order, to one text file.
output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files"
os.makedirs(output_dir, exist_ok=True)
# NOTE(review): "Speical" looks like a typo for "Special"; the name is left
# unchanged here in case other code already reads this path — confirm before renaming.
output_file = os.path.join(output_dir, "Speical_Events_About_Parks.txt")

with open(output_file, 'w', encoding='utf-8') as f:
    # Strip stray whitespace and fetch each distinct URL once (order preserved).
    for url in dict.fromkeys(u.strip() for u in urls):
        # A timeout keeps one unresponsive server from hanging the cell forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        if main_content is None:
            # Make the skipped page visible instead of silently dropping it.
            print(f"Warning: no 'main-content' div at {url}; page skipped")
            continue

        last_heading = None  # raw text of the heading written immediately before

        for tag in main_content.find_all(['h1', 'h2', 'h3', 'p', 'ul', 'li']):
            text = tag.get_text(strip=True)

            if tag.name in ('h1', 'h2', 'h3'):
                # Compare against the raw heading text (not the newline-padded
                # form that gets written), otherwise the check can never match.
                if not text or text == last_heading:
                    continue
                last_heading = text
                f.write(f"\n{text}\n\n")
                continue

            if tag.name == 'ul':
                text = "\n"  # blank line before the list; items come from the <li> tags
            elif tag.name == 'li':
                parent = tag.find_parent('ul')
                nested = parent is not None and parent.find_parent('li') is not None
                text = ('    * ' if nested else '- ') + text

            if text:
                f.write(text + '\n')
                # Content separates headings, so an identical later heading is
                # no longer a consecutive duplicate.
                last_heading = None

print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/Speical_Events_About_Parks.txt


In [62]:



import requests
from bs4 import BeautifulSoup
import os

# Pages of the Film & Event Management and Park-Permits sections to scrape.
urls = [
    "https://www.pittsburghpa.gov/Recreation-Events/Film-Event-Management",
    "https://www.pittsburghpa.gov/Recreation-Events/Film-Event-Management/Important-Documents",
    "https://www.pittsburghpa.gov/Recreation-Events/Film-Event-Management/Special-Event-Block-Party-Permitting",
    "https://www.pittsburghpa.gov/Recreation-Events/Film-Event-Management/Film-Permits",
    "https://www.pittsburghpa.gov/Recreation-Events/Film-Event-Management/Special-Events-Committee",
    "https://eproval.pittsburghpa.gov/pages/ros-public-calendar/",
    "https://www.pittsburghpa.gov/Recreation-Events/Park-Permits",
    "https://www.pittsburghpa.gov/Recreation-Events/Park-Permits/Reserve-Park-Shelter",
    "https://www.pittsburghpa.gov/Recreation-Events/Park-Permits/Reserve-Park-Shelter/Frequently-Asked-Questions",
    "https://www.pittsburghpa.gov/Recreation-Events/Park-Permits/Reserve-Park-Shelter/Picnic-Shelter-Regulations-Guidelines",
    "https://www.pittsburghpa.gov/Recreation-Events/Park-Permits/Sports-Field-Permit",
]

# All pages are written, in list order, to one text file.
output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "Films_Event_Management_Park_Permit.txt")

with open(output_file, 'w', encoding='utf-8') as f:
    # Strip stray whitespace and fetch each distinct URL once (order preserved).
    for url in dict.fromkeys(u.strip() for u in urls):
        # A timeout keeps one unresponsive server from hanging the cell forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        if main_content is None:
            # Make the skipped page visible instead of silently dropping it.
            print(f"Warning: no 'main-content' div at {url}; page skipped")
            continue

        last_heading = None  # raw text of the heading written immediately before

        for tag in main_content.find_all(['h1', 'h2', 'h3', 'p', 'ul', 'li']):
            text = tag.get_text(strip=True)

            if tag.name in ('h1', 'h2', 'h3'):
                # Compare against the raw heading text (not the newline-padded
                # form that gets written), otherwise the check can never match.
                if not text or text == last_heading:
                    continue
                last_heading = text
                f.write(f"\n{text}\n\n")
                continue

            if tag.name == 'ul':
                text = "\n"  # blank line before the list; items come from the <li> tags
            elif tag.name == 'li':
                parent = tag.find_parent('ul')
                nested = parent is not None and parent.find_parent('li') is not None
                text = ('    * ' if nested else '- ') + text

            if text:
                f.write(text + '\n')
                # Content separates headings, so an identical later heading is
                # no longer a consecutive duplicate.
                last_heading = None

print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/Films_Event_Management_Park_Permit.txt


In [1]:



import requests
from bs4 import BeautifulSoup
import os

# Pages of the Mayor's office section (plus linked external pages) to scrape.
urls = [
    "https://www.pittsburghpa.gov/City-Government/Mayor/Biography",
    "https://sway.cloud.microsoft/6Pr0g8iCs9pEoT5s?ref=Link&loc=play",
    "https://www.pittsburghpa.gov/City-Government/Mayor/Mayors-Cabinet",
    "https://www.pittsburghpa.gov/City-Government/Mayor/Youth-Civic-Leadership-Academy",
    "https://www.pittsburghpa.gov/City-Government/Mayor/Press-Releases",
    "https://www.pittsburghpa.gov/City-Government/Mayor/Contact",
    "https://www.pittsburghpa.gov/City-Government/Mayor/Letters-of-Support-Forms",
    "https://www.pittsburghpa.gov/City-Government/Mayor/Executive-Orders",
    "https://engage.pittsburghpa.gov/my-brothers-keeper",
    "https://www.pittsburghpa.gov/City-Government/Mayor/City-County-Building",
    "https://www.pittsburghpa.gov/City-Government/Mayor/Key-Focus-Areas",
    "https://www.pittsburghpa.gov/City-Government/Mayor/i-team",
]

# All pages are written, in list order, to one text file.
output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "Mayor.txt")

with open(output_file, 'w', encoding='utf-8') as f:
    # Strip stray whitespace and fetch each distinct URL once (order preserved).
    for url in dict.fromkeys(u.strip() for u in urls):
        # A timeout keeps one unresponsive server from hanging the cell forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        if main_content is None:
            # Make the skipped page visible instead of silently dropping it.
            print(f"Warning: no 'main-content' div at {url}; page skipped")
            continue

        last_heading = None  # raw text of the heading written immediately before

        for tag in main_content.find_all(['h1', 'h2', 'h3', 'p', 'ul', 'li']):
            text = tag.get_text(strip=True)

            if tag.name in ('h1', 'h2', 'h3'):
                # Compare against the raw heading text (not the newline-padded
                # form that gets written), otherwise the check can never match.
                if not text or text == last_heading:
                    continue
                last_heading = text
                f.write(f"\n{text}\n\n")
                continue

            if tag.name == 'ul':
                text = "\n"  # blank line before the list; items come from the <li> tags
            elif tag.name == 'li':
                parent = tag.find_parent('ul')
                nested = parent is not None and parent.find_parent('li') is not None
                text = ('    * ' if nested else '- ') + text

            if text:
                f.write(text + '\n')
                # Content separates headings, so an identical later heading is
                # no longer a consecutive duplicate.
                last_heading = None

print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/Mayor.txt


In [1]:



import requests
from bs4 import BeautifulSoup
import os

# Pages of the City Controller's Office section (plus linked external pages) to scrape.
urls = [
    "https://www.pittsburghpa.gov/City-Government/City-Controllers-Office",
    "https://www.pittsburghpa.gov/City-Government/City-Controllers-Office/About-the-Controller",
    "https://www.pittsburghpa.gov/City-Government/City-Controllers-Office/About-the-Controller/Core-Services",
    "https://www.pittsburghpa.gov/City-Government/City-Controllers-Office/About-the-Controller/Controller-Rachael-Heisler-Biography",
    "https://www.pittsburghpa.gov/City-Government/City-Controllers-Office/About-the-Controller/Contact-the-Controllers-Office",
    "https://fiscalfocus.pittsburghpa.gov/",
    "https://budgetexplorer.pittsburghpa.gov/#!/year/default",
    "https://checkbookpgh.pittsburghpa.gov/#!/year/2025/",
    "https://fiscalfocus.pittsburghpa.gov/stories/s/American-Rescue-Plan-Monitoring-Page/hn9e-7899",
    "https://www.pittsburghpa.gov/City-Government/City-Controllers-Office/Audits-Reports",
    "https://www.pittsburghpa.gov/City-Government/City-Controllers-Office/Audits-Reports/Fiscal-Audits",
    "https://www.pittsburghpa.gov/City-Government/City-Controllers-Office/Audits-Reports/Performance-Audits",
    "https://www.pittsburghpa.gov/City-Government/City-Controllers-Office/Audits-Reports/Annual-Comprehensive-Financial-Report",
    "https://www.pittsburghpa.gov/City-Government/City-Controllers-Office/Audits-Reports/Expenditure-Report",
    "https://www.pittsburghpa.gov/City-Government/City-Controllers-Office/Audits-Reports/Revenue-Report",
    "https://www.pittsburghpa.gov/City-Government/City-Controllers-Office/Audits-Reports/Popular-Annual-Financial-Reports",
    "https://www.pittsburghpa.gov/City-Government/City-Controllers-Office/Audits-Reports/Disclosure-of-Interest-Reports",
    "https://www.pittsburghpa.gov/City-Government/City-Controllers-Office/Audits-Reports/Special-Reports",
    "https://www.pittsburghpa.gov/City-Government/City-Controllers-Office/Pittsburgh-Watchdog",
    "https://www.pittsburghpa.gov/City-Government/City-Controllers-Office/Pittsburgh-Watchdog/Watchdog-PGH-Privacy-Policy",
    "https://www.pittsburghpa.gov/City-Government/City-Controllers-Office/Unclaimed-Property",
    "https://www.pittsburghpa.gov/City-Government/Finances-Budget/Campaign-Finances",
]

# All pages are written, in list order, to one text file.
output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "City_Controller.txt")

with open(output_file, 'w', encoding='utf-8') as f:
    # Strip stray whitespace and fetch each distinct URL once (order preserved).
    for url in dict.fromkeys(u.strip() for u in urls):
        # A timeout keeps one unresponsive server from hanging the cell forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        if main_content is None:
            # Make the skipped page visible instead of silently dropping it.
            print(f"Warning: no 'main-content' div at {url}; page skipped")
            continue

        last_heading = None  # raw text of the heading written immediately before

        for tag in main_content.find_all(['h1', 'h2', 'h3', 'p', 'ul', 'li']):
            text = tag.get_text(strip=True)

            if tag.name in ('h1', 'h2', 'h3'):
                # Compare against the raw heading text (not the newline-padded
                # form that gets written), otherwise the check can never match.
                if not text or text == last_heading:
                    continue
                last_heading = text
                f.write(f"\n{text}\n\n")
                continue

            if tag.name == 'ul':
                text = "\n"  # blank line before the list; items come from the <li> tags
            elif tag.name == 'li':
                parent = tag.find_parent('ul')
                nested = parent is not None and parent.find_parent('li') is not None
                text = ('    * ' if nested else '- ') + text

            if text:
                f.write(text + '\n')
                # Content separates headings, so an identical later heading is
                # no longer a consecutive duplicate.
                last_heading = None

print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/City_Controller.txt


In [4]:



import requests
from bs4 import BeautifulSoup
import os

# Pages of the City Council section to scrape.
urls = [
    "https://www.pittsburghpa.gov/City-Government/City-Council",
    "https://www.pittsburghpa.gov/City-Government/City-Council/Districts",
    "https://www.pittsburghpa.gov/City-Government/City-Council/Clerks-Office",
    "https://www.pittsburghpa.gov/City-Government/City-Council/Clerks-Office/What-is-a-City-Clerk",
    "https://www.pittsburghpa.gov/City-Government/City-Council/Clerks-Office/Kimberly-D.-Clark-Baskin",
    "https://www.pittsburghpa.gov/City-Government/City-Council/Clerks-Office/City-Clerk-Functions",
    "https://www.pittsburghpa.gov/City-Government/City-Council/Clerks-Office/Council-Meeting-Schedule",
    "https://www.pittsburghpa.gov/City-Government/City-Council/Clerks-Office/Legislative-Information-Center",
    "https://www.pittsburghpa.gov/City-Government/City-Council/Clerks-Office/Records-Management-Division",
    "https://www.pittsburghpa.gov/City-Government/City-Council/Clerks-Office/Home-Rule-Charter",
    "https://www.pittsburghpa.gov/City-Government/City-Council/Clerks-Office/Procedures",
    "https://www.pittsburghpa.gov/City-Government/City-Council/Clerks-Office/Business-Processes",
    "https://www.pittsburghpa.gov/City-Government/City-Council/Clerks-Office/City-Clerk-Contacts",
    "https://www.pittsburghpa.gov/City-Government/City-Council/Clerks-Office/Clerks-Press-Releases",
    "https://www.pittsburghpa.gov/City-Government/City-Council/Legislative-Information",
    "https://www.pittsburghpa.gov/City-Government/City-Council/Legislative-Information/Council-Re-Apportionment-Committee",
    "https://www.pittsburghpa.gov/City-Government/City-Council/Legislative-Information/Standing-Committees",
    # Same URL as the Legislative-Information-Center entry above; the loop
    # below de-duplicates, so the page is fetched and written only once.
    "https://www.pittsburghpa.gov/City-Government/City-Council/Clerks-Office/Legislative-Information-Center",
    "https://www.pittsburghpa.gov/City-Government/City-Council/Council-Budget-Office",
    "https://www.pittsburghpa.gov/City-Government/City-Council/Council-Contacts",
]

# All pages are written, in list order, to one text file.
output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "City_Council.txt")

with open(output_file, 'w', encoding='utf-8') as f:
    # Strip stray whitespace and fetch each distinct URL once (order preserved).
    for url in dict.fromkeys(u.strip() for u in urls):
        # A timeout keeps one unresponsive server from hanging the cell forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        if main_content is None:
            # Make the skipped page visible instead of silently dropping it.
            print(f"Warning: no 'main-content' div at {url}; page skipped")
            continue

        last_heading = None  # raw text of the heading written immediately before

        for tag in main_content.find_all(['h1', 'h2', 'h3', 'p', 'ul', 'li']):
            text = tag.get_text(strip=True)

            if tag.name in ('h1', 'h2', 'h3'):
                # Compare against the raw heading text (not the newline-padded
                # form that gets written), otherwise the check can never match.
                if not text or text == last_heading:
                    continue
                last_heading = text
                f.write(f"\n{text}\n\n")
                continue

            if tag.name == 'ul':
                text = "\n"  # blank line before the list; items come from the <li> tags
            elif tag.name == 'li':
                parent = tag.find_parent('ul')
                nested = parent is not None and parent.find_parent('li') is not None
                text = ('    * ' if nested else '- ') + text

            if text:
                f.write(text + '\n')
                # Content separates headings, so an identical later heading is
                # no longer a consecutive duplicate.
                last_heading = None

print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/City_Council.txt


In [6]:



import requests
from bs4 import BeautifulSoup
import os

# Pages scraped for the "Boards, Authorities & Commissions" section.
urls = [
    "https://www.pittsburghpa.gov/City-Government/Boards-Authorities-Commissions",
    "https://pittsburgh.granicus.com/boards/w/0fcfe299bccf70d2",
    "https://www.pittsburghpa.gov/City-Government/Boards-Authorities-Commissions/List-of-Boards-Authorities-Commissions",
    "https://www.pittsburghpa.gov/City-Government/Boards-Authorities-Commissions/List-of-Boards-Authorities-Commissions/Allegheny-County-Sanitary-Authority-ALCOSAN",
    "https://www.pittsburghpa.gov/City-Government/Boards-Authorities-Commissions/List-of-Boards-Authorities-Commissions/Allegheny-Regional-Asset-District-ARAD",
    "https://www.pittsburghpa.gov/City-Government/Boards-Authorities-Commissions/List-of-Boards-Authorities-Commissions/Board-of-Appeals",
    "https://www.pittsburghpa.gov/City-Government/Boards-Authorities-Commissions/List-of-Boards-Authorities-Commissions/Civil-Service-Commission",
    "https://www.pittsburghpa.gov/City-Government/Boards-Authorities-Commissions/List-of-Boards-Authorities-Commissions/Clean-Pittsburgh-Commission",
    "https://www.pittsburghpa.gov/City-Government/Boards-Authorities-Commissions/List-of-Boards-Authorities-Commissions/Commission-on-Human-Relations",
    "https://www.pittsburghpa.gov/City-Government/Boards-Authorities-Commissions/List-of-Boards-Authorities-Commissions/Comprehensive-Municipal-Pension-Trust-Fund",
    # Ethics Hearing Board and its sub-pages
    "https://www.pittsburghpa.gov/City-Government/Boards-Authorities-Commissions/List-of-Boards-Authorities-Commissions/Ethics-Hearing-Board",
    "https://www.pittsburghpa.gov/City-Government/Boards-Authorities-Commissions/List-of-Boards-Authorities-Commissions/Ethics-Hearing-Board/Advice-and-Enforcement",
    "https://www.pittsburghpa.gov/City-Government/Boards-Authorities-Commissions/List-of-Boards-Authorities-Commissions/Ethics-Hearing-Board/Board-Members-and-Meetings",
    "https://www.pittsburghpa.gov/City-Government/Boards-Authorities-Commissions/List-of-Boards-Authorities-Commissions/Ethics-Hearing-Board/Campaign-Finance",
    "https://www.pittsburghpa.gov/City-Government/Boards-Authorities-Commissions/List-of-Boards-Authorities-Commissions/Ethics-Hearing-Board/Disclosures",
    "https://www.pittsburghpa.gov/City-Government/Boards-Authorities-Commissions/List-of-Boards-Authorities-Commissions/Ethics-Hearing-Board/Education",
    "https://www.pittsburghpa.gov/City-Government/Boards-Authorities-Commissions/List-of-Boards-Authorities-Commissions/Equipment-Leasing-Authority",
    "https://www.pittsburghpa.gov/City-Government/Boards-Authorities-Commissions/List-of-Boards-Authorities-Commissions/Equal-Opportunity-Review-Commission-EORC",
    "https://www.pittsburghpa.gov/City-Government/Boards-Authorities-Commissions/List-of-Boards-Authorities-Commissions/Gender-Equity-Commission",
    "https://www.pittsburghpa.gov/City-Government/Boards-Authorities-Commissions/List-of-Boards-Authorities-Commissions/HIV-Commission",
    "https://www.pittsburghpa.gov/City-Government/Boards-Authorities-Commissions/List-of-Boards-Authorities-Commissions/Housing-Authority-City-of-Pittsburgh",
    "https://www.pittsburghpa.gov/City-Government/Boards-Authorities-Commissions/List-of-Boards-Authorities-Commissions/Independent-Citizen-Police-Review-Board",
    "https://www.pittsburghpa.gov/City-Government/Boards-Authorities-Commissions/List-of-Boards-Authorities-Commissions/LGBTQIA-Commission",
    "https://www.pittsburghpa.gov/City-Government/Boards-Authorities-Commissions/List-of-Boards-Authorities-Commissions/Municipal-Pension-Fund",
    "https://www.pittsburghpa.gov/City-Government/Boards-Authorities-Commissions/List-of-Boards-Authorities-Commissions/Other-Post-Employment-Benefits-OPEB-Trust-Fund",
    "https://www.pittsburghpa.gov/City-Government/Boards-Authorities-Commissions/List-of-Boards-Authorities-Commissions/Pittsburgh-Land-Bank",
    "https://www.pittsburghpa.gov/City-Government/Boards-Authorities-Commissions/List-of-Boards-Authorities-Commissions/Records-Management-Advisory-Commission",
    "https://www.pittsburghpa.gov/City-Government/Boards-Authorities-Commissions/List-of-Boards-Authorities-Commissions/Shade-Tree-Commission",
    "https://www.pittsburghpa.gov/City-Government/Boards-Authorities-Commissions/List-of-Boards-Authorities-Commissions/Southwestern-Pennsylvania-Commission",
    "https://www.pittsburghpa.gov/City-Government/Boards-Authorities-Commissions/List-of-Boards-Authorities-Commissions/Sports-and-Exhibition-Authority",
    "https://www.pittsburghpa.gov/City-Government/Boards-Authorities-Commissions/List-of-Boards-Authorities-Commissions/Stadium-Authority",
    "https://www.pittsburghpa.gov/City-Government/Boards-Authorities-Commissions/List-of-Boards-Authorities-Commissions/Urban-Redevelopment-Authority",
]

output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "Board_Authorities.txt")

# Scrape every page in `urls` and write its headings, paragraphs and list
# items, in document order, to a single UTF-8 text file.
heading_tags = ('h1', 'h2', 'h3')
tags_to_scrape = ['h1', 'h2', 'h3', 'p', 'ul', 'li']

with open(output_file, 'w', encoding='utf-8') as f:
    for url in urls:
        # A timeout keeps one unresponsive page from hanging the whole cell.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # any HTTP error aborts the run
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        # Pages without a #main-content container contribute nothing.
        if not main_content:
            continue

        prev_heading = ""  # stripped text of the last heading written for this page

        for tag in main_content.find_all(tags_to_scrape):
            text = tag.get_text(strip=True)

            if tag.name in heading_tags:
                # Track the *stripped* heading text: comparing against the
                # newline-wrapped string would never match, so repeated
                # headings would slip through. Empty headings are dropped too.
                if not text or text == prev_heading:
                    continue
                prev_heading = text
                text = f"\n{text}\n"
            elif tag.name == 'ul':
                # Only a blank line is emitted for the list itself; its <li>
                # descendants are returned by find_all as separate tags.
                text = "\n"
            elif tag.name == 'li':
                parent = tag.find_parent('ul')
                if parent and parent.find_parent('li'):
                    text = '    * ' + text  # Sub-bullet
                else:
                    text = '- ' + text  # Main bullet

            if text:
                f.write(text + '\n')

# Report success only after the file has been closed.
print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/Board_Authorities.txt


In [7]:
 




import requests
from bs4 import BeautifulSoup
import os

# Pages scraped for the "Legal Services" section.
urls = [
    # Department of Law
    "https://www.pittsburghpa.gov/City-Government/Legal-Services/Department-of-Law/Core-Services",
    "https://www.pittsburghpa.gov/City-Government/Legal-Services/Department-of-Law/Filing-a-Claim-with-Law",
    "https://www.pittsburghpa.gov/City-Government/Legal-Services/Department-of-Law/Right-to-Know",
    "https://www.pittsburghpa.gov/City-Government/Legal-Services/Department-of-Law/Acceptance-of-Service-of-Process",
    # Office of Equal Protection (plus the Disruptive Properties page)
    "https://www.pittsburghpa.gov/City-Government/Legal-Services/Office-of-Equal-Protection",
    "https://www.pittsburghpa.gov/City-Government/Legal-Services/Office-of-Equal-Protection/Paid-Sick-Days-Act",
    "https://www.pittsburghpa.gov/Safety/Public-Safety/Disruptive-Properties",
    "https://www.pittsburghpa.gov/City-Government/Legal-Services/Office-of-Equal-Protection/Prevailing-Wage-Ordinance",
    "https://www.pittsburghpa.gov/City-Government/Legal-Services/Office-of-Equal-Protection/Ban-the-Box",
    # Office of Municipal Investigations
    "https://www.pittsburghpa.gov/City-Government/Legal-Services/Office-of-Municipal-Investigations",
    "https://www.pittsburghpa.gov/City-Government/Legal-Services/Office-of-Municipal-Investigations/About-OMI",
    "https://www.pittsburghpa.gov/City-Government/Legal-Services/Office-of-Municipal-Investigations/Employee-Rights",
    "https://www.pittsburghpa.gov/City-Government/Legal-Services/Office-of-Municipal-Investigations/Function-Services-Goals",
    "https://www.pittsburghpa.gov/City-Government/Legal-Services/Office-of-Municipal-Investigations/Filing-a-Complaint",
    "https://www.pittsburghpa.gov/City-Government/Legal-Services/Office-of-Municipal-Investigations/Contact",
]

output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "Legal_Services.txt")

# Scrape every page in `urls` and write its headings, paragraphs and list
# items, in document order, to a single UTF-8 text file.
heading_tags = ('h1', 'h2', 'h3')
tags_to_scrape = ['h1', 'h2', 'h3', 'p', 'ul', 'li']

with open(output_file, 'w', encoding='utf-8') as f:
    for url in urls:
        # A timeout keeps one unresponsive page from hanging the whole cell.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # any HTTP error aborts the run
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        # Pages without a #main-content container contribute nothing.
        if not main_content:
            continue

        prev_heading = ""  # stripped text of the last heading written for this page

        for tag in main_content.find_all(tags_to_scrape):
            text = tag.get_text(strip=True)

            if tag.name in heading_tags:
                # Track the *stripped* heading text: comparing against the
                # newline-wrapped string would never match, so repeated
                # headings would slip through. Empty headings are dropped too.
                if not text or text == prev_heading:
                    continue
                prev_heading = text
                text = f"\n{text}\n"
            elif tag.name == 'ul':
                # Only a blank line is emitted for the list itself; its <li>
                # descendants are returned by find_all as separate tags.
                text = "\n"
            elif tag.name == 'li':
                parent = tag.find_parent('ul')
                if parent and parent.find_parent('li'):
                    text = '    * ' + text  # Sub-bullet
                else:
                    text = '- ' + text  # Main bullet

            if text:
                f.write(text + '\n')

# Report success only after the file has been closed.
print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/Legal_Services.txt


In [8]:
import requests
from bs4 import BeautifulSoup
import os

# Pages scraped for the "Jobs" section.
urls = [
    "https://www.governmentjobs.com/careers/pittsburgh",
    "https://www.pittsburghpa.gov/City-Government/Jobs/Human-Resources-and-Civil-Service",
    "https://www.pittsburghpa.gov/City-Government/Jobs/Human-Resources-and-Civil-Service/Civil-Service-Commission",
    "https://www.pittsburghpa.gov/City-Government/Jobs/Human-Resources-and-Civil-Service/Pittsburgh-Partnership",
    "https://www.pittsburghpa.gov/City-Government/Jobs/Human-Resources-and-Civil-Service/Neighborhood-Employment-Centers-NEC%E2%80%99s",
    "https://www.pittsburghpa.gov/City-Government/Jobs/Human-Resources-and-Civil-Service/Learn-Earn",
    "https://www.pittsburghpa.gov/City-Government/Jobs/Human-Resources-and-Civil-Service/Classification-Compensation-Study",
]

output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "Jobs.txt")

# Scrape every page in `urls` and write its headings, paragraphs and list
# items, in document order, to a single UTF-8 text file.
heading_tags = ('h1', 'h2', 'h3')
tags_to_scrape = ['h1', 'h2', 'h3', 'p', 'ul', 'li']

with open(output_file, 'w', encoding='utf-8') as f:
    for url in urls:
        # A timeout keeps one unresponsive page from hanging the whole cell.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # any HTTP error aborts the run
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        # Pages without a #main-content container contribute nothing.
        if not main_content:
            continue

        prev_heading = ""  # stripped text of the last heading written for this page

        for tag in main_content.find_all(tags_to_scrape):
            text = tag.get_text(strip=True)

            if tag.name in heading_tags:
                # Track the *stripped* heading text: comparing against the
                # newline-wrapped string would never match, so repeated
                # headings would slip through. Empty headings are dropped too.
                if not text or text == prev_heading:
                    continue
                prev_heading = text
                text = f"\n{text}\n"
            elif tag.name == 'ul':
                # Only a blank line is emitted for the list itself; its <li>
                # descendants are returned by find_all as separate tags.
                text = "\n"
            elif tag.name == 'li':
                parent = tag.find_parent('ul')
                if parent and parent.find_parent('li'):
                    text = '    * ' + text  # Sub-bullet
                else:
                    text = '- ' + text  # Main bullet

            if text:
                f.write(text + '\n')

# Report success only after the file has been closed.
print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/Jobs.txt


In [9]:
import requests
from bs4 import BeautifulSoup
import os

# Pages scraped for the "Finances & Budget" section.
urls = [
    # Taxes
    "https://www.pittsburghpa.gov/City-Government/Finances-Budget/Taxes",
    "https://www.pittsburghpa.gov/City-Government/Finances-Budget/Taxes/Real-Estate-Tax-Certification-Letter",
    "https://www.pittsburghpa.gov/City-Government/Finances-Budget/Taxes/Business-Tax-Compliance-Letter",
    "https://www.pittsburghpa.gov/City-Government/Finances-Budget/Taxes/Property-Tax-Worksheet",
    "https://www.pittsburghpa.gov/City-Government/Finances-Budget/Taxes/Tax-Forms",
    "https://www.pittsburghpa.gov/City-Government/Finances-Budget/Taxes/Real-Estate-Taxes",
    "https://www.pittsburghpa.gov/City-Government/Finances-Budget/Taxes/Tax-FAQs",
    "https://www.pittsburghpa.gov/City-Government/Finances-Budget/New-Business-Registration",
    # Property Sales
    "https://www.pittsburghpa.gov/City-Government/Finances-Budget/Property-Sales",
    "https://www.pittsburghpa.gov/City-Government/Finances-Budget/Property-Sales/Finance-Permits-Licenses",
    "https://www.pittsburghpa.gov/City-Government/Finances-Budget/Property-Sales/Debt-Obligations",
    "https://www.pittsburghpa.gov/City-Government/Finances-Budget/Property-Sales/Contact-Finance",
    "https://www.pittsburghpa.gov/City-Government/Finances-Budget/Property-Sales/About-Finance",
    "https://www.pittsburghpa.gov/City-Government/Finances-Budget/Property-Sales/Waiver-and-Refund-Request",
    "https://www.pittsburghpa.gov/City-Government/Finances-Budget/Finance-Fee-Schedule",
    # Management & Budget
    "https://www.pittsburghpa.gov/City-Government/Finances-Budget/Management-Budget",
    "https://www.pittsburghpa.gov/City-Government/Finances-Budget/Management-Budget/About-Office-of-Management-and-Budget",
    "https://www.pittsburghpa.gov/City-Government/Finances-Budget/Management-Budget/Budgets-and-Reports",
    "https://www.pittsburghpa.gov/City-Government/Finances-Budget/Management-Budget/Budget-Engagement",
    "https://www.pittsburghpa.gov/City-Government/Finances-Budget/Management-Budget/Grants-Office",
    "https://www.pittsburghpa.gov/City-Government/Finances-Budget/Management-Budget/Fleet-Operations",
    "https://www.pittsburghpa.gov/City-Government/Finances-Budget/Management-Budget/OMB-Contacts",
    "https://www.pittsburghpa.gov/City-Government/Finances-Budget/Management-Budget/Community-Development",
]

output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "Finances_&_Budget.txt")

# Scrape every page in `urls` and write its headings, paragraphs and list
# items, in document order, to a single UTF-8 text file.
heading_tags = ('h1', 'h2', 'h3')
tags_to_scrape = ['h1', 'h2', 'h3', 'p', 'ul', 'li']

with open(output_file, 'w', encoding='utf-8') as f:
    for url in urls:
        # A timeout keeps one unresponsive page from hanging the whole cell.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # any HTTP error aborts the run
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        # Pages without a #main-content container contribute nothing.
        if not main_content:
            continue

        prev_heading = ""  # stripped text of the last heading written for this page

        for tag in main_content.find_all(tags_to_scrape):
            text = tag.get_text(strip=True)

            if tag.name in heading_tags:
                # Track the *stripped* heading text: comparing against the
                # newline-wrapped string would never match, so repeated
                # headings would slip through. Empty headings are dropped too.
                if not text or text == prev_heading:
                    continue
                prev_heading = text
                text = f"\n{text}\n"
            elif tag.name == 'ul':
                # Only a blank line is emitted for the list itself; its <li>
                # descendants are returned by find_all as separate tags.
                text = "\n"
            elif tag.name == 'li':
                parent = tag.find_parent('ul')
                if parent and parent.find_parent('li'):
                    text = '    * ' + text  # Sub-bullet
                else:
                    text = '- ' + text  # Main bullet

            if text:
                f.write(text + '\n')

# Report success only after the file has been closed.
print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/Finances_&_Budget.txt


In [10]:
import requests
from bs4 import BeautifulSoup
import os

# Pages scraped for the "Contacts & Information" section.
urls = [
    # City Directory
    "https://www.pittsburghpa.gov/City-Government/Contacts-Information/City-Directory",
    "https://www.pittsburghpa.gov/City-Government/Contacts-Information/City-Directory/Mayors-Office",
    "https://www.pittsburghpa.gov/City-Government/Contacts-Information/City-Directory/Authorities",
    "https://www.pittsburghpa.gov/City-Government/Contacts-Information/City-Directory/City-Department-A-Z",
    "https://www.pittsburghpa.gov/City-Government/Contacts-Information/City-Directory/EMS",
    "https://www.pittsburghpa.gov/City-Government/Contacts-Information/City-Directory/Fire",
    "https://www.pittsburghpa.gov/City-Government/Contacts-Information/City-Directory/Police",
    "https://www.pittsburghpa.gov/City-Government/Contacts-Information/City-Directory/Senior-Community-Centers",
    "https://www.pittsburghpa.gov/City-Government/Contacts-Information/City-Directory/Swimming-Pools",
    "https://www.pittsburghpa.gov/City-Government/Contacts-Information/City-Press-Release-Archive",
    # City Channel Pittsburgh
    "https://www.pittsburghpa.gov/City-Government/Contacts-Information/City-Channel-Pittsburgh",
    "https://www.pittsburghpa.gov/City-Government/Contacts-Information/City-Channel-Pittsburgh/Schedule",
    "https://www.pittsburghpa.gov/City-Government/Contacts-Information/City-Channel-Pittsburgh/City-Channel-FAQs",
    "https://www.pittsburghpa.gov/City-Government/Contacts-Information/City-Channel-Pittsburgh/City-Channel-Franchise",
    "https://www.pittsburghpa.gov/City-Government/Contacts-Information/City-Channel-Pittsburgh/City-Channel-Pittsburgh-YouTube-Highlights",
]

output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "Contacts&Information.txt")

# Scrape every page in `urls` and write its headings, paragraphs and list
# items, in document order, to a single UTF-8 text file.
heading_tags = ('h1', 'h2', 'h3')
tags_to_scrape = ['h1', 'h2', 'h3', 'p', 'ul', 'li']

with open(output_file, 'w', encoding='utf-8') as f:
    for url in urls:
        # A timeout keeps one unresponsive page from hanging the whole cell.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # any HTTP error aborts the run
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        # Pages without a #main-content container contribute nothing.
        if not main_content:
            continue

        prev_heading = ""  # stripped text of the last heading written for this page

        for tag in main_content.find_all(tags_to_scrape):
            text = tag.get_text(strip=True)

            if tag.name in heading_tags:
                # Track the *stripped* heading text: comparing against the
                # newline-wrapped string would never match, so repeated
                # headings would slip through. Empty headings are dropped too.
                if not text or text == prev_heading:
                    continue
                prev_heading = text
                text = f"\n{text}\n"
            elif tag.name == 'ul':
                # Only a blank line is emitted for the list itself; its <li>
                # descendants are returned by find_all as separate tags.
                text = "\n"
            elif tag.name == 'li':
                parent = tag.find_parent('ul')
                if parent and parent.find_parent('li'):
                    text = '    * ' + text  # Sub-bullet
                else:
                    text = '- ' + text  # Main bullet

            if text:
                f.write(text + '\n')

# Report success only after the file has been closed.
print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/Contacts&Information.txt


In [12]:
import requests
from bs4 import BeautifulSoup
import os

# Pages scraped for the "Safety / Police" section.
urls = [
    "https://www.pittsburghpa.gov/Safety/Police",
    # Police zones
    "https://www.pittsburghpa.gov/Safety/Police/Police-Zones",
    "https://www.pittsburghpa.gov/Safety/Police/Police-Zones/Headquarters",
    "https://www.pittsburghpa.gov/Safety/Police/Police-Zones/Police-Zone-1",
    "https://www.pittsburghpa.gov/Safety/Police/Police-Zones/Police-Zone-2",
    "https://www.pittsburghpa.gov/Safety/Police/Police-Zones/Police-Zone-3",
    "https://www.pittsburghpa.gov/Safety/Police/Police-Zones/Police-Zone-4",
    "https://www.pittsburghpa.gov/Safety/Police/Police-Zones/Police-Zone-5",
    "https://www.pittsburghpa.gov/Safety/Police/Police-Zones/Police-Zone-6",
    # Recruitment
    "https://www.pittsburghpa.gov/Safety/Police/Police-Officer-Recruitment",
    "https://www.pittsburghpa.gov/Safety/Police/Police-Officer-Recruitment/Why-Serve",
    "https://www.pittsburghpa.gov/Safety/Police/Police-Officer-Recruitment/Pittsburgh",
    "https://www.pittsburghpa.gov/Safety/Police/Police-Officer-Recruitment/About-PBP",
    "https://www.pittsburghpa.gov/Safety/Police/Police-Officer-Recruitment/Mentors",
    "https://www.pittsburghpa.gov/Safety/Police/Police-Officer-Recruitment/Salary-Benefits",
    "https://www.pittsburghpa.gov/Safety/Police/Police-Officer-Recruitment/Hiring-Process",
    "https://www.pittsburghpa.gov/Safety/Police/Police-Officer-Recruitment/Specialty-Units",
    "https://www.pittsburghpa.gov/Safety/Police/Police-Officer-Recruitment/FAQs",
    # Reports, tips and data
    "https://www.pittsburghpa.gov/Safety/Police/File-a-Police-Report",
    "https://www.pittsburghpa.gov/Safety/Police/Submit-a-Tip",
    "https://www.pittsburghpa.gov/Safety/Police/Police-Data-Portal",
    "https://www.pittsburghpa.gov/Safety/Police/Police-Data-Portal/Pittsburgh-Overdose-Dashboard",
    "https://www.pittsburghpa.gov/Safety/Police/Police-Data-Portal/Response-to-Community-Task-Force-on-Police-Reform",
    # Branches
    "https://www.pittsburghpa.gov/Safety/Police/Branches",
    "https://www.pittsburghpa.gov/Safety/Police/Branches/Operations-Branch",
    "https://www.pittsburghpa.gov/Safety/Police/Branches/Investigations-Branch",
    "https://www.pittsburghpa.gov/Safety/Police/Branches/Administration-Branch",
    "https://www.pittsburghpa.gov/Safety/Police/Branches/Special-Deployment-Division-SDD",
    # About
    "https://www.pittsburghpa.gov/Safety/Police/About-Police",
    "https://www.pittsburghpa.gov/Safety/Police/About-Police/Mission-Values",
    "https://www.pittsburghpa.gov/Safety/Police/About-Police/History-of-the-Badge",
    "https://www.pittsburghpa.gov/Safety/Police/About-Police/Policies-and-Procedural-Manual",
    "https://www.pittsburghpa.gov/Safety/Police/About-Police/Fallen-Officer",
    "https://www.pittsburghpa.gov/Safety/Police/About-Police/Cops-Kids-Summer-Camp-Program",
]

output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files/Safety.txt/Police"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "Police.txt")

# Scrape every page in `urls` and write its headings, paragraphs and list
# items, in document order, to a single UTF-8 text file.
heading_tags = ('h1', 'h2', 'h3')
tags_to_scrape = ['h1', 'h2', 'h3', 'p', 'ul', 'li']

with open(output_file, 'w', encoding='utf-8') as f:
    for url in urls:
        # A timeout keeps one unresponsive page from hanging the whole cell.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # any HTTP error aborts the run
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        # Pages without a #main-content container contribute nothing.
        if not main_content:
            continue

        prev_heading = ""  # stripped text of the last heading written for this page

        for tag in main_content.find_all(tags_to_scrape):
            text = tag.get_text(strip=True)

            if tag.name in heading_tags:
                # Track the *stripped* heading text: comparing against the
                # newline-wrapped string would never match, so repeated
                # headings would slip through. Empty headings are dropped too.
                if not text or text == prev_heading:
                    continue
                prev_heading = text
                text = f"\n{text}\n"
            elif tag.name == 'ul':
                # Only a blank line is emitted for the list itself; its <li>
                # descendants are returned by find_all as separate tags.
                text = "\n"
            elif tag.name == 'li':
                parent = tag.find_parent('ul')
                if parent and parent.find_parent('li'):
                    text = '    * ' + text  # Sub-bullet
                else:
                    text = '- ' + text  # Main bullet

            if text:
                f.write(text + '\n')

# Report success only after the file has been closed.
print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/Safety.txt/Police/Police.txt


In [None]:
import requests
from bs4 import BeautifulSoup
import os

# Pages scraped for the "Safety / Fire" section.
urls = [
    "https://www.pittsburghpa.gov/Safety/Fire",
    "https://www.pittsburghpa.gov/Safety/Fire/Girls-Firefighter-Camp",
    "https://www.pittsburghpa.gov/Safety/Fire/Fire-Permit-Information",
    "https://www.pittsburghpa.gov/Safety/Fire/Bureau-of-Fire-Contacts",
    "https://www.pittsburghpa.gov/Safety/Fire/About-Fire",
    "https://www.pittsburghpa.gov/Safety/Fire/About-Fire/Response-Statistics",
    # Firefighter recruitment
    "https://www.pittsburghpa.gov/Safety/Fire/Firefighter-Recruitment/About-PBF",
    "https://www.pittsburghpa.gov/Safety/Fire/Firefighter-Recruitment/Pittsburgh",
    "https://www.pittsburghpa.gov/Safety/Fire/Firefighter-Recruitment/Salary-Benefits",
    "https://www.pittsburghpa.gov/Safety/Fire/Firefighter-Recruitment/Hiring-Process",
    "https://www.pittsburghpa.gov/Safety/Fire/Firefighter-Recruitment/FAQs",
    "https://www.pittsburghpa.gov/Safety/Fire/Firefighter-Recruitment/Firefighter-Development",
    "https://www.pittsburghpa.gov/Safety/Fire/Firefighter-Recruitment/Contact-Us",
    # Fire safety programs
    "https://www.pittsburghpa.gov/Safety/Fire/Fire-Safety-Programs",
    "https://www.pittsburghpa.gov/Safety/Fire/Fire-Safety-Programs/Fire-Safe-Building-Ordinance",
]

# NOTE(review): Fire.txt is written into a directory named "Police" -- this
# looks like a copy-paste leftover; confirm the intended location (and any
# downstream readers of this path) before changing it.
output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files/Safety.txt/Police"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "Fire.txt")

# Scrape every page in `urls` and write its headings, paragraphs and list
# items, in document order, to a single UTF-8 text file.
heading_tags = ('h1', 'h2', 'h3')
tags_to_scrape = ['h1', 'h2', 'h3', 'p', 'ul', 'li']

with open(output_file, 'w', encoding='utf-8') as f:
    for url in urls:
        # A timeout keeps one unresponsive page from hanging the whole cell.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # any HTTP error aborts the run
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        # Pages without a #main-content container contribute nothing.
        if not main_content:
            continue

        prev_heading = ""  # stripped text of the last heading written for this page

        for tag in main_content.find_all(tags_to_scrape):
            text = tag.get_text(strip=True)

            if tag.name in heading_tags:
                # Track the *stripped* heading text: comparing against the
                # newline-wrapped string would never match, so repeated
                # headings would slip through. Empty headings are dropped too.
                if not text or text == prev_heading:
                    continue
                prev_heading = text
                text = f"\n{text}\n"
            elif tag.name == 'ul':
                # Only a blank line is emitted for the list itself; its <li>
                # descendants are returned by find_all as separate tags.
                text = "\n"
            elif tag.name == 'li':
                parent = tag.find_parent('ul')
                if parent and parent.find_parent('li'):
                    text = '    * ' + text  # Sub-bullet
                else:
                    text = '- ' + text  # Main bullet

            if text:
                f.write(text + '\n')

# Report success only after the file has been closed.
print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/Safety.txt/Police/Fire.txt


In [14]:
import requests
from bs4 import BeautifulSoup
import os

# Pages scraped for the "Safety / Emergency Medical Services" section.
urls = [
    # Ambulance Division
    "https://www.pittsburghpa.gov/Safety/Emergency-Medical-Services/Ambulance-Division",
    "https://www.pittsburghpa.gov/Safety/Emergency-Medical-Services/Ambulance-Division/Station-Locations",
    "https://www.pittsburghpa.gov/Safety/Emergency-Medical-Services/Ambulance-Division/What-is-a-Paramedic",
    "https://www.pittsburghpa.gov/Safety/Emergency-Medical-Services/Ambulance-Division/Whats-on-your-Medic-Unit",
    # Special Event Operations
    "https://www.pittsburghpa.gov/Safety/Emergency-Medical-Services/Special-Event-Operations",
    "https://www.pittsburghpa.gov/Safety/Emergency-Medical-Services/Special-Event-Operations/Rescue-Division",
    "https://www.pittsburghpa.gov/Safety/Emergency-Medical-Services/Special-Event-Operations/Special-Operations",
    "https://www.pittsburghpa.gov/Safety/Emergency-Medical-Services/Special-Event-Operations/Special-Programs",
    # Training, contacts and careers
    "https://www.pittsburghpa.gov/Safety/Emergency-Medical-Services/Training-Division",
    "https://www.pittsburghpa.gov/Safety/Emergency-Medical-Services/EMS-Contacts",
    "https://www.pittsburghpa.gov/Safety/Emergency-Medical-Services/Become-a-Paramedic",
    "https://www.pittsburghpa.gov/Safety/Emergency-Medical-Services/Freedom-House-EMT-Training",
]

output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files/Safety.txt"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "Emergency_Medical_Services.txt")

# Scrape every page in `urls` and write its headings, paragraphs and list
# items, in document order, to a single UTF-8 text file.
heading_tags = ('h1', 'h2', 'h3')
tags_to_scrape = ['h1', 'h2', 'h3', 'p', 'ul', 'li']

with open(output_file, 'w', encoding='utf-8') as f:
    for url in urls:
        # A timeout keeps one unresponsive page from hanging the whole cell.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # any HTTP error aborts the run
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        # Pages without a #main-content container contribute nothing.
        if not main_content:
            continue

        prev_heading = ""  # stripped text of the last heading written for this page

        for tag in main_content.find_all(tags_to_scrape):
            text = tag.get_text(strip=True)

            if tag.name in heading_tags:
                # Track the *stripped* heading text: comparing against the
                # newline-wrapped string would never match, so repeated
                # headings would slip through. Empty headings are dropped too.
                if not text or text == prev_heading:
                    continue
                prev_heading = text
                text = f"\n{text}\n"
            elif tag.name == 'ul':
                # Only a blank line is emitted for the list itself; its <li>
                # descendants are returned by find_all as separate tags.
                text = "\n"
            elif tag.name == 'li':
                parent = tag.find_parent('ul')
                if parent and parent.find_parent('li'):
                    text = '    * ' + text  # Sub-bullet
                else:
                    text = '- ' + text  # Main bullet

            if text:
                f.write(text + '\n')

# Report success only after the file has been closed.
print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/Safety.txt/Emergency_Medical_Services.txt


In [15]:
import requests
from bs4 import BeautifulSoup
import os

# Pages scraped for the Office of Community Health and Safety (OCHS).
urls = [
    "https://www.pittsburghpa.gov/Safety/Office-of-Community-Health-and-Safety",
    # Health and safety resources
    "https://www.pittsburghpa.gov/Safety/Office-of-Community-Health-and-Safety/Health-and-Safety-Resources",
    "https://www.pittsburghpa.gov/Safety/Office-of-Community-Health-and-Safety/Health-and-Safety-Resources/Overdose-Prevention",
    "https://www.pittsburghpa.gov/Safety/Office-of-Community-Health-and-Safety/Health-and-Safety-Resources/BigBurgh",
    "https://www.pittsburghpa.gov/Safety/Office-of-Community-Health-and-Safety/Health-and-Safety-Resources/Domestic-Violence-Resource-Guide",
    # Staff and engagement
    "https://www.pittsburghpa.gov/Safety/Office-of-Community-Health-and-Safety/Meet-the-Staff",
    "https://www.pittsburghpa.gov/Safety/Office-of-Community-Health-and-Safety/Join-Our-Team",
    "https://www.pittsburghpa.gov/Safety/Office-of-Community-Health-and-Safety/Community-Engagement",
]

output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files/Safety.txt"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "OCHS.txt")

# Scrape every page in `urls` and write its headings, paragraphs and list
# items, in document order, to a single UTF-8 text file.
heading_tags = ('h1', 'h2', 'h3')
tags_to_scrape = ['h1', 'h2', 'h3', 'p', 'ul', 'li']

with open(output_file, 'w', encoding='utf-8') as f:
    for url in urls:
        # A timeout keeps one unresponsive page from hanging the whole cell.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # any HTTP error aborts the run
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        # Pages without a #main-content container contribute nothing.
        if not main_content:
            continue

        prev_heading = ""  # stripped text of the last heading written for this page

        for tag in main_content.find_all(tags_to_scrape):
            text = tag.get_text(strip=True)

            if tag.name in heading_tags:
                # Track the *stripped* heading text: comparing against the
                # newline-wrapped string would never match, so repeated
                # headings would slip through. Empty headings are dropped too.
                if not text or text == prev_heading:
                    continue
                prev_heading = text
                text = f"\n{text}\n"
            elif tag.name == 'ul':
                # Only a blank line is emitted for the list itself; its <li>
                # descendants are returned by find_all as separate tags.
                text = "\n"
            elif tag.name == 'li':
                parent = tag.find_parent('ul')
                if parent and parent.find_parent('li'):
                    text = '    * ' + text  # Sub-bullet
                else:
                    text = '- ' + text  # Main bullet

            if text:
                f.write(text + '\n')

# Report success only after the file has been closed.
print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/Safety.txt/OCHS.txt


In [16]:
import requests
from bs4 import BeautifulSoup
import os

# Pages under Safety > Alerts that are scraped into Alerts.txt.
urls = [
    "https://www.pittsburghpa.gov/Safety/Alerts/Public-Safety-Blotter",
    "https://www.pittsburghpa.gov/Safety/Alerts/Public-Safety-Press-Releases",
    "https://www.pittsburghpa.gov/Safety/Alerts/COVID-19-Updates",
]

# NOTE: despite its ".txt" suffix, output_dir is created as a directory; the
# scraped text is written to a file inside it.
output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files/Safety.txt"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "Alerts.txt")

# Fetch every page in `urls` and write its headings, paragraphs and list items
# to one UTF-8 text file (the file is overwritten on each run).
with open(output_file, 'w', encoding='utf-8') as f:
    for url in urls:
        # A timeout keeps a single unresponsive page from hanging the cell.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Stop on HTTP 4xx/5xx responses
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        if main_content is None:
            # Report skipped pages instead of dropping them silently.
            print(f"Warning: no 'main-content' div found, skipped {url}")
            continue

        tags_to_scrape = ['h1', 'h2', 'h3', 'p', 'ul', 'li']
        last_heading = None  # Stripped text of the heading written most recently

        for tag in main_content.find_all(tags_to_scrape):
            text = tag.get_text(strip=True)

            if tag.name in ['h1', 'h2', 'h3']:
                # Skip empty headings and consecutive duplicates. The comparison
                # uses the stripped text; the newline-padded form that is
                # written to the file would never match the next heading.
                if not text or text == last_heading:
                    continue
                last_heading = text
                text = f"\n{text}\n"
            elif tag.name == 'ul':
                # Spacing before a list; the items are emitted by their own <li> tags.
                text = "\n"
            elif not text:
                continue  # Nothing to write for an empty <p> or <li>
            else:
                # Real content ends the "consecutive headings" window.
                last_heading = None
                if tag.name == 'li':
                    parent = tag.find_parent('ul')
                    if parent and parent.find_parent('li'):
                        text = '    * ' + text  # Sub-bullet
                    else:
                        text = '- ' + text  # Main bullet

            f.write(text + '\n')

# Printed after the `with` block so the file is already closed and flushed.
print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/Safety.txt/Alerts.txt


In [17]:
import requests
from bs4 import BeautifulSoup
import os

# Pages under Safety > Public Safety that are scraped into Public_Safety.txt.
urls = [
    "https://www.pittsburghpa.gov/Safety/Public-Safety/Emergency-Management-and-Homeland-Security",
    "https://www.pittsburghpa.gov/Safety/Public-Safety/Nighttime-Economy",
    "https://www.pittsburghpa.gov/Safety/Public-Safety/Violence-Prevention",
    "https://www.pittsburghpa.gov/Safety/Public-Safety/Violence-Prevention/Office-of-Community-Services-Violence-Prevention",
    "https://www.pittsburghpa.gov/Safety/Public-Safety/Violence-Prevention/Student-and-Citizens-Police-Academy",
    "https://www.pittsburghpa.gov/Safety/Public-Safety/Violence-Prevention/Safe-Passage",
    "https://www.pittsburghpa.gov/Safety/Public-Safety/Public-Safety-Contacts",
]

# NOTE: despite its ".txt" suffix, output_dir is created as a directory; the
# scraped text is written to a file inside it.
output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files/Safety.txt"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "Public_Safety.txt")

# Fetch every page in `urls` and write its headings, paragraphs and list items
# to one UTF-8 text file (the file is overwritten on each run).
with open(output_file, 'w', encoding='utf-8') as f:
    for url in urls:
        # A timeout keeps a single unresponsive page from hanging the cell.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Stop on HTTP 4xx/5xx responses
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        if main_content is None:
            # Report skipped pages instead of dropping them silently.
            print(f"Warning: no 'main-content' div found, skipped {url}")
            continue

        tags_to_scrape = ['h1', 'h2', 'h3', 'p', 'ul', 'li']
        last_heading = None  # Stripped text of the heading written most recently

        for tag in main_content.find_all(tags_to_scrape):
            text = tag.get_text(strip=True)

            if tag.name in ['h1', 'h2', 'h3']:
                # Skip empty headings and consecutive duplicates. The comparison
                # uses the stripped text; the newline-padded form that is
                # written to the file would never match the next heading.
                if not text or text == last_heading:
                    continue
                last_heading = text
                text = f"\n{text}\n"
            elif tag.name == 'ul':
                # Spacing before a list; the items are emitted by their own <li> tags.
                text = "\n"
            elif not text:
                continue  # Nothing to write for an empty <p> or <li>
            else:
                # Real content ends the "consecutive headings" window.
                last_heading = None
                if tag.name == 'li':
                    parent = tag.find_parent('ul')
                    if parent and parent.find_parent('li'):
                        text = '    * ' + text  # Sub-bullet
                    else:
                        text = '- ' + text  # Main bullet

            f.write(text + '\n')

# Printed after the `with` block so the file is already closed and flushed.
print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/Safety.txt/Public_Safety.txt


In [19]:
import requests
from bs4 import BeautifulSoup
import os

# Pages under Business Development > City Planning that are scraped into
# City_Planning.txt.
urls = [
    "https://www.pittsburghpa.gov/Business-Development/City-Planning/About-DCP",
    "https://www.pittsburghpa.gov/Business-Development/City-Planning/About-DCP/Key-Contacts",
    "https://www.pittsburghpa.gov/Business-Development/City-Planning/City-Planning-Meetings",
    "https://www.pittsburghpa.gov/Business-Development/City-Planning/Public-Notices",
    "https://www.pittsburghpa.gov/Business-Development/City-Planning/Commissions-and-Boards",
    "https://www.pittsburghpa.gov/Business-Development/City-Planning/Historic-Preservation-Program",
    "https://www.pittsburghpa.gov/Business-Development/City-Planning/Public-Art",
    "https://www.pittsburghpa.gov/Business-Development/City-Planning/Neighborhood-Planning",
    "https://www.pittsburghpa.gov/Business-Development/City-Planning/Neighborhood-Planning/Whos-My-Neighborhood-Planner",
    "https://www.pittsburghpa.gov/Business-Development/City-Planning/Neighborhood-Planning/Public-Engagement-Guide",
    "https://www.pittsburghpa.gov/Business-Development/City-Planning/Neighborhood-Planning/Comprehensive-Planning-Process",
    "https://www.pittsburghpa.gov/Business-Development/City-Planning/Planning-Projects/Riverbank-Conditions-Assessment-and-Best-Practices-Study",
    "https://www.pittsburghpa.gov/Business-Development/City-Planning/Planning-Projects/Broadway-Avenue-Public-Realm-Project",
    "https://www.pittsburghpa.gov/Business-Development/City-Planning/Planning-Projects/Carrick-Artistic-Intersection",
    "https://www.pittsburghpa.gov/Business-Development/City-Planning/Planning-Projects/Hays-Woods",
    "https://www.pittsburghpa.gov/Business-Development/City-Planning/Planning-Projects/Pittsburgh%E2%80%99s-Riverfront-Zoning",
    "https://www.pittsburghpa.gov/Business-Development/City-Planning/Planning-Projects/Sheraden-Park-Master-Plan",
    "https://www.pittsburghpa.gov/Business-Development/City-Planning/Planning-Projects/South-Side-Park-Master-Plan",
    "https://www.pittsburghpa.gov/Business-Development/City-Planning/Planning-Projects/Ecoinnovation-District",
    "https://www.pittsburghpa.gov/Business-Development/City-Planning/Planning-Projects/Greater-Hazelwood-Neighborhood-Plan",
    "https://www.pittsburghpa.gov/Business-Development/City-Planning/Planning-Projects/Greater-Hill-District-Master-Plan",
    "https://www.pittsburghpa.gov/Business-Development/City-Planning/Planning-Projects/Homewood-Community-Plan",
    "https://www.pittsburghpa.gov/Business-Development/City-Planning/Planning-Projects/Manchester-Chateau-Neighborhood-Plan",
    "https://www.pittsburghpa.gov/Business-Development/City-Planning/Planning-Projects/The-Oakland-Plan",
    "https://www.pittsburghpa.gov/Business-Development/City-Planning/Planning-Programs/Adopt-A-Lot",
    "https://www.pittsburghpa.gov/Business-Development/City-Planning/Planning-Programs/Greenways",
    "https://www.pittsburghpa.gov/Business-Development/City-Planning/Planning-Programs/Building-Benchmarking-Compliance",
    "https://www.pittsburghpa.gov/Business-Development/City-Planning/Planning-Programs/Registered-Community-Organizations",
    "https://www.pittsburghpa.gov/Business-Development/City-Planning/Process-Guides",
    "https://www.pittsburghpa.gov/Business-Development/City-Planning/Process-Guides/Performance-Point-Process-Guide",
]

# NOTE: despite its ".txt" suffix, output_dir is created as a directory; the
# scraped text is written to a file inside it.
output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files/Business_Development.txt"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "City_Planning.txt")

# Fetch every page in `urls` and write its headings, paragraphs and list items
# to one UTF-8 text file (the file is overwritten on each run).
with open(output_file, 'w', encoding='utf-8') as f:
    for url in urls:
        # A timeout keeps a single unresponsive page from hanging the cell.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Stop on HTTP 4xx/5xx responses
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        if main_content is None:
            # Report skipped pages instead of dropping them silently.
            print(f"Warning: no 'main-content' div found, skipped {url}")
            continue

        tags_to_scrape = ['h1', 'h2', 'h3', 'p', 'ul', 'li']
        last_heading = None  # Stripped text of the heading written most recently

        for tag in main_content.find_all(tags_to_scrape):
            text = tag.get_text(strip=True)

            if tag.name in ['h1', 'h2', 'h3']:
                # Skip empty headings and consecutive duplicates. The comparison
                # uses the stripped text; the newline-padded form that is
                # written to the file would never match the next heading.
                if not text or text == last_heading:
                    continue
                last_heading = text
                text = f"\n{text}\n"
            elif tag.name == 'ul':
                # Spacing before a list; the items are emitted by their own <li> tags.
                text = "\n"
            elif not text:
                continue  # Nothing to write for an empty <p> or <li>
            else:
                # Real content ends the "consecutive headings" window.
                last_heading = None
                if tag.name == 'li':
                    parent = tag.find_parent('ul')
                    if parent and parent.find_parent('li'):
                        text = '    * ' + text  # Sub-bullet
                    else:
                        text = '- ' + text  # Main bullet

            f.write(text + '\n')

# Printed after the `with` block so the file is already closed and flushed.
print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/Business_Development.txt/City_Planning.txt


In [20]:
import requests
from bs4 import BeautifulSoup
import os

# Pages under Business Development > Mobility and Infrastructure to scrape.
# Each URL appears exactly once: repeated entries would be fetched again and
# their text written to the output file a second time.
urls = [
    "https://www.pittsburghpa.gov/Business-Development/Mobility-and-Infrastructure",
    "https://www.pittsburghpa.gov/Business-Development/Mobility-and-Infrastructure/About-Mobility-and-Infrastructure",
    "https://www.pittsburghpa.gov/Business-Development/Mobility-and-Infrastructure/About-Mobility-and-Infrastructure/COVID-19-Response",
    "https://www.pittsburghpa.gov/Business-Development/Mobility-and-Infrastructure/About-Mobility-and-Infrastructure/Policies-and-Standards",
    "https://www.pittsburghpa.gov/Business-Development/Mobility-and-Infrastructure/Maps",
    "https://www.pittsburghpa.gov/Business-Development/Mobility-and-Infrastructure/Plans",
    "https://www.pittsburghpa.gov/Business-Development/Mobility-and-Infrastructure/Plans/City-Bridges",
    "https://www.pittsburghpa.gov/Business-Development/Mobility-and-Infrastructure/Plans/Bike-Plan",
    "https://www.pittsburghpa.gov/Business-Development/Mobility-and-Infrastructure/Plans/City-Steps",
    "https://www.pittsburghpa.gov/Business-Development/Mobility-and-Infrastructure/Plans/Mobility",
    "https://www.pittsburghpa.gov/Business-Development/Mobility-and-Infrastructure/Plans/Neighborways",
    "https://www.pittsburghpa.gov/Business-Development/Mobility-and-Infrastructure/Plans/Pedestrian-Safety-Action-Plan",
    "https://www.pittsburghpa.gov/Business-Development/Mobility-and-Infrastructure/Plans/Pedestrian-Wayfinding-Kiosk-Project",
    "https://www.pittsburghpa.gov/Business-Development/Mobility-and-Infrastructure/Plans/2070-Mobility-Vision-Plan",
    "https://www.pittsburghpa.gov/Business-Development/Mobility-and-Infrastructure/Plans/Strip-District-Mobility-Plan",
    "https://www.pittsburghpa.gov/Business-Development/Mobility-and-Infrastructure/Plans/The-Homewood-Mobility-Plan",
    "https://www.pittsburghpa.gov/Business-Development/Mobility-and-Infrastructure/Current-DOMI-Projects",
    "https://www.pittsburghpa.gov/Business-Development/Mobility-and-Infrastructure/Current-DOMI-Projects/Achieved-Projects",
    "https://www.pittsburghpa.gov/Business-Development/Mobility-and-Infrastructure/Transportation-Development-Review/Curb-Cuts",
    "https://www.pittsburghpa.gov/Business-Development/Mobility-and-Infrastructure/Right-of-Way-Management",
    "https://www.pittsburghpa.gov/Business-Development/Mobility-and-Infrastructure/Right-of-Way-Management/ROW-311-and-Code-Enforcement",
    "https://www.pittsburghpa.gov/Business-Development/Mobility-and-Infrastructure/Right-of-Way-Management/Applicant-Guidance",
    "https://www.pittsburghpa.gov/Business-Development/Mobility-and-Infrastructure/Right-of-Way-Management/Telecommunications",
    "https://www.pittsburghpa.gov/Business-Development/Mobility-and-Infrastructure/Right-of-Way-Management/Small-Cell-Facilities",
    "https://www.pittsburghpa.gov/Business-Development/Mobility-and-Infrastructure/Right-of-Way-Management/Encroachment-Permit",
    "https://www.pittsburghpa.gov/Business-Development/Mobility-and-Infrastructure/Right-of-Way-Management/Outdoor-Dining-Operations-Program",
]
# NOTE: despite its ".txt" suffix, output_dir is created as a directory; the
# scraped text is written to a file inside it.
output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files/Business_Development.txt"
os.makedirs(output_dir, exist_ok=True)
# NOTE(review): "Infrastructre" is misspelled; left unchanged in case other
# code already reads this file name — confirm before renaming.
output_file = os.path.join(output_dir, "Mobility_Infrastructre.txt")

# Fetch every page in `urls` and write its headings, paragraphs and list items
# to one UTF-8 text file (the file is overwritten on each run).
with open(output_file, 'w', encoding='utf-8') as f:
    for url in urls:
        # A timeout keeps a single unresponsive page from hanging the cell.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Stop on HTTP 4xx/5xx responses
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        if main_content is None:
            # Report skipped pages instead of dropping them silently.
            print(f"Warning: no 'main-content' div found, skipped {url}")
            continue

        tags_to_scrape = ['h1', 'h2', 'h3', 'p', 'ul', 'li']
        last_heading = None  # Stripped text of the heading written most recently

        for tag in main_content.find_all(tags_to_scrape):
            text = tag.get_text(strip=True)

            if tag.name in ['h1', 'h2', 'h3']:
                # Skip empty headings and consecutive duplicates. The comparison
                # uses the stripped text; the newline-padded form that is
                # written to the file would never match the next heading.
                if not text or text == last_heading:
                    continue
                last_heading = text
                text = f"\n{text}\n"
            elif tag.name == 'ul':
                # Spacing before a list; the items are emitted by their own <li> tags.
                text = "\n"
            elif not text:
                continue  # Nothing to write for an empty <p> or <li>
            else:
                # Real content ends the "consecutive headings" window.
                last_heading = None
                if tag.name == 'li':
                    parent = tag.find_parent('ul')
                    if parent and parent.find_parent('li'):
                        text = '    * ' + text  # Sub-bullet
                    else:
                        text = '- ' + text  # Main bullet

            f.write(text + '\n')

# Printed after the `with` block so the file is already closed and flushed.
print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/Business_Development.txt/Mobility_Infrastructre.txt


In [21]:
import requests
from bs4 import BeautifulSoup
import os

# Pages under Business Development > Permits, Licenses and Inspections to scrape.
# Each URL appears exactly once: a repeated entry would be fetched again and
# its text written to the output file a second time.
urls = [
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/Applications-Forms/Permit-Applications",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/Applications-Forms/License-Applications",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/Applications-Forms/Checklists",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/Applications-Forms/Record-Requests",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/Applications-Forms/Other-Applications-Forms",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/One-Stop-PGH",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/One-Stop-PGH/Reference",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/One-Stop-PGH/OneStopPGH-Tutorials",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/One-Stop-PGH/Building-Development-Application",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/One-Stop-PGH/Bridge-Asset-Management-Program",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/One-Stop-PGH/Commission-Meetings-and-Minutes",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/One-Stop-PGH/OneStopPGH-Contacts",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/One-Stop-PGH/Plan-Review-Meeting",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/One-Stop-PGH/Structure-Types",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/Permits",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/Permits/About-PLI",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/Permits/Residential-Permits",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/Permits/Commercial-Permits",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/Permits/Permit-Process",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/Permits/Work-Not-Requiring-a-Permit",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/Licenses",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/Licenses/Business-Licenses",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/Licenses/Contractor-Licenses",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/Licenses/Trade-Licenses",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/PLI-Bulletins",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/Community-Resources",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/Community-Resources/Monthly-Community-Forum",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/Code-Enforcement",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/Code-Enforcement/Clean-and-Lien",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/Code-Enforcement/Top-10-Code-Violations",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/Code-Enforcement/Facade-Inspections",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/Code-Enforcement/Fire-Prevention",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/Condemned-Buildings",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/Condemned-Buildings/City-Funded-Demolition",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/Condemned-Buildings/Condemned-Under-Contract-and-Razed-Properties",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/Fees",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/Fees/Fee-Calculator",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/Building-Codes",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/Appeals",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/Appeals/Board-of-Appeals",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/Appeals/Board-of-License-and-Inspection-Review",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/Contacts",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/Contacts/Other-Agency-Coordination",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/Announcements",
    "https://www.pittsburghpa.gov/Business-Development/Permits-Licenses-and-Inspections/Registrations",
]
# NOTE: despite its ".txt" suffix, output_dir is created as a directory; the
# scraped text is written to a file inside it.
output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files/Business_Development.txt"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "Permits_licences_inspections.txt")

# Fetch every page in `urls` and write its headings, paragraphs and list items
# to one UTF-8 text file (the file is overwritten on each run).
with open(output_file, 'w', encoding='utf-8') as f:
    for url in urls:
        # A timeout keeps a single unresponsive page from hanging the cell.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Stop on HTTP 4xx/5xx responses
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        if main_content is None:
            # Report skipped pages instead of dropping them silently.
            print(f"Warning: no 'main-content' div found, skipped {url}")
            continue

        tags_to_scrape = ['h1', 'h2', 'h3', 'p', 'ul', 'li']
        last_heading = None  # Stripped text of the heading written most recently

        for tag in main_content.find_all(tags_to_scrape):
            text = tag.get_text(strip=True)

            if tag.name in ['h1', 'h2', 'h3']:
                # Skip empty headings and consecutive duplicates. The comparison
                # uses the stripped text; the newline-padded form that is
                # written to the file would never match the next heading.
                if not text or text == last_heading:
                    continue
                last_heading = text
                text = f"\n{text}\n"
            elif tag.name == 'ul':
                # Spacing before a list; the items are emitted by their own <li> tags.
                text = "\n"
            elif not text:
                continue  # Nothing to write for an empty <p> or <li>
            else:
                # Real content ends the "consecutive headings" window.
                last_heading = None
                if tag.name == 'li':
                    parent = tag.find_parent('ul')
                    if parent and parent.find_parent('li'):
                        text = '    * ' + text  # Sub-bullet
                    else:
                        text = '- ' + text  # Main bullet

            f.write(text + '\n')

# Printed after the `with` block so the file is already closed and flushed.
print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/Business_Development.txt/Permits_licences_inspections.txt


In [22]:
import requests
from bs4 import BeautifulSoup
import os

# Pages for Business Development > Innovation & Performance that are scraped
# into Innovation_performance.txt.
# NOTE(review): the getinvolved.pittsburghpa.gov links carry a `_gl=...` query
# string, presumably an analytics token copied from the browser — confirm
# whether it can be dropped.
urls = [
    "https://www.pittsburghpa.gov/Business-Development/Innovation-Performance",
    "https://getinvolved.pittsburghpa.gov/city-website-redesign?_gl=1*19x02wy*_ga*MTQ2MzQwODA5Mi4xNzQwMzM3MDQ1*_ga_6YKGFYBQNH*MTc0MTgwNjY2MS4yMi4xLjE3NDE4MTQyMzMuMC4wLjA.",
    "https://www.pittsburghpa.gov/Business-Development/Innovation-Performance/Pittsburgh-Digital-Equity-Coalition",
    "https://www.pittsburghpa.gov/Business-Development/Innovation-Performance/Pittsburgh-Digital-Equity-Coalition/PDEC-Members-Acknowledgements",
    "https://www.pittsburghpa.gov/Business-Development/Innovation-Performance/Pittsburgh-Digital-Equity-Coalition/Media-and-Information-Resources",
    "https://getinvolved.pittsburghpa.gov/printer-optimization-program?_gl=1*1gd6wnc*_ga*MTQ2MzQwODA5Mi4xNzQwMzM3MDQ1*_ga_6YKGFYBQNH*MTc0MTgwNjY2MS4yMi4xLjE3NDE4MTQzMTMuMC4wLjA.",
    "https://getinvolved.pittsburghpa.gov/process-improvement-in-pittsburgh?_gl=1*1j4a85u*_ga*MTQ2MzQwODA5Mi4xNzQwMzM3MDQ1*_ga_6YKGFYBQNH*MTc0MTgwNjY2MS4yMi4xLjE3NDE4MTQzMjMuMC4wLjA.",
    "https://getinvolved.pittsburghpa.gov/public-safety-media-blotter?_gl=1*r46fdv*_ga*MTQ2MzQwODA5Mi4xNzQwMzM3MDQ1*_ga_6YKGFYBQNH*MTc0MTgwNjY2MS4yMi4xLjE3NDE4MTQzMzIuMC4wLjA.",
    "https://getinvolved.pittsburghpa.gov/station-alerting?_gl=1*lopowk*_ga*MTQ2MzQwODA5Mi4xNzQwMzM3MDQ1*_ga_6YKGFYBQNH*MTc0MTgwNjY2MS4yMi4xLjE3NDE4MTQzMzkuMC4wLjA.",
    "https://getinvolved.pittsburghpa.gov/tech-refresh?_gl=1*mbzaru*_ga*MTQ2MzQwODA5Mi4xNzQwMzM3MDQ1*_ga_6YKGFYBQNH*MTc0MTgwNjY2MS4yMi4xLjE3NDE4MTQzNDcuMC4wLjA.",
    "https://www.pittsburghpa.gov/Business-Development/Innovation-Performance/Our-Teams",
    "https://www.pittsburghpa.gov/Business-Development/Innovation-Performance/IP-Contacts",
    "https://www.pittsburghpa.gov/Business-Development/Innovation-Performance/Web-Disclaimers-Policies",
    "https://www.pittsburghpa.gov/Business-Development/Innovation-Performance/Web-Disclaimers-Policies/External-Link-Policy",
    "https://www.pittsburghpa.gov/Business-Development/Innovation-Performance/Web-Disclaimers-Policies/Privacy-Policy",
    "https://www.pittsburghpa.gov/Business-Development/Innovation-Performance/Web-Disclaimers-Policies/Security-Policy",
    "https://www.pittsburghpa.gov/Business-Development/Innovation-Performance/Web-Disclaimers-Policies/Google-Translate-Policy",
]
# NOTE: despite its ".txt" suffix, output_dir is created as a directory; the
# scraped text is written to a file inside it.
output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files/Business_Development.txt"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "Innovation_performance.txt")

# Fetch every page in `urls` and write its headings, paragraphs and list items
# to one UTF-8 text file (the file is overwritten on each run).
with open(output_file, 'w', encoding='utf-8') as f:
    for url in urls:
        # A timeout keeps a single unresponsive page from hanging the cell.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Stop on HTTP 4xx/5xx responses
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        if main_content is None:
            # Report skipped pages instead of dropping them silently; pages
            # served from a different site may not use this container id.
            print(f"Warning: no 'main-content' div found, skipped {url}")
            continue

        tags_to_scrape = ['h1', 'h2', 'h3', 'p', 'ul', 'li']
        last_heading = None  # Stripped text of the heading written most recently

        for tag in main_content.find_all(tags_to_scrape):
            text = tag.get_text(strip=True)

            if tag.name in ['h1', 'h2', 'h3']:
                # Skip empty headings and consecutive duplicates. The comparison
                # uses the stripped text; the newline-padded form that is
                # written to the file would never match the next heading.
                if not text or text == last_heading:
                    continue
                last_heading = text
                text = f"\n{text}\n"
            elif tag.name == 'ul':
                # Spacing before a list; the items are emitted by their own <li> tags.
                text = "\n"
            elif not text:
                continue  # Nothing to write for an empty <p> or <li>
            else:
                # Real content ends the "consecutive headings" window.
                last_heading = None
                if tag.name == 'li':
                    parent = tag.find_parent('ul')
                    if parent and parent.find_parent('li'):
                        text = '    * ' + text  # Sub-bullet
                    else:
                        text = '- ' + text  # Main bullet

            f.write(text + '\n')

# Printed after the `with` block so the file is already closed and flushed.
print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/Business_Development.txt/Innovation_performance.txt


In [23]:
import requests
from bs4 import BeautifulSoup
import os

# Pages under Business Development > Public Works that are scraped into
# Public_Works.txt.
urls = [
    "https://www.pittsburghpa.gov/Business-Development/Public-Works/About-Public-Works",
    "https://www.pittsburghpa.gov/Business-Development/Public-Works/Contact-Public-Works",
    "https://www.pittsburghpa.gov/Business-Development/Public-Works/Architecture-Division",
]
# NOTE: despite its ".txt" suffix, output_dir is created as a directory; the
# scraped text is written to a file inside it.
output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files/Business_Development.txt"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "Public_Works.txt")

# Fetch every page in `urls` and write its headings, paragraphs and list items
# to one UTF-8 text file (the file is overwritten on each run).
with open(output_file, 'w', encoding='utf-8') as f:
    for url in urls:
        # A timeout keeps a single unresponsive page from hanging the cell.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Stop on HTTP 4xx/5xx responses
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        if main_content is None:
            # Report skipped pages instead of dropping them silently.
            print(f"Warning: no 'main-content' div found, skipped {url}")
            continue

        tags_to_scrape = ['h1', 'h2', 'h3', 'p', 'ul', 'li']
        last_heading = None  # Stripped text of the heading written most recently

        for tag in main_content.find_all(tags_to_scrape):
            text = tag.get_text(strip=True)

            if tag.name in ['h1', 'h2', 'h3']:
                # Skip empty headings and consecutive duplicates. The comparison
                # uses the stripped text; the newline-padded form that is
                # written to the file would never match the next heading.
                if not text or text == last_heading:
                    continue
                last_heading = text
                text = f"\n{text}\n"
            elif tag.name == 'ul':
                # Spacing before a list; the items are emitted by their own <li> tags.
                text = "\n"
            elif not text:
                continue  # Nothing to write for an empty <p> or <li>
            else:
                # Real content ends the "consecutive headings" window.
                last_heading = None
                if tag.name == 'li':
                    parent = tag.find_parent('ul')
                    if parent and parent.find_parent('li'):
                        text = '    * ' + text  # Sub-bullet
                    else:
                        text = '- ' + text  # Main bullet

            f.write(text + '\n')

# Printed after the `with` block so the file is already closed and flushed.
print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/Business_Development.txt/Public_Works.txt


In [24]:
import requests
from bs4 import BeautifulSoup
import os

# Pages under Business Development > Zoning to scrape.
# Each URL appears exactly once: a repeated entry would be fetched again and
# its text written to the output file a second time.
urls = [
    "https://www.pittsburghpa.gov/Business-Development/Zoning",
    "https://www.pittsburghpa.gov/Business-Development/Zoning/Zoning-FAQ",
    "https://www.pittsburghpa.gov/Business-Development/Zoning/Planning-Application-and-Process",
    "https://www.pittsburghpa.gov/Business-Development/Zoning/Planning-Application-and-Process/Fee-Schedule",
    "https://www.pittsburghpa.gov/Business-Development/Zoning/Planning-Application-and-Process/Property-Certification",
    "https://www.pittsburghpa.gov/Business-Development/Zoning/Planning-Application-and-Process/ConsolidationsSubdivisions",
    "https://www.pittsburghpa.gov/Business-Development/Zoning/Planning-Application-and-Process/Institutional-Master-Plan",
    "https://www.pittsburghpa.gov/Business-Development/Zoning/Planning-Application-and-Process/Specially-Planned-Districts",
    "https://www.pittsburghpa.gov/Business-Development/Zoning/Planning-Application-and-Process/Pre-Application-Meeting",
    "https://www.pittsburghpa.gov/Business-Development/Zoning/Plan-Reviews-and-Notices",
    "https://www.pittsburghpa.gov/Business-Development/Zoning/Plan-Reviews-and-Notices/Site-Plan-Review",
    "https://www.pittsburghpa.gov/Business-Development/Zoning/Plan-Reviews-and-Notices/Site-Plan-Requirements",
    "https://www.pittsburghpa.gov/Business-Development/Zoning/Plan-Reviews-and-Notices/Certificate-of-Occupancy",
    "https://www.pittsburghpa.gov/Business-Development/Zoning/Plan-Reviews-and-Notices/Administrator-Exception",
    "https://www.pittsburghpa.gov/Business-Development/Zoning/Plan-Reviews-and-Notices/Design-Review",
    "https://www.pittsburghpa.gov/Business-Development/Zoning/Plan-Reviews-and-Notices/Environmental-Review",
    "https://www.pittsburghpa.gov/Business-Development/Zoning/Plan-Reviews-and-Notices/Environmental-Review/Floodplain",
    "https://www.pittsburghpa.gov/Business-Development/Zoning/Plan-Reviews-and-Notices/Environmental-Review/Geotechnical-Reports",
    "https://www.pittsburghpa.gov/Business-Development/Zoning/Plan-Reviews-and-Notices/Construction-Management-Plans",
    "https://www.pittsburghpa.gov/Business-Development/Zoning/Housing",
]
# NOTE: despite its ".txt" suffix, output_dir is created as a directory; the
# scraped text is written to a file inside it.
output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files/Business_Development.txt"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "Zoning.txt")

# Fetch every page in `urls` and write its headings, paragraphs and list items
# to one UTF-8 text file (the file is overwritten on each run).
with open(output_file, 'w', encoding='utf-8') as f:
    for url in urls:
        # A timeout keeps a single unresponsive page from hanging the cell.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Stop on HTTP 4xx/5xx responses
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        if main_content is None:
            # Report skipped pages instead of dropping them silently.
            print(f"Warning: no 'main-content' div found, skipped {url}")
            continue

        tags_to_scrape = ['h1', 'h2', 'h3', 'p', 'ul', 'li']
        last_heading = None  # Stripped text of the heading written most recently

        for tag in main_content.find_all(tags_to_scrape):
            text = tag.get_text(strip=True)

            if tag.name in ['h1', 'h2', 'h3']:
                # Skip empty headings and consecutive duplicates. The comparison
                # uses the stripped text; the newline-padded form that is
                # written to the file would never match the next heading.
                if not text or text == last_heading:
                    continue
                last_heading = text
                text = f"\n{text}\n"
            elif tag.name == 'ul':
                # Spacing before a list; the items are emitted by their own <li> tags.
                text = "\n"
            elif not text:
                continue  # Nothing to write for an empty <p> or <li>
            else:
                # Real content ends the "consecutive headings" window.
                last_heading = None
                if tag.name == 'li':
                    parent = tag.find_parent('ul')
                    if parent and parent.find_parent('li'):
                        text = '    * ' + text  # Sub-bullet
                    else:
                        text = '- ' + text  # Main bullet

            f.write(text + '\n')

# Printed after the `with` block so the file is already closed and flushed.
print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/Business_Development.txt/Zoning.txt


In [25]:
import requests
from bs4 import BeautifulSoup
import os

# GIS pages to export. Each URL appears once so no page is written twice.
urls = [
    "https://www.pittsburghpa.gov/Business-Development/GIS",
    "https://www.pittsburghpa.gov/Business-Development/GIS/Download-Data-Maps",
    "https://www.pittsburghpa.gov/Business-Development/GIS/Interactive-Maps",
    "https://www.pittsburghpa.gov/Business-Development/GIS/Address-Request",
    "https://www.pittsburghpa.gov/Business-Development/GIS/New-Street-Name-Request",
]
output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files/Business_Development.txt"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "GIS.txt")

# Export the headings, paragraphs and list items found inside each page's
# <div id="main-content"> into a single text file, pages in the order of `urls`.
with open(output_file, 'w', encoding='utf-8') as f:
    for url in urls:
        # Bound the wait so one unresponsive page cannot stall the whole run.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        if main_content:
            tags_to_scrape = ['h1', 'h2', 'h3', 'p', 'ul', 'li']
            content_tags = main_content.find_all(tags_to_scrape)
            prev_text = ""  # bare text of the last heading written for this page

            for tag in content_tags:
                text = tag.get_text(strip=True)

                if tag.name in ['h1', 'h2', 'h3']:
                    # Skip a heading whose text repeats the previous heading.
                    if text == prev_text:
                        continue
                    # Remember the bare text before padding it: the comparison
                    # above is made against unpadded text.
                    prev_text = text
                    text = f"\n{text}\n"
                elif tag.name == 'ul':
                    text = "\n"  # the <ul> itself only adds spacing; its <li> items follow
                elif tag.name == 'li':
                    parent = tag.find_parent('ul')
                    if parent and parent.find_parent('li'):
                        text = '    * ' + text  # item of a list nested inside another item
                    else:
                        text = '- ' + text  # top-level item

                if text:
                    f.write(text + '\n')
    print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/Business_Development.txt/GIS.txt


In [26]:
import requests
from bs4 import BeautifulSoup
import os

# Procurement pages to export.
urls = [
    "https://www.pittsburghpa.gov/Business-Development/Procurement/Beacon-Online-Procurement",
    "https://www.pittsburghpa.gov/Business-Development/Procurement/FAQs",
    "https://www.pittsburghpa.gov/Business-Development/Procurement/Bid-Opportunities",
    "https://www.pittsburghpa.gov/Business-Development/Procurement/Procurement-Resources",
]
output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files/Business_Development.txt"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "Procurement.txt")

# Export the headings, paragraphs and list items found inside each page's
# <div id="main-content"> into a single text file, pages in the order of `urls`.
with open(output_file, 'w', encoding='utf-8') as f:
    for url in urls:
        # Bound the wait so one unresponsive page cannot stall the whole run.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        if main_content:
            tags_to_scrape = ['h1', 'h2', 'h3', 'p', 'ul', 'li']
            content_tags = main_content.find_all(tags_to_scrape)
            prev_text = ""  # bare text of the last heading written for this page

            for tag in content_tags:
                text = tag.get_text(strip=True)

                if tag.name in ['h1', 'h2', 'h3']:
                    # Skip a heading whose text repeats the previous heading.
                    if text == prev_text:
                        continue
                    # Remember the bare text before padding it: the comparison
                    # above is made against unpadded text.
                    prev_text = text
                    text = f"\n{text}\n"
                elif tag.name == 'ul':
                    text = "\n"  # the <ul> itself only adds spacing; its <li> items follow
                elif tag.name == 'li':
                    parent = tag.find_parent('ul')
                    if parent and parent.find_parent('li'):
                        text = '    * ' + text  # item of a list nested inside another item
                    else:
                        text = '- ' + text  # top-level item

                if text:
                    f.write(text + '\n')
    print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/Business_Development.txt/Procurement.txt


In [27]:
import requests
from bs4 import BeautifulSoup
import os

# PGH Lab pages to export.
urls = [
    "https://www.pittsburghpa.gov/Business-Development/PGH-Lab",
    "https://www.pittsburghpa.gov/Business-Development/PGH-Lab/Benefits",
    "https://www.pittsburghpa.gov/Business-Development/PGH-Lab/Alumni",
    "https://www.pittsburghpa.gov/Business-Development/PGH-Lab/Startup-Resources",
    "https://www.pittsburghpa.gov/Business-Development/PGH-Lab/FAQ",
    "https://www.pittsburghpa.gov/Business-Development/PGH-Lab/Contact-Us",
]
output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files/Business_Development.txt"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "PGH-LAB.txt")

# Export the headings, paragraphs and list items found inside each page's
# <div id="main-content"> into a single text file, pages in the order of `urls`.
with open(output_file, 'w', encoding='utf-8') as f:
    for url in urls:
        # Bound the wait so one unresponsive page cannot stall the whole run.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        if main_content:
            tags_to_scrape = ['h1', 'h2', 'h3', 'p', 'ul', 'li']
            content_tags = main_content.find_all(tags_to_scrape)
            prev_text = ""  # bare text of the last heading written for this page

            for tag in content_tags:
                text = tag.get_text(strip=True)

                if tag.name in ['h1', 'h2', 'h3']:
                    # Skip a heading whose text repeats the previous heading.
                    if text == prev_text:
                        continue
                    # Remember the bare text before padding it: the comparison
                    # above is made against unpadded text.
                    prev_text = text
                    text = f"\n{text}\n"
                elif tag.name == 'ul':
                    text = "\n"  # the <ul> itself only adds spacing; its <li> items follow
                elif tag.name == 'li':
                    parent = tag.find_parent('ul')
                    if parent and parent.find_parent('li'):
                        text = '    * ' + text  # item of a list nested inside another item
                    else:
                        text = '- ' + text  # top-level item

                if text:
                    f.write(text + '\n')
    print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/Business_Development.txt/PGH-LAB.txt


In [28]:
import requests
from bs4 import BeautifulSoup
import os

# Sustainability pages to export (including the Food-Systems sub-pages).
urls = [
    "https://www.pittsburghpa.gov/Business-Development/Sustainability",
    "https://www.pittsburghpa.gov/Business-Development/Sustainability/Climate-Action-Plan",
    "https://www.pittsburghpa.gov/Business-Development/Sustainability/Resilient-Pittsburgh",
    "https://www.pittsburghpa.gov/Business-Development/Sustainability/Food-Systems",
    "https://www.pittsburghpa.gov/Business-Development/Sustainability/Food-Systems/Local-Food-Is-More-Sustainable",
    "https://www.pittsburghpa.gov/Business-Development/Sustainability/Food-Systems/Community-Supported-Agriculture",
    "https://www.pittsburghpa.gov/Business-Development/Sustainability/Food-Systems/Urban-Agriculture",
    "https://www.pittsburghpa.gov/Business-Development/Sustainability/Food-Systems/Food-Access-Programs",
    "https://www.pittsburghpa.gov/Business-Development/Sustainability/Food-Systems/Food-Waste",
    "https://www.pittsburghpa.gov/Business-Development/Sustainability/Food-Systems/Children-Gardens-and-Healthy-Food",
    "https://www.pittsburghpa.gov/Business-Development/Sustainability/Food-Systems/Food-Related-Events",
    "https://www.pittsburghpa.gov/Business-Development/Sustainability/Food-Systems/Internships-and-Career-Opportunities",
    "https://www.pittsburghpa.gov/Business-Development/Sustainability/Food-Systems/Food-Systems-Resources",
    "https://www.pittsburghpa.gov/Business-Development/Sustainability/Sustainability-Resilience-Library",
    "https://www.pittsburghpa.gov/Business-Development/Sustainability/Affiliations-Memberships",
    "https://www.pittsburghpa.gov/Business-Development/Sustainability/Environmental-Planning-and-Review",
]
output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files/Business_Development.txt"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "Sustainability.txt")

# Export the headings, paragraphs and list items found inside each page's
# <div id="main-content"> into a single text file, pages in the order of `urls`.
with open(output_file, 'w', encoding='utf-8') as f:
    for url in urls:
        # Bound the wait so one unresponsive page cannot stall the whole run.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        if main_content:
            tags_to_scrape = ['h1', 'h2', 'h3', 'p', 'ul', 'li']
            content_tags = main_content.find_all(tags_to_scrape)
            prev_text = ""  # bare text of the last heading written for this page

            for tag in content_tags:
                text = tag.get_text(strip=True)

                if tag.name in ['h1', 'h2', 'h3']:
                    # Skip a heading whose text repeats the previous heading.
                    if text == prev_text:
                        continue
                    # Remember the bare text before padding it: the comparison
                    # above is made against unpadded text.
                    prev_text = text
                    text = f"\n{text}\n"
                elif tag.name == 'ul':
                    text = "\n"  # the <ul> itself only adds spacing; its <li> items follow
                elif tag.name == 'li':
                    parent = tag.find_parent('ul')
                    if parent and parent.find_parent('li'):
                        text = '    * ' + text  # item of a list nested inside another item
                    else:
                        text = '- ' + text  # top-level item

                if text:
                    f.write(text + '\n')
    print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/Business_Development.txt/Sustainability.txt


In [29]:
import requests
from bs4 import BeautifulSoup
import os

# Forestry Division pages to export.
urls = [
    "https://www.pittsburghpa.gov/Business-Development/Public-Works/Forestry-Division",
    "https://www.pittsburghpa.gov/Business-Development/Public-Works/Forestry-Division/Pittsburgh-Urban-Forest",
    "https://www.pittsburghpa.gov/Business-Development/Public-Works/Forestry-Division/Special-Initiatives",
    "https://www.pittsburghpa.gov/Business-Development/Public-Works/Forestry-Division/Forestry-Partnerships",
    "https://www.pittsburghpa.gov/Business-Development/Public-Works/Forestry-Division/Forestry-Events",
    "https://www.pittsburghpa.gov/Business-Development/Public-Works/Forestry-Division/Tree-Planting",
    "https://www.pittsburghpa.gov/Business-Development/Public-Works/Forestry-Division/Request-Tree-Work",
    "https://www.pittsburghpa.gov/Business-Development/Public-Works/Forestry-Division/Appealing-Street-Tree-Removal",
    "https://www.pittsburghpa.gov/Business-Development/Public-Works/Forestry-Division/Forestry-Resource-Links",
    "https://www.pittsburghpa.gov/Business-Development/Public-Works/Forestry-Division/Forestry-FAQ",
    "https://www.pittsburghpa.gov/Business-Development/Public-Works/Forestry-Division/Vacant-Lot-Greening",
]
output_dir = "/home/jdalvi/jdalvi/anlp2/scraped_txt_files/Resident_Services.txt/Tree_Resources"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "Tree_Resources.txt")

# Export the headings, paragraphs and list items found inside each page's
# <div id="main-content"> into a single text file, pages in the order of `urls`.
with open(output_file, 'w', encoding='utf-8') as f:
    for url in urls:
        # Bound the wait so one unresponsive page cannot stall the whole run.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'main-content'})

        if main_content:
            tags_to_scrape = ['h1', 'h2', 'h3', 'p', 'ul', 'li']
            content_tags = main_content.find_all(tags_to_scrape)
            prev_text = ""  # bare text of the last heading written for this page

            for tag in content_tags:
                text = tag.get_text(strip=True)

                if tag.name in ['h1', 'h2', 'h3']:
                    # Skip a heading whose text repeats the previous heading.
                    if text == prev_text:
                        continue
                    # Remember the bare text before padding it: the comparison
                    # above is made against unpadded text.
                    prev_text = text
                    text = f"\n{text}\n"
                elif tag.name == 'ul':
                    text = "\n"  # the <ul> itself only adds spacing; its <li> items follow
                elif tag.name == 'li':
                    parent = tag.find_parent('ul')
                    if parent and parent.find_parent('li'):
                        text = '    * ' + text  # item of a list nested inside another item
                    else:
                        text = '- ' + text  # top-level item

                if text:
                    f.write(text + '\n')
    print(f"Data successfully saved to {output_file}")

Data successfully saved to /home/jdalvi/jdalvi/anlp2/scraped_txt_files/Resident_Services.txt/Tree_Resources/Tree_Resources.txt


## Visit_pittsburgh.txt


In [8]:
import requests
from bs4 import BeautifulSoup

# Blog page listing upcoming concerts
url = "https://www.visitpittsburgh.com/blog/upcoming-concerts-in-pittsburgh/"

# Browser-like User-Agent sent with the request
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}


def _text_or_na(node):
    """Return the stripped text of a found tag, or "N/A" when nothing was found."""
    return node.text.strip() if node else "N/A"


response = requests.get(url, headers=headers)

if response.status_code != 200:
    print("Failed to retrieve the webpage")
else:
    soup = BeautifulSoup(response.text, 'html.parser')

    # Every <div> on the page is treated as a candidate event block.
    # NOTE(review): the original marks this selector as a placeholder
    # ("Change this to correct container").
    event_blocks = soup.find_all("div")

    # One record per block: first <h2> as title, first <p> as date, first <span> as venue.
    concerts = []
    for event in event_blocks:
        title_text = _text_or_na(event.find('h2'))
        date_text = _text_or_na(event.find('p'))
        venue_text = _text_or_na(event.find('span'))
        concerts.append(f"Title: {title_text}\nDate: {date_text}\nVenue: {venue_text}\n---\n")

    # Records already end with a separator line, so they are written back to back.
    with open("pittsburgh_concerts.txt", "w", encoding="utf-8") as f:
        f.write("".join(concerts))

    print("Data saved to pittsburgh_concerts.txt")


Data saved to pittsburgh_concerts.txt


In [10]:
import requests
from bs4 import BeautifulSoup

# "Things to do" landing page
url = "https://www.visitpittsburgh.com/things-to-do/"
headers = {"User-Agent": "Mozilla/5.0"}

# Download and parse the page
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")

# Each card is a <span class="mashup__content"> holding a heading span and a summary span
cards = soup.find_all("span", class_="mashup__content")

data = []
for card in cards:
    title = card.find("span", class_="mashup__heading")
    summary = card.find("span", class_="mashup__summary")

    # Missing pieces are recorded as "N/A" rather than skipped
    if title:
        title_text = title.text.strip()
    else:
        title_text = "N/A"
    if summary:
        summary_text = summary.text.strip()
    else:
        summary_text = "N/A"

    data.append("Title: " + title_text + "\nSummary: " + summary_text + "\n---\n")

# Records already end with a separator line, so they are written back to back
with open("pittsburgh_data.txt", "w", encoding="utf-8") as f:
    f.write("".join(data))

print("Data saved to pittsburgh_data.txt")


Data saved to pittsburgh_data.txt


In [None]:
import requests
from bs4 import BeautifulSoup

# URL of the webpage
url = "https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/pittsburgh-pirates/"
headers = {"User-Agent": "Mozilla/5.0"}

# Fetch the webpage
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")

# Page title (<h1 class="page-title__heading">); "N/A" when the element is absent
title_element = soup.find("h1", class_="page-title__heading")
title = title_element.text.strip() if title_element else "N/A"

# Section heading, matched on its exact text
subtitle_element = soup.find("h2", string="Baseball in Pittsburgh")
subtitle = subtitle_element.text.strip() if subtitle_element else "N/A"

# Paragraphs of the first "text__text" block that follows the section heading;
# empty when the heading (or the block) was not found
content_div = subtitle_element.find_next("div", class_="text__text") if subtitle_element else None
paragraphs = content_div.find_all("p") if content_div else []

formatted_paragraphs = []
for p in paragraphs:
    text = p.text.strip()

    # Append each link's target in parentheses after its visible text
    for link in p.find_all("a", href=True):
        link_text = link.text.strip()
        if not link_text:
            # An anchor without visible text has nothing to annotate, and
            # str.replace("", ...) would insert the URL between every character.
            continue
        link_url = link["href"]
        text = text.replace(link_text, f"{link_text} ({link_url})")

    formatted_paragraphs.append(text)

# Save to a text file
with open("pittsburgh_pirates_selected.txt", "w", encoding="utf-8") as f:
    f.write(f"Title: {title}\n")
    f.write(f"Subtitle: {subtitle}\n\n")
    f.write("--- Full Content ---\n\n")
    f.write("\n\n".join(formatted_paragraphs))

print("Selected data saved to pittsburgh_pirates_selected.txt")


Selected data saved to pittsburgh_pirates_selected.txt


In [19]:
import requests
from bs4 import BeautifulSoup

# URL of the webpage
url = "https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/pittsburgh-steelers/"
headers = {"User-Agent": "Mozilla/5.0"}

# Output file. The name still says "pirates" although this cell scrapes the
# Steelers page; it is kept unchanged so existing files keep their location.
output_path = "pittsburgh_pirates_selected2.txt"

# Fetch the webpage
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")

# Page title (<h1 class="page-title__heading">); "N/A" when the element is absent
title_element = soup.find("h1", class_="page-title__heading")
title = title_element.text.strip() if title_element else "N/A"

# Section heading, matched on its exact text
subtitle_element = soup.find("h2", string="Football in Pittsburgh")
subtitle = subtitle_element.text.strip() if subtitle_element else "N/A"

# Paragraphs of the first "text__text" block that follows the section heading;
# empty when the heading (or the block) was not found
content_div = subtitle_element.find_next("div", class_="text__text") if subtitle_element else None
paragraphs = content_div.find_all("p") if content_div else []

formatted_paragraphs = []
for p in paragraphs:
    text = p.text.strip()

    # Append each link's target in parentheses after its visible text
    for link in p.find_all("a", href=True):
        link_text = link.text.strip()
        if not link_text:
            # An anchor without visible text has nothing to annotate, and
            # str.replace("", ...) would insert the URL between every character.
            continue
        link_url = link["href"]
        text = text.replace(link_text, f"{link_text} ({link_url})")

    formatted_paragraphs.append(text)

# Save to a text file
with open(output_path, "w", encoding="utf-8") as f:
    f.write(f"Title: {title}\n")
    f.write(f"Subtitle: {subtitle}\n\n")
    f.write("--- Full Content ---\n\n")
    f.write("\n\n".join(formatted_paragraphs))

# Report the file that was actually written
print(f"Selected data saved to {output_path}")


Selected data saved to pittsburgh_pirates_selected.txt


In [27]:
import requests
from bs4 import BeautifulSoup

# URL of the Pittsburgh Penguins page
url = "https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/pittsburgh-penguins/"
headers = {"User-Agent": "Mozilla/5.0"}

# Fetch the webpage
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")

# Page title (<h1 class="page-title__heading">); "N/A" when the element is absent
title_element = soup.find("h1", class_="page-title__heading")
title = title_element.text.strip() if title_element else "N/A"

# Locate the content block in THIS page's soup. Without this assignment the
# cell would read whatever `content_div` an earlier cell left in the kernel
# (or fail with NameError on a fresh kernel).
# NOTE(review): this takes the first "text__text" block on the page; confirm
# that is the intended section, or anchor on a section heading instead.
content_div = soup.find("div", class_="text__text")
paragraphs = content_div.find_all("p") if content_div else []

formatted_paragraphs = []
for p in paragraphs:
    text = p.text.strip()

    # Append each link's target in parentheses after its visible text
    for link in p.find_all("a", href=True):
        link_text = link.text.strip()
        if not link_text:
            # An anchor without visible text has nothing to annotate, and
            # str.replace("", ...) would insert the URL between every character.
            continue
        link_url = link["href"]
        text = text.replace(link_text, f"{link_text} ({link_url})")

    formatted_paragraphs.append(text)

# Save to a text file (no subtitle line is written for this page)
with open("pittsburgh_penguins_cleaned.txt", "w", encoding="utf-8") as f:
    f.write(f"Title: {title}\n")
    f.write("--- Full Content ---\n\n")
    f.write("\n\n".join(formatted_paragraphs))

print("Selected data saved to pittsburgh_penguins_cleaned.txt")


Selected data saved to pittsburgh_penguins_cleaned.txt


In [28]:
import requests
from bs4 import BeautifulSoup

# Sports-teams overview page
url = "https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/"
headers = {"User-Agent": "Mozilla/5.0"}

# Download and parse the page
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")

# First <h1> on the page, or "N/A" when there is none
title_element = soup.find("h1")
title = "N/A" if not title_element else title_element.text.strip()

# Text of every <h2> and every <p>, in document order
subheadings = []
for h2 in soup.find_all("h2"):
    subheadings.append(h2.text.strip())

paragraphs = []
for p in soup.find_all("p"):
    paragraphs.append(p.text.strip())

# Link text -> href for every anchor whose href contains "http".
# A later anchor with the same text replaces the earlier entry.
important_links = {
    a.text.strip(): a["href"]
    for a in soup.find_all("a", href=True)
    if "http" in a["href"]
}

# Assemble the report section by section, then write it in one go
report = [f"Title: {title}\n\n", "--- Sports Teams ---\n"]
report.extend(f"- {h2}\n" for h2 in subheadings)
report.append("\n--- Team Descriptions ---\n")
report.extend(f"{p}\n\n" for p in paragraphs)
report.append("\n--- Important Links ---\n")
report.extend(f"{text}: {link}\n" for text, link in important_links.items())

with open("pittsburgh_sports_teams.txt", "w", encoding="utf-8") as f:
    f.write("".join(report))

print("Data saved to pittsburgh_sports_teams.txt")


Data saved to pittsburgh_sports_teams.txt


In [38]:
from bs4 import BeautifulSoup

# Hard-coded HTML fragment: one <h2> heading followed by <h3> sections with their paragraphs
html_content = """
<div id="id-1313653" class="block text text--default layout--spaced-v layout--fluid-h bg--white bg--light">
    <div class="text__inner text--content">
        <div class="text__heading layout--text-max aos-init aos-animate" data-aos="fade-up" data-aos-anchor="#id-1313653" style="--aos-duration: 500ms;">
            <h2>Reasons to Compete in Pittsburgh</h2>
        </div>
        <div class="text__text lobotomize layout--text-max links aos-init aos-animate" data-aos="fade-up" data-aos-anchor="#id-1313653" style="--aos-duration: 500ms; --aos-delay: 100ms;">
            <h3>Iconic Sports Experiences</h3>
            <p>"SportsPITTSBURGH knows firsthand how to host championship-caliber events all while creating Iconic Experiences for athletes, coaches and fans that are guaranteed to be memorable for years to come. From professional and amateur to high school and youth sports, there’s never been a better time to hoist a trophy in the City of Champions."</p>
            <h3>THE Sports City</h3>
            <p>"The City of Champions knows sports, and local pride shines bright. Home to the Steelers, Penguins and Pirates, Pittsburgh is a city that bleeds black and gold. The community rallies around sporting events big and small."</p>
            <h3>Unique Sports Facilities & World-Class Venues</h3>
            <p>"From the LEED® Platinum-Certified David L. Lawrence Convention Center, which has hosted 500 teams on 36 basketball courts, to PPG Paints Arena, which features a versatile floorspace good for everything from wrestling to volleyball and hockey, the Pittsburgh region is well equipped to support a multitude of sports events. With many major universities in the city, too, collegiate-level facilities are available in several Pittsburgh neighborhoods."</p>
            <p>"PNC Park and Heinz Field, the acclaimed homes of the Pittsburgh Pirates and Pittsburgh Steelers, respectively, provide awe-inspiring atmospheres and breathtaking views of the city skyline from Pittsburgh’s North Shore. Highmark Stadium, home to the Pittsburgh Riverhounds soccer team, sits along the Monongahela River and boasts its own unbeatable cityscape view."</p>
        </div>
    </div>
</div>
"""

soup = BeautifulSoup(html_content, "html.parser")

# The single <h2> opens the output, followed by a blank line
heading = soup.find("h2").text.strip()
pieces = [f"{heading}\n\n"]

# Each <h3> becomes a section: its title, then the <p> siblings that follow it
# up to (not including) the next <h3>
for h3 in soup.find_all("h3"):
    section_title = h3.text.strip()
    paragraphs = []

    sibling = h3.find_next_sibling()
    while sibling is not None and sibling.name != "h3":
        if sibling.name == "p":
            paragraphs.append(sibling.text.strip())
        sibling = sibling.find_next_sibling()

    pieces.append(f"{section_title}\n" + "\n".join(paragraphs) + "\n\n")

text_content = "".join(pieces)

# Write the assembled text out
file_path = "/home/jdalvi/jdalvi/anlp2/pittsburgh_concerts.txt"
with open(file_path, "w", encoding="utf-8") as file:
    file.write(text_content)

print(f"Text content saved to {file_path}")


Text content saved to /home/jdalvi/jdalvi/anlp2/pittsburgh_concerts.txt


In [40]:
import requests
from bs4 import BeautifulSoup

url = "https://example.com"  # Replace with actual URL
response = requests.get(url)

# Show only the start of the body: enough to see whether the content is
# present in the raw HTML
preview = response.text[:1000]
print(preview)


<!doctype html>
<html>
<head>
    <title>Example Domain</title>

    <meta charset="utf-8" />
    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <style type="text/css">
    body {
        background-color: #f0f0f2;
        margin: 0;
        padding: 0;
        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
        
    }
    div {
        width: 600px;
        margin: 5em auto;
        padding: 2em;
        background-color: #fdfdff;
        border-radius: 0.5em;
        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
    }
    a:link, a:visited {
        color: #38488f;
        text-decoration: none;
    }
    @media (max-width: 700px) {
        div {
            margin: 0 auto;
            width: auto;
        }
    }
    </style>    
</head>

<body>
<div>
    <h1>Example Domain</h1>
    <p>This domai

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

# Chrome launch flags; "--incognito" opens a fresh session
options = Options()
for flag in ("--no-sandbox", "--disable-dev-shm-usage", "--incognito"):
    options.add_argument(flag)

# Start Chrome using the driver binary resolved by webdriver_manager
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=options)


def scrape_events_for_month(driver, month_num, year=2025):
    """Scrape the downtownpittsburgh.com event listing for one month.

    Args:
        driver: Selenium WebDriver used to load the page and query elements.
        month_num: Month number placed in the URL's ``n`` query parameter.
        year: Year placed in the URL's ``y`` query parameter (default 2025).

    Returns:
        A list of dicts, one per ``eventitem`` element that contains a
        ``copyContent`` child, with the keys ``month``, ``categories``,
        ``title``, ``link``, ``event_date`` and ``description``. A field that
        cannot be extracted is stored as an empty string.
    """
    # Imported locally so the function does not depend on another cell having
    # imported these names first.
    import time
    from selenium.webdriver.common.by import By

    # Construct the URL for the given month
    url = f"https://downtownpittsburgh.com/events/?n={month_num}&y={year}&cat=0"
    print(f"Scraping URL: {url}")
    driver.get(url)

    # Fixed pause to let the page load. Adjust the sleep time if necessary.
    time.sleep(5)

    events = []
    # Find all event containers on the page
    event_items = driver.find_elements(By.CLASS_NAME, "eventitem")

    for event in event_items:
        # Items without a copyContent block carry no details; skip them
        try:
            copy = event.find_element(By.CLASS_NAME, "copyContent")
        except Exception:
            continue

        # Categories: every 'term' inside the 'category' div, comma-separated
        try:
            category_div = copy.find_element(By.CLASS_NAME, "category")
            term_elements = category_div.find_elements(By.CLASS_NAME, "term")
            categories = ", ".join([term.text.strip() for term in term_elements])
        except Exception:
            categories = ""

        # Title and link from the <h1><a> element
        try:
            h1 = copy.find_element(By.TAG_NAME, "h1")
            a_tag = h1.find_element(By.TAG_NAME, "a")
            title = a_tag.text.strip()
            # get_attribute may return None; treat that as "no link" so a
            # missing href does not discard the title extracted above.
            link = a_tag.get_attribute("href") or ""
            if link.startswith("/"):
                link = "https://downtownpittsburgh.com" + link
        except Exception:
            title = ""
            link = ""

        # Event date from the element with class 'eventdate'
        try:
            event_date = copy.find_element(By.CLASS_NAME, "eventdate").text.strip()
        except Exception:
            event_date = ""

        # Rough description: all visible text minus the parts extracted above
        # and the text of the 'button' element, when present
        try:
            full_text = copy.text
            try:
                read_more = copy.find_element(By.CLASS_NAME, "button").text.strip()
            except Exception:
                read_more = ""
            description = full_text.replace(categories, "").replace(title, "").replace(event_date, "").replace(read_more, "").strip()
        except Exception:
            description = ""

        events.append({
            "month": month_num,  # storing the month number (3 for March, etc.)
            "categories": categories,
            "title": title,
            "link": link,
            "event_date": event_date,
            "description": description
        })
    return events

def scrape_all_events():
    """Scrape events for months 3 through 12 using one Chrome session.

    Returns:
        A single list containing the event dicts returned by
        ``scrape_events_for_month`` for each month, in month order.
    """
    all_events = []
    # Create a single Chrome driver instance shared by all months
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

    try:
        # Loop through month numbers 3 (March) to 12 (December)
        for month in range(3, 13):
            month_events = scrape_events_for_month(driver, month)
            print(f"Found {len(month_events)} events for month {month}.")
            all_events.extend(month_events)
    finally:
        # Close the browser even when a month fails, so no Chrome process is left behind
        driver.quit()

    return all_events

if __name__ == '__main__':
    # Imported here so this entry point does not rely on another cell having
    # imported pandas first.
    import pandas as pd

    events = scrape_all_events()

    # Create a DataFrame and save the aggregated data to CSV
    df = pd.DataFrame(events)
    df.to_csv("downtownpittsburgh_events.csv", index=False)
    print("Data saved to downtownpittsburgh_events.csv")


In [43]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

# Setup Selenium WebDriver
options = Options()
options.add_argument("--headless")  # Run in headless mode (no browser window)
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Target URL
url = "https://www.visitpittsburgh.com/things-to-do/"

try:
    driver.get(url)

    # Fixed pause to let JavaScript render (adjust time if needed)
    time.sleep(5)

    # Snapshot the rendered page; everything below works on this parsed copy
    soup = BeautifulSoup(driver.page_source, "html.parser")
finally:
    # Release the browser even if loading the page fails
    driver.quit()

# Main heading: first <h1>, with a placeholder when absent
h1_element = soup.find("h1")
heading = h1_element.text.strip() if h1_element else "No heading found"

# Description: first <p>, with a placeholder when absent
p_element = soup.find("p")
description = p_element.text.strip() if p_element else "No description found"

# Every anchor on the page that has visible text and an absolute http(s) URL
links = []
for a in soup.find_all("a", href=True):
    link_text = a.text.strip()
    link_url = a["href"]
    if link_text and link_url.startswith("http"):  # Avoid empty and non-absolute links
        links.append({"Text": link_text, "URL": link_url})

# Save extracted content
file_path = "/home/jdalvi/jdalvi/anlp2/pittsburgh_sports_teams.txt"
with open(file_path, "w", encoding="utf-8") as file:
    file.write(f"Main Heading: {heading}\n\nDescription:\n{description}\n\nUseful Links:\n")
    for link in links:
        file.write(f"{link['Text']}: {link['URL']}\n")

print(f"✅ Scraped content saved to: {file_path}")


✅ Scraped content saved to: /home/jdalvi/jdalvi/anlp2/pittsburgh_sports_teams.txt


In [45]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

# Scrape the "Things to Do" landing page with headless Chrome and save the
# main heading, the first paragraph and every <h2> category to a text file.

# Setup Selenium WebDriver
options = Options()
options.add_argument("--headless")  # Run in background
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Target URL
url = "https://www.visitpittsburgh.com/things-to-do/"

# The browser is only needed until the rendered HTML has been captured;
# try/finally guarantees it is shut down even if the page load fails.
try:
    driver.get(url)

    # Wait for JavaScript to load (adjust if needed)
    time.sleep(5)

    # Get page source and parse with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, "html.parser")
finally:
    driver.quit()

# Extract main heading (look the tag up once instead of twice)
h1_tag = soup.find("h1")
heading = h1_tag.text.strip() if h1_tag else "No heading found"

# Extract description (first paragraph on the page)
first_p = soup.find("p")
description = first_p.text.strip() if first_p else "No description found"

# Extract subheadings (categories like 'Free Things to Do', 'Arts & Culture', etc.)
sections = []
for section in soup.find_all("h2"):  # You can also try h3 if needed
    sec_title = section.text.strip()

    # find_next("p") returns the first <p> that appears after the heading in
    # document order; it is not guaranteed to belong to the same section.
    related_p = section.find_next("p")
    section_desc = related_p.text.strip() if related_p else "No description available"

    sections.append({"Category": sec_title, "Details": section_desc})

# Save text content for better readability
text_path = "/home/jdalvi/jdalvi/anlp2/pittsburgh_steelers_info.txt"
with open(text_path, "w", encoding="utf-8") as file:
    file.write(f"Main Heading: {heading}\n\nDescription:\n{description}\n\nCategories:\n")
    for sec in sections:
        file.write(f"\n{sec['Category']}:\n{sec['Details']}\n")

# Only the text file is written. The CSV export was disabled, so the summary
# no longer references the undefined `csv_path` (NameError on a fresh kernel).
print(f"✅ Scraped content saved:\n📂 TXT: {text_path}")


✅ Scraped content saved:
📂 CSV: /mnt/data/pittsburgh_things_to_do.csv
📂 TXT: /home/jdalvi/jdalvi/anlp2/pittsburgh_steelers_info.txt


In [1]:
import requests
from bs4 import BeautifulSoup

# Fetch the "Free Things to Do" page and parse it.
url = 'https://www.visitpittsburgh.com/things-to-do/free-things-to-do/'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# The subheading lives in <p class="page-title__subheading">.
target_p = soup.select_one('p.page-title__subheading')

# Print its text, or a fallback message when the selector matches nothing.
print(target_p.get_text(strip=True) if target_p else "Content not found!")


Not us! Here are the top free things to do in Pittsburgh.


In [None]:
import requests
from bs4 import BeautifulSoup

# Page whose title we want to read
url = "https://www.visitpittsburgh.com/things-to-do/tours-sightseeing/"

# Download the page and build the parse tree
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

# The title is the <h1> nested inside <div class="page-title__content">
title_h1 = soup.select_one("div.page-title__content h1")

# Show the title text, or a fallback message when the selector matches nothing
print(title_h1.get_text(strip=True) if title_h1 else "Title not found!")


Tours & Sightseeing


In [None]:
import requests
from bs4 import BeautifulSoup

# Print three pieces of the "Free Things to Do" page: the main <h1>, one
# section <h2>, and the text of one content <div>.
#
# The page is downloaded and parsed once and reused for all three lookups.
url = 'https://www.visitpittsburgh.com/things-to-do/free-things-to-do/'
response = requests.get(url, timeout=30)  # timeout so a stalled server cannot hang the cell
soup = BeautifulSoup(response.content, 'html.parser')

# (CSS selector, separator passed to get_text). The two headings use the
# default empty separator; the <div> joins its text fragments with blank lines.
targets = [
    ('h1.page-title__heading', ''),
    ('#jlevents-festivals div.text__heading > h2', ''),
    ('#id-1300830 div.text__text', '\n\n'),
]

for selector, separator in targets:
    element = soup.select_one(selector)
    if element:
        print(element.get_text(separator=separator, strip=True))
    else:
        print("Content not found!")


Free Events & Festivals


In [6]:
import requests
from bs4 import BeautifulSoup

# Download and parse the page to extract from
url = 'https://www.visitpittsburgh.com/things-to-do/free-things-to-do/'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Human-readable label -> CSS selector of the element to extract
selectors = {
    "Main Heading (H1)": 'h1.page-title__heading',
    "Section Heading (H2)": '#jlevents-festivals div.text__heading > h2',
    "Downtown & Mt. Washington Section": '#id-1300830 div.text__text'
}

# Write one entry per selector: label, extracted text (or a fallback), rule line
with open('extracted_content.txt', 'w', encoding='utf-8') as file:
    for title, selector in selectors.items():
        element = soup.select_one(selector)
        content = (
            element.get_text(separator='\n\n', strip=True)
            if element
            else "Content not found!"
        )
        file.write(f"{title}:\n" + content + "\n\n" + "-"*80 + "\n\n")

print("Content extraction complete. Check 'extracted_content.txt'.")


Content extraction complete. Check 'extracted_content.txt'.


In [7]:
import requests
from bs4 import BeautifulSoup

# Download and parse the page
url = 'https://www.visitpittsburgh.com/things-to-do/free-things-to-do/'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Every <div> that carries an id attribute
divs_with_ids = soup.find_all('div', id=True)

# Dump each div's id and text to the output file, one rule-separated entry per div
with open('div_content.txt', 'w', encoding='utf-8') as file:
    for div in divs_with_ids:
        div_id = div.get('id')
        div_content = div.get_text(separator='\n\n', strip=True)

        file.writelines([
            f"DIV ID: {div_id}\n",
            f"Content:\n{div_content}\n",
            "\n" + "-"*80 + "\n\n",  # Separator for readability
        ])

print("Extraction complete! Check 'div_content.txt' for results.")


Extraction complete! Check 'div_content.txt' for results.


In [9]:
import requests
from bs4 import BeautifulSoup

# Download and parse the page
url = 'https://www.visitpittsburgh.com/things-to-do/free-things-to-do/'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Every <div> that carries an id attribute
divs_with_ids = soup.find_all('div', id=True)

# Write the text of each non-empty div, followed by a rule line
with open('/home/jdalvi/jdalvi/anlp2/visit_pittsburgh/free_things_to_do.txt', 'w', encoding='utf-8') as file:
    for div in divs_with_ids:
        div_content = div.get_text(separator='\n\n', strip=True)

        # Divs with no text contribute nothing to the file
        if not div_content:
            continue

        file.write(f"{div_content}\n")
        file.write("\n" + "-"*80 + "\n\n")  # Separator for readability

print("Extraction complete! Check 'free_things_to_do.txt' for results.")


Extraction complete! Check 'free_things_to_do.txt' for results.


# Family_fun.txt

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time

# Scrape the listing cards on the family-fun pages with headless Chrome and
# write each card's heading and summary text to family_fun.txt.

# Setup Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")

# Automatically manage ChromeDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

# Base URL with placeholder for page number
base_url = "https://www.visitpittsburgh.com/things-to-do/family-fun/?page={}"

# try/finally guarantees the headless browser is shut down even when a page
# load or a file write raises part-way through the scrape.
try:
    # Open a file to save the extracted content ("w" starts the file afresh)
    with open("family_fun.txt", "w", encoding="utf-8") as file:

        # range(2, 4) visits pages 2 and 3 only (the end bound is exclusive).
        # Pages 4 and 1 are appended to the same file by the cells that follow.
        for page in range(2, 4):
            url = base_url.format(page)
            print(f"\nScraping: {url}")
            file.write(f"\nScraping: {url}\n")

            # Open the webpage
            driver.get(url)

            # Wait for JavaScript to load content
            time.sleep(10)

            # Scroll down to trigger lazy loading
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(5)

            # Find all place names and their corresponding summaries
            headings = driver.find_elements(By.CSS_SELECTOR, "a.card__heading")
            summaries = driver.find_elements(By.CSS_SELECTOR, "div.card__summary")

            # Skip the page when either selector matched nothing
            if not headings:
                print(f"No place names found on page {page}. The website structure might have changed.")
                file.write(f"No place names found on page {page}.\n")
                continue

            if not summaries:
                print(f"No summaries found on page {page}. The website structure might have changed.")
                file.write(f"No summaries found on page {page}.\n")
                continue

            # zip() pairs headings and summaries by position and stops at the
            # shorter of the two lists.
            for heading, summary in zip(headings, summaries):
                place_name = heading.text.strip()
                summary_text = summary.text.strip()

                # Print and save output
                output = f"Place Name: {place_name}\nAddress: {summary_text}\n" + "-" * 80 + "\n"
                print(output)
                file.write(output)
finally:
    # Close the browser
    driver.quit()

print("\nData saved in 'family_fun.txt' 🚀")



Scraping: https://www.visitpittsburgh.com/things-to-do/family-fun/?page=2
Place Name: Historic Hanna's Town
Summary: 809 Forbes Trail Rd.
Greesburg, Pennsylvania 15601
(724) 836-1800
--------------------------------------------------------------------------------

Place Name: Mix Candle Co. SouthSide Works
Summary: 2746 Sidney St.
Pittsburgh, Pennsylvania 15203
(412) 525-1594
--------------------------------------------------------------------------------

Place Name: Flour Power Cooking Studios: Pittsburgh
Summary: 4655 William Flinn Highway, Ste. 114
Allison Park, Pennsylvania 15101
(412) 579-0372
--------------------------------------------------------------------------------

Place Name: Greene County Tourist Promotion Agency
Summary: 19 S. Washington St., Fort Jackson Bldg.
Waynesburg, Pennsylvania 15370
(724) 627-8687
--------------------------------------------------------------------------------

Place Name: Full Throttle Adrenaline Park
Summary: 360 Mall Circle Dr.
Monroevill

In [37]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time

# Scrape page 4 of the family-fun listing with headless Chrome and append
# each card's heading and summary text to family_fun.txt.

# Setup Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")

# Automatically manage ChromeDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

url = "https://www.visitpittsburgh.com/things-to-do/family-fun/?page=4"

# try/finally guarantees the headless browser is shut down even when the
# page load or the file write raises.
try:
    # Open the webpage
    driver.get(url)

    # Wait for JavaScript to load content
    time.sleep(10)  # Increased wait time

    # Scroll down to trigger lazy loading
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(5)  # Wait again for content to load

    # Find all place names and their corresponding summaries
    headings = driver.find_elements(By.CSS_SELECTOR, "a.card__heading")
    summaries = driver.find_elements(By.CSS_SELECTOR, "div.card__summary")

    # Report when either selector matched nothing; the loop below then writes nothing
    if not headings or not summaries:
        print("No data found. The website structure might have changed.")

    # Open the file in append mode and save the results.
    # The element text is read while the browser is still open.
    with open("family_fun.txt", "a", encoding="utf-8") as file:
        # zip() pairs by position and stops at the shorter list
        for heading, summary in zip(headings, summaries):
            place_name = heading.text.strip()
            summary_text = summary.text.strip()

            output = f"Place Name: {place_name}\nAddress: {summary_text}\n" + "-" * 80 + "\n"

            print(output)  # Print to console
            file.write(output)  # Append to file
finally:
    # Close the browser
    driver.quit()

print("\n✅ Data successfully appended to 'family_fun.txt'")


Place Name: Steelers Hall of Honor Museum
Address: 100 Art Rooney Ave., Gate B
Pittsburgh, Pennsylvania 15212
(412) 697-7150
--------------------------------------------------------------------------------

Place Name: Yogi Bear's Jellystone Park at Kozy Rest
Address: 449 Campground Rd.
Harrisville, Pennsylvania 16038
(724) 584-0629
--------------------------------------------------------------------------------

Place Name: Steeltown Event Park
Address: 2 Willow Ave
Oakdale, Pennsylvania 15071
(412) 443-9287
--------------------------------------------------------------------------------

Place Name: West Overton Village and Museum
Address: 109 W. Overton Rd., PO Box 3
Scottdale, Pennsylvania 15683
(724) 887-7910
--------------------------------------------------------------------------------

Place Name: Washington County Tourism Promotion Agency
Address: 1000 Horizon Vue Dr., Ste. 1C80
Canonsburg, Pennsylvania 15317
(724) 225-3010
----------------------------------------------------

In [36]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time

# Scrape page 1 of the family-fun listing with headless Chrome and append
# each card's heading and summary text to family_fun.txt.

# Setup Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")

# Automatically manage ChromeDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

url = "https://www.visitpittsburgh.com/things-to-do/family-fun/?page=1"

# try/finally guarantees the headless browser is shut down even when the
# page load or the file write raises.
try:
    # Open the webpage
    driver.get(url)

    # Wait for JavaScript to load content
    time.sleep(10)  # Increased wait time

    # Scroll down to trigger lazy loading
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(5)  # Wait again for content to load

    # Find all place names and their corresponding summaries
    headings = driver.find_elements(By.CSS_SELECTOR, "a.card__heading")
    summaries = driver.find_elements(By.CSS_SELECTOR, "div.card__summary")

    # Report when either selector matched nothing; the loop below then writes nothing
    if not headings or not summaries:
        print("No data found. The website structure might have changed.")

    # Open the file in append mode and save the results.
    # The element text is read while the browser is still open.
    with open("family_fun.txt", "a", encoding="utf-8") as file:
        # zip() pairs by position and stops at the shorter list
        for heading, summary in zip(headings, summaries):
            place_name = heading.text.strip()
            summary_text = summary.text.strip()

            output = f"Place Name: {place_name}\nAddress: {summary_text}\n" + "-" * 80 + "\n"

            print(output)  # Print to console
            file.write(output)  # Append to file
finally:
    # Close the browser
    driver.quit()

print("\n✅ Data successfully appended to 'family_fun.txt'")


Place Name: EQT Children's Theater Series
Address: 101 Sixth St.
Pittsburgh, Pennsylvania 15222
(412) 456-6666
--------------------------------------------------------------------------------

Place Name: Dave & Buster's Pittsburgh - North Hills
Address: 6260 Northway Dr.
Pittsburgh, Pennsylvania 15237
(412) 702-9550
--------------------------------------------------------------------------------

Place Name: The Andy Warhol Museum
Address: 117 Sandusky St.
Pittsburgh, Pennsylvania 15212
(412) 237-8300
--------------------------------------------------------------------------------

Place Name: Duquesne Incline
Address: 1197 W. Carson St.
Pittsburgh, Pennsylvania 15219
(412) 381-1665
--------------------------------------------------------------------------------

Place Name: Compass Inn Museum/Ligonier Valley Historical Society
Address: 1382 Route 30 East
Laughlington, Pennsylvania 15655
(724) 238-4983
--------------------------------------------------------------------------------

P

In [44]:
import requests
from bs4 import BeautifulSoup

# Family-fun landing page, fetched with a plain HTTP request
url = "https://www.visitpittsburgh.com/things-to-do/family-fun/"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

# Title = text of the <h1> inside <div class="page-title__content">,
# or a placeholder when the selector matches nothing
title_h1 = soup.select_one("div.page-title__content h1")
title_text = title_h1.get_text(strip=True) if title_h1 else "No Title Found"

# Load whatever family_fun.txt currently holds; a missing file counts as empty
try:
    with open("/home/jdalvi/jdalvi/anlp2/visit_pittsburgh/family_fun.txt", "r", encoding="utf-8") as file:
        existing_content = file.read()
except FileNotFoundError:
    existing_content = ""

# Rewrite the file: title banner first, then the previous content unchanged
with open("/home/jdalvi/jdalvi/anlp2/visit_pittsburgh/family_fun.txt", "w", encoding="utf-8") as file:
    file.write(f"Page Title: {title_text}\n{'=' * 80}\n" + existing_content)

print("\n✅ Title successfully added to 'family_fun.txt' 🚀")



✅ Title successfully added to 'family_fun.txt' 🚀


# Sightseeing.txt

In [40]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time

# Scrape the listing cards on the tours-sightseeing pages with headless
# Chrome and write each card's heading and summary text to sightseeing.txt.
from selenium.common.exceptions import NoSuchElementException

# Setup Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")

# Automatically manage ChromeDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

# Base URL with placeholder for page number
base_url = "https://www.visitpittsburgh.com/things-to-do/tours-sightseeing/?page={}"

# try/finally guarantees the headless browser is shut down even when a page
# load or a file write raises part-way through the scrape.
try:
    # Open a file to save the extracted content ("w" starts the file afresh)
    with open("sightseeing.txt", "w", encoding="utf-8") as file:
        for page in range(1, 3):  # Loop through pages 1, 2
            url = base_url.format(page)
            print(f"\nScraping: {url}")
            file.write(f"\nScraping: {url}\n")

            # Open the webpage
            driver.get(url)

            # Wait for JavaScript to load content
            time.sleep(10)

            # Extract the <h1> heading (Page Title). Only a missing element
            # falls back to the placeholder; a bare `except:` would also
            # swallow KeyboardInterrupt and unrelated errors.
            try:
                h1_element = driver.find_element(By.CSS_SELECTOR, "h1.page-title__heading")
                h1_text = h1_element.text.strip()
            except NoSuchElementException:
                h1_text = "No Title Found"

            # Write the title once, on the first page (it follows that page's "Scraping:" line)
            if page == 1:
                file.write(f"Page Title: {h1_text}\n{'=' * 80}\n")

            # Scroll down to trigger lazy loading
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(5)

            # Find all place names and their corresponding summaries
            headings = driver.find_elements(By.CSS_SELECTOR, "a.card__heading")
            summaries = driver.find_elements(By.CSS_SELECTOR, "div.card__summary")

            # Skip the page when either selector matched nothing
            if not headings:
                print(f"No place names found on page {page}. The website structure might have changed.")
                file.write(f"No place names found on page {page}.\n")
                continue

            if not summaries:
                print(f"No summaries found on page {page}. The website structure might have changed.")
                file.write(f"No summaries found on page {page}.\n")
                continue

            # zip() pairs by position and stops at the shorter list
            for heading, summary in zip(headings, summaries):
                place_name = heading.text.strip()
                summary_text = summary.text.strip()

                # Print and save output
                output = f"Place Name: {place_name}\nAddress: {summary_text}\n" + "-" * 80 + "\n"
                print(output)
                file.write(output)
finally:
    # Close the browser
    driver.quit()

print("\n✅ Data successfully saved in 'sightseeing.txt' 🚀")



Scraping: https://www.visitpittsburgh.com/things-to-do/tours-sightseeing/?page=1
Place Name: Duquesne Incline
Address: 1197 W. Carson St.
Pittsburgh, Pennsylvania 15219
(412) 381-1665
--------------------------------------------------------------------------------

Place Name: Libation Tours
Address: 2519 Penn Ave.
Pittsburgh, Pennsylvania 15222
(412) 427-2880
--------------------------------------------------------------------------------

Place Name: Greene County Historical Society
Address: 918 Rolling Meadows Rd.
Waynesburg, Pennsylvania 15370
(724) 627-3204
--------------------------------------------------------------------------------

Place Name: City Hall Tours
Address: 414 Grant St.
Pittsburgh, Pennsylvania 15219
--------------------------------------------------------------------------------

Place Name: DOORS OPEN Pittsburgh
Address: 
--------------------------------------------------------------------------------

Place Name: 'Burgh Bits and Bites Food Tour
Address: (412)

In [43]:
import requests
from bs4 import BeautifulSoup

# Tours & sightseeing landing page, fetched with a plain HTTP request
url = "https://www.visitpittsburgh.com/things-to-do/tours-sightseeing/"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

# Title = text of the <h1> inside <div class="page-title__content">,
# or a placeholder when the selector matches nothing
title_h1 = soup.select_one("div.page-title__content h1")
title_text = title_h1.get_text(strip=True) if title_h1 else "No Title Found"

# Load whatever sightseeing.txt currently holds; a missing file counts as empty
try:
    with open("sightseeing.txt", "r", encoding="utf-8") as file:
        existing_content = file.read()
except FileNotFoundError:
    existing_content = ""

# Rewrite the file: title banner first, then the previous content unchanged
with open("sightseeing.txt", "w", encoding="utf-8") as file:
    file.write(f"Page Title: {title_text}\n{'=' * 80}\n" + existing_content)

print("\n✅ Title successfully added to 'sightseeing.txt' 🚀")



✅ Title successfully added to 'sightseeing.txt' 🚀


# Outdoor_Adventure.txt

In [48]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time

# Scrape the listing cards on the outdoor-adventure pages with headless
# Chrome and write each card's heading and summary text to outdoor_adventure.txt.
from selenium.common.exceptions import NoSuchElementException

# Setup Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")

# Automatically manage ChromeDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

# Base URL with placeholder for page number
base_url = "https://www.visitpittsburgh.com/things-to-do/outdoor-adventure/?page={}"

# try/finally guarantees the headless browser is shut down even when a page
# load or a file write raises part-way through the scrape.
try:
    # Open a file to save the extracted content ("w" starts the file afresh)
    with open("outdoor_adventure.txt", "w", encoding="utf-8") as file:
        for page in range(1, 4):  # Loop through pages 1, 2, 3
            url = base_url.format(page)
            print(f"\nScraping: {url}")
            file.write(f"\nScraping: {url}\n")

            # Open the webpage
            driver.get(url)

            # Wait for JavaScript to load content
            time.sleep(10)

            # Extract the <h1> heading (Page Title). Only a missing element
            # falls back to the placeholder; a bare `except:` would also
            # swallow KeyboardInterrupt and unrelated errors.
            try:
                h1_element = driver.find_element(By.CSS_SELECTOR, "h1.page-title__heading")
                h1_text = h1_element.text.strip()
            except NoSuchElementException:
                h1_text = "No Title Found"

            # Write the title once, on the first page (it follows that page's "Scraping:" line)
            if page == 1:
                file.write(f"Page Title: {h1_text}\n{'=' * 80}\n")

            # Scroll down to trigger lazy loading
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(5)

            # Find all place names and their corresponding summaries
            headings = driver.find_elements(By.CSS_SELECTOR, "a.card__heading")
            summaries = driver.find_elements(By.CSS_SELECTOR, "div.card__summary")

            # Skip the page when either selector matched nothing
            if not headings:
                print(f"No place names found on page {page}. The website structure might have changed.")
                file.write(f"No place names found on page {page}.\n")
                continue

            if not summaries:
                print(f"No summaries found on page {page}. The website structure might have changed.")
                file.write(f"No summaries found on page {page}.\n")
                continue

            # zip() pairs by position and stops at the shorter list
            for heading, summary in zip(headings, summaries):
                place_name = heading.text.strip()
                summary_text = summary.text.strip()

                # Print and save output
                output = f"Place Name: {place_name}\nAddress: {summary_text}\n" + "-" * 80 + "\n"
                print(output)
                file.write(output)
finally:
    # Close the browser
    driver.quit()

print("\n✅ Data successfully saved in 'outdoor_adventure.txt' 🚀")



Scraping: https://www.visitpittsburgh.com/things-to-do/outdoor-adventure/?page=1
Place Name: Golf
Address: Pennsylvania is home to more than 500 public golf courses, with many of the best located in the Pittsburgh area. Enjoy championship play that is second to none.
--------------------------------------------------------------------------------

Place Name: Fishing, Kayaking & Boating
Address: The Pittsburgh region provides almost unlimited opportunities for fishing and boating fun. Pittsburgh's rivers are ideal for sculling, kayaking, canoeing and powerboating.
--------------------------------------------------------------------------------

Place Name: Allegheny Land Trust/Dead Man's Hollow
Address: 416 Thorn St.
Sewickley, Pennsylvania 15143
(412) 741-2750
--------------------------------------------------------------------------------

Place Name: GO Laurel Highlands
Address: 113 East Main St.
Ligonier, Pennsylvania 15658
(724) 238-5661
------------------------------------------

In [51]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time
import requests
from bs4 import BeautifulSoup

# Build outdoor_adventure.txt in two stages:
#   1. scrape the listing cards on outdoor-adventure pages 1-3 with headless Chrome;
#   2. append the text of five topic sub-pages fetched with requests/BeautifulSoup.
from selenium.common.exceptions import NoSuchElementException

# Setup Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")

# Automatically manage ChromeDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

# Base URL for outdoor adventure pages
base_url = "https://www.visitpittsburgh.com/things-to-do/outdoor-adventure/?page={}"

# File path for saving data
file_path = "outdoor_adventure.txt"


def append_subpage_content(page_url, section_title, out_path):
    """Append the text of every id-bearing <div> on a page to a text file.

    Args:
        page_url: URL of the page to download.
        section_title: Text written after "## " as the section header.
        out_path: Path of the file to append to.

    Divs whose text is empty are skipped; each written div is followed by a
    line of 80 dashes.
    """
    # timeout so a stalled server cannot hang the cell indefinitely
    response = requests.get(page_url, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")

    with open(out_path, "a", encoding="utf-8") as file:
        file.write(f"\n\n## {section_title}\n" + "=" * 80 + "\n")

        for div in soup.find_all("div", id=True):
            div_content = div.get_text(separator="\n\n", strip=True)

            # Only write to file if content is not empty
            if div_content:
                file.write(f"{div_content}\n")
                file.write("\n" + "-" * 80 + "\n\n")  # Separator for readability


# **Step 1: Scrape the paginated listing with Selenium**
# try/finally guarantees the headless browser is shut down even when a page
# load or a file write raises part-way through the scrape.
try:
    # Open a file to save the extracted content ("w" starts the file afresh)
    with open(file_path, "w", encoding="utf-8") as file:
        for page in range(1, 4):  # Loop through pages 1, 2, 3
            url = base_url.format(page)
            print(f"\nScraping: {url}")
            file.write(f"\nScraping: {url}\n")

            # Open the webpage
            driver.get(url)

            # Wait for JavaScript to load content
            time.sleep(10)

            # Extract the <h1> heading (Page Title). Only a missing element
            # falls back to the placeholder; a bare `except:` would also
            # swallow KeyboardInterrupt and unrelated errors.
            try:
                h1_element = driver.find_element(By.CSS_SELECTOR, "h1.page-title__heading")
                h1_text = h1_element.text.strip()
            except NoSuchElementException:
                h1_text = "No Title Found"

            # Write the title once, on the first page (it follows that page's "Scraping:" line)
            if page == 1:
                file.write(f"Page Title: {h1_text}\n{'=' * 80}\n")

            # Scroll down to trigger lazy loading
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(5)

            # Find all place names and their corresponding summaries
            headings = driver.find_elements(By.CSS_SELECTOR, "a.card__heading")
            summaries = driver.find_elements(By.CSS_SELECTOR, "div.card__summary")

            # Skip the page when either selector matched nothing
            if not headings:
                print(f"No place names found on page {page}. The website structure might have changed.")
                file.write(f"No place names found on page {page}.\n")
                continue

            if not summaries:
                print(f"No summaries found on page {page}. The website structure might have changed.")
                file.write(f"No summaries found on page {page}.\n")
                continue

            # zip() pairs by position and stops at the shorter list
            for heading, summary in zip(headings, summaries):
                place_name = heading.text.strip()
                summary_text = summary.text.strip()

                # Print and save output
                output = f"Place Name: {place_name}\nAddress: {summary_text}\n" + "-" * 80 + "\n"
                print(output)
                file.write(output)
finally:
    # Close the Selenium browser
    driver.quit()

print("\n✅ Outdoor Adventure data saved! Now extracting Golf page content...")

# **Steps 2-6: Append the topic sub-pages using BeautifulSoup**
# (sub-page URL, section header written to the file), in output order.
subpages = [
    ("https://www.visitpittsburgh.com/things-to-do/outdoor-adventure/golf/",
     "Golf Page Content"),
    ("https://www.visitpittsburgh.com/things-to-do/outdoor-adventure/fishing-boating/",
     "Fishing and Kayaking and Boating Page Content"),
    ("https://www.visitpittsburgh.com/things-to-do/outdoor-adventure/trails/",
     "Trails Page Content"),
    ("https://www.visitpittsburgh.com/things-to-do/outdoor-adventure/parks/",
     "Parks Page Content"),
    ("https://www.visitpittsburgh.com/things-to-do/outdoor-adventure/biking-in-pittsburgh/",
     "Biking Page Content"),
]

for subpage_url, section_title in subpages:
    append_subpage_content(subpage_url, section_title, file_path)
    print("\n✅ All data successfully saved in 'outdoor_adventure.txt' 🚀")





Scraping: https://www.visitpittsburgh.com/things-to-do/outdoor-adventure/?page=1
Place Name: Golf
Address: Pennsylvania is home to more than 500 public golf courses, with many of the best located in the Pittsburgh area. Enjoy championship play that is second to none.
--------------------------------------------------------------------------------

Place Name: Fishing, Kayaking & Boating
Address: The Pittsburgh region provides almost unlimited opportunities for fishing and boating fun. Pittsburgh's rivers are ideal for sculling, kayaking, canoeing and powerboating.
--------------------------------------------------------------------------------

Place Name: Amped Airsoft Arena
Address: 2250 Noblestown Rd.
Pittsburgh, Pennsylvania 15205
(412) 712-9066
--------------------------------------------------------------------------------

Place Name: Bike Pittsburgh
Address: 188 43rd St., #1
Pittsburgh, Pennsylvania 15201
(412) 325-4334
---------------------------------------------------------

In [52]:
import requests
from bs4 import BeautifulSoup

# Prepend the landing page's <h1> text as a title header to
# 'outdoor_adventure.txt', keeping whatever the file already contains.

# URL of the page
url = "https://www.visitpittsburgh.com/things-to-do/outdoor-adventure/"

# Send an HTTP request to fetch the page content
response = requests.get(url, timeout=30)  # a stalled connection must not hang the kernel
response.raise_for_status()  # do not derive a title from an HTTP error page

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, "html.parser")

# Extracting the <h1> inside the <div class="page-title__content">
title_h1 = soup.select_one("div.page-title__content h1")
title_text = title_h1.get_text(strip=True) if title_h1 else "No Title Found"

# Header exactly as it is written at the top of the file
title_header = f"Page Title: {title_text}\n{'=' * 80}\n"

# Read the existing content of outdoor_adventure.txt
try:
    with open("outdoor_adventure.txt", "r", encoding="utf-8") as file:
        existing_content = file.read()
except FileNotFoundError:
    existing_content = ""  # If the file does not exist, start fresh

if existing_content.startswith(title_header):
    # Re-running this cell must not stack a second copy of the header.
    print("\nTitle already present in 'outdoor_adventure.txt'; file left unchanged.")
else:
    # Write the extracted title at the beginning & append the previous content
    with open("outdoor_adventure.txt", "w", encoding="utf-8") as file:
        file.write(title_header)
        file.write(existing_content)

    print("\n✅ Title successfully added to 'outdoor_adventure.txt' 🚀")


✅ Title successfully added to 'outdoor_adventure.txt' 🚀


In [1]:

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time
import requests
from bs4 import BeautifulSoup

# Scrape the paginated restaurants & culinary listing with headless Chrome and
# save each card's heading and summary text to `file_path`.

# Setup Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")

# Automatically manage ChromeDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

# Base URL for the restaurants & culinary listing pages
base_url = "https://www.visitpittsburgh.com/restaurants-culinary/?page={}"

# File path for saving data
file_path = "restaurant_culinary.txt"

# The try/finally guarantees the browser process is shut down even when a page
# load or a file write raises part-way through the run.
try:
    # Open a file to save the extracted content
    with open(file_path, "w", encoding="utf-8") as file:
        for page in range(1, 9):  # Loop through pages 1 to 8
            url = base_url.format(page)
            print(f"\nScraping: {url}")
            file.write(f"\nScraping: {url}\n")

            # Open the webpage
            driver.get(url)

            # Wait for JavaScript to load content
            time.sleep(10)

            # Save the page heading once, right after the first "Scraping:" line
            if page == 1:
                try:
                    h1_element = driver.find_element(By.CSS_SELECTOR, "h1.page-title__heading")
                    h1_text = h1_element.text.strip()
                except NoSuchElementException:
                    # Only a missing heading is tolerated; other errors
                    # (including a keyboard interrupt) must propagate.
                    h1_text = "No Title Found"
                file.write(f"Page Title: {h1_text}\n{'=' * 80}\n")

            # Scroll down to trigger lazy loading
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(5)

            # Find all place names and their corresponding summaries
            headings = driver.find_elements(By.CSS_SELECTOR, "a.card__heading")
            summaries = driver.find_elements(By.CSS_SELECTOR, "div.card__summary")

            if not headings:
                print(f"No place names found on page {page}. The website structure might have changed.")
                file.write(f"No place names found on page {page}.\n")
                continue

            if not summaries:
                print(f"No summaries found on page {page}. The website structure might have changed.")
                file.write(f"No summaries found on page {page}.\n")
                continue

            # zip() pairs the two lists by position and stops at the shorter
            # one, so a count mismatch would otherwise go unnoticed.
            if len(headings) != len(summaries):
                print(
                    f"Warning: {len(headings)} place names but {len(summaries)} summaries "
                    f"on page {page}; entries may be misaligned or dropped."
                )

            for heading, summary in zip(headings, summaries):
                place_name = heading.text.strip()
                summary_text = summary.text.strip()

                # Print and save output
                output = f"Place Name: {place_name}\nAddress: {summary_text}\n" + "-" * 80 + "\n"
                print(output)
                file.write(output)
finally:
    # Close the Selenium browser
    driver.quit()



Scraping: https://www.visitpittsburgh.com/restaurants-culinary/?page=1
Place Name: Aruba Island Bowls Cafe
Address: 1601 Penn Ave.
Pittsburgh, Pennsylvania 15222
(412) 440-1601
--------------------------------------------------------------------------------

Place Name: bellfarm Kitchen | Bar
Address: 1111 Airport Blvd., PO Box 12420
Hyatt Regency Pittsburgh International Airport
Pittsburgh, Pennsylvania 15231
(724) 899-6050
--------------------------------------------------------------------------------

Place Name: Bakersfield - Tacos.Tequila.Whiskey.
Address: 940 Penn Ave.
Pittsburgh, Pennsylvania 15222
(412) 586-5024
--------------------------------------------------------------------------------

Place Name: The Bigelow Grille
Address: One Bigelow Square
Pittsburgh, Pennsylvania 15219
(412) 281-5013
--------------------------------------------------------------------------------

Place Name: Aqueous
Address: 1001 Lafayette St., Nemacolin
Farmington, Pennsylvania 15437
(866) 406-6

In [3]:


from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time
import requests
from bs4 import BeautifulSoup

# Scrape the paginated bars & nightlife listing with headless Chrome and save
# each card's heading and summary text to `file_path`.

# Setup Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")

# Automatically manage ChromeDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

# Base URL for the bars & nightlife listing pages
base_url = "https://www.visitpittsburgh.com/restaurants-culinary/bars-nightlife/?page={}"

# File path for saving data
file_path = "bars_nightlife.txt"

# The try/finally guarantees the browser process is shut down even when a page
# load or a file write raises part-way through the run.
try:
    # Open a file to save the extracted content
    with open(file_path, "w", encoding="utf-8") as file:
        for page in range(1, 4):  # Loop through pages 1, 2, 3
            url = base_url.format(page)
            print(f"\nScraping: {url}")
            file.write(f"\nScraping: {url}\n")

            # Open the webpage
            driver.get(url)

            # Wait for JavaScript to load content
            time.sleep(10)

            # Save the page heading once, right after the first "Scraping:" line
            if page == 1:
                try:
                    h1_element = driver.find_element(By.CSS_SELECTOR, "h1.page-title__heading")
                    h1_text = h1_element.text.strip()
                except NoSuchElementException:
                    # Only a missing heading is tolerated; other errors
                    # (including a keyboard interrupt) must propagate.
                    h1_text = "No Title Found"
                file.write(f"Page Title: {h1_text}\n{'=' * 80}\n")

            # Scroll down to trigger lazy loading
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(5)

            # Find all place names and their corresponding summaries
            headings = driver.find_elements(By.CSS_SELECTOR, "a.card__heading")
            summaries = driver.find_elements(By.CSS_SELECTOR, "div.card__summary")

            if not headings:
                print(f"No place names found on page {page}. The website structure might have changed.")
                file.write(f"No place names found on page {page}.\n")
                continue

            if not summaries:
                print(f"No summaries found on page {page}. The website structure might have changed.")
                file.write(f"No summaries found on page {page}.\n")
                continue

            # zip() pairs the two lists by position and stops at the shorter
            # one, so a count mismatch would otherwise go unnoticed.
            if len(headings) != len(summaries):
                print(
                    f"Warning: {len(headings)} place names but {len(summaries)} summaries "
                    f"on page {page}; entries may be misaligned or dropped."
                )

            for heading, summary in zip(headings, summaries):
                place_name = heading.text.strip()
                summary_text = summary.text.strip()

                # Print and save output
                output = f"Place Name: {place_name}\nAddress: {summary_text}\n" + "-" * 80 + "\n"
                print(output)
                file.write(output)
finally:
    # Close the Selenium browser
    driver.quit()



Scraping: https://www.visitpittsburgh.com/restaurants-culinary/bars-nightlife/?page=1
Place Name: Crafted North
Address: Pittsburgh Marriott City Center, 112 Washington Place
Pittsburgh, Pennsylvania 15219
(412) 471-4000
--------------------------------------------------------------------------------

Place Name: BURN by Rocky Patel
Address: 346 North Shore Dr.
Pittsburgh, Pennsylvania 15212
(412) 586-5854
--------------------------------------------------------------------------------

Place Name: Carmella's Plates & Pints
Address: 1908 E. Carson St.
Pittsburgh, Pennsylvania 15203
(412) 918-1215
--------------------------------------------------------------------------------

Place Name: Barcolena Wine Bar
Address: 922 Penn Ave.
Pittsburgh, Pennsylvania 15222
(412) 525-7722
--------------------------------------------------------------------------------

Place Name: Ember & Vine Woodfire Oven and Social Bar
Address: DoubleTree by Hilton Pittsburgh Cranberry, 910 Sheraton Dr.
Cranberr

In [4]:
import requests
from bs4 import BeautifulSoup

# Scrape three restaurant & culinary category pages and save, for each page,
# its <h1> title followed by the text of every <div> that has an `id`.

# List of restaurant & culinary category URLs
culinary_urls = [
    "https://www.visitpittsburgh.com/restaurants-culinary/craft-breweries/",
    "https://www.visitpittsburgh.com/restaurants-culinary/wineries-distilleries/",
    "https://www.visitpittsburgh.com/restaurants-culinary/farms-farmers-markets/"]
architecture_urls = culinary_urls  # name this cell originally used; kept bound for compatibility

# File path to save extracted data (spelling kept as-is so the output file name does not change)
file_path = "restaurants_culnary_2.txt"

# Open file in write mode to clear old content
with open(file_path, "w", encoding="utf-8") as file:
    # The header previously read "Pittsburgh Sports Overview", which mislabels this data.
    file.write("Pittsburgh Restaurants & Culinary Overview\n" + "=" * 80 + "\n")

failed_urls = []

# Loop through each category URL
for url in culinary_urls:
    print(f"\nScraping: {url}")

    # Fetch webpage content; skip the page rather than saving an error page
    # or aborting the remaining URLs.
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as exc:
        print(f"Skipping {url}: {exc}")
        failed_urls.append(url)
        continue

    soup = BeautifulSoup(response.content, "html.parser")

    # Extract <h1> Title
    title_h1 = soup.select_one("div.page-title__content h1")
    page_title = title_h1.get_text(strip=True) if title_h1 else "No Title Found"

    # Extract all <div> elements with an ID
    divs_with_ids = soup.find_all("div", id=True)

    # Append results to the output file
    with open(file_path, "a", encoding="utf-8") as file:
        file.write(f"\n## {page_title}\n" + "=" * 80 + "\n")

        for div in divs_with_ids:
            div_content = div.get_text(separator="\n\n", strip=True)

            # Only write to file if content is not empty
            if div_content:
                file.write(f"{div_content}\n")
                file.write("\n" + "-" * 80 + "\n\n")  # Separator for readability

if failed_urls:
    print(f"\nFinished with {len(failed_urls)} page(s) skipped: {failed_urls}")
else:
    print("\n✅ All data successfully saved in 'restaurants_culnary_2.txt' 🚀")




Scraping: https://www.visitpittsburgh.com/restaurants-culinary/craft-breweries/

Scraping: https://www.visitpittsburgh.com/restaurants-culinary/wineries-distilleries/

Scraping: https://www.visitpittsburgh.com/restaurants-culinary/farms-farmers-markets/

✅ All data successfully saved in 'restaurants_culnary_2.txt' 🚀


In [None]:



from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time
import requests
from bs4 import BeautifulSoup

# Scrape the single neighborhoods page with headless Chrome and save each
# card's heading and summary text to `file_path`.

# Setup Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")

# Automatically manage ChromeDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

# URL of the neighborhoods page (it has no "?page=" placeholder, so there is nothing to paginate)
base_url = "https://www.visitpittsburgh.com/neighborhoods/"

# File path for saving data.
# This was "bars_nightlife.txt", which is opened in "w" mode below and would
# therefore wipe the bars & nightlife data saved by the earlier cell.
file_path = "neighborhoods.txt"

# The try/finally guarantees the browser process is shut down even when the
# page load or a file write raises part-way through.
try:
    # Open a file to save the extracted content
    with open(file_path, "w", encoding="utf-8") as file:
        url = base_url
        print(f"\nScraping: {url}")
        file.write(f"\nScraping: {url}\n")

        # Open the webpage
        driver.get(url)

        # Wait for JavaScript to load content
        time.sleep(10)

        # Extract the <h1> heading (Page Title)
        try:
            h1_element = driver.find_element(By.CSS_SELECTOR, "h1.page-title__heading")
            h1_text = h1_element.text.strip()
        except NoSuchElementException:
            # Only a missing heading is tolerated; other errors
            # (including a keyboard interrupt) must propagate.
            h1_text = "No Title Found"

        # Save the heading. The old `range(1)` loop only produced page 0, so
        # its `if page == 1` guard never let this line run.
        file.write(f"Page Title: {h1_text}\n{'=' * 80}\n")

        # Scroll down to trigger lazy loading
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(5)

        # Find all place names and their corresponding summaries
        headings = driver.find_elements(By.CSS_SELECTOR, "a.card__heading")
        summaries = driver.find_elements(By.CSS_SELECTOR, "div.card__summary")

        if not headings:
            print("No place names found. The website structure might have changed.")
            file.write("No place names found.\n")
        elif not summaries:
            print("No summaries found. The website structure might have changed.")
            file.write("No summaries found.\n")
        else:
            # zip() pairs the two lists by position and stops at the shorter
            # one, so a count mismatch would otherwise go unnoticed.
            if len(headings) != len(summaries):
                print(
                    f"Warning: {len(headings)} place names but {len(summaries)} summaries; "
                    "entries may be misaligned or dropped."
                )

            for heading, summary in zip(headings, summaries):
                place_name = heading.text.strip()
                summary_text = summary.text.strip()

                # Print and save output
                output = f"Place Name: {place_name}\nAddress: {summary_text}\n" + "-" * 80 + "\n"
                print(output)
                file.write(output)
finally:
    # Close the Selenium browser
    driver.quit()



Scraping: https://www.visitpittsburgh.com/neighborhoods/
