In [13]:
#Installing Selenium and chrome driver
!pip install selenium
!apt-get update
!apt-get install -y chromium-browser
!apt install chromium-chromedriver

#Setting up a Chrome browser that can be used to automate web interactions, such as clicking buttons, filling forms, and scraping data from websites.
def web_driver():
    options = webdriver.ChromeOptions()
    options.add_argument("--verbose")
    options.add_argument('--no-sandbox')
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument("--window-size=1920, 1200")
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=options)
    return driver

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, ElementClickInterceptedException, NoSuchElementException
import time
import csv

!ls /usr/lib/chromium-browser/chromedriver

Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:2 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Hit:10 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Get:11 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [2,956 kB]
Get:12 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,533 kB]
Fetched 4,756 kB in 3s (1,809 kB/s)
Reading package lists... Done
W: Skipping

In [38]:
# The main code for scraping and aquiring the listings from www.forsalebyowner.com
driver = web_driver()

#url
driver.get('https://www.affordablehousing.com/philadelphia-pa/')

# Wait for the page to load
time.sleep(5)

scraped_results = []

# Function to close modals (if any)
def close_modals():
    try:
        modal_close_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CLASS_NAME, "modal--cls--btn fas fa-times closemodal")))
        modal_close_button.click()
        print("Modal closed")
    except TimeoutException:
        print("No modal or close button found")

# Function to click the "Next" button
def click_view_more():
    try:
        view_more_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, '//a[@data-bind="click: function () { GoToNext(); }"]')))  #or //*[text()="Next"]
        view_more_button.click()
        print("View More Listings button clicked")
        return True
    except ElementClickInterceptedException:
        print("Element click intercepted, trying to wait for modal to disappear")
        try:
            WebDriverWait(driver, 10).until(
                EC.invisibility_of_element_located((By.CLASS_NAME, "modal--cls--btn fas fa-times closemodal")))
            view_more_button.click()
            print("View More Listings button clicked after modal disappeared")
            return True
        except TimeoutException:
            print("Failed to wait for modal to disappear")
            return False
    except TimeoutException:
        print("View More Listings button not clickable")
        return False

#Function to collect the data listing from website
def extract_listings():
    listings = []
    try:
        body = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//*[@id="divtnResultPage"]')))

        print("Body element found.")

        property_elements = body.find_elements(By.XPATH, '//div[contains(@class, "tnresult--card")]') #//*[@id="divtnResultList"]/div[4]/div[2]/div[3]/div[2]

        print(f"Found {len(property_elements)} property elements.")

        if len(property_elements) == 0:
            print("No property elements found. Please verify the XPath.")

        for idx, property_element in enumerate(property_elements):
            try:
                # Extract property title and location
                #title = property_element.find_element(By.XPATH, './/a[contains(@class, "block text-xl font-bold")]').text
                location = property_element.find_element(By.XPATH, './/div[contains(@class, "tnresult--price")]').text
                # Extract price
                price = property_element.find_element(By.XPATH, './/div[contains(@class, "tnresult--propertyaddress")]').text
                # Extract property details (e.g., Beds, Baths, Acres)
                #details = property_element.find_element(By.XPATH, '//*[@id="divtnResultList"]/div[4]/div[2]/div[3]/div[3]/div[2]/div/div/div[2]').text
                # Extract seller information (optional)
                #property_type = property_element.find_element(By.XPATH, '//*[@id="0"]/div[2]/ah-property-type/span').text

                # Add the data to the listings list
                listings.append({
                   # "Title": title,
                    "Location": location,
                    "Price": price,
                    #"Details": details,
                    #"Property Type": property_type,
                })
            except NoSuchElementException as e:
                # If an element is missing, let's print the HTML of the property element to debug
                print(f"Error for property {idx + 1}: {e}")
                #print("Property HTML: ", property_element.get_attribute('outerHTML'))
    except Exception as e:
        print(f"Error extracting listings: {e}")
    return listings

# Function to save the data to a CSV file
def save_to_csv(listings, filename="Housing.csv"):
    keys = listings[0].keys()
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=keys)
        writer.writeheader()
        for listing in listings:
            writer.writerow(listing)
    print(f"Data saved to {filename}")


# Main function to scrape all listings (including "View More Listings")
def scrape_all_listings(max_lookups=1500, target_listings=2500):
    all_listings = []
    last_listing_count = 0
    lookup_count = 0

    while True:
        if lookup_count >= max_lookups:
            print("Reached maximum lookup count. Stopping.")
            break

        listings = extract_listings()

        # Check if there are any new listings
        if len(listings) == 0:
            print("No new listings found.")
            break

        # Append unique listings only (check for duplicates based on a unique key, e.g., title + location)
        for listing in listings:
            if listing not in all_listings:
                all_listings.append(listing)

        print(f"Total listings found so far: {len(all_listings)}")

        # Stop if we have reached the target number of listings
        if len(all_listings) >= target_listings:
            print(f"Reached the target number of {target_listings} listings. Stopping.")
            break

        # Check if the number of listings has increased, if not, break the loop
        if len(all_listings) == last_listing_count:
            print("No new listings found. Ending the scraping process.")
            break

        last_listing_count = len(all_listings)

        # Try to load more listings
        more_loaded = click_view_more()
        if not more_loaded:
            print("No more listings to load.")
            break

        lookup_count += 1
        time.sleep(10)  # Add a delay to reduce the load on the website

    return all_listings


# Run the scraping process
try:
    all_listings = scrape_all_listings()
    print(f"Scraping complete. Found {len(all_listings)} listings.")

    if all_listings:
        save_to_csv(all_listings)

    for listing in all_listings:
        print(listing)

except Exception as e:
    print(f"An error occurred: {e}")

finally:
    driver.quit()

Body element found.
Found 64 property elements.
Total listings found so far: 31
View More Listings button not clickable
No more listings to load.
Scraping complete. Found 31 listings.
Data saved to Housing.csv
{'Location': '$1,780', 'Price': '1033 S 55th St, 1st Flr, Philadelphia, PA 19143'}
{'Location': '$1,780', 'Price': '1122 Marlyn Rd, Philadelphia, PA 19151'}
{'Location': '$1,300', 'Price': '4509 Laird St, Philadelphia, PA 19139'}
{'Location': '$945', 'Price': '5813 Willows Ave, 3-R, Philadelphia, PA 19143'}
{'Location': '$1,675', 'Price': '5816 Pemberton St, Philadelphia, PA 19143'}
{'Location': '$1,500', 'Price': '5844 Pentridge St, Philadelphia, PA 19143'}
{'Location': '$1,700', 'Price': '6101 Tackawanna St, 2, Philadelphia, PA 19135'}
{'Location': '$1,800', 'Price': '852 N 41st St, 1, Philadelphia, PA 19104'}
{'Location': '$1,750', 'Price': '6230 Delancey St, 0, Philadelphia, PA 19143'}
{'Location': '$1,600', 'Price': '1226 W Oakdale St, Philadelphia, PA 19133'}
{'Location': '