<a href="https://colab.research.google.com/github/tobitprince/New-folder--4-/blob/main/property24_apartments_flats_to_rent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the scraper's output CSV path
# (Property24Scraper.filename) lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
pip install tenacity



In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import time
from urllib.parse import urljoin
import signal
import sys
import os

class Property24Scraper:
    """Scrape apartment/flat rental listings for Nairobi from property24.co.ke.

    Result pages are requested one at a time as ``<base_url>?Page=N``. Every
    listing tile on a page is flattened into a dict keyed by ``FIELDNAMES``
    and accumulated in ``self.properties``; the rows are written to
    ``self.filename`` every 10 pages, on SIGINT/SIGTERM, and when the caller
    requests a final save.
    """

    # CSV column order; also the exact key set of every property dict.
    FIELDNAMES = ('price', 'title', 'location', 'address', 'description',
                  'bedrooms', 'bathrooms', 'parking', 'size', 'url', 'image_url')

    def __init__(self, max_pages=8200):
        """Configure the scraper and install interrupt handlers.

        Args:
            max_pages: Highest page number to request; scraping stops earlier
                if a page contains no listing tiles.
        """
        self.base_url = "https://www.property24.co.ke/apartments-flats-to-rent-in-nairobi-c1890"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        self.properties = []
        self.max_pages = max_pages
        self.filename = '/content/drive/MyDrive/property24-apartments-flats-to-rent-in-nairobi.csv'
        # Maps filename -> number of leading rows of self.properties that
        # intermediate saves have already appended to that file.
        self._flushed = {}

        # Register signal handlers
        signal.signal(signal.SIGINT, self.signal_handler)
        signal.signal(signal.SIGTERM, self.signal_handler)

    def signal_handler(self, sig, frame):
        """Handle SIGINT/SIGTERM: write a final save, then exit with status 0."""
        print('\nInterrupt received! Saving current progress...')
        self.save_to_csv()
        sys.exit(0)

    def get_page_content(self, url):
        """Fetch *url* and parse it.

        Returns:
            A ``BeautifulSoup`` document, or ``None`` if the request failed
            (network error, timeout, or non-2xx status). On failure the error
            is printed and the call sleeps 5 seconds before returning.
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()
            return BeautifulSoup(response.content, 'html.parser')
        except requests.RequestException as e:
            print(f"Error fetching page {url}: {e}")
            time.sleep(5)
            return None

    @staticmethod
    def _inner_span_text(tag):
        """Return the stripped text of the first ``<span>`` inside *tag*, or 'N/A'."""
        inner = tag.find('span')
        return inner.text.strip() if inner else 'N/A'

    def extract_property_details(self, listing):
        """Flatten one listing tile into a dict keyed by ``FIELDNAMES``.

        Args:
            listing: Parsed element for a single listing tile.

        Returns:
            A dict with every ``FIELDNAMES`` key; any field whose element is
            missing keeps the placeholder ``'N/A'``. Extraction is
            best-effort: an unexpected error is printed and the fields
            gathered so far are returned.
        """
        property_data = dict.fromkeys(self.FIELDNAMES, 'N/A')

        # Fields whose value is the text of a <span> with a given CSS class.
        text_fields = (
            ('price', 'p24_price'),
            ('title', 'p24_propertyTitle'),
            ('location', 'p24_location'),
            ('address', 'p24_address'),
            ('description', 'p24_excerpt'),
        )
        # Feature icons identified by their title attribute; the value sits
        # in a nested <span>.
        icon_fields = (
            ('bedrooms', 'Bedrooms'),
            ('bathrooms', 'Bathrooms'),
            ('parking', 'Parking Spaces'),
        )

        try:
            for key, css_class in text_fields:
                if node := listing.find('span', class_=css_class):
                    property_data[key] = node.text.strip()

            # Features
            if features := listing.find('span', class_='p24_icons'):
                for key, title in icon_fields:
                    if node := features.find('span', title=title):
                        property_data[key] = self._inner_span_text(node)

                if size := features.find('span', class_='p24_size'):
                    property_data['size'] = self._inner_span_text(size)

            # Listing URL (first anchor in the tile, resolved against base_url)
            if link := listing.find('a', href=True):
                property_data['url'] = urljoin(self.base_url, link['href'])

            # Image URL (first <img> that has a src attribute)
            if image := listing.find('img', src=True):
                property_data['image_url'] = image['src']

        except Exception as e:
            print(f"Error extracting property details: {e}")

        return property_data

    def scrape_properties(self):
        """Walk pages 1..max_pages, collecting listings into ``self.properties``.

        A page that fails to download is skipped. The loop ends early at the
        first page with no ``p24_regularTile`` elements. Every 10th page
        triggers an intermediate save, and a 2-second pause follows each
        successfully processed page.
        """
        for page in range(1, self.max_pages + 1):
            url = f"{self.base_url}?Page={page}"
            print(f"Scraping page {page}/{self.max_pages}: {url}")

            soup = self.get_page_content(url)
            if soup is None:
                print(f"Failed to get content for page {page}")
                continue

            listings = soup.find_all('div', class_='p24_regularTile')
            if not listings:
                print("No more listings found")
                break

            for listing in listings:
                self.properties.append(self.extract_property_details(listing))

            # Periodic saving
            if page % 10 == 0:
                self.save_to_csv(intermediate=True)

            time.sleep(2)  # Rate limiting

    def save_to_csv(self, filename=None, intermediate=False):
        """Write collected properties to a CSV file.

        Args:
            filename: Target path; defaults to ``self.filename``.
            intermediate: If true, this is a periodic checkpoint: when the
                file already exists, only the rows not yet appended to it by
                an earlier intermediate save are appended (no header), so the
                same listing is never written twice. If false, the file is
                rewritten from scratch with a header and every collected row,
                after which ``self.properties`` is emptied.

        I/O errors are printed, not raised; on failure nothing is marked as
        saved, so the next call retries the same rows.
        """
        if not self.properties:
            print("No properties to save")
            return

        filename = filename or self.filename

        if intermediate and os.path.exists(filename):
            mode = 'a'
            # Skip rows a previous checkpoint already appended to this file.
            start = self._flushed.get(filename, 0)
        else:
            mode = 'w'
            start = 0
        rows = self.properties[start:]

        if not rows:
            print("No new properties to save")
            return

        try:
            with open(filename, mode, newline='', encoding='utf-8') as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=self.FIELDNAMES)
                if mode == 'w':
                    writer.writeheader()
                writer.writerows(rows)

            status = "Intermediate save" if intermediate else "Final save"
            print(f"{status}: Saved {len(rows)} properties to {filename}")

            if intermediate:
                self._flushed[filename] = len(self.properties)
            else:
                self.properties = []  # Clear after final save
                self._flushed.clear()  # Offsets refer to the list just cleared

        except IOError as e:
            print(f"Error saving to CSV: {e}")

def main():
    """Run a full scrape of up to 8200 pages and persist the results.

    Any exception raised while scraping or saving is reported and followed
    by one more save attempt so that collected rows are not lost.
    """
    crawler = Property24Scraper(max_pages=8200)
    try:
        crawler.scrape_properties()
        crawler.save_to_csv()
    except Exception as err:
        print(f"Scraping failed: {err}")
        # Best-effort: write whatever was collected before the failure.
        crawler.save_to_csv()

# Start scraping only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Scraping page 2531/8200: https://www.property24.co.ke/apartments-flats-to-rent-in-nairobi-c1890?Page=2531
Scraping page 2532/8200: https://www.property24.co.ke/apartments-flats-to-rent-in-nairobi-c1890?Page=2532
Scraping page 2533/8200: https://www.property24.co.ke/apartments-flats-to-rent-in-nairobi-c1890?Page=2533
Scraping page 2534/8200: https://www.property24.co.ke/apartments-flats-to-rent-in-nairobi-c1890?Page=2534
Scraping page 2535/8200: https://www.property24.co.ke/apartments-flats-to-rent-in-nairobi-c1890?Page=2535
Scraping page 2536/8200: https://www.property24.co.ke/apartments-flats-to-rent-in-nairobi-c1890?Page=2536
Scraping page 2537/8200: https://www.property24.co.ke/apartments-flats-to-rent-in-nairobi-c1890?Page=2537
Scraping page 2538/8200: https://www.property24.co.ke/apartments-flats-to-rent-in-nairobi-c1890?Page=2538
Scraping page 2539/8200: https://www.property24.co.ke/apartments-flats-to-rent-in-nairo