### Zameen Web Scraping 

In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import time
import random
import os 


In [None]:


def scrape_zameen_lahore(output_csv='data/zameen_lahore_data.csv', max_pages=10):
    """
    Scrape Lahore property listings from Zameen.com and merge them into a CSV.

    WARNING: educational purposes only.
    It is highly likely to be blocked by Zameen.com's anti-scraping measures.
    Please respect their robots.txt and terms of service.

    Rows already present in ``output_csv`` are loaded first and their URLs are
    used to skip listings that were scraped on an earlier run. Five categories
    (Homes, Plots, Rentals, Commercial, Commercial Rentals) are then fetched
    page by page, and the combined old + new rows are written back to
    ``output_csv``.

    Args:
        output_csv: Path of the CSV file to read existing rows from and to
            write the merged result to. Its parent directory is created if
            it does not exist.
        max_pages: Maximum number of result pages to request per category.
            Fetching a category stops early on a request error or on the
            first page that contains no listings.

    Returns:
        None. Results are written to ``output_csv``; progress is printed.
    """
    # Function-scope import so this block needs no new file-level import.
    from urllib.parse import urljoin

    site_root = "https://www.zameen.com"
    category_urls = [
        ("https://www.zameen.com/Homes/Lahore-1-{}.html", 'Homes'),
        ("https://www.zameen.com/Plots/Lahore-1-{}.html", 'Plots'),
        ("https://www.zameen.com/Rentals/Lahore-1-{}.html", 'Rentals'),
        ("https://www.zameen.com/Commercial/Lahore-1-{}.html", 'Commercial'),
        ("https://www.zameen.com/Commercial_Rentals/Lahore-1-{}.html", 'Commercial Rentals'),
    ]

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    all_listings = []
    existing_urls = set()

    # Load previously scraped rows so repeated runs do not store duplicates.
    if os.path.exists(output_csv):
        try:
            with open(output_csv, 'r', newline='', encoding='utf-8') as f:
                for row in csv.DictReader(f):
                    # DictReader files cells beyond the header under the key
                    # None; DictWriter cannot write that key back, so drop it.
                    row.pop(None, None)
                    all_listings.append(row)
                    if row.get('URL'):
                        existing_urls.add(row['URL'])
            print(f"Loaded {len(existing_urls)} existing URLs. Will skip duplicates.")
        except (OSError, csv.Error, UnicodeError) as e:
            print(f"Could not read existing CSV file. Starting fresh. Error: {e}")
            # Reset BOTH collections: URLs from a partially read file must not
            # suppress listings whose rows were just discarded.
            all_listings.clear()
            existing_urls.clear()

    def fetch_listings(url_template, category):
        """Fetch up to ``max_pages`` pages of one category into ``all_listings``."""
        page_num = 1
        while page_num <= max_pages:
            url = url_template.format(page_num)
            print(f"Fetching {category} from: {url}")
            try:
                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"Error fetching {url}: {e}")
                break

            soup = BeautifulSoup(response.content, 'html.parser')
            listings = soup.find_all('li', attrs={'aria-label': "Listing"})

            if not listings:
                print(f"No more {category} listings found on page {page_num}. Exiting.")
                break

            for listing in listings:
                try:
                    title_tag = listing.find('a', attrs={'aria-label': "Listing link"})
                    if title_tag is None or 'title' not in title_tag.attrs:
                        continue

                    title = title_tag['title'].strip()
                    link = title_tag.get('href', '').strip()
                    if not link:
                        # Without a link there is no stable key for duplicate
                        # detection, so the listing cannot be tracked safely.
                        print(f"Skipping listing without a link: {title}")
                        continue

                    # urljoin handles both site-relative and absolute hrefs.
                    full_url = urljoin(site_root, link)
                    if full_url in existing_urls:
                        print(f"Skipping duplicate: {title}")
                        continue

                    price_tag = listing.find('span', attrs={'aria-label': "Price"})
                    price = price_tag.text.strip() if price_tag else 'N/A'

                    location_tag = listing.find('div', attrs={'aria-label': 'Location'})
                    location = location_tag.text.strip() if location_tag else 'N/A'

                    area_tag = listing.find('span', attrs={'aria-label': "Area"})
                    area = area_tag.text.strip() if area_tag else 'N/A'

                    print(f"Found new listing: {title}")

                    all_listings.append({
                        'Category': category,
                        'Title': title,
                        'Price': price,
                        'Location': location,
                        'Area': area,
                        'URL': full_url,
                    })

                    # Remember the URL so it is not stored twice in this session.
                    existing_urls.add(full_url)

                except AttributeError as e:
                    print(f"Skipping a listing due to missing element: {e}")
                except Exception as e:
                    print(f"An unexpected error occurred while processing a listing: {e}")

            page_num += 1
            # Randomized pause between requests to avoid hammering the site.
            time.sleep(random.uniform(2, 5))

    print("starting to scrape Zameen.com (Lahore)")

    for url_template, category in category_urls:
        fetch_listings(url_template, category)

    print(f"Scraping completed. Total listings to write: {len(all_listings)}.")

    if not all_listings:
        print("No data was scraped to export.")
        return

    # Use the ordered union of keys across ALL rows, so rows loaded from an
    # older CSV layout and freshly scraped rows can be written together.
    fieldnames = list(dict.fromkeys(key for row in all_listings for key in row))

    out_dir = os.path.dirname(output_csv)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Write to a temporary file and swap it in afterwards, so a failure while
    # writing never truncates the existing data file.
    tmp_path = f"{output_csv}.tmp"
    try:
        with open(tmp_path, 'w', newline='', encoding='utf-8') as output_file:
            dict_writer = csv.DictWriter(output_file, fieldnames=fieldnames)
            dict_writer.writeheader()
            dict_writer.writerows(all_listings)
        os.replace(tmp_path, output_csv)
    finally:
        # Only present here if writing or replacing failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    print(f"Data successfully exported to {output_csv}")

# Entry point: run the scraper with the default output path, requesting at
# most 20 result pages per category.
if __name__ == "__main__":
    scrape_zameen_lahore(max_pages=20)

Loaded 3061 existing URLs. Will skip duplicates.
starting to scrape Zameen.com (Lahore)
Fetching Homes from: https://www.zameen.com/Homes/Lahore-1-1.html
Found new listing: 1 Kanal Brand New Magnificent Designer House For Sale
Found new listing: 14 Marla 4 Bedroom Brand New Apartment Available For Sale In Askari 10 Sector F Lahore Cantt
Skipping duplicate: Luxurious 1 Kanal House for Sale Your Dream Home Awaits!
Found new listing: 1 Kanal Owner Build Modern Bungalow For Sale In Phase 3 DHA Prime Location
Found new listing: Luxury Apartment for Sale The Oasis Grand 14 by Landmark Developers in Down Town
Found new listing: Near to Park House For sale In DHA Phase 2 - Block S
Found new listing: brand new house
Found new listing: Luxury 1 Bed Apartment For Sale In (Gulberg 3) Best For Living And Investment!
Found new listing: Discover The Charm Of This Stunning 10 Marla Lewis-Built Home Perfect Blend Of Luxury And Modern Living In A Prime Neighborhood
Found new listing: SUPER HOT LOCATION 