## Scraping property listings from [Graana.com](https://www.graana.com)

In [2]:
import requests
from bs4 import BeautifulSoup
import csv
import time
import random
import os

In [None]:
def scrape_graana_lahore(output_csv='data/graana_lahore_data.csv', max_pages=30):
    """
    Scrape property listings from Graana.com for Lahore into a CSV file.

    Rows already present in ``output_csv`` are loaded first and written back
    together with the newly scraped ones. A listing whose URL is already known
    (from the file or from earlier in this run) is skipped before its detail
    page is requested.

    WARNING: For educational purposes only. Web scraping can be against the
    terms of service of a website. Please respect Graana.com's policies.
    Frequent, rapid requests can get your IP address blocked.

    Args:
        output_csv: Path of the CSV file to read existing rows from and to
            write the merged result to. Missing parent directories are created.
        max_pages: Maximum number of result pages to request per category.

    Returns:
        None. Results are written to ``output_csv``; progress is printed.
    """
    # Function-scope import so the block does not depend on a file-level import.
    from urllib.parse import urljoin

    site_root = "https://www.graana.com"

    # Listing-index URL templates for Lahore; ``{}`` receives the page number.
    base_url_sale_houses = "https://www.graana.com/sale/residential-properties-sale-lahore-2/?pageSize=30&page={}"
    base_url_sale_plots = "https://www.graana.com/sale/plot-sale-lahore-2/?page={}&pageSize=30"
    base_url_commercial = "https://www.graana.com/sale/commercial-properties-sale-lahore-2/?page={}&pageSize=30"
    base_url_rentals_commercial = "https://www.graana.com/rent/commercial-properties-rent-lahore-2/?page={}&pageSize=30"
    base_url_rentals_residential = "https://www.graana.com/rent/residential-properties-rent-lahore-2/?page={}&pageSize=30"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'
    }

    all_listings = []
    existing_urls = set()

    # --- Step 1: Load existing data and URLs from CSV if it exists ---
    if os.path.exists(output_csv):
        print(f"Loading existing data from {output_csv}...")
        try:
            malformed_rows = 0
            with open(output_csv, 'r', newline='', encoding='utf-8') as f:
                for row in csv.DictReader(f):
                    # DictReader stores fields beyond the header under the key
                    # None; such a key cannot be written back by DictWriter,
                    # so drop the overflow instead of crashing at export time.
                    if row.pop(None, None) is not None:
                        malformed_rows += 1
                    all_listings.append(row)
                    if row.get('URL'):
                        existing_urls.add(row['URL'])
            if malformed_rows:
                print(f"Ignored extra columns in {malformed_rows} malformed row(s).")
            print(f"Loaded {len(existing_urls)} existing URLs. Will skip any duplicates.")
        except Exception as e:
            # Best effort: an unreadable file must not stop the scrape.
            print(f"Could not read existing CSV file. Starting fresh. Error: {e}")
            # Reset BOTH collections; keeping the URLs would make the scraper
            # skip listings that are no longer going to be written out.
            all_listings = []
            existing_urls = set()

    def text_or_na(tag):
        """Return the stripped text of a parsed tag, or 'N/A' if it is missing."""
        return tag.text.strip() if tag else 'N/A'

    def fetch_listings(url_template, category):
        """Fetch result pages for one category and append new listings."""
        page_num = 1
        while page_num <= max_pages:
            url = url_template.format(page_num)
            print(f"Fetching {category} from: {url}")
            try:
                response = requests.get(url, headers=headers, timeout=15)
                response.raise_for_status()  # Raise for 4xx / 5xx responses
            except requests.exceptions.RequestException as e:
                print(f"Error fetching {url}: {e}")
                break  # Stop this category if a page fails to load

            soup = BeautifulSoup(response.content, 'html.parser')

            # The main container for each property listing
            listings = soup.find_all('div', class_='MuiBox-root mui-style-17zbhp0')

            if not listings:
                print(f"No more {category} listings found on page {page_num}. Moving to next category.")
                break

            for listing in listings:
                try:
                    # The first link in the card carries the listing URL.
                    link_tag = listing.find('a')
                    if not link_tag:
                        continue

                    # --- Step 2: Construct full URL and check for duplicates ---
                    relative_link = link_tag.get('href', '')
                    if not relative_link:
                        # Without an href the "URL" would just be the site root.
                        continue
                    # urljoin copes with both relative and absolute hrefs.
                    full_url = urljoin(site_root, relative_link)
                    print(f"Processing listing URL: {full_url}")

                    if full_url in existing_urls:
                        continue  # Already known; skip the detail request too

                    # The detail page's <title> is used as the listing title;
                    # fall back to the link text when it is unavailable.
                    try:
                        detail_response = requests.get(full_url, headers=headers, timeout=15)
                        detail_response.raise_for_status()
                        detail_soup = BeautifulSoup(detail_response.content, 'html.parser')
                        title_tag = detail_soup.title
                        if title_tag is None:
                            title = 'N/A'
                        elif title_tag.string:
                            title = title_tag.string.strip()
                        else:
                            # <title> exists but has no single string child.
                            title = link_tag.text.strip()
                    except requests.exceptions.RequestException as e:
                        print(f"Could not fetch title from detail page {full_url}: {e}")
                        title = link_tag.text.strip()
                    # Pause after every detail request as well, in line with
                    # the rate-limiting warning in the docstring.
                    time.sleep(random.uniform(0.5, 1.5))

                    price = text_or_na(
                        listing.find('div', class_='MuiTypography-root MuiTypography-h4New mui-style-gz23my')
                    )
                    location = text_or_na(listing.find('h5'))
                    # Area is often found in a container with other features
                    area = text_or_na(
                        listing.find('div', class_='MuiTypography-root MuiTypography-body2New mui-style-1548769')
                    )

                    print(f"Found new listing: {title}, Price: {price}, Location: {location}, Area: {area}")

                    all_listings.append({
                        'Category': category,
                        'Title': title,
                        'Price': price,
                        'Location': location,
                        'Area': area,
                        'URL': full_url
                    })

                    # --- Step 3: Remember the URL to avoid duplicates in this session ---
                    existing_urls.add(full_url)

                except AttributeError as e:
                    print(f"Skipping a listing due to a missing element: {e}")
                except Exception as e:
                    print(f"An unexpected error occurred while processing a listing: {e}")

            page_num += 1
            # Be respectful to the server by waiting between requests
            time.sleep(random.uniform(3, 6))

    print("--- Starting to scrape Graana.com (Lahore) ---")

    # Scrape each category (uncomment the ones you want to include)
    # fetch_listings(base_url_sale_houses, 'Homes for Sale')
    # fetch_listings(base_url_sale_plots, 'Plots for Sale')
    # fetch_listings(base_url_rentals_residential, 'Rentals')
    # fetch_listings(base_url_rentals_commercial, 'Commercial Rentals')
    fetch_listings(base_url_commercial, 'Commercial for Sale')

    print(f"\nScraping completed. Total listings to write: {len(all_listings)}.")

    # --- Step 4: Write all data (old and new) to the CSV file ---
    if all_listings:
        # Ordered union of every row's keys: rows loaded from an older file may
        # not share the exact column set of freshly scraped rows.
        fieldnames = list(dict.fromkeys(key for row in all_listings for key in row))

        out_dir = os.path.dirname(output_csv)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        # Write to a sibling temp file and swap it in, so a failure while
        # writing cannot leave the existing CSV truncated.
        tmp_csv = f"{os.fspath(output_csv)}.tmp"
        with open(tmp_csv, 'w', newline='', encoding='utf-8') as output_file:
            dict_writer = csv.DictWriter(output_file, fieldnames=fieldnames)
            dict_writer.writeheader()
            dict_writer.writerows(all_listings)
        os.replace(tmp_csv, output_csv)
        print(f"Data successfully exported to {output_csv}")
    else:
        print("No data was scraped to export.")

if __name__ == "__main__":
    # Entry point: runs the scraper with the default output path.
    # You can change the number of pages to scrape here
    scrape_graana_lahore(max_pages=30)

Loading existing data from data/graana_lahore_data.csv...
Loaded 1964 existing URLs. Will skip any duplicates.
--- Starting to scrape Graana.com (Lahore) ---
Fetching Commercial for Sale from: https://www.graana.com/sale/commercial-properties-sale-lahore-2/?page=1&pageSize=30
Processing listing URL: https://www.graana.com/property/300-sqft-shop-sale-bismillah-housing-scheme-lahore-1466928/
Processing listing URL: https://www.graana.com/property/1100-sqft-plaza-sale-bahria-town---sector-e--lahore-1466682/
Processing listing URL: https://www.graana.com/property/190-sqft-office-sale-anarkali-lahore-1464300/
Processing listing URL: https://www.graana.com/property/2.4-marla-shop-sale-township-lahore-1462300
Processing listing URL: https://www.graana.com/property/7-marla-office-sale-audit-and-accounts-housing-society-lahore-1460806/
Processing listing URL: https://www.graana.com/property/489-sqft-office-sale-gulberg-3-lahore-1456759/
Processing listing URL: https://www.graana.com/property/10