In [7]:
# mounted at /content/drive

import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import time

# User-Agent
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'}



### Overview
This code sets up the necessary tools and headers to scrape web data. It imports libraries for sending HTTP requests, parsing HTML, and handling data. Additionally, it defines a user agent header to mimic a web browser, which can be useful to avoid getting blocked by some websites.

***Import requests**
    This line imports the requests module, which is a popular Python module used to send HTTP requests to websites.
* **From bs4 import BeautifulSoup**
    This line imports the BeautifulSoup from the bs4 module. BeautifulSoup is a library that is used for web scraping purposes to pull the data out of HTML and XML files. It creates a parse tree that can be used to extract data in a hierarchical and more readable manner.
* **Import pandas as pd**
    This line imports the pandas library, which is a powerful data manipulation and analysis tool. You will use it to structure and analyze the data you scrape from the websites.
* **Import os**
    This line imports the os module, which is used for interacting with the operating system. This could be used for tasks like creating directories, reading environment variables, etc.
* **Import time**
    This line imports the time module, which can be used to add delays to your code. Adding delays is a good practice when web scraping to avoid sending too many requests too quickly, which could lead to you getting blocked by the website.


# Extracting Flats/Apartments

In [8]:
# mounted at /content/drive

import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import time

# User-Agent
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'}

flats = pd.DataFrame()

start = 127
end = 200
csv_file = f"/content/drive/MyDrive/DSP/Case Studies/Real estate/Flats_gurgaon_data_p{start}-{end}.csv"

pageNumber = start
req=0
city = 'gurgaon'

while pageNumber < end:
    url = f"https://www.99acres.com/flats-in-{city}-ffid-page-{pageNumber}"
    page = requests.get(url, headers=headers)
    
    pageSoup = BeautifulSoup(page.content, 'html.parser')
    
    # Rest of the code for parsing and extracting data would go here...
    for soup in pageSoup.select_one('div[data-label="SEARCH"]').select('section[data-hydration-on-demand="true"]'):
        # Extract property name and property sub-name
        try:
            property_name = soup.select_one('a.srpTuple_propertyName').text.strip()
            link = soup.select_one('a.srpTuple_propertyName')['href']
            society = soup.select_one('div#srp_tuple_society_heading').text.strip()
        except:
            property_name = None
            link = None
            society = None
            continue # If there's an error on the main page, skip to the next item

        # Detail Page
        page = requests.get(link, headers=headers)
        dpageSoup = BeautifulSoup(page.content, 'html.parser')
        req += 1

        try:
            # Price Range
            price = dpageSoup.select_one('#pdPrice2').text.strip()
        except:
            price = None
            
        try:
            # Area
            area = dpageSoup.select_one('#srp_tuple_price_per_unit_area').text.strip()
        except:
            area = None
            
        try:
            # Area with Type
            areawithType = dpageSoup.select_one('#FartArea').text.strip()
        except:
            areawithType = None
            
        try:
            # Bedroom
            bedroom = dpageSoup.select_one('div.srpTuple__bedroomNum').text.strip()
        except:
            bedroom = None
            
        try:
            # Bathroom
            bathroom = dpageSoup.select_one('div.srpTuple__bathroomNum').text.strip()
        except:
            bathroom = None
        try:
            # Balcony
            balcony = dpageSoup.select_one('div.srpTuple__balconyNum').text.strip()
        except:
            balcony = None    
        try:
            # Parking
            parking = dpageSoup.select_one('div.srpTuple__parking').text.strip()
        except:
            parking = None

        try:
            # Address
            address = dpageSoup.select_one('div#address').text.strip()
        except:
            address = None

        try:
            # Floor Number
            floorNum = dpageSoup.select_one('div#floor_number').text.strip()
        except:
            floorNum = None
        
        try:
            # Facing
            facing = dpageSoup.select_one('div#facing').text.strip()
        except:
            facing = None

        try:
            # Age of Possession
            agePossession = dpageSoup.select_one('div#agePossessionLbl').text.strip()
        except:
            agePossession = None
        
        try:
            # Nearby Locations
            nearbyLocations = [i.text.strip() for i in dpageSoup.select_one('div.nearbyLocation_tagWrap').select('span.nearbyLocation_infoText')]
        except:
            nearbyLocations = []

        try:
            # Description
            description = dpageSoup.select_one('div.srp_tuple_description').text.strip()
        except:
            description = None

        try:
            # Furnish Details
            furnishDetails = [i.text.strip() for i in dpageSoup.select_one('ul#FurnishDetails').select('li')]
        except:
            furnishDetails = []

        # Features
        if furnishDetails:
            try:
                features = [i.text.strip() for i in dpageSoup.select_one('ul#features').select('li')]
            except:
                features = []
        else:
            try:
                features = [i.text.strip() for i in dpageSoup.select_one('ul#features').select('li')]
            except:
                features = []
                
        try:
            # Rating
            rating = [i.text.strip() for i in dpageSoup.select_one('div.review_rightSide_div_1_5').select('div.ratingByFeature_circleWrap')]
        except:
            rating = None
            
        try:
            # Property ID
            property_id = dpageSoup.select_one('div#prop_id').text.strip()
        except:
            property_id = None
            
        # create a dictionary with the given variables
        property_data = {
            'property_name': property_name,
            'link': link,
            'society': society,
            'price': price,
            'area': area,
            'areawithType': areawithType,
            'bedroom': bedroom,
            'bathroom': bathroom,
            'balcony': balcony,
            'additionalRooms': None, # This was not in your screenshots but is a good practice
            'address': address,
            'floorNum': floorNum,
            'facing': facing,
            'agePossession': agePossession,
            'nearbyLocations': nearbyLocations,
            'description': description,
            'furnishDetails': furnishDetails,
            'features': features,
            'rating': rating,
            'property_id': property_id
        }
        
    temp_df = pd.DataFrame.from_records([property_data])
    # print(temp_df)
    flats = pd.concat([flats, temp_df], ignore_index=True)
    
    if os.path.isfile(csv_file):
        # Append DataFrame to the existing file without header
        temp_df.to_csv(csv_file, mode='a', header=False, index=False)
    else:
        # Write DataFrame to the file with header
        temp_df.to_csv(csv_file, mode='a', header=True, index=False)
        
    if req % 4 == 0:
        time.sleep(10)
    if req % 15 == 0:
        time.sleep(50)
    
    print(f"{pageNumber} -> {req}")
    pageNumber += 1


ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

The code scrapes property data from the website "99acres.com" for apartments in Gurgaon. It navigates through a range of pages, extracts details of each property, and saves the data to a CSV file. The script is designed to handle potential errors gracefully, using try and except blocks to manage missing data, and introduces pauses to avoid making rapid requests and potentially getting blocked by the website
Initialization of Variables:
⚫ start and end specify the range of web pages to scrape.
csv file defines the path to the CSV file where data will be saved.
pageNumber starts from the initial value of start and will be incremented to navigate through the pages.
req counts the number of HTTP requests made.
Loop for Page Navigation
The while loop is used to navigate through each page in the range from start to end.
Inside this loop, the URL of the page to be scraped is constructed using the pageNumber.
An HTTP GET request is made to retrieve the content of the page, and the content is then parsed using BeautifulSoup.
Loop for Property Extraction.
The nested for loop navigates through individual property sections on the current page.
The script attempts to extract the property name, its link, and its society name.
If any of these attributes are missing, it skips to the next property.
00:33:44
For each property, an HTTP request is made to its detail page
1.
Session 1 on Capstone Project - Data Gathering property details sxe pike, aida. Leam Sabunt, battitom count, balcony count address, and many other attributes. if any attribute is missing the code handles it gracefully, assigning an empty string or an empty list as

In [None]:
# New: Create a session with a retry strategy
session = requests.Session()
retry_strategy = Retry(
    total=3,  # Number of retries
    backoff_factor=1,  # Delay factor (1s, 2s, 4s...)
    status_forcelist=[429, 500, 502, 503, 504]
)
adapter = HTTPAdapter(max_retries=retry_strategy)
session.mount("http://", adapter)
session.mount("https://", adapter)


# Your existing code
flats = pd.DataFrame()

start = 127
end = 200
csv_file = f"/content/drive/MyDrive/DSP/Case Studies/Real estate/Flats_gurgaon_data_p{start}-{end}.csv"

pageNumber = start
req=0
city = 'gurgaon'

while pageNumber < end:
    url = f"https://www.99acres.com/flats-in-{city}-ffid-page-{pageNumber}"
    
    # Use the session to make requests
    try:
        page = session.get(url, headers=headers)
        pageSoup = BeautifulSoup(page.content, 'html.parser')
    except requests.exceptions.RequestException as e:
        print(f"Error making request: {e}")
        pageNumber += 1
        continue
    
    search_results = pageSoup.select_one('div[data-label="SEARCH"]')
    
    if search_results:
        for soup in search_results.select('section[data-hydration-on-demand="true"]'):
            # Extract property name and property sub-name
            try:
                property_name = soup.select_one('a.srpTuple_propertyName').text.strip()
                link = soup.select_one('a.srpTuple_propertyName')['href']
                society = soup.select_one('div#srp_tuple_society_heading').text.strip()
            except:
                property_name = None
                link = None
                society = None
                continue # If there's an error on the main page, skip to the next item

            # Detail Page
            try:
                page = session.get(link, headers=headers)
                dpageSoup = BeautifulSoup(page.content, 'html.parser')
                req += 1
            except requests.exceptions.RequestException as e:
                print(f"Error making request to detail page: {e}")
                continue

            try:
                # Price Range
                price = dpageSoup.select_one('#pdPrice2').text.strip()
            except:
                price = None
                
            try:
                # Area
                area = dpageSoup.select_one('#srp_tuple_price_per_unit_area').text.strip()
            except:
                area = None
                
            try:
                # Area with Type
                areawithType = dpageSoup.select_one('#FartArea').text.strip()
            except:
                areawithType = None
                
            try:
                # Bedroom
                bedroom = dpageSoup.select_one('div.srpTuple__bedroomNum').text.strip()
            except:
                bedroom = None
                
            try:
                # Bathroom
                bathroom = dpageSoup.select_one('div.srpTuple__bathroomNum').text.strip()
            except:
                bathroom = None
            try:
                # Balcony
                balcony = dpageSoup.select_one('div.srpTuple__balconyNum').text.strip()
            except:
                balcony = None    
            try:
                # Parking
                parking = dpageSoup.select_one('div.srpTuple__parking').text.strip()
            except:
                parking = None

            try:
                # Address
                address = dpageSoup.select_one('div#address').text.strip()
            except:
                address = None

            try:
                # Floor Number
                floorNum = dpageSoup.select_one('div#floor_number').text.strip()
            except:
                floorNum = None
            
            try:
                # Facing
                facing = dpageSoup.select_one('div#facing').text.strip()
            except:
                facing = None

            try:
                # Age of Possession
                agePossession = dpageSoup.select_one('div#agePossessionLbl').text.strip()
            except:
                agePossession = None
            
            try:
                # Nearby Locations
                nearbyLocations = [i.text.strip() for i in dpageSoup.select_one('div.nearbyLocation_tagWrap').select('span.nearbyLocation_infoText')]
            except:
                nearbyLocations = []

            try:
                # Description
                description = dpageSoup.select_one('div.srp_tuple_description').text.strip()
            except:
                description = None

            try:
                # Furnish Details
                furnishDetails = [i.text.strip() for i in dpageSoup.select_one('ul#FurnishDetails').select('li')]
            except:
                furnishDetails = []

            # Features
            if furnishDetails:
                try:
                    features = [i.text.strip() for i in dpageSoup.select_one('ul#features').select('li')]
                except:
                    features = []
            else:
                try:
                    features = [i.text.strip() for i in dpageSoup.select_one('ul#features').select('li')]
                except:
                    features = []
                    
            try:
                # Rating
                rating = [i.text.strip() for i in dpageSoup.select_one('div.review_rightSide_div_1_5').select('div.ratingByFeature_circleWrap')]
            except:
                rating = None
                
            try:
                # Property ID
                property_id = dpageSoup.select_one('div#prop_id').text.strip()
            except:
                property_id = None
                
            # create a dictionary with the given variables
            property_data = {
                'property_name': property_name,
                'link': link,
                'society': society,
                'price': price,
                'area': area,
                'areawithType': areawithType,
                'bedroom': bedroom,
                'bathroom': bathroom,
                'balcony': balcony,
                'additionalRooms': None, # This was not in your screenshots but is a good practice
                'address': address,
                'floorNum': floorNum,
                'facing': facing,
                'agePossession': agePossession,
                'nearbyLocations': nearbyLocations,
                'description': description,
                'furnishDetails': furnishDetails,
                'features': features,
                'rating': rating,
                'property_id': property_id
            }
            
        temp_df = pd.DataFrame.from_records([property_data])
        # print(temp_df)
        flats = pd.concat([flats, temp_df], ignore_index=True)
        
        if os.path.isfile(csv_file):
            # Append DataFrame to the existing file without header
            temp_df.to_csv(csv_file, mode='a', header=False, index=False)
        else:
            # Write DataFrame to the file with header
            temp_df.to_csv(csv_file, mode='a', header=True, index=False)
            
        if req % 5 == 0:
            time.sleep(10)
        if req % 15 == 0:
            time.sleep(50)
        
        print(f"{pageNumber} -> {req}")
    pageNumber += 1


In [None]:
# mounted at /content/drive

import requests
from requests.adapters import HTTPAdapter, Retry
from bs4 import BeautifulSoup
import pandas as pd
import os
import time

# User-Agent
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'}

# New: Create a session with a retry strategy
session = requests.Session()
retry_strategy = Retry(
    total=3,  # Number of retries
    backoff_factor=1,  # Delay factor (1s, 2s, 4s...)
    status_forcelist=[429, 500, 502, 503, 504]
)
adapter = HTTPAdapter(max_retries=retry_strategy)
session.mount("http://", adapter)
session.mount("https://", adapter)


# Your existing code
flats = pd.DataFrame()

start = 1
end = 10
csv_file = f"/content/drive/MyDrive/DSP/Case Studies/Real estate/Flats_gurgaon_housing_data_p{start}-{end}.csv"

pageNumber = start
req = 0
city = 'gurgaon'

while pageNumber < end:
    url = f"https://housing.com/in/buy/flats/p/{city}?page={pageNumber}"
    
    # Use the session to make requests
    try:
        page = session.get(url, headers=headers)
        pageSoup = BeautifulSoup(page.content, 'html.parser')
    except requests.exceptions.RequestException as e:
        print(f"Error making request: {e}")
        pageNumber += 1
        continue
    
    # The new search result selector for Housing.com
    search_results = pageSoup.find_all('div', {'data-testid': 'listing-card'})
    
    if search_results:
        for soup in search_results:
            # Extract property name and property sub-name
            try:
                property_name = soup.find('h2', {'data-testid': 'listing-card-title'}).text.strip()
                link = soup.find('a', {'data-testid': 'listing-card-link'})['href']
                society = soup.find('div', {'data-testid': 'listing-card-title-address'}).text.strip()
            except:
                property_name = None
                link = None
                society = None
                continue # If there's an error on the main page, skip to the next item

            # Detail Page
            try:
                page = session.get(link, headers=headers)
                dpageSoup = BeautifulSoup(page.content, 'html.parser')
                req += 1
            except requests.exceptions.RequestException as e:
                print(f"Error making request to detail page: {e}")
                continue

            # This part will need to be updated with new selectors for the detail page of Housing.com
            # As the structure is different, you will need to inspect the elements of a Housing.com detail page
            # to find the correct selectors for price, area, bedroom, etc.
            
            # For now, we will add placeholders. You can fill these in after inspecting the page.
            price = None
            area = None
            areawithType = None
            bedroom = None
            bathroom = None
            balcony = None
            parking = None
            address = None
            floorNum = None
            facing = None
            agePossession = None
            nearbyLocations = []
            description = None
            furnishDetails = []
            features = []
            rating = None
            property_id = None
                
            # create a dictionary with the given variables
            property_data = {
                'property_name': property_name,
                'link': link,
                'society': society,
                'price': price,
                'area': area,
                'areawithType': areawithType,
                'bedroom': bedroom,
                'bathroom': bathroom,
                'balcony': balcony,
                'additionalRooms': None, 
                'address': address,
                'floorNum': floorNum,
                'facing': facing,
                'agePossession': agePossession,
                'nearbyLocations': nearbyLocations,
                'description': description,
                'furnishDetails': furnishDetails,
                'features': features,
                'rating': rating,
                'property_id': property_id
            }
            
        temp_df = pd.DataFrame.from_records([property_data])
        flats = pd.concat([flats, temp_df], ignore_index=True)
        
        if os.path.isfile(csv_file):
            temp_df.to_csv(csv_file, mode='a', header=False, index=False)
        else:
            temp_df.to_csv(csv_file, mode='a', header=True, index=False)
            
        if req % 5 == 0:
            time.sleep(10)
        if req % 15 == 0:
            time.sleep(50)
        
        print(f"{pageNumber} -> {req}")
    pageNumber += 1


In [10]:
# mounted at /content/drive

import requests
from requests.adapters import HTTPAdapter, Retry
from bs4 import BeautifulSoup
import pandas as pd
import os
import time

# User-Agent
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'}

# New: Create a session with a retry strategy
session = requests.Session()
retry_strategy = Retry(
    total=3,  # Number of retries
    backoff_factor=1,  # Delay factor (1s, 2s, 4s...)
    status_forcelist=[429, 500, 502, 503, 504]
)
adapter = HTTPAdapter(max_retries=retry_strategy)
session.mount("http://", adapter)
session.mount("https://", adapter)


# Your existing code
flats = pd.DataFrame()

start = 1
end = 10
csv_file = f"/content/drive/MyDrive/DSP/Case Studies/Real estate/Flats_gurgaon_housing_data_p{start}-{end}.csv"

pageNumber = start
req = 0
city = 'gurgaon'

while pageNumber < end:
    url = f"https://housing.com/in/buy/flats/p/{city}?page={pageNumber}"
    
    # Use the session to make requests
    try:
        page = session.get(url, headers=headers)
        pageSoup = BeautifulSoup(page.content, 'html.parser')
    except requests.exceptions.RequestException as e:
        print(f"Error making request: {e}")
        pageNumber += 1
        continue
    
    # The new search result selector for Housing.com
    search_results = pageSoup.find_all('div', {'data-testid': 'listing-card'})
    
    if search_results:
        for soup in search_results:
            # Extract property name and property sub-name
            try:
                property_name = soup.find('h2', {'data-testid': 'listing-card-title'}).text.strip()
                link = soup.find('a', {'data-testid': 'listing-card-link'})['href']
                society = soup.find('div', {'data-testid': 'listing-card-title-address'}).text.strip()
            except:
                property_name = None
                link = None
                society = None
                continue # If there's an error on the main page, skip to the next item

            # Detail Page
            try:
                # Add a longer, randomized sleep here to appear more human-like
                time.sleep(1 + time.random() * 2) 
                page = session.get(link, headers=headers)
                dpageSoup = BeautifulSoup(page.content, 'html.parser')
                req += 1
            except requests.exceptions.RequestException as e:
                print(f"Error making request to detail page: {e}")
                continue

            # This part will need to be updated with new selectors for the detail page of Housing.com
            # As the structure is different, you will need to inspect the elements of a Housing.com detail page
            # to find the correct selectors for price, area, bedroom, etc.
            
            # For now, we will add placeholders. You can fill these in after inspecting the page.
            price = None
            area = None
            areawithType = None
            bedroom = None
            bathroom = None
            balcony = None
            parking = None
            address = None
            floorNum = None
            facing = None
            agePossession = None
            nearbyLocations = []
            description = None
            furnishDetails = []
            features = []
            rating = None
            property_id = None
                
            # create a dictionary with the given variables
            property_data = {
                'property_name': property_name,
                'link': link,
                'society': society,
                'price': price,
                'area': area,
                'areawithType': areawithType,
                'bedroom': bedroom,
                'bathroom': bathroom,
                'balcony': balcony,
                'additionalRooms': None, 
                'address': address,
                'floorNum': floorNum,
                'facing': facing,
                'agePossession': agePossession,
                'nearbyLocations': nearbyLocations,
                'description': description,
                'furnishDetails': furnishDetails,
                'features': features,
                'rating': rating,
                'property_id': property_id
            }
            
        temp_df = pd.DataFrame.from_records([property_data])
        flats = pd.concat([flats, temp_df], ignore_index=True)
        
        if os.path.isfile(csv_file):
            temp_df.to_csv(csv_file, mode='a', header=False, index=False)
        else:
            temp_df.to_csv(csv_file, mode='a', header=True, index=False)
        
        # Increased sleep time for pages to avoid throttling
        print(f"Pausing for 30 seconds to avoid being blocked...")
        time.sleep(30)
        
        print(f"{pageNumber} -> {req}")
    pageNumber += 1


In [11]:
flats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Empty DataFrame
