In [71]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# Square Yards listing page for new projects in Gurgaon (per the URL path).
url = "https://www.squareyards.com/new-projects-in-gurgaon"
# Download the listing page; the bare `response` expression below displays
# the Response object (and therefore its HTTP status) as the cell output.
response = requests.get(url)
response

<Response [200]>

In [6]:
# Parse the downloaded listing HTML with Python's built-in 'html.parser' backend.
soup = BeautifulSoup(response.text , 'html.parser')


In [15]:
# First <div id="dseprojectdata"> on the page; the [0] raises IndexError if the
# page has no such element. Presumably this is the container holding all
# project tiles - confirm against the live markup.
properties_details = soup.find_all('div',id='dseprojectdata')[0]

In [26]:
# Every <div class="tileBox"> inside the container; each one is iterated as a
# single listing in the next cell.
properties = properties_details.find_all('div', class_='tileBox')

In [58]:
def _thumbnail_url(tile):
    """Return the tile's image URL from ``src`` or ``data-src``, or '' if absent."""
    image = tile.find('img')
    if image is None:
        # Previously `thumbnail` was left unassigned here, so a tile without an
        # image either raised NameError or silently reused the previous tile's URL.
        return ''
    for attr in ('src', 'data-src'):
        if attr in image.attrs:
            return image[attr]
    return ''


# Build one record per listing tile: thumbnail, detail-page URL, name and price.
data = []
# `tile` (not `property`) avoids shadowing the builtin, and the link is kept in
# a local name so the listing-page `url` defined earlier is not overwritten.
for tile in properties:
    link = tile.find('a')
    price_box = tile.find('div', class_='priceWithStar')
    data.append({
        'thumbnail': _thumbnail_url(tile),
        'url': link['href'],
        'property': link.text,
        # The original `.strip('\n').strip('')` never removed spaces because
        # strip('') is a no-op; strip() trims all surrounding whitespace.
        'price': price_box.text.strip(),
    })
    

In [61]:
# One row per listing tile; columns come from the dict keys built above
# (thumbnail, url, property, price).
df = pd.DataFrame(data)

In [62]:
# Work on an independent copy: plain assignment only aliases the same object,
# so every column added to `new_df` below would also appear on `df`.
new_df = df.copy()

In [65]:
# Preview the first five scraped listings.
new_df.head()

Unnamed: 0,thumbnail,url,property,price
0,https://www.squareyards.com/cdn-cgi/image/widt...,https://www.squareyards.com/gurgaon-residentia...,Signature Global City 81,₹ 88.30 Lac - 1.10 Cr
1,https://static.squareyards.com/cdn-cgi/image/w...,https://www.squareyards.com/gurgaon-residentia...,Signature Global Park,₹ 37.90 Lac - 73.27 Lac
2,https://static.squareyards.com/cdn-cgi/image/w...,https://www.squareyards.com/gurgaon-residentia...,M3M Antalya Hills,₹ 1.13 Cr - 1.50 Cr
3,https://static.squareyards.com/cdn-cgi/image/w...,https://www.squareyards.com/gurgaon-residentia...,Smart World Orchard,₹ 1.25 Cr - 2.57 Cr
4,https://static.squareyards.com/cdn-cgi/image/w...,https://www.squareyards.com/gurgaon-residentia...,M3M Crown,₹ 2.22 Cr - 3.61 Cr


In [67]:
# Count missing values per column to check that every tile parsed completely.
df.isnull().sum()

thumbnail    0
url          0
property     0
price        0
dtype: int64

In [72]:
def rating(x):
    """Scrape the rating shown on a project page.

    Fetches the page at ``x`` and reads the text of the ``div.ratingValue``
    element inside the first ``div.ratingAndReview`` element.

    Args:
        x: URL of the project page.

    Returns:
        The rating text with surrounding newlines removed, or ``np.nan`` when
        either the review container or the rating value element is missing.
    """
    response = requests.get(x)
    soup = BeautifulSoup(response.text, 'html.parser')
    container = soup.find('div', class_='ratingAndReview')
    if container is None:
        return np.nan
    value = container.find('div', class_='ratingValue')
    # A container without a rating value used to raise AttributeError on
    # `None.text` and abort the whole column; treat it as missing instead.
    if value is None:
        return np.nan
    return value.text.strip('\n')

# Fill the 'rating' column by scraping each project's detail page.
new_df['rating'] = new_df['url'].apply(rating)


In [85]:
def project_size(x):
    """Scrape the short size summary of a project page.

    Fetches the page at ``x`` and reads the ``<ul>`` inside the
    ``div.tileShortInfoBox`` of the first ``div.projectInformationBox``.

    Args:
        x: URL of the project page.

    Returns:
        The list text split on newlines (a list of strings), or ``np.nan``
        when any of the three nested elements is missing.
    """
    response = requests.get(x)
    soup = BeautifulSoup(response.text, 'html.parser')
    info_box = soup.find('div', class_='projectInformationBox')
    short_info = info_box.find('div', class_='tileShortInfoBox') if info_box is not None else None
    items = short_info.find('ul') if short_info is not None else None
    # Missing inner elements used to raise AttributeError on `None.find` /
    # `None.text`; report them as missing data like a missing outer box.
    if items is None:
        return np.nan
    return items.text.strip('\n').split('\n')

# Fill the 'projectSize' column by scraping each project's detail page.
new_df['projectSize'] = new_df['url'].apply(project_size)

In [91]:
def project_loc(x):
    """Scrape the location label of a project page.

    Fetches the page at ``x`` and reads the text of the first ``<a>`` inside
    the first ``div.locationOverview`` element.

    Args:
        x: URL of the project page.

    Returns:
        The link text with surrounding newlines removed, or ``np.nan`` when
        the overview block or its link is missing.
    """
    response = requests.get(x)
    soup = BeautifulSoup(response.text, 'html.parser')
    overview = soup.find('div', class_='locationOverview')
    link = overview.find('a') if overview is not None else None
    # An overview block without a link used to raise AttributeError on
    # `None.text`; treat it as missing data instead.
    if link is None:
        return np.nan
    return link.text.strip('\n')

# Fill the 'location' column by scraping each project's detail page.
new_df['location'] = new_df['url'].apply(project_loc)

In [123]:
def price_list(x):
    """Scrape the unit price table of a project page.

    Fetches the page at ``x`` and walks the ``<tr>`` rows of the first
    ``div.priceList`` element, skipping row 0 (presumably the header row).
    For each remaining row it takes the first left-aligned ``<td>`` as the
    unit type, the first line of the first centred ``<td>`` as the area and
    the second centred ``<td>`` as the price.

    Args:
        x: URL of the project page.

    Returns:
        A list of dicts with keys ``'unit type'``, ``'area'`` and ``'price'``
        (one per well-formed row), or ``np.nan`` when the page has no
        ``div.priceList`` element.
    """
    response = requests.get(x)
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('div', class_='priceList')
    if table is None:
        return np.nan

    price_details = []
    for row in table.find_all('tr')[1:]:
        left_cells = row.find_all('td', align='left')
        center_cells = row.find_all('td', align='center')
        # Rows lacking one left-aligned and two centred cells used to raise
        # IndexError and abort the whole column; skip them instead.
        if not left_cells or len(center_cells) < 2:
            continue
        price_details.append({
            'unit type': left_cells[0].text.strip('\n'),
            'area': center_cells[0].text.strip('\n').split('\n')[0],
            'price': center_cells[1].text.strip('\n'),
        })
    return price_details

# Fill the 'priceList' column by scraping each project's detail page.
new_df['priceList'] = new_df['url'].apply(price_list)

In [168]:
# extract gym status
def gym(x):
    """Tell whether the project page at URL ``x`` lists a gymnasium.

    Performs its own HTTP GET of ``x``, takes the first ``div.amenities``
    element and does a case-sensitive substring search for "Gymnasium".

    Returns:
        "Yes" on a match; "No" on no match or when there is no amenities block.
    """
    page = BeautifulSoup(requests.get(x).text, 'html.parser')
    section = page.find('div', class_='amenities')
    if not section:
        return "No"
    return "Yes" if "Gymnasium" in section.text.strip('\n') else "No"
new_df['gym'] = new_df['url'].apply(gym)

In [170]:
# extract swimming pool status
def swimmingPool(x):
    """Tell whether the project page at URL ``x`` lists a swimming pool.

    Performs its own HTTP GET of ``x``, takes the first ``div.amenities``
    element and does a case-sensitive substring search for "Swimming Pool".

    Returns:
        "Yes" on a match; "No" on no match or when there is no amenities block.
    """
    page = BeautifulSoup(requests.get(x).text, 'html.parser')
    section = page.find('div', class_='amenities')
    if not section:
        return "No"
    return "Yes" if "Swimming Pool" in section.text.strip('\n') else "No"
new_df['swimmingPool'] = new_df['url'].apply(swimmingPool)

In [171]:
# extract security status
def security(x):
    """Tell whether the project page at URL ``x`` lists round-the-clock security.

    Performs its own HTTP GET of ``x``, takes the first ``div.amenities``
    element and does a case-sensitive substring search for "24 x 7 Security".

    Returns:
        "Yes" on a match; "No" on no match or when there is no amenities block.
    """
    page = BeautifulSoup(requests.get(x).text, 'html.parser')
    section = page.find('div', class_='amenities')
    if not section:
        return "No"
    return "Yes" if "24 x 7 Security" in section.text.strip('\n') else "No"
new_df['security'] = new_df['url'].apply(security)

In [173]:
# extract kids play area status
def play_area(x):
    """Tell whether the project page at URL ``x`` lists a kids' play area.

    Performs its own HTTP GET of ``x``, takes the first ``div.amenities``
    element and does a case-sensitive substring search for
    "Kids' Play Areas / Sand Pits".

    Returns:
        "Yes" on a match; "No" on no match or when there is no amenities block.
    """
    page = BeautifulSoup(requests.get(x).text, 'html.parser')
    section = page.find('div', class_='amenities')
    if not section:
        return "No"
    return "Yes" if "Kids' Play Areas / Sand Pits" in section.text.strip('\n') else "No"
new_df['Kids_Play_Areas'] = new_df['url'].apply(play_area)

In [174]:
# extract Tennis Court(s) status
def Tennis_Court(x):
    """Tell whether the project page at URL ``x`` lists a tennis court.

    Performs its own HTTP GET of ``x``, takes the first ``div.amenities``
    element and does a case-sensitive substring search for "Tennis Court(s)".

    Returns:
        "Yes" on a match; "No" on no match or when there is no amenities block.
    """
    page = BeautifulSoup(requests.get(x).text, 'html.parser')
    section = page.find('div', class_='amenities')
    if not section:
        return "No"
    return "Yes" if "Tennis Court(s)" in section.text.strip('\n') else "No"
new_df['Tennis Court'] = new_df['url'].apply(Tennis_Court)

In [176]:
# extract Badminton Court(s) status
def Badminton_Court(x):
    """Tell whether the project page at URL ``x`` lists a badminton court.

    Performs its own HTTP GET of ``x``, takes the first ``div.amenities``
    element and does a case-sensitive substring search for "Badminton Court(s)".

    Returns:
        "Yes" on a match; "No" on no match or when there is no amenities block.
    """
    page = BeautifulSoup(requests.get(x).text, 'html.parser')
    section = page.find('div', class_='amenities')
    if not section:
        return "No"
    return "Yes" if "Badminton Court(s)" in section.text.strip('\n') else "No"
new_df['Badminton Court'] = new_df['url'].apply(Badminton_Court)

In [177]:
# extract Squash Court status
def Squash_Court(x):
    """Tell whether the project page at URL ``x`` lists a squash court.

    Performs its own HTTP GET of ``x``, takes the first ``div.amenities``
    element and does a case-sensitive substring search for "Squash Court".

    Returns:
        "Yes" on a match; "No" on no match or when there is no amenities block.
    """
    page = BeautifulSoup(requests.get(x).text, 'html.parser')
    section = page.find('div', class_='amenities')
    if not section:
        return "No"
    return "Yes" if "Squash Court" in section.text.strip('\n') else "No"
new_df['Squash Court'] = new_df['url'].apply(Squash_Court)

In [178]:
# extract Jogging / Cycle Track status
def track(x):
    """Tell whether the project page at URL ``x`` lists a jogging/cycle track.

    Performs its own HTTP GET of ``x``, takes the first ``div.amenities``
    element and does a case-sensitive substring search for
    "Jogging / Cycle Track".

    Returns:
        "Yes" on a match; "No" on no match or when there is no amenities block.
    """
    page = BeautifulSoup(requests.get(x).text, 'html.parser')
    section = page.find('div', class_='amenities')
    if not section:
        return "No"
    return "Yes" if "Jogging / Cycle Track" in section.text.strip('\n') else "No"
new_df['running-cycling track'] = new_df['url'].apply(track)

In [179]:
# extract Basketball status
def Basketball(x):
    """Tell whether the project page at URL ``x`` lists basketball facilities.

    Performs its own HTTP GET of ``x``, takes the first ``div.amenities``
    element and does a case-sensitive substring search for "Basketball".

    Returns:
        "Yes" on a match; "No" on no match or when there is no amenities block.
    """
    page = BeautifulSoup(requests.get(x).text, 'html.parser')
    section = page.find('div', class_='amenities')
    if not section:
        return "No"
    return "Yes" if "Basketball" in section.text.strip('\n') else "No"
new_df['Basketball court'] = new_df['url'].apply(Basketball)

In [180]:
# extract Yoga Areas status
def yoga(x):
    """Tell whether the project page at URL ``x`` lists yoga areas.

    Performs its own HTTP GET of ``x``, takes the first ``div.amenities``
    element and does a case-sensitive substring search for "Yoga Areas".

    Returns:
        "Yes" on a match; "No" on no match or when there is no amenities block.
    """
    page = BeautifulSoup(requests.get(x).text, 'html.parser')
    section = page.find('div', class_='amenities')
    if not section:
        return "No"
    return "Yes" if "Yoga Areas" in section.text.strip('\n') else "No"
new_df['Yoga Areas'] = new_df['url'].apply(yoga)

In [182]:
# extract Power Backup status
def backup(x):
    """Tell whether the project page at URL ``x`` lists power backup.

    Performs its own HTTP GET of ``x``, takes the first ``div.amenities``
    element and does a case-sensitive substring search for "Power Backup".

    Returns:
        "Yes" on a match; "No" on no match or when there is no amenities block.
    """
    page = BeautifulSoup(requests.get(x).text, 'html.parser')
    section = page.find('div', class_='amenities')
    if not section:
        return "No"
    return "Yes" if "Power Backup" in section.text.strip('\n') else "No"
new_df['Power Backup'] = new_df['url'].apply(backup)

In [183]:
# extract lift status
def lift(x):
    """Tell whether the project page at URL ``x`` lists a lift.

    Performs its own HTTP GET of ``x``, takes the first ``div.amenities``
    element and does a case-sensitive substring search for "Lift".

    Returns:
        "Yes" on a match; "No" on no match or when there is no amenities block.
    """
    page = BeautifulSoup(requests.get(x).text, 'html.parser')
    section = page.find('div', class_='amenities')
    if not section:
        return "No"
    return "Yes" if "Lift" in section.text.strip('\n') else "No"
new_df['Lift'] = new_df['url'].apply(lift)

In [184]:
# extract Fire Fighting Systems status
def system(x):
    """Tell whether the project page at URL ``x`` lists fire fighting systems.

    Performs its own HTTP GET of ``x``, takes the first ``div.amenities``
    element and does a case-sensitive substring search for
    "Fire Fighting Systems".

    Returns:
        "Yes" on a match; "No" on no match or when there is no amenities block.
    """
    page = BeautifulSoup(requests.get(x).text, 'html.parser')
    section = page.find('div', class_='amenities')
    if not section:
        return "No"
    return "Yes" if "Fire Fighting Systems" in section.text.strip('\n') else "No"
new_df['Fire Fighting Systems'] = new_df['url'].apply(system)

In [185]:
# extract Party Hall status
def hall(x):
    """Tell whether the project page at URL ``x`` lists a party hall.

    Performs its own HTTP GET of ``x``, takes the first ``div.amenities``
    element and does a case-sensitive substring search for "Party Hall".

    Returns:
        "Yes" on a match; "No" on no match or when there is no amenities block.
    """
    page = BeautifulSoup(requests.get(x).text, 'html.parser')
    section = page.find('div', class_='amenities')
    if not section:
        return "No"
    return "Yes" if "Party Hall" in section.text.strip('\n') else "No"
new_df['Party Hall'] = new_df['url'].apply(hall)

In [186]:
# extract Indoor Games status
def in_games(x):
    """Tell whether the project page at URL ``x`` lists indoor games.

    Performs its own HTTP GET of ``x``, takes the first ``div.amenities``
    element and does a case-sensitive substring search for "Indoor Games".

    Returns:
        "Yes" on a match; "No" on no match or when there is no amenities block.
    """
    page = BeautifulSoup(requests.get(x).text, 'html.parser')
    section = page.find('div', class_='amenities')
    if not section:
        return "No"
    return "Yes" if "Indoor Games" in section.text.strip('\n') else "No"
new_df['indoor_games'] = new_df['url'].apply(in_games)

In [193]:
# extract  theatre status
def theatre(x):
    """Tell whether the project page at URL ``x`` lists a theatre-type amenity.

    Performs its own HTTP GET of ``x``, takes the first ``div.amenities``
    element and does a case-sensitive substring search for any of
    "Mini Theatre", "Amphitheater" or "Multiplex".

    Returns:
        "Yes" on a match; "No" on no match or when there is no amenities block.
    """
    page = BeautifulSoup(requests.get(x).text, 'html.parser')
    section = page.find('div', class_='amenities')
    if not section:
        return "No"
    listed = section.text.strip('\n')
    labels = ("Mini Theatre", "Amphitheater", "Multiplex")
    return "Yes" if any(label in listed for label in labels) else "No"
new_df['theatre'] = new_df['url'].apply(theatre)

In [195]:
# extract retail_store status
def retail_store(x):
    """Tell whether the project page at URL ``x`` lists retail outlets.

    Performs its own HTTP GET of ``x``, takes the first ``div.amenities``
    element and does a case-sensitive substring search for either
    "Multi Brand Retail" or "High Street Retail".

    Returns:
        "Yes" on a match; "No" on no match or when there is no amenities block.
    """
    page = BeautifulSoup(requests.get(x).text, 'html.parser')
    section = page.find('div', class_='amenities')
    if not section:
        return "No"
    listed = section.text.strip('\n')
    labels = ("Multi Brand Retail", "High Street Retail")
    return "Yes" if any(label in listed for label in labels) else "No"
new_df['retail_store'] = new_df['url'].apply(retail_store)

In [196]:
# extract food court status
def food(x):
    """Tell whether the project page at URL ``x`` lists a food court.

    Performs its own HTTP GET of ``x``, takes the first ``div.amenities``
    element and does a case-sensitive substring search for "Food Court".

    Returns:
        "Yes" on a match; "No" on no match or when there is no amenities block.
    """
    page = BeautifulSoup(requests.get(x).text, 'html.parser')
    section = page.find('div', class_='amenities')
    if not section:
        return "No"
    return "Yes" if "Food Court" in section.text.strip('\n') else "No"
new_df['Food Court'] = new_df['url'].apply(food)

In [203]:
# extract Clubhouse status
def Clubhouse(x):
    """Tell whether the project page at URL ``x`` lists a clubhouse.

    Performs its own HTTP GET of ``x``, takes the first ``div.amenities``
    element and does a case-sensitive substring search for "Clubhouse" or
    "Luxurious Clubhouse" (the second label already contains the first).

    Returns:
        "Yes" on a match; "No" on no match or when there is no amenities block.
    """
    page = BeautifulSoup(requests.get(x).text, 'html.parser')
    section = page.find('div', class_='amenities')
    if not section:
        return "No"
    listed = section.text.strip('\n')
    labels = ("Clubhouse", "Luxurious Clubhouse")
    return "Yes" if any(label in listed for label in labels) else "No"
new_df['Clubhouse'] = new_df['url'].apply(Clubhouse)

In [198]:
# extract garden status
def garden(x):
    """Tell whether the project page at URL ``x`` lists a green area or park.

    Performs its own HTTP GET of ``x``, takes the first ``div.amenities``
    element and does a case-sensitive substring search for either
    "Large Green Area" or "Normal Park / Central Green".

    Returns:
        "Yes" on a match; "No" on no match or when there is no amenities block.
    """
    page = BeautifulSoup(requests.get(x).text, 'html.parser')
    section = page.find('div', class_='amenities')
    if not section:
        return "No"
    listed = section.text.strip('\n')
    labels = ("Large Green Area", "Normal Park / Central Green")
    return "Yes" if any(label in listed for label in labels) else "No"
new_df['garden'] = new_df['url'].apply(garden)

In [200]:
# Preview the first five rows with all scraped columns added.
new_df.head()

Unnamed: 0,thumbnail,url,property,price,rating,projectSize,location,priceList,gym,swimmingPool,...,Power Backup,Lift,Fire Fighting Systems,Party Hall,indoor_games,theatre,retail_store,Food Court,Clubhouse,garden
0,https://www.squareyards.com/cdn-cgi/image/widt...,https://www.squareyards.com/gurgaon-residentia...,Signature Global City 81,₹ 88.30 Lac - 1.10 Cr,5.0,"[1035 units , 11.97 Acres]",Sector 81 Gurgaon,"[{'unit type': '2 BHK 955 Sq. Ft. Ind Floor', ...",Yes,Yes,...,Yes,Yes,Yes,Yes,Yes,Yes,No,No,Yes,Yes
1,https://static.squareyards.com/cdn-cgi/image/w...,https://www.squareyards.com/gurgaon-residentia...,Signature Global Park,₹ 37.90 Lac - 73.27 Lac,5.0,"[2204 units , 25 Acres]",Sohna Sector 36 Gurgaon,"[{'unit type': '2 BHK 514 Sq. Ft. Ind Floor', ...",Yes,Yes,...,Yes,Yes,Yes,Yes,Yes,Yes,No,No,Yes,Yes
2,https://static.squareyards.com/cdn-cgi/image/w...,https://www.squareyards.com/gurgaon-residentia...,M3M Antalya Hills,₹ 1.13 Cr - 1.50 Cr,5.0,"[2704 units , 53.8 Acres]",Sector 79 Gurgaon,"[{'unit type': '2 BHK 1193 Sq. Ft. Ind Floor',...",Yes,Yes,...,Yes,No,Yes,Yes,Yes,Yes,No,No,Yes,Yes
3,https://static.squareyards.com/cdn-cgi/image/w...,https://www.squareyards.com/gurgaon-residentia...,Smart World Orchard,₹ 1.25 Cr - 2.57 Cr,4.6,"[2289 units , 23.29 Acres]",Sector 61 Gurgaon,"[{'unit type': '2 BHK 1120 Sq. Ft. Ind Floor',...",Yes,Yes,...,Yes,No,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes
4,https://static.squareyards.com/cdn-cgi/image/w...,https://www.squareyards.com/gurgaon-residentia...,M3M Crown,₹ 2.22 Cr - 3.61 Cr,5.0,"[1332 units , 16 Acres]",Sector 111 Gurgaon,"[{'unit type': '3 BHK 1555 Sq. Ft. Apartment',...",Yes,Yes,...,Yes,Yes,Yes,Yes,Yes,Yes,No,No,Yes,Yes


In [204]:
# Write the scraped table to the working directory. index=True also writes the
# row index as the first CSV column (it will have an empty header).
new_df.to_csv('gurgaon_project.csv',index=True)