# Part 1, web scrapying

There are multiple libs availiable for extract information from html, as a newbie, Requests and beautifulsoup should work for me.

## import libs

In [9]:
import requests
import re
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

## set up an example link to work on

without setting the headers, the requests will return: permission denied. we need to pretend we're browsers by setting up a user-agent. I used the link below to get the agent headers.

[What is my user-agent?](https://www.whatsmyua.info/)

In this case I search for rentals in clayton area, use page one as an example for writing my code.
Once I got the code done I will apply it to all pages.

In [23]:
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36"}
url_base = "https://www.realestate.com.au/rent/in-clayton/list-"
#response = requests.get(url, headers = headers)

next define a function to extract features I want from the link.

In [75]:
df = pd.DataFrame(columns=['property_type','price','bond','address','feature_bedroom',
                           'feature_bathroom','feature_parking','agent_brand',
                           'agent_name','available_date','property_details'])

In [74]:
def get_features(response,df):
    '''
    this function extract features from the html, and style it to a dataframe
    arguments:
    response: output of requests.get(url)
    df: a dataframe to store the features
    
    return:
    df: the data frame contain features.
    '''
    if response.status_code != 200:
        print('Link Error, try again')
    else:
        soup = BeautifulSoup(response.content, 'html.parser')
        html = list(soup.children)[2]
        cards = html.find(class_="tiered-results tiered-results--exact")
        articles = cards.select("div article")
        for article in articles:
            card = article.find(class_="residential-card__content")
            price = card.find(class_="property-price").get_text().split()[0][1:]
            address_card = card.find(class_="residential-card__address-heading")
            address = address_card.select("h2 a span")[0].get_text()
            address_link = 'https://www.realestate.com.au' + address_card.find_all("a")[0].get('href')
            features = card.find_all(class_="general-features__feature")
            factors = len(features)
            feature_bedroom = features[0].get('aria-label').split()[0]
            try:
                feature_bathroom = features[1].get('aria-label').split()[0]
            except:
                feature_bathroom = np.nan
            if factors == 3:
                feature_parking = features[2].get('aria-label').split()[0]
            else:
                feature_parking = 0
            property_type = card.find(class_="residential-card__property-type").get_text()
            try:
                agent_brand = article.find(class_="branding__image").get('alt')
            except:
                print("no agent")
                agent_brand = np.nan
            try:
                agent_name = article.find(class_="agent__name").get_text()
            except:
                print("no agent name")
                agent_name = np.nan
            detail_response = requests.get(address_link, headers = headers)
            if detail_response.status_code == 200:
                detail_soup = BeautifulSoup(detail_response.content, 'html.parser')
                try:
                    bond = detail_soup.select("div.property-info__property-price-details p")[0].get_text()
                except:
                    bond = np.nan
                    print("error bond")
                available_date = detail_soup.find(class_="property-info__footer-content").get_text()
                property_details = detail_soup.find(class_="property-description__content").get_text()
            else:
                bond,available_date,property_details = np.nan, np.nan, np.nan
                print('LINK_ERROR')
            df_temp = pd.DataFrame({'property_type':[property_type],
                                    'price':[price],
                                    'bond':[bond],
                                    'address':[address],
                                    'feature_bedroom':[feature_bedroom],
                                    'feature_bathroom':[feature_bathroom],
                                    'feature_parking':[feature_parking],
                                    'agent_brand':[agent_brand],
                                    'agent_name':[agent_name],
                                    'available_date':[available_date],
                                    'property_details':[property_details]})
            df = df.append(df_temp)
    return df



    

## go through all pages

In [76]:
for i in range(1,25):
    url = url_base + str(i)
    response = requests.get(url, headers = headers)
    print('start processing page %s' % i)
    df = get_features(response,df)
    print('end of this page',df.shape)

start processing page 1
end of this page (25, 11)
start processing page 2
end of this page (50, 11)
start processing page 3
error bond
no agent name
no agent name
no agent name
no agent name
end of this page (75, 11)
start processing page 4
no agent name
no agent name
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
end of this page (100, 11)
start processing page 5
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent name
no agent

In [78]:
df_copy = df.copy()

In [79]:
url_base2 = "https://www.realestate.com.au/rent/in-chadstone/list-"
for i in range(1,25):
    url = url_base + str(i)
    response = requests.get(url, headers = headers)
    print('start processing page %s' % i)
    df = get_features(response,df)
    print('end of this page',df.shape)

start processing page 1
end of this page (200, 11)
start processing page 2
end of this page (225, 11)
start processing page 3
error bond
no agent name
no agent name
no agent name
no agent name
end of this page (250, 11)
start processing page 4
no agent name
no agent name
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
end of this page (275, 11)
start processing page 5
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent name
no ag

In [80]:
url_base3 = "https://www.realestate.com.au/rent/in-mount+waverley,+vic+3149/list-"
for i in range(1,37):
    url = url_base + str(i)
    response = requests.get(url, headers = headers)
    print('start processing page %s' % i)
    df = get_features(response,df)
    print('end of this page',df.shape)

start processing page 1
end of this page (375, 11)
start processing page 2
end of this page (400, 11)
start processing page 3
error bond
no agent name
no agent name
no agent name
no agent name
end of this page (425, 11)
start processing page 4
no agent name
no agent name
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
end of this page (450, 11)
start processing page 5
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent name
no ag

In [81]:
url_base5 = "https://www.realestate.com.au/rent/in-hungtingdale/list-"
for i in range(1,20):
    url = url_base + str(i)
    response = requests.get(url, headers = headers)
    print('start processing page %s' % i)
    df = get_features(response,df)
    print('end of this page',df.shape)

start processing page 1
end of this page (550, 11)
start processing page 2
end of this page (575, 11)
start processing page 3
error bond
no agent name
no agent name
no agent name
no agent name
end of this page (600, 11)
start processing page 4
no agent name
no agent name
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
end of this page (625, 11)
start processing page 5
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent name
no ag

In [82]:
df_copy = df.copy()

In [83]:

url_base6 = "https://www.realestate.com.au/rent/in-glen+waverley,+vic+3150/list-"
for i in range(1,20):
    url = url_base + str(i)
    response = requests.get(url, headers = headers)
    print('start processing page %s' % i)
    df = get_features(response,df)
    print('end of this page',df.shape)

start processing page 1
end of this page (725, 11)
start processing page 2
end of this page (750, 11)
start processing page 3
error bond
no agent name
no agent name
no agent name
no agent name
end of this page (775, 11)
start processing page 4
no agent name
no agent name
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
end of this page (800, 11)
start processing page 5
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent name
no ag

In [84]:
url_base7 = "https://www.realestate.com.au/rent/in-springvale,+vic+3171/list-"
for i in range(1,20):
    url = url_base + str(i)
    response = requests.get(url, headers = headers)
    print('start processing page %s' % i)
    df = get_features(response,df)
    print('end of this page',df.shape)

start processing page 1
end of this page (900, 11)
start processing page 2
end of this page (925, 11)
start processing page 3
error bond
no agent name
no agent name
no agent name
no agent name
end of this page (950, 11)
start processing page 4
no agent name
no agent name
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
end of this page (975, 11)
start processing page 5
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent
no agent name
no agent name
no ag

In [85]:
df.to_csv('20191111_rentals.csv',index=False)