# Part 1, web scrapying

There are multiple libs availiable for extract information from html, as a newbie, Requests and beautifulsoup should work for me.

## import libs

In [86]:
import requests
import re
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

## set up an example link to work on

without setting the headers, the requests will return: permission denied. we need to pretend we're browsers by setting up a user-agent. I used the link below to get the agent headers.

[What is my user-agent?](https://www.whatsmyua.info/)

In this case I search for rentals in clayton area, use page one as an example for writing my code.
Once I got the code done I will apply it to all pages.

next define a function to extract features I want from the link.

In [102]:
df = pd.DataFrame(columns=['property_type','price','bond','address','feature_bedroom',
                           'feature_bathroom','feature_parking','agent_brand',
                           'agent_name','available_date','property_details'])

In [119]:
def get_num_pages(suburb,headers):
    '''
    a sub function to get total number of pages of a suburb
    arguments:
    suburb: a string of suburb name
    headers: user agent headers
    
    return:
    page_num: total number of pages of a suburb, integer
    '''
    # get num of pages availiable
    firstpage = "https://www.realestate.com.au/rent/in-"+suburb+"/list-1"
    response = requests.get(firstpage, headers = headers)
    if response.status_code != 200:
        print('Link Error, try again')
    else:
        soup = BeautifulSoup(response.content, 'html.parser')
        pages = soup.find_all(class_="pagination__link rui-button-basic")
    return int(pages[-1].get_text())

In [129]:
def get_features(response,df):
    '''
    this function extract features from the html, and style it to a dataframe
    arguments:
    response: output of requests.get(url)
    df: a dataframe to store the features
    
    return:
    df: the data frame contain features.
    '''
    if response.status_code != 200:
        print('Link Error, try again')
    else:
        soup = BeautifulSoup(response.content, 'html.parser')
        html = list(soup.children)[2]
        cards = html.find(class_="tiered-results tiered-results--exact")
        articles = cards.select("div article")
        if len(articles) == 0:
            print("no records on this page")
            return df
        for article in articles:
            card = article.find(class_="residential-card__content")
            price = card.find(class_="property-price").get_text().split()[0][1:]
            address_card = card.find(class_="residential-card__address-heading")
            address = address_card.select("h2 a span")[0].get_text()
            address_link = 'https://www.realestate.com.au' + address_card.find_all("a")[0].get('href')
            try:
                feature_bedroom = card.find(class_="general-features__icon general-features__beds").get_text()
            except:
                feature_bedroom = 0
            try:
                feature_bathroom = card.find(class_="general-features__icon general-features__baths").get_text()
            except:
                feature_bathroom = 0
            try:
                feature_parking = card.find(class_="general-features__icon general-features__cars").get_text()
            except:
                feature_parking = 0
            property_type = card.find(class_="residential-card__property-type").get_text()
            try:
                agent_brand = article.find(class_="branding__image").get('alt')
            except:
                agent_brand = np.nan
            try:
                agent_name = article.find(class_="agent__name").get_text()
            except:
                agent_name = np.nan
            detail_response = requests.get(address_link, headers = headers)
            if detail_response.status_code == 200:
                detail_soup = BeautifulSoup(detail_response.content, 'html.parser')
                try:
                    bond = detail_soup.select("div.property-info__property-price-details p")[0].get_text()
                except:
                    bond = np.nan
                available_date = detail_soup.find(class_="property-info__footer-content").get_text()
                property_details = detail_soup.find(class_="property-description__content").get_text()
            else:
                bond,available_date,property_details = np.nan, np.nan, np.nan
                print('LINK_ERROR')
            df_temp = pd.DataFrame({'property_type':[property_type],
                                    'price':[price],
                                    'bond':[bond],
                                    'address':[address],
                                    'feature_bedroom':[feature_bedroom],
                                    'feature_bathroom':[feature_bathroom],
                                    'feature_parking':[feature_parking],
                                    'agent_brand':[agent_brand],
                                    'agent_name':[agent_name],
                                    'available_date':[available_date],
                                    'property_details':[property_details]})
            df = df.append(df_temp)
    return df



    

## go through all pages

In [130]:
# define inputs for the spider function
# define the user header
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36"}
# define the suburb list 
suburbs = ["clayton","chadstone","mount+waverley,+vic+3149",
           "hungtingdale","glen+waverley,+vic+3150",
           "springvale,+vic+3171","melbourne","south+melbourne,+vic+3205",
           "southbank,+vic+3006","south+yarra,+vic+3141","toorak,+vic+3142",
           "richmond,+vic+3121",]
# initial empty dataframe with column names
df = pd.DataFrame(columns=['property_type','price','bond','address','feature_bedroom',
                           'feature_bathroom','feature_parking','agent_brand',
                           'agent_name','available_date','property_details'])
def webspider(suburbs,headers,df):
    '''
    grab rental information of a centain suburb from a list of suburbs
    arguments:
    suburbs: a array of suburb names
    headers: headers of user browser agent
    df: an empty dataframe with column names
    
    return:
    df: a dataframe with rental data
    '''
    print('we will process %i suburbs' % len(suburbs))
    for index,suburb in enumerate(suburbs):
        page_num = get_num_pages(suburb,headers)
        print('processing %i suburb: %s, total of %i pages' % (index+1,suburb,page_num))
        url_base = "https://www.realestate.com.au/rent/in-"+suburb+"/list-"
        # enumrat through all pages of a suburb
        for i in range(1,page_num):
            url = url_base + str(i)
            response = requests.get(url, headers = headers)
            print('start processing page %s' % i)
            df = get_features(response,df)
            print('end of this page, have a total of %i records' % df.shape[0])
            
    return df

df = webspider(suburbs,headers,df)
       

we will process 12 suburbs
processing 1 suburb: clayton, total of 24 pages
start processing page 1
end of this page, have a total of 25 records
start processing page 2
end of this page, have a total of 50 records
start processing page 3
end of this page, have a total of 75 records
start processing page 4
end of this page, have a total of 100 records
start processing page 5
end of this page, have a total of 125 records
start processing page 6
end of this page, have a total of 150 records
start processing page 7
end of this page, have a total of 172 records
start processing page 8
no records on this page
end of this page, have a total of 172 records
start processing page 9
no records on this page
end of this page, have a total of 172 records
start processing page 10
no records on this page
end of this page, have a total of 172 records
start processing page 11
no records on this page
end of this page, have a total of 172 records
start processing page 12
no records on this page
end of this

start processing page 7
no records on this page
end of this page, have a total of 315 records
start processing page 8
no records on this page
end of this page, have a total of 315 records
start processing page 9
no records on this page
end of this page, have a total of 315 records
start processing page 10
no records on this page
end of this page, have a total of 315 records
start processing page 11
no records on this page
end of this page, have a total of 315 records
start processing page 12
no records on this page
end of this page, have a total of 315 records
start processing page 13
no records on this page
end of this page, have a total of 315 records
start processing page 14
no records on this page
end of this page, have a total of 315 records
start processing page 15
no records on this page
end of this page, have a total of 315 records
start processing page 16
no records on this page
end of this page, have a total of 315 records
start processing page 17
no records on this page
end 

start processing page 50
no records on this page
end of this page, have a total of 1648 records
start processing page 51
no records on this page
end of this page, have a total of 1648 records
start processing page 52
no records on this page
end of this page, have a total of 1648 records
start processing page 53
no records on this page
end of this page, have a total of 1648 records
start processing page 54
no records on this page
end of this page, have a total of 1648 records
start processing page 55
no records on this page
end of this page, have a total of 1648 records
start processing page 56
no records on this page
end of this page, have a total of 1648 records
start processing page 57
no records on this page
end of this page, have a total of 1648 records
start processing page 58
no records on this page
end of this page, have a total of 1648 records
start processing page 59
no records on this page
end of this page, have a total of 1648 records
start processing page 60
no records on t

start processing page 57
no records on this page
end of this page, have a total of 1731 records
start processing page 58
no records on this page
end of this page, have a total of 1731 records
start processing page 59
no records on this page
end of this page, have a total of 1731 records
start processing page 60
no records on this page
end of this page, have a total of 1731 records
start processing page 61
no records on this page
end of this page, have a total of 1731 records
start processing page 62
no records on this page
end of this page, have a total of 1731 records
start processing page 63
no records on this page
end of this page, have a total of 1731 records
start processing page 64
no records on this page
end of this page, have a total of 1731 records
start processing page 65
no records on this page
end of this page, have a total of 1731 records
start processing page 66
no records on this page
end of this page, have a total of 1731 records
start processing page 67
no records on t

start processing page 68
no records on this page
end of this page, have a total of 2172 records
start processing page 69
no records on this page
end of this page, have a total of 2172 records
start processing page 70
no records on this page
end of this page, have a total of 2172 records
start processing page 71
no records on this page
end of this page, have a total of 2172 records
start processing page 72
no records on this page
end of this page, have a total of 2172 records
start processing page 73
no records on this page
end of this page, have a total of 2172 records
start processing page 74
no records on this page
end of this page, have a total of 2172 records
start processing page 75
no records on this page
end of this page, have a total of 2172 records
start processing page 76
no records on this page
end of this page, have a total of 2172 records
start processing page 77
no records on this page
end of this page, have a total of 2172 records
start processing page 78
no records on t

start processing page 78
no records on this page
end of this page, have a total of 2566 records
start processing page 79
no records on this page
end of this page, have a total of 2566 records
processing 11 suburb: toorak,+vic+3142, total of 63 pages
start processing page 1
end of this page, have a total of 2591 records
start processing page 2
end of this page, have a total of 2616 records
start processing page 3
end of this page, have a total of 2641 records
start processing page 4
end of this page, have a total of 2655 records
start processing page 5
no records on this page
end of this page, have a total of 2655 records
start processing page 6
no records on this page
end of this page, have a total of 2655 records
start processing page 7
no records on this page
end of this page, have a total of 2655 records
start processing page 8
no records on this page
end of this page, have a total of 2655 records
start processing page 9
no records on this page
end of this page, have a total of 2655

start processing page 25
no records on this page
end of this page, have a total of 2860 records
start processing page 26
no records on this page
end of this page, have a total of 2860 records
start processing page 27
no records on this page
end of this page, have a total of 2860 records
start processing page 28
no records on this page
end of this page, have a total of 2860 records
start processing page 29
no records on this page
end of this page, have a total of 2860 records
start processing page 30
no records on this page
end of this page, have a total of 2860 records
start processing page 31
no records on this page
end of this page, have a total of 2860 records
start processing page 32
no records on this page
end of this page, have a total of 2860 records
start processing page 33
no records on this page
end of this page, have a total of 2860 records
start processing page 34
no records on this page
end of this page, have a total of 2860 records
start processing page 35
no records on t

In [141]:
from datetime import date
today = date.today()
d1 = today.strftime("%d_%m_%Y")
csvtitle = d1 + "_rentals_raw.csv"
df.to_csv(csvtitle,index=False)