In [357]:
from datetime import date, timedelta
import itertools
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time
import re

In [358]:
class Scraper:
    """Headless-Chrome scraper for a single listings results page.

    The constructor starts a Chrome session and loads ``url``;
    ``get_details`` then reads the listing fields from the loaded page and
    returns them as a DataFrame. The session is ended by ``get_details``, so
    each instance is good for exactly one ``get_details`` call.
    """

    # CSS class names used to locate each field on the results page.
    PRICE_CLASS = 'css-9hd67m'
    ADDRESS_CLASS = 'css-bqbbuf'
    SOLD_DATE_CLASS = 'css-1nj9ymt'
    HOUSING_TYPE_CLASS = 'css-693528'
    LAYOUT_CLASS = 'css-1ie6g1l'

    # The layout texts are grouped per listing as (beds, baths, parking).
    LAYOUT_FIELDS_PER_LISTING = 3

    def __init__(self, url, driver_path='/Users/victorcruzdefaria/Downloads/chromedriver'):
        """Start a headless Chrome session and load ``url``.

        Args:
            url: Address of the page to load.
            driver_path: Filesystem path of the chromedriver executable.
                Defaults to the location that was previously hard-coded.
        """
        self.options = webdriver.ChromeOptions()
        self.options.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.109 Safari/537.36')
        self.options.add_argument('--disable-blink-features=AutomationControlled')
        self.options.add_experimental_option("excludeSwitches", ["enable-automation"])
        self.options.add_experimental_option('useAutomationExtension', False)
        # Pass the flag directly: the `options.headless` property was
        # deprecated and later removed in Selenium 4, where assigning to it
        # no longer makes the browser headless.
        self.options.add_argument('--headless')
        self.driver = webdriver.Chrome(service=Service(driver_path), options=self.options)
        try:
            self.driver.get(url)
        except Exception:
            # Do not leave a browser/chromedriver process behind when the
            # page cannot be loaded.
            self.driver.quit()
            raise

    def _texts(self, class_name):
        """Return the text of every element carrying the CSS class ``class_name``."""
        return [element.text for element in self.driver.find_elements(By.CLASS_NAME, class_name)]

    def get_details(self, wait_seconds=5):
        """Read the listings from the loaded page and end the browser session.

        Args:
            wait_seconds: Seconds to sleep before reading the page
                (presumably to let it finish rendering). Defaults to 5.

        Returns:
            A DataFrame of raw, uncleaned strings with the columns
            ``address``, ``housing_type``, ``sold_date``, ``n_beds``,
            ``n_bath``, ``n_garage`` and ``prices``; one row per listing.

        The browser is quit even when reading the page fails.
        """
        time.sleep(wait_seconds)

        try:
            prices = self._texts(self.PRICE_CLASS)
            full_address = self._texts(self.ADDRESS_CLASS)
            date_sold = self._texts(self.SOLD_DATE_CLASS)
            housing_type = self._texts(self.HOUSING_TYPE_CLASS)

            # The layout class also matches the property size, which sometimes
            # follows the beds/baths/parking values. Any text containing an
            # "m" is dropped so that only those three values remain.
            layout_texts = [text for text in self._texts(self.LAYOUT_CLASS) if 'm' not in text]

            # Group the flat layout list into one [beds, baths, parking] chunk
            # per listing.
            step = self.LAYOUT_FIELDS_PER_LISTING
            layout_info = [layout_texts[start:start + step] for start in range(0, len(layout_texts), step)]

            # zip() stops at the shortest list, so listings beyond that length
            # are silently left out.
            rows = [
                [address, kind, sold, *layout, price]
                for address, kind, sold, layout, price
                in zip(full_address, housing_type, date_sold, layout_info, prices)
            ]
            return pd.DataFrame(
                rows,
                columns=['address', 'housing_type', 'sold_date', 'n_beds', 'n_bath', 'n_garage', 'prices'],
            )
        finally:
            # quit() ends the whole WebDriver session (browser and
            # chromedriver); close() would only close the current window.
            self.driver.quit()

In [359]:
def clean_prices(df):
    """Convert the ``prices`` column of ``df`` to integers, in place.

    Every run of non-digit characters is removed from each price string
    before the column is cast to ``int``. Nothing is returned.
    """
    def digits_only(raw_price):
        return re.sub('[^0-9]+', '', raw_price)

    stripped_prices = df['prices'].apply(digits_only)
    df['prices'] = stripped_prices.astype('int')

In [360]:
def cleaning_address(df):
    """Split the raw ``address`` column into street, suburb, state and postcode.

    Works in place and returns nothing. ``address`` is overwritten with the
    street part and the columns ``suburb``, ``state`` (categorical) and
    ``postcode`` are added.

    Each value is assumed to look like ``"<street>, <SUBURB> <STATE> <POSTCODE>"``.
    Both splits are taken from the right so that:
      * commas inside the street part (e.g. ``"Unit 2, 10 High Road, ..."``)
        stay in ``address`` instead of breaking the two-column assignment;
      * multi-word suburbs (e.g. ``"GLEN HUNTLY VIC 3163"``) stay whole
        instead of spilling into ``state`` and ``postcode``.
    """
    # Only the last comma separates the street from the locality.
    street_and_locality = df['address'].str.rsplit(',', n=1, expand=True)

    # The last two whitespace-separated tokens are state and postcode;
    # whatever precedes them is the suburb name.
    locality = street_and_locality[1].str.strip().str.rsplit(n=2, expand=True)

    df['address'] = street_and_locality[0].str.strip()
    df['suburb'] = locality[0].str.strip()
    df['state'] = locality[1].astype('category')
    df['postcode'] = locality[2]

In [361]:
def cleaning_date(df):
    """Parse the ``sold_date`` column of ``df`` into datetimes, in place.

    Only the last three whitespace-separated tokens of each value are kept;
    they are joined with "/" and lower-cased, then parsed with the
    day/abbreviated-month/year format. Nothing is returned.
    """
    def trailing_date(tokens):
        # Keep the final three tokens and normalise them to "dd/mon/yyyy".
        return '/'.join(tokens[-3:]).lower()

    tokenised = df['sold_date'].str.split()
    df['sold_date'] = tokenised.apply(trailing_date)
    df['sold_date'] = pd.to_datetime(df['sold_date'], dayfirst=True, format="%d/%b/%Y")

In [362]:
def cleaning_layout(df):
    """Convert the bed, bath and garage columns of ``df`` to integers, in place.

    For each of ``n_beds``, ``n_bath`` and ``n_garage`` the first
    whitespace-separated token of every value is kept and cast to ``int``.
    Nothing is returned.
    """
    def leading_token(tokens):
        return tokens[0]

    for column in ('n_beds', 'n_bath', 'n_garage'):
        tokenised = df[column].str.split()
        df[column] = tokenised.apply(leading_token).astype('int')

In [363]:
def cleaning_housing_type(df):
    """Reduce ``housing_type`` to its first "/"-separated label, in place.

    Each value is split on "/", the first piece is stripped of surrounding
    whitespace, and the column is stored as a categorical. Nothing is
    returned.
    """
    def primary_label(pieces):
        return pieces[0].strip()

    split_labels = df['housing_type'].str.split('/')
    df['housing_type'] = split_labels.apply(primary_label).astype('category')

In [364]:
class ScraperSuburbs(Scraper):
    """Unfinished variant of ``Scraper``.

    ``get_details`` currently reads four fields from the page but does not
    use them: it returns ``None`` and does not close the browser.
    """

    def __init__(self, url):
        # Construction is forwarded unchanged to Scraper.
        super().__init__(url)

    def get_details(self):
        """Wait five seconds, then read prices, addresses, sold dates and housing types.

        TODO: the collected lists are discarded; nothing is returned yet.
        """
        time.sleep(5)

        def texts_for(css_class):
            # Text of every element on the page carrying the given CSS class.
            return [element.text for element in self.driver.find_elements(By.CLASS_NAME, css_class)]

        prices = texts_for('css-9hd67m')
        full_address = texts_for('css-bqbbuf')
        date_sold = texts_for('css-1nj9ymt')
        housing_type = texts_for('css-693528')


In [3]:
def url_generator(number_pages, suburb_postcode=None):
    """Build, print and return the sold-listings URLs for pages 1..number_pages.

    Args:
        number_pages: How many result pages to generate URLs for. Zero or a
            negative number yields an empty list.
        suburb_postcode: Currently unused; kept so existing calls keep
            working. The postcode in the URL is fixed to 3168.

    Returns:
        The list of page URLs, in page order.

    This function only builds URLs. It does not start a browser: starting
    one Chrome session per page here, without ever using or quitting it,
    left a browser process running for every page. To fetch a page, pass
    one of the returned URLs to ``Scraper``.
    """
    url_list = []
    # Result pages are numbered from 1.
    for page in range(1, number_pages + 1):
        page_url = f'https://www.domain.com.au/sold-listings/?postcode=3168&price=0-5000000&excludepricewithheld=1&page={page}'
        url_list.append(page_url)
        print(page_url)
    return url_list


# Assigned so the notebook does not echo the returned list after the printed URLs.
page_urls = url_generator(30)

https://www.domain.com.au/sold-listings/?postcode=3168&price=0-5000000&excludepricewithheld=1&page=1
https://www.domain.com.au/sold-listings/?postcode=3168&price=0-5000000&excludepricewithheld=1&page=2
https://www.domain.com.au/sold-listings/?postcode=3168&price=0-5000000&excludepricewithheld=1&page=3
https://www.domain.com.au/sold-listings/?postcode=3168&price=0-5000000&excludepricewithheld=1&page=4
https://www.domain.com.au/sold-listings/?postcode=3168&price=0-5000000&excludepricewithheld=1&page=5
https://www.domain.com.au/sold-listings/?postcode=3168&price=0-5000000&excludepricewithheld=1&page=6
https://www.domain.com.au/sold-listings/?postcode=3168&price=0-5000000&excludepricewithheld=1&page=7
https://www.domain.com.au/sold-listings/?postcode=3168&price=0-5000000&excludepricewithheld=1&page=8
https://www.domain.com.au/sold-listings/?postcode=3168&price=0-5000000&excludepricewithheld=1&page=9
https://www.domain.com.au/sold-listings/?postcode=3168&price=0-5000000&excludepricewithheld

In [366]:
def main():
    """Scrape one sold-listings page, clean every column and show a preview.

    The page is loaded through ``Scraper``; the resulting frame is then
    cleaned in place (address, date, layout, housing type, prices, in that
    order) before its first ten rows and its ``info()`` summary are printed.
    """
    URL = 'https://www.domain.com.au/sold-listings/?suburb=carnegie-vic-3163,murrumbeena-vic-3163&excludepricewithheld=1'
    df = Scraper(URL).get_details()

    # Each step mutates df in place.
    cleaning_steps = (
        cleaning_address,
        cleaning_date,
        cleaning_layout,
        cleaning_housing_type,
        clean_prices,
    )
    for clean_step in cleaning_steps:
        clean_step(df)

    print(df.head(10))
    df.info()


main()

                    address housing_type  sold_date  n_beds  n_bath  n_garage  \
0    2/31 Shepparson Avenue    Apartment 2022-08-13       2       1         1   
1         12/56 Grange Road    Apartment 2022-08-10       1       1         1   
2       211/9 Morton Avenue    Apartment 2022-08-09       2       1         1   
3       203/3 Morton Avenue    Apartment 2022-08-03       2       1         1   
4        19 Tranmere Avenue        House 2022-07-30       4       2         1   
5        28 Atkinson Street        House 2022-07-30       4       2         2   
6  1001/1060 Dandenong Road    Apartment 2022-07-29       3       2         2   
7        5/87 Coorigil Road    Apartment 2022-07-28       1       1         1   
8     2/82-84 Coorigil Road    Apartment 2022-07-28       2       1         1   
9     9/1214 Dandenong Road    Apartment 2022-07-28       1       1         1   

    prices       suburb state postcode  
0   742000     CARNEGIE   VIC     3163  
1   271000     CARNEGIE   