In [None]:
from datetime import date, timedelta
import itertools
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time
import re

In [None]:
class Scraper:
    """Headless-Chrome scraper for one Domain sold-listings results page.

    The constructor starts the browser and loads the page; ``get_details``
    reads the listing fields out of the rendered page, returns them as a
    DataFrame and shuts the browser down.
    """

    # Chromedriver binary used when the caller does not pass ``driver_path``.
    DEFAULT_DRIVER_PATH = '/Users/victorcruzdefaria/Downloads/chromedriver'

    # Seconds to wait before reading the page in ``get_details``.
    PAGE_LOAD_WAIT_SECONDS = 5

    # The layout texts arrive as one flat list: beds, baths, parking per listing.
    LAYOUT_FIELDS_PER_LISTING = 3

    def __init__(self, url, driver_path=None):
        """Start a headless Chrome session and navigate to ``url``.

        Args:
            url: Address of the page to load.
            driver_path: Optional path to the chromedriver executable.
                Falls back to ``DEFAULT_DRIVER_PATH`` when omitted.
        """
        #Create the scraper object with some options
        self.options = webdriver.ChromeOptions()
        self.options.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.109 Safari/537.36')
        self.options.add_argument('--disable-blink-features=AutomationControlled')
        self.options.add_experimental_option("excludeSwitches", ["enable-automation"])
        self.options.add_experimental_option('useAutomationExtension', False)
        # The ``options.headless`` setter is deprecated and later removed in
        # Selenium 4; on those releases the assignment silently does nothing.
        # Passing the Chrome flag directly works on every version.
        self.options.add_argument('--headless')
        self.driver = webdriver.Chrome(
            service=Service(driver_path or self.DEFAULT_DRIVER_PATH),
            options=self.options,
        )
        try:
            self.driver.get(url)
        except Exception:
            # Do not leave an orphaned browser behind if navigation fails.
            self.driver.quit()
            raise

    def _texts(self, class_name):
        """Return the text of every element carrying the CSS class ``class_name``."""
        return [element.text for element in self.driver.find_elements(By.CLASS_NAME, class_name)]

    def get_details(self):
        """Scrape the loaded page into a DataFrame and shut the browser down.

        Returns:
            pandas.DataFrame with the columns ``address``, ``housing_type``,
            ``sold_date``, ``n_beds``, ``n_bath``, ``n_garage`` and ``prices``,
            all holding the raw scraped strings. The fields are paired
            positionally with ``zip``, so the frame has as many rows as the
            shortest of the scraped lists.

        The browser session is always terminated, even when scraping raises,
        so the instance cannot be reused after this call.
        """
        try:
            #access the browser
            time.sleep(self.PAGE_LOAD_WAIT_SECONDS)

            #get the prices, address and date sold and add to a list
            prices = self._texts('css-9hd67m')
            full_address = self._texts('css-bqbbuf')
            date_sold = self._texts('css-1nj9ymt')
            housing_type = self._texts('css-693528')

            #get the size info i.e. # of beds, # of baths, #number of parking
            #CHALLENGE: sometimes the size of the house/unit comes after the above
            #metrics; every text containing the letter "m" is dropped to remove it.
            #Each text is fetched once and then filtered.
            layout_info = [
                text for text in self._texts('css-1ie6g1l')
                if not re.search(r"[m]", text)
            ]

            #Group the layout_info into groups of 3
            step = self.LAYOUT_FIELDS_PER_LISTING
            layout_info = [layout_info[i:i + step] for i in range(0, len(layout_info), step)]

            #Group the data together: one flat row per listing
            data = [
                [address, kind, sold, *layout, price]
                for address, kind, sold, layout, price
                in zip(full_address, housing_type, date_sold, layout_info, prices)
            ]
            return pd.DataFrame(
                data,
                columns=['address', 'housing_type', 'sold_date', 'n_beds', 'n_bath', 'n_garage', 'prices'],
            )
        finally:
            #close the browser; quit() also stops the chromedriver process,
            #which close() leaves running.
            self.driver.quit()

In [None]:
def clean_prices(df):
    """Convert the scraped price strings in ``df['prices']`` to integers, in place.

    Every run of non-digit characters is removed from each value before the
    column is cast to ``int``. Nothing is returned; ``df`` itself is modified.
    """
    non_digits = re.compile('[^0-9]+')

    def digits_only(raw_price):
        # Keep only the digits of a single price string.
        return non_digits.sub('', raw_price)

    df['prices'] = df['prices'].map(digits_only).astype('int')

In [None]:
def cleaning_address(df):
    """Split ``df['address']`` into street address, suburb, state and postcode.

    The scraped value looks like ``"<street>, <SUBURB> <STATE> <POSTCODE>"``.
    ``df`` is modified in place: ``address`` keeps the street part, and the
    columns ``suburb``, ``state`` (categorical) and ``postcode`` are added.
    Nothing is returned.

    Both splits work from the right-hand side:

    * only the last comma separates street from locality, so a street part
      that itself contains commas ("Unit 4, 7 Park Ave") is kept whole;
    * state and postcode are the last two whitespace-separated tokens, so a
      multi-word suburb ("GLEN WAVERLEY") is not cut after its first word.

    Rows without a comma keep their full text in ``address`` and get missing
    values in the three new columns.
    """
    # reindex guarantees both columns exist even if no row contains a comma;
    # astype(object) keeps the .str accessor usable when a column is all-NaN.
    street_and_locality = (
        df['address'].str.rsplit(',', n=1, expand=True)
        .reindex(columns=[0, 1])
        .astype(object)
    )
    locality_parts = (
        street_and_locality[1].str.strip()
        .str.rsplit(n=2, expand=True)
        .reindex(columns=[0, 1, 2])
        .astype(object)
    )

    df['address'] = street_and_locality[0]
    df['suburb'] = locality_parts[0].str.strip()
    df['state'] = locality_parts[1].astype('category')
    df['postcode'] = locality_parts[2]

In [None]:
def cleaning_date(df):
    """Parse the scraped ``df['sold_date']`` text into datetimes, in place.

    The last three whitespace-separated tokens of each value are joined with
    "/" and lower-cased, then parsed with the ``%d/%b/%Y`` format. Nothing is
    returned; ``df`` itself is modified.
    """
    def last_three_joined(tokens):
        # Keep only the trailing three tokens, joined with slashes.
        return '/'.join(tokens[-3:]).lower()

    tokenised = df['sold_date'].str.split()
    df['sold_date'] = tokenised.map(last_three_joined)
    df['sold_date'] = pd.to_datetime(df['sold_date'], dayfirst=True, format="%d/%b/%Y")

In [None]:
def cleaning_layout(df):
    """Reduce the bed / bath / garage columns to their leading integer, in place.

    For each of ``n_beds``, ``n_bath`` and ``n_garage`` the first
    whitespace-separated token of every value is kept and the column is cast
    to ``int``. Nothing is returned; ``df`` itself is modified.
    """
    def leading_token(tokens):
        # The count is the first token of the scraped text.
        return tokens[0]

    for column in ('n_beds', 'n_bath', 'n_garage'):
        tokenised = df[column].str.split()
        df[column] = tokenised.map(leading_token).astype('int')

In [None]:
def cleaning_housing_type(df):
    """Keep only the first "/"-separated label of ``df['housing_type']``, in place.

    The label is stripped of surrounding whitespace and the column is stored
    as a categorical. Nothing is returned; ``df`` itself is modified.
    """
    def first_label(parts):
        # Text before the first "/" with surrounding whitespace removed.
        return parts[0].strip()

    split_labels = df['housing_type'].str.split('/')
    df['housing_type'] = split_labels.map(first_label).astype('category')

In [None]:
class ResultsScraper(Scraper):
    """Scraper that reads how many results a search returned.

    The count is used to work out how many result pages are worth requesting,
    so that no URLs are generated for pages that do not exist.
    """
    #TODO: LINK THE NUMBER OF RESULTS WITH PRICE RANGE
        #This will allow us to scrape as much old data as possible

    # Page size assumed by the page-count calculation.
    RESULTS_PER_PAGE = 20

    # Domain Real Estate only displays 50 pages per search (per the original
    # author's note), so pages beyond that are never worth requesting.
    MAX_PAGES = 50

    def __init__(self, url):
        """Open ``url`` in a browser session exactly as ``Scraper`` does."""
        #Inheritance from scraper.py to create similar scraper object
        super().__init__(url)

    def get_right_pages(self):
        """Return the number of result pages to scrape for the loaded search.

        The first space-separated token of the search-summary element is read
        as the result count (thousands separators are tolerated). The page
        count is that number divided by ``RESULTS_PER_PAGE``, rounded up and
        capped at ``MAX_PAGES``.

        Returns:
            int: number of pages. The result count and page count are also
            printed, as before.

        Raises:
            ValueError: if the summary does not start with a number.
        """
        #Use the Scraper
        search_summary = self.driver.find_element(By.CLASS_NAME, 'css-ekkwk0').text
        first_token = search_summary.split(' ', maxsplit=1)[0]
        # "1,234" would make int() fail, so drop thousands separators first.
        n_results = int(first_token.replace(',', ''))

        #Calculate the number of pages to scrape
        #This will avoid the creation of unnecessary URL.
        # Integer ceiling division: always an int, unlike ``n_results / 20``
        # which produced a float whenever the count divided evenly.
        n_pages = -(-n_results // self.RESULTS_PER_PAGE)
        n_pages = min(n_pages, self.MAX_PAGES)
        print(n_results, n_pages)
        return n_pages

def url_generator(number_pages, state='VIC', verbose=True, postcode_file='postcode.csv'):
    """Build the sold-listings search URLs for every postcode of one state.

    Args:
        number_pages: How many result pages to generate per postcode (pages
            are numbered from 1). Coerced with ``int()``.
        state: Name of the column in the postcode file holding that state's
            postcodes. The previous default was the ``str`` type object
            itself (a mistyped annotation), which could only raise
            ``KeyError``; ``'VIC'`` matches the notebook's existing call.
        verbose: When true (the default), print every URL followed by the
            total count, as the function always did.
        postcode_file: CSV file with one column of postcodes per state.

    Returns:
        list[str]: the generated URLs, grouped by postcode and ordered by page.

    Raises:
        KeyError: if ``state`` is not a column of the postcode file.
    """
    # Read everything as text: numeric parsing would strip leading zeros and
    # turn a column containing blanks into floats ("3163.0", "nan").
    suburb_postcode = pd.read_csv(postcode_file, dtype=str)
    postcodes = suburb_postcode[state].dropna().str.strip()

    last_page = int(number_pages)
    url_list = [
        f'https://www.domain.com.au/sold-listings/?postcode={postcode}&price=0-5000000&excludepricewithheld=1&page={page}'
        for postcode in postcodes
        for page in range(1, last_page + 1)
    ]

    if verbose:
        for url in url_list:
            print(url)
        print(len(url_list))
    return url_list

# Generate and print the sold-listings search URLs for every postcode in the
# 'VIC' column of postcode.csv, 50 result pages per postcode.
url_generator(50,'VIC')
# Disabled example: open one postcode's search page with ResultsScraper and
# print its result count and the number of pages that count implies.
# n_pages = ResultsScraper('https://www.domain.com.au/sold-listings/?postcode=3168&price=0-5000000&excludepricewithheld=1')
# n_pages.get_right_pages()

In [None]:
def main():
    """Scrape one sold-listings search page, clean it and print a summary.

    The listings are scraped with ``Scraper``, each cleaning step is applied
    to the resulting frame in place, and the first ten rows plus the frame's
    ``info()`` report are printed. Nothing is returned.
    """
    listings_url = 'https://www.domain.com.au/sold-listings/?suburb=carnegie-vic-3163,murrumbeena-vic-3163&excludepricewithheld=1'
    listings = Scraper(listings_url).get_details()

    # Every cleaner mutates the frame it is given; the order is kept as-is.
    cleaning_steps = (
        cleaning_address,
        cleaning_date,
        cleaning_layout,
        cleaning_housing_type,
        clean_prices,
    )
    for step in cleaning_steps:
        step(listings)

    print(listings.head(10))
    listings.info()