# Google Project Sunroof - Web scraper assessment

Sunroof does not have an API that I could find. I've emailed their team to inquire and they are interested in our particular use case. In the meantime I'll figure out whether a web scraper is feasible in a reasonable amount of time. 

### &diams; Download the chromedriver if necessary

In [1]:
# Download page for the Selenium ChromeDriver binary
url = "https://chromedriver.chromium.org/downloads"
# Download the right version for your browser, place it in this directory,
# and make sure it clears the security protocols on your local machine


### &diams; Import libraries and helper functions

In [80]:
# import libraries
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException
import pandas as pd
import time

#import helper functions
from helper_functions import get_numeric


### &diams; Instantiate an automated browser

In [90]:
# Launch a Selenium-controlled Chrome window through the local `chromedriver`
# executable, then maximize it.
chrome_service = Service('chromedriver')
chrome_options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=chrome_service, options=chrome_options)
driver.maximize_window()

### &diams; Create a DataFrame that will store our results and load the addresses list


In [107]:
# Empty results table, one row per address looked up.
# All ten columns written by the scraping loop are declared here (the last two
# were previously created implicitly by pd.concat on the first successful
# lookup), so the output schema is the same even when no lookup succeeds.
df = pd.DataFrame(columns=[
    'Rate',
    'Building Type',
    'Address Requested',
    'Address Delivered',
    'Addresses Match?',
    'Annual Sunlight (Hours)',
    'Roof Area (sq ft)',
    'Savings Estimate ($)',
    'Recommended Installation Size (kW)',
    'Recommended Area (sq ft)',
])
# Run configuration: the input CSV name, the name of the CSV the results are
# saved to, and the locality suffix appended to every street address.
addresses = 'tc-addresses-rate&type.csv'
results_file_name = 'tc-sunroof-results-rate&type.csv'
city_state = ', Traverse City, Michigan, USA'
# Load the address list from the data/ directory.
df_addresses = pd.read_csv('data/' + addresses)

### &diams; Loop through each address, and: 
1) Obtain its results
2) If the search doesn't return any results, record the address as 'Not Found' and skip steps 3-4
3) Parse the HTML for the numbers we need and store them in variables
4) Build a dataframe with the results and add it to the existing one

In [108]:
# Look up every address from both rate columns on Project Sunroof and append
# one row per address to `df`: the scraped figures on success, or a
# 'Not Found' row when the results page never appears.

# Rate column in the address sheet -> column holding that rate's building type.
building_type_columns = {
    'Commercial': 'Building Type C',
    'Commercial Demand': 'Building Type CD',
}
timeout = 10  # seconds to wait for the results page before giving up on an address

# Addresses already recorded in `df`, so re-running this cell resumes instead
# of re-scraping. The check is done against a set of the column's values
# because `x in series` tests the Series' index labels, not its values, which
# made the original `address not in df['Address Requested']` guard always pass.
scraped_addresses = set(df['Address Requested'])

for col, building_type_col in building_type_columns.items():
    # zip pairs each address with its building type by position, so the lookup
    # does not depend on the DataFrame's index labels.
    for item, building_type in zip(df_addresses[col], df_addresses[building_type_col]):
        if not isinstance(item, str):
            continue  # non-string cell (e.g. NaN for a blank CSV field): nothing to look up
        address = item + city_state
        if address in scraped_addresses:
            continue  # already in the results table

        # 1 - run the search
        driver.get('https://sunroof.withgoogle.com/')
        address_input = driver.find_element(by=By.CSS_SELECTOR, value="md-autocomplete.address-input")  # to input the address
        address_input.send_keys(address)  # input address
        address_input.send_keys(Keys.RETURN)  # open the dropdown menu of addresses
        address_input.send_keys(Keys.DOWN)  # move to the top result
        time.sleep(1)  # wait for page to be ready
        address_input.send_keys(Keys.RETURN)  # execute the search and load the next page

        # 2 - the results page is considered loaded once 'recommended-kw' is present;
        # if it never shows up within `timeout`, record the address as not found
        try:
            WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'recommended-kw'))
            )
        except TimeoutException:
            data = pd.DataFrame({
                'Rate': [col],
                'Building Type': [building_type],
                'Address Requested': [address],
                'Address Delivered': ['Not Found'],
            })
            df = pd.concat([df, data], ignore_index=True)
            scraped_addresses.add(address)
            print(f'{address} returned no results')
            continue

        # 3 - pull the raw text of each figure from the page
        panel_facts = driver.find_elements(by=By.CLASS_NAME, value='panel-fact-text')
        # Expects exactly two 'panel-fact-text' elements (sunlight hours, then
        # roof area); any other count raises ValueError on unpacking.
        sunlight_hours, sq_ft = [fact.text for fact in panel_facts]
        try:
            savings = driver.find_element(by=By.CLASS_NAME, value='panel-estimate-savings').text
        except NoSuchElementException:
            savings = 'Savings not given'
        # The address Sunroof actually resolved is exposed as the search box placeholder.
        address_delivered = driver.find_element(by=By.CSS_SELECTOR, value='md-autocomplete').get_attribute('placeholder')
        kw_recommend = driver.find_element(by=By.CLASS_NAME, value='recommended-kw').text
        # Drops the first character of the element text
        # (presumably a leading symbol - TODO confirm against the page).
        area_recommend = driver.find_element(by=By.CLASS_NAME, value='recommended-area').text[1:]
        # extract just the numerical values from the element text we just obtained
        sunlight_hours, sq_ft, savings, kw_recommend, area_recommend = map(
            get_numeric,
            (sunlight_hours, sq_ft, savings, kw_recommend, area_recommend),
        )

        # 4 - append the row to the results table
        # 'Addresses Match?' compares only the first space-delimited token of
        # the requested and delivered addresses.
        addresses_match = 1 if address.split(' ')[0] == address_delivered.split(' ')[0] else 0
        data = pd.DataFrame({
            'Rate': [col],
            'Building Type': [building_type],
            'Address Requested': [address],
            'Address Delivered': [address_delivered],
            'Addresses Match?': [addresses_match],
            'Annual Sunlight (Hours)': [sunlight_hours],
            'Roof Area (sq ft)': [sq_ft],
            'Savings Estimate ($)': [savings],
            'Recommended Installation Size (kW)': [kw_recommend],
            'Recommended Area (sq ft)': [area_recommend],
        })
        df = pd.concat([df, data], ignore_index=True)
        scraped_addresses.add(address)
        print(f'{address} has been added to our dataset')

537 BAY ST, Traverse City, Michigan, USA has been added to our dataset
1413 WOODMERE AVE, Traverse City, Michigan, USA has been added to our dataset
13920 S WEST BAY SHORE DR, Traverse City, Michigan, USA has been added to our dataset
508 MUNSON AVE, Traverse City, Michigan, USA has been added to our dataset
907 1/2 WOODMERE AVE, Traverse City, Michigan, USA has been added to our dataset
822 CASS ST, Traverse City, Michigan, USA has been added to our dataset
1719 S GARFIELD AVE, Traverse City, Michigan, USA has been added to our dataset
1501 CASS ST, Traverse City, Michigan, USA has been added to our dataset
1897 US 31 NORTH, Traverse City, Michigan, USA returned no results
1240 E EIGHTH ST, Traverse City, Michigan, USA has been added to our dataset
435 BAY ST, Traverse City, Michigan, USA has been added to our dataset
134 1/2 E FRONT ST, Traverse City, Michigan, USA has been added to our dataset
1505 PREMIER ST, Traverse City, Michigan, USA has been added to our dataset
507 BAY ST, Tr

In [110]:
# Write the collected results to the data/ directory as CSV.
results_path = 'data/' + results_file_name
df.to_csv(results_path)