In [1]:
import pandas as pd
import numpy as np

import time
import random
import os
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium import webdriver 
from selenium.webdriver.common.by import By 
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC 
from selenium.common.exceptions import TimeoutException

from collections import defaultdict
from pprint import pprint
import pickle

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
def create_empty_driver():
    option = webdriver.ChromeOptions()
#     option.add_argument(" — incognito")
    driver = webdriver.Chrome(executable_path='/Applications/chromedriver', options=option)
    return driver

def create_driver_opening_url(a_url):
    driver = create_empty_driver()
    driver.get(a_url)
    return driver

def soup_from_driver(a_driver):
    soup = BeautifulSoup(a_driver.page_source, 'html.parser')
    return soup

def clean_from_text(text):
#     cleaned_text = text.replace(",","").strip()
    cleaned_text = text.strip()
    return cleaned_text

def get_address(a_soup, a_dict, a_driver):
    address_dict = defaultdict(str)
    # Get addresses
    for line in a_soup.find(itemprop='address').find_all('span'):
        if line.has_attr('itemprop'):
            address_dict[clean_from_text(line['itemprop'])] = clean_from_text(line.text)
    # Get lat/long
    for line in a_soup.find(itemprop='geo').find_all('meta'):
        if line.has_attr('itemprop'):
            address_dict[clean_from_text(line['itemprop'])] = float(line['content'])
    a_dict[a_driver.current_url] = {**a_dict[a_driver.current_url], **address_dict}
    
def get_description(a_soup, a_dict, a_driver):
    description_dict = defaultdict(str)
    remarks = a_soup.find(class_="remarks").find('span')
    description_dict['description'] = clean_from_text(remarks.text)
    a_dict[a_driver.current_url] = {**a_dict[a_driver.current_url], **description_dict}

def get_key_details(a_soup, a_dict, a_driver):
    key_details_dict = defaultdict(str)
    try:
        for div in a_soup.find(class_='keyDetailsList').find_all('div'):
            key_details_dict[clean_from_text(div.contents[0].text)] = clean_from_text(div.contents[1].text)
    except:
        None
    a_dict[a_driver.current_url] = {**a_dict[a_driver.current_url], **key_details_dict}

def get_listing_details(a_soup, a_dict, a_driver):
    list_details_dict = defaultdict(str)
    for super_group in a_soup.find_all(class_='super-group-content'):
        for data in super_group.find_all('span', {'class':'entryItemContent'}):
            try: list_details_dict[clean_from_text(data.contents[0].string)] = clean_from_text(data.contents[1].text)
            except: list_details_dict[clean_from_text(data.contents[0].string)] = True
    a_dict[a_driver.current_url] = {**a_dict[a_driver.current_url], **list_details_dict}

def get_home_facts(a_soup, a_dict, a_driver):
    home_facts_dict = defaultdict(str)
    for row in a_soup.find(class_='facts-table'):
        home_facts_dict[clean_from_text(row.span.text)] = clean_from_text(row.div.text)
    a_dict[a_driver.current_url] = {**a_dict[a_driver.current_url], **home_facts_dict} 

def get_transit_scores(a_soup, a_dict, a_driver):
    transit_scores_dict = defaultdict(str)
    for score in a_soup.find_all(class_='score'):
        try: transit_scores_dict[clean_from_text(score.find(class_='label').text)] = clean_from_text(score.find(class_='percentage').text)
        except: None
    a_dict[a_driver.current_url] = {**a_dict[a_driver.current_url], **transit_scores_dict} 

def get_recent_area_offer_data(a_soup, a_dict, a_driver):
    recent_area_offer_data_dict = defaultdict(str)
    recent_area_offer_data_dict['current area'] = clean_from_text(a_soup.find(class_='OfferInsights').find('a').text)
    for td in a_soup.find(class_='OfferInsights').find(class_='basic-table').tbody.find_all('td'):
        recent_area_offer_data_dict['Area Current ' + clean_from_text(td.find(class_='field').text)] =\
                                                                      clean_from_text(td.find(class_='value').text)
    a_dict[a_driver.current_url] = {**a_dict[a_driver.current_url], **recent_area_offer_data_dict} 

def get_schools(a_soup, a_dict, a_driver):
    xpaths = ['//*[@id="schools-scroll"]/div/div[1]/div/div[1]/div[1]/button', '//*[@id="schools-scroll"]/div/div[1]/div/div[1]/div[2]/button', '//*[@id="schools-scroll"]/div/div[1]/div/div[1]/div[3]/button', '//*[@id="schools-scroll"]/div/div[1]/div/div[1]/div[4]/button']
    types = ['serving this home', 'elementary', 'middle', 'high']
    for i, xpath in enumerate(xpaths): 
        open_xpath_with_driver(xpath, a_driver)
        a_soup = soup_from_driver(a_driver)
        for td in a_soup.find(class_='schools-content').find('table').tbody.find_all('td'):
            try:
                a_dict['school name'].append(clean_from_text(td.find(class_='school-name').text))
                rating_list = td.find(class_='gs-rating-row').text.split(":")
                a_dict['great schools rating'].append(clean_from_text(rating_list[1]))
                a_dict['type'].append(types[i])
                a_dict['url'].append(a_driver.current_url)
            except: None   

#what does this do              
def open_xpath_with_driver(xpath, a_driver):
    a_driver.find_element_by_xpath(xpath).click()
    
def get_price_history(a_soup, a_dict, a_driver):
    for i, row in enumerate(a_soup.find(id='property-history-transition-node').table.tbody.find_all('tr')):
        try:
            a_dict['date'].append(clean_from_text(row.find(class_='date-col').text))
            a_dict['event'].append(clean_from_text(row.find(class_='event-col').findChildren()[0].text))
            a_dict['source'].append(clean_from_text(row.find(class_='source-info').text))
            a_dict['price'].append(clean_from_text(row.find(class_='price-col').text))
            a_dict['url'].append(a_driver.current_url)
        except: 
            None

def get_new_urls(a_soup, a_driver, a_urls_dict):
    for row in a_soup.find_all(class_='SimilarHomeCardReact'):
        a_urls_dict["https://www.redfin.com"+row.find('a', href=True)['href']]

In [27]:
def get_data(starting_url, house_data_dict = None, price_history_dict = None, schools_dict=None, driver=None, urls_dict = None):
    try:
        driver.get(starting_url)
        history_xpath = '//*[@id="propertyHistory-expandable-segment"]/div[2]/div/span'
        delay = 4 # seconds
#         print('start wait')
        time.sleep(delay)
#         print('end wait')
        try:
            myElem = WebDriverWait(driver, delay).until(EC.element_to_be_clickable((By.XPATH, history_xpath)))
            print("Page is ready!")
            myElem.click()
        except:
            print("Loading took too much time!")
        try:
            soup = soup_from_driver(driver)
            get_address(soup, house_data_dict, driver)
            get_description(soup, house_data_dict, driver)
            get_key_details(soup, house_data_dict, driver)
#             get_listing_details(soup, house_data_dict, driver)
            get_home_facts(soup, house_data_dict, driver)
            get_transit_scores(soup, house_data_dict, driver)
            get_recent_area_offer_data(soup, house_data_dict, driver)
            get_schools(soup, schools_dict, driver)
            get_price_history(soup, price_history_dict, driver)
            get_new_urls(soup, driver, urls_dict)
        except:
            None
    except:
        None
        
def start_pulling_data(urls_dict, num_urls_to_visit):
    try:
        house_data_dict = defaultdict(dict)
        price_history_dict = defaultdict(list)
        schools_dict = defaultdict(list)
        all_urls = list(filter(lambda x: not urls_dict[x], urls_dict.keys()))
# Pull only NY data
        all_urls = [url for url in all_urls if '.com/IN/Indianapolis' in url]
        if num_urls_to_visit < len(all_urls):
            all_urls = all_urls[:num_urls_to_visit]

        driver = create_driver_opening_url(all_urls[0])
        open_xpath_with_driver('//*[@id="header-content"]/header[2]/div[2]/button[1]/span', driver)
        WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, "//*[contains(concat(' ', @class, ' '), ' button Button tertiary emailSignInButton ')]")))
        driver.find_element_by_xpath("//*[contains(concat(' ', @class, ' '), ' button Button tertiary emailSignInButton ')]").click()
#         WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, "//*[contains(concat(' ', @class, ' '), ' dijitInputInner email ')]")))
#         driver.find_element_by_xpath("//*[contains(concat(' ', @class, ' '), ' dijitInputInner email ')]").click()
        time.sleep(3)
        WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, "//*[@name='emailInput']")))
        driver.find_element_by_xpath("//*[@name='emailInput']").click()
        driver.find_element_by_xpath("//*[@name='emailInput']").send_keys('jackselbo@yahoo.com')
        driver.find_element_by_xpath('//*[@name="passwordInput"]').send_keys('403nbrVT')
        driver.find_element_by_xpath("//*[contains(concat(' ', @class, ' '), ' button Button primary submitButton ')]").click()
        time.sleep(3)
        for url in all_urls:
            print(url)
            get_data(url, house_data_dict, price_history_dict, schools_dict, driver, urls_dict)
            urls_dict[url] = True
        return house_data_dict, price_history_dict, schools_dict, driver, urls_dict
    except:
        None

In [35]:
# file = open("urls_dict9.pkl",'rb')
# old_urls_dict = pickle.load(file)
# file.close()
# old_urls_dict = defaultdict(bool)
len(old_urls_dict)

3028

In [36]:
# view_new_urls = [i for i in old_urls_dict if old_urls_dict[i] == False]
# view_new_in_urls = [url for url in view_new_urls if '.com/IN/Indianapolis/' in url]
# print(len(view_new_urls), len(view_new_in_urls))

In [90]:
search = pd.read_csv('redfin_seed_url23.csv')
for i in search.values:
    old_urls_dict[i[0]]

In [93]:
len(old_urls_dict)

9107

In [92]:
for i in range(90, 91):
    try: 
        house_data_dict, price_history_dict, schools_dict, driver, urls_dict = start_pulling_data(old_urls_dict, 100)
    except:
        print('missed')
    try:
        filehandler = open("house_data_dict{}.pkl".format(i),"wb")
        pickle.dump(house_data_dict,filehandler)
        filehandler.close()

        filehandler = open("price_history_dict{}.pkl".format(i),"wb")
        pickle.dump(price_history_dict,filehandler)
        filehandler.close()

        filehandler = open("schools_dict{}.pkl".format(i),"wb")
        pickle.dump(schools_dict,filehandler)
        filehandler.close()

        filehandler = open("urls_dict{}.pkl".format(i),"wb")
        pickle.dump(urls_dict,filehandler)
        filehandler.close()

        old_urls_dict = urls_dict
    except:
        print('fail')

https://www.redfin.com/IN/Indianapolis/8314-Goldfinch-Cir-46256/home/82195425
Page is ready!
https://www.redfin.com/IN/Indianapolis/8820-Deer-Run-Dr-46256/home/60093427
Page is ready!
https://www.redfin.com/IN/Indianapolis/8036-N-Richardt-Ave-46256/home/82194118
Loading took too much time!
https://www.redfin.com/IN/Indianapolis/7926-S-Scarborough-Blvd-Dr-S-46256/home/82194758
Loading took too much time!
https://www.redfin.com/IN/Indianapolis/8038-Teel-Way-46256/home/82195430
Page is ready!
https://www.redfin.com/IN/Indianapolis/11566-Nicole-Ct-46236/home/66966232
Page is ready!
https://www.redfin.com/IN/Indianapolis/9655-Geist-Woods-Way-46256/home/82382236
Loading took too much time!
https://www.redfin.com/IN/Indianapolis/8119-Fisher-Bend-Dr-46239/home/82300025
Loading took too much time!
https://www.redfin.com/IN/Indianapolis/8128-Wildwood-Farms-Dr-46239/home/82296933
Page is ready!
https://www.redfin.com/IN/Indianapolis/7872-Wolfgang-Pl-46239/home/82285766
Loading took too much time!

Page is ready!
http://www.redfin.com/IN/Indianapolis/4055-Central-Ave-46205/home/82238356
Loading took too much time!
http://www.redfin.com/IN/Indianapolis/6628-Glenn-Meade-Dr-46241/home/67033740
Loading took too much time!
http://www.redfin.com/IN/Indianapolis/1948-W-64th-St-46260/home/66943804
Loading took too much time!
http://www.redfin.com/IN/Indianapolis/1751-W-72nd-Pl-46260/home/66804488
Loading took too much time!
http://www.redfin.com/IN/Indianapolis/932-Camp-St-46202/home/67105260
Page is ready!
http://www.redfin.com/IN/Indianapolis/6920-Summerfield-Dr-N-46214/home/66804292
Page is ready!
http://www.redfin.com/IN/Indianapolis/7523-Blue-Willow-Dr-46239/home/66831766
Page is ready!
http://www.redfin.com/IN/Indianapolis/8168-Shorewalk-Dr-46236/unit-C/home/143931237
Page is ready!
http://www.redfin.com/IN/Indianapolis/8202-E-10th-St-46219/home/82320421
Loading took too much time!
http://www.redfin.com/IN/Indianapolis/250-N-Oakland-Ave-46201/home/82169041
Loading took too much tim

In [43]:
house_data_dict

defaultdict(dict,
            {'https://www.redfin.com/IN/Indianapolis/8444-Seabridge-Way-46240/unit-2/home/60144192': {'streetAddress': '8444 Seabridge Way',
              'addressLocality': 'Indianapolis,',
              'addressRegion': 'IN',
              'postalCode': '46240',
              'latitude': 39.9096793,
              'longitude': -86.1091512,
              'description': 'Location Location Location!! This stunning luxury condo is right in the heart of the action!  Astonishing pond views compliment the open concept with sight lines from the updated kitchen to the glistening water!  Two master suites boasting tastefully updated spa-like bathrooms and walk-in closets. Two car garage, spacious deck, plenty of storage space, custom built-in cabinetry throughout and so much more! Prime access to all of the activities of the Fashion Mall as well as 465 access. This luxury home won’t last long!',
              'HOA Dues': '$375/month',
              'Type': 'Residential, Condom