#### libraries

In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from datetime import datetime
import googlemaps

#### setup url && send request

In [3]:
# Google Maps client, used further down for the transit-time lookup.
# API KEY: a Google API key is required, and Google requires a credit card
# to be bound to the account that owns it.
# REF: https://github.com/googlemaps/google-maps-services-python
gmaps = googlemaps.Client(key="API KEY")

# Commute destination handed to the Distance Matrix request.
dst_addr = "ETH Zürich Hauptgebäude, Rämistrasse, Zürich"

# Search filters substituted into the homegate.ch listing URL below.
city = "city-zurich"
# Filled into the "ag" / "ah" query parameters (presumably min / max rent).
price_bt, price_top = 2000, 3000
# Filled into the "ac" / "ad" query parameters.
# NOTE(review): presumably min / max room count - confirm against the site.
ppl_bt, ppl_top = 4, 6
url_template = "https://www.homegate.ch/rent/real-estate/%s/matching-list?ag=%d&ah=%d&ac=%d&ad=%d&tab=list&o=sortToplisting-desc"
url = url_template % (city, price_bt, price_top, ppl_bt, ppl_top)

# Fetch and parse the first result page. Without a timeout, requests waits
# forever on a server that never answers, which would hang the whole cell.
http_response = requests.get(url, timeout=30)
page = BeautifulSoup(http_response.text, 'html.parser')

In [4]:
def extract_addr(page):
    """Return the address text of every listing on a parsed result page.

    For each ``div`` carrying the listing "data" class, the inner HTML of its
    first ``<p>`` is taken and the line-break tags are flattened so the
    address becomes a single comma-separated line.

    Args:
        page: parsed page object exposing ``find_all`` (a BeautifulSoup tree).

    Returns:
        A list of address strings, one per matching ``div``, in page order.
    """
    css_class = "box-row-item box-row-item--sm-4 box-row-item--md-3 box-row-item--data"
    # Opening / self-closing breaks become separators; a stray closing tag
    # is simply dropped.
    br_substitutions = (("<br>", ", "), ("<br/>", ", "), ("</br>", ""))

    addresses = []
    for box in page.find_all('div', class_=css_class):
        inner_html = box.find('p').decode_contents()
        for tag, substitute in br_substitutions:
            inner_html = inner_html.replace(tag, substitute)
        addresses.append(inner_html)

    return addresses

def extract_info(page):
    """Return the attribute table of every listing on a parsed result page.

    For each ``div`` carrying the listing "attributes" class, every ``<li>``
    except the first is read as a label/value pair: the text of its first
    ``<span>`` is the key and the text of its second ``<span>`` is the value.

    Args:
        page: parsed page object exposing ``find_all`` (a BeautifulSoup tree).

    Returns:
        A list of dicts, one per matching ``div``, in page order. A listing
        without usable rows yields an empty dict so positions stay aligned
        with the other ``extract_*`` results.
    """
    item_class = "box-row-item box-row-item--sm-4 box-row-item--md-3 box-row-item--bottom-line-sm box-row-item--attributes"
    list_info = []

    for item in page.find_all('div', class_=item_class):
        attributes = {}
        # The first <li> is skipped, exactly as before.
        for entry in item.find_all("li")[1:]:
            spans = entry.find_all("span")
            # Rows without a label/value pair are ignored. This explicit check
            # replaces a bare ``except: pass`` that also hid unrelated errors
            # (and even KeyboardInterrupt).
            if len(spans) < 2:
                continue
            attributes[spans[0].text] = spans[1].text
        list_info.append(attributes)

    return list_info

def extract_price_link(page):
    """Return the price text and detail link of every listing on a page.

    Each ``<a>`` with the detail-link class contributes one price and one
    link. The price is the text of the first ``<span>`` inside the
    "item-content-label" ``div``, with newlines, commas, dots and en dashes
    removed and surrounding whitespace stripped. The link is the anchor's
    ``href`` attribute as found in the page.

    Args:
        page: parsed page object exposing ``find_all`` (a BeautifulSoup tree).

    Returns:
        A ``(prices, links)`` tuple of two equally long lists in page order.
    """
    # One pass that deletes every character the price label should not keep.
    strip_table = str.maketrans("", "", "\n,.–")
    prices = []
    links = []

    for anchor in page.find_all('a', class_="detail-page-link box-row--link"):
        label = anchor.find("div", class_="item-content-label")
        raw_price = label.find("span").text
        prices.append(raw_price.translate(strip_table).strip())
        links.append(anchor.attrs['href'])

    return prices, links

In [5]:
# Google's Distance Matrix API rejects requests with more than 25 origins,
# so the scraped addresses are sent in batches of at most this size.
MAX_ORIGINS_PER_REQUEST = 25

# --- 1. scrape the listings --------------------------------------------------
# Result page 1 is the bare search URL; further pages append "&ep=<n>".
# A timeout keeps a non-responding server from hanging the cell forever.
http_response = requests.get(url, timeout=30)
page = BeautifulSoup(http_response.text, 'html.parser')
list_info = extract_info(page)
list_addr = extract_addr(page)
list_price, list_link = extract_price_link(page)

# range(2, #pages of results + 1)
for ind in range(2, 5):
    url_new = url + "&ep=" + str(ind)
    http_response = requests.get(url_new, timeout=30)
    page = BeautifulSoup(http_response.text, 'html.parser')
    list_info += extract_info(page)
    list_addr += extract_addr(page)
    list_price_tmp, list_link_tmp = extract_price_link(page)
    list_price += list_price_tmp
    list_link += list_link_tmp

# The four lists are joined purely by position further down. If the page
# markup made one extractor miss an entry, every later row would silently
# pair the wrong address / price / link, so fail loudly instead.
if not (len(list_info) == len(list_addr) == len(list_price) == len(list_link)):
    raise ValueError(
        "scraped lists are misaligned: %d info, %d addresses, %d prices, %d links"
        % (len(list_info), len(list_addr), len(list_price), len(list_link))
    )

# --- 2. transit time from every listing to the destination -------------------
# departure time: 2018/12/12 9:00
date = datetime(2018, 12, 12, 9)

rows = []
for start in range(0, len(list_addr), MAX_ORIGINS_PER_REQUEST):
    batch = gmaps.distance_matrix(
        origins=list_addr[start:start + MAX_ORIGINS_PER_REQUEST],
        destinations=dst_addr,
        mode="transit",
        departure_time=date,
    )
    # Each response holds one row per origin, in the order the origins were sent.
    rows.extend(batch['rows'])
# Same shape the single-request version exposed: one row per scraped address.
results = {'rows': rows}

# One entry per address: the human-readable duration, or None when Google
# returned no duration for that origin (e.g. the address could not be routed).
ls_time = []
for item in results['rows']:
    duration = item['elements'][0].get('duration')
    ls_time.append(duration['text'] if duration else None)

# --- 3. merge everything into one record per listing -------------------------
list_complete = []

for ind, info in enumerate(list_info):
    if ls_time[ind] is None:
        # Without a commute time the listing cannot be ranked; report and drop it.
        print("Skipping %s: no transit duration returned" % list_addr[ind])
        continue
    info['address'] = list_addr[ind]
    info['price'] = list_price[ind]
    info['url'] = "https://www.homegate.ch" + list_link[ind]
    info['time'] = ls_time[ind]
    list_complete.append(info)

In [6]:
def _duration_to_minutes(text):
    """Convert a duration label such as "35 mins" or "1 hour 5 mins" to minutes.

    The label is read as alternating "<number> <unit>" pairs where the unit
    starts with "day", "hour" or "min".

    Raises:
        ValueError: if the label does not follow that pattern.
    """
    minutes_per_unit = (('day', 24 * 60), ('hour', 60), ('min', 1))
    tokens = text.split()
    if not tokens or len(tokens) % 2:
        raise ValueError("unrecognised duration: %r" % (text,))

    total = 0
    for amount, unit in zip(tokens[::2], tokens[1::2]):
        for prefix, factor in minutes_per_unit:
            if unit.startswith(prefix):
                total += int(amount) * factor
                break
        else:
            raise ValueError("unrecognised duration: %r" % (text,))
    return total


df = pd.DataFrame.from_dict(list_complete)
# reindex (rather than df[[...]]) so that a column no listing provided - or an
# empty result set - yields an empty/NaN column instead of a KeyError.
columns = ['url', 'address', 'Rooms', 'price', 'time', 'Type', 'Living space', 'Floor']
# 'price' still holds strings here, so this ordering is lexicographic; the
# final row order is decided by the sort on 'time' below.
df = df.reindex(columns=columns).sort_values('price')
# "N mins" alone is not the only shape Google returns ("1 min",
# "1 hour 5 mins"), so parse every unit instead of stripping ' mins'.
df['time'] = df.time.apply(_duration_to_minutes)
df = df.sort_values('time')

In [11]:
# Preview the first rows of the table, which is sorted by commute time.
df.head()

In [9]:
# Save the sorted table as housing.csv (path is relative to the working directory).
df.to_csv('housing.csv')