### Data scraping

In [6]:
# Execute stations.py inside this notebook's namespace.
# NOTE(review): presumably this is where `ttc` and `stations`, used by the
# closest-station cell further down, get defined — confirm against stations.py.
%run stations.py

In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [8]:
# Site root used by Extract1Page to turn the ad hrefs it finds into full URLs.
kijiji = "https://www.kijiji.ca"

In [9]:
# Search-result pages to crawl. Page 1 has its own URL; pages 2..n_pages carry
# an explicit "page-<k>" path segment.
n_pages = 40
first_page = "https://www.kijiji.ca/b-apartments-condos/gta-greater-toronto-area/c37l1700272"
next_pages = [
    f"https://www.kijiji.ca/b-apartments-condos/gta-greater-toronto-area/page-{page_number}/c37l1700272"
    for page_number in range(2, n_pages + 1)
]
listings = [first_page, *next_pages]

In [10]:
# Column stores filled by Extract1Ad: one list per field, each key owning its
# own (initially empty) list.
Price_Address = {column: [] for column in ('Price', 'Address')}
Details = {
    column: []
    for column in (
        'Unit Type',
        'Bedrooms',
        'Bathrooms',
        'Parking Included',
        'Agreement Type',
        'Move-In Date',
        'Pet Friendly',
        'Size (sqft)',
        'Furnished',
        'Air Conditioning',
        'Smoking Permitted',
        'Barrier-free Entrances and Ramps',
        'Visual Aids',
        'Accessible Washrooms in Suite',
    )
}

In [11]:
def Extract1Page(listing, timeout=30):
    """Collect the ad URLs linked from one search-results page.

    Parameters
    ----------
    listing : str
        URL of a results page.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without a timeout a single stalled connection would block the
        whole crawl indefinitely.

    Returns
    -------
    list of str
        One URL per ``<a class="title" href=...>`` element, in page order.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or times out.
    """
    from urllib.parse import urljoin

    page = requests.get(listing, timeout=timeout)
    soup = BeautifulSoup(page.content, 'html.parser')
    links = soup.find_all('a', class_="title", href=True)
    # urljoin resolves site-relative hrefs against the site root and leaves an
    # already-absolute href intact, where plain concatenation would mangle it.
    return [urljoin(kijiji, link['href']) for link in links]

In [12]:
def ExtractPages(listings):
    """Return the ad URLs of every results page in ``listings``, in page order."""
    collected = []
    for page_url in listings:
        collected.extend(Extract1Page(page_url))
    return collected

In [13]:
def Extract1Ad(ad, timeout=30):
    """Scrape one ad page and append its fields to the module-level column stores.

    Every call appends exactly one value to each list in ``Price_Address``
    and ``Details``, so all columns keep the same length. A field that is
    not found on the page is recorded as the string ``'nan'``.

    Parameters
    ----------
    ad : str
        URL of the ad page.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get`` so a
        stalled connection cannot hang the crawl.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or times out.
    """
    page = requests.get(ad, timeout=timeout)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Price: text of the first <span> that has a `content` attribute, minus its
    # first character (presumably a leading currency symbol — confirm against
    # the page markup).
    price = soup.find('span', content=True)
    Price_Address['Price'].append('nan' if price is None else price.get_text()[1:])

    address = soup.find('span', itemprop="address")
    Price_Address['Address'].append('nan' if address is None else address.get_text())

    dl_texts = [tag.get_text() for tag in soup.find_all('dl')]
    for item in Details:
        # Use only the first <dl> mentioning this label. Appending once per
        # match would make this column longer than the others whenever two
        # <dl> blocks contain the label, misaligning every later row.
        match = next((text for text in dl_texts if item in text), None)
        # The stored value is whatever follows the label-length prefix.
        Details[item].append('nan' if match is None else match[len(item):])

In [14]:
def ExtractAds(ads):
    """Call Extract1Ad on each URL in ``ads``, in order. Returns nothing."""
    for ad_url in ads:
        Extract1Ad(ad_url)

In [15]:
%%time
# Crawl every results page and gather the ad links (one HTTP request per page).
ads = ExtractPages(listings)

CPU times: user 7.16 s, sys: 217 ms, total: 7.38 s
Wall time: 2min 24s


In [16]:
%%time
# Fetch each ad and fill Price_Address / Details (one HTTP request per ad).
ExtractAds(ads)

CPU times: user 1min 52s, sys: 2.86 s, total: 1min 55s
Wall time: 40min 56s


In [17]:
# Combine both column stores into one mapping (price/address columns first)
# and build the listings frame from it.
rentals = dict(Price_Address)
rentals.update(Details)
df_rentals = pd.DataFrame(rentals)

### Data cleaning

In [18]:
import locale
# Switch the process locale so that locale.atof (used in the next cells)
# parses numbers with en_US conventions.
# NOTE(review): needs the en_US.UTF-8 locale installed on the host; setlocale
# raises locale.Error when it is unavailable.
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')

'en_US.UTF-8'

In [19]:
# Parse each scraped price string into a float using the active locale.
df_rentals['Price'] = df_rentals['Price'].map(locale.atof)

In [20]:
def NotAvailable(x):
    """Map the placeholder 'Not Available' to the string 'nan'; return any other value unchanged."""
    return 'nan' if x == 'Not Available' else x
# Replace the 'Not Available' placeholder first, then parse the sizes as floats.
size_strings = df_rentals['Size (sqft)'].apply(NotAvailable)
df_rentals['Size (sqft)'] = size_strings.apply(locale.atof)

In [21]:
df_rentals.head()

Unnamed: 0,Price,Address,Unit Type,Bedrooms,Bathrooms,Parking Included,Agreement Type,Move-In Date,Pet Friendly,Size (sqft),Furnished,Air Conditioning,Smoking Permitted,Barrier-free Entrances and Ramps,Visual Aids,Accessible Washrooms in Suite
0,2350.0,"310 Burnhamthorpe Rd W, Mississauga, ON L5B 0E...",,,,1,1 Year,"July 25, 2020",No,825.0,No,Yes,Outdoors only,No,No,No
1,1847.0,"765 Steeles Ave West, Toronto, ON, M2R 2S7",,,,0,1 Year,,Yes,780.0,No,No,No,,,
2,2250.0,"Camberley Cres, Brampton, ON, L6V 3L4",,,,2,1 Year,"August 1, 2020",Yes,,No,Yes,No,,,
3,1797.0,"7433 Yonge St, Thornhill, ON, L3T 1S4",,,,0,1 Year,,Yes,,No,No,No,,,
4,2200.0,"185 Roehampton Ave, Toronto, ON M4P 1R4, Canada",,,,1,1 Year,"September 1, 2020",No,654.0,No,Yes,No,No,No,No


In [22]:
len(df_rentals)

1800

### Google Maps API

In [23]:
#https://googlemaps.github.io/google-maps-services-python/docs/index.html
#https://github.com/googlemaps/google-maps-services-python
import googlemaps

In [24]:
import os

# The key is read from the environment so that no credential is stored in the
# notebook. Export GOOGLE_MAPS_API_KEY before starting the kernel.
# SECURITY: the key that was previously hard-coded on this line has been
# exposed and should be revoked in the Google Cloud console.
apikey = os.environ.get("GOOGLE_MAPS_API_KEY", "")
if not apikey:
    print("GOOGLE_MAPS_API_KEY is not set; requests to Google Maps cannot be authenticated.")

In [25]:
# Google Maps client used for the distance-matrix requests further down.
gmaps = googlemaps.Client(key=apikey)

### Closest TTC station

In [26]:
# Address column of the listings, read row by row when querying walking times.
Address = df_rentals['Address']

In [27]:
%%time
# For every listing, request walking times to all TTC stations and keep the
# nearest one. Results are collected in plain lists and turned into Series
# once at the end, with explicit dtypes: a Series created without data or
# dtype has a pandas-version-dependent default dtype.

# Stand-in duration (seconds) for a station without an 'OK' result. It lies
# just past one hour, and only times under one hour are kept later on.
NO_ROUTE_SECONDS = 3601

station_names = []
walk_minutes = []
for address in Address:
    dist_result = gmaps.distance_matrix(origins=[address], destinations=ttc, mode='walking')
    if dist_result['origin_addresses'] != ['']:
        elements = dist_result['rows'][0]['elements']
        # Durations in seconds, one per destination.
        # Assumes stations[i] is the name of ttc[i] — confirm in stations.py.
        times = [
            elements[i]['duration']['value'] if elements[i]['status'] == 'OK' else NO_ROUTE_SECONDS
            for i in range(len(ttc))
        ]
        shortest = min(times)
        station_names.append(stations[times.index(shortest)])
        walk_minutes.append(shortest / 60)  # seconds -> minutes
    else:
        # Empty origin address in the response — presumably the listing
        # address could not be resolved (confirm against the API docs).
        station_names.append('nan')
        walk_minutes.append(np.nan)

Closest_Station = pd.Series(station_names, index=df_rentals.index, dtype=object)
TimeToClosestSt = pd.Series(walk_minutes, index=df_rentals.index, dtype=float)

CPU times: user 20.1 s, sys: 535 ms, total: 20.7 s
Wall time: 13min 45s


In [28]:
# Attach the station columns, then keep only the listings whose walk to the
# nearest station takes less than 60 minutes.
df_rentals['Closest station'] = Closest_Station
df_rentals['Time to closest station'] = TimeToClosestSt
under_an_hour = df_rentals['Time to closest station'] < 60
df_rentals = df_rentals.loc[under_an_hour]

In [29]:
len(df_rentals)

1088

In [30]:
df_rentals.sample(10)

Unnamed: 0,Price,Address,Unit Type,Bedrooms,Bathrooms,Parking Included,Agreement Type,Move-In Date,Pet Friendly,Size (sqft),Furnished,Air Conditioning,Smoking Permitted,Barrier-free Entrances and Ramps,Visual Aids,Accessible Washrooms in Suite,Closest station,Time to closest station
1699,2200.0,"28 Olive Ave, North York, ON M2N 7E6, Canada",,,,2,Month-to-month,"July 25, 2020",No,699.0,No,Yes,No,,,,Finch,7.316667
1136,1800.0,"M4G0A5, Canada",,,,1,1 Year,"August 1, 2020",No,633.0,No,Yes,No,Yes,No,No,Eglinton,49.35
793,2300.0,"7895 Jane St, Concord, ON L4K 2M7, Canada",,,,1,1 Year,"July 25, 2020",No,750.0,No,Yes,No,,,,Vaughan Metropolitan Centre,7.8
1787,2050.0,"25 Bedford Rd., Toronto, ON, M5R 1A9",,,,0,1 Year,,No,,No,No,No,,,,St George,4.25
87,2071.0,", Toronto M6J 3W4 ON, Canada",,,,1,Month-to-month,"August 1, 2020",Yes,704.0,No,Yes,Outdoors only,Yes,No,No,St Andrew,30.933333
1408,1677.0,"2175 Avenue Road, Toronto, ON, M5M 4B6",,,,0,1 Year,,Yes,,No,No,No,,,,York Mills,15.883333
1125,2509.0,"31 Spencer Avenue, Toronto, ON, M6K 2J9",,,,0,1 Year,,Yes,900.0,No,No,Yes,,,,Dufferin,40.3
1228,1300.0,"1274 Bloor St W, Toronto, ON M6H 1N8, Canada",,,,0,1 Year,"August 1, 2020",No,300.0,No,No,Outdoors only,,,,Lansdowne,3.15
924,2270.0,"18 Yonge St, Toronto, ON M5E 1Z8, Canada",,,,1,1 Year,"September 1, 2020",No,704.0,No,Yes,No,Yes,No,No,Union,7.733333
621,1948.0,"1330 Danforth Road, Scarborough, ON, M1J 1E8",,,,0,1 Year,,Yes,875.0,No,No,No,,,,Kennedy,28.283333


### Save copy of data

In [31]:
# Snapshot the filtered listings to disk. The numeric suffix of the file name
# looks like the scrape date (25-07-2020).
df_rentals.to_pickle('Kijiji_rentals25072020.pkl')

### Gmplot

In [13]:
import gmplot

In [4]:
# Centre the map on downtown Toronto, where the listings are located. The
# former centre (37.766956, -122.438481) is in San Francisco.
TORONTO_LAT, TORONTO_LNG = 43.6532, -79.3832
gmap = gmplot.GoogleMapPlotter(TORONTO_LAT, TORONTO_LNG, 13, apikey=apikey)

In [5]:
# Write the map to an HTML file (path is relative to the working directory).
gmap.draw("my_map.html")