In [1]:
# Modules
import time
from urllib.parse import quote_plus
from urllib.request import urlopen

import numpy as np
from bs4 import BeautifulSoup

In [2]:
# Load
# Results cached by an earlier run. `load` records whether a cache was read so
# that later cells know whether `map_cache` already exists.
map_file = 'map_cache.npy'
load = True
try:
    map_cache = np.load(map_file)
except (OSError, ValueError, EOFError):
    # Missing, unreadable, empty or non-array file: start without a cache.
    # Only load failures are handled here so that unrelated errors (and
    # Ctrl-C) are no longer silently swallowed.
    load = False

apt_file = 'apt_file.npy'
try:
    # TODO (from the original author): make this compatible with the data
    # exported by apt.py and filter it here.
    # NOTE(review): this path assigns only `apt_list`; `addr_list` is defined
    # solely in the fallback below, so a loaded file currently leaves the
    # addresses undefined -- confirm the intended file layout.
    apt_list = np.load(apt_file)
except (OSError, ValueError, EOFError):
    # No exported apartment file: fall back to the built-in list.
    # `apt_list[i]` and `addr_list[i]` describe the same apartment.
    apt_list = [
        'Royal York',
        'Oakland Apartments',
        'Oak Hill Apartments',
        'The Bridge on Forbes',
        'Schenley Apartments',
        'One on Centre',
        'Portal Place',
        'Devon Towers',
        'Amberson Gardens',
        'Webster Hall',
        'Fairfax Apartments',
        'Ambassador Apartments',
        'North Windsor Apartments',
        'Shadyside Commons',
    ]
    addr_list = [
        '3955 Bigelow Blvd, Pittsburgh, PA 15213',
        '4629 Bayard St, Pittsburgh, PA 15213',
        '475 Garner Ct, Pittsburgh, PA 15213',
        '3423 Forbes Ave, Pittsburgh, PA 15213',
        '4101 Bigelow Blvd, Pittsburgh, PA 15213',
        '4500 Centre Ave, Pittsburgh, PA 15213',
        '2633 Fifth Ave, Pittsburgh, PA 15213',
        '4920 CENTRE Ave, Pittsburgh, PA 15213',
        '1-4 Bayard Rd, Pittsburgh, PA 15213',
        '101 N Dithridge St, Pittsburgh, PA 15213',
        '4614 5th Ave, Pittsburgh, PA 15213',
        '4733 Centre Ave, Pittsburgh, PA 15213',
        '234 Melwood Ave, Pittsburgh, PA 15213',
        '401 Amberson Ave, Pittsburgh, PA 15232',
    ]


In [3]:
# Map Helper
def _int_before_comma(text, start_idx):
    """Parse the integer in ``text`` running from ``start_idx`` to the next comma.

    Returns ``(value, comma_idx)``. Raises ``ValueError`` when no comma follows
    or when the text in between is not an integer.
    """
    comma_idx = text.find(',', start_idx)
    if comma_idx == -1:
        # Without this guard the slice would end at -1 and silently drop the
        # last character of the text.
        raise ValueError('no comma-terminated number at offset %d' % start_idx)
    return int(text[start_idx:comma_idx]), comma_idx


def shortest_distance_time(arg):
    """Extract the shortest route distance and its travel time from a page.

    ``str(arg)`` is scanned for every occurrence of ``' mile'``. For each one,
    the integer directly after the nearest preceding ``'['`` is read as the
    distance, and the integer directly after the next ``'['`` is read as the
    matching time, i.e. records shaped like
    ``[1385,"0.9 miles",1],[366,"6 mins"]``.
    (Units are presumably meters and seconds -- confirm against the page.)

    Parameters
    ----------
    arg : object
        Anything whose ``str()`` is the page text (e.g. a parsed document).

    Returns
    -------
    list
        ``[shortest_distance, shortest_time]``; ``[0, 0]`` when the text has
        no ``' mile'`` occurrence. When several routes share the shortest
        distance, the smallest of their times is returned.

    Raises
    ------
    ValueError
        If a ``' mile'`` occurrence is not part of a well-formed record
        (no opening bracket, no comma, or a non-integer field).

    Notes
    -----
    0 doubles as the "nothing found yet" marker, so a parsed distance of 0
    is replaced by any later route.
    """
    text = str(arg)
    shortest_distance = 0
    shortest_time = 0
    cursor = 0
    while True:
        mile_idx = text.find(' mile', cursor + 1)
        if mile_idx == -1:
            break

        # The distance is the first field of the record containing ' mile'.
        bracket_idx = text.rfind('[', 0, mile_idx)
        if bracket_idx == -1:
            raise ValueError("' mile' at offset %d is not inside a [...] record" % mile_idx)
        distance, cursor = _int_before_comma(text, bracket_idx + 1)

        if shortest_distance != 0 and distance > shortest_distance:
            # Longer route: jump past the end of this record and keep looking.
            cursor = text.find(']', cursor)
            if cursor == -1:
                break
            continue

        same_distance = shortest_distance != 0 and distance == shortest_distance
        shortest_distance = distance

        # The time is the first field of the next record.
        time_bracket_idx = text.find('[', cursor)
        if time_bracket_idx == -1:
            raise ValueError('distance record at offset %d has no time record' % bracket_idx)
        route_time, cursor = _int_before_comma(text, time_bracket_idx + 1)

        # On equal distances keep the faster route instead of the last one seen.
        shortest_time = min(shortest_time, route_time) if same_distance else route_time
    return [shortest_distance, shortest_time]

# Helper: build the directions query for one apartment, fetch the page for each
# travel mode, parse it and return one row of data.
# Sample data: ['APT_NAM', 'DIS_DRI', 'TIM_DRI', 'DIS_WAL', 'TIM_WAL', 'DIS_BIC', 'TIM_BIC']
travel_mode = ['driving', 'walking', 'bicycling'] #, 'transit'] # do not support transit now, noise in sub-routes
def goog_map(apt_name, apt_addr):
    """Fetch Google Maps directions from one apartment to CMU for every travel mode.

    For each entry of ``travel_mode`` the directions page is downloaded,
    dumped to ``'<apt_name> <mode>.txt'`` in the working directory, and
    parsed with ``shortest_distance_time``.

    Parameters
    ----------
    apt_name : str
        Apartment name; also used as the first element of the result and as
        the prefix of the dump file names.
    apt_addr : str
        Street address appended to the name to form the route origin.

    Returns
    -------
    list
        ``[apt_name, dist, time, dist, time, ...]`` with one distance/time
        pair per travel mode, in ``travel_mode`` order.

    Raises
    ------
    Whatever ``urlopen``, the file write or ``shortest_distance_time`` raise;
    nothing is caught here.
    """
    goog_map_data = [apt_name]
    # Escape the origin properly ('&', '#', non-ASCII, ...) instead of only
    # replacing spaces. Commas are kept literal so the URL is unchanged for
    # ordinary "street, city, state zip" addresses.
    origin = quote_plus(apt_name + ' ' + apt_addr, safe=',')
    for tm in travel_mode:
        link = 'https://www.google.com/maps/dir/?api=1&origin=' + \
               origin + \
               '&destination=CMU&travelmode=' + tm
        # Close the HTTP response as soon as the body has been read.
        with urlopen(link) as response:
            bsyc = BeautifulSoup(response.read(), "lxml")
        time.sleep(5) # pause between consecutive requests
        # Keep a copy of the fetched page; `with` closes the file even if
        # the write fails.
        with open(apt_name+' '+tm+'.txt', 'wt', encoding='utf-8') as fout:
            fout.write(str(bsyc))
        goog_map_data += shortest_distance_time(bsyc)
    return goog_map_data

In [4]:
# Search
# Apartment names already present in the cache (column 0 of map_cache);
# these are not queried again.
if load:
    apt_cache = map_cache[:,0].tolist()
else:
    apt_cache = [] # still needed to avoid duplicates within this run
apt_data_list = [] # rows fetched in this run
for apt, addr in zip(apt_list, addr_list):
    # Data in cache, skip search
    if apt in apt_cache:
        continue
    try:
        # Read one data row
        apt_data = goog_map(apt, addr)
    except (Exception, KeyboardInterrupt) as err:
        # Stop at the first failure (or manual interrupt) but keep what was
        # fetched so far; report the cause instead of hiding it.
        print('Unable to read more map. Save current data.')
        print('Reason: ' + repr(err))
        break
    # Add data
    apt_data_list.append(apt_data)
    # Avoid duplicate
    apt_cache.append(apt)

# Add new data to cache
if apt_data_list:
    new_rows = np.array(apt_data_list)
    if load:
        map_cache = np.concatenate((map_cache, new_rows), axis = 0)
    else:
        map_cache = new_rows
elif not load:
    # No cache on disk and nothing fetched: define an empty table (name plus
    # one distance/time pair per travel mode) so that the following cell can
    # still print and save `map_cache` instead of failing with NameError.
    map_cache = np.empty((0, 1 + 2 * len(travel_mode)), dtype=str)
# else: cache loaded and nothing new, leave map_cache unchanged

In [5]:
# Show the full cache (rows loaded from disk plus rows fetched in this run),
# then write it back to disk.
print(map_cache)
np.save(file=map_file, arr=map_cache)

[['Royal York' '1385' '366' '1364' '1006' '1385' '371']
 ['Oakland Apartments' '943' '267' '943' '736' '943' '263']
 ['Oak Hill Apartments' '2869' '635' '2432' '1870' '3105' '678']
 ['The Bridge on Forbes' '1415' '278' '1415' '1090' '1415' '321']
 ['Schenley Apartments' '1408' '327' '1367' '997' '1438' '345']
 ['One on Centre' '1186' '319' '1186' '899' '1186' '290']
 ['Portal Place' '3627' '551' '1991' '1524' '1991' '606']
 ['Devon Towers' '1560' '299' '1410' '1058' '1483' '363']
 ['Amberson Gardens' '1464' '357' '1255' '950' '1387' '363']
 ['Webster Hall' '1200' '321' '634' '475' '631' '202']
 ['Fairfax Apartments' '551' '154' '481' '376' '551' '165']
 ['Ambassador Apartments' '1364' '304' '1342' '1037' '1302' '399']
 ['North Windsor Apartments' '1338' '392' '1027' '793' '1338' '390']
 ['Shadyside Commons' '1468' '321' '1374' '1066' '1391' '388']]
