In [1]:
import pandas as pd
from datetime import datetime
import googlemaps
import pprint
import json
import urllib.parse as parse
from tqdm import tqdm
import numpy as np
# Local module holding the API key (kept out of the notebook itself).
# Import it by module name: `import config.py` would ask Python for a
# submodule called `py` inside a package called `config` and fail.
import config
MAPS_KEY = config.MAPS_KEY
# Shared Google Maps client used by the API cells below.
gmaps = googlemaps.Client(key=MAPS_KEY)
# Pretty-printer for inspecting nested API responses.
pp = pprint.PrettyPrinter(indent=4)

In [2]:
# Load the raw listing data from a CSV file in the working directory.
df = pd.read_csv('listing_data.csv')
# Preview the first rows.
df.head()

Unnamed: 0,listing_id,date,addr_lat,addr_lon,agent_id
0,1110988,2018-09-12,40.780602,-73.956398,98438.0
1,1272052,2018-09-12,40.758301,-73.959503,12749.0
2,1280705,2018-09-12,40.5938,-73.974503,234944.0
3,1222165,2018-09-12,40.770401,-73.963699,107927.0
4,1238968,2018-09-12,40.709099,-74.013702,212974.0


#### It seems some agent IDs are missing. Since the exercise asks us to identify the agent who walked the most in one day, we can either drop the rows with a missing agent ID or fill them with the agent who appears most often.

In [3]:
# Count the non-null values in each column to check for missing data.
df.count()

listing_id    399391
date          399391
addr_lat      399391
addr_lon      399391
agent_id      399274
dtype: int64

#### Very interesting to see that there aren't many unique values. I assume this is because agents need to keep showing the same apartment until it is off the market.

In [4]:
# Number of distinct values in each column.
df.nunique()

listing_id    17421
date             17
addr_lat       4518
addr_lon       4558
agent_id       8423
dtype: int64

#### Sort by date and agent ID, then drop the rows with N/As

In [5]:
# Order the listings by day and then by agent, discard rows with any missing
# value, and renumber the rows from 0. reset_index() keeps the previous row
# labels in an extra 'index' column.
sorted_df = (
    df
    .sort_values(by=['date', 'agent_id'])
    .dropna()
    .reset_index()
)
# Show the first ten rows of the result.
sorted_df.head(10)

Unnamed: 0,index,listing_id,date,addr_lat,addr_lon,agent_id
0,110132,1331217,2018-09-01,40.763302,-73.9701,7335.0
1,110235,1341962,2018-09-01,40.763302,-73.9701,7335.0
2,163089,1350380,2018-09-01,40.763302,-73.9701,7335.0
3,381637,1330931,2018-09-01,40.765598,-73.976898,7335.0
4,62443,1352194,2018-09-01,40.7803,-73.9869,7337.0
5,128146,1305816,2018-09-01,40.7691,-73.981598,7340.0
6,256850,1317380,2018-09-01,40.7691,-73.981598,7340.0
7,288710,1357088,2018-09-01,40.7691,-73.981598,7340.0
8,319500,1283629,2018-09-01,40.7691,-73.981598,7340.0
9,381517,1325545,2018-09-01,40.7691,-73.981598,7340.0


In [None]:
# Build one (origin -> destination) leg for every pair of consecutive rows in
# sorted_df that share both the same date and the same agent. sorted_df is
# sorted by date and then agent_id, so such rows sit next to each other.
origins = []
destinations = []
agent_id = []
date = []

# Pull the columns out once as arrays instead of doing a Series lookup for
# every element inside the loop.
dates = sorted_df['date'].values
agents = sorted_df['agent_id'].values
lats = sorted_df['addr_lat'].values
lons = sorted_df['addr_lon'].values

# Stop one row early: the last row has no successor to pair with. This avoids
# running off the end of the frame and relying on a KeyError to terminate,
# which would also hide a genuine KeyError (e.g. a missing column).
for i in tqdm(range(len(sorted_df) - 1)):
    if dates[i] == dates[i + 1] and agents[i] == agents[i + 1]:
        # Coordinates are passed to the API as "lat,lon" strings.
        origins.append(str(lats[i]) + "," + str(lons[i]))
        destinations.append(str(lats[i + 1]) + "," + str(lons[i + 1]))
        agent_id.append(agents[i])
        date.append(dates[i])

# Same end-of-scan notice the cell has always printed.
print("No more agents travelled on same day")

100%|██████████| 399274/399274 [00:33<00:00, 11746.32it/s]

No more agents travelled on same day





In [None]:
# Ask the Directions API for the walking duration of every leg.
# direction[k] is the duration text for origins[k] -> destinations[k], or None
# when the API returned no route for that leg.
direction = []
# (origin, destination) -> duration text. Many legs repeat the same pair of
# coordinates, so each distinct pair is requested only once.
route_cache = {}
for origin, destination in tqdm(zip(origins, destinations), total=len(origins)):
    pair = (origin, destination)
    if pair not in route_cache:
        routes = gmaps.directions(origin=origin, destination=destination, mode='walking')
        # An empty result would make routes[0] raise IndexError; record None
        # instead so the list stays aligned with origins/destinations.
        # NOTE(review): 'text' is a display string; the API's
        # ['duration']['value'] field is presumably what is needed for numeric
        # aggregation - confirm before summing walk times.
        route_cache[pair] = routes[0]['legs'][0]['duration']['text'] if routes else None
    direction.append(route_cache[pair])
# A bare `return` is not valid outside a function; preview the result instead.
direction[:5]

  4%|▍         | 10064/267169 [45:57<19:34:00,  3.65it/s]

In [None]:
# Combine the per-leg results into one table: walking duration, agent, date.
labels = ['walk-time', 'agent', 'dates']
# direction[k], agent_id[k] and date[k] all describe the k-th leg. zip() pairs
# them row by row and stops at the shortest list, so a partially completed
# directions run still yields a correctly aligned (shorter) table.
# Building from rows also keeps each column's own dtype, whereas
# np.column_stack would coerce every value to a string.
# NOTE: this rebinds `df`, which earlier held the raw listing data.
df = pd.DataFrame(list(zip(direction, agent_id, date)), columns=labels)
df.head()

#### Trying out the Google Maps API. Ultimately I will be using the Directions API.

In [None]:
# Reverse-geocode every listing's coordinates and print the URL-encoded
# street address followed by the agent id.
# The loop is bounded by sorted_df itself, because that is the frame being
# indexed; it has fewer rows than the raw frame after dropna().
for row in range(len(sorted_df)):
    reverse_geocode = gmaps.reverse_geocode((sorted_df['addr_lat'][row], sorted_df['addr_lon'][row]),
                                            location_type='ROOFTOP')
    # The ROOFTOP filter can leave no match; skip such rows rather than fail
    # on reverse_geocode[0].
    if not reverse_geocode:
        continue

    print(parse.quote_plus(reverse_geocode[0]['formatted_address']) + ' ' + str(sorted_df['agent_id'][row]))