# Address Search Analysis:

This notebook contains the data-querying and visualization functions needed for individual address analysis. It was written for ad-hoc exploration; its contents must be refactored before production use. In total, six features are rendered for an address:

- Full address and longitude/latitude search
- Number of rides taken to and from location this year and by month
- Top zip code zones rides are being taken to or from this year and by month
- Passenger count breakdown this year and by month
- Ride length this year and by month
- Average ride fare this year and by month

In [1]:
import json
import os
from datetime import datetime

import dateutil
import googlemaps
import mongoengine as me
import numpy as np
import pandas as pd
import requests
from bokeh import mpl
from bokeh.io import output_file, output_notebook, push_notebook, show
from bokeh.models import ColumnDataSource, HoverTool, LogColorMapper
from bokeh.plotting import figure
from bokeh.palettes import PuBu9
# Build a reversed *copy* instead of calling .reverse() on the imported object:
# the in-place call mutated bokeh's shared PuBu9 palette, so every re-run of this
# cell flipped the colour order back again, and it fails outright if the palette
# is an immutable tuple.
palette = PuBu9[::-1]
# import seaborn as sns
# %matplotlib inline

### Connections and Database Models

In [2]:
# mongoengine connection
me.connect('cruz-dev', host='localhost', port=27017)

# google api client
# API keys are credentials and must not live in the notebook. They are read from
# the environment instead; the dict keeps its original shape ("surya"/"graham")
# so any code that looks keys up by owner keeps working.
# NOTE(review): the keys that were previously committed in this cell should be
# treated as leaked and revoked in the Google Cloud console.
google_api_keys = {
    "surya": os.environ.get("GOOGLE_MAPS_API_KEY_SURYA"),
    "graham": os.environ.get("GOOGLE_MAPS_API_KEY_GRAHAM")}
if not google_api_keys["surya"]:
    # Fail here with an actionable message rather than deep inside the client.
    raise RuntimeError(
        "Set the GOOGLE_MAPS_API_KEY_SURYA environment variable before "
        "creating the Google Maps client.")
gmaps = googlemaps.Client(key=google_api_keys["surya"])

In [3]:
class Ride(me.Document):
    """A single ride stored in the ``rides_15`` collection.

    Holds the pickup/dropoff timestamps, zipcode/borough/county, geo points,
    fare breakdown, passenger count and trip distance, plus two proximity
    queryset managers and two flat export helpers.
    """

    pickup_datetime = me.DateTimeField()
    dropoff_datetime = me.DateTimeField()
    pickup_zipcode = me.IntField()
    pickup_borough = me.StringField()
    pickup_county = me.StringField()
    pickup_long_lat = me.PointField()
    dropoff_zipcode = me.IntField()
    dropoff_borough = me.StringField()
    dropoff_county = me.StringField()
    dropoff_long_lat = me.PointField()
    total_amount = me.FloatField()
    fare_amount = me.FloatField()
    tip_amount = me.FloatField()
    passenger_count = me.IntField()
    trip_distance = me.FloatField()

    @me.queryset_manager
    def pickups_nearby(doc_cls, queryset, long, lat, distance):
        """Rides picked up near ``[long, lat]``, newest pickup first.

        Called as ``Ride.pickups_nearby(long, lat, distance)``.
        ``distance`` is forwarded as the ``max_distance`` of the geo query
        (presumably metres for a 2dsphere index - TODO confirm).
        """
        nearby = queryset.filter(
            pickup_long_lat__near=[long, lat],
            pickup_long_lat__max_distance=distance)
        return nearby.order_by('-pickup_datetime')

    @me.queryset_manager
    def dropoffs_nearby(doc_cls, queryset, long, lat, distance):
        """Rides dropped off near ``[long, lat]``, newest dropoff first.

        Called as ``Ride.dropoffs_nearby(long, lat, distance)``; ``distance``
        has the same meaning as in ``pickups_nearby``.
        """
        nearby = queryset.filter(
            dropoff_long_lat__near=[long, lat],
            dropoff_long_lat__max_distance=distance)
        return nearby.order_by('-dropoff_datetime')

    @staticmethod
    def _point_coordinates(point):
        """Return the coordinate pair of a point field value.

        A GeoJSON-style dict yields its ``'coordinates'`` entry; anything else
        (a bare coordinate sequence, or ``None`` when the field is unset) is
        returned unchanged instead of raising ``TypeError``.
        """
        if isinstance(point, dict):
            return point['coordinates']
        return point

    def to_json(self):
        """Return the ride as a flat ``dict`` (not a JSON string).

        Point fields are reduced to their coordinate pair.
        NOTE(review): this name appears to shadow mongoengine's own
        ``Document.to_json`` - confirm nothing relies on the library version.
        """
        response = {"pickup_datetime": self.pickup_datetime,
                    "dropoff_datetime": self.dropoff_datetime,
                    "pickup_zipcode": self.pickup_zipcode,
                    "pickup_borough": self.pickup_borough,
                    "pickup_county": self.pickup_county,
                    "pickup_long_lat": self._point_coordinates(self.pickup_long_lat),
                    "dropoff_zipcode": self.dropoff_zipcode,
                    "dropoff_borough": self.dropoff_borough,
                    "dropoff_county": self.dropoff_county,
                    "dropoff_long_lat": self._point_coordinates(self.dropoff_long_lat),
                    "total_amount": self.total_amount,
                    "fare_amount": self.fare_amount,
                    "tip_amount": self.tip_amount,
                    "passenger_count": self.passenger_count,
                    "trip_distance": self.trip_distance
                   }
        return response

    def to_series(self):
        """Return the ride as a ``pandas.Series`` with the same keys as ``to_json``."""
        # Single source of truth for the exported fields: build the Series from
        # the dict so the two exports can never drift apart.
        return pd.Series(self.to_json())

    meta = {
        # Compound geo/datetime indexes (both orderings) plus
        # datetime/borough/zipcode indexes for pickups and dropoffs.
        'indexes': [[("pickup_long_lat", "2dsphere"), ("pickup_datetime", 1)],
                    [("dropoff_long_lat", "2dsphere"), ("dropoff_datetime", 1)],
                    [("pickup_datetime", 1), ("pickup_borough", 1), ("pickup_zipcode", 1)],
                    [("dropoff_datetime", 1), ("dropoff_borough", 1), ("dropoff_zipcode", 1)],
                    [("pickup_datetime", 1), ("pickup_long_lat", "2dsphere")],
                    [("dropoff_datetime", 1), ("dropoff_long_lat", "2dsphere")]],
        'collection': 'rides_15'
    }

In [4]:
def _nearby_rides_df(nearby_query, geocode_response, distance):
    """Run a ``Ride`` proximity query around the first geocoding hit.

    Parameters:
        nearby_query: callable taking ``(long, lat, distance)`` and returning
            an iterable of rides (``Ride.pickups_nearby`` / ``Ride.dropoffs_nearby``).
        geocode_response: non-empty list of geocoding results; only the first
            entry's ``["geometry"]["location"]`` is used.
        distance: search radius forwarded unchanged to ``nearby_query``.

    Returns:
        A DataFrame with one row per ride (columns from ``Ride.to_series``) and
        the default integer index. It is empty when no ride matches.
    """
    location = geocode_response[0]["geometry"]["location"]
    rides = nearby_query(location["lng"], location["lat"], distance)
    # The previous ``set_index(...)`` call discarded its result, so it never
    # changed the frame; its only observable effect was a KeyError when the
    # query returned no rides. It is dropped so the returned shape is unchanged
    # and an empty result comes back as an empty frame.
    return pd.DataFrame([ride.to_series() for ride in rides])


def get_pickups_nearby_df(geocode_response, distance):
    """DataFrame of rides picked up within ``distance`` of the geocoded location."""
    return _nearby_rides_df(Ride.pickups_nearby, geocode_response, distance)


def get_dropoffs_nearby_df(geocode_response, distance):
    """DataFrame of rides dropped off within ``distance`` of the geocoded location."""
    return _nearby_rides_df(Ride.dropoffs_nearby, geocode_response, distance)

In [10]:
# Geocode the search term and, when at least one match comes back, gather the
# rides picked up and dropped off within 50 distance units of the first match.
geocode_response = gmaps.geocode("PHD")
if len(geocode_response) == 0:
    # No usable match: report it; the ride frames are left undefined.
    error_message = "Malformed Google Maps API response.\nPlease try a different address, and if it still fails, check your API credentials."
    print(error_message)
else:
    full_address = geocode_response[0]['formatted_address']
    p_rides_df = get_pickups_nearby_df(geocode_response, 50)
    d_rides_df = get_dropoffs_nearby_df(geocode_response, 50)
    # Stack pickups and dropoffs, keeping each frame's own index labels.
    rides_df = pd.concat([p_rides_df, d_rides_df], ignore_index=False)



In [12]:
# Per-month zipcode frequencies for the choropleths.
#   pickup_frequency  - for rides dropped off near the address, the share (in %)
#                       that were picked up in each zipcode during the month
#   dropoff_frequency - for rides picked up near the address, the share (in %)
#                       that were dropped off in each zipcode during the month
# Shares are relative to ALL rides in the respective frame, not to the month.
pzf_df = pd.read_csv("zipcode_geojson.csv")

YEAR = 2015
MONTHS = ["January", "February", "March", "April", "May", "June", "July"]


def _month_bounds(year, month):
    """Return the half-open ``[start, end)`` datetimes of one calendar month."""
    month_start = datetime(year, month, 1)
    # Roll over into January of the next year instead of building month 13.
    if month == 12:
        month_end = datetime(year + 1, 1, 1)
    else:
        month_end = datetime(year, month + 1, 1)
    return month_start, month_end


def _with_zipcode_frequency(zip_df, rides, datetime_col, zipcode_col, freq_col,
                            month_start, month_end):
    """Return a copy of ``zip_df`` with a per-zipcode percentage column added.

    Rides whose ``datetime_col`` falls in ``[month_start, month_end)`` are
    counted per ``zipcode_col`` and expressed as a percentage of ``len(rides)``.
    Zipcodes without rides (or a month without rides) get ``0``.
    """
    # Inclusive lower bound so a ride at exactly midnight on the 1st is counted.
    in_month = (rides[datetime_col] >= month_start) & (rides[datetime_col] < month_end)
    if not in_month.any():
        result = zip_df.copy()
        result[freq_col] = 0.0
        return result
    counts = rides.loc[in_month, zipcode_col].value_counts()
    # Name the column explicitly: relying on the name value_counts() gives its
    # result breaks silently across pandas versions.
    frequencies = (100 * counts / len(rides)).to_frame(freq_col)
    result = pd.merge(zip_df, frequencies, how='left', left_on='zipcode', right_index=True)
    # Fill only the new column so missing names/boroughs are not turned into 0.
    result[freq_col] = result[freq_col].fillna(0)
    return result


# Collect the monthly frames and concatenate once; DataFrame.append in a loop is
# quadratic and no longer exists in pandas 2.x.
monthly_frames = []
for month_number, month in enumerate(MONTHS, start=1):
    month_start, month_end = _month_bounds(YEAR, month_number)
    append_df = pzf_df.drop(['latitude', 'longitude'], axis=1)
    append_df['month'] = month
    append_df = _with_zipcode_frequency(
        append_df, d_rides_df, 'pickup_datetime', 'pickup_zipcode',
        'pickup_frequency', month_start, month_end)
    append_df = _with_zipcode_frequency(
        append_df, p_rides_df, 'dropoff_datetime', 'dropoff_zipcode',
        'dropoff_frequency', month_start, month_end)
    monthly_frames.append(append_df)

zipcode_freq_df = pd.concat(monthly_frames)
# Zipcodes were parsed as numbers from the CSV; restore 5-character strings.
zipcode_freq_df['zipcode'] = zipcode_freq_df['zipcode'].astype(str).str.zfill(5)

In [13]:
# January dropoff choropleth: one patch per zipcode polygon, shaded by
# dropoff_frequency through a logarithmic colour mapper.
TOOLS = "pan,wheel_zoom,box_zoom,reset,hover,save"
color_mapper = LogColorMapper(palette=palette)

# Column-oriented data for January; the polygon outlines are stored as JSON
# text in pzf_df and are decoded back into lists of coordinates here.
january_rows = zipcode_freq_df['month'] == "January"
json_data = zipcode_freq_df[january_rows].to_dict('list')
for axis_name in ('longitude', 'latitude'):
    json_data[axis_name] = [json.loads(encoded) for encoded in pzf_df[axis_name]]
source = ColumnDataSource(json_data)

d_choropleth = figure(
    title="Dropoff Choropleth for {0}".format(full_address),
    tools=TOOLS,
    x_axis_location=None,
    y_axis_location=None,
)
d_choropleth.grid.grid_line_color = None

d_choropleth.patches(
    'longitude', 'latitude',
    source=source,
    fill_color={'field': 'dropoff_frequency', 'transform': color_mapper},
    fill_alpha=0.7,
    line_color="black",
    line_width=0.5,
)

# Configure the hover tool that TOOLS added to the figure.
hover = d_choropleth.select_one(HoverTool)
hover.point_policy = "follow_mouse"
hover.tooltips = [
    ("Name", "@name"),
    ("Zipcode", "@zipcode"),
    ("Borough", "@borough"),
    ("Dropoff Frequency", "@dropoff_frequency%"),
    ("(Long, Lat)", "($x, $y)"),
]

show(d_choropleth)

### Additional Snippets:

Generating and saving a pandas dataframe compatible with bokeh's column data source. This dataframe is meant to represent the frequency data used in the dropoff and pickup zipcode choropleths.

In [None]:
# One-off export of the NYC zipcode polygons to "zipcode_geojson.csv".
# The zipcode-frequency cell reads that CSV, so this cell has to be run once
# before it on a fresh checkout.
zipcode_geojson_url = "http://catalog.civicdashboards.com/dataset/11fd957a-8885-42ef-aa49-5c879ec93fac/resource/28377e88-8a50-428f-807c-40ba1f09159b/download/nyc-zip-code-tabulation-areas-polygons.geojson"
# Bound the wait and surface HTTP errors instead of hanging indefinitely or
# trying to parse an error page as JSON.
zipcode_geojson_response = requests.get(zipcode_geojson_url, timeout=30)
zipcode_geojson_response.raise_for_status()
zipcode_geojson = zipcode_geojson_response.json()

# Single pass over the features; one record per zipcode polygon.
zipcode_records = []
for feature in zipcode_geojson['features']:
    feature_properties = feature['properties']
    # Only the first coordinate ring of each geometry is used for the outline.
    first_ring = feature['geometry']['coordinates'][0]
    zipcode_records.append({
        'name': feature_properties['PO_NAME'],
        'zipcode': feature_properties['postalCode'],
        'borough': feature_properties['borough'],
        'longitude': [coord[0] for coord in first_ring],
        'latitude': [coord[1] for coord in first_ring],
    })

# Explicit column list keeps the CSV layout fixed (and defined even when empty).
pzf_df = pd.DataFrame(
    zipcode_records,
    columns=['name', 'zipcode', 'borough', 'longitude', 'latitude'])
pzf_df.to_csv("zipcode_geojson.csv", index=False)

# zipcode_freq_df = pd.DataFrame()
# for month in ["January", "February", "March", "April", "May", "June", "July"]:
#     pzf_df['Month'] = month
#     zipcode_freq_df = zipcode_freq_df.append(pzf_df)
# zipcode_freq_df.to_csv("bokeh_zipcode_geojson.csv", index=False)

Using geoplotlib to generate choropleths for pickup and dropoff rates.

In [None]:
def get_p_color(properties):
    """Colour for a zipcode feature based on its pickup weight.

    Looks up ``properties["postalCode"]`` in ``p_zipcode_weight_dict`` and maps
    the weight linearly (relative to the largest weight) through ``cmap``.
    Zipcodes without a weight get ``[0, 0, 0, 0]``.
    """
    zipcode = properties["postalCode"]
    if zipcode not in p_zipcode_weight_dict:
        return [0, 0, 0, 0]
    weight = p_zipcode_weight_dict.get(zipcode)
    heaviest = max(p_zipcode_weight_dict.values())
    return cmap.to_color(weight, heaviest, 'lin')
def get_d_color(properties):
    """Colour for a zipcode feature based on its dropoff weight.

    Looks up ``properties["postalCode"]`` in ``d_zipcode_weight_dict`` and maps
    the weight linearly (relative to the largest weight) through ``cmap``.
    Zipcodes without a weight get ``[0, 0, 0, 0]``.
    """
    zipcode = properties["postalCode"]
    if zipcode not in d_zipcode_weight_dict:
        return [0, 0, 0, 0]
    weight = d_zipcode_weight_dict.get(zipcode)
    heaviest = max(d_zipcode_weight_dict.values())
    return cmap.to_color(weight, heaviest, 'lin')

# Draw the dropoff choropleth over the zipcode polygons and show it.
# NOTE(review): `ColorMap` and `gplt` are not imported in the notebook's import
# cell - presumably geoplotlib; confirm and import before running.
cmap = ColorMap('Blues', alpha=255, levels=10)
map_bounds = gplt.utils.BoundingBox(north=40.9, west=-74.2, south=40.5, east=-73.7)
gplt.geojson(
    zipcode_geojson,
    fill=True,
    color=get_d_color,
    f_tooltip=lambda properties: properties['postalCode']
)
gplt.set_bbox(map_bounds)
gplt.show()

Using seaborn to generate violin plots to represent fare for different days of the week.

In [None]:
# Violin plot of total fare per weekday, split by ride type, rendered via bokeh.
# NOTE(review): needs seaborn available as `sns` (its import in the first cell
# is commented out) and `rides_df` to carry "day" and "type" columns, which the
# ride export shown in this notebook does not produce - derive them first.
sns.set_style("whitegrid")
ax = sns.violinplot(
    x="day", y="total_amount", hue="type", data=rides_df,
    palette="Set2", split=True, scale="count", inner="quartile",
    order=["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"])
output_file("ex.html")
# to_bokeh converts a matplotlib Figure; violinplot returns the Axes, so hand
# over the figure that owns it rather than the Axes object itself.
show(mpl.to_bokeh(ax.get_figure()))