Visualizing the nearest neighbors algorithm



In [12]:
!pip install plotly
import plotly
plotly.offline.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import numpy as np
import statistics
import math
from folium import plugins
from scipy.ndimage import imread
# The first part of the code defines the various helper functions we need; most are self-explanatory.
import json
# First, read the file. The context manager closes the file handle as soon as
# parsing finishes instead of leaving it open for the rest of the session.
with open('trips.json') as trips_file:
    # Then, convert the contents to Python objects; the rest of this notebook
    # treats the result as a list of dictionaries (one per trip).
    trips = json.load(trips_file)
def float_values(trips):
    """Convert every value of every trip dictionary to ``float``, in place.

    Each dictionary in ``trips`` is modified directly (no copies are made)
    and the same ``trips`` object that was passed in is returned. Any value
    that ``float()`` cannot convert makes ``float()`` raise.
    """
    for record in trips:
        # Only existing keys are reassigned, so the dictionary never changes
        # size while it is being iterated.
        for field in record:
            record[field] = float(record[field])
    return trips

def parse_trips(trips):
    """Return trimmed copies of ``trips`` holding only the fields we plot.

    For each trip a new dictionary is built that keeps just the
    'trip_distance', 'pickup_latitude' and 'pickup_longitude' entries (when
    present). The input dictionaries are left untouched.
    """
    wanted = ('trip_distance', 'pickup_latitude', 'pickup_longitude')
    return [
        {field: value for field, value in trip.items() if field in wanted}
        for trip in trips
    ]


def location(trip):
    """Return the pickup position of ``trip`` as ``[latitude, longitude]``.

    Raises ``KeyError`` if either pickup coordinate is missing from ``trip``.
    """
    return [trip[axis] for axis in ('pickup_latitude', 'pickup_longitude')]

def distance_location(selected_trip, neighbor_trip):
    """Return the straight-line distance between two trips' pickup points.

    This is the plain Euclidean distance computed directly on the
    [latitude, longitude] pairs returned by ``location``, so the result is in
    the same units as those coordinates (it is not converted to miles or km).
    """
    lat_a, lon_a = location(selected_trip)
    lat_b, lon_b = location(neighbor_trip)
    return ((lat_a - lat_b) ** 2 + (lon_a - lon_b) ** 2) ** .5

def distance_between_neighbors(selected_trip, neighbor_trip):
    """Annotate ``neighbor_trip`` with its distance from ``selected_trip``.

    The distance is stored under the 'distance_from_selected' key of the
    dictionary that was passed in (it is mutated, not copied), and that same
    dictionary is returned.
    """
    separation = distance_location(selected_trip, neighbor_trip)
    neighbor_trip.update(distance_from_selected=separation)
    return neighbor_trip

def distance_all(selected_individual, neighbors):
    """Return a list of every neighbor, each tagged with its distance.

    Every element of ``neighbors`` is passed through
    ``distance_between_neighbors`` together with ``selected_individual``; the
    results are collected in the original order.
    """
    return [
        distance_between_neighbors(selected_individual, candidate)
        for candidate in neighbors
    ]


def nearest_neighbors(selected_trip, trips, number = 3):
    """Return the ``number`` trips whose pickup points are closest to ``selected_trip``.

    Parameters:
        selected_trip: dictionary with 'pickup_latitude' and
            'pickup_longitude' entries for the point of interest.
        trips: iterable of trip dictionaries to search.
        number: how many neighbors to return (default 3).

    Returns:
        A list of at most ``number`` trips, ordered from nearest to farthest.
        An empty list is returned when ``trips`` is empty.

    The distance of each candidate is computed by ``distance_all`` and read
    back from its 'distance_from_selected' entry.
    """
    ranked = sorted(
        distance_all(selected_trip, trips),
        key=lambda neighbor: neighbor['distance_from_selected'],
    )
    # If the closest candidate sits exactly on the selected point it is
    # dropped (only that first one). The ``ranked and`` guard keeps an empty
    # candidate list from raising IndexError on ``ranked[0]``.
    if ranked and ranked[0]['distance_from_selected'] == 0:
        ranked.pop(0)
    return ranked[:number]

def mean_distance(neighbors):
    """Return the arithmetic mean of the 'trip_distance' values in ``neighbors``.

    ``statistics.mean`` raises ``StatisticsError`` when ``neighbors`` is empty.
    """
    lengths = [neighbor['trip_distance'] for neighbor in neighbors]
    return statistics.mean(lengths)



# Here is where to change the K value: edit the default of ``k`` below, or
# pass ``k=...`` when calling zvalue directly.
def zvalue(x, y, k=7):
    """Return the mean trip distance of the ``k`` nearest trips to a point.

    Parameters:
        x: latitude of the sample point (used as 'pickup_latitude').
        y: longitude of the sample point (used as 'pickup_longitude').
        k: number of nearest neighbors to average over (default 7, the value
           that used to be hard-coded here).

    Returns:
        The mean 'trip_distance' of the ``k`` trips in the module-level
        ``cleaned_trips`` list that are nearest to (x, y).
    """
    trip = dict(pickup_latitude=x, pickup_longitude=y)
    # ``cleaned_trips or []`` falls back to an empty list if cleaned_trips is
    # None or empty.
    nearest_n_neighbors = nearest_neighbors(trip, cleaned_trips or [], number=k)
    return mean_distance(nearest_n_neighbors)




# This was the attempt at a Mercator projection; still being worked on.
def deg2num(lat_deg, lon_deg, zoom):
    """Project a latitude/longitude given in degrees to ``[xtile, ytile]``.

    Both outputs are scaled by ``2 ** zoom`` and returned as floats (they are
    not truncated to integers).
    NOTE(review): this looks like the OpenStreetMap "slippy map" tile formula
    without the final int() step -- confirm that is the intent.
    """
    phi = math.radians(lat_deg)
    scale = 2.0 ** zoom
    column = (lon_deg + 180.0) / 360.0 * scale
    tangent = math.tan(phi)
    secant = 1 / math.cos(phi)
    row = (1.0 - math.log(tangent + secant) / math.pi) / 2.0 * scale
    return [column, row]
    
# First we clean the data: keep only the trip distance and the pickup
# coordinates of each trip, then convert those values to floats.
parsed_trips = parse_trips(trips)
# float_values converts in place and returns the list it was given, so
# cleaned_trips and parsed_trips refer to the same list object.
cleaned_trips = float_values(parsed_trips)



# These are the coordinates of the bottom-left (min) and top-right (max)
# corners of the sample area; they determine the area to be sampled.
# NOTE: throughout this cell the "x" names hold latitudes and the "y" names
# hold longitudes (zvalue passes x as pickup_latitude, y as pickup_longitude).
xminll=40.701030
xmaxll=40.816453
yminll=-74.029704
ymaxll=-73.921881

# These are the other two corners of the green box; more info about that below.
xbr=40.708998
ybr=-73.974842
xtl=40.816453
ytl=-73.968224

numpoints=50 # number of x points and of y points, so the total number of sample points is this squared
xlist=list(np.linspace(xminll,xmaxll,numpoints))
ylist=list(np.linspace(yminll,ymaxll,numpoints))

# This creates a grid over the specified area with numpoints^2 evenly spaced sample points.
xx,yy = np.meshgrid(xlist, ylist)

# This takes our nearest neighbor function from the lab and iterates it over our
# sample grid to get a matrix of z values. zvalue is called once per grid point
# and each call sorts every trip, so this step gets slow as numpoints grows.
vectorz=np.vectorize(zvalue)
zgrid=vectorz(xx,yy)

import folium
import os

# This creates one circle marker for each corner of the green box:
# the two sample-area corners first, then the two extra box corners.
# Each location is given as [latitude, longitude].
marker = folium.CircleMarker(location = [xminll, yminll], radius=10)
marker2 = folium.CircleMarker(location = [xmaxll, ymaxll], radius=10)
marker3 = folium.CircleMarker(location = [xbr, ybr], radius=10)
marker4 = folium.CircleMarker(location = [xtl, ytl], radius=10)

# The folium map the markers are drawn on, then the markers are attached to it.
manhattan_map = folium.Map(location=[40.7589, -73.9851], zoom_start=12)
marker.add_to(manhattan_map)
marker2.add_to(manhattan_map)
marker3.add_to(manhattan_map)
marker4.add_to(manhattan_map)

# Back to the plotly code.
# This turns off all the extra decoration (grid, zero line, axis line, ticks,
# tick labels) so we just get the plot itself. Change these settings if you
# want titles, legends, etc.
xaxis=dict(
        
        showgrid=False,
        zeroline=False,
        showline=False,
        ticks='',
        showticklabels=False)

layout = go.Layout(yaxis=dict(scaleanchor="x", scaleratio=1, showgrid=False, # scaleanchor/scaleratio keep the scale of x and y the same
    zeroline=False,
    showline=False,
    ticks='',
    showticklabels=False),xaxis=xaxis)

# This adds the green box to the contour plot: a closed path that goes from the
# (xminll, yminll) corner to (xbr, ybr), to (xmaxll, ymaxll), to (xtl, ytl),
# and back to the start.
layout.update(dict(shapes = [
        {
            'type': 'path',
            'path':f'M {xminll},{yminll} L{xbr},{ybr} L{xmaxll},{ymaxll} L{xtl},{ytl} L{xminll},{yminll}',
            'opacity': 1,
            'line': {
                'width': 2,
                'color':'rgb(29, 240, 4)',
            }
        }]
        ))
# This puts all the plotly pieces together, ready to be plotted. Note that the
# contour's x axis carries xlist (latitudes) and its y axis carries ylist
# (longitudes).
data=[go.Contour(x=xlist,y=ylist,z=zgrid, showscale=False, visible=True)]
fig=go.Figure(data=data, layout=layout)



The idea behind this little exercise was to help visualize the taxi cab data and the nearest neighbors algorithm by taking a grid of coordinates and, at each point, seeing what trip length the NN algorithm gives us. The contour plot function in plotly does a good job of depicting this data.

I've commented in the code (just above the `zvalue` function, which calls the nearest neighbors function) how to change the k value. One of the most useful things this visualization might help with is seeing how changing the k value changes our results.
I've also labeled in the code where you can change the area to be sampled and the number of sample points to take; you can play around with those to see how they affect the plot.

`manhattan_map` displays the folium map of Manhattan, with markers that correspond to the corners of the green box on the contour map.

The cell below `manhattan_map` displays the contour map itself, with the green box giving us a reference for the (approximate) boundaries of Manhattan. The warmer colors indicate areas where the nearest neighbors algorithm gives longer average trip distances.


On to the (many) issues:

The distortion comes from plotting spherical coordinates (longitude and latitude) on a rectangle. I have some equations for the coordinate projection, but I'm still trying to get them to work right. Even with the distortion, when you mouse over the contour plot it gives you longitude/latitude coordinates that can then be plotted in Folium (which does the coordinate projection internally). What I would really like is to get the contour plot to project correctly and then export it as an image file, because Folium can take an image file as a semi-transparent overlay, so we could see the contours plotted on the map itself.


In [10]:
manhattan_map

In [11]:
plotly.offline.iplot(fig)