In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from os.path import join

In [2]:
train_path = join("data","ny_taxi",'train.csv')
test_path = join("data","ny_taxi",'test.csv')

In [3]:
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [5]:
train_df.isnull().sum()

id                    0
vendor_id             0
pickup_datetime       0
dropoff_datetime      0
passenger_count       0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude     0
dropoff_latitude      0
store_and_fwd_flag    0
trip_duration         0
dtype: int64

In [6]:
train_df['store_and_fwd_flag'].value_counts()

N    1450599
Y       8045
Name: store_and_fwd_flag, dtype: int64

In [7]:
from math import radians, cos, sin, asin, sqrt
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
#     print(type(lon1))
    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    km = 6367 * c
    return km

In [8]:
result = haversine(train_df['pickup_longitude'][1],train_df['pickup_latitude'][1],train_df['dropoff_longitude'][1],train_df['dropoff_latitude'][1])

In [9]:
result

1.8043735902887306

In [10]:
train_df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [11]:
train_df['dist'] = 0.

In [12]:
train_df['dist'] = train_df.apply(lambda  train_df : haversine(train_df['pickup_longitude'],train_df['pickup_latitude'],train_df['dropoff_longitude'],train_df['dropoff_latitude']), axis=1)

In [13]:
train_df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,dist
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455,1.49758
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663,1.804374
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124,6.38109
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429,1.484566
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435,1.187842


In [14]:
def get_arrows(locations, color='blue', size=6, n_arrows=3):
    
    '''
    Get a list of correctly placed and rotated 
    arrows/markers to be plotted
    
    Parameters
    locations : list of lists of lat lons that represent the 
                start and end of the line. 
                eg [[41.1132, -96.1993],[41.3810, -95.8021]]
    arrow_color : default is 'blue'
    size : default is 6
    n_arrows : number of arrows to create.  default is 3
    Return
    list of arrows/markers
    '''
    
    Point = namedtuple('Point', field_names=['lat', 'lon'])
    
    # creating point from our Point named tuple
    p1 = Point(locations[0][0], locations[0][1])
    p2 = Point(locations[1][0], locations[1][1])
    
    # getting the rotation needed for our marker.  
    # Subtracting 90 to account for the marker's orientation
    # of due East(get_bearing returns North)
    rotation = get_bearing(p1, p2) - 90
    
    # get an evenly space list of lats and lons for our arrows
    # note that I'm discarding the first and last for aesthetics
    # as I'm using markers to denote the start and end
    arrow_lats = np.linspace(p1.lat, p2.lat, n_arrows + 2)[1:n_arrows+1]
    arrow_lons = np.linspace(p1.lon, p2.lon, n_arrows + 2)[1:n_arrows+1]
    
    arrows = []
    
    #creating each "arrow" and appending them to our arrows list
    for points in zip(arrow_lats, arrow_lons):
        arrows.append(folium.RegularPolygonMarker(location=points, 
                      fill_color=color, number_of_sides=3, 
                      radius=size, rotation=rotation))
    return arrows

In [15]:
def get_bearing(p1, p2):
    
    
    long_diff = np.radians(p2.lon - p1.lon)
    
    lat1 = np.radians(p1.lat)
    lat2 = np.radians(p2.lat)
    
    x = np.sin(long_diff) * np.cos(lat2)
    y = (np.cos(lat1) * np.sin(lat2) 
        - (np.sin(lat1) * np.cos(lat2) 
        * np.cos(long_diff)))
    bearing = np.degrees(np.arctan2(x, y))
    
    # adjusting for compass bearing
    if bearing < 0:
        return bearing + 360
    return bearing

In [16]:
import folium
from collections import namedtuple

m = folium.Map(
    location=[40.767937,-73.982155],
    zoom_start=15
)

p1 = [40.767937, -73.982155]
p2 = [40.765602, -73.964630]

# folium.Marker(location=p1, icon=folium.Icon(color='green')).add_to(m)
# folium.Marker(location=p2, icon=folium.Icon(color='red')).add_to(m)
# folium.PolyLine(locations=[p1, p2], color='blue').add_to(m)
# arrows = get_arrows(locations=[p1, p2], n_arrows=3)
# for arrow in arrows:
#     arrow.add_to(m)
for i in range(10):
    p1 = [train_df['pickup_latitude'][i],train_df['pickup_longitude'][i]]
    p2 = [train_df['dropoff_latitude'][i],train_df['dropoff_longitude'][i]]
    folium.Marker(location=p1, icon=folium.Icon(color='green')).add_to(m)
    folium.Marker(location=p2, icon=folium.Icon(color='red')).add_to(m)
    folium.PolyLine(locations=[p1, p2], color='blue').add_to(m)
    arrows = get_arrows(locations=[p1, p2], n_arrows=3)
    for arrow in arrows:
        arrow.add_to(m)
    
    
m

In [42]:
import googlemaps

In [48]:
gmaps = googlemaps.Client(key='APIkey')

In [51]:
# my_dist = gmaps.distance_matrix('Delhi','Mumbai')['rows'][0]['elements'][0] 
my_dist = gmaps.distance_matrix('Delhi','Mumbai')

In [52]:
my_dist

{'destination_addresses': ['Mumbai, Maharashtra, India'],
 'origin_addresses': ['Delhi, India'],
 'rows': [{'elements': [{'distance': {'text': '1,416 km', 'value': 1415711},
     'duration': {'text': '1 day 1 hour', 'value': 90170},
     'status': 'OK'}]}],
 'status': 'OK'}

In [17]:
train_df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,dist
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455,1.49758
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663,1.804374
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124,6.38109
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429,1.484566
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435,1.187842


In [18]:
train_df['speed'] = train_df['dist'] / train_df['trip_duration'] * 1000

In [19]:
train_df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,dist,speed
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455,1.49758,3.291384
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663,1.804374,2.721529
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124,6.38109,3.004279
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429,1.484566,3.460526
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435,1.187842,2.730672


In [21]:
train_df['jam'] = train_df.apply(lambda train_df : 0 if (train_df['speed'] < 2.532097) else (1 if (train_df['speed'] < 4.003932) else 2) , axis=1)

1분위수 미만 : 0 <br>
3분위수 미만 : 1 <br>
그 이상 : 2

In [20]:
train_df['speed'].describe()

count    1.458644e+06
mean     4.003932e+00
std      4.157811e+00
min      0.000000e+00
25%      2.532097e+00
50%      3.551096e+00
75%      4.953808e+00
max      2.574726e+03
Name: speed, dtype: float64

In [22]:
train_df['jam'].value_counts()

2    588117
1    505865
0    364662
Name: jam, dtype: int64

In [31]:
import folium
from collections import namedtuple

m = folium.Map(
    location=[40.767937,-73.982155],
    zoom_start=15
)


# folium.Marker(location=p1, icon=folium.Icon(color='green')).add_to(m)
# folium.Marker(location=p2, icon=folium.Icon(color='red')).add_to(m)
# folium.PolyLine(locations=[p1, p2], color='blue').add_to(m)
# arrows = get_arrows(locations=[p1, p2], n_arrows=3)
# for arrow in arrows:
#     arrow.add_to(m)
slow = 0
fast = 0
for i in range(200):
    p1 = [train_df['pickup_latitude'][i],train_df['pickup_longitude'][i]]
    p2 = [train_df['dropoff_latitude'][i],train_df['dropoff_longitude'][i]]
    p3 = train_df['jam'][i]
    if p3 == 2:
        col = 'blue'
        fast += 1
    elif p3 == 0:
        col = 'red'
        slow += 1
    else:
        continue

#     folium.Marker(location=p1, icon=folium.Icon(color=col)).add_to(m)
#     folium.Marker(location=p2, icon=folium.Icon(color=col)).add_to(m)
    folium.PolyLine(locations=[p1, p2], color=col).add_to(m)
    arrows = get_arrows(locations=[p1, p2], n_arrows=3)
    for arrow in arrows:
        arrow.add_to(m)
    
print('막힘 : {} // 빠름 : {}'.format(slow, fast))
m

막힘 : 42 // 빠름 : 79
