## Name: Swami Venkat
## Date: 12/3/2019


In [67]:
# Importing important libraries
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import datetime
import seaborn as sns
import json
import folium
from folium.plugins import HeatMapWithTime
from folium.plugins import HeatMap
import os.path
from os import path
import matplotlib.cm as cm
import warnings
from matplotlib.ticker import MultipleLocator
warnings.filterwarnings('ignore')
from time import process_time

In [68]:
with open('./Borough Boundaries.geojson') as f:
    js = json.load(f)

man_cord_list = js['features'][4]['geometry']['coordinates']
flat_list1 = [item for sublist in man_cord_list for item in sublist]
man_cord = [item for sublist in flat_list1 for item in sublist]
max_lt,max_lng,min_lt,min_lng =0,0,999,999
for lat,lng in man_cord:
    
    if abs(lat) > max_lng:
        max_lng = lat
    if abs(lng) > max_lt: 
        max_lt = lng
    if abs(lat) < min_lng:
        min_lng = lat
    if abs(lng) < min_lt: 
        min_lt = lng
print(max_lng, max_lt, min_lng, min_lt)

-73.90665099473267 40.87903804730722 -74.01092841268031 40.68291694544512


In [69]:
def filter_df(df):
    df = df[['pickup_latitude' ,'pickup_longitude','dropoff_latitude' , 
             'dropoff_longitude','tpep_pickup_datetime','total_amount' , 
             'passenger_count' ,'Date', 'Time','total_amount' ]].copy()
    
    df = df[(df != 0).all(1)]
    df = df.loc[(df['passenger_count']>0) & (df['passenger_count']<5)]
    df = df.sort_values(by='tpep_pickup_datetime',kind='heapsort', ascending=True)
    
    df = df.loc[(df['pickup_latitude']>=min_lt) & 
                (df['pickup_latitude']<= max_lt) & 
                (df['pickup_longitude']>=min_lng) & 
                (df['pickup_longitude']<= max_lng)]
    df = df.loc[(df['dropoff_latitude']>=min_lt) & 
                (df['dropoff_latitude']<= max_lt) & 
                (df['dropoff_longitude']>=min_lng) & 
                (df['dropoff_longitude']<= max_lng)]
    
    df = df.round(4)    

    df = df.dropna()
    df = df.reset_index(drop=True)
    return df

In [70]:
def parse_timestamp(df):
    
    df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'],format='%Y-%m-%d %H:%M:%S', errors='coerce')
    df['Timestamp'] = pd.to_datetime(df['tpep_pickup_datetime'].dt.round(freq = '15 min'))
    df['Date'] = (df['Timestamp']).dt.date
    df['Time'] = (df['Timestamp']).dt.time
    df['Day'] = df['tpep_pickup_datetime'].dt.strftime('%A')
    return df

In [71]:
df =  pd.read_csv('./Data/trip_data/yellow_tripdata_2016-06.csv' ,nrows =1000, low_memory=False)

In [72]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 19 columns):
VendorID                 1000 non-null int64
tpep_pickup_datetime     1000 non-null object
tpep_dropoff_datetime    1000 non-null object
passenger_count          1000 non-null int64
trip_distance            1000 non-null float64
pickup_longitude         1000 non-null float64
pickup_latitude          1000 non-null float64
RatecodeID               1000 non-null int64
store_and_fwd_flag       1000 non-null object
dropoff_longitude        1000 non-null float64
dropoff_latitude         1000 non-null float64
payment_type             1000 non-null int64
fare_amount              1000 non-null float64
extra                    1000 non-null float64
mta_tax                  1000 non-null float64
tip_amount               1000 non-null float64
tolls_amount             1000 non-null float64
improvement_surcharge    1000 non-null float64
total_amount             1000 non-null float64
dtypes: floa

In [73]:
fpath = "./Data/Pickled Data/YC_2016_June_Trips.pkl"
if path.exists(fpath):
    trips_yc_2016_june = pd.read_pickle(fpath)
else:
    trips_yc_2016_june_raw = pd.read_csv('./Data/trip_data/yellow_tripdata_2016-06.csv' , low_memory=False)
    trips_yc_2016_june = filter_df(parse_timestamp(trips_yc_2016_june_raw))
    dpath ='./Data/Pickled Data/'
    os.makedirs(dpath, exist_ok=True) 
    trips_yc_2016_june.to_pickle('./Data/Pickled Data/YC_2016_June_Trips.pkl')
trips_yc_2016_june.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8576583 entries, 0 to 8576582
Data columns (total 10 columns):
pickup_latitude         float64
pickup_longitude        float64
dropoff_latitude        float64
dropoff_longitude       float64
tpep_pickup_datetime    datetime64[ns]
total_amount            float64
passenger_count         int64
Date                    object
Time                    object
total_amount            float64
dtypes: datetime64[ns](1), float64(6), int64(1), object(2)
memory usage: 654.3+ MB


In [74]:
trips_yc_2016_june.head()

Unnamed: 0,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,tpep_pickup_datetime,total_amount,passenger_count,Date,Time,total_amount.1
0,40.755,-73.9729,40.7252,-73.9923,2016-06-01 00:00:00,16.6,4,2016-06-01,00:00:00,16.6
1,40.7255,-73.9975,40.7449,-73.9977,2016-06-01 00:00:00,18.8,1,2016-06-01,00:00:00,18.8
2,40.7606,-73.9622,40.8272,-73.9223,2016-06-01 00:00:00,24.96,1,2016-06-01,00:00:00,24.96
3,40.7502,-74.0024,40.7552,-73.9911,2016-06-01 00:00:01,6.3,3,2016-06-01,00:00:00,6.3
4,40.7759,-73.9762,40.7679,-73.9823,2016-06-01 00:00:01,7.8,1,2016-06-01,00:00:00,7.8


In [75]:
startdate = pd.to_datetime("2016-06-06").date()
enddate = pd.to_datetime("2016-06-12").date()
trips_yc_2016_june_wk1 = trips_yc_2016_june.loc[(trips_yc_2016_june['Date'] >= startdate) 
                                                  & (trips_yc_2016_june['Date'] <= enddate)]

In [76]:
def get_bearing(long1, lat1, long2, lat2): 
    long1, lat1, long2, lat2 = map(np.radians, [long1, lat1, long2, lat2])
    bearing = np.arctan2(np.sin(long2-long1)*np.cos(lat2), np.cos(lat1)*np.sin(lat2)-np.sin(lat1)*np.cos(lat2)*np.cos(long2-long1))
    bearing = np.degrees(bearing)
    bearing = (bearing + 360) % 360
    
    return np.round(bearing,2)

In [77]:
def haversine_np(lon1, lat1, lon2, lat2):
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2

    c = 2 * np.arcsin(np.sqrt(a))
    distance = 3958.8 * c
    return np.round(distance,2)

In [78]:
 trips_yc_2016_june_wk1['Bearing Angle'] = get_bearing( trips_yc_2016_june_wk1['pickup_longitude'],
                                                            trips_yc_2016_june_wk1['pickup_latitude'],
                                                            trips_yc_2016_june_wk1['dropoff_longitude'],  
                                                            trips_yc_2016_june_wk1['dropoff_latitude'])/1000

In [79]:
 trips_yc_2016_june_wk1['Distance'] = haversine_np(    trips_yc_2016_june_wk1['pickup_longitude'],
                                                                       trips_yc_2016_june_wk1['pickup_latitude'],
                                                                       trips_yc_2016_june_wk1['dropoff_longitude'],  
                                                                       trips_yc_2016_june_wk1['dropoff_latitude'])/1000

In [80]:
trips_yc_2016_june_wk1.head()

Unnamed: 0,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,tpep_pickup_datetime,total_amount,passenger_count,Date,Time,total_amount.1,Bearing Angle,Distance
1503376,40.7302,-74.0023,40.7643,-73.9826,2016-06-05 23:52:30,14.75,2,2016-06-06,00:00:00,14.75,0.02363,0.00257
1503377,40.7691,-73.9826,40.7187,-74.0071,2016-06-05 23:52:30,20.3,1,2016-06-06,00:00:00,20.3,0.20023,0.00371
1503378,40.748,-73.9847,40.8092,-73.9519,2016-06-05 23:52:30,23.75,2,2016-06-06,00:00:00,23.75,0.02208,0.00456
1503379,40.7308,-74.0008,40.696,-73.9204,2016-06-05 23:52:30,27.96,1,2016-06-06,00:00:00,27.96,0.1197,0.00485
1503380,40.7344,-74.0071,40.7445,-73.999,2016-06-05 23:52:30,6.95,1,2016-06-06,00:00:00,6.95,0.03128,0.00082


In [81]:
import json
from shapely.geometry import shape, Point


def fill_boro(df):
    with open('./Borough Boundaries.geojson') as f:
        js = json.load(f)

    pu_point_list = df[['pickup_longitude', 'pickup_latitude']].values.tolist()
    do_point_list = df[['dropoff_longitude', 'dropoff_latitude']].values.tolist()

    boro_list = [0] * len(df)
    
    count = 0
    for pu,do in zip(pu_point_list,do_point_list):
        point_pu = Point(pu)
        point_do = Point(do)
        feature = js['features'][4]
        polygon = shape(feature['geometry'])
        if polygon.contains(point_pu) and polygon.contains(point_do):
                boro_list[pu_point_list.index(pu)] = feature['properties']['boro_name']

    df['Ride_Boro'] = boro_list
    df= df.loc[df.Ride_Boro == 'Manhattan']
    
    return df

In [111]:
trips_grp_by_time_date = trips_yc_2016_june_wk1.groupby(["Date","Time"]).size()
trips_grp_by_time_date= trips_grp_by_time_date.reset_index()

trips_grp_by_time_date = trips_grp_by_time_date.rename(columns={0:'Total Trips'})

trips_grp_by_time_date

Unnamed: 0,Date,Time,Total Trips
0,2016-06-06,00:00:00,1415
1,2016-06-06,00:15:00,1388
2,2016-06-06,00:30:00,1310
3,2016-06-06,00:45:00,1180
4,2016-06-06,01:00:00,968
...,...,...,...
667,2016-06-12,22:45:00,2657
668,2016-06-12,23:00:00,2404
669,2016-06-12,23:15:00,2293
670,2016-06-12,23:30:00,2494


In [112]:
from sklearn.cluster import KMeans
def after_cluster_data(df_train):
    agg_df= df_train.groupby(['cluster'])['passenger_count'].agg(['sum']).sort_values(['sum'], ascending=False)
    agg_df['trips'] = np.ceil(agg_df['sum']/4)
    Total_Passengers = agg_df['sum'].sum(), 
    Total_Trips = agg_df['trips'].sum()
    single_ride_count = len(agg_df.loc[(agg_df['sum']==1)])
    
    return Total_Passengers,Total_Trips,single_ride_count

def get_single_cluster_count(df_train):
    cl_count = df_train['cluster'].value_counts()
    cl_count= cl_count.reset_index()
    cl_count = cl_count.rename(columns={'index': 'Cluster_Label', 'cluster': 'Count'})
    single_cluster_count = len(cl_count.loc[cl_count['Count']==1])
    return single_cluster_count, (cl_count['Cluster_Label'].max()+1)
   
def kmeans_cluster(df_train,feat_list):
    train = df_train[feat_list].copy()
    step_size = int((len(train)//1.2 - len(train)//1.6)//10)
    cluster_range = np.arange(int(len(train)//1.6), int(len(train)//1.2),step_size)
    for i in cluster_range:
        kmeans = KMeans(n_clusters=i, random_state=0).fit(train)
        if kmeans.inertia_ < 0.004:
            break
    df_train['cluster'] = kmeans.labels_
    return df_train, kmeans

In [113]:
nyc_coor = [40.7128, -74.0060]

def generateBaseMap(default_location=nyc_coor, default_zoom_start=8.5):
    base_map = folium.Map(location=default_location, control_scale=True, zoom_start=default_zoom_start)
    return base_map

def generate_map(method,df,kmeans_model):
    
    for i in range(5):
        base_map = generateBaseMap(default_zoom_start=12)
        cl_count = df['cluster'].value_counts()
        cl_count= cl_count.reset_index()
        cl_count = cl_count.rename(columns={'index': 'Cluster_Label', 'cluster': 'Count'})
        max_cluster = cl_count['Cluster_Label'][i]
        
        df_max_cluster = df.loc[df['cluster']==max_cluster]
        list_pu = df_max_cluster[['pickup_latitude', 'pickup_longitude']].values.tolist()
        list_do = df_max_cluster[['dropoff_latitude', 'dropoff_longitude']].values.tolist()
        
        for location in list_pu:
            folium.Marker(location,icon=folium.Icon(color='red', icon='info-sign')).add_to(base_map)
        for location in list_do:
            folium.Marker(location,icon=folium.Icon(color='blue', icon='info-sign')).add_to(base_map)

        folium.Marker(kmeans_model.cluster_centers_[max_cluster][:2],
                      icon=folium.Icon(color='green', icon = 'home')).add_to(base_map)
        folium.Marker(kmeans_model.cluster_centers_[max_cluster][2:4],
                      icon=folium.Icon(color='orange', icon = 'home')).add_to(base_map)
        
        dpath ='./Data/HTML'
        os.makedirs(dpath, exist_ok=True) 
        fname = dpath + method + '_Cluster_Map_' + str(i) + '.html'
        base_map.save(fname)

In [114]:
def mean_pickup_dropoff_dist(df_train,kmeans):
    cl_count = df_train['cluster'].value_counts()
    cl_count= cl_count.reset_index()
    cl_count = cl_count.rename(columns={'index': 'Cluster_Label', 'cluster': 'Count'})
    mean_pickup_distance = 0
    mean_dropoff_distance = 0
    for i in range(5):
        max_cluster = cl_count['Cluster_Label'][i]
        df_max_cluster = df_train.loc[df_train['cluster']==max_cluster]
        
        pu_cent_lat = (kmeans.cluster_centers_[max_cluster])[0]
        pu_cent_long = (kmeans.cluster_centers_[max_cluster])[1]
        do_cent_lat = (kmeans.cluster_centers_[max_cluster])[2]
        do_cent_long = (kmeans.cluster_centers_[max_cluster])[3]

        df_max_cluster['Pickup_Distance_from_Centroid'] = haversine_np(pu_cent_long,pu_cent_lat,
                                                                       df_max_cluster['pickup_longitude'],  
                                                                       df_max_cluster['pickup_latitude'] )

        df_max_cluster['Dropoff_Distance_from_Centroid'] = haversine_np(do_cent_long,do_cent_lat,
                                                                       df_max_cluster['dropoff_longitude'],  
                                                                       df_max_cluster['dropoff_latitude'] )
        
        mean_pickup_distance +=df_max_cluster['Pickup_Distance_from_Centroid'].mean()
        mean_dropoff_distance +=df_max_cluster['Dropoff_Distance_from_Centroid'].mean()

    return mean_pickup_distance/5, mean_dropoff_distance/5


In [None]:
for i in range(len(trips_grp_by_time_date)):
    
    df_train_raw = trips_yc_2016_june_wk1.loc[(trips_yc_2016_june_wk1['Date']==trips_grp_by_time_date['Date'][i]) & 
                                 (trips_yc_2016_june_wk1['Time']==trips_grp_by_time_date['Time'][i])]
    df_train = fill_boro(df_train_raw)
    trips_grp_by_time_date.loc[i, 'Total Trips'] = len(df_train)
    trips_grp_by_time_date.loc[i, 'Total Single Trips'] = len(df_train.loc[(df_train['passenger_count']==1)])
    
    feat_list_m1 =  ['pickup_latitude' ,'pickup_longitude','dropoff_latitude' , 'dropoff_longitude']
    cluster_df_m1, kmeans_model_m1 = kmeans_cluster(df_train,feat_list_m1)
    
    total_passengers,total_trips, total_single_rides = after_cluster_data(cluster_df_m1)
    single_cluster_count,total_cluster_count = get_single_cluster_count(cluster_df_m1)
    
    trips_grp_by_time_date.loc[i,'Total Passengers'] = total_passengers
    trips_grp_by_time_date.loc[i,'Total Trips after Agg Method_1'] = total_trips
    trips_grp_by_time_date.loc[i,'Single Trips after Agg Method_1'] = total_single_rides
    trips_grp_by_time_date.loc[i,'Total Clusters Method_1'] = total_cluster_count
    trips_grp_by_time_date.loc[i,'Single Clusters Method_1'] = single_cluster_count
    trips_grp_by_time_date.loc[i,'Mean Pickup Dist Method_1'] = mean_pickup_dropoff_dist(cluster_df_m1,kmeans_model_m1)[0]
    trips_grp_by_time_date.loc[i,'Mean Dropoff Dist Method_1'] = mean_pickup_dropoff_dist(cluster_df_m1,kmeans_model_m1)[1]
        
    if i==0:
        generate_map('Method_1',cluster_df_m1,kmeans_model_m1)
        
    feat_list_m2 =  ['pickup_latitude' ,'pickup_longitude','Bearing Angle' , 'Distance']
    cluster_df_m2, kmeans_model_m2 = kmeans_cluster(df_train,feat_list_m2)
    
    total_passengers,total_trips, total_single_rides = after_cluster_data(cluster_df_m2)
    single_cluster_count,total_cluster_count = get_single_cluster_count(cluster_df_m2)
    
    trips_grp_by_time_date.loc[i,'Total Trips after Agg Method_2'] = total_trips
    trips_grp_by_time_date.loc[i,'Single Trips after Agg Method_2'] = total_single_rides
    trips_grp_by_time_date.loc[i,'Total Clusters Method_2'] = total_cluster_count
    trips_grp_by_time_date.loc[i,'Single Clusters Method_2'] = single_cluster_count
    
    trips_grp_by_time_date.loc[i,'Mean Pickup Dist_2'] = mean_pickup_dropoff_dist(cluster_df_m2,kmeans_model_m2)[0]
    
    if i==0:
        generate_map('Method_2',cluster_df_m2,kmeans_model_m2)
    if (i+1)%10==0:
        fname='./Data/Pickled Data/Trips_Grouped_DateTime_After_Clustering_part' +str(i) + '.pkl'
        trips_grp_by_time_date.to_pickle(fname)
                               
trips_grp_by_time_date.to_pickle('./Data/Pickled Data/Trips_Grouped_DateTime_After_Clustering.pkl')
trips_grp_by_time_date.head()

In [118]:
summary_df = trips_grp_by_time_date.drop(columns = ['Total Clusters Method_1', 
                                                    'Single Clusters Method_1', 
                                                    'Total Clusters Method_2',
                                                   'Single Clusters Method_2'])

summary_df['Method_1 Single Ride Efficiency'] = (summary_df['Single Trips before Agg'] - 
                                                 summary_df['Single Trips after Agg Method_1']) \
                                                 /summary_df['Single Trips before Agg']

summary_df['Method_2 Single Ride Efficiency'] = (summary_df['Single Trips before Agg'] - 
                                                 summary_df['Single Trips after Agg Method_2']) \
                                                 /summary_df['Single Trips before Agg']

summary_df.head()

Unnamed: 0,Date,Time,Total Trips,Single Trips before Agg,Total Passengers,Total Trips after Agg Method_1,Single Trips after Agg Method_1,Single Clusters Method_1,Mean Dropoff Dist Method_1,Single Trips after Agg Method_2,Total Clusters Method_2,Mean Pickup Dist_2,Method_1 Single Ride Efficiency,Method_2 Single Ride Efficiency
0,2016-06-06,00:00:00,1181,917.0,1555.0,878.0,460.0,578.0,0.1122,335.0,738.0,0.096467,0.498364,0.634678
1,2016-06-06,00:15:00,1143,898.0,1484.0,833.0,434.0,534.0,0.062048,351.0,714.0,0.107952,0.516704,0.609131
2,2016-06-06,00:30:00,1065,826.0,1405.0,763.0,376.0,482.0,0.054917,314.0,665.0,0.084762,0.544794,0.619855
3,2016-06-06,00:45:00,942,701.0,1279.0,677.0,322.0,420.0,0.129105,266.0,588.0,0.106,0.540656,0.620542
4,2016-06-06,01:00:00,773,598.0,1019.0,536.0,249.0,326.0,0.110133,236.0,483.0,0.0936,0.583612,0.605351


In [None]:
for i in range(5):
    
    df_train_raw = trips_yc_2016_june_wk1.loc[(trips_yc_2016_june_wk1['Date']==trips_grp_by_time_date['Date'][i]) & 
                                 (trips_yc_2016_june_wk1['Time']==trips_grp_by_time_date['Time'][i])]
    df_train = fill_boro(df_train_raw)
    
    cluster_df, kmeans_model = kmeans_cluster_method2(df_train)
    total_passengers,total_trips, total_single_rides = after_cluster_data(cluster_df)
    single_cluster_count,total_cluster_count = get_single_cluster_count(cluster_df)
    
    trips_grp_by_time_date.loc[i,'Total Trips after Agg Method_2'] = total_trips
    trips_grp_by_time_date.loc[i,'Single Trips after Agg Method_2'] = total_single_rides
    trips_grp_by_time_date.loc[i,'Total Clusters Method_2'] = total_cluster_count
    trips_grp_by_time_date.loc[i,'Single Clusters Method_2'] = single_cluster_count
    
    trips_grp_by_time_date.loc[i,'Mean Pickup Dist_2'] = mean_pickup_dropoff_dist(df_train,kmeans_model)[0]
    
    if i==0:
        generate_map('Method_2',cluster_df,kmeans_model)
    
trips_grp_by_time_date.head()

In [None]:
df_train_raw = trips_yc_2016_june_wk1.loc[(trips_yc_2016_june_wk1['Date']==trips_grp_by_time_date['Date'][0]) & 
                                 (trips_yc_2016_june_wk1['Time']==trips_grp_by_time_date['Time'][0])]
df_train = fill_boro(df_train_raw)

In [None]:
train = df_train[['pickup_latitude' ,'pickup_longitude','Bearing Angle' , 'Distance']].copy()
step_size = int((len(train)//1.2 - len(train)//1.5)//3)
cluster_range = np.arange(int(len(train)//1.5), int(len(train)//1.2),step_size)
for i in cluster_range:
    kmeans = KMeans(n_clusters=i, random_state=0).fit(train)
    print('Number of Clusters = ' , i , '| Distortion = ',kmeans.inertia_ )
    if kmeans.inertia_ < 0.006:
        break
df_train['cluster'] = kmeans.labels_

In [None]:
y= df_train.groupby(['cluster'])['passenger_count'].agg(['sum']).sort_values(['sum'], ascending=False)
y['trips'] = np.ceil(y['sum']/4)
print('Total Passengers = ', y['sum'].sum(), ' and Total Trips = ', y['trips'].sum())
single_ride_count = len(y.loc[(y['sum']==1)])
print('Total single rides = ' ,count)
y.head()

In [None]:
df_train['cluster'].hist(bins=100)

In [None]:
cl_count = df_train['cluster'].value_counts()
cl_count= cl_count.reset_index()
cl_count = cl_count.rename(columns={'index': 'Cluster_Label', 'cluster': 'Count'})
max_cluster = cl_count['Cluster_Label'][0]
print(len(cl_count.loc[cl_count['Count']==1]))
cl_count

In [None]:
df_max_cluster = df_train.loc[df_train['cluster']==max_cluster]

list_pu = df_max_cluster[['pickup_latitude', 'pickup_longitude']].values.tolist()
list_do = df_max_cluster[['dropoff_latitude', 'dropoff_longitude']].values.tolist()
pu_cent_lat = (kmeans.cluster_centers_[max_cluster])[0]
pu_cent_long = (kmeans.cluster_centers_[max_cluster])[1]

do_cent_lat = (kmeans.cluster_centers_[max_cluster])[2]
do_cent_long = (kmeans.cluster_centers_[max_cluster])[3]

df_max_cluster['Pickup_Distance_from_Centroid'] = haversine_np(pu_cent_long,pu_cent_lat,
                                                               df_max_cluster['pickup_longitude'],  
                                                               df_max_cluster['pickup_latitude'] )

df_max_cluster['Dropoff_Distance_from_Centroid'] = haversine_np(do_cent_long,do_cent_lat,
                                                               df_max_cluster['dropoff_longitude'],  
                                                               df_max_cluster['dropoff_latitude'] )



print('Mean pickup distance is',df_max_cluster['Pickup_Distance_from_Centroid'].mean() )
print('Mean dropoff distance is',df_max_cluster['Dropoff_Distance_from_Centroid'].mean() )

df_max_cluster

In [None]:
base_map = generateBaseMap(default_zoom_start=12)

for location in list_pu:
    folium.Marker(location,
              icon=folium.Icon(color='red', icon='info-sign')
              ).add_to(base_map)
    
for location in list_do:
    folium.Marker(location,
              icon=folium.Icon(color='blue', icon='info-sign')
              ).add_to(base_map)
    
folium.Marker(kmeans.cluster_centers_[max_cluster][:2],
              icon=folium.Icon(color='green', icon = 'home')
               ).add_to(base_map)
folium.Marker(kmeans.cluster_centers_[max_cluster][2:4],
              icon=folium.Icon(color='orange', icon = 'home')
               ).add_to(base_map)

base_map

In [None]:
col_list = list(trips_yc_2013.columns) 
x= pd.DataFrame(columns=col_list)
for i in range(5):
    if trips_yc_2013['passenger_count'][i]>3:
        trip_count+=1
    else:
        trips_yc_2013['passenger_count'][i]
        

In [None]:
trips_rate_month=trips_yc_2013.groupby(['PU_Hour'])['Rate'].agg(['mean']).sort_values(['mean'], ascending=True)
trips_rate_month['Total_Trips'] = trips_yc_2013.groupby(['PU_Hour'])['Rate'].agg(['count']).sort_values(['count'])
trips_rate_month = trips_rate_month.rename(columns={'mean': 'Mean Trip Mile/Min'})
trips_rate_month['Avg Trip Distance']=trips_yc_2013.groupby(['PU_Hour'])['trip_distance'].agg(['mean']).sort_values(['mean'])
trips_rate_month = trips_rate_month.reset_index()
#cat_month = ['January', 'February', 'March', 'April','May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
#trips_rate_month['Month'] = pd.Categorical(trips_rate_month['Month'], ordered=True, categories=cat_month)

#colors = cm.nipy_spectral(np.linspace(0, 1, trips_rate_month['Month'].nunique()+2))
fig, ax = plt.subplots(2,figsize=(15,20), sharex=True)

plot_1 = sns.lineplot(x="PU_Hour", y="Total_Trips",palette=colors, linewidth=4,
                  data=trips_rate_month, ax=ax[0],marker='o',legend=False)
plot_2 = sns.lineplot(x="PU_Hour", y="Mean Trip Mile/Min",linewidth=4,palette=colors ,
                  data=trips_rate_month,marker='o', ax= ax[1]);
plt.xlabel(xlabel = 'Time of the Day (24HRS FORMAT)')
plt.legend(bbox_to_anchor=(1.05, 1.5), loc=2, borderaxespad=0.)
plot_1.xaxis.set_major_locator(MultipleLocator(2))

In [None]:
trips_yc_2013['PU_Hour'].hist()

In [None]:
trips_yc_2013['passenger_count'].hist()

In [None]:
trips_yc_2013 = trips_yc_2013.round(3)

In [None]:
trips_yc_2013.groupby(['pickup_longitude', 'pickup_latitude','dropoff_longitude', 'dropoff_latitude']).ngroups

In [None]:
haversine_np(45.00,45.00,45.005,45.005)

In [None]:
import json
from shapely.geometry import shape, Point
# depending on your version, use: from shapely.geometry import shape, Point

# load GeoJSON file containing sectors
with open('./Borough Boundaries.geojson') as f:
    js = json.load(f)

# construct point based on lon/lat returned by geocoder
point = Point(-73.9911, 40.7359)

# check each polygon to see if it contains the point
for feature in js['features']:
    polygon = shape(feature['geometry'])
    if polygon.contains(point):
        print ('Found containing polygon:', feature['properties']['boro_name'])

In [None]:
top10_pickup = df_prep.groupby(['pickup_latitude','pickup_longitude'])['medallion'].agg(['count']).sort_values(['count'],ascending=False).head(20)


In [None]:

base_map = generateBaseMap(default_zoom_start=12)
for lat,long in top10_pickup.index:
    folium.Marker([lat,long], popup='<i>Chicago City Center</i>').add_to(base_map)
base_map.save('./Top5_pickup_locations.html')
base_map

In [None]:
top10_dropoffs = df_prep.groupby(['dropoff_latitude','dropoff_longitude'])['Count'].agg(['count']).sort_values(['count'],ascending=False).head(20)


In [None]:
base_map = generateBaseMap(default_zoom_start=12)
for lat,long in top10_pickup.index:
    folium.Marker([lat,long], popup='<i>Chicago City Center</i>').add_to(base_map)
base_map.save('./Top5_pickup_locations.html')
base_map