## Name: Swami Venkat
## Date: 12/3/2019


In [1]:
# Importing important libraries
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import datetime
import seaborn as sns
import json
import folium
from folium.plugins import HeatMapWithTime
from folium.plugins import HeatMap
import os.path
from os import path
import matplotlib.cm as cm
import warnings
from matplotlib.ticker import MultipleLocator
warnings.filterwarnings('ignore')
from time import process_time

In [2]:
trips_yc_2016 = pd.read_csv('./Data/trip_data/yellow_tripdata_2016-06.csv' , low_memory=False)
trips_yc_2016.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11135470 entries, 0 to 11135469
Data columns (total 19 columns):
VendorID                 int64
tpep_pickup_datetime     object
tpep_dropoff_datetime    object
passenger_count          int64
trip_distance            float64
pickup_longitude         float64
pickup_latitude          float64
RatecodeID               int64
store_and_fwd_flag       object
dropoff_longitude        float64
dropoff_latitude         float64
payment_type             int64
fare_amount              float64
extra                    float64
mta_tax                  float64
tip_amount               float64
tolls_amount             float64
improvement_surcharge    float64
total_amount             float64
dtypes: float64(12), int64(4), object(3)
memory usage: 1.6+ GB


In [None]:
trips_yc_2013_p1 = pd.read_csv('./Data/trip_data/trip_data_1.csv', low_memory=False)
trips_yc_2013_p1.info()

In [4]:
with open('./Borough Boundaries.geojson') as f:
    js = json.load(f)

man_cord_list = js['features'][4]['geometry']['coordinates']
flat_list1 = [item for sublist in man_cord_list for item in sublist]
man_cord = [item for sublist in flat_list1 for item in sublist]
max_lt,max_lng,min_lt,min_lng =0,0,999,999
for a,b in man_cord:
    
    if abs(a) > max_lng:
        max_lng = a
    if abs(b) > max_lt: 
        max_lt = b
    if abs(a) < min_lng:
        min_lng = a
    if abs(b) < min_lt: 
        min_lt = b
print(max_lng, max_lt, min_lng, min_lt)

-73.90665099473267 40.87903804730722 -74.01092841268031 40.68291694544512


In [None]:
def filter_df(df):
    df = df.drop(columns = ['medallion', 'hack_license', 'vendor_id', 'store_and_fwd_flag',
                            'dropoff_datetime','trip_time_in_secs','rate_code' ])
    df = df.loc[df['trip_distance']>0]
    df = df.loc[df['trip_distance']>0]
    
    df = df[(df != 0).all(1)]
    df = df.loc[(df['passenger_count']>0) & (df['passenger_count']<5)]
    df = df.sort_values(by='pickup_datetime',kind='heapsort', ascending=True)
    
    
    df = df.loc[(df['pickup_latitude']>=min_lt) & 
                (df['pickup_latitude']<= max_lt) & 
                (df['pickup_longitude']>=min_lng) & 
                (df['pickup_longitude']<= max_lng)]
    df = df.loc[(df['dropoff_latitude']>=min_lt) & 
                (df['dropoff_latitude']<= max_lt) & 
                (df['dropoff_longitude']>=min_lng) & 
                (df['dropoff_longitude']<= max_lng)]
    
 
    df = df.round(4)    

    df = df.dropna()
    df = df.reset_index(drop=True)
    return df

In [None]:
def parse_timestamp(df):
    
    df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'],format='%Y-%m-%d %H:%M:%S', errors='coerce')
    df['Timestamp'] = pd.to_datetime(df['pickup_datetime'].dt.round(freq = '15 min'))
    df['Date'] = (df['Timestamp']).dt.date
    df['Time'] = (df['Timestamp']).dt.time
    df['Day'] = df['pickup_datetime'].dt.strftime('%A')
    return df

In [None]:
trips_yc_2013 = filter_df(parse_timestamp(trips_yc_2013_p1))
trips_yc_2013.info()

In [None]:
import json
from shapely.geometry import shape, Point


def fill_boro(df):
    with open('./Borough Boundaries.geojson') as f:
        js = json.load(f)

    pu_point_list = df[['pickup_longitude', 'pickup_latitude']].values.tolist()
    do_point_list = df[['dropoff_longitude', 'dropoff_latitude']].values.tolist()

    boro_list = [0] * len(df)
    
    count = 0
    for pu,do in zip(pu_point_list,do_point_list):
        point_pu = Point(pu)
        point_do = Point(do)
        feature = js['features'][4]
        polygon = shape(feature['geometry'])
        if polygon.contains(point_pu) and polygon.contains(point_do):
                boro_list[pu_point_list.index(pu)] = feature['properties']['boro_name']

    df['Ride_Boro'] = boro_list
    df= df.loc[df.Ride_Boro == 'Manhattan']
    
    return df

In [None]:
grouped_df = trips_yc_2013.groupby(["Date", "Time"]).size().reset_index().sort_values(by = [0], ascending=False)
grouped_df= grouped_df.reset_index(drop=True)
grouped_df

In [None]:
from sklearn.cluster import KMeans

df_train_raw = trips_yc_2013.loc[(trips_yc_2013['Date']==grouped_df['Date'][0]) & 
                                 (trips_yc_2013['Time']==grouped_df['Time'][0])]

df_train = fill_boro(df_train_raw)
train = df_train[['pickup_latitude' ,'pickup_longitude','dropoff_latitude' , 'dropoff_longitude']].copy()

In [None]:
distortions = []
step_size = int((len(train)//1.25 - len(train)//1.5)//10)
cluster_range = np.arange(int(len(train)//1.5), int(len(train)//1.25),step_size)
for i in cluster_range:
    kmeans = KMeans(n_clusters=i, random_state=0).fit(train)
    distortions.append(kmeans.inertia_)
    if kmeans.inertia_ < 0.0025:
        break
plt.plot(cluster_range, distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.tight_layout()
plt.show()
df_train['cluster'] = kmeans.labels_

In [None]:
df_train['cluster'].hist(bins=100)

In [None]:
xy = df_train['cluster'].value_counts()
xy.index[0]

In [None]:
cl_count = df_train['cluster'].value_counts()
cl_count= cl_count.reset_index()
cl_count = cl_count.rename(columns={'index': 'Cluster_Label', 'cluster': 'Count'})
max_cluster = cl_count['Cluster_Label'][0]
print(len(cl_count.loc[cl_count['Count']==1]))
cl_count

In [None]:
y= df_train.groupby(['cluster'])['passenger_count'].agg(['sum']).sort_values(['sum'], ascending=False)
y['trips'] = np.ceil(y['sum']/4)
print('Total Passengers = ', y['sum'].sum(), ' and Total Trips = ', y['trips'].sum())
y.head()

In [None]:
df_train.loc[df_train['cluster']==max_cluster]

In [None]:
max_cluster = cl_count['Cluster_Label'][100]
df_cluster10 = df_train.loc[df_train['cluster']==max_cluster]
lis_pu = df_cluster10[['pickup_latitude', 'pickup_longitude']].values.tolist()
lis_do = df_cluster10[['dropoff_latitude', 'dropoff_longitude']].values.tolist()

In [None]:
all_clusters_pu = kmeans.cluster_centers_[:,:2]
all_clusters_do = kmeans.cluster_centers_[:,2:4]

In [None]:
nyc_coor = [40.7128, -74.0060]

def generateBaseMap(default_location=nyc_coor, default_zoom_start=8.5):
    base_map = folium.Map(location=default_location, control_scale=True, zoom_start=default_zoom_start)
    return base_map

In [None]:
base_map = generateBaseMap(default_zoom_start=12)
for location in lis_pu:
    folium.Marker(location,
              icon=folium.Icon(color='red', icon='info-sign')
              ).add_to(base_map)
for location in lis_do:
    folium.Marker(location,
              icon=folium.Icon(color='blue', icon='info-sign')
              ).add_to(base_map)
    
folium.Marker(kmeans.cluster_centers_[max_cluster][:2],
              icon=folium.Icon(color='green', icon = 'home')
               ).add_to(base_map)
folium.Marker(kmeans.cluster_centers_[max_cluster][2:4],
              icon=folium.Icon(color='orange', icon = 'home')
               ).add_to(base_map)

base_map

In [None]:
col_list = list(trips_yc_2013.columns) 
x= pd.DataFrame(columns=col_list)
for i in range(5):
    if trips_yc_2013['passenger_count'][i]>3:
        trip_count+=1
    else:
        trips_yc_2013['passenger_count'][i]
        

In [None]:
trips_rate_month=trips_yc_2013.groupby(['PU_Hour'])['Rate'].agg(['mean']).sort_values(['mean'], ascending=True)
trips_rate_month['Total_Trips'] = trips_yc_2013.groupby(['PU_Hour'])['Rate'].agg(['count']).sort_values(['count'])
trips_rate_month = trips_rate_month.rename(columns={'mean': 'Mean Trip Mile/Min'})
trips_rate_month['Avg Trip Distance']=trips_yc_2013.groupby(['PU_Hour'])['trip_distance'].agg(['mean']).sort_values(['mean'])
trips_rate_month = trips_rate_month.reset_index()
#cat_month = ['January', 'February', 'March', 'April','May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
#trips_rate_month['Month'] = pd.Categorical(trips_rate_month['Month'], ordered=True, categories=cat_month)

#colors = cm.nipy_spectral(np.linspace(0, 1, trips_rate_month['Month'].nunique()+2))
fig, ax = plt.subplots(2,figsize=(15,20), sharex=True)

plot_1 = sns.lineplot(x="PU_Hour", y="Total_Trips",palette=colors, linewidth=4,
                  data=trips_rate_month, ax=ax[0],marker='o',legend=False)
plot_2 = sns.lineplot(x="PU_Hour", y="Mean Trip Mile/Min",linewidth=4,palette=colors ,
                  data=trips_rate_month,marker='o', ax= ax[1]);
plt.xlabel(xlabel = 'Time of the Day (24HRS FORMAT)')
plt.legend(bbox_to_anchor=(1.05, 1.5), loc=2, borderaxespad=0.)
plot_1.xaxis.set_major_locator(MultipleLocator(2))

In [None]:
trips_yc_2013['PU_Hour'].hist()

In [None]:
trips_yc_2013['passenger_count'].hist()

In [None]:
trips_yc_2013 = trips_yc_2013.round(3)

In [None]:
trips_yc_2013.groupby(['pickup_longitude', 'pickup_latitude','dropoff_longitude', 'dropoff_latitude']).ngroups

In [None]:
def haversine_np(lon1, lat1, lon2, lat2):
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2

    c = 2 * np.arcsin(np.sqrt(a))
    distance = 3958.8 * c
    return distance

In [None]:
haversine_np(45.00,45.00,45.005,45.005)

In [None]:
import json
from shapely.geometry import shape, Point
# depending on your version, use: from shapely.geometry import shape, Point

# load GeoJSON file containing sectors
with open('./Borough Boundaries.geojson') as f:
    js = json.load(f)

# construct point based on lon/lat returned by geocoder
point = Point(-73.9911, 40.7359)

# check each polygon to see if it contains the point
for feature in js['features']:
    polygon = shape(feature['geometry'])
    if polygon.contains(point):
        print ('Found containing polygon:', feature['properties']['boro_name'])

In [None]:
top10_pickup = df_prep.groupby(['pickup_latitude','pickup_longitude'])['medallion'].agg(['count']).sort_values(['count'],ascending=False).head(20)


In [None]:

base_map = generateBaseMap(default_zoom_start=12)
for lat,long in top10_pickup.index:
    folium.Marker([lat,long], popup='<i>Chicago City Center</i>').add_to(base_map)
base_map.save('./Top5_pickup_locations.html')
base_map

In [None]:
top10_dropoffs = df_prep.groupby(['dropoff_latitude','dropoff_longitude'])['Count'].agg(['count']).sort_values(['count'],ascending=False).head(20)


In [None]:
base_map = generateBaseMap(default_zoom_start=12)
for lat,long in top10_pickup.index:
    folium.Marker([lat,long], popup='<i>Chicago City Center</i>').add_to(base_map)
base_map.save('./Top5_pickup_locations.html')
base_map