# Processing & Visualizations for Gridwise Data

## About this notebook

The purpose of this notebook is to parse and visualize the preliminary data provided by Gridwise in 2023. For questions about the design or implementation of this project, please email Katie.Rischpater@nrel.gov.

## File I/O, Dataframe setup

### File Imports

In [None]:
# Data Processing
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Geospatial Data
import geopandas as gpd
import geoplot
import geoplot.crs as gcrs

# Visualization
import matplotlib.pyplot as plt
import osmnx as ox
# import networkx as nx

# Reload lets us test local imported functions
# without restarting the kernel
from importlib import reload

# Local Imports
import data_parsing as dtp
import constants as c


### Load & Parse CSV 

This is currently configured to work with a single CSV export (split across several part files): the syntax may change as more data is provided!

Relevant fields in the Gridwise_Data CSV include:
- Timestamps ('start_time', 'end_time')
- Trip Category (We're interested in 'Rideshare')
- Driver id (driver_id)
- Lat / Long data, including:
    - start_block_group_internal_point_lat, start_block_group_internal_point_lng
    - end_block_group_internal_point_lat, end_block_group_internal_point_lng

In [None]:
# --- Configuration -----------------------------------------------------------
CITY_NAME = 'Los Angeles, California, USA'

FOLDER_PATH = './__Gridwise_Data/Gridwise full dataset/gwa-exports-national-renewable-energy-laboratory-2024-01-23_part_001'
BLOCK_FNAME = ['blocks-historical-2024-01-23-000000000000.csv']
FILES = [
    'trips-historical-2024-01-23-000000000000.csv',
    'trips-historical-2024-01-23-000000000001.csv',
    'trips-historical-2024-01-23-000000000002.csv',
    'trips-historical-2024-01-23-000000000003.csv',
    'trips-historical-2024-01-23-000000000004.csv',
    'trips-historical-2024-01-23-000000000005.csv',
    'trips-historical-2024-01-23-000000000006.csv',
    'trips-historical-2024-01-23-000000000007.csv',
    'trips-historical-2024-01-23-000000000008.csv',
    'trips-historical-2024-01-23-000000000009.csv',
    'trips-historical-2024-01-23-000000000010.csv',
]

# --- Load --------------------------------------------------------------------
# The block table is a single file; only its first (and only) name is read.
block_df = pd.read_csv(f'{FOLDER_PATH}/{BLOCK_FNAME[0]}')

# The trip export is split across several part files: read each one and stack
# them into a single frame with a fresh 0..n-1 index.
dfs = [pd.read_csv(f'{FOLDER_PATH}/{part_name}') for part_name in FILES]
gridwise_df = pd.concat(dfs, ignore_index=True)

# --- Filter ------------------------------------------------------------------
# Keep rideshare trips only ...
is_rideshare = gridwise_df['service_type'] == 'Rideshare'
rideshare_df = gridwise_df[is_rideshare]

# ... and of those, only trips whose start, end, and request times are all present.
rideshare_with_data_df = rideshare_df.dropna(subset=['start_time', 'end_time', 'request_time'])

#### Convert the current rideshare_df to use `datetime`

In [None]:
# Work on a copy so the string-typed `rideshare_with_data_df` stays untouched,
# then parse each timestamp column into timezone-aware (UTC) datetimes.
converted_time_df = rideshare_with_data_df.copy()
for time_column in ('start_time', 'end_time', 'request_time'):
    converted_time_df[time_column] = pd.to_datetime(rideshare_with_data_df[time_column], utc=True)

#### Filtering out duplicate Trips

In [None]:
# Two rows are treated as the same trip when they share both `driver_id` and
# `start_time`; the first occurrence is kept and every later one is removed.
#
# `DataFrame.duplicated` flags all repeats in one vectorized pass. It replaces
# a pair of nested groupby loops that rebuilt the `duplicate_index` list on
# every hit, which made the cell quadratic in the number of duplicates.
#
# Rows with a missing key are never flagged: the groupby-based version skipped
# NaN keys (pandas' default), and this keeps that behavior.
dedupe_keys = ['driver_id', 'start_time']
is_duplicate = (
    converted_time_df.duplicated(subset=dedupe_keys, keep='first')
    & converted_time_df[dedupe_keys].notna().all(axis=1)
)

# Kept as a plain list of index labels, as before.
duplicate_index = converted_time_df.index[is_duplicate].tolist()

print('Old length is', len(converted_time_df.index))
print(len(duplicate_index), ' trips are duplicates, removing...')

# `.copy()` so later in-place operations act on an independent frame.
converted_time_df = converted_time_df[~is_duplicate].copy()

print('New Length: ' , len(converted_time_df.index))

## MatPlotLib Visualizations

## Normal Rideshare Visualizations

#### Number of Trips / User

In [None]:
# Step 1: how many (de-duplicated) trips did each driver make?
driverid_counts = converted_time_df['driver_id'].value_counts()
driver_id_counts_df = driverid_counts.rename_axis('driver_id').reset_index(name='count')

# Step 2: for every distinct trip count, how many drivers have exactly that many trips?
drivers_per_trip_count = driver_id_counts_df.groupby('count')['driver_id'].nunique()
all_days_grouped_df = (
    drivers_per_trip_count
    .rename_axis('Number of Trips')
    .reset_index(name='Number of Users')
)

#### Plot Unfiltered Trips/User data

In [None]:
# Dump the inputs for a quick sanity check before plotting.
print(all_days_grouped_df)
print(rideshare_with_data_df)

trips_axis = all_days_grouped_df['Number of Trips']
drivers_axis = all_days_grouped_df['Number of Users']

# One dot per distinct trip count; height = number of drivers with that count.
plt.scatter(trips_axis, drivers_axis, s=2)
plt.title('Number of Drivers vs Number of Rideshare Trips')
plt.xlabel('Number of Trips')
plt.ylabel('Number of Drivers')

# Ticks every 1000 trips on x and every 10 drivers on y, up to the data maximum.
plt.xticks(np.arange(0, trips_axis.max()+1, step=1000))
plt.yticks(np.arange(0, drivers_axis.max()+1, step=10))

plt.show()

#### Plot Filtered Data

In [None]:
# Keep only the trip counts shared by between 3 and 82 drivers (inclusive).
# NOTE(review): the plot title below describes a bound on the number of
# *trips* (3 <= t <= 3640), but this filter bounds the number of *drivers*
# per trip count — confirm which one is intended.
driver_count_in_range = all_days_grouped_df['Number of Users'].between(3, 82)
filtered_grouped_df = all_days_grouped_df[driver_count_in_range]
print(filtered_grouped_df)

filtered_trips = filtered_grouped_df['Number of Trips']
filtered_drivers = filtered_grouped_df['Number of Users']

plt.scatter(filtered_trips, filtered_drivers, s=2)
plt.title('Number of Drivers vs Number of Rideshare Trips: Drivers w/ trips 3 <= t <= 3640')
plt.xlabel('Number of Trips')
plt.ylabel('Number of Drivers')

# Only the y axis gets explicit ticks here; x is left to matplotlib's defaults.
plt.yticks(np.arange(0, filtered_drivers.max()+1, step=10))

plt.show()

#### Number Trips / Day


~~With the current Preliminary Dataset, this doesn't do anything, since all of the data comes from a single day... Once we have more data, this may prove useful!~~

In [None]:
# Alias only: this is the same object as `converted_time_df`, not a copy.
start_as_datetime_df = converted_time_df

# Trips per driver per calendar day.
# NOTE(review): the timestamps were parsed with utc=True, so `.dt.date` is the
# UTC calendar day, not the driver's local day — confirm that is intended.
trip_day = start_as_datetime_df['start_time'].dt.date
daily_count_df = start_as_datetime_df.groupby(['driver_id', trip_day])['start_time'].count()

# Busiest single day for each driver.
max_inter_per_user_df = (
    daily_count_df
    .groupby(level='driver_id')
    .max()
    .reset_index(name='MaxTripsInSingleDay')
)

# Number of drivers sharing each "busiest day" trip count.
single_day_grouped_df = (
    max_inter_per_user_df
    .groupby('MaxTripsInSingleDay')['driver_id']
    .nunique()
    .rename_axis('Number of Trips / Day')
    .reset_index(name='Number of Users')
)

# Display data as bar graph
trips_per_day = single_day_grouped_df['Number of Trips / Day']
drivers_per_bucket = single_day_grouped_df['Number of Users']

plt.bar(trips_per_day, drivers_per_bucket)
plt.title('Number of Drivers vs Number of Ridehailing Trips / Day')
plt.xlabel('Number of Trips')
plt.ylabel('Number of Drivers')
plt.yticks(np.arange(0, drivers_per_bucket.max()+1, step=10))

plt.show()


## Intermission Visualizations
The goal of this section is to find the average amount of time a given driver spends idling between trips.

### Calculate & Filter Intermissions

##### Calculate Intermission Periods

In [None]:
# Build one "intermission" (idle gap) per pair of consecutive trips by the same driver.
#
# The sort below is required: the loop pairs each trip with the *next row* of
# the same driver, which is only the chronologically next trip if the frame is
# ordered by start time. Note that it sorts `converted_time_df` in place.
converted_time_df.sort_values(by='start_time', inplace=True)

# Start from a copy of the trips so each intermission row keeps the index and
# remaining columns of the trip that precedes it.
intermission_times_df = converted_time_df.copy()

# The trip's own start/end are replaced by the intermission bounds below.
intermission_times_df.drop(['start_time', 'end_time'], axis=1, inplace=True)

# Placeholder values; rows that never get filled in (each driver's last trip)
# stay NaT.
intermission_times_df['intermission_start'] = pd.NaT
intermission_times_df['intermission_end'] = pd.NaT

grouped = converted_time_df.groupby('driver_id')

# How the per-driver loop works:
#
# The first line sets `intermission_start` to the end time of the trip at the
# same index, for every trip except the driver's last one.
# E.g., inter_start[0] == end_time[0]
#
# The second line sets `intermission_end` to the start time of the _next_ trip,
# stored at the index of the previous trip.
# E.g., inter_end[0] == start_time[1]
#
# NOTE(review): `.values` hands over raw NumPy datetimes, so the UTC timezone
# attached by the earlier `pd.to_datetime(..., utc=True)` is presumably dropped
# from these two columns — confirm downstream code expects naive UTC.
# NOTE(review): a `groupby('driver_id')['start_time'].shift(-1)` would likely
# compute the same pairing without a Python-level loop — verify before swapping.

for driver_id, group in grouped: 
    intermission_times_df.loc[group['end_time'].iloc[:-1].index, 'intermission_start'] = group['end_time'].iloc[:-1].values
    intermission_times_df.loc[group['start_time'].iloc[:-1].index, 'intermission_end'] = group['start_time'].iloc[1:].values


#### Re-Calculate the Duration Statistic

In [None]:
# `duration` now means the length of the idle gap (end minus start of the
# intermission), overwriting any previous value in that column.
gap_start = intermission_times_df['intermission_start']
gap_end = intermission_times_df['intermission_end']
intermission_times_df['duration'] = gap_end - gap_start

# Rows still holding NaT in either bound have no following trip to pair with;
# remove them.
intermission_times_df.dropna(subset=['intermission_start', 'intermission_end'], inplace=True)

##### Debug: Checking for negative values, 2x check intermission
~22,536

In [None]:
# Show every intermission whose computed length is negative (the next trip
# starts before the previous one ends), then the total row count for scale.
has_negative_duration = intermission_times_df['duration'].dt.total_seconds() < 0
debug_columns = ['driver_id', 'intermission_start', 'intermission_end', 'duration']
print(intermission_times_df.loc[has_negative_duration, debug_columns])
print(len(intermission_times_df.index))


##### Debug: Diagnosing Remaining Negative Trips
It appears that certain trips overlap, which shouldn't be possible... See the following output:
```
1456   2019-01-01 04:04:57+00:00 2019-01-01 04:55:19+00:00    3022.0
1450   2019-01-01 04:06:08+00:00 2019-01-01 04:44:43+00:00    2315.0
1510   2019-01-01 04:17:14+00:00 2019-01-01 04:36:21+00:00    1147.0
```


In [None]:
# Trip whose following intermission came out negative (4:55 -> 4:06).
trip_id = '4a2413b8-abf7-a287-30a2-1cb1482eedae'
# Another negative trip from the same driver, kept for reference:
# test_driver_id = '9fee0976-5899-deaf-e010-5c030057aa97'
# print(intermission_times_df.loc[intermission_times_df['driver_id'] == test_driver_id][['intermission_start', 'intermission_end','duration']].iloc[5:])
# print(converted_time_df.loc[converted_time_df['driver_id'] == test_driver_id][['id','start_time', 'end_time']].iloc[5:])

# The two trip ids being compared.
x = trip_id
y = 'be4ca665-518f-736d-71cc-19ce7568818c'

# Which service recorded the first trip ...
is_first_trip = converted_time_df['id'] == x
print(converted_time_df.loc[is_first_trip, ['service']])

# ... and which columns are available on the second trip's row(s).
is_second_trip = converted_time_df['id'] == y
print(converted_time_df.loc[is_second_trip].columns)

##### Debug: Checking specific trips that are Out of Order
(e.g., because they're out of order, our re-order should have 984 _after_ 999) 

In [None]:
# Trip / driver pair under investigation.
# (Driver 53e5bed7-2dbc-3452-11cb-068ceab085c5 had a duplicated trip.)
test_trip_id = '4fe52ac4-6eb4-a406-8d07-fe79af5cce10'
test_driver_id = 'db021f96-d7bc-42c7-6cee-dcbf067c5e8a'

trip_columns = ['start_time', 'end_time', 'duration']
gap_columns = ['intermission_start', 'intermission_end', 'duration']

# The same trip as seen from the intermission table (x) and the trip table (y).
x = intermission_times_df.loc[intermission_times_df['id'] == test_trip_id]
y = converted_time_df.loc[converted_time_df['id'] == test_trip_id]
print(y[['driver_id'] + trip_columns])
# print(x[['id'] + gap_columns])

# Per the original note, trip 984 sits at position 5 for this driver, so both
# listings below skip the first five rows.
driver_trips = converted_time_df.loc[converted_time_df['driver_id'] == test_driver_id]
print(driver_trips[trip_columns].iloc[5:])

driver_gaps = intermission_times_df.loc[intermission_times_df['driver_id'] == test_driver_id]
print(driver_gaps[gap_columns].iloc[5:])

# Extra checks, disabled:
# print(len(intermission_times_df[(intermission_times_df['duration'].dt.total_seconds() < 0)][['id', 'duration']].index))
# print(len(intermission_times_df.index))

#### DEV_CHOICE: Filter out Negative Trips

I currently don't have a good solution for how to deal with the overlapping trips described in the
sections above.  As such, let's filter out the ~20k that overlap, and do some re-calculations as normal.

In [None]:
# Keep only intermissions with a strictly positive length.
# NOTE(review): `> 0` also discards zero-length gaps (next trip starts exactly
# when the previous one ends), not just the negative/overlapping ones —
# confirm that is intended.
has_positive_duration = intermission_times_df['duration'].dt.total_seconds() > 0
intermission_times_df = intermission_times_df[has_positive_duration]
# Sanity check (should print an empty frame):
# print(intermission_times_df[(intermission_times_df['duration'].dt.total_seconds() < 0)][['driver_id', 'intermission_start', 'intermission_end', 'duration']])

#### Get number of drivers with only 1 trip

In [None]:
# Drivers with at most one trip can never produce an intermission; list them.
grouped = converted_time_df.groupby('driver_id')
trips_per_driver = grouped.size()
single_drivers = trips_per_driver[trips_per_driver <= 1].index.tolist()

print(single_drivers)
print('# Drivers with only one trip:', len(single_drivers))
print('Total Number of Drivers:',len(converted_time_df['driver_id'].unique()))

# Example of inspecting one such driver:
#  print(converted_time_df.loc[converted_time_df['driver_id'] == '02857bdc-fef3-3317-f5f6-eb7423807696'][['id','start_time', 'end_time']])

#### Filter Intermission Data
Certain breaks are _incredibly_ long -- e.g., some drivers will go a week or two without driving.  In order
to make the data a little clearer, I'm filtering out any breaks over 24 hours

In [None]:
# Remove every intermission longer than 24 hours, reporting the counts before
# and after so the size of the cut is visible.
day_in_seconds = 24 * 60 * 60
extensive_breaks_df = intermission_times_df[(intermission_times_df['duration'].dt.total_seconds() > day_in_seconds)]

print('Current number of intermissions', len(intermission_times_df.index))
print('Number of breaks over 24 hours: ', len(extensive_breaks_df.index))

intermission_times_df.drop(extensive_breaks_df.index, inplace=True)

print('Final number of intermissions', len(intermission_times_df.index))

# In the past, I was filtering by both date boundaries and intermission length.  Keeping that code in case we need it later
#
# The disabled filter below is pretty ugly, so let's break it down:
# We are dropping "All breaks that occur across a date boundary ("overnight"), that are longer than `break_filter` hours".
#
# We do not drop all "overnight" breaks (e.g., those that span from one day to the next), as some individuals
# may work night shifts -- e.g., an Uber from LAX after a late flight.  We can tweak the threshold for this as
# necessary -- 7 hours feels like a happy medium.
#
# Source for the threshold: https://www.sciencedirect.com/science/article/pii/S1389945723003611
# (The previous link was a temporary pre-signed download URL that has since expired, and its wrapped
# second line was not commented out, which made this cell a syntax error.)
# That study had a population of Uni students, who self reported sleeping 5.96 +- 0.72 hours per day.  The high end of
# that is ~7 hours -- we can assume some people "roll out of bed for work", so the upper bound of this
# (6.68 hours, -> 7 hours) should probably suffice.
#
# In the future, we will change the break filter to better reflect the already existing breaks within the data.
# For now, the "drop overnight breaks" filter is commented out, and we're including long-term breaks in our analysis

# break_filter = 7 * 60 * 60 # X hours in seconds
# print('Number of Intermission Values', len(intermission_times_df.index))
# overnight_breaks_df = intermission_times_df[(intermission_times_df['intermission_start'].dt.date != intermission_times_df['intermission_end'].dt.date) &
#                            (intermission_times_df['duration'].dt.total_seconds() > break_filter)]
# print('Number of overnight breaks: ', len(overnight_breaks_df.index))
# intermission_times_df.drop(overnight_breaks_df.index, inplace=True)

#### Aggregate Intermission Period By Driver Intervals / Day

In [None]:

# Intermissions per driver per calendar day (day taken from the intermission start).
intermission_day = intermission_times_df['intermission_start'].dt.date
daily_inter_count_df = intermission_times_df.groupby(['driver_id', intermission_day])['intermission_start'].count()

# Largest single-day intermission count for each driver. The column keeps the
# 'MaxTripsInSingleDay' label even though it counts intermissions here.
max_inter_per_user_df = (
    daily_inter_count_df
    .groupby(level='driver_id')
    .max()
    .reset_index(name='MaxTripsInSingleDay')
)

# Number of drivers sharing each single-day maximum.
single_day_inter_grouped_df = (
    max_inter_per_user_df
    .groupby('MaxTripsInSingleDay')['driver_id']
    .nunique()
    .rename_axis('Number of Trips / Day')
    .reset_index(name='Number of Users')
)


#### Intermissions over 3 Hours
Filter the intermissions in `intermission_times_df` down to the breaks `b` such that 3 hrs < b < 24 hrs (breaks over 24 hours were already removed above).

In [None]:
# Threshold in seconds, to compare against `total_seconds()`.
three_hours = 3 * 60 * 60

is_over_three_hours = intermission_times_df['duration'].dt.total_seconds() > three_hours
three_hour_breaks = intermission_times_df[is_over_three_hours]

print('Number of filtered interm: ', len(intermission_times_df.index))
print('Number of intermissions over 3 Hours: ', len(three_hour_breaks.index))

### Visualize Intermissions, Intermission Statistics

#### Intermission Duration Statistics 

In [None]:
def display_cutoff_data(partition_df, full_df, cutoff, type):
    """Print what share of all intermissions falls on one side of a cutoff.

    Args:
        partition_df: The subset of ``full_df`` on one side of the cutoff.
        full_df: Every intermission under consideration.
        cutoff: The cutoff duration, in seconds.
        type: Phrase describing the partition relative to the cutoff, e.g.
            ``'under'`` or ``'over or equal to'``. (The name shadows the
            builtin; it is kept so existing keyword callers keep working.)
    """
    partition_count = len(partition_df.index)
    total_count = len(full_df.index)

    # An empty `full_df` would otherwise raise ZeroDivisionError.
    share = partition_count / total_count * 100 if total_count else 0.0

    # `cutoff` is in seconds and the message reports hours, so divide by 3600.
    # (Dividing by 60 reported minutes under an "hours" label.) `%g` keeps
    # fractional cutoffs such as 2.5 hours intact.
    cutoff_hours = cutoff / (60 * 60)

    print('About %.2f%% (%i) of all intermissions (%i) are %s %g hours.'
          % (share, partition_count, total_count, type, cutoff_hours))

# Cutoff (in seconds) used to split intermissions into "short" and "long".
intermission_cutoff_value = 2 * 60 * 60

duration_in_seconds = intermission_times_df['duration'].dt.total_seconds()
breaks_over_cutoff_df = intermission_times_df[duration_in_seconds >= intermission_cutoff_value]
breaks_under_cutoff = intermission_times_df[duration_in_seconds < intermission_cutoff_value]

# Summary statistics over every remaining intermission (both are Timedeltas).
all_durations = intermission_times_df['duration']
mean_intermission_duration = all_durations.mean()
intermission_std_dev = all_durations.std()

mean_hours = mean_intermission_duration.total_seconds() / 60 / 60
std_dev_hours = intermission_std_dev.total_seconds() / 60 / 60
print('Average Break time is %.2f hours, with a standard deviation of %.2f hours\n'
      % (mean_hours, std_dev_hours))

print('-----\n')
print('Quantiles:')

# Both tails of the distribution: the shortest 5-15% and the longest 2.5-7.5%.
tail_quantiles = [0.05,0.10,0.15, 0.925, 0.95, 0.975]
print(all_durations.quantile(tail_quantiles), '\n')

# Share of intermissions on each side of the cutoff (currently disabled):
# display_cutoff_data(breaks_over_cutoff_df, intermission_times_df, intermission_cutoff_value, 'over or equal to')
# display_cutoff_data(breaks_under_cutoff, intermission_times_df, intermission_cutoff_value, 'under')


#### Visualize Intermission Data 

In [None]:
# Debug aid:
# print(intermission_times_df[['driver_id', 'intermission_start', 'intermission_end']])

# Bar chart: for each "most intermissions in a single day" value, how many
# drivers peak at that value.
intermissions_per_day = single_day_inter_grouped_df['Number of Trips / Day']
drivers_per_bucket = single_day_inter_grouped_df['Number of Users']

plt.bar(intermissions_per_day, drivers_per_bucket)
plt.title('Number of Drivers vs Number of Ridehailing Intermissions / Day')
plt.xlabel('Number of Intermissions')
plt.ylabel('Number of Drivers')
plt.yticks(np.arange(0, drivers_per_bucket.max()+1, step=10))

plt.show()

#### Intermissions over 3 hours, Mean break duration by start hour

In [None]:
# Mean length (in minutes) of the >3h intermissions, grouped by the hour of day
# in which the intermission starts.
start_hour = three_hour_breaks['intermission_start'].dt.hour
mean_duration_by_hour = three_hour_breaks.groupby(start_hour)['duration'].mean()
average_durations = mean_duration_by_hour / pd.Timedelta(minutes=1)

# Despite its name, `times_two` holds the same means converted from minutes to hours.
times_two = [minutes / 60 for minutes in average_durations]
print(times_two)

plt.scatter(average_durations.index, times_two, color='blue')
plt.title('Average Duration of Intermission Over 3hrs, by Start Time')
plt.xlabel('Start of Intermission (24 Hours)')
plt.ylabel('Average Duration (in Hours)')
plt.xticks(range(0, 24, 2))
plt.show()

#### Mean Intermission Duration / Day, Visualized

In [None]:
# Mean intermission length (in minutes) for each hour of day in which an
# intermission starts, across all remaining intermissions.
start_hour = intermission_times_df['intermission_start'].dt.hour
mean_duration_by_hour = intermission_times_df.groupby(start_hour)['duration'].mean()
average_durations = mean_duration_by_hour / pd.Timedelta(minutes=1)

plt.scatter(average_durations.index, average_durations.values, color='blue')
plt.title('Average Duration of Breaks by Start Time')
plt.xlabel('Start of Intermission (24 Hours)')
plt.ylabel('Average Duration (in Minutes)')
plt.xticks(range(0, 24, 2))
plt.show()

##### Alternate Plot

In [None]:

# Alternate view of `average_durations` (mean intermission length in minutes,
# indexed by start hour): each hour is drawn as a horizontal dashed segment
# that starts at the hour and extends for the mean duration, so the segment's
# length on the x axis shows how far into the day the average break reaches.
#
# Earlier scatter-based versions, kept for reference:
# plt.scatter(average_durations.index, average_durations.values, color='blue')
# plt.scatter((average_durations.index + (average_durations.values / 60)), average_durations.values, color='red')
plt.figure(figsize=(10,6))

# Iterate over (hour, value) pairs from the Series itself. Using `enumerate`
# would take the row *position* as the hour, which misplaces every segment
# after a gap whenever some hour has no intermissions.
for start_hour, mean_minutes in average_durations.items():
    # x: from the start hour to start hour + mean duration (minutes -> hours).
    line_to_plot_x = [start_hour, (start_hour + (mean_minutes / 60))]
    # y: constant at the mean duration, giving a horizontal segment.
    line_to_plot_y = [mean_minutes, mean_minutes]
    plt.plot(line_to_plot_x, line_to_plot_y, '|', linestyle="--")

plt.xlabel('Time of Day (24 Hours)')
plt.ylabel('Average Duration (in Minutes)')

plt.title('Average Duration of Breaks by Start Time')
plt.xticks(range(0, 25, 1))
plt.show()

#### By Hour Explorations

##### Boxplots by Hour

In [None]:
# One box plot per duration band. Each tuple holds the arguments passed to
# `dtp.makeBoxPlot` after the dataframe.
# NOTE(review): presumably (threshold, comparison, unit label, upper bound) —
# confirm against `data_parsing.makeBoxPlot`.
box_plot_arguments = [
    (24*c.HOURS,),
    (20*c.MINUTES, '<=', 'minutes'),
    (20*c.MINUTES, '>', 'minutes', 59*c.MINUTES),
    (1*c.HOURS, '>', 'hours', 2.5*c.HOURS),
    (2.5*c.HOURS, '>', 'hours', 24*c.HOURS),
]
for band_arguments in box_plot_arguments:
    dtp.makeBoxPlot(intermission_times_df, *band_arguments)

## GeoPlot Visualizations 

### Fetch city graph

TODO:
- Because `geoplot` and `geopandas` have Kernel Density Estimation (KDE) plotting built in, I may switch to using one of those packages (as, from my understanding, OSMnx does not have KDE).

In [None]:
# Build the drivable street network for CITY_NAME with OSMnx.
# NOTE(review): this presumably downloads from OpenStreetMap on each run (slow
# for a city this size, and needs network access) — consider caching the result.
city_graph = ox.graph_from_place(CITY_NAME, network_type="drive")

### Convert lat/lon points of dataframe

TODO:
  - As written, this does not work! Will fix later.

In [None]:
# Since we do this twice (For Origin / Destination), abstract this block to a function
def parse_points(category_to_parse, trips_df=None):
    """Build a GeoDataFrame of points from a pair of lat/lng columns.

    Args:
        category_to_parse: Column-name prefix; the coordinates are read from
            ``<prefix>_lat`` and ``<prefix>_lng``.
        trips_df: Frame to read from. Defaults to the notebook-level
            ``rideshare_with_data_df``, so existing one-argument calls behave
            as before.

    Returns:
        A ``GeoDataFrame`` holding the rows that have both coordinates, with a
        fresh 0..n-1 index and one point geometry per row.
    """
    source_df = rideshare_with_data_df if trips_df is None else trips_df

    lat_string = category_to_parse + '_lat'
    lon_string = category_to_parse + '_lng'

    # Keep only rows where both coordinates are present.
    has_both_coordinates = source_df[lat_string].notna() & source_df[lon_string].notna()
    rideshare_points_df = source_df[has_both_coordinates].reset_index(drop=True)

    # `points_from_xy` takes x first, and x is the *longitude*. Passing
    # latitude as x transposed every point, so nothing lined up with the city
    # layer when plotted.
    # NOTE(review): EPSG:4326 (plain lat/lng degrees) is assumed from the
    # column names — confirm against the Gridwise data dictionary.
    point_geometry = gpd.points_from_xy(
        x=pd.to_numeric(rideshare_points_df[lon_string]),
        y=pd.to_numeric(rideshare_points_df[lat_string]),
        crs='EPSG:4326',
    )
    return gpd.GeoDataFrame(rideshare_points_df, geometry=point_geometry)


# --- Constants ---------------------------------------------------------------
# Column-name prefixes for trip origins and destinations; `parse_points`
# appends '_lat' / '_lng' to them.
ORIGIN_COLUMN = 'start_block_group_internal_point'
DEST_COLUMN = 'end_block_group_internal_point'
# Not used by the plotting below; only relevant to the disabled geoplot call.
PROJECTION = gcrs.AlbersEqualArea()

# --- Build point layers ------------------------------------------------------
origin_points_gdf, dest_points_gdf = (
    parse_points(column_prefix) for column_prefix in (ORIGIN_COLUMN, DEST_COLUMN)
)

print(origin_points_gdf)

# --- Plot --------------------------------------------------------------------
# Street-network nodes (edges=False) form the base layer; trip origins are
# drawn on top of them on the same axes.
gdf_city = ox.graph_to_gdfs(city_graph, edges=False)

ax = gdf_city.plot(figsize=(10,10), edgecolor='white')
origin_points_gdf.plot(ax=ax, color='red', markersize=10)
# Density-coloured alternative, disabled:
# geoplot.pointplot(origin_points_gdf, ax=ax, hue=origin_points_gdf.geometry.buffer(0.01).unary_union.convex_hull.area / origin_points_gdf.geometry.buffer(0.01).area, legend=True, legend_var='hue', cmap='inferno', legend_kwargs={'label': 'Point Density'})

### Future Work:
- Other ways to cluster:
- TODO: Look at length of trips / driver
- Stronger variables to consider:
  - Time between rides
  - Time of day
  - Location of O-D Pairs

- Is the data "clean" enough for the MEP analysis?  We can look at improving the accuracy of the data