# Bikeshare Evaluation Dataset and Baseline

*Anna, Tiffany, Tina, Tres, Giulia*

This baseline is based on data from August 29-31.
RMSE for zero movement is:
* 1.20 for number of bikes left
* 1.17 for number of bikes returned

## Import Data

In [2]:
import pandas as pd
import os
import glob
import matplotlib.pyplot as plt 
from datetime import datetime
from datetime import timedelta
import numpy as np
from matplotlib.ticker import PercentFormatter

import geopandas as gpd
from itertools import combinations
import itertools
import random
from random import sample
import time

plt.rcParams["figure.figsize"] = (15,10)


In [11]:
path = os.getcwd() + "/../data/raw/trips"
csv_files = glob.glob(os.path.join(path, "*.csv"))

# trips_df = pd.read_csv('data/raw/trips/trips_df_mini.csv')
  
trips_df = pd.DataFrame()

# loop over the list of csv files
for f in csv_files:
      
#     read the csv file
    current_csv = pd.read_csv(f)
    trips_df = trips_df.append(current_csv)
#     print the location and filename
    print('Location:', f)
    print('File Name:', f.split("\\")[-1])

#     print the content
    print('Content:')
    display(trips_df)
    print()


  current_csv = pd.read_csv(f)
  trips_df = trips_df.append(current_csv)


Location: /Users/tinafang/Documents/Berkeley/W210/capstone_bikeshare/baselines/../data/raw/trips/trips_df.csv
File Name: /Users/tinafang/Documents/Berkeley/W210/capstone_bikeshare/baselines/../data/raw/trips/trips_df.csv
Content:


Unnamed: 0.1,Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,postal code,birth year,gender
0,0,259,2020-07-01 00:01:15.0430,2020-07-01 00:05:34.1010,16,Back Bay T Stop - Dartmouth St at Stuart St,42.348074,-71.076570,26,Washington St at Waltham St,42.341575,-71.068904,6059,Subscriber,02118,,
1,1,436,2020-07-01 00:03:39.1810,2020-07-01 00:10:55.4600,6,Cambridge St at Joy St,42.361257,-71.065287,152,Ink Block - Harrison Ave at Herald St,42.345901,-71.063187,2322,Customer,02114,,
2,2,1346,2020-07-01 00:04:27.0790,2020-07-01 00:26:53.2030,404,Mass Ave T Station,42.341356,-71.083370,167,Ryan Playground - Dorchester Ave at Harbor Vie...,42.317642,-71.056664,4062,Customer,,,
3,3,2069,2020-07-01 00:04:56.0140,2020-07-01 00:39:25.1100,436,Maverick St at Massport Path,42.367741,-71.033360,436,Maverick St at Massport Path,42.367741,-71.033360,3858,Subscriber,02128,,
4,4,1266,2020-07-01 00:05:43.0180,2020-07-01 00:26:49.0580,404,Mass Ave T Station,42.341356,-71.083370,167,Ryan Playground - Dorchester Ave at Harbor Vie...,42.317642,-71.056664,6031,Customer,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15276597,236071,775,2018-08-31 23:58:25.5780,2018-09-01 00:11:20.8620,79,Beacon St at Washington / Kirkland,42.378420,-71.105668,87,Harvard University Housing - 115 Putnam Ave at...,42.366621,-71.114214,3392,Subscriber,,1989,1.0
15276598,236072,358,2018-08-31 23:58:28.0730,2018-09-01 00:04:26.8610,100,Davis Square,42.396969,-71.123024,111,Packard Ave at Powderhouse Blvd,42.404490,-71.123413,3345,Subscriber,,1977,1.0
15276599,236073,518,2018-08-31 23:58:35.2490,2018-09-01 00:07:13.5640,177,University Park,42.362648,-71.100061,96,Cambridge Main Library at Broadway / Trowbridg...,42.373379,-71.111075,2943,Subscriber,,1980,1.0
15276600,236074,370,2018-08-31 23:59:06.3960,2018-09-01 00:05:17.2460,105,Lower Cambridgeport at Magazine St / Riverside Rd,42.357219,-71.113872,97,Harvard University River Houses at DeWolfe St ...,42.369190,-71.117141,850,Subscriber,,1994,2.0





In [19]:
trips_df.drop(columns=['Unnamed: 0'], inplace=True)

In [8]:
stations = pd.read_csv("../data/raw/stations/current_bluebikes_stations.csv")
stations.columns = stations.iloc[0]
stations = stations.drop(stations.index[0])

In [12]:
# Set as datetime object
trips_df['starttime'] = pd.to_datetime(trips_df["starttime"], format='%Y-%m-%d %H:%M:%S')
trips_df['stoptime'] = pd.to_datetime(trips_df["stoptime"], format='%Y-%m-%d %H:%M:%S')


In [13]:
print(len(trips_df), ": original length") 

#trip duration

trips_df = trips_df[trips_df['tripduration'] < 90000]  #threshhold of 25 hrs....

#lat/long 



trips_df = trips_df[trips_df['start station longitude'] != 0  & 
                    
                   (trips_df['start station longitude'] < 72)]

trips_df = trips_df[(trips_df['start station latitude'] != 0) & 
                    
                   (trips_df['start station latitude'] < 43)]

trips_df = trips_df[(trips_df['end station longitude'] != 0) &
                   (trips_df['end station longitude'] != r"\N")]

trips_df = trips_df[(trips_df['end station latitude'] != 0) &
                   (trips_df['end station latitude'] != r"\N")]

print(len(trips_df), ": after removal")

15276602 : original length
15263061 : after removal


## Create evaluation Dataset

### Create helper functions

In [14]:
def get_movement_starts(station_id, prediction_time, prediction_window):
    trips_df_subset_start = trips_df[trips_df['start station id'] == station_id]
    target_time = prediction_time + pd.to_timedelta(prediction_window, unit='m')
    trips_df_station_starts = trips_df_subset_start[(trips_df_subset_start['starttime'] > prediction_time) & 
                                                (trips_df_subset_start['starttime'] < target_time)].sort_values(by='starttime')

    return len(trips_df_station_starts)


In [15]:
def get_movement_returns(station_id, prediction_time, prediction_window):
    trips_df_subset_return = trips_df[trips_df['end station id'] == station_id]
    target_time = prediction_time + pd.to_timedelta(prediction_window, unit='m')
    trips_df_station_returns = trips_df_subset_return[(trips_df_subset_return['stoptime'] > prediction_time) & 
                                                   (trips_df_subset_return['stoptime'] < target_time)].sort_values(by='stoptime')

    return len(trips_df_station_returns)


In [16]:
def convert_to_nearest_15_mins(sample_trip_timestamp, delta):
    sample_trip_epoch = sample_trip_timestamp.timestamp()
    converted_datetime = datetime.utcfromtimestamp(sample_trip_epoch)
    converted_15_datetime = converted_datetime + (datetime.min - converted_datetime) % delta
    return converted_15_datetime

### Creating the full list of combinations to be filled

#### Create timestamps for prediction
Set to be in 15 minute intervals

* August 29 2022 - August 31 2022

In [180]:
lst_timestamps = pd.Series(trips_df[(trips_df.starttime >= '2022-08-29') & (trips_df.starttime < '2022-09-01')].starttime.apply(convert_to_nearest_15_mins, 
                                                                                                                                 args=(timedelta(minutes=15),)).unique())


In [181]:
trip_df_station_counts = trips_df.groupby('start station id').count().reset_index()[['start station id', 'starttime']].rename(columns={"starttime": "station_counts"})
# get the counts of each station to weigh the sample

#### Create list of stations weighted by trip count

In [183]:
random.seed(321)
station_ids_reweighted = random.choices(trip_df_station_counts['start station id'], weights = trip_df_station_counts.station_counts, k = 1000)


In [186]:
lst_timestamps.iloc[:-1]

0     2022-08-29 00:15:00
1     2022-08-29 00:30:00
2     2022-08-29 00:45:00
3     2022-08-29 01:00:00
4     2022-08-29 01:15:00
              ...        
282   2022-08-31 22:45:00
283   2022-08-31 23:00:00
284   2022-08-31 23:15:00
285   2022-08-31 23:30:00
286   2022-08-31 23:45:00
Length: 287, dtype: datetime64[ns]

In [193]:
lst_timestamps_subset = lst_timestamps.iloc[:-1] # all the august 29-31 records
lst_timestamps_subset

0     2022-08-29 00:15:00
1     2022-08-29 00:30:00
2     2022-08-29 00:45:00
3     2022-08-29 01:00:00
4     2022-08-29 01:15:00
              ...        
282   2022-08-31 22:45:00
283   2022-08-31 23:00:00
284   2022-08-31 23:15:00
285   2022-08-31 23:30:00
286   2022-08-31 23:45:00
Length: 287, dtype: datetime64[ns]

In [205]:
station_ids = np.array(trip_df_station_counts['start station id']) # all stations
prediction_windows = [15] # 15 minutes only
prediction_start_timestamps = lst_timestamps_subset # 1000

#### Create the combination of station ids, prediction start timestamps, and prediction windows

In [206]:
lst_combinations = list(itertools.product(station_ids, prediction_start_timestamps, prediction_windows))


### Taking a subset of the full list of combinations

In [207]:
# random.seed(321)
# lst_combinations_subset = random.sample(lst_combinations, 10000)

In [208]:
lst_combinations_subset = lst_combinations

In [209]:
len(lst_combinations_subset)

144648

## Filling the movement information for starts and returns

Filling out the full dataset took 70 minutes

In [210]:
start_time = time.time()

lst_num_of_bikes_left = [get_movement_starts(station_id=trip_combination[0],
                                             prediction_time=trip_combination[1],
                                             prediction_window=trip_combination[2]) for trip_combination in lst_combinations_subset]

print("--- %s seconds ---" % (round(time.time() - start_time)))


--- 2147 seconds ---


In [211]:
start_time = time.time()

lst_num_of_bikes_returned = [get_movement_returns(station_id=trip_combination[0],
                                                  prediction_time=trip_combination[1],
                                                  prediction_window=trip_combination[2]) for trip_combination in lst_combinations_subset]

print("--- %s seconds ---" % (round(time.time() - start_time)))


--- 2028 seconds ---


### Amount of time it takes to run
* 2 seconds for 10 rows
* 8 seconds for 100 rows
* 57 seconds for 1,000 rows
* 563 seconds for 10,000 rows (10 minutes)

### Create dataframe based on the lists

In [212]:
data = {'station': [trip_combination[0] for trip_combination in lst_combinations_subset],
        'prediction_time': [trip_combination[1] for trip_combination in lst_combinations_subset], 
        'prediction_window': [trip_combination[2] for trip_combination in lst_combinations_subset],
        'number_of_bikes_left': lst_num_of_bikes_left,
        'number_of_bikes_returned': lst_num_of_bikes_returned}
df = pd.DataFrame(data=data)
df['net_difference'] = df['number_of_bikes_returned'] - df['number_of_bikes_left']

### Pickle it

In [213]:
df.to_pickle('../datasets/evaluation_df_2022_08.pkl')

In [223]:
df = pd.read_pickle("../datasets/evaluation_df_2022_08.pkl") 

## Evaluation Metrics on the Baseline

In [86]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import r2_score

In [228]:
df

Unnamed: 0,station,prediction_time,prediction_window,number_of_bikes_left,number_of_bikes_returned,net_difference
0,1,2022-08-29 00:15:00,15,0,0,0
1,1,2022-08-29 00:30:00,15,0,0,0
2,1,2022-08-29 00:45:00,15,0,0,0
3,1,2022-08-29 01:00:00,15,0,0,0
4,1,2022-08-29 01:15:00,15,0,0,0
...,...,...,...,...,...,...
144643,572,2022-08-31 22:45:00,15,0,0,0
144644,572,2022-08-31 23:00:00,15,0,0,0
144645,572,2022-08-31 23:15:00,15,0,0,0
144646,572,2022-08-31 23:30:00,15,0,0,0


### The RMSE for zero movement baseline
* 1.20 for number of bikes left
* 1.17 for number of bikes returned

In [224]:
y_true = np.array(df.number_of_bikes_left)
y_pred = np.zeros(df.shape[0]) # predicting 0 movement for every row

In [225]:
print(f"The mean squared error is {mean_squared_error(y_true, y_pred)}.")
print(f"The root mean squared error is {mean_squared_error(y_true, y_pred, squared=False)}.")
print(f"The r^2 is {round(r2_score(y_true, y_pred), 4)}.")

The mean squared error is 1.4470231181903657.
The root mean squared error is 1.202922739909079.
The r^2 is -0.1327.


In [226]:
y_true = np.array(df.number_of_bikes_returned)
y_pred = np.zeros(df.shape[0]) # predicting 0 movement for every row

In [227]:
print(f"The mean squared error is {mean_squared_error(y_true, y_pred)}.")
print(f"The root mean squared error is {mean_squared_error(y_true, y_pred, squared=False)}.")
print(f"The r^2 is {round(r2_score(y_true, y_pred), 4)}.")

The mean squared error is 1.3786986339251148.
The root mean squared error is 1.1741799836162745.
The r^2 is -0.1399.
