# Bikeshare Evaluation Dataset and Baseline

*Anna, Tiffany, Tina, Tres, Giulia*

## Import Data

In [1]:
import pandas as pd
import os
import glob
import matplotlib.pyplot as plt 
from datetime import datetime
from datetime import timedelta
import numpy as np
from matplotlib.ticker import PercentFormatter

import geopandas as gpd
from itertools import combinations
import itertools
import random
from random import sample
import time

plt.rcParams["figure.figsize"] = (15,10)


In [2]:
# Folder holding the raw trip exports; every *.csv in it is loaded.
path = os.getcwd() + "/data/raw/trips"
csv_files = glob.glob(os.path.join(path, "*.csv"))

# trips_df = pd.read_csv('data/raw/trips/trips_df_mini.csv')

# Collect one frame per file and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside the
# loop re-copies every previously loaded row on each iteration.
csv_frames = []

# loop over the list of csv files
for f in csv_files:
    # read the csv file
    current_csv = pd.read_csv(f)
    csv_frames.append(current_csv)

    # print the location and filename
    print('Location:', f)
    # os.path.basename uses the platform's separator; splitting on a backslash
    # returned the whole path unchanged on POSIX systems.
    print('File Name:', os.path.basename(f))

    # print the content of the file that was just read
    print('Content:')
    display(current_csv)
    print()

# pd.concat raises on an empty list, so fall back to an empty frame when no
# CSV file matched (the same result the append loop produced in that case).
trips_df = pd.concat(csv_frames) if csv_frames else pd.DataFrame()


  current_csv = pd.read_csv(f)
  trips_df = trips_df.append(current_csv)


Location: /Users/tinafang/Documents/Berkeley/W210/capstone_bikeshare/data/raw/trips/trips_df.csv
File Name: /Users/tinafang/Documents/Berkeley/W210/capstone_bikeshare/data/raw/trips/trips_df.csv
Content:


Unnamed: 0.1,Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,postal code,birth year,gender
0,0,259,2020-07-01 00:01:15.0430,2020-07-01 00:05:34.1010,16,Back Bay T Stop - Dartmouth St at Stuart St,42.348074,-71.076570,26,Washington St at Waltham St,42.341575,-71.068904,6059,Subscriber,02118,,
1,1,436,2020-07-01 00:03:39.1810,2020-07-01 00:10:55.4600,6,Cambridge St at Joy St,42.361257,-71.065287,152,Ink Block - Harrison Ave at Herald St,42.345901,-71.063187,2322,Customer,02114,,
2,2,1346,2020-07-01 00:04:27.0790,2020-07-01 00:26:53.2030,404,Mass Ave T Station,42.341356,-71.083370,167,Ryan Playground - Dorchester Ave at Harbor Vie...,42.317642,-71.056664,4062,Customer,,,
3,3,2069,2020-07-01 00:04:56.0140,2020-07-01 00:39:25.1100,436,Maverick St at Massport Path,42.367741,-71.033360,436,Maverick St at Massport Path,42.367741,-71.033360,3858,Subscriber,02128,,
4,4,1266,2020-07-01 00:05:43.0180,2020-07-01 00:26:49.0580,404,Mass Ave T Station,42.341356,-71.083370,167,Ryan Playground - Dorchester Ave at Harbor Vie...,42.317642,-71.056664,6031,Customer,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15276597,236071,775,2018-08-31 23:58:25.5780,2018-09-01 00:11:20.8620,79,Beacon St at Washington / Kirkland,42.378420,-71.105668,87,Harvard University Housing - 115 Putnam Ave at...,42.366621,-71.114214,3392,Subscriber,,1989,1.0
15276598,236072,358,2018-08-31 23:58:28.0730,2018-09-01 00:04:26.8610,100,Davis Square,42.396969,-71.123024,111,Packard Ave at Powderhouse Blvd,42.404490,-71.123413,3345,Subscriber,,1977,1.0
15276599,236073,518,2018-08-31 23:58:35.2490,2018-09-01 00:07:13.5640,177,University Park,42.362648,-71.100061,96,Cambridge Main Library at Broadway / Trowbridg...,42.373379,-71.111075,2943,Subscriber,,1980,1.0
15276600,236074,370,2018-08-31 23:59:06.3960,2018-09-01 00:05:17.2460,105,Lower Cambridgeport at Magazine St / Riverside Rd,42.357219,-71.113872,97,Harvard University River Houses at DeWolfe St ...,42.369190,-71.117141,850,Subscriber,,1994,2.0





In [3]:
# Drop the leftover index column written by an earlier CSV export.
# Rebinding (instead of inplace=True) plus errors='ignore' keeps this cell
# idempotent: re-running it no longer raises KeyError once the column is gone.
trips_df = trips_df.drop(columns=['Unnamed: 0'], errors='ignore')

In [4]:
# Load the station list; the real column names sit in the first data row,
# so promote that row to the header and then remove it from the body.
stations = pd.read_csv("data/raw/stations/current_bluebikes_stations.csv")
stations.columns = stations.iloc[0]
stations = stations.drop(index=stations.index[0])

In [5]:
# Convert both trip timestamp columns from text to datetime64.
# NOTE(review): the sample rows carry fractional seconds that the format
# string does not mention - confirm the installed pandas accepts this.
for time_col in ('starttime', 'stoptime'):
    trips_df[time_col] = pd.to_datetime(trips_df[time_col], format='%Y-%m-%d %H:%M:%S')


In [6]:
print(len(trips_df), ": original length")

# Trip duration: keep trips shorter than 90,000 seconds (a 25-hour threshold).
trips_df = trips_df[trips_df['tripduration'] < 90000]

# Start-station coordinates: drop zeroed-out values and apply an upper bound.
# Every comparison is parenthesised because `&` binds tighter than `!=` / `<`;
# without parentheses the longitude filter parsed as
# `longitude != (0 & (longitude < 72))`, so the `< 72` bound was never applied.
# NOTE(review): the sample rows show negative longitudes (about -71), which a
# signed `< 72` bound always accepts - confirm the intended longitude limit.
trips_df = trips_df[(trips_df['start station longitude'] != 0) &
                    (trips_df['start station longitude'] < 72)]

trips_df = trips_df[(trips_df['start station latitude'] != 0) &
                    (trips_df['start station latitude'] < 43)]

# End-station coordinates: drop zeroed-out values and the literal "\N"
# placeholder string.
trips_df = trips_df[(trips_df['end station longitude'] != 0) &
                    (trips_df['end station longitude'] != r"\N")]

trips_df = trips_df[(trips_df['end station latitude'] != 0) &
                    (trips_df['end station latitude'] != r"\N")]

print(len(trips_df), ": after removal")

15276602 : original length
15263061 : after removal


## Create evaluation Dataset

### Create helper functions

In [7]:
def get_movement_starts(station_id, prediction_time, prediction_window):
    """Count trips that start at a station inside a prediction window.

    Reads the notebook-level ``trips_df`` frame.

    Args:
        station_id: Value matched against the 'start station id' column.
        prediction_time: Start of the window (exclusive).
        prediction_window: Window length in minutes.

    Returns:
        int: Number of trips whose 'starttime' lies strictly between
        ``prediction_time`` and ``prediction_time + prediction_window``.
    """
    target_time = prediction_time + pd.to_timedelta(prediction_window, unit='m')

    # Only the timestamp column is needed for counting, so pull just that
    # column for the station instead of copying every column of the subset.
    station_starttimes = trips_df.loc[trips_df['start station id'] == station_id, 'starttime']

    # Both bounds are exclusive. Sorting is skipped because it cannot change a count.
    in_window = (station_starttimes > prediction_time) & (station_starttimes < target_time)
    return int(in_window.sum())


In [8]:
def get_movement_returns(station_id, prediction_time, prediction_window):
    """Count trips that end at a station inside a prediction window.

    Reads the notebook-level ``trips_df`` frame.

    Args:
        station_id: Value matched against the 'end station id' column.
        prediction_time: Start of the window (exclusive).
        prediction_window: Window length in minutes.

    Returns:
        int: Number of trips whose 'stoptime' lies strictly between
        ``prediction_time`` and ``prediction_time + prediction_window``.
    """
    target_time = prediction_time + pd.to_timedelta(prediction_window, unit='m')

    # Only the timestamp column is needed for counting, so pull just that
    # column for the station instead of copying every column of the subset.
    station_stoptimes = trips_df.loc[trips_df['end station id'] == station_id, 'stoptime']

    # Both bounds are exclusive. Sorting is skipped because it cannot change a count.
    in_window = (station_stoptimes > prediction_time) & (station_stoptimes < target_time)
    return int(in_window.sum())


In [9]:
def convert_to_nearest_15_mins(sample_trip_timestamp, delta):
    """Round a timestamp up to the next multiple of ``delta``.

    Despite the name, this is a ceiling and not a round-to-nearest: a value
    already on a boundary is returned unchanged, anything else moves forward
    to the following boundary. Boundaries are multiples of ``delta`` counted
    from ``datetime.min``.

    Args:
        sample_trip_timestamp: Object exposing ``.timestamp()`` (the notebook
            passes values from the 'starttime' column).
        delta: ``datetime.timedelta`` giving the bucket size, e.g. 15 minutes.

    Returns:
        datetime.datetime: Naive datetime on a ``delta`` boundary.
    """
    # Local import: only this function needs it, and the file's import cell
    # does not bring `timezone` into scope.
    from datetime import timezone

    sample_trip_epoch = sample_trip_timestamp.timestamp()
    # Same result as datetime.utcfromtimestamp(), which is deprecated since
    # Python 3.12: build the UTC wall-clock time, then drop the tzinfo.
    converted_datetime = datetime.fromtimestamp(sample_trip_epoch, tz=timezone.utc).replace(tzinfo=None)
    # (datetime.min - t) % delta is the non-negative gap up to the next boundary.
    converted_15_datetime = converted_datetime + (datetime.min - converted_datetime) % delta
    return converted_15_datetime

### Creating the full list of combinations to be filled

#### Create timestamps for prediction
Each sampled trip start time is rounded up to the next 15-minute boundary.

In [10]:
# Draw 1000 trip start times (fixed seed) and move each one onto a 15-minute
# boundary; these are the moments predictions are made from.
lst_timestamps = (
    trips_df.starttime
    .sample(1000, random_state=321)
    .apply(convert_to_nearest_15_mins, args=(timedelta(minutes=15),))
)

# lst_timestamps = [pd.Timestamp(2021, 5, 5, 13)] - one example

In [11]:
# Number of trips (non-null start times) per start station; used below as
# sampling weights.
trip_df_station_counts = (
    trips_df
    .groupby('start station id')['starttime']
    .count()
    .reset_index()
    .rename(columns={"starttime": "station_counts"})
)

#### Create list of stations weighted by trip count

In [12]:
# Seeded draw of 1000 station ids (with replacement), weighted by trip count.
random.seed(321)
station_ids_reweighted = random.choices(
    trip_df_station_counts['start station id'],
    weights=trip_df_station_counts['station_counts'],
    k=1000,
)


In [13]:
# The three axes of the evaluation grid built in the next cell.
station_ids = station_ids_reweighted # 1000 weighted draws (ids may repeat), not every station
prediction_windows = [15] # 15 minutes only
prediction_start_timestamps = lst_timestamps # 1000 sampled start times

#### Create the combination of station ids, prediction start timestamps, and prediction windows

In [14]:
# Every (station id, prediction start, window) triple from the three inputs.
lst_combinations = [
    *itertools.product(station_ids, prediction_start_timestamps, prediction_windows)
]


In [15]:
# Sanity check on the grid size: stations x windows x timestamps.
len(lst_combinations) # 1000 * 1 * 1000

1000000

### Taking a subset of the full list of combinations

In [16]:
# Seeded sample of 10,000 triples (without replacement) to keep the fill
# step below affordable.
random.seed(321)
lst_combinations_subset = random.sample(population=lst_combinations, k=10000)

## Filling the movement information for starts and returns

In [17]:
# Count departures for every sampled triple and report the wall-clock cost.
start_time = time.time()

lst_num_of_bikes_left = [
    get_movement_starts(station_id, prediction_time, prediction_window)
    for station_id, prediction_time, prediction_window in lst_combinations_subset
]

print("--- %s seconds ---" % (round(time.time() - start_time)))


--- 268 seconds ---


In [18]:
# Count arrivals for every sampled triple and report the wall-clock cost.
start_time = time.time()

lst_num_of_bikes_returned = [
    get_movement_returns(station_id, prediction_time, prediction_window)
    for station_id, prediction_time, prediction_window in lst_combinations_subset
]

print("--- %s seconds ---" % (round(time.time() - start_time)))


--- 263 seconds ---


### Amount of time it takes to run
* 2 seconds for 10 rows
* 8 seconds for 100 rows
* 57 seconds for 1,000 rows
* 563 seconds for 10,000 rows (10 minutes)

### Create dataframe based on the lists

In [19]:
# Assemble the evaluation table: one row per sampled triple plus its
# departure and arrival counts.
data = {
    'station': [station for station, _, _ in lst_combinations_subset],
    'prediction_time': [when for _, when, _ in lst_combinations_subset],
    'prediction_window': [window for _, _, window in lst_combinations_subset],
    'number_of_bikes_left': lst_num_of_bikes_left,
    'number_of_bikes_returned': lst_num_of_bikes_returned,
}
df = pd.DataFrame(data)

# Positive values mean more bikes arrived than departed in the window.
df['net_difference'] = df['number_of_bikes_returned'].sub(df['number_of_bikes_left'])

In [20]:
# Display the assembled evaluation table.
df

Unnamed: 0,station,prediction_time,prediction_window,number_of_bikes_left,number_of_bikes_returned,net_difference
0,379,2019-10-09 08:15:00,15,0,1,1
1,6,2021-06-14 05:15:00,15,0,0,0
2,398,2021-07-16 01:00:00,15,0,0,0
3,46,2022-06-16 22:45:00,15,4,0,-4
4,70,2020-09-29 19:45:00,15,1,0,-1
...,...,...,...,...,...,...
9995,46,2021-10-29 14:30:00,15,2,1,-1
9996,107,2021-07-16 19:45:00,15,0,0,0
9997,211,2015-09-02 10:30:00,15,0,0,0
9998,553,2018-06-24 17:00:00,15,0,0,0


### Pickle it

In [21]:
# Persist the evaluation table so it can be reloaded without repeating the
# fill step. Create the output folder first; to_pickle does not create
# missing directories and would fail on a fresh checkout.
os.makedirs('./datasets', exist_ok=True)
df.to_pickle('./datasets/evaluation_df.pkl')

In [22]:
# Read the pickle back to confirm the round trip; the result is only
# displayed, not assigned.
pd.read_pickle("./datasets/evaluation_df.pkl") 

Unnamed: 0,station,prediction_time,prediction_window,number_of_bikes_left,number_of_bikes_returned,net_difference
0,379,2019-10-09 08:15:00,15,0,1,1
1,6,2021-06-14 05:15:00,15,0,0,0
2,398,2021-07-16 01:00:00,15,0,0,0
3,46,2022-06-16 22:45:00,15,4,0,-4
4,70,2020-09-29 19:45:00,15,1,0,-1
...,...,...,...,...,...,...
9995,46,2021-10-29 14:30:00,15,2,1,-1
9996,107,2021-07-16 19:45:00,15,0,0,0
9997,211,2015-09-02 10:30:00,15,0,0,0
9998,553,2018-06-24 17:00:00,15,0,0,0


### Checking the results

In [23]:
# Manual cross-check of the departures for station 46 at 2022-06-16 22:45
# (row 3 of the evaluation table).
prediction_time = pd.Timestamp(year=2022, month=6, day=16, hour=22, minute=45)
target_time = prediction_time + pd.to_timedelta(15, unit='m')
trips_df_subset_start = trips_df.loc[trips_df['start station id'] == 46]
trips_df_station_starts = (
    trips_df_subset_start
    .loc[lambda trips: (trips['starttime'] > prediction_time) & (trips['starttime'] < target_time)]
    .sort_values(by='starttime')
)

In [24]:
# Manual cross-check of the arrivals for station 46 at 2022-06-16 22:45
# (row 3 of the evaluation table).
prediction_time = pd.Timestamp(year=2022, month=6, day=16, hour=22, minute=45)
target_time = prediction_time + pd.to_timedelta(15, unit='m')
trips_df_subset_return = trips_df.loc[trips_df['end station id'] == 46]
trips_df_station_returns = (
    trips_df_subset_return
    .loc[lambda trips: (trips['stoptime'] > prediction_time) & (trips['stoptime'] < target_time)]
    .sort_values(by='stoptime')
)


In [25]:
# Departures found for the window; the row count should equal
# number_of_bikes_left for this station and time.
trips_df_station_starts

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,postal code,birth year,gender
10737172,428,2022-06-16 22:50:00.206,2022-06-16 22:57:08.619,46,Christian Science Plaza - Massachusetts Ave at...,42.343666,-71.085824,364,Tremont St at Northampton St,42.338432,-71.08169,7792,Customer,31322.0,,
10737199,1746,2022-06-16 22:51:29.020,2022-06-16 23:20:35.596,46,Christian Science Plaza - Massachusetts Ave at...,42.343666,-71.085824,54,Tremont St at West St,42.354979,-71.063348,6862,Subscriber,2120.0,,
10737205,2608,2022-06-16 22:51:56.924,2022-06-16 23:35:25.493,46,Christian Science Plaza - Massachusetts Ave at...,42.343666,-71.085824,46,Christian Science Plaza - Massachusetts Ave at...,42.343666,-71.085824,2719,Customer,624.0,,
10737213,1725,2022-06-16 22:52:13.192,2022-06-16 23:20:58.300,46,Christian Science Plaza - Massachusetts Ave at...,42.343666,-71.085824,54,Tremont St at West St,42.354979,-71.063348,2810,Customer,,,


In [26]:
# Arrivals found for the window; the row count should equal
# number_of_bikes_returned for this station and time.
trips_df_station_returns

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,postal code,birth year,gender


### Check another example

In [34]:
# Manual cross-check of the departures for station 553 at 2018-06-24 17:00
# (row 9998 of the evaluation table).
prediction_time = pd.Timestamp(year=2018, month=6, day=24, hour=17)
target_time = prediction_time + pd.to_timedelta(15, unit='m')
trips_df_subset_start = trips_df.loc[trips_df['start station id'] == 553]
trips_df_station_starts = (
    trips_df_subset_start
    .loc[lambda trips: (trips['starttime'] > prediction_time) & (trips['starttime'] < target_time)]
    .sort_values(by='starttime')
)

In [35]:
# Manual cross-check of the arrivals for station 553 at 2018-06-24 17:00
# (row 9998 of the evaluation table).
prediction_time = pd.Timestamp(year=2018, month=6, day=24, hour=17)
target_time = prediction_time + pd.to_timedelta(15, unit='m')
trips_df_subset_return = trips_df.loc[trips_df['end station id'] == 553]
trips_df_station_returns = (
    trips_df_subset_return
    .loc[lambda trips: (trips['stoptime'] > prediction_time) & (trips['stoptime'] < target_time)]
    .sort_values(by='stoptime')
)


In [36]:
# Departures found for the window; the row count should equal
# number_of_bikes_left for this station and time.
trips_df_station_starts

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,postal code,birth year,gender


In [37]:
# Arrivals found for the window; the row count should equal
# number_of_bikes_returned for this station and time.
trips_df_station_returns

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,postal code,birth year,gender


## Evaluation Metrics on the Baseline

In [38]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import r2_score

In [39]:
# Targets are the observed net bike movement; the baseline always predicts
# zero movement.
y_true = np.array(df['net_difference'])
y_pred = np.zeros(len(df))

In [40]:
# Score the all-zeros baseline. Because every prediction is 0, the MSE is
# simply the mean of the squared net differences.
print(f"The mean squared error is {mean_squared_error(y_true, y_pred)}.")
# r^2 is rounded to 4 decimals, so a tiny negative score prints as -0.0.
print(f"The r^2 is {round(r2_score(y_true, y_pred), 4)}.")

The mean squared error is 2.8525.
The r^2 is -0.0.
