## Data Processing

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

from deepar_model_utils import prep_station_data
from deepar_model_utils import get_station_data
from deepar_model_utils import deepar_station_data
from deepar_model_utils import write_dicts_to_file

%matplotlib inline

In [None]:
#bucket = ""

In [None]:
#file = "cleaned_historical_trips_2015_2022.csv"

#s3_data_location = f"s3://{bucket}/{file}*"
#trips = pd.read_csv(s3_data_location, parse_dates = True)

In [None]:
#trips = trips[(trips["starttime"] > "2017-09-01") & (trips["stoptime"] < "2022-08-31")]

In [None]:
#trips_start = trips[["starttime", "start station id", "start station name"]]
#trips_stop = trips[["stoptime", "end station id", "end station name"]]

In [None]:
#trips_start.to_csv("model_trips_start_station_2017_2022.csv")
#trips_stop.to_csv("model_trips_stop_station_2017_2022.csv")

### Trip Start Station

That is, how many bikes left each station.

In [None]:
# Local CSV holding one row per trip start (start time, start station id, start station name).
start_file = "../model_trips_start_station_2017_2022.csv"

# Alternative source: the same file on S3 (requires `bucket` to be defined first).
#s3_start_location = f"s3://{bucket}/{start_file}*"
#trips_start = pd.read_csv(s3_start_location, parse_dates = ["starttime"])

# `parse_dates = True` only asks pandas to parse the *index* as dates; with the default
# integer index that leaves "starttime" as plain strings. Naming the column makes pandas
# load it as datetimes.
# NOTE(review): prep_station_data is not visible here — confirm it accepts a column that is
# already datetime-typed.
trips_start = pd.read_csv(start_file, parse_dates = ["starttime"])
trips_start.shape

Check start station id matches up with start station name. In this case, some stations have changed names due to location changes or due to a lack of data standardization. `trips_start_lookup` is a lookup table to match between the different station ids and station names.

In [None]:
#trips_start["start station id"].nunique()

In [None]:
#trips_start["start station name"].nunique()

In [None]:
#trips_start.drop_duplicates(subset = ["start station id", "start station name"]).to_csv("unique_start.csv")

In [None]:
#trips_start_lookup = trips_start.drop(["Unnamed: 0", "starttime"], axis = 1).drop_duplicates()

In [None]:
#trips_start_lookup.to_csv("trip_start_station_id_lookup.csv", index = False)

I will use start station id rather than start station name. From manually looking at the data, station names have more variation, and very similar station names share the same station id.

501 unique stations and 12,072,690 trips.

Although not terribly useful on its own right now, grouping the trips and getting the size of each group will help with the resampling later.

In [None]:
# Aggregate the raw trips with the project helper, passing the station-id and start-time
# column names.
# NOTE(review): the result is expected to expose a "size" column of trip counts — confirm
# against deepar_model_utils.prep_station_data.
trips_start_all_group = prep_station_data(trips_start, "start station id", "starttime")

# Total number of trips represented after grouping. Series.sum() is vectorized, whereas the
# builtin sum() walks the column one element at a time in Python.
print(trips_start_all_group["size"].sum())

Transform data into the format required by DeepAR. Not all series start at the same time or end at the same time. DeepAR allows series to start at different times, but I assume that all series have to end at the same time (or else how is prediction supposed to happen?).

The training period is the first 4 years of the data and the testing period is the final year of the data. Also, to train the initial model, I filtered out any stations that did not exist prior to the `test_date`. This ensures that there is corresponding training and testing data for every station.

In [None]:
# Shared settings used when building the DeepAR series below.
freq = "15min"  # width of the time increment that trips are grouped and summed into
max_date = "2022-08-31 23:45:00"  # common final timestamp so that every series ends at the same time
train_date, test_date = "2021-08-31", "2021-09-01"  # train/test dates handed to deepar_station_data

In [None]:
# Build the train and test datasets for the start-station series with the project helper.
train_data_start, test_data_start = deepar_station_data(
    trips_start_all_group,
    "start station id",
    "starttime",
    freq,
    max_date,
    train_date,
    test_date,
)

In [None]:
# Number of series in the train set, then in the test set, one per line.
print(len(train_data_start), len(test_data_start), sep = "\n")

In [None]:
# Make sure all test data is the same length.
# The mean on its own cannot show that: series of unequal length can still average out to
# the expected value. Collect every length and require exactly one distinct value.
test_lengths = [len(series["target"]) for series in test_data_start]
assert len(set(test_lengths)) == 1, f"test series lengths differ: {sorted(set(test_lengths))}"

test_length = sum(test_lengths)  # total number of target points across all test series
test_length / len(test_data_start) # should be 35,040

In [None]:
# Count the trips contained in the train and test series combined.
trips = 0
for dataset in (train_data_start, test_data_start):
    # Train series are accumulated first, then test series.
    for series in dataset:
        trips += sum(series["target"])
trips # lost 85,515 trips

In [None]:
# Write each dataset out in JSON Lines format via the project helper.
for out_path, dataset in (("train_start.json", train_data_start), ("test_start.json", test_data_start)):
    write_dicts_to_file(out_path, dataset)

In [None]:
# Four vertically stacked panels on a shared x-axis, one per hand-picked station id.
fig, axs = plt.subplots(4, 1, figsize = (20, 20), sharex = True)
axx = axs.ravel()
for ax, temp_station in zip(axx, [177, 436, 572, 67]):
    # Fetch this station's data from the grouped trips and draw it on its own panel.
    station_series = get_station_data(trips_start_all_group, "start station id", "starttime", temp_station, freq, max_date)
    station_series.plot(ax = ax)
    ax.set(xlabel = "date", ylabel = "trip count", title = str(temp_station))
    ax.grid(which = "minor", axis = "x")

### Trip End Station

In [None]:
#end_file = "../model_trips_stop_station_2017_2022.csv"

#s3_end_location = f"s3://{bucket}/{end_file}*"
#trips_end = pd.read_csv(s3_end_location, parse_dates = True)

#trips_end = pd.read_csv(end_file, parse_dates = True)
#trips_end.shape