## K-Means Station Clustering

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

from tslearn.utils import to_time_series_dataset
from tslearn.clustering import TimeSeriesKMeans

from deepar_model_utils import prep_station_data
from deepar_model_utils import get_station_data
from deepar_model_utils import get_cluster_data
from deepar_model_utils import deepar_station_data
from deepar_model_utils import write_dicts_to_file

%matplotlib inline

## Trip Start Station

In [None]:
# Local CSV extract of trips keyed by start station.
# NOTE(review): the "20208029" stamp in the file name looks like a transposed
# 20200829 (the series window used later starts on 2020-08-29) — confirm
# against the file on disk before renaming anything.
start_file = "../model_trips_start_station_20208029_20220831.csv"

# Disabled alternative: read the same extract from S3 (`bucket` must be defined first).
#s3_start_location = f"s3://{bucket}/{start_file}*"
#trips_start = pd.read_csv(s3_start_location, parse_dates = True)

# NOTE(review): per the pandas docs, parse_dates=True only attempts to parse
# the index; no index_col is passed, so "starttime" is presumably still text
# after loading — confirm that prep_station_data converts it.
trips_start = pd.read_csv(start_file, parse_dates = True)
# Last expression in the cell, so the notebook displays the frame's shape.
trips_start.shape

In [None]:
# Run the project helper over the raw start-station trips, keyed on the
# station id and start-time columns (presumably grouped trip counts — see
# deepar_model_utils.prep_station_data to confirm).
trips_start_all_group = prep_station_data(
    trips_start,
    "start station id",
    "starttime",
)

# Sanity check: overall total of the "size" column of the prepared data.
start_size_total = sum(trips_start_all_group["size"])
print(start_size_total)

In [None]:
# Settings passed to get_cluster_data for every start station.
freq = "15min" # group and sum trips by a set increment
min_date = "2020-08-29 00:00:00" # make sure all series start at the same time
max_date = "2022-08-31 23:45:00" # make sure all series end at the same time

In [None]:
# One get_cluster_data result per distinct start station, in the order
# unique() yields the ids; tqdm reports progress over the stations.
start_list = [
    get_cluster_data(
        trips_start_all_group,
        "start station id",
        "starttime",
        station_id,
        freq,
        min_date,
        max_date,
    )
    for station_id in tqdm(trips_start_all_group["start station id"].unique())
]

In [None]:
# How many per-station series were built.
print(len(start_list))

# Grand total over every series: zip(*...) walks the series in lockstep, each
# step is summed across stations, then the step totals are added up.
# NOTE: zip stops at the shortest series, so this total only covers all
# values when every series has the same length.
start_series_total = sum(sum(step_values) for step_values in zip(*start_list))
print(start_series_total)

In [None]:
# Convert the list of per-station series into a tslearn time-series dataset.
# NOTE(review): tslearn documents the result as a 3-D array
# (n_series, length, dim) — confirm against the shape displayed below.
formatted_start_list = to_time_series_dataset(start_list)
# Last expression in the cell, so the notebook displays the dataset's shape.
formatted_start_list.shape

In [None]:
%%time
start_km = TimeSeriesKMeans(n_clusters = 21, metric = "dtw", max_iter = 10)
start_km_labels = start_km.fit_predict(formatted_start_list)

## Trip End Station

In [None]:
# Local CSV extract of trips keyed by end station.
# NOTE(review): the "20208029" stamp in the file name looks like a transposed
# 20200829 (the series window used later starts on 2020-08-29) — confirm
# against the file on disk before renaming anything.
stop_file = "../model_trips_stop_station_20208029_20220831.csv"

# Disabled alternative: read the same extract from S3 (`bucket` must be defined first).
#s3_stop_location = f"s3://{bucket}/{stop_file}*"
#trips_stop = pd.read_csv(s3_stop_location, parse_dates = True)

# NOTE(review): per the pandas docs, parse_dates=True only attempts to parse
# the index; no index_col is passed, so "stoptime" is presumably still text
# after loading — confirm that prep_station_data converts it.
trips_stop = pd.read_csv(stop_file, parse_dates = True)
# Last expression in the cell, so the notebook displays the frame's shape.
trips_stop.shape

In [None]:
# Run the project helper over the raw end-station trips, keyed on the
# station id and stop-time columns (presumably grouped trip counts — see
# deepar_model_utils.prep_station_data to confirm).
trips_stop_all_group = prep_station_data(
    trips_stop,
    "end station id",
    "stoptime",
)

# Sanity check: overall total of the "size" column of the prepared data.
stop_size_total = sum(trips_stop_all_group["size"])
print(stop_size_total)

In [None]:
# Settings passed to get_cluster_data for every end station. These repeat the
# values set in the start-station section, so this section does not depend on
# that cell having been run.
freq = "15min" # group and sum trips by a set increment
min_date = "2020-08-29 00:00:00" # make sure all series start at the same time
max_date = "2022-08-31 23:45:00" # make sure all series end at the same time

In [None]:
# One get_cluster_data result per distinct end station, in the order
# unique() yields the ids; tqdm reports progress over the stations.
stop_list = [
    get_cluster_data(
        trips_stop_all_group,
        "end station id",
        "stoptime",
        station_id,
        freq,
        min_date,
        max_date,
    )
    for station_id in tqdm(trips_stop_all_group["end station id"].unique())
]

In [None]:
# How many per-station series were built.
print(len(stop_list))

# Grand total over every series: zip(*...) walks the series in lockstep, each
# step is summed across stations, then the step totals are added up.
# NOTE: zip stops at the shortest series, so this total only covers all
# values when every series has the same length.
stop_series_total = sum(sum(step_values) for step_values in zip(*stop_list))
print(stop_series_total)

In [None]:
# Convert the list of per-station series into a tslearn time-series dataset.
# NOTE(review): tslearn documents the result as a 3-D array
# (n_series, length, dim) — confirm against the shape displayed below.
formatted_stop_list = to_time_series_dataset(stop_list)
# Last expression in the cell, so the notebook displays the dataset's shape.
formatted_stop_list.shape

In [None]:
%%time
stop_km = TimeSeriesKMeans(n_clusters = 21, metric = "dtw", max_iter = 10)
stop_km_labels = stop_km.fit_predict(formatted_stop_list)